-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | A high level web scraping library for Haskell.
--   
--   Scalpel is a web scraping library inspired by libraries like Parsec
--   and Perl's Web::Scraper. Scalpel builds on top of TagSoup to provide a
--   declarative and monadic interface.
@package scalpel
@version 0.5.1


-- | Scalpel is a web scraping library inspired by libraries like parsec
--   and Perl's <a>Web::Scraper</a>. Scalpel builds on top of
--   <a>Text.HTML.TagSoup</a> to provide a declarative and monadic
--   interface.
--   
--   There are two general mechanisms provided by this library that are
--   used to build web scrapers: Selectors and Scrapers.
--   
--   Selectors describe a location within an HTML DOM tree. The simplest
--   selector that can be written is a simple string value. For example,
--   the selector <tt>"div"</tt> matches every single div node in a DOM.
--   Selectors can be combined using tag combinators. The <a>//</a>
--   operator is used to define nested relationships within a DOM tree. For
--   example, the selector <tt>"div" // "a"</tt> matches all anchor tags
--   nested arbitrarily deep within a div tag.
--   
--   In addition to describing the nested relationships between tags,
--   selectors can also include predicates on the attributes of a tag. The
--   <a>@:</a> operator creates a selector that matches a tag based on the
--   name and various conditions on the tag's attributes. An attribute
--   predicate is just a function that takes an attribute and returns a
--   boolean indicating if the attribute matches a criterion. There are
--   several attribute operators that can be used to generate common
--   predicates. The <a>@=</a> operator creates a predicate that matches
--   the name and value of an attribute exactly. For example, the selector
--   <tt>"div" @: ["id" @= "article"]</tt> matches div tags where the id
--   attribute is equal to <tt>"article"</tt>.
--   
--   Scrapers are values that are parameterized over a selector and produce
--   a value from an HTML DOM tree. The <a>Scraper</a> type takes two type
--   parameters. The first is the string-like type that is used to store
--   the text values within a DOM tree. Any string-like type supported by
--   <a>Text.StringLike</a> is valid. The second type is the type of value
--   that the scraper produces.
--   
--   There are several scraper primitives that take selectors and extract
--   content from the DOM. Each primitive defined by this library comes in
--   two variants: singular and plural. The singular variants extract the
--   first instance matching the given selector, while the plural variants
--   match every instance.
--   
--   The following is an example that demonstrates most of the features
--   provided by this library. Suppose you have the following hypothetical
--   HTML located at <tt>"http://example.com/article.html"</tt> and
--   you would like to extract a list of all of the comments.
--   
--   <pre>
--   &lt;html&gt;
--     &lt;body&gt;
--       &lt;div class='comments'&gt;
--         &lt;div class='comment container'&gt;
--           &lt;span class='comment author'&gt;Sally&lt;/span&gt;
--           &lt;div class='comment text'&gt;Woo hoo!&lt;/div&gt;
--         &lt;/div&gt;
--         &lt;div class='comment container'&gt;
--           &lt;span class='comment author'&gt;Bill&lt;/span&gt;
--           &lt;img class='comment image' src='http://example.com/cat.gif' /&gt;
--         &lt;/div&gt;
--         &lt;div class='comment container'&gt;
--           &lt;span class='comment author'&gt;Susan&lt;/span&gt;
--           &lt;div class='comment text'&gt;WTF!?!&lt;/div&gt;
--         &lt;/div&gt;
--       &lt;/div&gt;
--     &lt;/body&gt;
--   &lt;/html&gt;
--   </pre>
--   
--   The following snippet defines a function, <tt>allComments</tt>, that
--   will download the web page, and extract all of the comments into a
--   list:
--   
--   <pre>
--   type Author = String
--   
--   data Comment
--       = TextComment Author String
--       | ImageComment Author URL
--       deriving (Show, Eq)
--   
--   allComments :: IO (Maybe [Comment])
--   allComments = scrapeURL "http://example.com/article.html" comments
--      where
--          comments :: Scraper String [Comment]
--          comments = chroots ("div" @: [hasClass "container"]) comment
--   
--          comment :: Scraper String Comment
--          comment = textComment &lt;|&gt; imageComment
--   
--          textComment :: Scraper String Comment
--          textComment = do
--              author      &lt;- text $ "span" @: [hasClass "author"]
--              commentText &lt;- text $ "div"  @: [hasClass "text"]
--              return $ TextComment author commentText
--   
--          imageComment :: Scraper String Comment
--          imageComment = do
--              author   &lt;- text       $ "span" @: [hasClass "author"]
--              imageURL &lt;- attr "src" $ "img"  @: [hasClass "image"]
--              return $ ImageComment author imageURL
--   </pre>
--   
--   Complete examples can be found in the <a>examples</a> folder in the
--   scalpel git repository.
module Text.HTML.Scalpel

-- | <a>Selector</a> defines a selection of an HTML DOM tree to be operated
--   on by a web scraper. The selection includes the opening tag that
--   matches the selection, all of the inner tags, and the corresponding
--   closing tag.
data Selector

-- | An <a>AttributePredicate</a> is a method that takes an <a>Attribute</a>
--   and returns a <a>Bool</a> indicating if the given attribute matches a
--   predicate.
data AttributePredicate

-- | The <a>AttributeName</a> type can be used when creating
--   <a>Selector</a>s to specify the name of an attribute of a tag.
data AttributeName
AnyAttribute :: AttributeName
AttributeString :: String -> AttributeName

-- | The <a>TagName</a> type is used when creating a <a>Selector</a> to
--   specify the name of a tag.
data TagName
AnyTag :: TagName
TagString :: String -> TagName
tagSelector :: String -> Selector

-- | A selector which will match all tags.
anySelector :: Selector

-- | The <a>//</a> operator creates a <a>Selector</a> by nesting one
--   <a>Selector</a> in another. For example, <tt>"div" // "a"</tt> will
--   create a <a>Selector</a> that matches anchor tags that are nested
--   arbitrarily deep within a div tag.
(//) :: Selector -> Selector -> Selector
infixl 5 //

-- | The <a>@:</a> operator creates a <a>Selector</a> by combining a
--   <a>TagName</a> with a list of <a>AttributePredicate</a>s.
(@:) :: TagName -> [AttributePredicate] -> Selector
infixl 9 @:

-- | The <a>@=</a> operator creates an <a>AttributePredicate</a> that will
--   match attributes with the given name and value.
--   
--   If you are attempting to match a specific class of a tag with
--   potentially multiple classes, you should use the <a>hasClass</a>
--   utility function.
(@=) :: AttributeName -> String -> AttributePredicate
infixl 6 @=

-- | The <a>@=~</a> operator creates an <a>AttributePredicate</a> that will
--   match attributes with the given name and whose value matches the given
--   regular expression.
(@=~) :: RegexLike re String => AttributeName -> re -> AttributePredicate
infixl 6 @=~

-- | The classes of a tag are defined in HTML as a space separated list
--   given by the <tt>class</tt> attribute. The <a>hasClass</a> function
--   will match a <tt>class</tt> attribute if the given class appears
--   anywhere in the space separated list of classes.
hasClass :: String -> AttributePredicate

-- | Negates an <a>AttributePredicate</a>.
notP :: AttributePredicate -> AttributePredicate

-- | The <a>match</a> function allows for the creation of arbitrary
--   <a>AttributePredicate</a>s. The argument is a function that takes the
--   attribute key followed by the attribute value and returns a boolean
--   indicating if the attribute satisfies the predicate.
match :: (String -> String -> Bool) -> AttributePredicate

-- | A value of <a>Scraper</a> <tt>str a</tt> defines a web scraper that is
--   capable of consuming a list of <a>Tag</a>s and optionally producing a
--   value of type <tt>a</tt>.
data Scraper str a

-- | The <a>attr</a> function takes an attribute name and a selector and
--   returns the value of the attribute of the given name for the first
--   opening tag that matches the given selector.
--   
--   This function will match only the first opening tag matching the
--   selector; to match every tag, use <a>attrs</a>.
attr :: (Ord str, Show str, StringLike str) => String -> Selector -> Scraper str str

-- | The <a>attrs</a> function takes an attribute name and a selector and
--   returns the value of the attribute of the given name for every opening
--   tag that matches the given selector.
attrs :: (Ord str, Show str, StringLike str) => String -> Selector -> Scraper str [str]

-- | The <a>html</a> function takes a selector and returns the html string
--   from the set of tags described by the given selector.
--   
--   This function will match only the first set of tags matching the
--   selector; to match every set of tags, use <a>htmls</a>.
html :: (Ord str, StringLike str) => Selector -> Scraper str str

-- | The <a>htmls</a> function takes a selector and returns the html string
--   from every set of tags matching the given selector.
htmls :: (Ord str, StringLike str) => Selector -> Scraper str [str]

-- | The <a>innerHTML</a> function takes a selector and returns the inner
--   html string from the set of tags described by the given selector.
--   Inner html here meaning the html within but not including the selected
--   tags.
--   
--   This function will match only the first set of tags matching the
--   selector; to match every set of tags, use <a>innerHTMLs</a>.
innerHTML :: (Ord str, StringLike str) => Selector -> Scraper str str

-- | The <a>innerHTMLs</a> function takes a selector and returns the inner
--   html string from every set of tags matching the given selector.
innerHTMLs :: (Ord str, StringLike str) => Selector -> Scraper str [str]

-- | The <a>text</a> function takes a selector and returns the inner text
--   from the set of tags described by the given selector.
--   
--   This function will match only the first set of tags matching the
--   selector; to match every set of tags, use <a>texts</a>.
text :: (Ord str, StringLike str) => Selector -> Scraper str str

-- | The <a>texts</a> function takes a selector and returns the inner text
--   from every set of tags matching the given selector.
texts :: (Ord str, StringLike str) => Selector -> Scraper str [str]

-- | The <a>chroot</a> function takes a selector and an inner scraper and
--   executes the inner scraper as if it were scraping a document that
--   consists solely of the tags corresponding to the selector.
--   
--   This function will match only the first set of tags matching the
--   selector; to match every set of tags, use <a>chroots</a>.
chroot :: (Ord str, StringLike str) => Selector -> Scraper str a -> Scraper str a

-- | The <a>chroots</a> function takes a selector and an inner scraper and
--   executes the inner scraper as if it were scraping a document that
--   consists solely of the tags corresponding to the selector. The inner
--   scraper is executed for each set of tags matching the given selector.
chroots :: (Ord str, StringLike str) => Selector -> Scraper str a -> Scraper str [a]

-- | The <a>position</a> function is intended to be used within the
--   do-block of a <a>chroots</a> call. Within the do-block position will
--   return the index of the current sub-tree within the list of all
--   sub-trees matched by the selector passed to <a>chroots</a>.
--   
--   For example, consider the following HTML:
--   
--   <pre>
--   &lt;article&gt;
--    &lt;p&gt; First paragraph. &lt;/p&gt;
--    &lt;p&gt; Second paragraph. &lt;/p&gt;
--    &lt;p&gt; Third paragraph. &lt;/p&gt;
--   &lt;/article&gt;
--   </pre>
--   
--   The <a>position</a> function can be used to determine the index of
--   each <tt>&lt;p&gt;</tt> tag within the <tt>article</tt> tag by doing
--   the following.
--   
--   <pre>
--   chroots ("article" // "p") $ do
--     index   &lt;- position
--     content &lt;- text "p"
--     return (index, content)
--   </pre>
--   
--   Which will evaluate to the list:
--   
--   <pre>
--   [
--     (0, "First paragraph.")
--   , (1, "Second paragraph.")
--   , (2, "Third paragraph.")
--   ]
--   </pre>
position :: (Ord str, StringLike str) => Scraper str Int

-- | The <a>scrape</a> function executes a <a>Scraper</a> on a list of
--   <a>Tag</a>s and produces an optional value.
scrape :: (Ord str, StringLike str) => Scraper str a -> [Tag str] -> Maybe a

-- | The <a>scrapeStringLike</a> function parses a <tt>StringLike</tt>
--   value into a list of tags and executes a <a>Scraper</a> on it.
scrapeStringLike :: (Ord str, StringLike str) => str -> Scraper str a -> Maybe a
type URL = String

-- | The <a>scrapeURL</a> function downloads the contents of the given URL
--   and executes a <a>Scraper</a> on it.
--   
--   <a>scrapeURL</a> makes use of curl to make HTTP requests. The
--   dependency on curl may be too heavyweight for some use cases. In which
--   case users who do not require inbuilt networking support can depend on
--   <a>scalpel-core</a> for a lightweight subset of this library that does
--   not depend on curl.
scrapeURL :: (Ord str, StringLike str) => URL -> Scraper str a -> IO (Maybe a)

-- | The <a>scrapeURLWithOpts</a> function takes a list of curl options and
--   downloads the contents of the given URL and executes a <a>Scraper</a>
--   on it.
scrapeURLWithOpts :: (Ord str, StringLike str) => [CurlOption] -> URL -> Scraper str a -> IO (Maybe a)

-- | The <a>scrapeURLWithConfig</a> function takes a <a>Config</a> record
--   type and downloads the contents of the given URL and executes a
--   <a>Scraper</a> on it.
scrapeURLWithConfig :: (Ord str, StringLike str) => Config str -> URL -> Scraper str a -> IO (Maybe a)

-- | A record type that determines how <tt>scrapeURLWithConfig</tt>
--   interacts with the HTTP server and interprets the results.
data Config str
Config :: [CurlOption] -> Decoder str -> Config str
[curlOpts] :: Config str -> [CurlOption]
[decoder] :: Config str -> Decoder str

-- | A method that takes an HTTP response as raw bytes and returns the body
--   as a string type.
type Decoder str = CurlResponse_ [(String, String)] ByteString -> str

-- | The default response decoder. This decoder attempts to infer the
--   character set of the HTTP response body from the `Content-Type`
--   header. If this header is not present, then the character set is
--   assumed to be `ISO-8859-1`.
defaultDecoder :: StringLike str => Decoder str

-- | A decoder that will always decode using `UTF-8`.
utf8Decoder :: StringLike str => Decoder str

-- | A decoder that will always decode using `ISO-8859-1`.
iso88591Decoder :: StringLike str => Decoder str
