HTMLTokenizer.h

Go to the documentation of this file.
00001 #ifndef HTML_TOKENIZER_H
00002 #define HTML_TOKENIZER_H
00003 
00004 #include <string>
00005 #include "URLInputStream.h"
00006 #include "HTMLToken.h"
00007 
00008 //!@defgroup htmltokenizer HTML Tokenizing Utils
00009 //!Functions used to break apart an HTML string into tokens
00010 //!@li Class HTMLTokenizer: used by students to break HTML strings into tokens.
00011 //!@li Class HTMLToken: the token returned when the student requests the next token from an HTMLTokenizer.
00012 
00013 class HTMLTokenizerTester; // Forward declaration; HTMLTokenizer grants this class friend access.
00014 
00015 /**
00016  * The HTMLTokenizer class breaks HTML text into HTMLToken objects, handed out one at a time by GetNextToken().
00017  */
00018 //!@ingroup htmltokenizer
00019 class HTMLTokenizer
00020 {
00021 public:
00022     friend class HTMLTokenizerTester; // Lets the tester class reach the private members declared below.
00023     
00024     /**
00025      * Construct a tokenizer over an HTML string that has already been downloaded. NOTE(review): not declared explicit, so a std::string converts implicitly.
00026      */
00027     HTMLTokenizer(const std::string& htmlCode);
00028     /**
00029      * Construct a tokenizer from an input stream, from which the tokenizer retrieves all of the
00030      * HTML code.  When the constructor is finished, the stream passed in should be done and closed.
00031      * NOTE(review): whether the constructor closes the stream or the caller must is not visible in this header; confirm. Also not declared explicit.
00032      */
00033     HTMLTokenizer(URLInputStream* input);
00034     /**
00035      * Virtual destructor that cleans up any memory needing to be deleted by this tokenizer.
00036      */
00037     virtual ~HTMLTokenizer();
00038     
00039     /**
00040      * Return the next HTMLToken in the HTML string. Non-const; presumably advances the parse position (confirm in the implementation).
00041      */
00042     HTMLToken GetNextToken();
00043     
00044     /**
00045      * Report whether the tokenizer has another token to return. Declared const, so it does not modify the tokenizer.
00046      */
00047     bool HasNextToken() const;
00048     
00049 private:
00050     /**
00051      * Build an HTMLToken object from the text of a tag, passed in as str.
00052      */
00053     HTMLToken TagToToken(const std::string& str) const;
00054     
00055     /**
00056      * Get the next token, as a raw string, from the HTML string contained in this HTMLTokenizer.  If the
00057      * current parsing position is at the end of the string, the empty string is returned.
00058      */
00059     std::string GetNextTokenString();
00060     
00061     /**
00062      * Return whether the current parse position is at the beginning of a tag.
00063      */
00064     bool AtTag() const;
00065     /**
00066      * Return whether position pos in the string str is at the beginning of a tag.
00067      */
00068     bool AtTag(const std::string& str, int pos) const;
00069     /**
00070      * Return whether the current parse position is at the beginning of a comment.
00071      */
00072     bool AtComment() const;
00073     /**
00074      * Return whether position pos in the string str is at the beginning of a comment.
00075      */
00076     bool AtComment(const std::string& str, int pos) const;
00077     /**
00078      * Return whether the character toCheck is a whitespace character.
00079      */
00080     bool IsWhitespace(char toCheck) const;
00081     
00082     /**
00083      * Return the first non-whitespace index in str after startIndex. The index is the first parameter here, unlike AtTag/AtComment. NOTE(review): confirm whether startIndex itself is examined.
00084      */
00085     int AfterWhitespace(int startIndex, const std::string& str) const;
00086     
00087     /**
00088      * Return the end position of the tag currently being looked at.
00089      */
00090     int FindEndOfTag() const;
00091     
00092     /**
00093      * Return the end position of the comment currently being looked at.
00094      */
00095     int FindEndOfComment() const;
00096     
00097     /**
00098      * Return the end position of the block of text currently being looked at.
00099      */
00100     int FindEndOfText() const;
00101     
00102     /**
00103      * The HTML string that is parsed and from which tokens are returned.
00104      */
00105     std::string htmlString;
00106     
00107     /**
00108      * The current position of the parsing process inside htmlString.
00109      */
00110     int parsePosition;
00111 };
00112 
00113 
00114 #endif
00115 

Generated on Wed Jul 7 16:30:27 2010 for CS240Utils by  doxygen 1.5.8