00001 #ifndef HTML_TOKENIZER_H 00002 #define HTML_TOKENIZER_H 00003 00004 #include <string> 00005 #include "URLInputStream.h" 00006 #include "HTMLToken.h" 00007 00008 //!@defgroup htmltokenizer HTML Tokenizing Utils 00009 //!Functions used to break apart an html string into tokens 00010 //!@li Class HTMLTokenizer Used by students to break html strings into tokens. 00011 //!@li Class HTMLToken The token that is returned when the student requests to get the next token from an HTMLTokenizer. 00012 00013 class HTMLTokenizerTester; 00014 00015 /** 00016 * The HTMLTokenizer class is used to parse the html language into tokens. 00017 */ 00018 //!@ingroup htmltokenizer 00019 class HTMLTokenizer 00020 { 00021 public: 00022 friend class HTMLTokenizerTester; 00023 00024 /** 00025 * Initialize the HTMLTokenizer with an html string that you have already downloaded. 00026 */ 00027 HTMLTokenizer(const std::string& htmlCode); 00028 /** 00029 * Initialize the HTMLTokenizer with an InputStream that the HTMLTokenizer will then 00030 * retrieve all of the HTML code from. When the constructor is finished, the 00031 * InputStream that you pass in should be done and should be closed. 00032 */ 00033 HTMLTokenizer(URLInputStream* input); 00034 /** 00035 * Destructor that cleans up any memory needing to be deleted by this tokenizer. 00036 */ 00037 virtual ~HTMLTokenizer(); 00038 00039 /** 00040 * Get the next HTMLToken in the html string. 00041 */ 00042 HTMLToken GetNextToken(); 00043 00044 /** 00045 * Return whether the tokenizer has another token to return. 00046 */ 00047 bool HasNextToken() const; 00048 00049 private: 00050 /** 00051 * Convert a tag string into an HTMLToken object. 00052 */ 00053 HTMLToken TagToToken(const std::string& str) const; 00054 00055 /** 00056 * Get the next string token from the HTML string contined in this HTMLTokenizer. If the 00057 * current parsing position is at the end of the string, the empty string is returned. 
00058 */ 00059 std::string GetNextTokenString(); 00060 00061 /** 00062 * Returns whether the current parse position is at the beginning of a tag or not. 00063 */ 00064 bool AtTag() const; 00065 /** 00066 * Returns whether the given position in the given string is at the beginning of a tag or not. 00067 */ 00068 bool AtTag(const std::string& str, int pos) const; 00069 /** 00070 * Returns whether the current parse position is at the beginning of a comment or not. 00071 */ 00072 bool AtComment() const; 00073 /** 00074 * Returns whether the given position in the given string is at the beginning of a comment or not. 00075 */ 00076 bool AtComment(const std::string& str, int pos) const; 00077 /** 00078 * Returns whether the given character is a whitespace character or not. 00079 */ 00080 bool IsWhitespace(char toCheck) const; 00081 00082 /** 00083 * Return the first non-whitespace index in the given string after the given starting index. 00084 */ 00085 int AfterWhitespace(int startIndex, const std::string& str) const; 00086 00087 /** 00088 * Return the end position of the current tag being looked at. 00089 */ 00090 int FindEndOfTag() const; 00091 00092 /** 00093 * Return the end position of the current comment being looked at. 00094 */ 00095 int FindEndOfComment() const; 00096 00097 /** 00098 * Return the end position of the current block of text being looked at. 00099 */ 00100 int FindEndOfText() const; 00101 00102 /** 00103 * The HTML String to parse and return tokens from. 00104 */ 00105 std::string htmlString; 00106 00107 /** 00108 * The current position inside the htmlString that the parsing process is at. 00109 */ 00110 int parsePosition; 00111 }; 00112 00113 00114 #endif 00115
// 1.5.8 (presumably the version footer of the tool that generated this listing; not part of the header)