HTMLTokenizer.cpp

Go to the documentation of this file.
00001 #include "HTMLTokenizer.h"
00002 
00003 #include <cctype>
00004 
00005 HTMLTokenizer::HTMLTokenizer(const std::string& htmlCode)
00006 {
00007     htmlString = htmlCode;
00008     parsePosition = 0;
00009 }
00010 HTMLTokenizer::HTMLTokenizer(URLInputStream* input)
00011 {
00012     htmlString = "";
00013     while(input->IsOpen() && !input->IsDone())
00014     {
00015         htmlString += input->Read();
00016     }
00017     parsePosition = 0;
00018 }
00019 HTMLTokenizer::~HTMLTokenizer()
00020 {
00021 }
00022 
00023 HTMLToken HTMLTokenizer::GetNextToken()
00024 {
00025     std::string tokenStr = GetNextTokenString();
00026     if(tokenStr == "")
00027     {
00028         return HTMLToken("", END);
00029     }
00030     else if(AtTag(tokenStr, 0))
00031     {
00032         return TagToToken(tokenStr);
00033     }
00034     else if(AtComment(tokenStr, 0))
00035     {
00036         return HTMLToken(tokenStr, COMMENT);
00037     }
00038     else
00039     {
00040         return HTMLToken(tokenStr, TEXT);
00041     }
00042 }
00043 
00044 bool HTMLTokenizer::HasNextToken() const
00045 {
00046     return parsePosition < (int)htmlString.length();
00047 }
00048 
00049 HTMLToken HTMLTokenizer::TagToToken(const std::string& str) const
00050 {
00051     // Find tag name
00052     int startPos = 1;
00053     HTMLTokenType type = TAG_START;
00054     
00055     if(str.length() >= 2 && str[startPos] == '/')
00056     {
00057         type = TAG_END;
00058         startPos++;
00059     }
00060     
00061     int runPos = startPos;
00062     while(runPos < (int)str.length() && IsWhitespace(str[runPos]) == false && str[runPos] != '>')
00063     {
00064         runPos++;
00065     }
00066     std::string tagName = str.substr(startPos, runPos - startPos);
00067     startPos = runPos;
00068     
00069     HTMLToken result(tagName, type);
00070     std::string key = "";
00071     char strTerm = ' ';
00072     
00073     // Find tag attributes
00074     while(startPos < (int)str.length() && str[startPos] != '>')
00075     {
00076         startPos = AfterWhitespace(startPos, str);
00077         
00078         // Find attribute name (key)
00079         runPos = startPos;
00080         while(runPos < (int)str.length() && IsWhitespace(str[runPos]) == false && str[runPos] != '=' && str[runPos] != '>')
00081         {
00082             runPos++;
00083         }
00084         key = str.substr(startPos, runPos - startPos);
00085         startPos = runPos;
00086         
00087         // Find equals sign
00088         startPos = AfterWhitespace(startPos, str);
00089         if(str[startPos] != '=')
00090         {
00091             // Return to beginning of while loop.
00092             continue;
00093         }
00094         else
00095         {
00096             // Move past equal sign
00097             startPos++;
00098         }
00099         
00100         // Find attribute value (value)
00101         startPos = AfterWhitespace(startPos, str);
00102         if(str[startPos] == '\"' || str[startPos] == '\'')
00103         {
00104             strTerm = str[startPos];
00105             startPos++;
00106         }
00107         else
00108         {
00109             strTerm = ' ';
00110         }
00111         
00112         runPos = startPos + 1;
00113         while(str[runPos] != '>' && str[runPos] != strTerm && !(strTerm == ' ' && IsWhitespace(str[runPos])))
00114         {
00115             runPos++;
00116         }
00117         
00118         // Add attribute
00119         result.SetAttribute(key, str.substr(startPos, runPos - startPos));
00120         
00121         if(strTerm == '\"' || strTerm == '\'')
00122         {
00123             runPos++;
00124         }
00125         startPos = runPos;
00126     }
00127     
00128     return result;
00129 }
00130     
00131 std::string HTMLTokenizer::GetNextTokenString()
00132 {
00133     int end;
00134     
00135     if(HasNextToken() == false)
00136     {
00137         end = parsePosition;
00138     }
00139     else
00140     {
00141         if(AtTag())
00142         {
00143             end = FindEndOfTag();
00144         }
00145         else if(AtComment())
00146         {
00147             end = FindEndOfComment();
00148         }
00149         else
00150         {
00151             end = FindEndOfText();
00152         }
00153         
00154         // This moves the end position to the first character of the next token.
00155         end++;
00156     }
00157     
00158     std::string result = htmlString.substr(parsePosition, end - parsePosition);
00159     parsePosition = end;
00160     
00161     return result;
00162 }
00163 
00164 bool HTMLTokenizer::AtTag() const
00165 {
00166     return AtTag(htmlString, parsePosition);
00167 }
00168 
00169 bool HTMLTokenizer::AtTag(const std::string& str, int pos) const
00170 {
00171     if(pos >= (int)str.length() - 2)
00172     {
00173         return false;
00174     }
00175     else
00176     {
00177         return str[pos] == '<' && 
00178                     (isalpha(str[pos + 1]) != 0 || // A tag can either have a character after the bracket or
00179                     (str[pos + 1] == '/' && isalpha(str[pos + 2]) != 0)); // a slash followed by a character
00180     }
00181 }
00182 
00183 bool HTMLTokenizer::AtComment() const
00184 {
00185     return AtComment(htmlString, parsePosition);
00186 }
00187 
00188 bool HTMLTokenizer::AtComment(const std::string& str, int pos) const
00189 {
00190     if(pos >= (int)str.length() - 4)
00191     {
00192         return false;
00193     }
00194     else
00195     {
00196         return str.compare(pos, 4, "<!--") == 0;
00197     }
00198 }
00199 
00200 bool HTMLTokenizer::IsWhitespace(char toCheck) const
00201 {
00202     return isspace(toCheck) != 0;
00203 }
00204 
00205 int HTMLTokenizer::AfterWhitespace(int startIndex, const std::string& str) const
00206 {
00207     while(startIndex < (int)str.length() && IsWhitespace(str[startIndex]))
00208     {
00209         startIndex++;
00210     }
00211     return startIndex;
00212 }
00213 
00214 
00215 int HTMLTokenizer::FindEndOfTag() const
00216 {
00217     int endTagPos = parsePosition + 1;
00218     
00219     while(endTagPos <= (int)htmlString.length() - 1 &&
00220             htmlString[endTagPos] != '>')
00221     {
00222         endTagPos++;
00223     }
00224     
00225     if(endTagPos >= (int)htmlString.length())
00226     {
00227         endTagPos--;
00228     }
00229     
00230     return endTagPos;
00231 }
00232 
00233 int HTMLTokenizer::FindEndOfComment() const
00234 {
00235     // Move to the first character position after the opening bracket sequence.
00236     int endCommentPos = parsePosition + 4;
00237     
00238     while(endCommentPos <= (int)htmlString.length() - 3 && htmlString.compare(endCommentPos, 3, "-->") != 0)
00239     {
00240         endCommentPos++;
00241     }
00242     
00243     if(endCommentPos > (int)htmlString.length() - 3)
00244     {
00245         // If the end of the comment was the end of the html string, then just set the end of the comment position
00246         // to be the end of the string.
00247         endCommentPos = htmlString.length() - 1;
00248     }
00249     else
00250     {
00251         // If there actually is an end of a comment bracket, the jump forward two places so that we are now pointing
00252         // at the ending bracket, rather than the first dash.
00253         endCommentPos += 2;
00254     }
00255     
00256     return endCommentPos;
00257 }
00258 
00259 int HTMLTokenizer::FindEndOfText() const
00260 {
00261     int endTextPos = parsePosition + 1;
00262     
00263     while(endTextPos < (int)htmlString.length() &&
00264             htmlString[endTextPos] != '<')
00265     {
00266         endTextPos++;
00267     }
00268     
00269     // This is because the while loop is terminated when we find the end of the string or
00270     // the tag opening character, so to get the last text character, we have to step back once.
00271     endTextPos--;
00272     
00273     return endTextPos;
00274 }
00275 

Generated on Wed Jul 7 16:30:27 2010 for CS240Utils by  doxygen 1.5.8