#ifndef HTML_TOKEN_H
#define HTML_TOKEN_H

#include <string>
#include <map>

// NOTE(review): a using-directive at header scope leaks into every file that
// includes this header. It is retained only because existing includers may
// rely on it; the declarations below are fully qualified and do not need it.
using namespace std;

/**
 * An enumeration used to determine what kind of token an HTMLToken holds
 * (a start tag, an end tag, a comment, or plain text), plus a sentinel that
 * marks the end of the token stream.
 */
//!@ingroup htmltokenizer
enum HTMLTokenType
{
    /**
     * Returned when the token is of the form <x ...>, where "x" could be anything.
     */
    TAG_START,
    /**
     * Returned when the token is of the form </x ...>, where "x" could be anything.
     */
    TAG_END,
    /**
     * Returned when the token is a comment tag of the form <!-- -->.
     */
    COMMENT,
    /**
     * Returned when the token is not a tag but is some text.
     */
    TEXT,
    /**
     * Returned when there are no more tokens to return.
     */
    END
};

/**
 * Convert an HTMLTokenType to a string representation. This is mostly useful
 * for debugging purposes.
 *
 * @param type the token type to describe.
 * @return a textual form of @p type (the exact text is chosen by the
 *         implementation, which is not part of this header).
 */
//!@ingroup htmltokenizer
std::string TypeToString(HTMLTokenType type);

class HTMLTokenizerTester;

/**
 * A token of an HTML document. It is either a tag or a piece of text,
 * depending on its HTMLTokenType.
 */
//!@ingroup htmltokenizer
class HTMLToken
{
public:
    // The tester is granted access to the private members declared below.
    friend class HTMLTokenizerTester;

    /**
     * Standard constructor that initializes member variables.
     *
     * @param tokenValue the text of the token, or the name part of the tag.
     * @param tokenType  the kind of token being constructed.
     */
    HTMLToken(const std::string& tokenValue, HTMLTokenType tokenType);

    /**
     * Copy constructor that performs a deep copy of all member values.
     *
     * @param toCopy the token to duplicate.
     */
    HTMLToken(const HTMLToken& toCopy);

    /**
     * Standard destructor that cleans up any memory used by this class.
     */
    virtual ~HTMLToken();

    /**
     * Return the value of the token: the first part of the HTML tag for tag
     * tokens, or the actual text for TEXT tokens.
     */
    std::string GetValue() const;

    /**
     * Get the type of this token (see HTMLTokenType).
     */
    HTMLTokenType GetType() const;

    /**
     * Get whether the provided attribute was present in the originally
     * parsed token or not.
     *
     * @param attribute the attribute name to look for.
     */
    bool AttributeExists(const std::string& attribute);

    /**
     * Get the associated value for the attribute of this token. If the given
     * attribute does not exist in the token, or the token is a TEXT token,
     * then an empty string will be returned.
     *
     * @param attribute the attribute name to look up.
     */
    std::string GetAttribute(const std::string& attribute);

    /**
     * Either insert the provided attribute/value pair into the list of
     * attributes for this token, or update the given attribute's value.
     * Note: if the token is a TEXT token, this function will not save the
     * attribute/value pair to the list of attributes.
     *
     * @param attribute the attribute name to insert or update.
     * @param value     the value to associate with @p attribute.
     */
    void SetAttribute(const std::string& attribute, const std::string& value);

private:
    /**
     * Return a lowercase copy of the provided string.
     */
    std::string ToLower(const std::string& str);

    /**
     * Return whether this token is a tag element (start or end tag).
     */
    bool IsTag() const;

    /**
     * Either the text itself or the first part of the tag, such as "a" in <a>.
     */
    std::string value;

    /**
     * The type of this particular token (see HTMLTokenType).
     */
    HTMLTokenType type;

    /**
     * A collection of attribute/value pairs. If the type of the token is
     * TEXT, this will always be empty.
     */
    std::map<std::string, std::string> attributes;
};

#endif // HTML_TOKEN_H
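// 1.5.8 -- NOTE(review): presumably the version footer of the tool that generated this source listing; not part of the header.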