HTMLToken.h

Go to the documentation of this file.
00001 #ifndef HTML_TOKEN_H
00002 #define HTML_TOKEN_H
00003 
00004 #include <string>
00005 #include <map>
00006 using namespace std;
00007 
00008 /**
00009  * An enumeration identifying which kind of token an HTMLToken is (one of the values below).
00010  */
00011 //!@ingroup htmltokenizer
00012 enum HTMLTokenType
00013 {
00014     /**
00015      * Start tag: returned when the token is of the form <x ...>, where "x" could be anything.
00016      */
00017     TAG_START,
00018     /**
00019      * End tag: returned when the token is of the form </x ...>, where "x" could be anything.
00020      */
00021     TAG_END,
00022     /**
00023      * Comment: returned when the token is a comment tag of the form <!-- -->.
00024      */
00025     COMMENT,
00026     /**
00027      * Text: returned when the token is not a tag but is some text.
00028      */
00029     TEXT,
00030     /**
00031      * End marker: returned when there are no more tokens to return.
00032      */
00033     END
00034 };
00035 
00036 /**
00037  * Convert an HTMLTokenType value to its string representation.  This is mostly useful for
00038  * debugging purposes.
00039  */
00040 //!@ingroup htmltokenizer
00041 string TypeToString(HTMLTokenType type);
00042 
00043 class HTMLTokenizerTester; // Forward declaration so that HTMLToken can name it as a friend.
00044 
00045 /**
00046  * A token of an HTML document.  What kind of token it is (start tag, end tag, comment,
00047  * text, ...) is recorded in the type member as an HTMLTokenType value.
00048  */
00049 //!@ingroup htmltokenizer
00050 class HTMLToken
00051 {
00052 public:
00053     friend class HTMLTokenizerTester;
00054     
00055     /**
00056      * Standard Constructor that initializes member variables from the given value and type.
00057      */
00058     HTMLToken(const string& tokenValue, HTMLTokenType tokenType);
00059     /**
00060      * Copy Constructor that performs a deep copy of all member values.
00061      */
00062     HTMLToken(const HTMLToken& toCopy);
00063     /**
00064      * Standard Destructor that cleans up any memory used by this class.
00065      */
00066     virtual ~HTMLToken();
00067     
00068     /**
00069      * Return the value of the token, whether that be the first part of the HTML tag or
00070      * the actual text value.  The string is returned by value (a copy).
00071      */
00072     string GetValue() const;
00073     
00074     /**
00075      * Get the type of this token, as one of the HTMLTokenType values.
00076      */
00077     HTMLTokenType GetType() const;
00078     
00079     /**
00080      * Get whether the provided attribute was present in the originally parsed token or not.  NOTE(review): not declared const.
00081      */
00082     bool AttributeExists(const string& attribute);
00083     /**
00084      * Get the associated value for the attribute of this token.  If the given attribute does not
00085      * exist in the token, or the token is TEXT, then an empty string will be returned.  NOTE(review): not declared const.
00086      */
00087     string GetAttribute(const string& attribute);
00088     /**
00089      * Either insert the provided attribute/value pair into the list of attributes for this token,
00090      * or update the given attribute's value.  Note: If the HTMLTokenType is TEXT, this function
00091      * will not save the attribute/value pair to the list of attributes.
00092      */
00093     void SetAttribute(const string& attribute, const string& value);
00094     
00095     
00096 private:
00097     /**
00098      * Return a lowercase copy of the provided string; the argument itself is taken by const reference.
00099      */
00100     string ToLower(const string& str);
00101     /**
00102      * Return whether this token is a tag element (Start or End tag).
00103      */
00104     bool IsTag() const;
00105     /**
00106      * Either the text itself or the first part of the tag, such as "a" in <a>.
00107      */
00108     string value;
00109     /**
00110      * The type of this particular token, stored as one of the HTMLTokenType values.
00111      */
00112     HTMLTokenType type;
00113     /**
00114      * A collection of attribute/value pairs.  If the type of the token is TEXT,
00115      * this will always be empty.
00116      */
00117     map<string, string> attributes;
00118 };
00119 
00120 #endif
00121 

Generated on Wed Jul 7 16:30:27 2010 for CS240Utils by  doxygen 1.5.8