HTML Parser
Topics:
Overview
Enumerations
Data Structures
Functions
HTML Tag Handlers
The gxsHTML class is a base class used to parse HTML documents. The gxsHTML class works through multiple inheritance. It includes functions to load and parse HTML files. HTML tags are handled through the use of virtual tag handlers. Derived classes are responsible for processing HTML tags and any associated attributes by overriding the appropriate tag handler.
// The following list of HTML tags is a combination of HTML
// 2.0, 3.0, 3.2 tags supported by Netscape's Navigator
// web browser, Microsoft's Internet Explorer web browser,
// and standards defined by the World Wide Web Consortium.
// This list was taken from the Willcam's Comprehensive HTML
// Cross Reference at:
// http://www.willcam.com/cmat/html/crossref.html
//
// Only the first enumerator has an explicit value (0); every
// following enumerator takes the next consecutive integer.
// NOTE: identifier spellings such as "gxs_less_then" and
// "gxs_ex_acsii_set" are reproduced exactly as declared.
enum { // HTML tags and modifiers ID enumeration
  gxsHTML::gxs_invalid_tag = 0, // Invalid tag specified
  gxsHTML::gxs_unknown_tag,     // Unknown tag specified
  gxsHTML::gxs_special_tag,     // Unknown special tags starting with
                                // an ampersand ending in a semicolon
                                // &xxxx;

  // Tags and format specifiers with special meaning
  gxsHTML::gxs_comment_tag,  // comment
  gxsHTML::gxs_less_then,    // Less than sign "<"
  gxsHTML::gxs_greater_then, // Greater than sign ">"
  gxsHTML::gxs_ampersand,    // Ampersand "&"
  gxsHTML::gxs_nb_space,     // Non-breaking space "&nbsp;"
  gxsHTML::gxs_quote,        // Quotation mark "&quot;"
  gxsHTML::gxs_ex_acsii_set, // Extended ASCII character set

  // HTML tag codes
  gxsHTML::gxs_a_tag,          // anchor
  gxsHTML::gxs_abbrev_tag,     // abbreviation
  gxsHTML::gxs_acronym_tag,    // acronym
  gxsHTML::gxs_address_tag,    // address
  gxsHTML::gxs_applet_tag,     // java applet
  gxsHTML::gxs_area_tag,       // area
  gxsHTML::gxs_au_tag,         // author
  gxsHTML::gxs_author_tag,     // author
  gxsHTML::gxs_b_tag,          // bold
  gxsHTML::gxs_banner_tag,     // banner
  gxsHTML::gxs_base_tag,       // base
  gxsHTML::gxs_basefont_tag,   // base font
  gxsHTML::gxs_bgsound_tag,    // background sound
  gxsHTML::gxs_big_tag,        // big text
  gxsHTML::gxs_blink_tag,      // blink
  gxsHTML::gxs_blockquote_tag, // block quote
  gxsHTML::gxs_bq_tag,         // block quote
  gxsHTML::gxs_body_tag,       // body
  gxsHTML::gxs_br_tag,         // line break
  gxsHTML::gxs_caption_tag,    // caption
  gxsHTML::gxs_center_tag,     // center
  gxsHTML::gxs_cite_tag,       // citation
  gxsHTML::gxs_code_tag,       // code
  gxsHTML::gxs_col_tag,        // table column
  gxsHTML::gxs_colgroup_tag,   // table column group
  gxsHTML::gxs_credit_tag,     // credit
  gxsHTML::gxs_del_tag,        // deleted text
  gxsHTML::gxs_dfn_tag,        // definition
  gxsHTML::gxs_dir_tag,        // directory list
  gxsHTML::gxs_div_tag,        // division
  gxsHTML::gxs_dl_tag,         // definition list
  gxsHTML::gxs_dt_tag,         // definition term
  gxsHTML::gxs_dd_tag,         // definition definition
  gxsHTML::gxs_em_tag,         // emphasized
  gxsHTML::gxs_embed_tag,      // embed
  gxsHTML::gxs_fig_tag,        // figure
  gxsHTML::gxs_fn_tag,         // footnote
  gxsHTML::gxs_font_tag,       // font
  gxsHTML::gxs_form_tag,       // form
  gxsHTML::gxs_frame_tag,      // frame
  gxsHTML::gxs_frameset_tag,   // frame set
  gxsHTML::gxs_h1_tag,         // heading 1
  gxsHTML::gxs_h2_tag,         // heading 2
  gxsHTML::gxs_h3_tag,         // heading 3
  gxsHTML::gxs_h4_tag,         // heading 4
  gxsHTML::gxs_h5_tag,         // heading 5
  gxsHTML::gxs_h6_tag,         // heading 6
  gxsHTML::gxs_head_tag,       // head
  gxsHTML::gxs_hr_tag,         // horizontal rule
  gxsHTML::gxs_html_tag,       // html
  gxsHTML::gxs_i_tag,          // italic
  gxsHTML::gxs_iframe_tag,     // frame - floating
  gxsHTML::gxs_img_tag,        // inline image
  gxsHTML::gxs_input_tag,      // form input
  gxsHTML::gxs_ins_tag,        // inserted text
  gxsHTML::gxs_isindex_tag,    // is index
  gxsHTML::gxs_kbd_tag,        // keyboard
  gxsHTML::gxs_lang_tag,       // language
  gxsHTML::gxs_lh_tag,         // list heading
  gxsHTML::gxs_li_tag,         // list item
  gxsHTML::gxs_link_tag,       // link
  gxsHTML::gxs_listing_tag,    // listing
  gxsHTML::gxs_map_tag,        // map
  gxsHTML::gxs_marquee_tag,    // marquee
  gxsHTML::gxs_math_tag,       // math
  gxsHTML::gxs_menu_tag,       // menu list
  gxsHTML::gxs_meta_tag,       // meta
  gxsHTML::gxs_multicol_tag,   // multi column text
  gxsHTML::gxs_nobr_tag,       // no break
  gxsHTML::gxs_noframes_tag,   // no frames
  gxsHTML::gxs_note_tag,       // note
  gxsHTML::gxs_ol_tag,         // ordered list
  gxsHTML::gxs_overlay_tag,    // overlay
  gxsHTML::gxs_p_tag,          // paragraph
  gxsHTML::gxs_param_tag,      // parameters
  gxsHTML::gxs_person_tag,     // person
  gxsHTML::gxs_plaintext_tag,  // plain text
  gxsHTML::gxs_pre_tag,        // preformatted text
  gxsHTML::gxs_q_tag,          // quote
  gxsHTML::gxs_range_tag,      // range
  gxsHTML::gxs_samp_tag,       // sample
  gxsHTML::gxs_script_tag,     // script
  gxsHTML::gxs_select_tag,     // form select
  gxsHTML::gxs_small_tag,      // small text
  gxsHTML::gxs_spacer_tag,     // white space
  gxsHTML::gxs_spot_tag,       // spot
  gxsHTML::gxs_strike_tag,     // strikethrough
  gxsHTML::gxs_strong_tag,     // strong
  gxsHTML::gxs_sub_tag,        // subscript
  gxsHTML::gxs_sup_tag,        // superscript
  gxsHTML::gxs_tab_tag,        // horizontal tab
  gxsHTML::gxs_table_tag,      // table
  gxsHTML::gxs_tbody_tag,      // table body
  gxsHTML::gxs_td_tag,         // table data
  gxsHTML::gxs_textarea_tag,   // form text area
  gxsHTML::gxs_textflow_tag,   // java applet textflow
  gxsHTML::gxs_tfoot_tag,      // table footer
  gxsHTML::gxs_th_tag,         // table header
  gxsHTML::gxs_thead_tag,      // table head
  gxsHTML::gxs_title_tag,      // title
  gxsHTML::gxs_tr_tag,         // table row
  gxsHTML::gxs_tt_tag,         // teletype
  gxsHTML::gxs_u_tag,          // underlined
  gxsHTML::gxs_ul_tag,         // unordered list
  gxsHTML::gxs_var_tag,        // variable
  gxsHTML::gxs_wbr_tag,        // word break
  gxsHTML::gxs_xmp_tag         // example
};
Data structure used to store the file position of an HTML tag, the tag itself, its attributes and instructions.
// Record describing a single HTML tag: its position in the file,
// the tag text and attributes, and flags describing the tag
// instruction. The three flags are plain ints used as booleans.
struct gxsHTMLTagInfo {
  // File information
  df_StreamPos start_tag; // This tag's starting position in the file
  df_StreamPos end_tag;   // This tag's ending position in the file
  unsigned tag_length;    // The complete length of this tag "< ---- >"

  // Tag information
  int tag_id;        // Numerical value used to identify supported tags
  gxString tag_info; // Complete tag from opening to closing bracket
  gxString tag;      // HTML tag
  gxString attr;     // HTML tag attributes

  // Tag instructions
  int start_instruction; // True if start of tag instruction "<"
  int end_instruction;   // True if end of a tag instruction "/x>"
  int has_attributes;    // True if this tag has associated attributes
};
gxsHTML::gxsHTML()
gxsHTML::~gxsHTML()
gxsHTML::ClearTagList()
gxsHTML::CloseFile()
gxsHTML::CollectHTMLTags()
gxsHTML::Copy()
gxsHTML::GetTag()
gxsHTML::GetTagID()
gxsHTML::GetTagList()
gxsHTML::HandleHTMLTag()
gxsHTML::LoadHTMLFile()
gxsHTML::LoadMemoryBuffer()
gxsHTML::NumProcessed()
gxsHTML::NumTags()
gxsHTML::ParseHTMLTagInfo()
gxsHTML::ProcessHTMLTags()
gxsHTML::gxsHTML()
- Default class constructor.
gxsHTML::gxsHTML(const gxsHTML &ob)
- Class copy constructor.
gxsHTML::~gxsHTML()
- Class destructor.
gxsHTML::ClearTagList()
- Public member function used to clear the tag list.
gxsHTML::CloseFile()
- Public member function used to close the open HTML file after a load operation.
int gxsHTML::CollectHTMLTags()
- Internal processing function used to collect all the HTML tags in a previously opened file. Returns a zero if no file errors occur or a non-zero corresponding to DiskFileB error code if an error occurs.
int gxsHTML::CollectHTMLTags(const MemoryBuffer &membuf)
- Internal processing function used to collect all the HTML tags from a previously loaded MemoryBuffer object. Returns a zero if no errors occur or a non-zero to indicate a failure.
void gxsHTML::Copy(const gxsHTML &ob)
- Internal processing function used to copy gxsHTML objects.
char *gxsHTML::GetTag(int tag_id)
- Public member function that returns a null terminated string based on the value of the tag ID number. The "tag_id" variable must equal one of the integer constants defined in the tag ID enumeration.
int gxsHTML::GetTagID(const gxString &tag)
- Public member function that returns a numerical value defined in the tag ID enumeration that represents the specified tag.
gxDLList
void gxsHTML::HandleHTMLTag(int tag_id)
- Internal processing function used to execute the derived class version of a specific tag handler.
int gxsHTML::LoadHTMLFile(const char *fname)
- Public member function used to open the specified HTML file and process all the tags collected from the file. Returns a zero if no disk file errors occur or a non-zero corresponding to DiskFileB error code if an error occurs.
int gxsHTML::LoadMemoryBuffer(const MemoryBuffer &membuf)
- Public member function used to process all the tags stored in a MemoryBuffer object. Returns a zero if no errors occur or a non-zero to indicate a failure.
unsigned gxsHTML::NumProcessed()
- Public member function used to retrieve the total number of tags processed.
gxsHTML::NumTags()
- Public member function used to retrieve the total number of tags collected.
void gxsHTML::ParseHTMLTagInfo(gxsHTMLTagInfo &t)
- Public member function used to parse the specific tag information based on the string contained in the gxsHTMLTagInfo::tag_info member.
int gxsHTML::ProcessHTMLTags()
- Internal processing function used to read and process all the tags in a previously opened file. Returns a zero if no disk file errors occur or a non-zero corresponding to DiskFileB error code if an error occurs.
int gxsHTML::ProcessHTMLTags(const MemoryBuffer &membuf)
- Internal processing function used to read and process all the tags in a MemoryBuffer object. Returns a zero if no errors occur or a non-zero to indicate a failure.
Derived class interface used to process tags.
// Default tag handlers. Every handler below has an empty body;
// a derived class overrides the handler for each tag or special
// character sequence it needs to process.

void gxsHTML::Handle_INVALID_Tag() {
  // Override to handle INVALID tags
}

void gxsHTML::Handle_UNKNOWN_Tag() {
  // Override to handle UNKNOWN tags
}

void gxsHTML::Handle_UNKNOWN_SPECIAL_Tag() {
  // Override to handle unknown special tags starting
  // with an ampersand ending in a semicolon &xxxx;
}

void gxsHTML::Handle_COMMENT_Tag() {
  // Override to handle COMMENT tags
}

void gxsHTML::Handle_LESS_THEN_Tag() {
  // Override to handle a less than sign "<"
}

void gxsHTML::Handle_GREATER_THEN_Tag() {
  // Override to handle a greater than sign ">"
}

void gxsHTML::Handle_AMPERSAND_Tag() {
  // Override to handle an ampersand "&"
}

void gxsHTML::Handle_NB_SPACE_Tag() {
  // Override to handle a non-breaking space "&nbsp;"
}

void gxsHTML::Handle_QUOTE_Tag() {
  // Override to handle a quotation mark "&quot;"
}

void gxsHTML::Handle_EX_ASCII_Tag() {
  // Override to handle the extended ASCII character set "&#"
}

void gxsHTML::Handle_A_Tag() {
  // Override to handle ANCHOR tags
}

void gxsHTML::Handle_ABBREV_Tag() {
  // Override to handle ABBREVIATION tags
}

void gxsHTML::Handle_ACRONYM_Tag() {
  // Override to handle ACRONYM tags
}

void gxsHTML::Handle_ADDRESS_Tag() {
  // Override to handle ADDRESS tags
}

void gxsHTML::Handle_APPLET_Tag() {
  // Override to handle JAVA APPLET tags
}

void gxsHTML::Handle_AREA_Tag() {
  // Override to handle AREA tags
}

void gxsHTML::Handle_AU_Tag() {
  // Override to handle AUTHOR tags
}

void gxsHTML::Handle_AUTHOR_Tag() {
  // Override to handle AUTHOR tags
}

void gxsHTML::Handle_B_Tag() {
  // Override to handle BOLD tags
}

void gxsHTML::Handle_BANNER_Tag() {
  // Override to handle BANNER tags
}

void gxsHTML::Handle_BASE_Tag() {
  // Override to handle BASE tags
}

void gxsHTML::Handle_BASEFONT_Tag() {
  // Override to handle BASE FONT tags
}

void gxsHTML::Handle_BGSOUND_Tag() {
  // Override to handle BACKGROUND SOUND tags
}

void gxsHTML::Handle_BIG_Tag() {
  // Override to handle BIG text tags
}

void gxsHTML::Handle_BLINK_Tag() {
  // Override to handle BLINK tags
}

void gxsHTML::Handle_BLOCKQUOTE_Tag() {
  // Override to handle BLOCK QUOTE tags
}

void gxsHTML::Handle_BQ_Tag() {
  // Override to handle BLOCK QUOTE tags
}

void gxsHTML::Handle_BODY_Tag() {
  // Override to handle BODY tags
}

void gxsHTML::Handle_BR_Tag() {
  // Override to handle LINE BREAK tags
}

void gxsHTML::Handle_CAPTION_Tag() {
  // Override to handle CAPTION tags
}

void gxsHTML::Handle_CENTER_Tag() {
  // Override to handle CENTER tags
}

void gxsHTML::Handle_CITE_Tag() {
  // Override to handle CITATION tags
}

void gxsHTML::Handle_CODE_Tag() {
  // Override to handle CODE tags
}

void gxsHTML::Handle_COL_Tag() {
  // Override to handle TABLE column tags
}

void gxsHTML::Handle_COLGROUP_Tag() {
  // Override to handle TABLE column group tags
}

void gxsHTML::Handle_CREDIT_Tag() {
  // Override to handle CREDIT tags
}

void gxsHTML::Handle_DEL_Tag() {
  // Override to handle DELETED text tags
}

void gxsHTML::Handle_DFN_Tag() {
  // Override to handle DEFINITION tags
}

void gxsHTML::Handle_DIR_Tag() {
  // Override to handle DIRECTORY list tags
}

void gxsHTML::Handle_DIV_Tag() {
  // Override to handle DIVISION tags
}

void gxsHTML::Handle_DL_Tag() {
  // Override to handle DEFINITION list tags
}

void gxsHTML::Handle_DT_Tag() {
  // Override to handle DEFINITION term tags
}

void gxsHTML::Handle_DD_Tag() {
  // Override to handle DEFINITION definition tags
}

void gxsHTML::Handle_EM_Tag() {
  // Override to handle EMPHASIZED tags
}

void gxsHTML::Handle_EMBED_Tag() {
  // Override to handle EMBED tags
}

void gxsHTML::Handle_FIG_Tag() {
  // Override to handle FIGURE tags
}

void gxsHTML::Handle_FN_Tag() {
  // Override to handle FOOTNOTE tags
}

void gxsHTML::Handle_FONT_Tag() {
  // Override to handle FONT tags
}

void gxsHTML::Handle_FORM_Tag() {
  // Override to handle FORM tags
}

void gxsHTML::Handle_FRAME_Tag() {
  // Override to handle FRAME tags
}

void gxsHTML::Handle_FRAMESET_Tag() {
  // Override to handle FRAME SET tags
}

void gxsHTML::Handle_H1_Tag() {
  // Override to handle HEADING 1 tags
}

void gxsHTML::Handle_H2_Tag() {
  // Override to handle HEADING 2 tags
}

void gxsHTML::Handle_H3_Tag() {
  // Override to handle HEADING 3 tags
}

void gxsHTML::Handle_H4_Tag() {
  // Override to handle HEADING 4 tags
}

void gxsHTML::Handle_H5_Tag() {
  // Override to handle HEADING 5 tags
}

void gxsHTML::Handle_H6_Tag() {
  // Override to handle HEADING 6 tags
}

void gxsHTML::Handle_HEAD_Tag() {
  // Override to handle HEAD tags
}

void gxsHTML::Handle_HR_Tag() {
  // Override to handle HORIZONTAL RULE tags
}

void gxsHTML::Handle_HTML_Tag() {
  // Override to handle HTML tags
}

void gxsHTML::Handle_I_Tag() {
  // Override to handle ITALIC tags
}

void gxsHTML::Handle_IFRAME_Tag() {
  // Override to handle FRAME - Floating tags
}

void gxsHTML::Handle_IMG_Tag() {
  // Override to handle INLINE image tags
}

void gxsHTML::Handle_INPUT_Tag() {
  // Override to handle FORM input tags
}

void gxsHTML::Handle_INS_Tag() {
  // Override to handle INSERTED text tags
}

void gxsHTML::Handle_ISINDEX_Tag() {
  // Override to handle ISINDEX tags
}

void gxsHTML::Handle_KBD_Tag() {
  // Override to handle KEYBOARD tags
}

void gxsHTML::Handle_LANG_Tag() {
  // Override to handle LANGUAGE tags
}

void gxsHTML::Handle_LH_Tag() {
  // Override to handle LIST heading tags
}

void gxsHTML::Handle_LI_Tag() {
  // Override to handle LIST item tags
}

void gxsHTML::Handle_LINK_Tag() {
  // Override to handle LINK tags
}

void gxsHTML::Handle_LISTING_Tag() {
  // Override to handle LISTING tags
}

void gxsHTML::Handle_MAP_Tag() {
  // Override to handle MAP tags
}

void gxsHTML::Handle_MARQUEE_Tag() {
  // Override to handle MARQUEE tags
}

void gxsHTML::Handle_MATH_Tag() {
  // Override to handle MATH tags
}

void gxsHTML::Handle_MENU_Tag() {
  // Override to handle MENU list tags
}

void gxsHTML::Handle_META_Tag() {
  // Override to handle META tags
}

void gxsHTML::Handle_MULTICOL_Tag() {
  // Override to handle MULTI COLUMN tags
}

void gxsHTML::Handle_NOBR_Tag() {
  // Override to handle NO BREAK tags
}

void gxsHTML::Handle_NOFRAMES_Tag() {
  // Override to handle NO FRAMES tags
}

void gxsHTML::Handle_NOTE_Tag() {
  // Override to handle NOTE tags
}

void gxsHTML::Handle_OL_Tag() {
  // Override to handle ORDERED list tags
}

void gxsHTML::Handle_OVERLAY_Tag() {
  // Override to handle OVERLAY tags
}

void gxsHTML::Handle_P_Tag() {
  // Override to handle PARAGRAPH tags
}

void gxsHTML::Handle_PARAM_Tag() {
  // Override to handle PARAMETERS tags
}

void gxsHTML::Handle_PERSON_Tag() {
  // Override to handle PERSON tags
}

void gxsHTML::Handle_PLAINTEXT_Tag() {
  // Override to handle PLAIN text tags
}

void gxsHTML::Handle_PRE_Tag() {
  // Override to handle PREFORMATTED text tags
}

void gxsHTML::Handle_Q_Tag() {
  // Override to handle QUOTE tags
}

void gxsHTML::Handle_RANGE_Tag() {
  // Override to handle RANGE tags
}

void gxsHTML::Handle_SAMP_Tag() {
  // Override to handle SAMPLE tags
}

void gxsHTML::Handle_SCRIPT_Tag() {
  // Override to handle SCRIPT tags
}

void gxsHTML::Handle_SELECT_Tag() {
  // Override to handle FORM SELECT tags
}

void gxsHTML::Handle_SMALL_Tag() {
  // Override to handle SMALL text tags
}

void gxsHTML::Handle_SPACER_Tag() {
  // Override to handle WHITE SPACE tags
}

void gxsHTML::Handle_SPOT_Tag() {
  // Override to handle SPOT tags
}

void gxsHTML::Handle_STRIKE_Tag() {
  // Override to handle STRIKETHROUGH tags
}

void gxsHTML::Handle_STRONG_Tag() {
  // Override to handle STRONG tags
}

void gxsHTML::Handle_SUB_Tag() {
  // Override to handle SUBSCRIPT tags
}

void gxsHTML::Handle_SUP_Tag() {
  // Override to handle SUPERSCRIPT tags
}

void gxsHTML::Handle_TAB_Tag() {
  // Override to handle HORIZONTAL TAB tags
}

void gxsHTML::Handle_TABLE_Tag() {
  // Override to handle TABLE tags
}

void gxsHTML::Handle_TBODY_Tag() {
  // Override to handle TABLE body tags
}

void gxsHTML::Handle_TD_Tag() {
  // Override to handle TABLE data tags
}

void gxsHTML::Handle_TEXTAREA_Tag() {
  // Override to handle FORM text area tags
}

void gxsHTML::Handle_TEXTFLOW_Tag() {
  // Override to handle JAVA applet textflow tags
}

void gxsHTML::Handle_TFOOT_Tag() {
  // Override to handle TABLE footer tags
}

void gxsHTML::Handle_TH_Tag() {
  // Override to handle TABLE header tags
}

void gxsHTML::Handle_THEAD_Tag() {
  // Override to handle TABLE head tags
}

void gxsHTML::Handle_TITLE_Tag() {
  // Override to handle TITLE tags
}

void gxsHTML::Handle_TR_Tag() {
  // Override to handle TABLE row tags
}

void gxsHTML::Handle_TT_Tag() {
  // Override to handle TELETYPE tags
}

void gxsHTML::Handle_U_Tag() {
  // Override to handle UNDERLINED tags
}

void gxsHTML::Handle_UL_Tag() {
  // Override to handle UNORDERED list tags
}

void gxsHTML::Handle_VAR_Tag() {
  // Override to handle VARIABLE tags
}

void gxsHTML::Handle_WBR_Tag() {
  // Override to handle WORD BREAK tags
}

void gxsHTML::Handle_XMP_Tag() {
  // Override to handle EXAMPLE tags
}
End Of Document