HTML Parser


Topics:

Overview
Enumerations
Data Structures
Functions
HTML Tag Handlers


Overview

The gxsHTML class is a base class used to parse HTML documents. The gxsHTML class works through multiple inheritance. It includes functions to load and parse HTML files. HTML tags are handled through virtual tag handlers: derived classes are responsible for processing HTML tags and any associated attributes by overriding the appropriate tag handler.


Enumerations

// The following list of HTML tags is a combination of HTML
// 2.0, 3.0, and 3.2 tags supported by Netscape's Navigator
// web browser, Microsoft's Internet Explorer web browser,
// and standards defined by the World Wide Web Consortium.
// This list was taken from Willcam's Comprehensive HTML
// Cross Reference at:
// http://www.willcam.com/cmat/html/crossref.html
//
// NOTE(review): the identifiers gxs_less_then, gxs_greater_then
// and gxs_ex_acsii_set look misspelled ("then" for "than",
// "acsii" for "ascii"). They are reproduced unchanged because
// renaming them would change the interface -- confirm against
// the class header.
enum { // HTML tags and modifiers ID enumeration
  gxsHTML::gxs_invalid_tag = 0, // Invalid tag specified
  gxsHTML::gxs_unknown_tag,     // Unknown tag specified
  gxsHTML::gxs_special_tag,     // Unknown special tag starting with
                                // an ampersand and ending in a
                                // semicolon: &xxxx;

    // Tags and format specifiers with special meaning
  gxsHTML::gxs_comment_tag,     // comment
  gxsHTML::gxs_less_then,       // Less-than sign "&lt;"
  gxsHTML::gxs_greater_then,    // Greater-than sign "&gt;"
  gxsHTML::gxs_ampersand,       // Ampersand "&amp;"
  gxsHTML::gxs_nb_space,        // Non-breaking space "&nbsp;"
  gxsHTML::gxs_quote,           // Quotation mark "&quot;"
  gxsHTML::gxs_ex_acsii_set,    // Extended ASCII character set "&#"

    // HTML tag codes
  gxsHTML::gxs_a_tag,           // anchor
  gxsHTML::gxs_abbrev_tag,      // abbreviation
  gxsHTML::gxs_acronym_tag,     // acronym
  gxsHTML::gxs_address_tag,     // address
  gxsHTML::gxs_applet_tag,      // java applet
  gxsHTML::gxs_area_tag,        // area
  gxsHTML::gxs_au_tag,          // author
  gxsHTML::gxs_author_tag,      // author
  gxsHTML::gxs_b_tag,           // bold
  gxsHTML::gxs_banner_tag,      // banner
  gxsHTML::gxs_base_tag,        // base
  gxsHTML::gxs_basefont_tag,    // base font
  gxsHTML::gxs_bgsound_tag,     // background sound
  gxsHTML::gxs_big_tag,         // big text
  gxsHTML::gxs_blink_tag,       // blink
  gxsHTML::gxs_blockquote_tag,  // block quote
  gxsHTML::gxs_bq_tag,          // block quote
  gxsHTML::gxs_body_tag,        // body
  gxsHTML::gxs_br_tag,          // line break
  gxsHTML::gxs_caption_tag,     // caption
  gxsHTML::gxs_center_tag,      // center
  gxsHTML::gxs_cite_tag,        // citation
  gxsHTML::gxs_code_tag,        // code
  gxsHTML::gxs_col_tag,         // table column
  gxsHTML::gxs_colgroup_tag,    // table column group
  gxsHTML::gxs_credit_tag,      // credit
  gxsHTML::gxs_del_tag,         // deleted text
  gxsHTML::gxs_dfn_tag,         // definition
  gxsHTML::gxs_dir_tag,         // directory list
  gxsHTML::gxs_div_tag,         // division
  gxsHTML::gxs_dl_tag,          // definition list
  gxsHTML::gxs_dt_tag,          // definition term
  gxsHTML::gxs_dd_tag,          // definition description
  gxsHTML::gxs_em_tag,          // emphasized
  gxsHTML::gxs_embed_tag,       // embed
  gxsHTML::gxs_fig_tag,         // figure
  gxsHTML::gxs_fn_tag,          // footnote
  gxsHTML::gxs_font_tag,        // font
  gxsHTML::gxs_form_tag,        // form
  gxsHTML::gxs_frame_tag,       // frame
  gxsHTML::gxs_frameset_tag,    // frame set
  gxsHTML::gxs_h1_tag,          // heading 1
  gxsHTML::gxs_h2_tag,          // heading 2
  gxsHTML::gxs_h3_tag,          // heading 3
  gxsHTML::gxs_h4_tag,          // heading 4
  gxsHTML::gxs_h5_tag,          // heading 5
  gxsHTML::gxs_h6_tag,          // heading 6
  gxsHTML::gxs_head_tag,        // head
  gxsHTML::gxs_hr_tag,          // horizontal rule
  gxsHTML::gxs_html_tag,        // html
  gxsHTML::gxs_i_tag,           // italic
  gxsHTML::gxs_iframe_tag,      // floating frame
  gxsHTML::gxs_img_tag,         // inline image
  gxsHTML::gxs_input_tag,       // form input
  gxsHTML::gxs_ins_tag,         // inserted text
  gxsHTML::gxs_isindex_tag,     // is index
  gxsHTML::gxs_kbd_tag,         // keyboard
  gxsHTML::gxs_lang_tag,        // language
  gxsHTML::gxs_lh_tag,          // list heading
  gxsHTML::gxs_li_tag,          // list item
  gxsHTML::gxs_link_tag,        // link
  gxsHTML::gxs_listing_tag,     // listing
  gxsHTML::gxs_map_tag,         // map
  gxsHTML::gxs_marquee_tag,     // marquee
  gxsHTML::gxs_math_tag,        // math
  gxsHTML::gxs_menu_tag,        // menu list
  gxsHTML::gxs_meta_tag,        // meta
  gxsHTML::gxs_multicol_tag,    // multi column text
  gxsHTML::gxs_nobr_tag,        // no break
  gxsHTML::gxs_noframes_tag,    // no frames
  gxsHTML::gxs_note_tag,        // note
  gxsHTML::gxs_ol_tag,          // ordered list
  gxsHTML::gxs_overlay_tag,     // overlay
  gxsHTML::gxs_p_tag,           // paragraph
  gxsHTML::gxs_param_tag,       // parameters
  gxsHTML::gxs_person_tag,      // person
  gxsHTML::gxs_plaintext_tag,   // plain text
  gxsHTML::gxs_pre_tag,         // preformatted text
  gxsHTML::gxs_q_tag,           // quote
  gxsHTML::gxs_range_tag,       // range
  gxsHTML::gxs_samp_tag,        // sample
  gxsHTML::gxs_script_tag,      // script
  gxsHTML::gxs_select_tag,      // form select
  gxsHTML::gxs_small_tag,       // small text
  gxsHTML::gxs_spacer_tag,      // white space
  gxsHTML::gxs_spot_tag,        // spot
  gxsHTML::gxs_strike_tag,      // strikethrough
  gxsHTML::gxs_strong_tag,      // strong
  gxsHTML::gxs_sub_tag,         // subscript
  gxsHTML::gxs_sup_tag,         // superscript
  gxsHTML::gxs_tab_tag,         // horizontal tab
  gxsHTML::gxs_table_tag,       // table
  gxsHTML::gxs_tbody_tag,       // table body
  gxsHTML::gxs_td_tag,          // table data
  gxsHTML::gxs_textarea_tag,    // form text area
  gxsHTML::gxs_textflow_tag,    // java applet textflow
  gxsHTML::gxs_tfoot_tag,       // table footer
  gxsHTML::gxs_th_tag,          // table header
  gxsHTML::gxs_thead_tag,       // table head
  gxsHTML::gxs_title_tag,       // title
  gxsHTML::gxs_tr_tag,          // table row
  gxsHTML::gxs_tt_tag,          // teletype
  gxsHTML::gxs_u_tag,           // underlined
  gxsHTML::gxs_ul_tag,          // unordered list
  gxsHTML::gxs_var_tag,         // variable
  gxsHTML::gxs_wbr_tag,         // word break
  gxsHTML::gxs_xmp_tag          // example
};


Data Structures

Data structure used to store the file position of an HTML tag, the tag itself, its attributes, and its instructions.

// Record describing a single HTML tag: where the tag sits in the
// file, the text of the tag, and flags describing the tag's form.
struct gxsHTMLTagInfo
{
  // File information
  df_StreamPos start_tag; // This tag's starting position in the file
  df_StreamPos end_tag;   // This tag's ending position in the file
  unsigned tag_length;    // Complete length of this tag, from the
                          // opening "<" through the closing ">"
  
  // Tag information
  int tag_id;        // Numerical value used to identify supported tags
                     // NOTE(review): presumably one of the tag ID
                     // enumeration constants -- confirm
  gxString tag_info; // Complete tag from opening to closing bracket
  gxString tag;      // HTML tag
  gxString attr;     // HTML tag attributes 

  // Tag instructions (ints used as true/false flags)
  int start_instruction; // True if start of tag instruction "<"
  int end_instruction;   // True if end of a tag instruction "/x>"
                         // NOTE(review): "/x>" may have lost a leading
                         // "<" in conversion -- confirm
  int has_attributes;    // True if this tag has associated attributes
}; 


Functions

gxsHTML::gxsHTML()
gxsHTML::~gxsHTML()
gxsHTML::ClearTagList()
gxsHTML::CloseFile()
gxsHTML::CollectHTMLTags()
gxsHTML::Copy()
gxsHTML::GetTag()
gxsHTML::GetTagID()
gxsHTML::GetTagList()
gxsHTML::HandleHTMLTag()
gxsHTML::LoadHTMLFile()
gxsHTML::LoadMemoryBuffer()
gxsHTML::NumProcessed()
gxsHTML::NumTags()
gxsHTML::ParseHTMLTagInfo()
gxsHTML::ProcessHTMLTags()

gxsHTML::gxsHTML() - Default class constructor.

gxsHTML::gxsHTML(const gxsHTML &ob) - Class copy constructor.

virtual gxsHTML::~gxsHTML() - Class destructor.

void gxsHTML::ClearTagList() - Public member function used to clear the tag list.

void gxsHTML::CloseFile() - Public member function used to close the open HTML file after a load operation.

int gxsHTML::CollectHTMLTags() - Internal processing function used to collect all the HTML tags in a previously opened file. Returns zero if no file errors occur, or a non-zero value corresponding to a DiskFileB error code if an error occurs.

int gxsHTML::CollectHTMLTags(const MemoryBuffer &membuf) - Internal processing function used to collect all the HTML tags from a previously loaded MemoryBuffer object. Returns zero if no errors occur, or a non-zero value to indicate a failure.

void gxsHTML::Copy(const gxsHTML &ob) - Internal processing function used to copy gxsHTML objects.

char *gxsHTML::GetTag(int tag_id) - Public member function that returns a null terminated string based on the value of the tag ID number. The "tag_id" variable must equal one of the integer constants defined in the tag ID enumeration.

int gxsHTML::GetTagID(const gxString &tag) - Public member function that returns a numerical value defined in the tag ID enumeration that represents the specified tag.

gxDLList *gxsHTML::GetTagList() - Public member function that returns a pointer to the tag list.

void gxsHTML::HandleHTMLTag(int tag_id) - Internal processing function used to execute the derived class version of a specific tag handler.

int gxsHTML::LoadHTMLFile(const char *fname) - Public member function used to open the specified HTML file and process all the tags collected from the file. Returns zero if no disk file errors occur, or a non-zero value corresponding to a DiskFileB error code if an error occurs.

int gxsHTML::LoadMemoryBuffer(const MemoryBuffer &membuf) - Public member function used to process all the tags stored in a MemoryBuffer object. Returns zero if no errors occur, or a non-zero value to indicate a failure.

unsigned gxsHTML::NumProcessed() - Public member function used to retrieve the total number of tags processed.

unsigned gxsHTML::NumTags() - Public member function used to retrieve the total number of tags collected.

void gxsHTML::ParseHTMLTagInfo(gxsHTMLTagInfo &t) - Public member function used to parse the specific tag information based on the string contained in the gxsHTMLTagInfo::tag_info member.

int gxsHTML::ProcessHTMLTags() - Internal processing function used to read and process all the tags in a previously opened file. Returns zero if no disk file errors occur, or a non-zero value corresponding to a DiskFileB error code if an error occurs.

int gxsHTML::ProcessHTMLTags(const MemoryBuffer &membuf) - Internal processing function used to read and process all the tags in a MemoryBuffer object. Returns zero if no errors occur, or a non-zero value to indicate a failure.


HTML Tag Handlers

Derived class interface used to process tags.

void gxsHTML::Handle_INVALID_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle INVALID tags.
}

void gxsHTML::Handle_UNKNOWN_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle UNKNOWN tags.
}

void gxsHTML::Handle_UNKNOWN_SPECIAL_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle unknown special tags, which start with an ampersand
  // and end in a semicolon: &xxxx;
}

void gxsHTML::Handle_COMMENT_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle COMMENT tags.
}

void gxsHTML::Handle_LESS_THEN_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle a less-than sign "&lt;".
}

void gxsHTML::Handle_GREATER_THEN_Tag() 
{
  // No-op in the base class. Override in a derived class to
  // handle a greater-than sign "&gt;".
}

void gxsHTML::Handle_AMPERSAND_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle an ampersand "&amp;".
}

void gxsHTML::Handle_NB_SPACE_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle a non-breaking space "&nbsp;".
}

void gxsHTML::Handle_QUOTE_Tag() 
{
  // No-op in the base class. Override in a derived class to
  // handle a quotation mark "&quot;".
}

void gxsHTML::Handle_EX_ASCII_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle the extended ASCII character set "&#".
}

void gxsHTML::Handle_A_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle ANCHOR tags.
}

void gxsHTML::Handle_ABBREV_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle ABBREVIATION tags.
}

void gxsHTML::Handle_ACRONYM_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle ACRONYM tags.
}

void gxsHTML::Handle_ADDRESS_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle ADDRESS tags.
}

void gxsHTML::Handle_APPLET_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle JAVA APPLET tags.
}

void gxsHTML::Handle_AREA_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle AREA tags.
}

void gxsHTML::Handle_AU_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle AUTHOR tags (the AU form).
}

void gxsHTML::Handle_AUTHOR_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle AUTHOR tags.
}

void gxsHTML::Handle_B_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle BOLD tags.
}

void gxsHTML::Handle_BANNER_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle BANNER tags.
}

void gxsHTML::Handle_BASE_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle BASE tags.
}

void gxsHTML::Handle_BASEFONT_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle BASE FONT tags.
}

void gxsHTML::Handle_BGSOUND_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle BACKGROUND SOUND tags.
}

void gxsHTML::Handle_BIG_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle BIG text tags.
}

void gxsHTML::Handle_BLINK_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle BLINK tags.
}

void gxsHTML::Handle_BLOCKQUOTE_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle BLOCK QUOTE tags.
}

void gxsHTML::Handle_BQ_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle BLOCK QUOTE tags (the BQ form).
}

void gxsHTML::Handle_BODY_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle BODY tags.
}

void gxsHTML::Handle_BR_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle LINE BREAK tags.
}

void gxsHTML::Handle_CAPTION_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle CAPTION tags.
}

void gxsHTML::Handle_CENTER_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle CENTER tags.
}

void gxsHTML::Handle_CITE_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle CITATION tags.
}

void gxsHTML::Handle_CODE_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle CODE tags.
}

void gxsHTML::Handle_COL_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle TABLE column tags.
}

void gxsHTML::Handle_COLGROUP_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle TABLE column group tags.
}

void gxsHTML::Handle_CREDIT_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle CREDIT tags.
}

void gxsHTML::Handle_DEL_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle DELETED text tags.
}

void gxsHTML::Handle_DFN_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle DEFINITION tags.
}

void gxsHTML::Handle_DIR_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle DIRECTORY list tags.
}

void gxsHTML::Handle_DIV_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle DIVISION tags.
}

void gxsHTML::Handle_DL_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle DEFINITION list tags.
}

void gxsHTML::Handle_DT_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle DEFINITION term tags.
}

void gxsHTML::Handle_DD_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle DEFINITION description tags.
}

void gxsHTML::Handle_EM_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle EMPHASIZED tags.
}

void gxsHTML::Handle_EMBED_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle EMBED tags.
}

void gxsHTML::Handle_FIG_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle FIGURE tags.
}

void gxsHTML::Handle_FN_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle FOOTNOTE tags.
}

void gxsHTML::Handle_FONT_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle FONT tags.
}

void gxsHTML::Handle_FORM_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle FORM tags.
}

void gxsHTML::Handle_FRAME_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle FRAME tags.
}

void gxsHTML::Handle_FRAMESET_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle FRAME set tags.
}

void gxsHTML::Handle_H1_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle HEADING 1 tags.
}

void gxsHTML::Handle_H2_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle HEADING 2 tags.
}

void gxsHTML::Handle_H3_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle HEADING 3 tags.
}

void gxsHTML::Handle_H4_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle HEADING 4 tags.
}

void gxsHTML::Handle_H5_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle HEADING 5 tags.
}

void gxsHTML::Handle_H6_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle HEADING 6 tags.
}

void gxsHTML::Handle_HEAD_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle HEAD tags.
}

void gxsHTML::Handle_HR_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle HORIZONTAL rule tags.
}

void gxsHTML::Handle_HTML_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle HTML tags.
}

void gxsHTML::Handle_I_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle ITALIC tags.
}

void gxsHTML::Handle_IFRAME_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle floating FRAME tags.
}

void gxsHTML::Handle_IMG_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle INLINE image tags.
}

void gxsHTML::Handle_INPUT_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle FORM input tags.
}

void gxsHTML::Handle_INS_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle INSERTED text tags.
}

void gxsHTML::Handle_ISINDEX_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle ISINDEX tags.
}

void gxsHTML::Handle_KBD_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle KEYBOARD tags.
}

void gxsHTML::Handle_LANG_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle LANGUAGE tags.
}

void gxsHTML::Handle_LH_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle LIST header tags.
}

void gxsHTML::Handle_LI_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle LIST item tags.
}

void gxsHTML::Handle_LINK_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle LINK tags.
}

void gxsHTML::Handle_LISTING_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle LISTING tags.
}

void gxsHTML::Handle_MAP_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle MAP tags.
}

void gxsHTML::Handle_MARQUEE_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle MARQUEE tags.
}

void gxsHTML::Handle_MATH_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle MATH tags.
}

void gxsHTML::Handle_MENU_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle MENU list tags.
}

void gxsHTML::Handle_META_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle META tags.
}

void gxsHTML::Handle_MULTICOL_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle MULTI COLUMN tags.
}

void gxsHTML::Handle_NOBR_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle NO BREAK tags.
}

void gxsHTML::Handle_NOFRAMES_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle NO FRAMES tags.
}

void gxsHTML::Handle_NOTE_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle NOTE tags.
}

void gxsHTML::Handle_OL_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle ORDERED list tags.
}

void gxsHTML::Handle_OVERLAY_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle OVERLAY tags.
}

void gxsHTML::Handle_P_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle PARAGRAPH tags.
}

void gxsHTML::Handle_PARAM_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle PARAMETERS tags.
}

void gxsHTML::Handle_PERSON_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle PERSON tags.
}

void gxsHTML::Handle_PLAINTEXT_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle PLAIN text tags.
}

void gxsHTML::Handle_PRE_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle PREFORMATTED text tags.
}

void gxsHTML::Handle_Q_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle QUOTE tags.
}

void gxsHTML::Handle_RANGE_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle RANGE tags.
}

void gxsHTML::Handle_SAMP_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle SAMPLE tags.
}

void gxsHTML::Handle_SCRIPT_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle SCRIPT tags.
}

void gxsHTML::Handle_SELECT_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle FORM SELECT tags.
}

void gxsHTML::Handle_SMALL_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle SMALL text tags.
}

void gxsHTML::Handle_SPACER_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle WHITE SPACE tags.
}

void gxsHTML::Handle_SPOT_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle SPOT tags.
}

void gxsHTML::Handle_STRIKE_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle STRIKETHROUGH tags.
}

void gxsHTML::Handle_STRONG_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle STRONG tags.
}

void gxsHTML::Handle_SUB_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle SUBSCRIPT tags.
}

void gxsHTML::Handle_SUP_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle SUPERSCRIPT tags.
}

void gxsHTML::Handle_TAB_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle HORIZONTAL TAB tags.
}

void gxsHTML::Handle_TABLE_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle TABLE tags.
}

void gxsHTML::Handle_TBODY_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle TABLE body tags.
}

void gxsHTML::Handle_TD_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle TABLE data tags.
}

void gxsHTML::Handle_TEXTAREA_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle FORM text area tags.
}

void gxsHTML::Handle_TEXTFLOW_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle JAVA applet textflow tags.
}

void gxsHTML::Handle_TFOOT_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle TABLE footer tags.
}

void gxsHTML::Handle_TH_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle TABLE header tags.
}

void gxsHTML::Handle_THEAD_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle TABLE head tags.
}

void gxsHTML::Handle_TITLE_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle TITLE tags.
}

void gxsHTML::Handle_TR_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle TABLE row tags.
}

void gxsHTML::Handle_TT_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle TELETYPE tags.
}

void gxsHTML::Handle_U_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle UNDERLINED tags.
}

void gxsHTML::Handle_UL_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle UNORDERED list tags.
}

void gxsHTML::Handle_VAR_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle VARIABLE tags.
}

void gxsHTML::Handle_WBR_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle WORD BREAK tags.
}

void gxsHTML::Handle_XMP_Tag()
{
  // No-op in the base class. Override in a derived class to
  // handle EXAMPLE tags.
}


End Of Document