/*		General SGML Parser code		SGML.c
**		========================
**
**	This module implements an HTStream object.  To parse an
**	SGML file, create this object which is a parser.  The object
**	is (currently) created by being passed a DTD structure,
**	and a target HTStructured object at which to throw the parsed stuff.
**
**	 6 Feb 93  Binary searches used.  Interface modified.
*/
#include "../config.h"
#include "SGML.h"

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include "HTUtils.h"
#include "HTChunk.h"
#include "../libnut/str-tools.h"

#ifndef DISABLE_TRACE
extern int www2Trace;
#endif

#define INVALID (-1)

/*	The State (context) of the parser
**
**	This is passed with each call to make the parser reentrant
**
*/

#define MAX_ATTRIBUTES 20	/* Max number of attributes per element */

/*	The <ctype.h> classifiers are only defined for values representable
**	as unsigned char (or EOF).  The parser is fed plain `char', which may
**	be negative for 8-bit input, so always convert first.
*/
#define IS_ALNUM(ch) (isalnum((unsigned char)(ch)))


/*		Element Stack
**		-------------
**	This allows us to return down the stack reselecting styles.
**	As we return, attribute values will be garbage in general.
*/
typedef struct _HTElement HTElement;
struct _HTElement {
    HTElement *	next;	/* Previously nested element or 0 */
    HTTag *	tag;	/* The tag at this level */
};


/*	Internal Context Data Structure
**	-------------------------------
*/
struct _HTStream {

    WWW_CONST HTStreamClass *	isa;		/* inherited from HTStream */

    WWW_CONST SGML_dtd		*dtd;
    HTStructuredClass		*actions;	/* target class */
    HTStructured		*target;	/* target object */

    HTTag			*current_tag;
    int				current_attribute_number;
    HTChunk			*string;	/* Accumulates the current token */
    HTElement			*element_stack;	/* Currently open elements */
    enum sgml_state { S_text, S_litteral, S_tag, S_tag_gap,
		S_attr, S_attr_gap, S_equals, S_value,
		S_ero, S_cro,
		S_squoted, S_dquoted, S_end, S_entity, S_junk_tag} state;
#ifdef CALLERDATA
    void *			callerData;
#endif
    BOOL present[MAX_ATTRIBUTES];	/* Flags: attribute is present? */
    char * value[MAX_ATTRIBUTES];	/* malloc'd strings or NULL if none */
} ;


#define PUTC(ch) ((*context->actions->put_character)(context->target, ch))


/*	Handle Attribute
**	----------------
**
**	Looks up the attribute name s among the attributes of the current
**	tag (binary search, case insensitive).  On success the attribute is
**	marked present, any previous value is discarded, and the attribute
**	becomes the current one.  Otherwise the current attribute number is
**	set to INVALID so that a following value is ignored.
*/
/* PUBLIC WWW_CONST char * SGML_default = "";   ?? */

PRIVATE void handle_attribute_name ARGS2(HTStream *, context, char *, s)
{
    HTTag * tag = context->current_tag;
    attr * attributes = tag->attributes;
    int low = 0;
    int high = tag->number_of_attributes;

    while (high > low) {
	int i = low + (high - low) / 2;
	int diff = my_strcasecmp(attributes[i].name, s);

	if (diff == 0) {			/* success: found it */
	    /* present[] and value[] only have MAX_ATTRIBUTES slots:
	    ** an attribute beyond that cannot be recorded.
	    */
	    if (i >= MAX_ATTRIBUTES)
		break;
	    context->current_attribute_number = i;
	    context->present[i] = YES;
	    if (context->value[i]) {
		free(context->value[i]);
		context->value[i] = NULL;
	    }
	    return;
	}
	if (diff < 0)
	    low = i + 1;
	else
	    high = i;
    }

#ifndef DISABLE_TRACE
    if (www2Trace)
	fprintf(stderr, "SGML: Unknown attribute %s for tag %s\n",
		s, context->current_tag->name);
#endif
    context->current_attribute_number = INVALID;	/* Invalid */
}


/*	Handle attribute value
**	----------------------
**
**	Stores a copy of s as the value of the current attribute, if there
**	is one, and then forgets the current attribute.
*/
PRIVATE void handle_attribute_value ARGS2(HTStream *, context, char *, s)
{
    if (context->current_attribute_number != INVALID) {
	StrAllocCopy(context->value[context->current_attribute_number], s);
    } else {
#ifndef DISABLE_TRACE
	if (www2Trace)
	    fprintf(stderr, "SGML: Attribute value %s ignored\n", s);
#endif
    }
    context->current_attribute_number = INVALID; /* can't have two assignments! */
}


/*	Handle entity
**	-------------
**
** On entry,
**	context->string contains the entity name, zero terminated
**	term	is the character which ended the name
** Bugs:
**	If the entity name is unknown, the terminator is treated as
**	a printable non-special character in all cases, even if it is '<'
*/
PRIVATE void handle_entity ARGS2(HTStream *, context, char, term)
{
    WWW_CONST char ** entities = context->dtd->entity_names;
    WWW_CONST char * s = context->string->data;
    WWW_CONST char * p;
    int low = 0;
    int high = context->dtd->number_of_entities;

    while (high > low) {			/* Binary search */
	int i = low + (high - low) / 2;
	int diff = strcmp(entities[i], s);	/* Case sensitive! */

	if (diff == 0) {			/* success: found it */
	    (*context->actions->put_entity)(context->target, i);
	    return;
	}
	if (diff < 0)
	    low = i + 1;
	else
	    high = i;
    }

    /* If entity string not found, display as text */
#ifndef DISABLE_TRACE
    if (www2Trace)
	fprintf(stderr, "SGML: Unknown entity %s\n", s);
#endif
    PUTC('&');
    for (p = s; *p; p++) {
	PUTC(*p);
    }
    PUTC(term);
}


/*	End element
**	-----------
**
**	Pops the element stack down to and including old_tag, reporting
**	each popped element to the target.  An end tag for an EMPTY element
**	is ignored, as is a mismatched end tag found at the last stack level.
*/
PRIVATE void end_element ARGS2(HTStream *, context, HTTag *, old_tag)
{
#ifndef DISABLE_TRACE
    if (www2Trace)
	fprintf(stderr, "SGML: End   </%s>\n", old_tag->name);
#endif
    if (old_tag->contents == SGML_EMPTY) {
#ifndef DISABLE_TRACE
	if (www2Trace)
	    fprintf(stderr,"SGML: Illegal end tag </%s> found.\n",
		    old_tag->name);
#endif
	return;
    }

    while (context->element_stack) {	/* Loop is error path only */
	HTElement * N = context->element_stack;
	HTTag * t = N->tag;

	if (old_tag != t) {		/* Mismatch: syntax error */
	    if (context->element_stack->next) {	/* This is not the last level */
#ifndef DISABLE_TRACE
		if (www2Trace)
		    fprintf(stderr,
			"SGML: Found </%s> when expecting </%s>. </%s> assumed.\n",
			old_tag->name, t->name, t->name);
#endif
	    } else {			/* last level */
#ifndef DISABLE_TRACE
		if (www2Trace)
		    fprintf(stderr,
			"SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n",
			old_tag->name, t->name, old_tag->name);
#endif
		return;			/* Ignore */
	    }
	}

	context->element_stack = N->next;	/* Remove from stack */
	free(N);
	(*context->actions->end_element)(context->target,
		 t - context->dtd->tags);
	if (old_tag == t)
	    return;			/* Correct sequence */

	/* Syntax error path only */
    }
#ifndef DISABLE_TRACE
    if (www2Trace)
	fprintf(stderr,
		"SGML: Extra end tag </%s> found and ignored.\n",
		old_tag->name);
#endif
}


/*	Start an element
**	----------------
**
**	Reports the current tag with its attributes to the target and,
**	unless the element is EMPTY, pushes it onto the element stack.
*/
PRIVATE void start_element ARGS1(HTStream *, context)
{
    HTTag * new_tag = context->current_tag;

#ifndef DISABLE_TRACE
    if (www2Trace)
	fprintf(stderr, "SGML: Start <%s>\n", new_tag->name);
#endif
    (*context->actions->start_element)(
	context->target,
	new_tag - context->dtd->tags,
	context->present,
	(WWW_CONST char**) context->value);  /* coerce type for think c */

    if (new_tag->contents != SGML_EMPTY) {	/* i.e. tag not empty */
	HTElement * N = (HTElement *)malloc(sizeof(HTElement));
	if (N == NULL)
	    outofmem(__FILE__, "start_element");
	N->next = context->element_stack;
	N->tag = new_tag;
	context->element_stack = N;
    }
}


/*	Finish a start tag
**	------------------
**
**	Called when the '>' closing a start tag is seen: starts the
**	element if the current tag has a name, and returns to text state.
*/
PRIVATE void close_start_tag ARGS1(HTStream *, context)
{
    if (context->current_tag->name)
	start_element(context);
    context->state = S_text;
}


/*		Find Tag in DTD tag list
**		------------------------
**
** On entry,
**	dtd	points to dtd structure including valid tag list
**	string	points to name of tag in question
**
** On exit,
**	returns:
**		NULL		tag not found
**		else		address of tag structure in dtd
*/
PRIVATE HTTag * find_tag ARGS2(WWW_CONST SGML_dtd*, dtd, char *, string)
{
    int low = 0;
    int high = dtd->number_of_tags;

    while (high > low) {			/* Binary search */
	int i = low + (high - low) / 2;
	int diff = my_strcasecmp(dtd->tags[i].name, string); /* Case insensitive */

	if (diff == 0) {			/* success: found it */
	    return &dtd->tags[i];
	}
	if (diff < 0)
	    low = i + 1;
	else
	    high = i;
    }
    return NULL;
}

/*________________________________________________________________________
**			Public Methods
*/

PUBLIC void SGML_end ARGS1(HTStream *, context)
{
    /*	Could check that we are back to bottom of stack! @@ */

    (*context->actions->end_document)(context->target);
}


/*	Free the parser
**	---------------
**
**	Frees the target, then everything the parser itself owns: the token
**	chunk, any elements still open on the stack, and the attribute
**	value strings.
*/
PUBLIC void SGML_free ARGS1(HTStream *, context)
{
    int i;
    HTElement * cur;

    (*context->actions->free)(context->target);
    HTChunkFree(context->string);

    /* Elements left open at end of document are still on the stack */
    cur = context->element_stack;
    while (cur) {
	HTElement * next = cur->next;
	free(cur);
	cur = next;
    }
    context->element_stack = NULL;

    /* Attribute values are kept between tags, so release them here */
    for (i = 0; i < MAX_ATTRIBUTES; i++) {
	if (context->value[i]) {
	    free(context->value[i]);
	    context->value[i] = NULL;
	}
    }

    free(context);
}

/*	Read and write user callback handle
**	-----------------------------------
**
**   The callbacks from the SGML parser have an SGML context parameter.
**   These calls allow the caller to associate his own context with a
**   particular SGML context.
*/

#ifdef CALLERDATA
PUBLIC void* SGML_callerData ARGS1(HTStream *, context)
{
    return context->callerData;
}

PUBLIC void SGML_setCallerData ARGS2(HTStream *, context, void*, data)
{
    context->callerData = data;
}
#endif


/*	Parse one character
**	-------------------
**
**	Feeds the character c to the state machine held in context.
**	Text is passed straight to the target; markup is accumulated in
**	context->string until a complete token has been seen.
*/
PUBLIC void SGML_character ARGS2(HTStream *, context, char,c)
{
    WWW_CONST SGML_dtd	*dtd	=	context->dtd;
    HTChunk	*string	=	context->string;

    switch(context->state) {
    case S_text:
	if (c=='&' && (!context->element_stack || (
			 context->element_stack->tag  &&
			 ( context->element_stack->tag->contents == SGML_MIXED
			   || context->element_stack->tag->contents ==
							 SGML_RCDATA)
			))) {
	    string->size = 0;
	    context->state = S_ero;

	} else if (c=='<') {
	    string->size = 0;
	    context->state = (context->element_stack &&
			context->element_stack->tag  &&
			context->element_stack->tag->contents == SGML_LITTERAL) ?
				S_litteral : S_tag;
	} else PUTC(c);
	break;

/*	In litteral mode, waits only for specific end tag!
**	Only for compatibility with old servers.
*/
    case S_litteral :
	HTChunkPutc(string, c);
	{
	    WWW_CONST char * name = context->element_stack->tag->name;
	    /* Character the end tag must have at this position: '/'
	    ** first, then the tag name, then its terminating zero.
	    */
	    char expected = (string->size == 1) ? '/'
						: name[string->size - 2];

	    /* A zero byte can never be part of the end tag; treating it
	    ** as a mismatch stops the index running past the name.
	    */
	    if (c == '\0' ||
		(char) TOUPPER((unsigned char) c) != expected) {
		int i;

		/*	If complete match, end litteral.  The size test
		**	keeps "<>" from being taken for an end tag.
		*/
		if ((c=='>') && string->size > 1 && expected == '\0') {
		    end_element(context, context->element_stack->tag);
		    string->size = 0;
		    context->current_attribute_number = INVALID;
		    context->state = S_text;
		    break;
		}		/* If Mismatch: recover string. */
		PUTC( '<');
		for (i=0; i<string->size; i++)	/* recover */
		    PUTC( string->data[i]);
		context->state = S_text;
	    }
	}
	break;

/*	Character reference or Entity
*/
    case S_ero:
	if (c=='#') {
	    context->state = S_cro;	/* &# is Char Ref Open */
	    break;
	}
	context->state = S_entity;	/* Fall through! */

/*	Handle Entities
*/
    case S_entity:
	if (IS_ALNUM(c))
	    HTChunkPutc(string, c);
	else {
	    HTChunkTerminate(string);
	    handle_entity(context, c);
	    context->state = S_text;
	}
	break;

/*	Character reference
*/
    case S_cro:
	if (IS_ALNUM(c))
	    HTChunkPutc(string, c);	/* accumulate a character NUMBER */
	else {
	    int value;
	    HTChunkTerminate(string);
	    if (sscanf(string->data, "%d", &value)==1)
		PUTC((char)value);
	    context->state = S_text;
	}
	break;

/*		Tag
*/
    case S_tag:				/* new tag */
	if (IS_ALNUM(c))
	    HTChunkPutc(string, c);
	else {				/* End of tag name */
	    HTTag * t;
	    if (c=='/') {
#ifndef DISABLE_TRACE
		/* The chunk is not terminated yet, so bound the print */
		if (www2Trace && string->size != 0)
		    fprintf(stderr,"SGML:  `<%.*s/' found!\n",
			    string->size, string->data);
#endif
		context->state = S_end;
		break;
	    }
	    HTChunkTerminate(string) ;

	    t = find_tag(dtd, string->data);
	    if (!t) {
#ifndef DISABLE_TRACE
		if(www2Trace)
		    fprintf(stderr, "SGML: *** Unknown element %s\n",
			    string->data);
#endif
		context->state = (c=='>') ? S_text : S_junk_tag;
		break;
	    }
	    context->current_tag = t;

	    /*  Clear out attributes
	    */
	    {
		int i;
		int n = context->current_tag->number_of_attributes;
		if (n > MAX_ATTRIBUTES)
		    n = MAX_ATTRIBUTES;	/* present[] has no more slots */
		for (i = 0; i < n; i++)
		    context->present[i] = NO;
	    }
	    string->size = 0;
	    context->current_attribute_number = INVALID;

	    if (c=='>') {
		close_start_tag(context);
	    } else {
		context->state = S_tag_gap;
	    }
	}
	break;


    case S_tag_gap:		/* Expecting attribute or > */
	if (WHITE(c)) break;	/* Gap between attributes */
	if (c=='>') {		/* End of tag */
	    close_start_tag(context);
	    break;
	}
	HTChunkPutc(string, c);
	context->state = S_attr;		/* Get attribute */
	break;

				/* accumulating attribute name */
    case S_attr:
	if (WHITE(c) || (c=='>') || (c=='=')) {		/* End of word */
	    HTChunkTerminate(string) ;
	    handle_attribute_name(context, string->data);
	    string->size = 0;
	    if (c=='>') {		/* End of tag */
		close_start_tag(context);
		break;
	    }
	    context->state = (c=='=' ?  S_equals: S_attr_gap);
	} else {
	    HTChunkPutc(string, c);
	}
	break;

    case S_attr_gap:		/* Expecting attribute or = or > */
	if (WHITE(c)) break;	/* Gap after attribute */
	if (c=='>') {		/* End of tag */
	    close_start_tag(context);
	    break;
	} else if (c=='=') {
	    context->state = S_equals;
	    break;
	}
	HTChunkPutc(string, c);
	context->state = S_attr;		/* Get next attribute */
	break;

    case S_equals:			/* After attr = */
	if (WHITE(c)) break;	/* Before attribute value */
	if (c=='>') {		/* End of tag */
#ifndef DISABLE_TRACE
	    if (www2Trace)
		fprintf(stderr, "SGML: found = but no value\n");
#endif
	    close_start_tag(context);
	    break;

	} else if (c=='\'') {
	    context->state = S_squoted;
	    break;

	} else if (c=='"') {
	    context->state = S_dquoted;
	    break;
	}
	HTChunkPutc(string, c);
	context->state = S_value;
	break;

    case S_value:
	if (WHITE(c) || (c=='>')) {		/* End of word */
	    HTChunkTerminate(string) ;
	    handle_attribute_value(context, string->data);
	    string->size = 0;
	    if (c=='>') {		/* End of tag */
		close_start_tag(context);
		break;
	    }
	    else context->state = S_tag_gap;
	} else {
	    HTChunkPutc(string, c);
	}
	break;

    case S_squoted:		/* Quoted attribute value */
	if (c=='\'') {		/* End of attribute value */
	    HTChunkTerminate(string) ;
	    handle_attribute_value(context, string->data);
	    string->size = 0;
	    context->state = S_tag_gap;
	} else {
	    HTChunkPutc(string, c);
	}
	break;

    case S_dquoted:		/* Quoted attribute value */
	if (c=='"') {		/* End of attribute value */
	    HTChunkTerminate(string) ;
	    handle_attribute_value(context, string->data);
	    string->size = 0;
	    context->state = S_tag_gap;
	} else {
	    HTChunkPutc(string, c);
	}
	break;

    case S_end:					/* </ */
	if (IS_ALNUM(c))
	    HTChunkPutc(string, c);
	else {				/* End of end tag name */
	    HTTag * t;
	    HTChunkTerminate(string) ;
	    if (!*string->data)	{	/* Empty end tag */
		/* "</>" closes the innermost open element, if any */
		t = context->element_stack ? context->element_stack->tag
					   : NULL;
	    } else {
		t = find_tag(dtd, string->data);
	    }
	    if (!t) {
#ifndef DISABLE_TRACE
		if(www2Trace)
		    fprintf(stderr, "Unknown end tag </%s>\n",
			    string->data);
#endif
	    } else {
		context->current_tag = t;
		end_element( context, context->current_tag);
	    }

	    string->size = 0;
	    context->current_attribute_number = INVALID;
	    if (c!='>') {
#ifndef DISABLE_TRACE
		if (www2Trace && !WHITE(c))
		    fprintf(stderr,"SGML:  `</%s%c' found!\n",
			    string->data, c);
#endif
		context->state = S_junk_tag;
	    } else {
		context->state = S_text;
	    }
	}
	break;


    case S_junk_tag:		/* Skip everything up to the closing > */
	if (c=='>') {
	    context->state = S_text;
	}
	break;

    } /* switch on context->state */

}  /* SGML_character */


/*	Parse a zero-terminated string
*/
PUBLIC void SGML_string ARGS2(HTStream *, context, WWW_CONST char*, str)
{
    WWW_CONST char *p;
    for(p=str; *p; p++)
	SGML_character(context, *p);
}


/*	Parse a buffer of l characters
*/
PUBLIC void SGML_write ARGS3(HTStream *, context, WWW_CONST char*, str, int, l)
{
    WWW_CONST char *p;
    WWW_CONST char *e = str+l;
    for(p=str; p<e; p++)
	SGML_character(context, *p);
}

/*_______________________________________________________________________
*/

/*	Nothing to do on interrupt: the parser holds no pending I/O.
*/
PRIVATE void SGML_handle_interrupt ARGS1(HTStream *, context)
{
}

/*	Structured Object Class
**	-----------------------
*/
PUBLIC WWW_CONST HTStreamClass SGMLParser =
{
	"SGMLParser",
	SGML_free,
	SGML_end,
	SGML_character, 	SGML_string,	SGML_write,
	SGML_handle_interrupt
};

/*	Create SGML Engine
**	------------------
**
** On entry,
**	dtd		represents the DTD, along with
**	target		is the sink for the data; its class supplies the
**			set of action routines.
**
** On exit,
**	returns a new parser in text state with an empty element stack
**	and no attributes recorded.
*/
PUBLIC HTStream* SGML_new ARGS2(
	WWW_CONST SGML_dtd *,	dtd,
	HTStructured *,		target)
{
    int i;
    HTStream* context = (HTStream *) malloc(sizeof(*context));
    if (!context)
	outofmem(__FILE__, "SGML_new");

    context->isa = &SGMLParser;
    context->string = HTChunkCreate(128);	/* Grow by this much */
    context->dtd = dtd;
    context->target = target;
    context->actions = (HTStructuredClass*)(((HTStream*)target)->isa);
						/* Ugh: no OO */
    context->state = S_text;
    context->element_stack = 0;			/* empty */
    context->current_tag = NULL;
    context->current_attribute_number = INVALID;
#ifdef CALLERDATA
    context->callerData = NULL;		/* set later by SGML_setCallerData */
#endif
    for (i = 0; i < MAX_ATTRIBUTES; i++) {
	context->present[i] = NO;
	context->value[i] = 0;
    }

    return context;
}