ncsa-mosaic/libwww2/SGML.c

700 lines
17 KiB
C
Raw Permalink Normal View History

2010-03-08 04:55:21 -06:00
/* General SGML Parser code SGML.c
** ========================
**
** This module implements an HTStream object. To parse an
** SGML file, create this object which is a parser. The object
** is (currently) created by being passed a DTD structure,
** and a target HTStructured object at which to throw the parsed stuff.
**
** 6 Feb 93 Binary searches used. Interface modified.
*/
#include "../config.h"
#include "SGML.h"
#include <ctype.h>
#include <stdio.h>
#include <string.h>
2010-03-08 04:55:21 -06:00
#include "HTUtils.h"
#include "HTChunk.h"
#include "../libnut/str-tools.h"
/* Trace flag defined in another translation unit; every use in this file
** is compiled out when DISABLE_TRACE is defined. */
#ifndef DISABLE_TRACE
extern int www2Trace;
#endif
/* Value stored in current_attribute_number when no attribute is selected. */
#define INVALID (-1)
/* The State (context) of the parser
**
** All parser state lives in the HTStream context, which is passed with
** each call so that the parser itself keeps no static state.
** MAX_ATTRIBUTES sizes the per-tag present[] and value[] arrays held
** in that context.
*/
#define MAX_ATTRIBUTES 20 /* Max number of attributes per element */
2010-03-08 04:55:21 -06:00
/* Element Stack
** -------------
** One node per currently open (non-empty) element, kept as a singly
** linked list whose head is the innermost element.
** This allows us to return down the stack reselecting styles.
** As we return, attribute values will be garbage in general.
*/
typedef struct _HTElement HTElement;
struct _HTElement {
HTElement * next; /* Previously nested element or 0 */
HTTag* tag; /* The tag at this level */
};
/* Internal Context Data Structure
** -------------------------------
** Everything the parser needs to remember between two calls of
** SGML_character(): where output goes, the tag being assembled, the
** scratch buffer, the stack of open elements and the lexer state.
*/
struct _HTStream {
    WWW_CONST HTStreamClass *isa;       /* inherited from HTStream */

    WWW_CONST SGML_dtd *dtd;            /* DTD used to look up tags and entities */
    HTStructuredClass *actions;         /* target class */
    HTStructured *target;               /* target object */

    HTTag *current_tag;                 /* tag whose attributes are being read */
    int current_attribute_number;       /* index into present[]/value[], or INVALID */
    HTChunk *string;                    /* scratch buffer for names and values */
    HTElement *element_stack;           /* innermost open element, or 0 */

    enum sgml_state {
        S_text,         /* ordinary character data */
        S_litteral,     /* inside a literal element, watching for its end tag */
        S_tag,          /* after '<', collecting the tag name */
        S_tag_gap,      /* inside a tag, expecting an attribute or '>' */
        S_attr,         /* collecting an attribute name */
        S_attr_gap,     /* after an attribute name, expecting '=', name or '>' */
        S_equals,       /* after '=', expecting a value */
        S_value,        /* collecting an unquoted value */
        S_ero,          /* after '&' */
        S_cro,          /* after "&#", collecting a character number */
        S_squoted,      /* collecting a single-quoted value */
        S_dquoted,      /* collecting a double-quoted value */
        S_end,          /* after "</", collecting the end tag name */
        S_entity,       /* collecting an entity name */
        S_junk_tag      /* discarding input up to the next '>' */
    } state;

#ifdef CALLERDATA
    void *callerData;                   /* opaque handle owned by the caller */
#endif

    BOOL present[MAX_ATTRIBUTES];       /* Flags: attribute is present? */
    char *value[MAX_ATTRIBUTES];        /* malloc'd strings or NULL if none */
};
/* Send one character to the target through its put_character method.
** Expands to an expression that uses a variable named `context`, so it
** can only be used where such an HTStream pointer is in scope. */
#define PUTC(ch) ((*context->actions->put_character)(context->target, ch))
/* Handle Attribute
** ----------------
**
** Looks up attribute name `s` (case-insensitively, by binary search) in
** the attribute table of context->current_tag.
**
** On a match the attribute index is stored in
** context->current_attribute_number, the attribute is flagged present
** and any value left over from an earlier use is released.
** With no match, current_attribute_number is set to INVALID.
*/
#ifdef __STDC__
PRIVATE void handle_attribute_name(HTStream * context, char * s)
#else
PRIVATE void handle_attribute_name(context, s)
    HTStream * context;
    char *s;
#endif
{
    HTTag *tag = context->current_tag;
    attr *table = tag->attributes;
    int lo = 0;
    int hi = tag->number_of_attributes;

    /* Half-open interval [lo, hi) still to be searched. */
    while (lo < hi) {
        int mid = lo + (hi - lo) / 2;
        int order = my_strcasecmp(table[mid].name, s);

        if (order == 0) {
            context->current_attribute_number = mid;
            context->present[mid] = YES;
            if (context->value[mid]) {
                free(context->value[mid]);
                context->value[mid] = NULL;
            }
            return;
        }
        if (order < 0)
            lo = mid + 1;
        else
            hi = mid;
    }

#ifndef DISABLE_TRACE
    if (www2Trace)
        fprintf(stderr, "SGML: Unknown attribute %s for tag %s\n",
                s, tag->name);
#endif
    context->current_attribute_number = INVALID;
}
/* Handle attribute value
** ----------------------
**
** Stores a copy of `s` as the value of the attribute most recently
** selected by handle_attribute_name(). When no attribute is selected
** the value is dropped (and reported when tracing). In both cases the
** selection is cleared afterwards, so one name takes at most one value.
*/
#ifdef __STDC__
PRIVATE void handle_attribute_value(HTStream * context, char * s)
#else
PRIVATE void handle_attribute_value(context, s)
    HTStream * context;
    char *s;
#endif
{
    int slot = context->current_attribute_number;

    if (slot == INVALID) {
#ifndef DISABLE_TRACE
        if (www2Trace) fprintf(stderr, "SGML: Attribute value %s ignored\n", s);
#endif
    } else {
        StrAllocCopy(context->value[slot], s);
    }
    context->current_attribute_number = INVALID;
}
/* Handle entity
** -------------
**
** On entry,
**	context->string->data	holds the entity name, zero terminated
**	term			is the character that ended the name
**
** The name is looked up (case-sensitively, by binary search) in the
** DTD's entity table. A known entity is reported to the target by
** index through put_entity. An unknown one is passed through as text:
** '&', the name, then the terminator.
**
** Bugs:
**	If the entity name is unknown, the terminator is treated as
**	a printable non-special character in all cases, even if it is '<'
*/
#ifdef __STDC__
PRIVATE void handle_entity(HTStream * context, char term)
#else
PRIVATE void handle_entity(context, term)
    HTStream * context;
    char term;
#endif
{
    WWW_CONST char **names = context->dtd->entity_names;
    WWW_CONST char *name = context->string->data;
    WWW_CONST char *cursor;
    int lo = 0;
    int hi = context->dtd->number_of_entities;

    /* Half-open interval [lo, hi) still to be searched. */
    while (lo < hi) {
        int mid = lo + (hi - lo) / 2;
        int order = strcmp(names[mid], name);   /* Case sensitive! */

        if (order == 0) {
            (*context->actions->put_entity)(context->target, mid);
            return;
        }
        if (order < 0)
            lo = mid + 1;
        else
            hi = mid;
    }

    /* Not in the table: emit what was read as plain text. */
#ifndef DISABLE_TRACE
    if (www2Trace)
        fprintf(stderr, "SGML: Unknown entity %s\n", name);
#endif
    PUTC('&');
    cursor = name;
    while (*cursor) {
        PUTC(*cursor);
        cursor++;
    }
    PUTC(term);
}
/* End element
** -----------
**
** Processes an end tag for `old_tag`.
**
** An end tag for an element declared SGML_EMPTY is ignored. Otherwise
** elements are popped from the stack, and reported to the target via
** end_element, until the one matching `old_tag` has been closed.
** If the only element left on the stack does not match, the end tag is
** ignored and that element stays open.
*/
#ifdef __STDC__
PRIVATE void end_element(HTStream * context, HTTag * old_tag)
#else
PRIVATE void end_element(context, old_tag)
    HTTag * old_tag;
    HTStream * context;
#endif
{
#ifndef DISABLE_TRACE
    if (www2Trace) fprintf(stderr, "SGML: End </%s>\n", old_tag->name);
#endif
    if (old_tag->contents == SGML_EMPTY) {
#ifndef DISABLE_TRACE
        if (www2Trace) fprintf(stderr,"SGML: Illegal end tag </%s> found.\n",
                               old_tag->name);
#endif
        return;
    }

    /* More than one iteration happens only on a nesting error. */
    for (;;) {
        HTElement *top = context->element_stack;
        HTTag *open_tag;

        if (top == NULL)
            break;
        open_tag = top->tag;

        if (open_tag != old_tag) {              /* Mismatch: syntax error */
            if (top->next == NULL) {            /* last level */
#ifndef DISABLE_TRACE
                if (www2Trace) fprintf(stderr,
                    "SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n",
                    old_tag->name, open_tag->name, old_tag->name);
#endif
                return;                         /* Ignore */
            }
#ifndef DISABLE_TRACE
            if (www2Trace) fprintf(stderr,
                "SGML: Found </%s> when expecting </%s>. </%s> assumed.\n",
                old_tag->name, open_tag->name, open_tag->name);
#endif
        }

        /* Pop the element and tell the target it has ended. */
        context->element_stack = top->next;
        free(top);
        (*context->actions->end_element)(context->target,
                                         open_tag - context->dtd->tags);

        if (open_tag == old_tag)
            return;                             /* Correct sequence */
    }

#ifndef DISABLE_TRACE
    if (www2Trace) fprintf(stderr,
        "SGML: Extra end tag </%s> found and ignored.\n", old_tag->name);
#endif
}
/* Start an element
** ----------------
**
** Reports context->current_tag to the target, together with the
** present[] flags and value[] strings collected for it, and pushes the
** tag on the element stack unless it is declared SGML_EMPTY.
*/
#ifdef __STDC__
PRIVATE void start_element(HTStream * context)
#else
PRIVATE void start_element(context)
    HTStream * context;
#endif
{
    HTTag *opened = context->current_tag;
    HTElement *node;

#ifndef DISABLE_TRACE
    if (www2Trace) fprintf(stderr, "SGML: Start <%s>\n", opened->name);
#endif
    (*context->actions->start_element)(
        context->target,
        opened - context->dtd->tags,
        context->present,
        (WWW_CONST char**) context->value);  /* coerce type for think c */

    /* Empty elements have no end tag, so they are never stacked. */
    if (opened->contents == SGML_EMPTY)
        return;

    node = (HTElement *)malloc(sizeof(HTElement));
    if (node == NULL) outofmem(__FILE__, "start_element");
    node->next = context->element_stack;
    node->tag = opened;
    context->element_stack = node;
}
/* Find Tag in DTD tag list
** ------------------------
**
** Case-insensitive binary search of the DTD's tag table.
**
** On entry,
**	dtd	points to dtd structure including valid tag list
**	string	points to name of tag in question
**
** On exit,
**	returns:
**		NULL		tag not found
**		else		address of tag structure in dtd
*/
PRIVATE HTTag * find_tag ARGS2(WWW_CONST SGML_dtd*, dtd, char *, string)
{
    int lo = 0;
    int hi = dtd->number_of_tags;

    /* Half-open interval [lo, hi) still to be searched. */
    while (lo < hi) {
        int mid = lo + (hi - lo) / 2;
        int order = my_strcasecmp(dtd->tags[mid].name, string);

        if (order == 0)
            return &dtd->tags[mid];
        if (order < 0)
            lo = mid + 1;
        else
            hi = mid;
    }
    return NULL;
}
/*________________________________________________________________________
**			Public Methods
*/

/* Signal end of input: forwards end_document to the target.
** Elements still open on the stack are not closed here.
*/
PUBLIC void SGML_end ARGS1(HTStream *, context)
{
    HTStructured *sink = context->target;

    /* Could check that we are back to bottom of stack! @@ */
    (*context->actions->end_document)(sink);
}
/* Destroy the parser.
**
** Frees the target (through its own free method), the scratch chunk and
** the context itself. Also releases what the parser allocated while
** running and may still hold: element stack nodes for elements that
** were never closed, and attribute value strings. The context must not
** be used after this call.
*/
PUBLIC void SGML_free ARGS1(HTStream *, context)
{
    int i;

    /* Unclosed elements would otherwise leak their stack nodes. */
    while (context->element_stack) {
        HTElement *node = context->element_stack;

        context->element_stack = node->next;
        free(node);
    }

    (*context->actions->free)(context->target);
    HTChunkFree(context->string);

    /* Attribute values are malloc'd copies owned by the context. */
    for (i = 0; i < MAX_ATTRIBUTES; i++) {
        if (context->value[i]) {
            free(context->value[i]);
            context->value[i] = NULL;
        }
    }
    free(context);
}
/* Read and write user callback handle
** -----------------------------------
**
** The callbacks from the SGML parser have an SGML context parameter.
** These calls allow the caller to associate his own context with a
** particular SGML context. Only compiled when CALLERDATA is defined.
*/
#ifdef CALLERDATA
/* Return the handle stored in this context. */
PUBLIC void* SGML_callerData ARGS1(HTStream *, context)
{
    void *handle = context->callerData;

    return handle;
}

/* Store `data` as this context's handle, replacing any previous one. */
PUBLIC void SGML_setCallerData ARGS2(HTStream *, context, void*, data)
{
    context->callerData = data;
}
#endif
/* Feed one character to the parser
** --------------------------------
**
** The parser is a state machine driven one character at a time; the
** current state is kept in context->state and partially read names and
** values accumulate in context->string.
**
** Character data is forwarded to the target with PUTC. Recognised tags
** are reported through start_element()/end_element(), and entities
** through handle_entity(). Unknown tags are skipped up to the next '>'.
**
** Character classification uses (unsigned char)c, because passing a
** negative char to the <ctype.h> functions is undefined.
*/
PUBLIC void SGML_character ARGS2(HTStream *, context, char,c)
{
    WWW_CONST SGML_dtd *dtd = context->dtd;
    HTChunk *string = context->string;

    switch(context->state) {
    case S_text:
        /* '&' opens an entity only at top level or inside an element
        ** with MIXED or RCDATA content. */
        if (c=='&' && (!context->element_stack || (
                         context->element_stack->tag &&
                         ( context->element_stack->tag->contents == SGML_MIXED
                           || context->element_stack->tag->contents ==
                                                         SGML_RCDATA)
                        ))) {
            string->size = 0;
            context->state = S_ero;
        } else if (c=='<') {
            string->size = 0;
            context->state = (context->element_stack &&
                context->element_stack->tag &&
                context->element_stack->tag->contents == SGML_LITTERAL) ?
                                S_litteral : S_tag;
        } else PUTC(c);
        break;

    /* In litteral mode, waits only for specific end tag!
    ** Only for compatibility with old servers.
    */
    case S_litteral :
        HTChunkPutc(string, c);
        /* The buffer must spell '/' followed by the open tag's name. */
        if ( TOUPPER(c) != ((string->size ==1) ? '/'
                : context->element_stack->tag->name[string->size-2])) {
            int i;

            /* If complete match, end litteral.
            ** The size test keeps "<>" from indexing name[-1]. */
            if ((c=='>') && (string->size >= 2) &&
                (!context->element_stack->tag->name[string->size-2])) {
                end_element(context, context->element_stack->tag);
                string->size = 0;
                context->current_attribute_number = INVALID;
                context->state = S_text;
                break;
            }           /* If Mismatch: recover string. */
            PUTC( '<');
            for (i=0; i<string->size; i++)      /* recover */
                PUTC(
                     string->data[i]);
            context->state = S_text;
        }
        break;

    /* Character reference or Entity
    */
    case S_ero:
        if (c=='#') {
            context->state = S_cro;     /* &# is Char Ref Open */
            break;
        }
        context->state = S_entity;      /* Fall through! */

    /* Handle Entities
    */
    case S_entity:
        if (isalnum((unsigned char)c))
            HTChunkPutc(string, c);
        else {
            HTChunkTerminate(string);
            handle_entity(context, c);
            context->state = S_text;
        }
        break;

    /* Character reference
    */
    case S_cro:
        if (isalnum((unsigned char)c))
            HTChunkPutc(string, c);     /* accumulate a character NUMBER */
        else {
            int value;
            HTChunkTerminate(string);
            if (sscanf(string->data, "%d", &value)==1)
                PUTC((char)value);
            context->state = S_text;
        }
        break;

    /* Tag
    */
    case S_tag:                         /* new tag */
        if (isalnum((unsigned char)c))
            HTChunkPutc(string, c);
        else {                          /* End of tag name */
            HTTag * t;
            if (c=='/') {
#ifndef DISABLE_TRACE
                /* The chunk is not NUL-terminated yet, so print
                ** exactly string->size bytes. */
                if (www2Trace) if (string->size!=0)
                    fprintf(stderr,"SGML: `<%.*s/' found!\n",
                            (int)string->size, string->data);
#endif
                context->state = S_end;
                break;
            }
            HTChunkTerminate(string) ;
            t = find_tag(dtd, string->data);
            if (!t) {
#ifndef DISABLE_TRACE
                if(www2Trace) fprintf(stderr, "SGML: *** Unknown element %s\n",
                        string->data);
#endif
                context->state = (c=='>') ? S_text : S_junk_tag;
                break;
            }
            context->current_tag = t;

            /* Clear out attributes
            */
            {
                int i;
                for (i=0; i< context->current_tag->number_of_attributes; i++)
                    context->present[i] = NO;
            }
            string->size = 0;
            context->current_attribute_number = INVALID;

            if (c=='>') {
                if (context->current_tag->name) start_element(context);
                context->state = S_text;
            } else {
                context->state = S_tag_gap;
            }
        }
        break;

    case S_tag_gap:                     /* Expecting attribute or > */
        if (WHITE(c)) break;            /* Gap between attributes */
        if (c=='>') {                   /* End of tag */
            if (context->current_tag->name) start_element(context);
            context->state = S_text;
            break;
        }
        HTChunkPutc(string, c);
        context->state = S_attr;        /* Get attribute */
        break;

    /* accumulating attribute name */
    case S_attr:
        if (WHITE(c) || (c=='>') || (c=='=')) {         /* End of word */
            HTChunkTerminate(string) ;
            handle_attribute_name(context, string->data);
            string->size = 0;
            if (c=='>') {               /* End of tag */
                if (context->current_tag->name) start_element(context);
                context->state = S_text;
                break;
            }
            context->state = (c=='=' ? S_equals: S_attr_gap);
        } else {
            HTChunkPutc(string, c);
        }
        break;

    case S_attr_gap:                    /* Expecting attribute or = or > */
        if (WHITE(c)) break;            /* Gap after attribute */
        if (c=='>') {                   /* End of tag */
            if (context->current_tag->name) start_element(context);
            context->state = S_text;
            break;
        } else if (c=='=') {
            context->state = S_equals;
            break;
        }
        HTChunkPutc(string, c);
        context->state = S_attr;        /* Get next attribute */
        break;

    case S_equals:                      /* After attr = */
        if (WHITE(c)) break;            /* Before attribute value */
        if (c=='>') {                   /* End of tag */
#ifndef DISABLE_TRACE
            if (www2Trace) fprintf(stderr, "SGML: found = but no value\n");
#endif
            if (context->current_tag->name) start_element(context);
            context->state = S_text;
            break;

        } else if (c=='\'') {
            context->state = S_squoted;
            break;
        } else if (c=='"') {
            context->state = S_dquoted;
            break;
        }
        HTChunkPutc(string, c);
        context->state = S_value;
        break;

    case S_value:
        if (WHITE(c) || (c=='>')) {             /* End of word */
            HTChunkTerminate(string) ;
            handle_attribute_value(context, string->data);
            string->size = 0;
            if (c=='>') {               /* End of tag */
                if (context->current_tag->name) start_element(context);
                context->state = S_text;
                break;
            }
            else context->state = S_tag_gap;
        } else {
            HTChunkPutc(string, c);
        }
        break;

    case S_squoted:                     /* Quoted attribute value */
        if (c=='\'') {                  /* End of attribute value */
            HTChunkTerminate(string) ;
            handle_attribute_value(context, string->data);
            string->size = 0;
            context->state = S_tag_gap;
        } else {
            HTChunkPutc(string, c);
        }
        break;

    case S_dquoted:                     /* Quoted attribute value */
        if (c=='"') {                   /* End of attribute value */
            HTChunkTerminate(string) ;
            handle_attribute_value(context, string->data);
            string->size = 0;
            context->state = S_tag_gap;
        } else {
            HTChunkPutc(string, c);
        }
        break;

    case S_end:                                 /* </ */
        if (isalnum((unsigned char)c))
            HTChunkPutc(string, c);
        else {                          /* End of end tag name */
            HTTag * t;
            HTChunkTerminate(string) ;
            if (!*string->data) {       /* Empty end tag */
                /* "</>" closes the innermost open element; with
                ** nothing open there is nothing to close. */
                t = context->element_stack
                        ? context->element_stack->tag : NULL;
            } else {
                t = find_tag(dtd, string->data);
            }
            if (!t) {
#ifndef DISABLE_TRACE
                if(www2Trace) fprintf(stderr,
                                      "Unknown end tag </%s>\n", string->data);
#endif
            } else {
                context->current_tag = t;
                end_element( context, context->current_tag);
            }
            string->size = 0;
            context->current_attribute_number = INVALID;
            if (c!='>') {
#ifndef DISABLE_TRACE
                if (www2Trace && !WHITE(c))
                    fprintf(stderr,"SGML: `</%s%c' found!\n",
                            string->data, c);
#endif
                context->state = S_junk_tag;
            } else {
                context->state = S_text;
            }
        }
        break;

    case S_junk_tag:                    /* Skip everything up to '>' */
        if (c=='>') {
            context->state = S_text;
        }
        break;

    } /* switch on context->state */
} /* SGML_character */
/* Feed a zero-terminated string to the parser, one character at a time.
** The terminating NUL itself is not passed on.
*/
PUBLIC void SGML_string ARGS2(HTStream *, context, WWW_CONST char*, str)
{
    WWW_CONST char *cursor = str;

    while (*cursor != '\0') {
        SGML_character(context, *cursor);
        cursor++;
    }
}
/* Feed the first `l` bytes of `str` to the parser, one character at a
** time. Embedded NUL bytes are passed on like any other character.
*/
PUBLIC void SGML_write ARGS3(HTStream *, context, WWW_CONST char*, str, int, l)
{
    int i;

    for (i = 0; i < l; i++)
        SGML_character(context, str[i]);
}
/*_______________________________________________________________________
*/
/* Last entry of the SGMLParser method table. The body is empty, so
** calling it has no effect on the parser context.
*/
PRIVATE void SGML_handle_interrupt ARGS1(HTStream *, context)
{
}
/* Structured Object Class
** -----------------------
** Method table that makes the parser usable as an HTStream. Entries are
** positional, so their order must match the HTStreamClass declaration
** (declared outside this file).
*/
PUBLIC WWW_CONST HTStreamClass SGMLParser =
{
    "SGMLParser",
    SGML_free,
    SGML_end,
    SGML_character,
    SGML_string,
    SGML_write,
    SGML_handle_interrupt
};
2010-03-08 04:55:21 -06:00
/* Create SGML Engine
** ------------------
**
** On entry,
**	dtd		represents the DTD the input is parsed against
**	target		is the sink for the parsed data; its `isa`
**			member supplies the set of action routines.
**
** On exit,
**	returns a new parser context in state S_text with an empty
**	element stack, no current tag and no attribute flags or values
**	set. Release it with SGML_free().
*/
PUBLIC HTStream* SGML_new ARGS2(
        WWW_CONST SGML_dtd *, dtd,
        HTStructured *, target)
{
    int i;
    HTStream* context = (HTStream *) malloc(sizeof(*context));
    if (!context) outofmem(__FILE__, "SGML_new");

    context->isa = &SGMLParser;
    context->string = HTChunkCreate(128);       /* Grow by this much */
    context->dtd = dtd;
    context->target = target;
    context->actions = (HTStructuredClass*)(((HTStream*)target)->isa);
                                                /* Ugh: no OO */
    context->state = S_text;
    context->element_stack = 0;                 /* empty */

    /* malloc leaves these undefined; give them known values. */
    context->current_tag = NULL;
    context->current_attribute_number = INVALID;
#ifdef CALLERDATA
    /* No handle is passed in; it is set later via SGML_setCallerData(). */
    context->callerData = NULL;
#endif
    for (i = 0; i < MAX_ATTRIBUTES; i++) {
        context->present[i] = NO;
        context->value[i] = 0;
    }
    return context;
}