39b76427ae
The directory libwww2 now compiles without any warnings with gcc and clang.
700 lines
17 KiB
C
700 lines
17 KiB
C
/* General SGML Parser code SGML.c
|
|
** ========================
|
|
**
|
|
** This module implements an HTStream object. To parse an
|
|
** SGML file, create this object which is a parser. The object
|
|
** is (currently) created by being passed a DTD structure,
|
|
** and a target HTStructured oject at which to throw the parsed stuff.
|
|
**
|
|
** 6 Feb 93 Binary seraches used. Intreface modified.
|
|
*/
|
|
#include "../config.h"
|
|
#include "SGML.h"
|
|
|
|
#include <ctype.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include "HTUtils.h"
|
|
#include "HTChunk.h"
|
|
#include "../libnut/str-tools.h"
|
|
#ifndef DISABLE_TRACE
|
|
extern int www2Trace;
|
|
#endif
|
|
|
|
#define INVALID (-1)
|
|
|
|
/* The State (context) of the parser
|
|
**
|
|
** This is passed with each call to make the parser reentrant
|
|
**
|
|
*/
|
|
|
|
#define MAX_ATTRIBUTES 20 /* Max number of attributes per element */
|
|
|
|
|
|
/* Element Stack
|
|
** -------------
|
|
** This allows us to return down the stack reselcting styles.
|
|
** As we return, attribute values will be garbage in general.
|
|
*/
|
|
typedef struct _HTElement HTElement;
|
|
struct _HTElement {
|
|
HTElement * next; /* Previously nested element or 0 */
|
|
HTTag* tag; /* The tag at this level */
|
|
};
|
|
|
|
|
|
/* Internal Context Data Structure
|
|
** -------------------------------
|
|
*/
|
|
struct _HTStream {
|
|
|
|
WWW_CONST HTStreamClass * isa; /* inherited from HTStream */
|
|
|
|
WWW_CONST SGML_dtd *dtd;
|
|
HTStructuredClass *actions; /* target class */
|
|
HTStructured *target; /* target object */
|
|
|
|
HTTag *current_tag;
|
|
int current_attribute_number;
|
|
HTChunk *string;
|
|
HTElement *element_stack;
|
|
enum sgml_state { S_text, S_litteral, S_tag, S_tag_gap,
|
|
S_attr, S_attr_gap, S_equals, S_value,
|
|
S_ero, S_cro,
|
|
S_squoted, S_dquoted, S_end, S_entity, S_junk_tag} state;
|
|
#ifdef CALLERDATA
|
|
void * callerData;
|
|
#endif
|
|
BOOL present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */
|
|
char * value[MAX_ATTRIBUTES]; /* malloc'd strings or NULL if none */
|
|
} ;
|
|
|
|
|
|
#define PUTC(ch) ((*context->actions->put_character)(context->target, ch))
|
|
|
|
|
|
|
|
/* Handle Attribute
|
|
** ----------------
|
|
*/
|
|
/* PUBLIC WWW_CONST char * SGML_default = ""; ?? */
|
|
|
|
#ifdef __STDC__
|
|
PRIVATE void handle_attribute_name(HTStream * context, char * s)
|
|
#else
|
|
PRIVATE void handle_attribute_name(context, s)
|
|
HTStream * context;
|
|
char *s;
|
|
#endif
|
|
{
|
|
|
|
HTTag * tag = context->current_tag;
|
|
attr* attributes = tag->attributes;
|
|
|
|
int high, low, i, diff; /* Binary search for attribute name */
|
|
for(low=0, high=tag->number_of_attributes;
|
|
high > low ;
|
|
diff < 0 ? (low = i+1) : (high = i) ) {
|
|
i = (low + (high-low)/2);
|
|
diff = my_strcasecmp(attributes[i].name, s);
|
|
if (diff==0) { /* success: found it */
|
|
context->current_attribute_number = i;
|
|
context->present[i] = YES;
|
|
if (context->value[i]) {
|
|
free(context->value[i]);
|
|
context->value[i] = NULL;
|
|
}
|
|
return;
|
|
} /* if */
|
|
|
|
} /* for */
|
|
|
|
#ifndef DISABLE_TRACE
|
|
if (www2Trace)
|
|
fprintf(stderr, "SGML: Unknown attribute %s for tag %s\n",
|
|
s, context->current_tag->name);
|
|
#endif
|
|
context->current_attribute_number = INVALID; /* Invalid */
|
|
}
|
|
|
|
|
|
/* Handle attribute value
|
|
** ----------------------
|
|
*/
|
|
#ifdef __STDC__
|
|
PRIVATE void handle_attribute_value(HTStream * context, char * s)
|
|
#else
|
|
PRIVATE void handle_attribute_value(context, s)
|
|
HTStream * context;
|
|
char *s;
|
|
#endif
|
|
{
|
|
if (context->current_attribute_number != INVALID) {
|
|
StrAllocCopy(context->value[context->current_attribute_number], s);
|
|
} else {
|
|
#ifndef DISABLE_TRACE
|
|
if (www2Trace) fprintf(stderr, "SGML: Attribute value %s ignored\n", s);
|
|
#endif
|
|
}
|
|
context->current_attribute_number = INVALID; /* can't have two assignments! */
|
|
}
|
|
|
|
|
|
/* Handle entity
|
|
** -------------
|
|
**
|
|
** On entry,
|
|
** s contains the entity name zero terminated
|
|
** Bugs:
|
|
** If the entity name is unknown, the terminator is treated as
|
|
** a printable non-special character in all cases, even if it is '<'
|
|
*/
|
|
#ifdef __STDC__
|
|
PRIVATE void handle_entity(HTStream * context, char term)
|
|
#else
|
|
PRIVATE void handle_entity(context, term)
|
|
HTStream * context;
|
|
char term;
|
|
#endif
|
|
{
|
|
|
|
WWW_CONST char ** entities = context->dtd->entity_names;
|
|
WWW_CONST char *s = context->string->data;
|
|
|
|
int high, low, i, diff;
|
|
for(low=0, high = context->dtd->number_of_entities;
|
|
high > low ;
|
|
diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */
|
|
i = (low + (high-low)/2);
|
|
diff = strcmp(entities[i], s); /* Csse sensitive! */
|
|
if (diff==0) { /* success: found it */
|
|
(*context->actions->put_entity)(context->target, i);
|
|
return;
|
|
}
|
|
}
|
|
/* If entity string not found, display as text */
|
|
#ifndef DISABLE_TRACE
|
|
if (www2Trace)
|
|
fprintf(stderr, "SGML: Unknown entity %s\n", s);
|
|
#endif
|
|
PUTC('&');
|
|
{
|
|
WWW_CONST char *p;
|
|
for (p=s; *p; p++) {
|
|
PUTC(*p);
|
|
}
|
|
}
|
|
PUTC(term);
|
|
}
|
|
|
|
|
|
/* End element
|
|
** -----------
|
|
*/
|
|
#ifdef __STDC__
|
|
PRIVATE void end_element(HTStream * context, HTTag * old_tag)
|
|
#else
|
|
PRIVATE void end_element(context, old_tag)
|
|
HTTag * old_tag;
|
|
HTStream * context;
|
|
#endif
|
|
{
|
|
#ifndef DISABLE_TRACE
|
|
if (www2Trace) fprintf(stderr, "SGML: End </%s>\n", old_tag->name);
|
|
#endif
|
|
if (old_tag->contents == SGML_EMPTY) {
|
|
#ifndef DISABLE_TRACE
|
|
if (www2Trace) fprintf(stderr,"SGML: Illegal end tag </%s> found.\n",
|
|
old_tag->name);
|
|
#endif
|
|
return;
|
|
}
|
|
while (context->element_stack) {/* Loop is error path only */
|
|
HTElement * N = context->element_stack;
|
|
HTTag * t = N->tag;
|
|
|
|
if (old_tag != t) { /* Mismatch: syntax error */
|
|
if (context->element_stack->next) { /* This is not the last level */
|
|
#ifndef DISABLE_TRACE
|
|
if (www2Trace) fprintf(stderr,
|
|
"SGML: Found </%s> when expecting </%s>. </%s> assumed.\n",
|
|
old_tag->name, t->name, t->name);
|
|
#endif
|
|
} else { /* last level */
|
|
#ifndef DISABLE_TRACE
|
|
if (www2Trace) fprintf(stderr,
|
|
"SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n",
|
|
old_tag->name, t->name, old_tag->name);
|
|
#endif
|
|
return; /* Ignore */
|
|
}
|
|
}
|
|
|
|
context->element_stack = N->next; /* Remove from stack */
|
|
free(N);
|
|
(*context->actions->end_element)(context->target,
|
|
t - context->dtd->tags);
|
|
if (old_tag == t) return; /* Correct sequence */
|
|
|
|
/* Syntax error path only */
|
|
|
|
}
|
|
#ifndef DISABLE_TRACE
|
|
if (www2Trace) fprintf(stderr,
|
|
"SGML: Extra end tag </%s> found and ignored.\n", old_tag->name);
|
|
#endif
|
|
}
|
|
|
|
|
|
/* Start a element
|
|
*/
|
|
#ifdef __STDC__
|
|
PRIVATE void start_element(HTStream * context)
|
|
#else
|
|
PRIVATE void start_element(context)
|
|
HTStream * context;
|
|
#endif
|
|
{
|
|
HTTag * new_tag = context->current_tag;
|
|
|
|
#ifndef DISABLE_TRACE
|
|
if (www2Trace) fprintf(stderr, "SGML: Start <%s>\n", new_tag->name);
|
|
#endif
|
|
(*context->actions->start_element)(
|
|
context->target,
|
|
new_tag - context->dtd->tags,
|
|
context->present,
|
|
(WWW_CONST char**) context->value); /* coerce type for think c */
|
|
if (new_tag->contents != SGML_EMPTY) { /* i.e. tag not empty */
|
|
HTElement * N = (HTElement *)malloc(sizeof(HTElement));
|
|
if (N == NULL) outofmem(__FILE__, "start_element");
|
|
N->next = context->element_stack;
|
|
N->tag = new_tag;
|
|
context->element_stack = N;
|
|
}
|
|
}
|
|
|
|
|
|
/* Find Tag in DTD tag list
|
|
** ------------------------
|
|
**
|
|
** On entry,
|
|
** dtd points to dtd structire including valid tag list
|
|
** string points to name of tag in question
|
|
**
|
|
** On exit,
|
|
** returns:
|
|
** NULL tag not found
|
|
** else address of tag structure in dtd
|
|
*/
|
|
PRIVATE HTTag * find_tag ARGS2(WWW_CONST SGML_dtd*, dtd, char *, string)
|
|
{
|
|
int high, low, i, diff;
|
|
for(low=0, high=dtd->number_of_tags;
|
|
high > low ;
|
|
diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */
|
|
i = (low + (high-low)/2);
|
|
diff = my_strcasecmp(dtd->tags[i].name, string); /* Case insensitive */
|
|
if (diff==0) { /* success: found it */
|
|
return &dtd->tags[i];
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/*________________________________________________________________________
|
|
** Public Methods
|
|
*/
|
|
|
|
|
|
PUBLIC void SGML_end ARGS1(HTStream *, context)
|
|
{
|
|
/* Could check that we are back to bottom of stack! @@ */
|
|
|
|
(*context->actions->end_document)(context->target);
|
|
}
|
|
|
|
|
|
PUBLIC void SGML_free ARGS1(HTStream *, context)
|
|
{
|
|
(*context->actions->free)(context->target);
|
|
HTChunkFree(context->string);
|
|
free(context);
|
|
}
|
|
|
|
|
|
/* Read and write user callback handle
|
|
** -----------------------------------
|
|
**
|
|
** The callbacks from the SGML parser have an SGML context parameter.
|
|
** These calls allow the caller to associate his own context with a
|
|
** particular SGML context.
|
|
*/
|
|
|
|
#ifdef CALLERDATA
|
|
PUBLIC void* SGML_callerData ARGS1(HTStream *, context)
|
|
{
|
|
return context->callerData;
|
|
}
|
|
|
|
PUBLIC void SGML_setCallerData ARGS2(HTStream *, context, void*, data)
|
|
{
|
|
context->callerData = data;
|
|
}
|
|
#endif
|
|
|
|
PUBLIC void SGML_character ARGS2(HTStream *, context, char,c)
|
|
|
|
{
|
|
WWW_CONST SGML_dtd *dtd = context->dtd;
|
|
HTChunk *string = context->string;
|
|
|
|
switch(context->state) {
|
|
case S_text:
|
|
if (c=='&' && (!context->element_stack || (
|
|
context->element_stack->tag &&
|
|
( context->element_stack->tag->contents == SGML_MIXED
|
|
|| context->element_stack->tag->contents ==
|
|
SGML_RCDATA)
|
|
))) {
|
|
string->size = 0;
|
|
context->state = S_ero;
|
|
|
|
} else if (c=='<') {
|
|
string->size = 0;
|
|
context->state = (context->element_stack &&
|
|
context->element_stack->tag &&
|
|
context->element_stack->tag->contents == SGML_LITTERAL) ?
|
|
S_litteral : S_tag;
|
|
} else PUTC(c);
|
|
break;
|
|
|
|
/* In litteral mode, waits only for specific end tag!
|
|
** Only foir compatibility with old servers.
|
|
*/
|
|
case S_litteral :
|
|
HTChunkPutc(string, c);
|
|
if ( TOUPPER(c) != ((string->size ==1) ? '/'
|
|
: context->element_stack->tag->name[string->size-2])) {
|
|
int i;
|
|
|
|
/* If complete match, end litteral */
|
|
if ((c=='>') && (!context->element_stack->tag->name[string->size-2])) {
|
|
end_element(context, context->element_stack->tag);
|
|
string->size = 0;
|
|
context->current_attribute_number = INVALID;
|
|
context->state = S_text;
|
|
break;
|
|
} /* If Mismatch: recover string. */
|
|
PUTC( '<');
|
|
for (i=0; i<string->size; i++) /* recover */
|
|
PUTC(
|
|
string->data[i]);
|
|
context->state = S_text;
|
|
}
|
|
|
|
break;
|
|
|
|
/* Character reference or Entity
|
|
*/
|
|
case S_ero:
|
|
if (c=='#') {
|
|
context->state = S_cro; /* &# is Char Ref Open */
|
|
break;
|
|
}
|
|
context->state = S_entity; /* Fall through! */
|
|
|
|
/* Handle Entities
|
|
*/
|
|
case S_entity:
|
|
if (isalnum(c))
|
|
HTChunkPutc(string, c);
|
|
else {
|
|
HTChunkTerminate(string);
|
|
handle_entity(context, c);
|
|
context->state = S_text;
|
|
}
|
|
break;
|
|
|
|
/* Character reference
|
|
*/
|
|
case S_cro:
|
|
if (isalnum(c))
|
|
HTChunkPutc(string, c); /* accumulate a character NUMBER */
|
|
else {
|
|
int value;
|
|
HTChunkTerminate(string);
|
|
if (sscanf(string->data, "%d", &value)==1)
|
|
PUTC((char)value);
|
|
context->state = S_text;
|
|
}
|
|
break;
|
|
|
|
/* Tag
|
|
*/
|
|
case S_tag: /* new tag */
|
|
if (isalnum(c))
|
|
HTChunkPutc(string, c);
|
|
else { /* End of tag name */
|
|
HTTag * t;
|
|
if (c=='/') {
|
|
#ifndef DISABLE_TRACE
|
|
if (www2Trace) if (string->size!=0)
|
|
fprintf(stderr,"SGML: `<%s/' found!\n", string->data);
|
|
#endif
|
|
context->state = S_end;
|
|
break;
|
|
}
|
|
HTChunkTerminate(string) ;
|
|
|
|
t = find_tag(dtd, string->data);
|
|
if (!t) {
|
|
#ifndef DISABLE_TRACE
|
|
if(www2Trace) fprintf(stderr, "SGML: *** Unknown element %s\n",
|
|
string->data);
|
|
#endif
|
|
context->state = (c=='>') ? S_text : S_junk_tag;
|
|
break;
|
|
}
|
|
context->current_tag = t;
|
|
|
|
/* Clear out attributes
|
|
*/
|
|
|
|
{
|
|
int i;
|
|
for (i=0; i< context->current_tag->number_of_attributes; i++)
|
|
context->present[i] = NO;
|
|
}
|
|
string->size = 0;
|
|
context->current_attribute_number = INVALID;
|
|
|
|
if (c=='>') {
|
|
if (context->current_tag->name) start_element(context);
|
|
context->state = S_text;
|
|
} else {
|
|
context->state = S_tag_gap;
|
|
}
|
|
}
|
|
break;
|
|
|
|
|
|
case S_tag_gap: /* Expecting attribute or > */
|
|
if (WHITE(c)) break; /* Gap between attributes */
|
|
if (c=='>') { /* End of tag */
|
|
if (context->current_tag->name) start_element(context);
|
|
context->state = S_text;
|
|
break;
|
|
}
|
|
HTChunkPutc(string, c);
|
|
context->state = S_attr; /* Get attribute */
|
|
break;
|
|
|
|
/* accumulating value */
|
|
case S_attr:
|
|
if (WHITE(c) || (c=='>') || (c=='=')) { /* End of word */
|
|
HTChunkTerminate(string) ;
|
|
handle_attribute_name(context, string->data);
|
|
string->size = 0;
|
|
if (c=='>') { /* End of tag */
|
|
if (context->current_tag->name) start_element(context);
|
|
context->state = S_text;
|
|
break;
|
|
}
|
|
context->state = (c=='=' ? S_equals: S_attr_gap);
|
|
} else {
|
|
HTChunkPutc(string, c);
|
|
}
|
|
break;
|
|
|
|
case S_attr_gap: /* Expecting attribute or = or > */
|
|
if (WHITE(c)) break; /* Gap after attribute */
|
|
if (c=='>') { /* End of tag */
|
|
if (context->current_tag->name) start_element(context);
|
|
context->state = S_text;
|
|
break;
|
|
} else if (c=='=') {
|
|
context->state = S_equals;
|
|
break;
|
|
}
|
|
HTChunkPutc(string, c);
|
|
context->state = S_attr; /* Get next attribute */
|
|
break;
|
|
|
|
case S_equals: /* After attr = */
|
|
if (WHITE(c)) break; /* Before attribute value */
|
|
if (c=='>') { /* End of tag */
|
|
#ifndef DISABLE_TRACE
|
|
if (www2Trace) fprintf(stderr, "SGML: found = but no value\n");
|
|
#endif
|
|
if (context->current_tag->name) start_element(context);
|
|
context->state = S_text;
|
|
break;
|
|
|
|
} else if (c=='\'') {
|
|
context->state = S_squoted;
|
|
break;
|
|
|
|
} else if (c=='"') {
|
|
context->state = S_dquoted;
|
|
break;
|
|
}
|
|
HTChunkPutc(string, c);
|
|
context->state = S_value;
|
|
break;
|
|
|
|
case S_value:
|
|
if (WHITE(c) || (c=='>')) { /* End of word */
|
|
HTChunkTerminate(string) ;
|
|
handle_attribute_value(context, string->data);
|
|
string->size = 0;
|
|
if (c=='>') { /* End of tag */
|
|
if (context->current_tag->name) start_element(context);
|
|
context->state = S_text;
|
|
break;
|
|
}
|
|
else context->state = S_tag_gap;
|
|
} else {
|
|
HTChunkPutc(string, c);
|
|
}
|
|
break;
|
|
|
|
case S_squoted: /* Quoted attribute value */
|
|
if (c=='\'') { /* End of attribute value */
|
|
HTChunkTerminate(string) ;
|
|
handle_attribute_value(context, string->data);
|
|
string->size = 0;
|
|
context->state = S_tag_gap;
|
|
} else {
|
|
HTChunkPutc(string, c);
|
|
}
|
|
break;
|
|
|
|
case S_dquoted: /* Quoted attribute value */
|
|
if (c=='"') { /* End of attribute value */
|
|
HTChunkTerminate(string) ;
|
|
handle_attribute_value(context, string->data);
|
|
string->size = 0;
|
|
context->state = S_tag_gap;
|
|
} else {
|
|
HTChunkPutc(string, c);
|
|
}
|
|
break;
|
|
|
|
case S_end: /* </ */
|
|
if (isalnum(c))
|
|
HTChunkPutc(string, c);
|
|
else { /* End of end tag name */
|
|
HTTag * t;
|
|
HTChunkTerminate(string) ;
|
|
if (!*string->data) { /* Empty end tag */
|
|
t = context->element_stack->tag;
|
|
} else {
|
|
t = find_tag(dtd, string->data);
|
|
}
|
|
if (!t) {
|
|
#ifndef DISABLE_TRACE
|
|
if(www2Trace) fprintf(stderr,
|
|
"Unknown end tag </%s>\n", string->data);
|
|
#endif
|
|
} else {
|
|
context->current_tag = t;
|
|
end_element( context, context->current_tag);
|
|
}
|
|
|
|
string->size = 0;
|
|
context->current_attribute_number = INVALID;
|
|
if (c!='>') {
|
|
#ifndef DISABLE_TRACE
|
|
if (www2Trace && !WHITE(c))
|
|
fprintf(stderr,"SGML: `</%s%c' found!\n",
|
|
string->data, c);
|
|
#endif
|
|
context->state = S_junk_tag;
|
|
} else {
|
|
context->state = S_text;
|
|
}
|
|
}
|
|
break;
|
|
|
|
|
|
case S_junk_tag:
|
|
if (c=='>') {
|
|
context->state = S_text;
|
|
}
|
|
|
|
} /* switch on context->state */
|
|
|
|
} /* SGML_character */
|
|
|
|
|
|
PUBLIC void SGML_string ARGS2(HTStream *, context, WWW_CONST char*, str)
|
|
{
|
|
WWW_CONST char *p;
|
|
for(p=str; *p; p++)
|
|
SGML_character(context, *p);
|
|
}
|
|
|
|
|
|
PUBLIC void SGML_write ARGS3(HTStream *, context, WWW_CONST char*, str, int, l)
|
|
{
|
|
WWW_CONST char *p;
|
|
WWW_CONST char *e = str+l;
|
|
for(p=str; p<e; p++)
|
|
SGML_character(context, *p);
|
|
}
|
|
|
|
/*_______________________________________________________________________
|
|
*/
|
|
|
|
PRIVATE void SGML_handle_interrupt ARGS1(HTStream *, context)
|
|
{
|
|
}
|
|
|
|
/* Structured Object Class
|
|
** -----------------------
|
|
*/
|
|
PUBLIC WWW_CONST HTStreamClass SGMLParser =
|
|
{
|
|
"SGMLParser",
|
|
SGML_free,
|
|
SGML_end,
|
|
SGML_character, SGML_string, SGML_write,
|
|
SGML_handle_interrupt
|
|
};
|
|
|
|
/* Create SGML Engine
|
|
** ------------------
|
|
**
|
|
** On entry,
|
|
** dtd represents the DTD, along with
|
|
** actions is the sink for the data as a set of routines.
|
|
**
|
|
*/
|
|
|
|
PUBLIC HTStream* SGML_new ARGS2(
|
|
WWW_CONST SGML_dtd *, dtd,
|
|
HTStructured *, target)
|
|
{
|
|
int i;
|
|
HTStream* context = (HTStream *) malloc(sizeof(*context));
|
|
if (!context) outofmem(__FILE__, "SGML_begin");
|
|
|
|
context->isa = &SGMLParser;
|
|
context->string = HTChunkCreate(128); /* Grow by this much */
|
|
context->dtd = dtd;
|
|
context->target = target;
|
|
context->actions = (HTStructuredClass*)(((HTStream*)target)->isa);
|
|
/* Ugh: no OO */
|
|
context->state = S_text;
|
|
context->element_stack = 0; /* empty */
|
|
#ifdef CALLERDATA
|
|
context->callerData = (void*) callerData;
|
|
#endif
|
|
for(i=0; i<MAX_ATTRIBUTES; i++) context->value[i] = 0;
|
|
|
|
return context;
|
|
}
|
|
|