%{
/*
 * Yacc grammar for an XML / tolerant-HTML reader.
 *
 * The semantic actions render the document as plain text on stdout while
 * parsing: headings and emphasis are wrapped in ANSI bold escapes, links in
 * a blue "<...>" pair, list items get a "* " prefix, and so on.
 *
 * The element stack (element_paths, element_path_last, element_path_enter,
 * element_path_leave, element_path_name_equals), the Cons attribute lists
 * and the output helpers (underscore_string, display_HTML_input_element)
 * are not defined here; presumably they come from "element_path.h" and
 * "output.h" -- confirm against those headers.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include "y.tab.h"
#include "element_path.h"
#include "output.h"

/* Copy whatever is still unread on the lexer input (yyin) to stderr. */
static void dump_stream(void)
{
    char buffer[40000];
    extern FILE* yyin;

    while(fgets(buffer, sizeof(buffer), yyin)) {
        fprintf(stderr, "%s", buffer);
    }
    fflush(stderr);
}

/*
 * Parser error hook: print the message, then the rest of the input so the
 * user can see roughly where parsing stopped.  Note that this consumes the
 * remainder of yyin.
 */
void yyerror(const char *str)
{
    fprintf(stderr,"error: %s\n",str);
    fprintf(stderr, "NEAR:\n");
    dump_stream();
}

/* Tell the lexer there is no further input file after the current one. */
int yywrap(void)
{
    return 1;
}

/*
 * Return 1 when some element on the current element path is an <A> that
 * carries an HREF attribute, 0 otherwise.  The path is walked from
 * element_path_last backwards through the prev links.
 */
static int HREF_open_P(void)
{
    struct Cons* item;
    struct Cons* attributes;

    item = element_path_last;
    while(item) {
        if(strcasecmp(item->name, "A") == 0) {
            attributes = item->value;
            if(Cons_find(attributes, "HREF")) {
                return 1;
            }
        }
        item = item->prev;
    }
    return 0;
}

/*
 * Return 1 when NAME is one of the elements rendered in bold
 * (H1, H2, EM, B; compared case-insensitively), 0 otherwise.
 */
static int bold_element_P(const char* name)
{
    return strcasecmp(name, "H1") == 0
        || strcasecmp(name, "H2") == 0
        || strcasecmp(name, "EM") == 0
        || strcasecmp(name, "B") == 0;
}

/*
 * Encode CODEPOINT as a NUL-terminated UTF-8 string.
 *
 * Returns a freshly allocated string (caller frees), or NULL when
 *   - CODEPOINT is 0 (it would encode to an empty C string),
 *   - CODEPOINT is negative,
 *   - CODEPOINT is above 0x1FFFFF (would need a 5 or 6 byte sequence), or
 *   - the allocation fails.
 *
 * Surrogate code points are deliberately NOT rejected; the check was
 * disabled in the code this was derived from with the remark that it
 * "breaks Big5 D8 - DF".
 * NOTE(review): 0x110000..0x1FFFFF is still encoded although it lies above
 * the Unicode maximum (0x10FFFF) -- confirm whether that is intended.
 */
char* UTF8_from_unichr(int codepoint)
{
    unsigned char encoded[5] = {0};
    unsigned c;

    if(codepoint <= 0 || codepoint > 0x1FFFFF) {
        return NULL;
    }
    c = (unsigned) codepoint;

    if(c <= 0x7F) {
        /* 0XXX XXXX one byte */
        encoded[0] = (unsigned char) c;
    } else if(c <= 0x7FF) {
        /* 110X XXXX two bytes */
        encoded[0] = (unsigned char) (0xC0 | (c >> 6));
        encoded[1] = (unsigned char) (0x80 | (c & 0x3F));
    } else if(c <= 0xFFFF) {
        /* 1110 XXXX three bytes */
        encoded[0] = (unsigned char) (0xE0 | (c >> 12));
        encoded[1] = (unsigned char) (0x80 | ((c >> 6) & 0x3F));
        encoded[2] = (unsigned char) (0x80 | (c & 0x3F));
    } else {
        /* 1111 0XXX four bytes */
        encoded[0] = (unsigned char) (0xF0 | (c >> 18));
        encoded[1] = (unsigned char) (0x80 | ((c >> 12) & 0x3F));
        encoded[2] = (unsigned char) (0x80 | ((c >> 6) & 0x3F));
        encoded[3] = (unsigned char) (0x80 | (c & 0x3F));
    }
    return strdup((const char*) encoded);
}
%}

/*
 * Tokens.  A type tag is given only where an action reads the token's
 * value; the tags are derived from how the actions use each value.
 */
%token COMMENT
%token END_COMMENT
%token DOCTYPE_KEYWORD
%token S
%token ELEMENT_KEYWORD
%token CDATA
%token ID
%token IDREF
%token IDREFS
%token ENTITY
%token ENTITIES
%token <text> NAME
%token NMTOKEN
%token NMTOKENS
%token FIXED
%token <text> QUOTED_STRING
%token ATTLIST_KEYWORD
%token NOTATION_KEYWORD
%token NDATA
%token PE_ENTITY_KEYWORD
%token IMPLIED
%token ENCODING
%token <text> ENTITY_REFERENCE
%token <symbol> CHARACTER_REFERENCE
%token END_BBBLOCK
%token BEGIN_XML
%token SECTION_BBLOCK
%token REQUIRED
%token INCLUDE_KEYWORD
%token IGNORE_KEYWORD
%token PCDATA
%token <text> ATTRIBUTE_VALUE
%token SYSTEM
%token PUBLIC
%token GE_ENTITY_KEYWORD
%token <text> BLURB
%token ENDTAG
%token END_ENDTAG
%token END_PI
%token TPI
%token STANDALONE
%token VERSION
%token NOTATION /* ATTLIST */
%token EMPTY
%token ANY
%token ORDERED /* ATTLIST */

%union {
    char* text;        /* names, literal text, attribute values */
    int symbol;        /* single characters and numeric code points */
    struct Cons* cons; /* attribute cells / attribute lists */
}

%start document

%type <symbol> count_suffix
%type <symbol> optional_count_suffix
%type <text> Name
%type <text> Nmtoken
%type <text> PITarget
%type <symbol> SDDecl
%type <text> BeginTag
%type <cons> optional_Attributes
%type <cons> Attribute
%type <text> AttValue
%type <text> optional_AttValue_content
%type <text> AttValue_item
%type <text> Reference
%type <text> EntityRef
%type <text> CharRef

%left NAME

%%

document:
      opt_S prolog element MiscList
    | conditionalSect /* actually not, just to shut the yacc up. */
    ;

blurbs:
      /* empty */
    | blurbs BLURB /* TODO concatenate. */
    ;

Comment: COMMENT blurbs END_COMMENT ;

Misc: Comment | PI | S ;

MiscList:
      /* empty */
    | MiscList Misc
    ;

prolog: optional_XMLDecl optional_Miscs prolog_suffix ;

optional_XMLDecl:
      /* empty */
    | XMLDecl
    ;

optional_Miscs:
      /* empty */
    | optional_Miscs Misc
    ;

prolog_suffix:
      /* empty */
    | doctypedecl optional_Miscs opt_S
    ;

/*
 * NOTE(review): SDDecl itself starts with S, so the alternatives written
 * "S SDDecl" need two S tokens in a row -- confirm that the lexer really
 * delivers that.
 */
XMLDecl:
      BEGIN_XML VersionInfo S EncodingDecl S SDDecl opt_S END_PI
    | BEGIN_XML VersionInfo S EncodingDecl opt_S END_PI
    | BEGIN_XML VersionInfo S SDDecl opt_S END_PI
    | BEGIN_XML VersionInfo opt_S END_PI
    ;

opt_S: optional_space ;

optional_space:
      /* empty */
    | S
    ;

/* Value is 'y' for standalone="yes" and 'n' for "no"; anything else is a syntax error. */
SDDecl: S STANDALONE Eq QUOTED_STRING
    {
        if(strcmp($4, "yes") != 0 && strcmp($4, "no") != 0) {
            YYERROR;
        }
        $$ = (strcmp($4, "no") != 0) ? 'y' : 'n';
    }
    ;

VersionInfo: VERSION Eq QUOTED_STRING ;

Eq: opt_S '=' opt_S ;

/*VersionNum: "1." [0-9]+;*/

/* DTD: */

optional_intSubset:
      /* empty */
    | opt_S '[' intSubset ']' opt_S
    ;

doctypedecl: DOCTYPE_KEYWORD S Name optional_ExternalID optional_intSubset opt_S '>' ;

optional_ExternalID:
      /* empty */
    | S ExternalID
    ;

DeclSep: PEReference | S ;

intSubset:
      /* empty */
    | intSubset markup_or_DeclSep
    ;

markup_or_DeclSep: markupdecl | DeclSep ;

markupdecl: elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment ;

elementdecl: ELEMENT_KEYWORD S Name S contentspec opt_S '>' ;

/*
 * NOTE(review): "EMPTY" and "ANY" are string-literal tokens here, while the
 * declared tokens EMPTY and ANY are not used by any rule -- confirm which
 * form the lexer actually returns.
 */
contentspec: "EMPTY" | "ANY" | Mixed | children ;

/* examples: */

/*
 * The occurrence suffix of a content particle: '?', '*', '+', or 0 when
 * there is none.  The actions assign the semantic value; a C return
 * statement here would return from yyparse() and end the parse.
 */
optional_count_suffix:
      /* empty */   { $$ = 0; }
    | count_suffix  { $$ = $1; }
    ;

count_suffix:
      '?' { $$ = '?'; }
    | '*' { $$ = '*'; }
    | '+' { $$ = '+'; }
    ;

Name_or_choice_or_seq:
      Name
    | choice_or_seq
    ;

children: choice_or_seq optional_count_suffix ;

cp: Name_or_choice_or_seq optional_count_suffix ;

OR_cp_list:
      /* empty */
    | OR_cp_list OR_cp
    ;

OR_cp: opt_S '|' opt_S cp ;

COMMA_cp: opt_S ',' opt_S cp ;

COMMA_cp_list:
      /* empty */
    | COMMA_cp_list COMMA_cp
    ;

choice_or_seq:
      choice
    | seq
    ;

choice: '(' opt_S cp OR_cp OR_cp_list opt_S ')' ;

seq: '(' opt_S cp COMMA_cp_list opt_S ')' ;

OR_Name_list:
      /* empty */
    | OR_Name_list OR_Name
    ;

OR_Name: opt_S '|' opt_S Name ;

maybe_star:
      /* empty */
    | '*'
    ;

Mixed:
      '(' opt_S PCDATA OR_Name_list opt_S ')' maybe_star
    /* | '(' opt_S PCDATA opt_S ')'*/
    ;

AttDef_list:
      /* empty */
    | AttDef_list AttDef
    ;

AttlistDecl: ATTLIST_KEYWORD S Name AttDef_list opt_S '>' ;

AttDef: S Name S AttType S DefaultDecl ;

AttType: StringType | TokenizedType | EnumeratedType ;

StringType: CDATA ;

TokenizedType: ID | IDREF | IDREFS | ENTITY | ENTITIES | NMTOKEN | NMTOKENS ;

EnumeratedType: NotationType | Enumeration ;

NotationType: NOTATION_KEYWORD S '(' opt_S Name OR_Name_list opt_S ')' ;

OR_Nmtoken_list:
      /* empty */
    | OR_Nmtoken_list OR_Nmtoken
    ;

OR_Nmtoken: opt_S '|' opt_S Nmtoken ;

Enumeration: '(' opt_S Nmtoken OR_Nmtoken_list opt_S ')' ;

Nmtoken: NAME { $$ = $1; } ;

optional_FIXED:
      /* empty */
    | FIXED S
    ;

DefaultDecl:
      REQUIRED
    | IMPLIED
    | optional_FIXED AttValue
    ;

/* examples: */

conditionalSect: includeSect | ignoreSect ;

includeSect: INCLUDE_KEYWORD opt_S '[' extSubsetDecl END_BBBLOCK ;

ignoreSectContents_list:
      /* empty */
    | ignoreSectContents_list ignoreSectContents
    ;

ignoreSect: IGNORE_KEYWORD opt_S '[' ignoreSectContents_list END_BBBLOCK ;

ignoreSectContentsBody_list:
      /* empty */
    | ignoreSectContentsBody_list ignoreSectContentsBody
    ;

ignoreSectContentsBody: SECTION_BBLOCK ignoreSectContents END_BBBLOCK Ignore ;

ignoreSectContents: Ignore ignoreSectContentsBody_list ;

BBlock_junk: SECTION_BBLOCK | END_BBBLOCK ;

Blurb_junk: BLURB BBlock_junk BLURB ;

Ignore: BLURB '-' Blurb_junk ;

/* examples: ]]> ]]> */

/* Value is the UTF-8 text of the referenced character, or NULL if it cannot be encoded. */
CharRef: CHARACTER_REFERENCE
    {
        $$ = UTF8_from_unichr($1);
    }
    ;

/*
 * A character reference is echoed to stdout when we are inside html/body
 * and the innermost element is not a script.
 */
Reference:
      EntityRef
    | CharRef
    {
        $$ = $1;
        /* the value is NULL for an unencodable code point; "%s" must not see that */
        if($1
           && element_path_name_equals(element_paths, "html")
           && element_path_name_equals(element_paths->next, "body")
           && !element_path_name_equals(element_path_last, "script")) {
            printf("%s", $1);
        }
    }
    ;

EntityRef: ENTITY_REFERENCE ; /* FIXME decode */

PEReference: '%' ; /* TODO name */

EntityDecl: GEDecl | PEDecl ;

GEDecl: GE_ENTITY_KEYWORD S Name S EntityDef opt_S '>' ;

PEDecl: PE_ENTITY_KEYWORD S Name S PEDef opt_S '>' ;

optional_NDataDecl:
      /* empty */
    | S NDataDecl
    ;

EntityDef:
      EntityValue
    | ExternalID optional_NDataDecl
    ;

PEDef: EntityValue | ExternalID ;

ExternalID:
      SYSTEM S SystemLiteral
    | PUBLIC S PubidLiteral S SystemLiteral
    ;

ExternalID_or_PublicID:
      ExternalID
    | PublicID
    ;

PublicID: PUBLIC S PubidLiteral ;

NDataDecl: NDATA S Name ;

/*TextDecl: BEGIN_XML optional_VersionInfo EncodingDecl opt_S END_PI;*/
/*optional_VersionInfo: NOTHING | VersionInfo ;*/
/*optional_TextDecl: NOTHING | TextDecl ;*/
/*extParsedEnt: optional_TextDecl content;*/

EncodingDecl: ENCODING Eq QUOTED_STRING ; /* TODO UTF-8, EUC-JP */

/* predefined entities: */

NotationDecl: NOTATION_KEYWORD S Name S ExternalID_or_PublicID opt_S '>' ;

/* actual content: */

/*
 * Opening part of a tag, up to but excluding the closing '>' or '/' '>'.
 * Pushes the element on the element path and emits the start-of-element
 * rendering.  The value is the element name.
 */
BeginTag: '<' Name optional_Attributes opt_S
    {
        /* allow broken HTML */
        if(!element_path_name_equals(element_paths, "HTML") && strcasecmp($2, "HTML") != 0) {
            element_path_enter("HTML", NULL); /* FIXME also somehow extend "document" */
        }
        element_path_enter($2, $3);

        if(bold_element_P($2)) {
            printf("\033[1m");
        } else if(strcasecmp($2, "INPUT") == 0) {
            display_HTML_input_element($3);
        } else if(strcasecmp($2, "BR") == 0) {
            printf("\n");
        } else if(strcasecmp($2, "LI") == 0) {
            printf("* "); /* TODO check level */
        } else if(strcasecmp($2, "P") == 0) {
            printf("\n"); /* TODO find out whether we already are at the beginning. If so, don't do anything. */
        } else if(strcasecmp($2, "IMG") == 0) {
            /* show the ALT text, falling back to SRC, then to nothing */
            struct Cons* alt_cell = Cons_find($3, "ALT");
            const char* value = Cons_string_value(alt_cell, Cons_string_value(Cons_find($3, "SRC"), ""));
            printf("%s", HREF_open_P() ? underscore_string(value) : value);
        } else if(strcasecmp($2, "UL") == 0) {
            printf("\n"); /* TODO check level */
        } else if(strcasecmp($2, "A") == 0 && Cons_find($3, "HREF")) {
            printf("\033[34m<");
        } else if(strcasecmp($2, "LABEL") == 0) {
        }
        $$ = $2; /* FIXME entire thing. */
    }
    ;

element:
      BeginTag '/' '>'
    {
        element_path_leave($1);
    }
    | BeginTag '>' content ETag
    {
        /* when only html/body remains on the path, reset the parser's error state and drop the lookahead */
        if(element_path_name_equals(element_paths, "html")
           && element_path_name_equals(element_paths->next, "body")
           && !element_paths->next->next) {
            yyerrok; /* Google. */
            yyclearin;
        } else {
            /* oops. */
        }
    }
    ;

/* The attribute list is built by prepending, so the last attribute parsed is the head. */
optional_Attributes:
      /* empty */
    {
        $$ = NULL;
    }
    | optional_Attributes S Attribute
    {
        if($1) {
            $3->next = $1;
        }
        $$ = $3;
    }
    ;

/* One attribute cell: the name, with a nested cell holding the value. */
Attribute:
      Name '=' AttValue
    {
        $$ = Cons_new(strdup($1), NULL);
        $$->value = Cons_new(strdup($3), NULL);
    }
    | Name /* HTML */
    {
        /* a bare attribute name stands for itself */
        $$ = Cons_new(strdup($1), NULL);
        $$->value = Cons_new(strdup($1), NULL);
    }
    ;

/* Concatenation of all items between the quotes; "" when there are none. */
optional_AttValue_content:
      /* empty */
    {
        $$ = "";
    }
    | optional_AttValue_content AttValue_item
    {
        /* an unencodable character reference yields NULL; treat it as empty text */
        const char* item = $2 ? $2 : "";
        size_t head_length = strlen($1);
        size_t item_length = strlen(item);
        char* joined = calloc(head_length + item_length + 1, 1);

        if(!joined) {
            YYABORT;
        }
        memcpy(joined, $1, head_length);
        memcpy(joined + head_length, item, item_length);
        $$ = joined;
    }
    ;

AttValue_item:
      ATTRIBUTE_VALUE { $$ = $1; }
    | Reference
    ;

AttValue:
      '"' optional_AttValue_content '"'
    {
        $$ = strdup($2);
    }
    | Name /* for HTML */
    {
        $$ = strdup($1);
    }
    ;

EntityValue: '"' optional_EntityValue_content '"' ;

optional_EntityValue_content:
      /* empty */
    | optional_EntityValue_content EntityValue_item
    ;

EntityValue_item:
      AttValue_item
    | PEReference
    ;

extSubsetDecl:
      /* empty */
    | extSubsetDecl markupdecl_or_conditionalSect_or_DeclSep
    ;

markupdecl_or_conditionalSect_or_DeclSep: markupdecl | conditionalSect | DeclSep ;

/* End tag: pops the element from the element path and emits the end-of-element rendering. */
ETag: ENDTAG Name opt_S END_ENDTAG
    {
        struct Cons* attributes;

        attributes = element_path_leave($2);
        if(bold_element_P($2)) {
            printf("\033[m\n"); /* FIXME */
        } else if(strcasecmp($2, "LI") == 0) {
            printf("\n"); /* FIXME */
        } else if(strcasecmp($2, "A") == 0) {
            if(Cons_find(attributes, "HREF")) {
                printf(">\033[m");
            }
        } else if(strcasecmp($2, "P") == 0) {
            printf("\n"); /* TODO find out whether we already are at the beginning. If so, don't do anything. */
        } else if(strcasecmp($2, "UL") == 0) {
            printf("\n"); /* TODO check level */
        } else if(strcasecmp($2, "TR") == 0) {
            printf("\n");
        } else if(strcasecmp($2, "TD") == 0) { /* FIXME */
            printf("\t");
        }
    }
    ;

Name: NAME { $$ = $1; } ;

PubidLiteral: QUOTED_STRING ; /* FIXME check */

SystemLiteral: QUOTED_STRING ; /* FIXME check */

optional_PI_Body:
      /* empty */
    | S PI_Body
    ;

PI_Body: NAME ; /*Char* - (Char* '?>' Char*); FIXME more stuff */

PI: TPI PITarget optional_PI_Body END_PI ;

PITarget: Name
    {
        /* FIXME - (('X' | 'x') ('M' | 'm') ('L' | 'l'))*/
        $$ = $1;
    }
    ;

nontext_content: element | Reference | CDSect | PI | Comment | S ;

/*
 * Text is printed only inside html/body and outside script; newlines are
 * replaced by spaces in place, and text inside a link goes through
 * underscore_string first.
 */
node:
      BLURB
    {
        /* our actual client program */
        if(element_path_name_equals(element_paths, "html")
           && element_path_name_equals(element_paths->next, "body")
           && !element_path_name_equals(element_path_last, "script")) {
            /* TODO do this properly: */
            char* cursor;

            for(cursor = $1; (cursor = strchr(cursor, '\n')) != NULL; ++cursor) {
                *cursor = ' ';
            }
            printf("%s", HREF_open_P() ? underscore_string($1) : $1);
        }
    }
    | nontext_content
    ;

content:
      /* empty */
    | content node
    ;

CDSect: CDATA CDATABody END_BBBLOCK ;

CDATABody:
      /* empty */
    | CDATABody BLURB
    ;