Delta Chat Core C-Library
mrsaxparser.c
1 /*******************************************************************************
2  *
3  * Delta Chat Core
4  * Copyright (C) 2017 Björn Petersen
5  * Contact: r10s@b44t.com, http://b44t.com
6  *
7  * This program is free software: you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License as published by the Free Software
9  * Foundation, either version 3 of the License, or (at your option) any later
10  * version.
11  *
12  * This program is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
15  * details.
16  *
17  * You should have received a copy of the GNU General Public License along with
18  * this program. If not, see http://www.gnu.org/licenses/ .
19  *
20  ******************************************************************************/
21 
22 
23 /* mrsaxparser_t parses XML and HTML files that may not be wellformed
24 and spits out all text and tags found.
25 - Attributes are recognized with single, double or no quotes
26 - Whitespace ignored inside tags
27 - Self-closing tags are issued as open-tag plus close-tag
28 - CDATA is supported; DTD, comments, processing instructions are
29  skipped properly
30 - The parser does not care about hierarchy, if needed this can be
31  done by the user.
32 - Input and output strings must be UTF-8 encoded.
33 - Tag and attribute names are converted to lower case.
34 - Parsing does not stop on errors; instead errors are recovered. */
35 
36 
37 #include <stdlib.h>
38 #include <string.h>
39 #include <ctype.h>
40 #include "mrmailbox.h"
41 #include "mrtools.h"
42 #include "mrsaxparser.h"
43 
44 
45 /*******************************************************************************
46  * Decoding text
47  ******************************************************************************/
48 
49 
static const char* s_ent[] = {
	/* Convert entities as &auml; to UTF-8 characters.

	- The table consists of name/replacement pairs.
	- The names MUST NOT start with `&` and MUST end with `;`.
	- Take care not to miss a comma between the strings.
	- It's also possible to specify the replacement as a character reference
	  as `&#34;`; xml_decode() decodes such a replacement in a second step. */

	/* basic XML/HTML */
	"lt;", "<", "gt;", ">", "quot;", "\"", "apos;", "'",
	"amp;", "&", "nbsp;", " ",

	/* advanced HTML */
	"iexcl;", "¡", "cent;", "¢", "pound;", "£", "curren;", "¤",
	"yen;", "¥", "brvbar;", "¦", "sect;", "§", "uml;", "¨",
	"copy;", "©", "ordf;", "ª", "laquo;", "«", "not;", "¬",
	"shy;", "-", "reg;", "®", "macr;", "¯", "deg;", "°",
	"plusmn;", "±", "sup2;", "²", "sup3;", "³", "acute;", "´",
	"micro;", "µ", "para;", "¶", "middot;", "·", "cedil;", "¸",
	"sup1;", "¹", "ordm;", "º", "raquo;", "»", "frac14;", "¼",
	"frac12;", "½", "frac34;", "¾", "iquest;", "¿", "Agrave;", "À",
	"Aacute;", "Á", "Acirc;", "Â", "Atilde;", "Ã", "Auml;", "Ä",
	"Aring;", "Å", "AElig;", "Æ", "Ccedil;", "Ç", "Egrave;", "È",
	"Eacute;", "É", "Ecirc;", "Ê", "Euml;", "Ë", "Igrave;", "Ì",
	"Iacute;", "Í", "Icirc;", "Î", "Iuml;", "Ï", "ETH;", "Ð",
	"Ntilde;", "Ñ", "Ograve;", "Ò", "Oacute;", "Ó", "Ocirc;", "Ô",
	"Otilde;", "Õ", "Ouml;", "Ö", "times;", "×", "Oslash;", "Ø",
	"Ugrave;", "Ù", "Uacute;", "Ú", "Ucirc;", "Û", "Uuml;", "Ü",
	"Yacute;", "Ý", "THORN;", "Þ", "szlig;", "ß", "agrave;", "à",
	"aacute;", "á", "acirc;", "â", "atilde;", "ã", "auml;", "ä",
	"aring;", "å", "aelig;", "æ", "ccedil;", "ç", "egrave;", "è",
	"eacute;", "é", "ecirc;", "ê", "euml;", "ë", "igrave;", "ì",
	"iacute;", "í", "icirc;", "î", "iuml;", "ï", "eth;", "ð",
	"ntilde;", "ñ", "ograve;", "ò", "oacute;", "ó", "ocirc;", "ô",
	"otilde;", "õ", "ouml;", "ö", "divide;", "÷", "oslash;", "ø",
	"ugrave;", "ù", "uacute;", "ú", "ucirc;", "û", "uuml;", "ü",
	"yacute;", "ý", "thorn;", "þ", "yuml;", "ÿ", "OElig;", "Œ",
	"oelig;", "œ", "Scaron;", "Š", "scaron;", "š", "Yuml;", "Ÿ",
	"fnof;", "ƒ", "circ;", "ˆ", "tilde;", "˜", "Alpha;", "Α",
	"Beta;", "Β", "Gamma;", "Γ", "Delta;", "Δ", "Epsilon;", "Ε",
	"Zeta;", "Ζ", "Eta;", "Η", "Theta;", "Θ", "Iota;", "Ι",
	"Kappa;", "Κ", "Lambda;", "Λ", "Mu;", "Μ", "Nu;", "Ν",
	"Xi;", "Ξ", "Omicron;", "Ο", "Pi;", "Π", "Rho;", "Ρ",
	"Sigma;", "Σ", "Tau;", "Τ", "Upsilon;", "Υ", "Phi;", "Φ",
	"Chi;", "Χ", "Psi;", "Ψ", "Omega;", "Ω", "alpha;", "α",
	"beta;", "β", "gamma;", "γ", "delta;", "δ", "epsilon;", "ε",
	"zeta;", "ζ", "eta;", "η", "theta;", "θ", "iota;", "ι",
	"kappa;", "κ", "lambda;", "λ", "mu;", "μ", "nu;", "ν",
	"xi;", "ξ", "omicron;", "ο", "pi;", "π", "rho;", "ρ",
	"sigmaf;", "ς", "sigma;", "σ", "tau;", "τ", "upsilon;", "υ",
	"phi;", "φ", "chi;", "χ", "psi;", "ψ", "omega;", "ω",
	"thetasym;","ϑ", "upsih;", "ϒ", "piv;", "ϖ", "ensp;", " ",
	"emsp;", " ", "thinsp;", " ", "zwnj;", "" , "zwj;", "" ,
	"lrm;", "" , "rlm;", "" , "ndash;", "–", "mdash;", "—",
	"lsquo;", "‘", "rsquo;", "’", "sbquo;", "‚", "ldquo;", "“",
	"rdquo;", "”", "bdquo;", "„", "dagger;", "†", "Dagger;", "‡",
	"bull;", "•", "hellip;", "…", "permil;", "‰", "prime;", "′",
	"Prime;", "″", "lsaquo;", "‹", "rsaquo;", "›", "oline;", "‾",
	"frasl;", "⁄", "euro;", "€", "image;", "ℑ", "weierp;", "℘",
	"real;", "ℜ", "trade;", "™", "alefsym;", "ℵ", "larr;", "←",
	"uarr;", "↑", "rarr;", "→", "darr;", "↓", "harr;", "↔",
	"crarr;", "↵", "lArr;", "⇐", "uArr;", "⇑", "rArr;", "⇒",
	"dArr;", "⇓", "hArr;", "⇔", "forall;", "∀", "part;", "∂",
	"exist;", "∃", "empty;", "∅", "nabla;", "∇", "isin;", "∈",
	"notin;", "∉", "ni;", "∋", "prod;", "∏", "sum;", "∑",
	"minus;", "−", "lowast;", "∗", "radic;", "√", "prop;", "∝",
	"infin;", "∞", "ang;", "∠", "and;", "∧", "or;", "∨",
	"cap;", "∩", "cup;", "∪", "int;", "∫", "there4;", "∴",
	"sim;", "∼", "cong;", "≅", "asymp;", "≈", "ne;", "≠",
	"equiv;", "≡", "le;", "≤", "ge;", "≥", "sub;", "⊂",
	"sup;", "⊃", "nsub;", "⊄", "sube;", "⊆", "supe;", "⊇",
	"oplus;", "⊕", "otimes;", "⊗", "perp;", "⊥", "sdot;", "⋅",
	"lceil;", "⌈", "rceil;", "⌉", "lfloor;", "⌊", "rfloor;", "⌋",
	"lang;", "<", "rang;", ">", "loz;", "◊", "spades;", "♠",
	"clubs;", "♣", "hearts;", "♥", "diams;", "♦",

	/* MUST be last */
	NULL, NULL,
};


/* Decodes entity and character references in s and normalizes new lines
(`\r\n` and a lone `\r` both become `\n`).

Set "type" to ...
'&' for general entity decoding,
'c' for cdata sections (only the new lines are normalized),
' ' for attribute normalization (as '&', plus every whitespace character is
    replaced by a single space character).
The types '%' (parameter entities) and '*' (non-cdata attribute normalization)
of the original ezxml function are not implemented as they are not needed.

Decoding is done in place whenever the result fits into s. Returns s, or, if
the decoded string is longer than s, a malloced string that must be freed by
the caller (s is not freed in this case).

Text that was inserted for a named entity is NOT decoded a second time, so
`&amp;lt;` results in the four characters `&lt;` and not in `<`. The only
exception is a table replacement that is written as a character reference
(`&#...;`), which is decoded as announced in the table's comment.

Character references are only accepted if they consist of digits, end with
`;` and denote a code point in the range 1..0x10FFFF that is not a surrogate;
everything else is left in the text untouched.

Function based upon ezxml_decode() from the "ezxml" parser which is
Copyright 2004-2006 Aaron Voisine <aaron@voisine.org> */
static char* xml_decode(char* s, char type)
{
	char* const m = s;    /* the caller's buffer; r != m means r is malloced */
	char*       r = s;    /* start of the buffer that is currently decoded */
	char*       e = NULL; /* points to the `;` that ends the current reference */
	char*       src = NULL;
	char*       dst = NULL;
	long        b = 0, c = 0, d = 0, l = 0;

	/* normalize line endings; a single forward copy never reads behind the
	terminating null-character, even if the string ends with `\r` */
	for( src = dst = s; *src; )
	{
		if( *src == '\r' ) {
			*dst++ = '\n';
			src++;
			if( *src == '\n' ) { src++; } /* `\r\n` counts as one line end */
		}
		else {
			*dst++ = *src++;
		}
	}
	*dst = '\0';

	for( s = r; ; )
	{
		/* <ctype.h> functions require values in the range of unsigned char */
		while( *s && *s != '&' && !isspace((unsigned char)*s) ) { s++; }

		if( !*s )
		{
			break;
		}
		else if( type != 'c' && strncmp(s, "&#", 2) == 0 )
		{
			/* character reference */
			int   is_hex = (s[2] == 'x' || s[2] == 'X');
			char* digits = s + (is_hex? 3 : 2);

			/* strtol() would also accept leading whitespace and signs, so make sure we start with a digit */
			if( !(is_hex? isxdigit((unsigned char)*digits) : isdigit((unsigned char)*digits)) ) {
				s++; /* not a character ref */
				continue;
			}

			c = strtol(digits, &e, is_hex? 16 : 10);
			if( *e != ';' || c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) ) {
				s++; /* not a (valid) character ref */
				continue;
			}

			/* encode as UTF-8; the result has at most 4 bytes and is always shorter than the reference itself */
			if( c < 0x80 ) {
				*(s++) = (char)c;
			}
			else if( c < 0x800 ) {
				*(s++) = (char)(0xC0 | (c >> 6));
				*(s++) = (char)(0x80 | (c & 0x3F));
			}
			else if( c < 0x10000 ) {
				*(s++) = (char)(0xE0 | (c >> 12));
				*(s++) = (char)(0x80 | ((c >> 6) & 0x3F));
				*(s++) = (char)(0x80 | (c & 0x3F));
			}
			else {
				*(s++) = (char)(0xF0 | (c >> 18));
				*(s++) = (char)(0x80 | ((c >> 12) & 0x3F));
				*(s++) = (char)(0x80 | ((c >> 6) & 0x3F));
				*(s++) = (char)(0x80 | (c & 0x3F));
			}

			memmove(s, e + 1, strlen(e)); /* strlen(e) == rest behind `;` plus the null-character */
		}
		else if( *s == '&' && (type == '&' || type == ' ') )
		{
			/* entity reference */
			for( b = 0; s_ent[b] && strncmp(s + 1, s_ent[b], strlen(s_ent[b])); b += 2 )
				; /* find entity in entity list */

			if( s_ent[b++] ) /* found a match, b is the index of the replacement now */
			{
				c = (long)strlen(s_ent[b]);
				e = strchr(s, ';'); /* always found as all names in the table end with `;` */

				if( c - 1 > e - s )
				{
					/* the replacement is longer than the reference, a larger buffer is needed */
					char* grown = NULL;
					d = (long)(s - r);
					l = d + c + (long)strlen(e); /* new length including the null-character */
					grown = (r == m)? malloc((size_t)l) : realloc(r, (size_t)l);
					if( grown == NULL ) {
						s++; /* out of memory: leave this reference undecoded */
						continue;
					}
					if( r == m ) { strcpy(grown, r); }
					r = grown;
					s = r + d; /* fix up pointers */
					e = strchr(s, ';');
				}

				memmove(s + c, e + 1, strlen(e)); /* shift rest of string */
				memcpy(s, s_ent[b], (size_t)c);   /* copy in replacement text */

				if( strncmp(s_ent[b], "&#", 2) != 0 ) {
					s += c; /* do not decode the replacement again, otherwise `&amp;lt;` would become `<` */
				}
				/* else: keep s, the inserted character reference is decoded by the next iteration */
			}
			else
			{
				s++; /* not a known entity */
			}
		}
		else if( type == ' ' && isspace((unsigned char)*s) )
		{
			*(s++) = ' ';
		}
		else
		{
			s++; /* no decoding needed */
		}
	}

	return r;
}
214 
215 
216 /*******************************************************************************
217  * Tools
218  ******************************************************************************/
219 
220 
/* the characters that are treated as whitespace inside tags */
#define XML_WS "\t\r\n "


/* Default handlers that do nothing. They are installed by mrsaxparser_init()
and used as replacements for NULL-handlers, so the parser can always call its
handlers without checking them first. */
static void def_starttag_cb(void* userdata, const char* tag, char** attr)
{
	(void)userdata;
	(void)tag;
	(void)attr;
}


static void def_endtag_cb(void* userdata, const char* tag)
{
	(void)userdata;
	(void)tag;
}


static void def_text_cb(void* userdata, const char* text, int len)
{
	(void)userdata;
	(void)text;
	(void)len;
}
227 
228 
229 static void call_text_cb(mrsaxparser_t* ths, char* text, size_t len, char type)
230 {
231  if( text && len )
232  {
233  char bak = text[len], *text_new;
234 
235  text[len] = '\0';
236  text_new = xml_decode(text, type);
237  ths->m_text_cb(ths->m_userdata, text_new, len);
238  if( text != text_new ) { free(text_new); }
239 
240  text[len] = bak;
241  }
242 }
243 
244 
/* Empties a null-terminated list of key/value pairs.

"attr" holds the pairs as attr[0]=key, attr[1]=value, attr[2]=key and so on;
"free_attr" holds one entry per pair. A key is freed if FREE_KEY is set in the
entry belonging to its pair, a value is freed if FREE_VALUE is set.
(we need this as we try to use the strings from the "main" document instead of
allocating small strings)

After the call, the list is empty, ie. attr[0] is NULL. */
static void do_free_attr(char** attr, int* free_attr)
{
	#define FREE_KEY   0x01
	#define FREE_VALUE 0x02
	int pair = 0;

	for( pair = 0; attr[pair*2] != NULL; pair++ )
	{
		const int flags = free_attr[pair];
		char**    kv    = &attr[pair*2];

		if( flags & FREE_KEY ) {
			free(kv[0]);
		}

		if( flags & FREE_VALUE ) {
			free(kv[1]); /* free() accepts NULL, a missing value does no harm */
		}
	}

	attr[0] = NULL; /* set list to zero-length */
}
259 
260 
261 /*******************************************************************************
262  * Main interface
263  ******************************************************************************/
264 
265 
/* Searches the null-terminated key/value list "attr" (as given to the
starttag-handler) for "key" and returns the value belonging to the first
matching key. The comparison is case-sensitive.
Returns NULL if the key is not found or if "attr" or "key" are NULL. */
const char* mrattr_find(char** attr, const char* key)
{
	char** pair = NULL;

	if( attr == NULL || key == NULL ) {
		return NULL;
	}

	for( pair = attr; *pair != NULL; pair += 2 )
	{
		if( strcmp(key, *pair) == 0 ) {
			return pair[1];
		}
	}

	return NULL;
}
280 
281 
282 void mrsaxparser_init(mrsaxparser_t* ths, void* userdata)
283 {
284  ths->m_userdata = userdata;
285  ths->m_starttag_cb = def_starttag_cb;
286  ths->m_endtag_cb = def_endtag_cb;
287  ths->m_text_cb = def_text_cb;
288 }
289 
290 
291 void mrsaxparser_set_tag_handler(mrsaxparser_t* ths, mrsaxparser_starttag_cb_t starttag_cb, mrsaxparser_endtag_cb_t endtag_cb)
292 {
293  if( ths == NULL ) {
294  return;
295  }
296 
297  ths->m_starttag_cb = starttag_cb? starttag_cb : def_starttag_cb;
298  ths->m_endtag_cb = endtag_cb? endtag_cb : def_endtag_cb;
299 }
300 
301 
302 void mrsaxparser_set_text_handler (mrsaxparser_t* ths, mrsaxparser_text_cb_t text_cb)
303 {
304  if( ths == NULL ) {
305  return;
306  }
307 
308  ths->m_text_cb = text_cb? text_cb : def_text_cb;
309 }
310 
311 
312 void mrsaxparser_parse(mrsaxparser_t* ths, const char* buf_start__)
313 {
314  char bak, *buf_start, *last_text_start, *p;
315 
316  #define MAX_ATTR 100 /* attributes per tag - a fixed border here is a security feature, not a limit */
317  char* attr[(MAX_ATTR+1)*2]; /* attributes as key/value pairs, +1 for terminating the list */
318  int free_attr[MAX_ATTR]; /* free the value at attr[i*2+1]? */
319 
320  attr[0] = NULL; /* null-terminate list, this also terminates "free_values" */
321 
322  if( ths == NULL ) {
323  return;
324  }
325 
326  buf_start = safe_strdup(buf_start__); /* we make a copy as we can easily null-terminate tag names and attributes "in place" */
327  last_text_start = buf_start;
328  p = buf_start;
329  while( *p )
330  {
331  if( *p == '<' )
332  {
333  call_text_cb(ths, last_text_start, p - last_text_start, '&'); /* flush pending text */
334 
335  p++;
336  if( strncmp(p, "!--", 3) == 0 )
337  {
338  /* skip <!-- ... --> comment
339  **************************************************************/
340 
341  p = strstr(p, "-->");
342  if( p == NULL ) { goto cleanup; }
343  p += 3;
344  }
345  else if( strncmp(p, "![CDATA[", 8) == 0 )
346  {
347  /* process <![CDATA[ ... ]]> text
348  **************************************************************/
349 
350  char* text_beg = p + 8;
351  if( (p = strstr(p, "]]>"))!=NULL ) /* `]]>` itself is not allowed in CDATA and must be escaped by dividing into two CDATA parts */ {
352  call_text_cb(ths, text_beg, p-text_beg, 'c');
353  p += 3;
354  }
355  else {
356  call_text_cb(ths, text_beg, strlen(text_beg), 'c'); /* CDATA not closed, add all remaining text */
357  goto cleanup;
358  }
359  }
360  else if( strncmp(p, "!DOCTYPE", 8) == 0 )
361  {
362  /* skip <!DOCTYPE ...> or <!DOCTYPE name [ ... ]>
363  **************************************************************/
364 
365  while( *p && *p != '[' && *p != '>' ) p++; /* search for [ or >, whatever comes first */
366  if( *p == 0 ) {
367  goto cleanup; /* unclosed doctype */
368  }
369  else if( *p == '[' ) {
370  p = strstr(p, "]>"); /* search end of inline doctype */
371  if( p == NULL ) {
372  goto cleanup; /* unclosed inline doctype */
373  }
374  else {
375  p += 2;
376  }
377  }
378  else {
379  p++;
380  }
381  }
382  else if( *p == '?' )
383  {
384  /* skip <? ... ?> processing instruction
385  **************************************************************/
386 
387  p = strstr(p, "?>");
388  if( p == NULL ) { goto cleanup; } /* unclosed processing instruction */
389  p += 2;
390  }
391  else
392  {
393  p += strspn(p, XML_WS); /* skip whitespace between `<` and tagname */
394  if( *p == '/' )
395  {
396  /* process </tag> end tag
397  **************************************************************/
398 
399  p++;
400  p += strspn(p, XML_WS); /* skip whitespace between `/` and tagname */
401  char* beg_tag_name = p;
402  p += strcspn(p, XML_WS "/>"); /* find character after tagname */
403  if( p != beg_tag_name )
404  {
405  bak = *p;
406  *p = '\0'; /* null-terminate tag name temporary, eg. a covered `>` may get important downwards */
407  mr_strlower_in_place(beg_tag_name);
408  ths->m_endtag_cb(ths->m_userdata, beg_tag_name);
409  *p = bak;
410  }
411  }
412  else
413  {
414  /* process <tag attr1="val" attr2='val' attr3=val ..>
415  **************************************************************/
416 
417  do_free_attr(attr, free_attr);
418 
419  char* beg_tag_name = p;
420  p += strcspn(p, XML_WS "/>"); /* find character after tagname */
421  if( p != beg_tag_name )
422  {
423  char* after_tag_name = p;
424 
425  /* scan for attributes */
426  int attr_index = 0;
427  while( isspace(*p) ) { p++; } /* forward to first attribute name beginning */
428  for( ; *p && *p != '/' && *p != '>'; attr_index += 2 )
429  {
430  char *beg_attr_name = p, *beg_attr_value = NULL, *beg_attr_value_new = NULL;
431 
432  p += strcspn(p, XML_WS "=/>"); /* get end of attribute name */
433  if( p != beg_attr_name )
434  {
435  /* attribute found */
436  char* after_attr_name = p;
437  p += strspn(p, XML_WS); /* skip whitespace between attribute name and possible `=` */
438  if( *p == '=' )
439  {
440  p += strspn(p, XML_WS "="); /* skip spaces and equal signs */
441  char quote = *p;
442  if( quote == '"' || quote == '\'' )
443  {
444  /* quoted attribute value */
445  p++;
446  beg_attr_value = p;
447 
448  while( *p && *p != quote ) { p++; }
449  if( *p ) {
450  *p = '\0'; /* null terminate attribute val */
451  p++;
452  }
453 
454  beg_attr_value_new = xml_decode(beg_attr_value, ' ');
455  }
456  else
457  {
458  /* unquoted attribute value, as the needed null-terminated may overwrite important characters, we'll create a copy */
459  beg_attr_value = p;
460  p += strcspn(p, XML_WS "/>"); /* get end of attribute value */
461  bak = *p;
462  *p = '\0';
463  char* temp = safe_strdup(beg_attr_value);
464  beg_attr_value_new = xml_decode(temp, ' ');
465  if( beg_attr_value_new!=temp ) { free(temp); }
466  *p = bak;
467  }
468  }
469  else
470  {
471  beg_attr_value_new = safe_strdup(NULL);
472  }
473 
474  /* add attribute */
475  if( attr_index < MAX_ATTR )
476  {
477  char* beg_attr_name_new = beg_attr_name;
478  int free_bits = (beg_attr_value_new != beg_attr_value)? FREE_VALUE : 0;
479  if( after_attr_name == p ) {
480  /* take care not to overwrite the current pointer (happens eg. for `<tag attrWithoutValue>` */
481  bak = *after_attr_name;
482  *after_attr_name = '\0';
483  beg_attr_name_new = safe_strdup(beg_attr_name);
484  *after_attr_name = bak;
485  free_bits |= FREE_KEY;
486  }
487  else {
488  *after_attr_name = '\0';
489  }
490 
491  mr_strlower_in_place(beg_attr_name_new);
492  attr[attr_index] = beg_attr_name_new;
493  attr[attr_index+1] = beg_attr_value_new;
494  attr[attr_index+2] = NULL; /* null-terminate list */
495  free_attr[attr_index>>1] = free_bits;
496  }
497  }
498 
499  while( isspace(*p) ) { p++; } /* forward to attribute name beginning */
500  }
501 
502  char bak = *after_tag_name; /* backup the character as it may be `/` or `>` which gets important downwards */
503  *after_tag_name = 0;
504  mr_strlower_in_place(beg_tag_name);
505  ths->m_starttag_cb(ths->m_userdata, beg_tag_name, attr);
506  *after_tag_name = bak;
507 
508  /* self-closing tag */
509  p += strspn(p, XML_WS); /* skip whitespace before possible `/` */
510  if( *p == '/' )
511  {
512  p++;
513  *after_tag_name = 0;
514  ths->m_endtag_cb(ths->m_userdata, beg_tag_name); /* already lowercase from starttag_cb()-call */
515  }
516  }
517 
518  } /* end of processing start-tag */
519 
520  p = strchr(p, '>');
521  if( p == NULL ) { goto cleanup; } /* unclosed start-tag or end-tag */
522  p++;
523 
524  } /* end of processing start-tag or end-tag */
525 
526  last_text_start = p;
527  }
528  else
529  {
530  p++;
531  }
532  }
533 
534  call_text_cb(ths, last_text_start, p - last_text_start, '&'); /* flush pending text */
535 
536 cleanup:
537  do_free_attr(attr, free_attr);
538  free(buf_start);
539 }
540