/*
 *  dancer-XML parser
 *  Copyright (C) 2000,2002,2003 Junichi Uekawa
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
/*

XML parser 

copyright 2000 Junichi Uekawa

started 8 Sep 2000
11 Sep 2000 fixed #include

AIM: Make an easy-to-use library for parsing ISO-2022-JP etc. files.
"M-;" makes a comment in emacs... -- what's the use?


#define PRESERVE_NL will preserve newlines in the parsed XML.
By default (PRESERVE_NL undefined) the whitespace that follows a tag,
comment or PI is skipped.  That is wrong XML-wise, but preserving it is
unlikely to be what you want in normal use...


 Notes on the usage:
  this piece of code skips all the headers, like the 
  <!--  ...
  <! ...
  <? ... 
  parts. So this might not be the most ideal code to use if you
  care about the DTD and the model.
*/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "dancer-xml.h"

/**@name Internal static functions used inside XML parser.
   This section describes functions that are used only inside the
   XML parsing system.
 */
/*@{*/

/* Forward declaration: dxml_free_attrib() is defined near the end of this
   file but is needed by the parsing functions that precede it. */
static void dxml_free_attrib(dxml_attribute * a); /* declare */

/* Error-return shorthand for functions that return a pointer.
   The expansion already ends in ';', so writing "ERROR_RETURN;" yields
   "return NULL;;" (a harmless empty statement). */
#define ERROR_RETURN return NULL;

/**
 * malloc with an error message on error, exits with 1.
 *
 * This helper never returns NULL: if the allocation fails a diagnostic is
 * written to stderr and the process terminates with exit status 1.
 * The returned memory is uninitialised and must be released with free().
 *
 * @return pointer to the newly allocated memory
 */
static void *
mallocwm(int l /** size of memory to allocate*/)
{
  /* malloc(0) is allowed to return NULL without being out of memory;
     request one byte in that case so that NULL always means failure.
     A negative size still converts to a huge size_t and fails. */
  void * t = malloc(l == 0 ? (size_t) 1 : (size_t) l);
  if (!t)
    {
      /* This helper is shared by every allocating function in the file,
         so the message must not name one particular caller. */
      fprintf(stderr, "Out of memory (requested %d bytes)\n", l);
      exit(1);
    }
  return t;  
}


/**
 * Consume a run of whitespace (space, tab, CR, LF) from the stream.
 *
 * The first character that is not whitespace is pushed back with
 * ungetc(), so the caller will read it next.
 *
 * @return 0 when a non-whitespace character is pending, 1 on EOF
 */
static int 
skipwhitechars(FILE*f)
{
  int ch;

  /* swallow characters for as long as they are whitespace */
  do
    {
      ch = getc(f);
    }
  while (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n');

  if (ch == EOF)
    return 1;

  ungetc(ch, f);
  return 0;
}

/**
 * Read one character and verify that it is the expected one.
 *
 * The character is consumed in every case; it is NOT pushed back when it
 * does not match.  A diagnostic is printed to stderr on mismatch or EOF.
 *
 * @return 0 if the character matched, 1 on mismatch or end-of-file
 */
static int 
checkchar (int expect, FILE*f)
{
  int got = getc(f);

  if (got == expect)
    return 0;

  if (got != EOF)
    fprintf(stderr, "Expected token '%c', but got '%c' instead \n", expect, got);
  else
    fprintf(stderr, "Unexpected end-of-file\n");

  return 1;
}

/**
 * Allocate a fresh element node and put it into a known state.
 *
 * The node is allocated with mallocwm(); its name, attribute list, child
 * and next pointers are set to NULL and its type is set to
 * element_type_element.
 *
 * @return the new element
 */
static dxml_element *
initialize_element(void)
{
  dxml_element * e = mallocwm(sizeof *e);

  e->element_type = element_type_element;
  e->element_name = NULL;
  e->element_attribute = NULL;
  e->child = NULL;
  e->next = NULL;

  return e;
}

/** 
 * read character stream
 *
 * Reads characters from f into a freshly allocated, NUL-terminated buffer
 * until one of the characters in stop_chars or EOF is seen.  The stop
 * character itself is pushed back onto the stream and is not part of the
 * result.  Reaching EOF is not an error: whatever was read so far is
 * returned (possibly the empty string).
 *
 * The caller owns the returned buffer and must free() it.
 *
 * @param f           stream to read from
 * @param stop_chars  NUL-terminated set of characters that end the read
 * @return the collected string, or NULL if growing the buffer failed
 */
static char *
read_character_stream (FILE*f, const char * stop_chars)
{
  enum { SINGLEBUFSIZ = 512 };	/* initial size and growth increment */
  size_t currentbufsiz = SINGLEBUFSIZ;
  size_t i = 0;
  char * buf = mallocwm (SINGLEBUFSIZ);
  int c;

  while ((c = getc (f)) != EOF)
    {
      /* strchr() also matches the terminating NUL of stop_chars, so a
         NUL byte in the input must be excluded explicitly.  Otherwise it
         would act as a stop character that is pushed back and never
         consumed, and callers that retry would loop forever.  It is
         stored like any other byte; the result, seen as a C string,
         then appears to end at that position. */
      if (c != '\0' && strchr (stop_chars, c))
	{
	  ungetc (c, f);
	  break;
	}
      buf[i++] = (char) c;
      /* always keep room for one more character plus the terminator */
      if (i + 2 > currentbufsiz)
	{
	  /* use a temporary so that the old buffer is not lost (and can
	     be released) when realloc fails */
	  char * grown = realloc (buf, currentbufsiz + SINGLEBUFSIZ);
	  if (!grown)
	    {
	      fprintf (stderr, "Out of memory for realloc in %s %i\n", __FILE__, __LINE__);
	      free (buf);
	      return NULL;
	    }
	  buf = grown;
	  currentbufsiz += SINGLEBUFSIZ;
	}
    }
  buf [i] = 0;
  return buf;
}

/**
 * Read a run of character data and wrap it in an element node.
 *
 * The text is read up to (not including) the next '<' or '>' and is
 * stored in element_name of a node whose type is element_type_pcdata.
 *
 * @return the new PCDATA node, or NULL if the text could not be read.
 */
static dxml_element * 
read_PCDATA(FILE*f)
{
  dxml_element * pcdata = initialize_element();

  pcdata->element_type = element_type_pcdata;
  pcdata->element_name = read_character_stream (f, "<>");

  if (pcdata->element_name)
    return pcdata;

  /* reading failed: discard the half-built node */
  dxml_free_xml(pcdata);
  return NULL;
}


/**
 * read attributes, and end with / (/>) or > and give it back to the caller
 *
 * Parses a sequence of  name="value"  or  name='value'  pairs into a
 * linked list.  Leading whitespace is skipped.  Whitespace (including
 * newlines) is accepted between the name, the '=' and the opening quote,
 * as XML permits.  The terminating '>' or '/' is left unread on the
 * stream for the caller.
 *
 * The list is built by recursion: each call reads one attribute and
 * links the result of the next call into its next pointer.  If a later
 * attribute fails to parse, the attributes read before it are kept.
 *
 * A value that does not start with " or ' (including EOF at that point)
 * is fatal: a message is printed and the process exits with status 1.
 *
 * @return head of the attribute list; NULL if no attribute or error.
 *
 */
static dxml_attribute *
read_attribute(FILE*f)
{
  dxml_attribute* new_attribute;
  int c, startquote;
  
  /* EOF before anything else: nothing has been allocated yet */
  if (skipwhitechars(f)) return NULL;

  /* check for /> and > (peek only; the character stays on the stream) */
  c = getc(f);
  ungetc(c,f);

  if ((c== '>') || (c == '/'))
    {
      /* non-error return. */
      return NULL;
    }

  /* allocate only once it is known that an attribute follows */
  new_attribute = mallocwm(sizeof(dxml_attribute));
  new_attribute -> attribute_name = NULL;
  new_attribute -> attribute_data = NULL;
  new_attribute -> next = NULL; /* to be safe */
  
  /* read the attribute name; any whitespace character ends it */
  if (!(new_attribute->attribute_name = read_character_stream(f, "> /\t\r\n=")))
    goto fail;
  
  /* read = , allowing whitespace on either side.  The return values of
     skipwhitechars are deliberately ignored: an EOF is reported by
     checkchar, or by the quote check below. */
  skipwhitechars(f);
  if (checkchar('=',f))
    goto fail;
  skipwhitechars(f);
  
  /* read the attribute value */
  startquote=getc(f);
  switch(startquote)
    {
    case '"':
      new_attribute->attribute_data = read_character_stream(f, "\"");
      break;
    case '\'':
      new_attribute->attribute_data = read_character_stream(f, "'");
      break;
    default:
      fprintf(stderr, "Error with startquote, needs to be \" or '\n");
      exit(1);
    }
  if (!new_attribute->attribute_data) /* buffer could not be grown */
    goto fail;
  if (checkchar(startquote,f)) /* fails only if EOF came before the closing quote */
    goto fail;

  /* recurse-call */
  new_attribute -> next = read_attribute(f);
    
  /* end */
  return new_attribute;  

 fail:
  /* releases the node together with whatever fields were filled in */
  dxml_free_attrib (new_attribute);
  return NULL;
}

/** 
 * Skip over a processing instruction or a DOCTYPE-style declaration.
 *
 * Characters are discarded up to and including the next '>'.  A section
 * enclosed in '[' ... ']' is skipped as a whole, so a '>' inside it does
 * not end the construct.  Unless PRESERVE_NL is defined, the whitespace
 * after the closing '>' is skipped as well; hitting EOF while doing so is
 * reported as an error.
 *
 * @return 0 on success, 1 on error.
 */
static int
skip_read_PI(FILE*f, int strict /** non-zero: a '<' inside is an error */)
{
  int ch;

  for (;;)
    {
      ch = getc(f);
      if (ch == '>')
	break;

      if (ch == EOF)
	{
	  fprintf (stderr, "Premature EOF found reading PI\n");
	  return 1;
	}

      if (ch == '[')
	{
	  /* bracketed part of a !DOCTYPE: ignore everything up to ']'.
	     On EOF the outer loop reads EOF again and reports it. */
	  do
	    {
	      ch = getc(f);
	    }
	  while (ch != ']' && ch != EOF);
	  continue;
	}

      if (ch == '<' && strict)
	{
	  /* a '<' is not expected inside a PI */
	  fprintf (stderr, "While reading PI, an extra < was encountered\n");
	  return 1;
	}
    }

#ifndef PRESERVE_NL
  if (skipwhitechars(f))	/* drop trailing whitespace */
    return 1;
#endif
  return 0;
}

/**
 * Skip a construct that started with "<!".
 *
 * If the next two characters are "--" the construct is a comment and
 * input is discarded up to and including the terminating "-->".
 * Otherwise the offending character is pushed back and the rest is
 * handed to skip_read_PI() in non-strict mode (DOCTYPE and the like).
 * Unless PRESERVE_NL is defined, whitespace after the comment is skipped
 * too, and EOF at that point counts as an error.
 *
 * @return 0 on success, 1 on error
 */
static int
skip_read_comment(FILE*f)
{
  int ch;
  int dashes;
  int last = 0, before_last = 0;

  /* a comment needs two leading '-'; anything else is a declaration */
  for (dashes = 0; dashes < 2; dashes++)
    {
      ch = getc(f);
      if (ch != '-')
	{
	  ungetc(ch, f);
	  return skip_read_PI(f, 0);
	}
    }

  /* scan for "-->", remembering the two characters read before ch */
  for (ch = getc(f); ch != EOF; ch = getc(f))
    {
      if (ch == '>' && last == '-' && before_last == '-')
	break;
      before_last = last;
      last = ch;
    }

#ifndef PRESERVE_NL
  if (skipwhitechars(f))	/* drop trailing whitespace */
    return 1;
#endif

  return 0;
}

/** 
 *  Classify what follows an already consumed '<'.
 *
 *  A '?' starts a PI and a '!' starts a comment or declaration; both are
 *  skipped here.  Any other character is pushed back so the caller can
 *  read it as the first character of the tag name.
 *
 *   @return 0 if a PI/comment was skipped and no further processing is
 *   required, 1 if it is a normal start tag, 2 on error.
 */
static int 
read_element_start(FILE*f)
{
  int lead = getc(f);

  switch (lead)
    {
    case '?':
      /* processing instruction: '<' inside it is treated as an error */
      return skip_read_PI(f, 1) ? 2 : 0;

    case '!':
      /* comment or declaration */
      return skip_read_comment(f) ? 2 : 0;

    default:
      ungetc (lead, f);
      return 1;
    }
}


/**
 * Read an element tag start -- end.
 *
 *    assume that the pointer is at 
 *    <tagname att="...">
 *     ^                      i.e. "<" is already read.
 *
 * Reads the tag name and attributes, then either stops at "/>" (empty
 * element) or reads the content -- nested elements and PCDATA -- until
 * the matching end tag.  Children are appended in document order through
 * the child / next pointers.  Nested elements are read by recursion.
 *
 * Whitespace directly after the start tag is always skipped.  Whitespace
 * after the end of the element is skipped unless PRESERVE_NL is defined.
 *
 * A stray '>' inside the content is fatal (message, then exit(1)).
 *
 * On every error return the partially built element, including the
 * children read so far, is released.
 *
 * @return the element; NULL if the construct was a comment, PI or
 *         declaration (already skipped), and NULL if error.  The two
 *         NULL cases cannot be told apart by the caller.
 */
static dxml_element *
read_element(FILE*f)
{
  int c;
  char * closing_name;
  dxml_element * new_element;
  dxml_element * current_element_bottom = NULL ;

  switch(read_element_start(f))
    {
    case 0:			/* comment / PI was skipped */
      return NULL;
    case 1:			/* ordinary start tag */
      break;
    case 2:
      fprintf (stderr, "Reading element failed due to possibly unclosed braces.\n");
      return NULL;
    }
  
  /* allocate only now, so the early returns above have nothing to free */
  new_element = initialize_element();
  
  /* read element name and list of attributes.  The name ends at any
     whitespace, the same set that is used for the end tag below, so a
     newline after the name does not become part of it. */
  if (!(new_element->element_name = read_character_stream (f, "> /\t\n\r")))
    goto fail;
  new_element->element_attribute = read_attribute(f);

  /* check if it is terminated with / ...  < something  / >*/
  c=getc(f);
  if (c == EOF)
    {
      fprintf(stderr, "Unexpected EOF found while parsing <%s\n",
	      new_element->element_name);
      goto fail;
    }
  
  if (c == '/')
    {
      if (checkchar('>',f))
	goto fail;		/* this is error return. */
      /* it is the end of the process... */
#ifndef PRESERVE_NL
      skipwhitechars(f);
#endif
      return new_element;      
    }

  /* do this for its PCDATA and elements included */
  skipwhitechars(f);
  while(1)
    {
      dxml_element * tmp_element;

      c = getc(f);
      if (c== EOF) 
	{
	  fprintf(stderr, "unexpected EOF\n");
	  goto fail;
	}
      else if (c== '<') /* XML thing is there...*/
	{
	  c = getc(f);
	  if (c=='/') /* end-tag */
	    break;
	  ungetc(c, f);
	  /* NULL here means a skipped comment/PI or a failed child;
	     either way nothing is appended. */
	  tmp_element = read_element(f); /* read another element */
	}
      else if (c == '>')
	{			/* handle error case, > shouldn't be here */
	  fprintf(stderr,
		  "Unexpected > in document\n");
	  exit (1);
	}
      else
	{
	  ungetc(c,f);
	  tmp_element = read_PCDATA(f); /* read PCDATA */
	}

      /* adding the child to the bottom of the tree... */
      if (tmp_element)
	{			/* only if non-null */
	  if (new_element -> child) /* this var is initially NULL */
	    current_element_bottom -> next = tmp_element ;
	  else 
	    new_element -> child = tmp_element ;
	  current_element_bottom = tmp_element ;
	}
    }  
  
  /*
   * check if the element is closed correctly 
   * at this point,  "</"  has already been read
   */
  closing_name = read_character_stream (f, "> \t\n\r");
  if (!closing_name)		/* must not be handed to strcmp */
    goto fail;
  if (strcmp(new_element->element_name, closing_name))
    {
      fprintf(stderr,
	      "Bad element name, tag opened with %s and closed with %s\n",
	      new_element->element_name, closing_name);
      free (closing_name);
      goto fail;
    }
  free (closing_name);
  skipwhitechars(f);
  if (checkchar('>',f))
    goto fail;
  
#ifndef PRESERVE_NL
  skipwhitechars(f);/* strip end chars */
#endif

  /* end ... */
  return new_element;

 fail:
  /* single cleanup point: frees name, attributes and all children */
  dxml_free_xml(new_element);
  return NULL;
}

/**
 * Read a XML sequence from file.
 *
 * Everything before the first '<' is discarded, with one warning on
 * stderr per discarded character.  Then read_element() is tried
 * repeatedly: each time it yields NULL (for instance after skipping a
 * "<!--" or "<?" construct) whitespace is skipped and the next '<' is
 * required before trying again.
 *
 * @return the first element that could be read, NULL on error or EOF
 */
dxml_element* 
dxml_read_xml(FILE*f)
{ 
  dxml_element * root;
  int ch;

  /* advance to the first '<' */
  for (ch = getc(f); ch != '<'; ch = getc(f))
    {
      if (ch == EOF)
	return NULL;
      fprintf(stderr, "Garbage is loaded in read_xml prologue\n");
    }

  /* keep going until an actual element turns up */
  for (;;)
    {
      root = read_element(f);
      if (root != NULL)
	return root;

      skipwhitechars(f);
      if (checkchar('<',f))
	return NULL;
    }
}

/**
 * Release an attribute list.
 *
 * Frees the name, the data and the node itself for the given attribute
 * and for every attribute reachable through next.  Passing NULL is
 * reported on stderr as an internal error and otherwise ignored.
 */
static void
dxml_free_attrib(dxml_attribute * a)
{
  dxml_attribute * following;

  if (a == NULL)
    {
      fprintf (stderr, "loadxml, internal error: a is NULL\n");
      return;
    }

  /* walk the list, remembering the successor before freeing the node */
  while (a != NULL)
    {
      following = a->next;
      free(a->attribute_name);	/* free(NULL) is a no-op */
      free(a->attribute_data);
      free(a);
      a = following;
    }
}

/**
 * Release an element tree.
 *
 * Frees the given element, its attribute list, all of its children and
 * every sibling reachable through next.  Passing NULL is reported on
 * stderr as an internal error and otherwise ignored.
 */
void
dxml_free_xml(dxml_element * e)
{
  if (e == NULL)
    {
      fprintf (stderr, "loadxml, internal error: e is NULL\n");
      return;
    }

  /* siblings are handled by the loop, children by recursion */
  while (e != NULL)
    {
      dxml_element * sibling = e->next;

      free (e->element_name);	/* free(NULL) is a no-op */
      if (e->element_attribute)
	dxml_free_attrib(e->element_attribute);
      if (e->child)
	dxml_free_xml(e->child);
      free (e);

      e = sibling;
    }
}

/*@}*/
