/*
 * unhtml: parse a MIME multipart message and, if `multipart/alternative',
 * discard all but the `text/plain' part.
 *
 * This code is in the public domain except where noted otherwise.
 *
 * Tim Pierce <twp@rootsweb.com>
 * 3 June 1999
 */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>

#define CHUNKSIZE 4096

/*
 * A node in the singly-linked list of sub-parts attached to a message.
 */
struct list {
  struct message *data;	/* the sub-part held by this node (may be NULL) */
  struct list *next;	/* following node, or NULL at the end of the list */
};

/*
 * The `message' struct is used to represent a MIME multipart message,
 * as defined in RFC 2045 and RFC 2046.
 *
 * The `header' and `preamble' fields are flat text arrays containing
 * the raw text of the message header and preamble (any text preceding
 * the first body part).  In the case of a single-part message, the
 * `preamble' field is used to store the whole body.
 *
 * The `epilogue' field stores any text following the last body part.
 *
 * The `content_type' field contains the value of the message's
 * Content-Type header as returned by getheader(), i.e. everything
 * after the header name up to the end of the (possibly folded) line,
 * parameters included: for example, "text/plain; charset=us-ascii".
 * It is NULL if the message lacks a Content-Type header.  The
 * `boundary' field contains the value of the `boundary' parameter to
 * the Content-Type field, if present.
 *
 * The `hsize' field is set by mime_parse() to the length of the header
 * text in bytes.  The `bsize' field is not assigned by any code visible
 * in this file.  Neither is presently used and both may be discarded.
 */

struct message {
  char *header;		/* raw header text, including its final newline */
  char *preamble;	/* text (if any) preceding the first body-part */
  char *epilogue;	/* text (if any) following the last body-part */
  char *content_type;	/* Content-Type value, or NULL if absent */
  char *boundary;	/* multipart boundary string, or NULL */
  int is8bit;		/* does this message contain 8bit data? */
  int hsize;	/* bytes allocated for hdr (may be more than necessary) */
  int bsize;	/* bytes allocated for body (may be more than necessary) */
  struct list *parts;	/* list of sub-parts, or NULL if there are none */
};

/*
 * Forward declarations for the routines defined later in this file.
 * Each routine is documented at its definition.
 */
struct message *mime_parse (char *body);
void mime_write (struct message *msg);
void mime_warn (char *s);
void mime_fatal (char *s);
void mime_destroy (struct message *msg);
int check_msgtype (struct message *msg, char *type);
void getheader (struct message *msg, char *hdrname, char **hdrvalp, int *hdrlen);
char *next_boundary (char *body, char *boundary);
int add_subpart (struct message *msg, struct message *part);
void print_msg_info (struct message *msg, int indent);
void mime_decode_qp (struct message *msg);
void mime_decode_b64 (char *buf);
void mime_output_qp (char *text);	/* currently an empty stub */

/*
 * is_header_field: return nonzero if the header text at LINE begins
 *	with NAME (which should include the trailing colon), compared
 *	without regard to case.
 */
static int
is_header_field (const char *line, const char *name)
{
  return strncasecmp (line, name, strlen (name)) == 0;
}

/*
 * write_rewritten_header: copy HEADER to standard output one field at
 *	a time, dropping Content-Length and Content-Transfer-Encoding
 *	and replacing Content-Type with NEW_TYPE.  If NEW_TYPE is NULL
 *	the Content-Type field is dropped as well.
 *
 * A field is a line together with any continuation lines (lines that
 * begin with whitespace).  Working field by field means that the very
 * first field, and a field that directly follows a dropped one, are
 * treated exactly like any other.
 */
static void
write_rewritten_header (const char *header, const char *new_type)
{
  const char *field = header;

  while (*field != '\0')
    {
      const char *end = field;

      /* Find the end of this field: the first newline that is not
	 followed by whitespace, or the end of the text. */
      do
	{
	  const char *nl = strchr (end, '\n');
	  if (nl == NULL)
	    {
	      end += strlen (end);
	      break;
	    }
	  end = nl + 1;
	}
      while (isspace ((unsigned char) *end));

      if (is_header_field (field, "Content-Length:")
	  || is_header_field (field, "Content-Transfer-Encoding:"))
	{
	  /* Dropped: neither describes the body we are about to write. */
	}
      else if (is_header_field (field, "Content-Type:"))
	{
	  if (new_type != NULL)
	    printf ("Content-Type: %s\n", new_type);
	}
      else
	fwrite (field, 1, (size_t) (end - field), stdout);

      field = end;
    }
}

/*
 * main: read a message from the file named by argv[1] (or from
 *	standard input if no argument is given), and write it to
 *	standard output.  If the message is multipart/alternative and
 *	has a part that is text/plain (or has no Content-Type), only
 *	that part is written, under a rewritten copy of the top-level
 *	header.  Otherwise the message is written unchanged.
 *
 * Returns 0 after writing; exits with status 1 if the input cannot be
 * opened or read, or if memory runs out while reading it.
 */
int
main (int argc, char **argv)
{
  struct message *msg;
  struct list *part;
  char *body;
  size_t used = 0;
  int len;

  if (argc > 1)
    {
      if (freopen (argv[1], "r", stdin) == NULL)
	{
	  perror ("unhtml: freopening standard input");
	  exit (1);
	}
    }

  /* Read the message.  The buffer always has CHUNKSIZE free bytes
     past `used', so there is room for the next read and for the
     terminating NUL. */
  body = malloc (CHUNKSIZE);
  if (body == NULL)
    {
      perror ("unhtml: could not malloc memory for body");
      exit (1);
    }

  while ((len = read (fileno (stdin), body + used, CHUNKSIZE)) > 0)
    {
      char *grown;

      used += (size_t) len;
      grown = realloc (body, used + CHUNKSIZE);
      if (grown == NULL)
	{
	  perror ("unhtml: could not realloc memory for body");
	  free (body);
	  exit (1);
	}
      body = grown;
    }
  if (len < 0)
    {
      /* Do not pass a silently truncated message downstream. */
      perror ("unhtml: reading standard input");
      exit (1);
    }
  body[used] = '\0';

  msg = mime_parse (body);

  /*
   * Here's the important stuff: walk through the parts of a
   * multipart/alternative and look for a text/plain attachment.
   * If we find one, rewrite the headers of the parent message
   * and output the text/plain body.  A failed parse (msg == NULL)
   * falls through to the pass-through case below.
   */
  if (msg != NULL && msg->content_type != NULL
      && !strncasecmp (msg->content_type, "multipart/alternative", 21))
    {
      for (part = msg->parts; part != NULL; part = part->next)
	{
	  struct message *alt = part->data;

	  if (alt == NULL)
	    continue;
	  if (alt->content_type == NULL
	      || !strncasecmp (alt->content_type, "text/plain", 10))
	    {
	      write_rewritten_header (msg->header, alt->content_type);
	      putchar ('\n');
	      if (alt->preamble != NULL)
		puts (alt->preamble);
	      return 0;
	    }
	}
    }

  /*
   * If we got here, the message could not be parsed, wasn't
   * multipart/alternative, or didn't have a text/plain component.
   * In each case we give up and write the original message to stdout.
   */
  fputs (body, stdout);
  return 0;
}

/*
 * copy_boundary_value: return a freshly malloc'd copy of the boundary
 *	parameter value that starts at P (just past the `=' and any
 *	whitespace), or NULL if memory cannot be allocated.
 *
 * A value that starts with a double quote is read as a quoted-string,
 * with backslash escapes removed.  Otherwise characters are copied up
 * to the first whitespace or special character.
 */
static char *
copy_boundary_value (const char *p)
{
  char *value, *dest;

  /* The copy can never be longer than the remaining text; the extra
     byte is for the terminating NUL. */
  dest = value = malloc (strlen (p) + 1);
  if (value == NULL)
    return NULL;

  if (*p == '"')
    {
      ++p;
      while (*p != '\0' && *p != '"')
	{
	  /* A backslash quotes the next character, unless it is the
	     last character of the text. */
	  if (*p == '\\' && p[1] != '\0')
	    ++p;
	  *dest++ = *p++;
	}
    }
  else	/* Generic non-special characters. */
    {
      while (*p != '\0' && !strchr (" \t\v\r\n\f()<>@,;:\\\"/[]?=", *p))
	*dest++ = *p++;
    }
  *dest = '\0';
  return value;
}

/*
 * encoding_is: return nonzero if the LEN-byte header value at VALUE,
 *	ignoring trailing whitespace, is exactly NAME (compared without
 *	regard to case).
 */
static int
encoding_is (const char *value, int len, const char *name)
{
  size_t namelen = strlen (name);

  while (len > 0 && isspace ((unsigned char) value[len - 1]))
    --len;
  return (size_t) len == namelen && strncasecmp (value, name, namelen) == 0;
}

/*
 * mime_parse: process an RFC 2046 multipart message and return
 *	a message structure with all the necessary fields filled in.
 *
 * The BODY argument is a NUL-terminated character array containing the
 * raw text of the message to be parsed.  While sub-parts are being
 * parsed the array is modified temporarily; it is restored before
 * mime_parse returns.
 *
 * If the message structure itself cannot be allocated the program
 * exits with status 1.  On any other allocation failure, or on a fatal
 * MIME parsing error, a message is printed on standard error,
 * everything built so far is handed to mime_destroy, and the return
 * value is NULL.
 *
 * For a message/rfc822 message the single entry in `parts' holds the
 * result of parsing the encapsulated message, which is NULL if that
 * inner parse failed.
 */

struct message *
mime_parse (char *body)
{
  char *p, *bodyp;
  int len;
  struct message *msg;

  /* Initialize the message. */
  msg = malloc (sizeof *msg);
  if (msg == NULL)
    {
      perror ("unhtml: mime_parse could not malloc new message struct");
      exit (1);
    }
  msg->header = NULL;
  msg->preamble = NULL;
  msg->epilogue = NULL;
  msg->content_type = NULL;
  msg->boundary = NULL;
  msg->parts = NULL;
  msg->is8bit = 0;
  msg->hsize = 0;
  msg->bsize = 0;

  /* Get the header. */
  /* Special case for message with zero-length header. */
  if (body[0] == '\n')
    {
      msg->header = malloc (1);
      if (msg->header == NULL)
	{
	  perror ("unhtml: mime_parse could not malloc header buffer");
	  mime_destroy (msg);
	  return NULL;
	}
      msg->header[0] = '\0';
      bodyp = body + 1;
    }
  else
    {
      bodyp = strstr (body, "\n\n");
      if (bodyp == NULL)
	{
	  mime_fatal ("no message header found");
	  mime_destroy (msg);
	  return NULL;
	}
      /* The header keeps the newline that ends its last line. */
      msg->hsize = (int) (bodyp - body) + 1;
      msg->header = malloc ((size_t) msg->hsize + 1);
      if (msg->header == NULL)
	{
	  perror ("unhtml: mime_parse could not malloc header buffer");
	  mime_destroy (msg);
	  return NULL;
	}
      memcpy (msg->header, body, (size_t) msg->hsize);
      msg->header[msg->hsize] = '\0';
      bodyp += 2;
    }

  /* Find the content-type. */
  getheader (msg, "Content-Type", &p, &len);
  if (p != NULL)
    {
      msg->content_type = malloc ((size_t) len + 1);
      if (msg->content_type == NULL)
	{
	  perror ("unhtml: mime_parse could not malloc Content-Type buffer");
	  mime_destroy (msg);
	  return NULL;
	}
      memcpy (msg->content_type, p, (size_t) len);
      msg->content_type[len] = '\0';
    }

  /*
   * If this is a message/rfc822, then the body is an encapsulated
   * message.  Parse it, attach the result to the current message,
   * and we're done.
   */
  if (msg->content_type && !strcasecmp (msg->content_type, "message/rfc822"))
    {
      msg->parts = malloc (sizeof *msg->parts);
      if (msg->parts == NULL)
	{
	  perror ("unhtml: mime_parse could not malloc attachment list");
	  mime_destroy (msg);
	  return NULL;
	}
      msg->parts->data = mime_parse (bodyp);
      msg->parts->next = NULL;
      return msg;
    }

  /* Find the message boundary. */
  if (msg->content_type && !strncasecmp (msg->content_type, "multipart/", 10))
    {
      /* Skip to next semicolon and see what keyword follows it. */
      p = msg->content_type;
      while ((p = strchr (p, ';')) != NULL)
	{
	  ++p;
	  p += strspn (p, " \t\v\r\n");
	  if (strncasecmp (p, "boundary", 8) != 0)
	    continue;

	  p += 8;
	  p += strspn (p, " \t\v\r\n");
	  if (*p++ != '=')
	    {
	      mime_fatal ("expected `=' after `boundary' parameter");
	      mime_destroy (msg);
	      return NULL;
	    }
	  p += strspn (p, " \t\v\r\n");
	  msg->boundary = copy_boundary_value (p);
	  if (msg->boundary == NULL)
	    {
	      perror ("unhtml: mime_parse could not malloc boundary");
	      mime_destroy (msg);
	      return NULL;
	    }
	  break;
	}
      if (msg->boundary == NULL)
	{
	  mime_fatal ("Content-Type lacks required `boundary' parameter");
	  mime_destroy (msg);
	  return NULL;
	}
    }

  /* Break up multiparts. */
  if (check_msgtype (msg, "multipart/"))
    {
      char *nextpart;
      struct message *part;

      /* Preamble. */
      p = next_boundary (bodyp, msg->boundary);
      if (p == NULL)
	{
	  msg->preamble = strdup (bodyp);
	  if (msg->preamble == NULL)
	    {
	      perror ("unhtml: mime_parse could not strdup message preamble");
	      mime_destroy (msg);
	      return NULL;
	    }
	}
      else
	{
	  long psize = (long) (p - bodyp) - (long) strlen (msg->boundary) - 3;

	  /* Special case: a boundary line may occur at the very beginning
	     of the body, which means that no newline precedes it and psize
	     is negative. */
	  if (psize < 0)
	    psize = 0;
	  msg->preamble = malloc ((size_t) psize + 1);
	  if (msg->preamble == NULL)
	    {
	      perror ("unhtml: mime_parse could not malloc preamble buffer");
	      mime_destroy (msg);
	      return NULL;
	    }
	  memcpy (msg->preamble, bodyp, (size_t) psize);
	  msg->preamble[psize] = '\0';
	}

      /* Scan to each boundary and parse the body part contained therein. */
      while (p != NULL && strncmp (p, "--\n", 3) != 0)
	{
	  /* P normally points at the newline that ends the boundary
	     line.  Step over it, but never over the terminating NUL. */
	  if (*p != '\0')
	    ++p;
	  nextpart = next_boundary (p, msg->boundary);
	  if (nextpart == NULL)
	    {
	      char buf[512];
	      snprintf (buf, sizeof buf, "no terminating `%s' boundary found",
			msg->boundary);
	      mime_warn (buf);
	      break;
	    }
	  else
	    {
	      char *part_end = nextpart - strlen (msg->boundary) - 3;
	      char c = *part_end;

	      /* XXX: Parsing a body part should not require munging
		 the buffer passed to mime_parse. */
	      *part_end = '\0';
	      part = mime_parse (p);
	      *part_end = c;
	      if (part == NULL)
		{
		  mime_destroy (msg);
		  return NULL;
		}
	      if (!add_subpart (msg, part))
		{
		  /* PART was not linked into MSG, so release it here. */
		  mime_destroy (part);
		  mime_destroy (msg);
		  return NULL;
		}
	    }
	  p = nextpart;
	}

      /* Get epilogue: whatever follows the line P points into. */
      if (p != NULL)
	{
	  while (*p != '\0' && *p != '\n')
	    ++p;
	  if (*p == '\n')
	    ++p;
	  if ((msg->epilogue = strdup (p)) == NULL)
	    {
	      perror ("unhtml: mime_parse could not strdup message epilogue");
	      mime_destroy (msg);
	      return NULL;
	    }
	}
    }
  else	/* not multipart */
    {
      msg->preamble = strdup (bodyp);
      if (msg->preamble == NULL)
	{
	  perror ("unhtml: mime_parse could not strdup message body");
	  mime_destroy (msg);
	  return NULL;
	}
    }

  /* Decode the body (preamble), if appropriate. */
  getheader (msg, "Content-Transfer-Encoding", &p, &len);
  if (p != NULL)
    {
      if (encoding_is (p, len, "quoted-printable"))
	mime_decode_qp (msg);
      else if (encoding_is (p, len, "base64"))
	mime_decode_b64 (msg->preamble);
      else if (strncasecmp (p, "7bit", 4) != 0 &&
	       strncasecmp (p, "8bit", 4) != 0 &&
	       strncasecmp (p, "binary", 6) != 0)
	mime_warn ("unknown Content-Transfer-Encoding");
    }

  return msg;
}

/*
 * check_msgtype: report whether the Content-Type of MSG starts with
 *	the text TYPE, compared without regard to case.
 *
 * Returns 1 if it does.  Returns 0 if it does not, or if MSG has no
 * Content-Type at all.
 */

int
check_msgtype (struct message *msg, char *type)
{
  if (msg->content_type == NULL)
    return 0;
  if (strncasecmp (msg->content_type, type, strlen (type)) != 0)
    return 0;
  return 1;
}

/*
 * getheader: examine a MIME message for a particular header, and
 *	record the location of that header's value (following the header name)
 *	and its length (excluding the trailing newline).
 *
 * The MSG argument is a message structure containing a parsed MIME message.
 * The HDRNAME argument is the name of the desired header, e.g. "Content-Type".
 *	It is matched without regard to case and must be followed
 *	immediately by a colon.
 * The HDRVALP argument stores a pointer to the beginning of the header
 *	contents, if that header is found in the message.  The pointer
 *	refers to MSG's own header text; it is not a copy.
 * The HDRLEN argument stores the length of the header contents.
 *
 * The value ends at the first newline that is not followed by
 * whitespace, so folded continuation lines are part of it.  If the
 * header is absent, has an empty value, or is not terminated by such
 * a newline, *HDRVALP is set to NULL and *HDRLEN to 0.
 */

void
getheader (struct message *msg, char *hdrname, char **hdrvalp, int *hdrlen)
{
  size_t namelen = strlen (hdrname);
  char *line;

  *hdrvalp = NULL;
  *hdrlen = 0;

  line = msg->header;
  while (line != NULL)
    {
      /* The colon test is part of the match: a line that merely starts
	 with HDRNAME is skipped like any other non-matching line. */
      if (strncasecmp (line, hdrname, namelen) == 0 && line[namelen] == ':')
	{
	  char *val = line + namelen + 1;
	  int len;

	  /* Skip leading whitespace, including a folded line break, but
	     stop at a newline that ends the header so that an empty
	     value does not swallow the following header line. */
	  while (isspace ((unsigned char) *val)
		 && !(*val == '\n' && !isspace ((unsigned char) val[1])))
	    ++val;

	  if (*val != '\n')
	    {
	      for (len = 0; val[len] != '\0'; ++len)
		{
		  if (val[len] == '\n'
		      && !isspace ((unsigned char) val[len + 1]))
		    {
		      *hdrvalp = val;
		      *hdrlen = len;
		      return;
		    }
		}
	    }
	}

      /* Move on to the start of the next line, if there is one. */
      line = strchr (line, '\n');
      if (line != NULL)
	++line;
    }
}

/*
 * next_boundary: find the next MIME multipart boundary in a message.
 *	The return value is a pointer to the end of the boundary text,
 *	or NULL if no boundary can be found in this message.
 *
 * The BODY argument is a NUL-terminated character array containing the
 *	message body.  It is taken to begin at the start of a line.
 * The BOUNDARY argument is a character array containing the boundary
 *	delimiter.  An empty BOUNDARY never matches.
 *
 * An occurrence of BOUNDARY counts as a boundary only if it is
 * preceded by a newline and two dashes, or by two dashes at the very
 * start of BODY.  Occurrences that are not preceded this way (for
 * example the boundary string quoted inside a part) are skipped, and
 * nothing before BODY is ever read.
 *
 * Because the return value points to the end of the boundary, it
 * will point to `\n' if this is an ordinary boundary or `--\n' if
 * it is a final boundary.
 */

char *
next_boundary (char *body, char *boundary)
{
  size_t blen = strlen (boundary);
  char *hit;

  if (blen == 0)
    return NULL;

  /* For efficiency reasons, look for the boundary first and then
     examine the characters around it. */
  for (hit = strstr (body, boundary); hit != NULL;
       hit = strstr (hit + 1, boundary))
    {
      size_t offset = (size_t) (hit - body);

      if (offset >= 3 && memcmp (hit - 3, "\n--", 3) == 0)
	return hit + blen;
      if (offset == 2 && body[0] == '-' && body[1] == '-')
	return hit + blen;
    }
  return NULL;
}

/*
 * add_subpart: attach PART to the end of MSG's list of sub-parts.
 *
 * The argument MSG is the message that is to receive the sub-part.
 * The argument PART is the message (possibly itself multipart) to be
 *	stored in the new list node.
 *
 * Return 1 on success.  If the list node cannot be allocated, report
 * the error with perror, leave MSG's list as it was, and return 0.
 */

int
add_subpart (struct message *msg, struct message *part)
{
  struct list **tail = &msg->parts;
  struct list *node;
  int first = (msg->parts == NULL);

  /* Advance to the link that currently ends the list. */
  while (*tail != NULL)
    tail = &(*tail)->next;

  node = (struct list *) malloc (sizeof (struct list));
  if (node == NULL)
    {
      if (first)
	perror ("unhtml: add_subpart could not malloc attachment list");
      else
	perror ("unhtml: add_subpart could not malloc attachment buffer");
      return 0;
    }

  node->data = part;
  node->next = NULL;
  *tail = node;
  return 1;
}

/*
 * mime_write: write the message MSG, including all of its sub-parts,
 *	to standard output.
 *
 * The header is written first, followed by a blank line.  For a
 * message/rfc822 message that has an attached sub-message, that
 * sub-message is then written and nothing else.  Otherwise the
 * preamble is written, then each sub-part preceded by a boundary
 * line, then the closing boundary line and the epilogue.
 *
 * A NULL MSG, and NULL preamble or epilogue fields, are skipped.
 */
void
mime_write (struct message *msg)
{
  struct list *p;

  if (msg == NULL)
    return;

  if (msg->header)
    {
      fputs (msg->header, stdout);
      putc ('\n', stdout);
    }

  /* message/rfc822 needs special handling.  This test accepts any
     Content-Type that merely starts with "message/rfc822", while
     mime_parse only attaches a sub-message on an exact match, so
     `parts' has to be checked before it is used. */
  if (msg->content_type &&
      !strncasecmp (msg->content_type, "message/rfc822", 14) &&
      msg->parts != NULL)
    {
      mime_write (msg->parts->data);
      return;
    }

  /* XXX: watch out for 8bit data here. */
  if (msg->preamble != NULL)
    fputs (msg->preamble, stdout);

  if (msg->parts != NULL)
    {
      for (p = msg->parts; p != NULL; p = p->next)
	{
	  printf ("\n--%s\n", msg->boundary);
	  mime_write (p->data);
	}
      printf ("\n--%s--\n", msg->boundary);
      if (msg->epilogue != NULL)
	fputs (msg->epilogue, stdout);
    }
}


/*
 * mime_warn: print the warning text S on standard error, prefixed
 *	with "MIME parser: warning: " and followed by a newline.
 */
void
mime_warn (char *s)
{
  FILE *out = stderr;

  fprintf (out, "MIME parser: warning: %s\n", s);
}

/*
 * mime_fatal: print the error text S on standard error, prefixed
 *	with "MIME parser: fatal: " and followed by a newline.
 *	The function only reports; it returns normally to its caller.
 */
void
mime_fatal (char *s)
{
  FILE *out = stderr;

  fprintf (out, "MIME parser: fatal: %s\n", s);
}

/*
 * mime_destroy: release MSG and everything it owns: its text fields,
 *	every node of its sub-part list, each sub-part (recursively),
 *	and finally the message structure itself.
 *
 * MSG must not be used after this call.  A NULL MSG is ignored.
 */
void
mime_destroy (struct message *msg)
{
  struct list *node, *next;

  if (msg == NULL)
    return;

  free (msg->header);
  free (msg->preamble);
  free (msg->epilogue);
  free (msg->content_type);
  free (msg->boundary);

  for (node = msg->parts; node != NULL; node = next)
    {
      /* Save the link before the node is released. */
      next = node->next;
      mime_destroy (node->data);
      free (node);
    }

  /* Nothing else ever frees the structure, so do it here. */
  free (msg);
}

/*
 * print_msg_info: print a summary of MSG on standard output: its raw
 *	header, its Content-Type and its boundary, followed by the same
 *	summary for each sub-part.
 *
 * The INDENT argument is the number of spaces placed before each
 * label line; the raw header text itself is printed unindented.
 * Sub-parts are printed with four more spaces than their parent.
 * A negative INDENT is treated as 0.  A NULL MSG prints nothing, and
 * NULL fields are shown as "(null)".
 */
void
print_msg_info (struct message *msg, int indent)
{
  struct list *p;

  if (msg == NULL)
    return;
  if (indent < 0)
    indent = 0;

  /* "%*s" with an empty string prints exactly INDENT spaces, so no
     fixed-size buffer is needed and INDENT keeps its value for the
     recursive calls below. */
  printf ("%*sHeader:\n", indent, "");
  printf ("%*s--BEGIN--\n", indent, "");
  printf ("%s\n", msg->header ? msg->header : "(null)");
  printf ("%*s--END--\n", indent, "");
  printf ("%*sContent-Type: %s\n", indent, "",
	  msg->content_type ? msg->content_type : "(null)");
  printf ("%*sBoundary: %s\n", indent, "",
	  msg->boundary ? msg->boundary : "(null)");
  printf ("%*s----------------------------------------\n", indent, "");

  for (p = msg->parts; p != NULL; p = p->next)
    print_msg_info (p->data, indent + 4);
}

/*
 * hex2dec_char: return the value (0 to 15) of the hexadecimal digit
 *	CH, which may be in either case, or -1 if CH is not a
 *	hexadecimal digit.
 */
int
hex2dec_char (char ch)
{
  /* The <ctype.h> functions require a value representable as
     unsigned char, so convert before testing. */
  unsigned char c = (unsigned char) ch;

  if (isdigit (c))
    return c - '0';
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  return -1;
}

/*
 * mime_decode_qp: convert the preamble of MSG from a quoted-printable
 *	encoding to raw text.
 */
void
mime_decode_qp (msg)
     struct message *msg;
{
  unsigned char *src, *dst;

  dst = src = msg->preamble;
  while (*src != '\0')
    {
      if (*src == '=')
	{
	  if (*++src == '\n')
	    {
	      ++src;
	      continue;
	    }
	  else
	    {
	      int hi, lo;
	      hi = hex2dec_char(*src++);
	      lo = hex2dec_char(*src);
	      *dst = hi*16 + lo;
	      if (*dst > 0x7f)
		msg->is8bit = 1;
	    }
	}
      else
	*dst = *src;
      ++dst, ++src;
    }
  *dst = '\0';
}

/*
 * mime_output_qp: unimplemented placeholder.  The body is empty, so
 *	the TEXT argument is ignored and calling this has no effect.
 *	NOTE(review): the name suggests it is meant to write TEXT in
 *	quoted-printable encoding -- confirm the intended contract
 *	before implementing.
 */
void
mime_output_qp (text)
     char *text;
{
  /* XXX: write this. */
}

/*
 * The char64 macro and `mime_decode_b64' routine are taken from
 * metamail 2.7, which is copyright (c) 1991 Bell Communications
 * Research, Inc. (Bellcore).  The following license applies to all
 * code below this point:
 *
 * Permission to use, copy, modify, and distribute this material 
 * for any purpose and without fee is hereby granted, provided 
 * that the above copyright notice and this permission notice 
 * appear in all copies, and that the name of Bellcore not be 
 * used in advertising or publicity pertaining to this 
 * material without the specific, prior written permission 
 * of an authorized representative of Bellcore.  BELLCORE 
 * MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY 
 * OF THIS MATERIAL FOR ANY PURPOSE.  IT IS PROVIDED "AS IS", 
 * WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
 */

/*
 * index_64: maps an ASCII code (0 to 127) to the 6-bit value of that
 * character in the base64 alphabet, or to -1 if the character is not
 * in the alphabet.  The element type is explicitly signed so that the
 * -1 entries compare as negative on every platform.
 */
static const signed char index_64[128] = {
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
    52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
    -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
    15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
    -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
    41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
};

/* Value of character C in the base64 alphabet, or -1 if it has none.
   C is evaluated more than once, so pass a plain variable. */
#define char64(c)  (((c) < 0 || (c) > 127) ? -1 : index_64[(c)])

/*
 * b64_next: return the next significant character of the base64 text
 *	at *SRCP -- a character of the base64 alphabet or the padding
 *	character `=' -- and advance *SRCP past it.  Whitespace and any
 *	other character outside the alphabet is skipped.
 *
 * At the end of the text the return value is '\0' and *SRCP is left
 * pointing at the terminator, so further calls are safe and keep
 * returning '\0'.
 */
static int
b64_next (char **srcp)
{
  char *s = *srcp;
  int c;

  while ((c = (unsigned char) *s) != '\0')
    {
      ++s;
      if (c == '=' || char64 (c) >= 0)
	break;
    }
  *srcp = s;
  return c;
}

/*
 * mime_decode_b64: decode the NUL-terminated base64 text at SRC in
 *	place, leaving the decoded bytes followed by a terminating NUL.
 *
 * Decoding stops at the first padding character or at the end of the
 * text.  If the text ends in the middle of a four-character group, a
 * warning is printed on standard error and the bytes decoded up to
 * that point are kept.
 *
 * The result is handled as a C string, so a decoded NUL byte will
 * appear to end it early.
 */
void
mime_decode_b64 (char *src)
{
  char *dst = src;
  int c1, c2, c3, c4;

  /* Each group consumes four input characters and produces at most
     three output bytes, so DST never overtakes SRC. */
  while ((c1 = b64_next (&src)) != '\0')
    {
      c2 = b64_next (&src);
      c3 = b64_next (&src);
      c4 = b64_next (&src);
      if (c2 == '\0' || c3 == '\0' || c4 == '\0')
	{
	  fprintf (stderr, "Warning: base64 decoder saw premature EOF!\n");
	  break;
	}
      if (c1 == '=' || c2 == '=')
	break;

      c1 = char64 (c1);
      c2 = char64 (c2);
      *dst++ = (char) (unsigned char) ((c1 << 2) | ((c2 & 0x30) >> 4));
      if (c3 == '=')
	break;

      c3 = char64 (c3);
      *dst++ = (char) (unsigned char) (((c2 & 0x0F) << 4) | ((c3 & 0x3C) >> 2));
      if (c4 == '=')
	break;

      c4 = char64 (c4);
      *dst++ = (char) (unsigned char) (((c3 & 0x03) << 6) | c4);
    }
  *dst = '\0';
}

