denyhosts/clamav/libclamav/mbox.c

/*
 *  Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *  Copyright (C) 2007-2013 Sourcefire, Inc.
 *
 *  Authors: Nigel Horne
 *
 *  Acknowledgements: Some ideas came from Stephen White <stephen@earth.li>,
 *                    Michael Dankov <misha@btrc.ru>, Gianluigi Tiesi <sherpya@netfarm.it>,
 *                    Everton da Silva Marques, Thomas Lamy <Thomas.Lamy@in-online.net>,
 *                    James Stevens <James@kyzo.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 */
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif

#ifdef CL_THREAD_SAFE
#ifndef _REENTRANT
#define _REENTRANT /* for Solaris 2.8 */
#endif
#endif

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include <string.h>
#include <stdbool.h>
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <ctype.h>
#include <time.h>
#include <fcntl.h>
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <dirent.h>
#include <limits.h>
#include <signal.h>

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#ifdef CL_THREAD_SAFE
#include <pthread.h>
#endif

#if defined(_WIN32) || defined(_WIN64)
#define strtok_r strtok_s
#endif

#include "clamav.h"
#include "others.h"
#include "str.h"
#include "filetypes.h"
#include "mbox.h"
#include "dconf.h"
#include "fmap.h"
#include "json_api.h"
#include "msxml_parser.h"

#if HAVE_LIBXML2
#include <libxml/xmlversion.h>
#include <libxml/HTMLtree.h>
#include <libxml/HTMLparser.h>
#include <libxml/xmlreader.h>
#endif

#define DCONF_PHISHING mctx->ctx->dconf->phishing

#ifdef CL_DEBUG

#if defined(C_LINUX)
#include <features.h>
#endif

#if __GLIBC__ == 2 && __GLIBC_MINOR__ >= 1 && !defined(__UCLIBC__) || defined(__UCLIBC_HAS_BACKTRACE__)
#define HAVE_BACKTRACE
#endif
#endif

#ifdef HAVE_BACKTRACE
#include <execinfo.h>

#ifdef USE_SYSLOG
#include <syslog.h>
#endif

static void sigsegv(int sig);
static void print_trace(int use_syslog);

/*#define    SAVE_TMP */ /* Save the file being worked on in tmp */
#endif

#if defined(NO_STRTOK_R) || !defined(CL_THREAD_SAFE)
#undef strtok_r
#undef __strtok_r
#define strtok_r(a, b, c) strtok(a, b)
#endif

/*
 * Outcome of parsing one message body (returned by parseEmailBody()).
 * cli_parse_mbox() translates these into CL_* return codes.
 */
typedef enum {
    FAIL, /* body could not be decoded: CL_EFORMAT for the final message, skipped for earlier mbox messages */
    OK, /* leaves the caller's return code unchanged */
    OK_ATTACHMENTS_NOT_SAVED, /* handled exactly like OK by cli_parse_mbox() */
    VIRUS, /* reported as CL_VIRUS */
    MAXREC, /* reported as CL_EMAXREC */
    MAXFILES /* reported as CL_EMAXFILES */
} mbox_status;

#ifndef isblank
#define isblank(c) (((c) == ' ') || ((c) == '\t'))
#endif

#define SAVE_TO_DISC /* multipart/message are saved in a temporary file */

#include "htmlnorm.h"

#include "phishcheck.h"

#ifndef _WIN32
#include <sys/time.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
#if !defined(C_BEOS) && !defined(C_INTERIX)
#include <net/if.h>
#include <arpa/inet.h>
#endif
#endif

#include <fcntl.h>

/*
 * Use CL_SCAN_MAIL_PARTIAL_MESSAGE to handle messages covered by section 7.3.2 of RFC1341.
 *    This is experimental code so it is up to YOU to (1) ensure it's secure
 * (2) periodically trim the directory of old files
 *
 * If you use the load balancing feature of clamav-milter to run clamd on
 * more than one machine you must make sure that .../partial is on a shared
 * network filesystem
 */

/*
 * Slows things down a lot and only catches unencoded copies
 * of EICAR within bounces, which don't matter
 */
//#define    SCAN_UNENCODED_BOUNCES

/*
 * State shared by the body-parsing helpers for one call of cli_parse_mbox(),
 * which fills in every field before the first body is parsed.
 */
typedef struct mbox_ctx {
    const char *dir; /* the dir argument given to cli_mbox() */
    const table_t *rfc821Table; /* set from the file-level rfc821 table */
    const table_t *subtypeTable; /* set from the file-level subtype table */
    cli_ctx *ctx; /* scan context of the data being parsed */
    unsigned int files; /* number of files extracted */
#if HAVE_JSON
    json_object *wrkobj; /* initialised from ctx->wrkproperty */
#endif
} mbox_ctx;

/* if supported by the system, use the optimized
 * version of getc, that doesn't do locking,
 * and is possibly implemented entirely as a macro */
#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L
#define GETC(fp) getc_unlocked(fp)
#define LOCKFILE(fp) flockfile(fp)
#define UNLOCKFILE(fp) funlockfile(fp)
#else
#define GETC(fp) getc(fp)
#define LOCKFILE(fp)
#define UNLOCKFILE(fp)
#endif

static int cli_parse_mbox(const char *dir, cli_ctx *ctx);
static message *parseEmailFile(fmap_t *map, size_t *at, const table_t *rfc821Table, const char *firstLine, const char *dir, cli_ctx *ctx, bool *heuristicFound);
static message *parseEmailHeaders(message *m, const table_t *rfc821Table, bool *heuristicFound);
static int parseEmailHeader(message *m, const char *line, const table_t *rfc821, cli_ctx *ctx, bool *heuristicFound);
static cl_error_t parseMHTMLComment(const char *comment, cli_ctx *ctx, void *wrkjobj, void *cbdata);
static mbox_status parseRootMHTML(mbox_ctx *mctx, message *m, text *t);
static mbox_status parseEmailBody(message *messageIn, text *textIn, mbox_ctx *mctx, unsigned int recursion_level);
static int boundaryStart(const char *line, const char *boundary);
static int boundaryEnd(const char *line, const char *boundary);
static int initialiseTables(table_t **rfc821Table, table_t **subtypeTable);
static int getTextPart(message *const messages[], size_t size);
static size_t strip(char *buf, int len);
static int parseMimeHeader(message *m, const char *cmd, const table_t *rfc821Table, const char *arg, cli_ctx *ctx, bool *heuristicFound);
static int saveTextPart(mbox_ctx *mctx, message *m, int destroy_text);
static char *rfc2047(const char *in);
static char *rfc822comments(const char *in, char *out);
static int rfc1341(mbox_ctx *mctx, message *m);
static bool usefulHeader(int commandNumber, const char *cmd);
static char *getline_from_mbox(char *buffer, size_t len, fmap_t *map, size_t *at);
static bool isBounceStart(mbox_ctx *mctx, const char *line);
static bool exportBinhexMessage(mbox_ctx *mctx, message *m);
static int exportBounceMessage(mbox_ctx *ctx, text *start);
static const char *getMimeTypeStr(mime_type mimetype);
static const char *getEncTypeStr(encoding_type enctype);
static message *do_multipart(message *mainMessage, message **messages, int i, mbox_status *rc, mbox_ctx *mctx, message *messageIn, text **tptr, unsigned int recursion_level);
static int count_quotes(const char *buf);
static bool next_is_folded_header(const text *t);
static bool newline_in_header(const char *line);

static blob *getHrefs(message *m, tag_arguments_t *hrefs);
static void hrefs_done(blob *b, tag_arguments_t *hrefs);
static void checkURLs(message *m, mbox_ctx *mctx, mbox_status *rc, int is_html);

static bool haveTooManyMIMEPartsPerMessage(size_t mimePartCnt, cli_ctx *ctx, mbox_status *rc);
static bool hitLineFoldCnt(const char *const line, size_t *lineFoldCnt, cli_ctx *ctx, bool *heuristicFound);
static bool haveTooManyHeaderBytes(size_t totalLen, cli_ctx *ctx, bool *heuristicFound);
static bool haveTooManyEmailHeaders(size_t totalHeaderCnt, cli_ctx *ctx, bool *heuristicFound);
static bool haveTooManyMIMEArguments(size_t argCnt, cli_ctx *ctx, bool *heuristicFound);

/* Maximum line length according to RFC2821 */
#define RFC2821LENGTH 1000

/* Hashcodes for our hash tables */
#define CONTENT_TYPE 1
#define CONTENT_TRANSFER_ENCODING 2
#define CONTENT_DISPOSITION 3

/* Mime sub types */
#define PLAIN 1
#define ENRICHED 2
#define HTML 3
#define RICHTEXT 4
#define MIXED 5
#define ALTERNATIVE 6 /* RFC1521*/
#define DIGEST 7
#define SIGNED 8
#define PARALLEL 9
#define RELATED 10      /* RFC2387 */
#define REPORT 11       /* RFC1892 */
#define APPLEDOUBLE 12  /* Handling of this is only noddy for now */
#define FAX MIXED       /*                                              \
                         * RFC3458                                      \
                         * Drafts stated to treat it as mixed if it is  \
                         * not known.  This disappeared in the final    \
                         * version (except when talking about           \
                         * voice-message), but it is good enough for us \
                         * since we do no validation of coversheet      \
                         * presence etc. (which also has disappeared    \
                         * in the final version)                        \
                         */
#define ENCRYPTED 13    /*                                        \
                         * e.g. RFC2015                           \
                         * Content-Type: multipart/encrypted;     \
                         * boundary="nextPart1383049.XCRrrar2yq"; \
                         * protocol="application/pgp-encrypted"   \
                         */
#define X_BFILE RELATED /*                                             \
                         * BeOS, expect two parts: the file and its    \
                         * attributes. The attributes part comes as    \
                         *    Content-Type: application/x-be_attribute \
                         *        name="foo"                           \
                         * I can't find where it is defined, any       \
                         * pointers would be appreciated. For now      \
                         * we treat it as multipart/related            \
                         */
#define KNOWBOT 14      /* Unknown and undocumented format? */

/*
 * Thresholds for the Heuristics.Limits.Exceeded.Email* checks below.
 * The line-fold, MIME-part and MIME-argument limits trip when the count
 * reaches the value (>=); the header-byte and header-count limits trip
 * only when the value is exceeded (>).
 */
#define HEURISTIC_EMAIL_MAX_LINE_FOLDS_PER_HEADER (256 * 1024)
#define HEURISTIC_EMAIL_MAX_HEADER_BYTES (1024 * 256)
#define HEURISTIC_EMAIL_MAX_HEADERS 1024
#define HEURISTIC_EMAIL_MAX_MIME_PARTS_PER_MESSAGE 1024
#define HEURISTIC_EMAIL_MAX_ARGUMENTS_PER_HEADER 256

static const struct tableinit {
    const char *key;
    int value;
} rfc821headers[] = {
    /* TODO: make these regular expressions */
    {"Content-Type", CONTENT_TYPE},
    {"Content-Transfer-Encoding", CONTENT_TRANSFER_ENCODING},
    {"Content-Disposition", CONTENT_DISPOSITION},
    {NULL, 0}},
  mimeSubtypes[] = {/* see RFC2045 */
                    /* subtypes of Text */
                    {"plain", PLAIN},
                    {"enriched", ENRICHED},
                    {"html", HTML},
                    {"richtext", RICHTEXT},
                    /* subtypes of Multipart */
                    {"mixed", MIXED},
                    {"alternative", ALTERNATIVE},
                    {"digest", DIGEST},
                    {"signed", SIGNED},
                    {"parallel", PARALLEL},
                    {"related", RELATED},
                    {"report", REPORT},
                    {"appledouble", APPLEDOUBLE},
                    {"fax-message", FAX},
                    {"encrypted", ENCRYPTED},
                    {"x-bfile", X_BFILE},          /* BeOS */
                    {"knowbot", KNOWBOT},          /* ??? */
                    {"knowbot-metadata", KNOWBOT}, /* ??? */
                    {"knowbot-code", KNOWBOT},     /* ??? */
                    {"knowbot-state", KNOWBOT},    /* ??? */
                    {NULL, 0}},
  mimeTypeStr[] = {{"NOMIME", NOMIME}, {"APPLICATION", APPLICATION}, {"AUDIO", AUDIO}, {"IMAGE", IMAGE}, {"MESSAGE", MESSAGE}, {"MULTIPART", MULTIPART}, {"TEXT", TEXT}, {"VIDEO", VIDEO}, {"MEXTENSION", MEXTENSION}, {NULL, 0}}, encTypeStr[] = {{"NOENCODING", NOENCODING}, {"QUOTEDPRINTABLE", QUOTEDPRINTABLE}, {"BASE64", BASE64}, {"EIGHTBIT", EIGHTBIT}, {"BINARY", BINARY}, {"UUENCODE", UUENCODE}, {"YENCODE", YENCODE}, {"EEXTENSION", EEXTENSION}, {"BINHEX", BINHEX}, {NULL, 0}};

#ifdef CL_THREAD_SAFE
static pthread_mutex_t tables_mutex = PTHREAD_MUTEX_INITIALIZER;
#endif
static table_t *rfc821  = NULL;
static table_t *subtype = NULL;

/*
 * Entry point of the mail parser: rejects a NULL dir with CL_ENULLARG,
 * otherwise hands the work to cli_parse_mbox() and returns its result.
 */
int cli_mbox(const char *dir, cli_ctx *ctx)
{
    if (NULL != dir) {
        return cli_parse_mbox(dir, ctx);
    }

    cli_dbgmsg("cli_mbox called with NULL dir\n");
    return CL_ENULLARG;
}

/*
 * Parse the mail data held in ctx->fmap.
 *
 * When the first line starts with "From " the data is treated as a UNIX style
 * mbox that may hold several messages; otherwise it is parsed as one message.
 *
 * dir: stored in the mbox_ctx and passed to uudecodeFile()/parseEmailFile()
 * ctx: scan context; ctx->fmap supplies the data to read
 *
 * Returns:
 *    CL_CLEAN      if fmap_gets() yields no first line
 *    CL_EMEM       if the tables or the first message cannot be created, or the
 *                  final message is flagged isTruncated
 *    CL_VIRUS      if a header heuristic fired or parseEmailBody() returned VIRUS
 *    CL_EFORMAT, CL_EMAXREC, CL_EMAXFILES
 *                  if parseEmailBody() returned FAIL, MAXREC or MAXFILES for
 *                  the final message
 *    CL_SUCCESS    otherwise
 *
 * TODO: when signal handling is added, need to remove temp files when a
 *    signal is received
 * TODO: add option to scan in memory not via temp files, perhaps with a
 * named pipe or memory mapped file, though this won't work on big e-mails
 * containing many levels of encapsulated messages - it'd just take too much
 * RAM
 * TODO: parse .msg format files
 * TODO: fully handle AppleDouble format, see
 *    http://www.lazerware.com/formats/Specs/AppleSingle_AppleDouble.pdf
 * TODO: ensure parseEmailHeaders is always called before parseEmailBody
 * TODO: create parseEmail which calls parseEmailHeaders then parseEmailBody
 * TODO: Handle unexpected NUL bytes in header lines which stop strcmp()s:
 *    e.g. \0Content-Type: application/binary;
 */
static int
cli_parse_mbox(const char *dir, cli_ctx *ctx)
{
    int retcode;
    message *body;
    char buffer[RFC2821LENGTH + 1];
    mbox_ctx mctx;
    size_t at   = 0;
    fmap_t *map = ctx->fmap;

    cli_dbgmsg("in mbox()\n");

    /* Read the first line; it decides between mbox and single-message mode */
    if (!fmap_gets(map, buffer, &at, sizeof(buffer) - 1)) {
        /* empty message */
        return CL_CLEAN;
    }

    /* The lookup tables are file-level statics, so set them up under the mutex */
#ifdef CL_THREAD_SAFE
    pthread_mutex_lock(&tables_mutex);
#endif

    if (initialiseTables(&rfc821, &subtype) < 0) {
#ifdef CL_THREAD_SAFE
        pthread_mutex_unlock(&tables_mutex);
#endif
        return CL_EMEM;
    }

#ifdef CL_THREAD_SAFE
    pthread_mutex_unlock(&tables_mutex);
#endif

    retcode = CL_SUCCESS;
    body    = NULL;

    mctx.dir          = dir;
    mctx.rfc821Table  = rfc821;
    mctx.subtypeTable = subtype;
    mctx.ctx          = ctx;
    mctx.files        = 0;
#if HAVE_JSON
    mctx.wrkobj = ctx->wrkproperty;
#endif

    /*
     * Is it a UNIX style mbox with more than one
     * mail message, or just a single mail message?
     *
     * TODO: It would be better if we called cli_magic_scan_dir here rather than
     * in cli_scanmail. Then we could improve the way mailboxes with more
     * than one message is handled, e.g. giving a better indication of
     * which message within the mailbox is infected
     */
    /*if((strncmp(buffer, "From ", 5) == 0) && isalnum(buffer[5])) {*/
    if (strncmp(buffer, "From ", 5) == 0) {
        /*
         * Have been asked to check a UNIX style mbox file, which
         * may contain more than one e-mail message to decode
         *
         * It would be far better for scanners.c to do this splitting
         * and do this
         *    FOR EACH mail in the mailbox
         *    DO
         *        pass this mail to cli_mbox --
         *        scan this file
         *        IF this file has a virus quit
         *        THEN
         *            return CL_VIRUS
         *        FI
         *    END
         * This would remove a problem with this code that it can
         * fill up the tmp directory before it starts scanning
         */
        bool lastLineWasEmpty;
        int messagenumber;
        message *m = messageCreate(); /*Create an empty email */

        if (m == NULL) {
            return CL_EMEM;
        }

        lastLineWasEmpty = false;
        messagenumber    = 1;
        messageSetCTX(m, ctx);

        /*
         * m accumulates the raw lines of the current message. A "From " line
         * that follows an empty line ends that message, which is then parsed
         * before the accumulation starts again.
         */
        do {
            cli_chomp(buffer);
            /*if(lastLineWasEmpty && (strncmp(buffer, "From ", 5) == 0) && isalnum(buffer[5])) */
            if (lastLineWasEmpty && (strncmp(buffer, "From ", 5) == 0)) {
                cli_dbgmsg("Deal with message number %d\n", messagenumber++);
                /*
                 * End of a message in the mail box
                 */
                bool heuristicFound = false;
                body                = parseEmailHeaders(m, rfc821, &heuristicFound);
                if (body == NULL) {
                    /* No parsed message came back: recycle m for the next one */
                    messageReset(m);
                    messageSetCTX(m, ctx);
                    if (heuristicFound) {
                        retcode = CL_VIRUS;
                        break;
                    }
                    /* continue goes straight to the loop condition, i.e. the next line is read */
                    continue;
                }
                messageSetCTX(body, ctx);
                /* m is destroyed here; body is reused as the accumulator further down */
                messageDestroy(m);
                if (messageGetBody(body)) {
                    mbox_status rc = parseEmailBody(body, NULL, &mctx, 0);
                    if (rc == FAIL) {
                        /* A message that cannot be decoded is skipped, not reported */
                        m = body;
                        messageReset(m);
                        messageSetCTX(m, ctx);
                        continue;
                    } else if (rc == VIRUS) {
                        cli_dbgmsg("Message number %d is infected\n",
                                   messagenumber - 1);
                        retcode = CL_VIRUS;
                        /* m was destroyed above; body is released at the end of the function */
                        m       = NULL;
                        break;
                    }
                }
                /*
                 * Starting a new message, throw away all the
                 * information about the old one. It would
                 * be best to be able to scan this message
                 * now, but cli_magic_scan_file needs arguments
                 * that haven't been passed here so it can't be
                 * called
                 */
                m = body;
                messageReset(m);
                messageSetCTX(m, ctx);

                cli_dbgmsg("Finished processing message\n");
            } else {
                lastLineWasEmpty = (bool)(buffer[0] == '\0');
            }

            if (isuuencodebegin(buffer)) {
                /*
                 * Fast track visa to uudecode.
                 * TODO: binhex, yenc
                 */
                if (uudecodeFile(m, buffer, dir, map, &at) < 0) {
                    /* uudecodeFile() reported failure: keep the line as ordinary text */
                    if (messageAddStr(m, buffer) < 0) {
                        break;
                    }
                }
            } else {
                /* at this point, the \n has been removed */
                if (messageAddStr(m, buffer) < 0) {
                    break;
                }
            }
        } while (fmap_gets(map, buffer, &at, sizeof(buffer) - 1));

        /*
         * The lines gathered since the last separator form the final message.
         * Only its headers are parsed here; its body is handled by the common
         * code after this if/else.
         */
        if (retcode == CL_SUCCESS) {
            cli_dbgmsg("Extract attachments from email %d\n", messagenumber);
            bool heuristicFound = false;
            body                = parseEmailHeaders(m, rfc821, &heuristicFound);
            if (heuristicFound) {
                retcode = CL_VIRUS;
            }
        }
        if (m) {
            messageDestroy(m);
        }
    } else {
        /*
         * It's a single message, parse the headers then the body
         */
        if (strncmp(buffer, "P I ", 4) == 0)
            /*
             * CommuniGate Pro format: ignore headers until
             * blank line
             */
            while (fmap_gets(map, buffer, &at, sizeof(buffer) - 1) &&
                   (strchr("\r\n", buffer[0]) == NULL)) {
                ;
            }
        /* getline_from_mbox could be using unlocked_stdio(3),
         * so lock file here
         * NOTE(review): no locking call follows in this function, so this
         * remark looks stale - confirm */
        /*
         * Ignore any blank lines at the top of the message
         */
        while (strchr("\r\n", buffer[0]) &&
               (getline_from_mbox(buffer, sizeof(buffer) - 1, map, &at) != NULL)) {
            ;
        }

        /* Force termination of the line before it is handed on as firstLine */
        buffer[sizeof(buffer) - 1] = '\0';

        bool heuristicFound = false;
        body                = parseEmailFile(map, &at, rfc821, buffer, dir, ctx, &heuristicFound);
        if (heuristicFound) {
            retcode = CL_VIRUS;
        }
    }

    if (body) {
        /*
         * Write out the last entry in the mailbox
         */
        if ((retcode == CL_SUCCESS) && messageGetBody(body)) {
            messageSetCTX(body, ctx);
            switch (parseEmailBody(body, NULL, &mctx, 0)) {
                case OK:
                case OK_ATTACHMENTS_NOT_SAVED:
                    break;
                case FAIL:
                    /*
                     * beware: cli_magic_scan_desc(),
                     * changes this into CL_CLEAN, so only
                     * use it to inform the higher levels
                     * that we couldn't decode it because
                     * it isn't an mbox, not to signal
                     * decoding errors on what *is* a valid
                     * mbox
                     */
                    retcode = CL_EFORMAT;
                    break;
                case MAXREC:
                    retcode = CL_EMAXREC;
                    /*
                     * Doing this now because it's actually tracking email recursion,
                     * not fmap recursion, but it still is aborting with stuff not scanned.
                     * Also, we didn't have access to the ctx when this happened earlier.
                     */
                    cli_append_potentially_unwanted_if_heur_exceedsmax(ctx, "Heuristics.Limits.Exceeded.MaxRecursion");
                    break;
                case MAXFILES:
                    retcode = CL_EMAXFILES;
                    /*
                     * Doing this now because it's actually tracking email parts,
                     * not actual files, but it still is aborting with stuff not scanned.
                     * Also, we didn't have access to the ctx when this happened earlier.
                     */
                    cli_append_potentially_unwanted_if_heur_exceedsmax(ctx, "Heuristics.Limits.Exceeded.MaxFiles");
                    break;
                case VIRUS:
                    retcode = CL_VIRUS;
                    break;
            }
        }

        /* A truncated message is reported as CL_EMEM unless another result is already set */
        if (body->isTruncated && retcode == CL_SUCCESS) {
            retcode = CL_EMEM;
        }
        /*
         * Tidy up and quit
         */
        messageDestroy(body);
    }

    cli_dbgmsg("cli_mbox returning %d\n", retcode);

    return retcode;
}

/* Number of payload bytes per ReadStruct node; the array has one extra byte */
#define READ_STRUCT_BUFFER_LEN 1024
/*
 * Node of a singly linked list of text chunks. appendReadStruct() fills and
 * extends the list, getMallocedBufferFromList() joins it into one string and
 * freeList() releases it.
 */
typedef struct _ReadStruct {
    char buffer[READ_STRUCT_BUFFER_LEN + 1]; /* chunk data */

    size_t bufferLen; /* number of bytes of buffer currently in use */

    struct _ReadStruct *next; /* following node; a null pointer ends the list (freeList() stops there) */

} ReadStruct;

/*
 * Append the NUL-terminated string in buffer to the chunk list whose current
 * tail is rs.
 *
 * The tail node is filled first; whatever does not fit is placed in newly
 * allocated nodes linked after it. Any number of nodes may be added, so a
 * string longer than one node cannot overflow a node's buffer.
 *
 * rs:     current tail of the list
 * buffer: text to append
 *
 * Returns the new tail of the list. rs is returned unchanged if either
 * argument is NULL. If an allocation fails, the last node that was filled is
 * returned and the rest of buffer is dropped.
 */
static ReadStruct *
appendReadStruct(ReadStruct *rs, const char *const buffer)
{
    const char *src  = buffer;
    size_t remaining = 0;

    if (NULL == rs || NULL == buffer) {
        cli_dbgmsg("appendReadStruct: Invalid argument\n");
        goto done;
    }

    remaining = strlen(src);

    for (;;) {
        ReadStruct *next = NULL;
        size_t spaceLeft = (READ_STRUCT_BUFFER_LEN - rs->bufferLen);
        size_t chunk     = (remaining < spaceLeft) ? remaining : spaceLeft;

        memcpy(&(rs->buffer[rs->bufferLen]), src, chunk);
        rs->bufferLen += chunk;
        /* bufferLen never exceeds READ_STRUCT_BUFFER_LEN, so this stays inside the array */
        rs->buffer[rs->bufferLen] = '\0';

        src += chunk;
        remaining -= chunk;
        if (0 == remaining) {
            break;
        }

        /* CLI_CALLOC jumps to done when the allocation fails */
        CLI_CALLOC(next, 1, sizeof(ReadStruct));

        rs->next = next;
        rs       = next;
    }

done:
    return rs;
}

/*
 * Join every chunk of the list starting at head into one newly allocated,
 * NUL-terminated string.
 *
 * head: first node of the list; a NULL head gives an empty string
 *
 * Returns the allocated string, which the caller must free, or NULL if the
 * allocation fails.
 */
static char *
getMallocedBufferFromList(const ReadStruct *head)
{
    const ReadStruct *rs = NULL;
    size_t totalLen      = 1; /* room for the terminating NUL */
    size_t offset        = 0;
    char *working        = NULL;
    char *ret            = NULL;

    for (rs = head; rs; rs = rs->next) {
        totalLen += rs->bufferLen;
    }

    /* MALLOC jumps to done when the allocation fails */
    MALLOC(working, totalLen);

    for (rs = head; rs; rs = rs->next) {
        memcpy(&(working[offset]), rs->buffer, rs->bufferLen);
        offset += rs->bufferLen;
    }
    /* Terminate unconditionally so that an empty list still yields a valid string */
    working[offset] = '\0';

    ret = working;
done:
    if (NULL == ret) {
        FREE(working);
    }

    return ret;
}

/*
 * Release every node of the list that starts at head.
 * A NULL head is accepted and does nothing.
 */
static void
freeList(ReadStruct *head)
{
    ReadStruct *following = NULL;

    for (; head; head = following) {
        /* Remember the successor before the node itself is released */
        following = head->next;
        FREE(head);
    }
}

/*
 * Reset a chunk list for reuse: when curr is not head, free every node after
 * head; then mark head as empty and make curr point at head again.
 * Both arguments must be modifiable ReadStruct pointers; each one is
 * evaluated more than once.
 */
#ifndef FREELIST_REALLOC
#define FREELIST_REALLOC(head, curr)  \
    do {                              \
        if ((curr) != (head)) {       \
            freeList((head)->next);   \
        }                             \
        (head)->bufferLen = 0;        \
        (head)->next      = NULL;     \
        (curr)            = (head);   \
    } while (0)
#endif /*FREELIST_REALLOC*/

/*
 * Detect repeated lines that hold nothing but blanks and semicolons.
 *
 * A semicolon delimits parameters, but a line with no data carries no
 * parameter. The first such line is allowed because it may continue a
 * previous line that did have data in it; every directly following one is
 * to be skipped.
 *
 * line:            line to examine; NULL is ignored and leaves the state alone
 * lastWasOnlySemi: state kept between calls: set to true after a line of only
 *                  blanks/semicolons, set to false after any other line
 *
 * Returns true if this line and the previous one both held only blanks and
 * semicolons, false otherwise.
 */
static bool
doContinueMultipleEmptyOptions(const char *const line, bool *lastWasOnlySemi)
{
    const char *p = NULL;

    if (NULL == line) {
        return false;
    }

    /* Single pass over the string; the cast keeps isblank() defined for bytes >= 0x80 */
    for (p = line; '\0' != *p; p++) {
        if ((';' != *p) && !isblank((unsigned char)*p)) {
            *lastWasOnlySemi = false;
            return false;
        }
    }

    if (*lastWasOnlySemi) {
        return true;
    }

    *lastWasOnlySemi = true;
    return false;
}

/*
 * Track how many consecutive lines start with a blank (folded header lines).
 *
 * line:           current line; NULL is ignored
 * lineFoldCnt:    running count, incremented for a line starting with a blank
 *                 and reset to zero by any other line
 * ctx:            scan context, used to report the heuristic
 * heuristicFound: set to true when the alert is raised
 *
 * Returns true once the count reaches HEURISTIC_EMAIL_MAX_LINE_FOLDS_PER_HEADER.
 */
static bool
hitLineFoldCnt(const char *const line, size_t *lineFoldCnt, cli_ctx *ctx, bool *heuristicFound)
{
    if (NULL == line) {
        return false;
    }

    *lineFoldCnt = isblank(line[0]) ? (*lineFoldCnt) + 1 : 0;

    if ((*lineFoldCnt) < HEURISTIC_EMAIL_MAX_LINE_FOLDS_PER_HEADER) {
        return false;
    }

    /* The limit is reported as hit even when the alert itself is disabled */
    if (SCAN_HEURISTIC_EXCEEDS_MAX) {
        cli_append_potentially_unwanted(ctx, "Heuristics.Limits.Exceeded.EmailLineFoldCnt");
        *heuristicFound = true;
    }

    return true;
}

/*
 * Check the accumulated header size against HEURISTIC_EMAIL_MAX_HEADER_BYTES.
 *
 * Returns true if totalLen is above the limit. The alert is raised, and
 * *heuristicFound set, only when SCAN_HEURISTIC_EXCEEDS_MAX holds.
 */
static bool
haveTooManyHeaderBytes(size_t totalLen, cli_ctx *ctx, bool *heuristicFound)
{
    if (totalLen <= HEURISTIC_EMAIL_MAX_HEADER_BYTES) {
        return false;
    }

    if (SCAN_HEURISTIC_EXCEEDS_MAX) {
        cli_append_potentially_unwanted(ctx, "Heuristics.Limits.Exceeded.EmailHeaderBytes");
        *heuristicFound = true;
    }

    return true;
}

/*
 * Check the number of headers seen against HEURISTIC_EMAIL_MAX_HEADERS.
 *
 * Returns true if totalHeaderCnt is above the limit. The alert is raised, and
 * *heuristicFound set, only when SCAN_HEURISTIC_EXCEEDS_MAX holds.
 */
static bool
haveTooManyEmailHeaders(size_t totalHeaderCnt, cli_ctx *ctx, bool *heuristicFound)
{
    if (totalHeaderCnt <= HEURISTIC_EMAIL_MAX_HEADERS) {
        return false;
    }

    if (SCAN_HEURISTIC_EXCEEDS_MAX) {
        cli_append_potentially_unwanted(ctx, "Heuristics.Limits.Exceeded.EmailHeaders");
        *heuristicFound = true;
    }

    return true;
}

/*
 * Check the number of MIME parts against
 * HEURISTIC_EMAIL_MAX_MIME_PARTS_PER_MESSAGE.
 *
 * Returns true once mimePartCnt reaches the limit. The alert is raised, and
 * *rc set to VIRUS, only when SCAN_HEURISTIC_EXCEEDS_MAX holds.
 */
static bool
haveTooManyMIMEPartsPerMessage(size_t mimePartCnt, cli_ctx *ctx, mbox_status *rc)
{
    if (mimePartCnt < HEURISTIC_EMAIL_MAX_MIME_PARTS_PER_MESSAGE) {
        return false;
    }

    if (SCAN_HEURISTIC_EXCEEDS_MAX) {
        cli_append_potentially_unwanted(ctx, "Heuristics.Limits.Exceeded.EmailMIMEPartsPerMessage");
        *rc = VIRUS;
    }

    return true;
}

/*
 * Check the number of arguments of one header against
 * HEURISTIC_EMAIL_MAX_ARGUMENTS_PER_HEADER.
 *
 * Returns true once argCnt reaches the limit. The alert is raised, and
 * *heuristicFound set, only when SCAN_HEURISTIC_EXCEEDS_MAX holds.
 */
static bool
haveTooManyMIMEArguments(size_t argCnt, cli_ctx *ctx, bool *heuristicFound)
{
    if (argCnt < HEURISTIC_EMAIL_MAX_ARGUMENTS_PER_HEADER) {
        return false;
    }

    if (SCAN_HEURISTIC_EXCEEDS_MAX) {
        cli_append_potentially_unwanted(ctx, "Heuristics.Limits.Exceeded.EmailMIMEArguments");
        *heuristicFound = true;
    }

    return true;
}

/*
 * Read in an email message from map (firstLine has already been read by the
 * caller), parse it, and return the message
 *
 * FIXME: files full of new lines and nothing else are
 * handled ungracefully...
 */
static message *
parseEmailFile(fmap_t *map, size_t *at, const table_t *rfc821, const char *firstLine, const char *dir, cli_ctx *ctx, bool *heuristicFound)
{
    bool inHeader     = true;
    bool bodyIsEmpty  = true;
    bool lastWasBlank = false, lastBodyLineWasBlank = false;
    message *ret;
    bool anyHeadersFound = false;
    int commandNumber    = -1;
    char *boundary       = NULL;
    char buffer[RFC2821LENGTH + 1];
    bool lastWasOnlySemi    = false;
    int err                 = 1;
    size_t totalHeaderBytes = 0;
    size_t totalHeaderCnt   = 0;

    size_t lineFoldCnt = 0;

    *heuristicFound = false;

    ReadStruct *head = NULL;
    ReadStruct *curr = NULL;
    cli_dbgmsg("parseEmailFile\n");

    ret = messageCreate();
    if (ret == NULL)
        return NULL;

    CLI_CALLOC(head, 1, sizeof(ReadStruct));
    curr = head;

    strncpy(buffer, firstLine, sizeof(buffer) - 1);
    do {
        const char *line;

        (void)cli_chomp(buffer);

        if (buffer[0] == '\0')
            line = NULL;
        else
            line = buffer;

        if (doContinueMultipleEmptyOptions(line, &lastWasOnlySemi)) {
            continue;
        }

        if (hitLineFoldCnt(line, &lineFoldCnt, ctx, heuristicFound)) {
            break;
        }

        /*
         * Don't blank lines which are only spaces from headers,
         * otherwise they'll be treated as the end of header marker
         */
        if (lastWasBlank) {
            lastWasBlank = false;
            if (boundaryStart(buffer, boundary)) {
                cli_dbgmsg("Found a header line with space that should be blank\n");
                inHeader = false;
            }
        }
        if (inHeader) {
            cli_dbgmsg("parseEmailFile: check '%s'\n", buffer);

            /*
             * Ensure wide characters are handled where
             * sizeof(char) > 1
             */
            if (line && isspace(line[0] & 0xFF)) {
                char copy[sizeof(buffer)];

                strcpy(copy, buffer);
                strstrip(copy);
                if (copy[0] == '\0') {
                    /*
                     * The header line contains only white
                     * space. This is not the end of the
                     * headers according to RFC2822, but
                     * some MUAs will handle it as though
                     * it were, and virus writers exploit
                     * this bug. We can't just break from
                     * the loop here since that would allow
                     * other exploits such as inserting a
                     * white space line before the
                     * content-type line. So we just have
                     * to make a best guess. Sigh.
                     */
                    if (head->bufferLen) {
                        char *header     = getMallocedBufferFromList(head);
                        int needContinue = 0;
                        VERIFY_POINTER(header);

                        totalHeaderCnt++;
                        if (haveTooManyEmailHeaders(totalHeaderCnt, ctx, heuristicFound)) {
                            FREE(header);
                            break;
                        }
                        needContinue = (parseEmailHeader(ret, header, rfc821, ctx, heuristicFound) < 0);
                        if (*heuristicFound) {
                            FREE(header);
                            break;
                        }

                        FREE(header);
                        FREELIST_REALLOC(head, curr);

                        if (needContinue) {
                            continue;
                        }
                    }

                    if (boundary ||
                        ((boundary = (char *)messageFindArgument(ret, "boundary")) != NULL)) {
                        lastWasBlank = true;
                        continue;
                    }
                }
            }
            if ((line == NULL) && (0 == head->bufferLen)) { /* empty line */
                /*
                 * A blank line signifies the end of
                 * the header and the start of the text
                 */
                if (!anyHeadersFound)
                    /* Ignore the junk at the top */
                    continue;

                cli_dbgmsg("End of header information\n");
                inHeader    = false;
                bodyIsEmpty = true;
            } else {
                char *ptr;
                const char *lookahead;
                bool lineAdded = true;

                if (0 == head->bufferLen) {
                    char cmd[RFC2821LENGTH + 1], out[RFC2821LENGTH + 1];

                    /*
                     * Continuation of line we're ignoring?
                     */
                    if (isblank(line[0]))
                        continue;

                    /*
                     * Is this a header we're interested in?
                     */
                    if ((strchr(line, ':') == NULL) ||
                        (cli_strtokbuf(line, 0, ":", cmd) == NULL)) {
                        if (strncmp(line, "From ", 5) == 0)
                            anyHeadersFound = true;
                        continue;
                    }

                    ptr           = rfc822comments(cmd, out);
                    commandNumber = tableFind(rfc821, ptr ? ptr : cmd);

                    switch (commandNumber) {
                        case CONTENT_TRANSFER_ENCODING:
                        case CONTENT_DISPOSITION:
                        case CONTENT_TYPE:
                            anyHeadersFound = true;
                            break;
                        default:
                            if (!anyHeadersFound)
                                anyHeadersFound = usefulHeader(commandNumber, cmd);
                            continue;
                    }
                    curr = appendReadStruct(curr, line);
                    if (NULL == curr) {
                        if (ret) {
                            ret->isTruncated = true;
                        }
                        break;
                    }
                } else if (line != NULL) {
                    curr = appendReadStruct(curr, line);
                } else {
                    lineAdded = false;
                }

                if (lineAdded) {
                    totalHeaderBytes += strlen(line);
                    if (haveTooManyHeaderBytes(totalHeaderBytes, ctx, heuristicFound)) {
                        break;
                    }
                }

                if ((lookahead = fmap_need_off_once(map, *at, 1))) {
                    /*
                     * Section B.2 of RFC822 says TAB or
                     * SPACE means a continuation of the
                     * previous entry.
                     *
                     * Add all the arguments on the line
                     */
                    if (isblank(*lookahead))
                        continue;
                }

                /*
                 * Handle broken headers, where the next
                 * line isn't indented by whitespace
                 */
                {
                    char *header     = getMallocedBufferFromList(head); /*This is the issue */
                    int needContinue = 0;
                    VERIFY_POINTER(header);

                    needContinue = (header[strlen(header) - 1] == ';');
                    if (0 == needContinue) {
                        needContinue = (line && (count_quotes(header) & 1));
                    }

                    if (0 == needContinue) {
                        totalHeaderCnt++;
                        if (haveTooManyEmailHeaders(totalHeaderCnt, ctx, heuristicFound)) {
                            FREE(header);
                            break;
                        }
                        needContinue = (parseEmailHeader(ret, header, rfc821, ctx, heuristicFound) < 0);
                        if (*heuristicFound) {
                            FREE(header);
                            break;
                        }
                        /*Check total headers here;*/
                    }

                    FREE(header);
                    if (needContinue) {
                        continue;
                    }
                    FREELIST_REALLOC(head, curr);
                }
            }
        } else if (line && isuuencodebegin(line)) {
            /*
             * Fast track visa to uudecode.
             * TODO: binhex, yenc
             */
            bodyIsEmpty = false;
            if (uudecodeFile(ret, line, dir, map, at) < 0)
                if (messageAddStr(ret, line) < 0)
                    break;
        } else {
            if (line == NULL) {
                /*
                 * Although this would save time and RAM, some
                 * phish signatures have been built which need
                 * the blank lines
                 */
                if (lastBodyLineWasBlank &&
                    (messageGetMimeType(ret) != TEXT)) {
                    cli_dbgmsg("Ignoring consecutive blank lines in the body\n");
                    continue;
                }
                lastBodyLineWasBlank = true;
            } else {
                if (bodyIsEmpty) {
                    /*
                     * Broken message: new line in the
                     * middle of the headers, so the first
                     * line of the body is in fact
                     * the last lines of the header
                     */
                    if (newline_in_header(line))
                        continue;
                    bodyIsEmpty = false;
                }
                lastBodyLineWasBlank = false;
            }

            if (messageAddStr(ret, line) < 0)
                break;
        }
    } while (getline_from_mbox(buffer, sizeof(buffer) - 1, map, at) != NULL);

    err = 0;
done:
    if (err) {
        cli_errmsg("parseEmailFile: ERROR parsing file\n");
        ret->isTruncated = true;
    }

    FREE(boundary);

    freeList(head);

    if (!anyHeadersFound) {
        /*
         * False positive in believing we have an e-mail when we don't
         */
        messageDestroy(ret);
        cli_dbgmsg("parseEmailFile: no headers found, assuming it isn't an email\n");
        return NULL;
    }

    if (*heuristicFound) {
        messageDestroy(ret);
        cli_dbgmsg("parseEmailFile: found heuristic\n");
        return NULL;
    }

    cli_dbgmsg("parseEmailFile: return\n");

    return ret;
}

/*
 * The given message contains a raw e-mail.
 *
 * Walks the body lines of m, parsing the header lines into a newly created
 * message. Header lines that have been consumed are unlinked from m; once the
 * headers are finished the remaining text is moved from m to the new message
 * (note that m itself isn't destroyed).
 *
 * m              - message holding the raw e-mail, may be NULL
 * rfc821         - table used to look up header names
 * heuristicFound - reset to false on entry; set to true by the limit
 *                  heuristics when an alert has been raised
 *
 * Returns the new message, or NULL when m is NULL, when the new message
 * could not be created, when no useful headers were found, or when a
 * heuristic alert was raised.
 *
 * TODO: remove the duplication with parseEmailFile
 */
static message *
parseEmailHeaders(message *m, const table_t *rfc821, bool *heuristicFound)
{
    bool inHeader    = true;
    bool bodyIsEmpty = true;
    text *t;
    message *ret;
    bool anyHeadersFound  = false;
    int commandNumber     = -1;
    char *fullline        = NULL;
    size_t fulllinelength = 0;
    bool lastWasOnlySemi  = false;
    size_t lineFoldCnt    = 0;
    size_t totalHeaderCnt = 0;

    cli_dbgmsg("parseEmailHeaders\n");

    *heuristicFound = false;

    if (m == NULL)
        return NULL;

    ret = messageCreate();
    if (ret == NULL)
        /* Same handling as parseEmailFile: nothing to parse into */
        return NULL;

    for (t = messageGetBody(m); t; t = t->t_next) {
        const char *line;

        if (t->t_line)
            line = lineGetData(t->t_line);
        else
            line = NULL;

        if (doContinueMultipleEmptyOptions(line, &lastWasOnlySemi)) {
            continue;
        }

        if (hitLineFoldCnt(line, &lineFoldCnt, m->ctx, heuristicFound)) {
            break;
        }

        if (inHeader) {
            cli_dbgmsg("parseEmailHeaders: check '%s'\n",
                       line ? line : "");
            if (line == NULL) {
                /*
                 * A blank line signifies the end of
                 * the header and the start of the text
                 */
                cli_dbgmsg("End of header information\n");
                if (!anyHeadersFound) {
                    cli_dbgmsg("Nothing interesting in the header\n");
                    break;
                }
                inHeader    = false;
                bodyIsEmpty = true;
            } else {
                char *ptr;
                bool lineAdded   = true;
                bool parseFailed = false;

                if (fullline == NULL) {
                    char cmd[RFC2821LENGTH + 1];

                    /*
                     * Continuation of line we're ignoring?
                     */
                    if (isblank((unsigned char)line[0]))
                        continue;

                    /*
                     * Is this a header we're interested in?
                     */
                    if ((strchr(line, ':') == NULL) ||
                        (cli_strtokbuf(line, 0, ":", cmd) == NULL)) {
                        if (strncmp(line, "From ", 5) == 0)
                            anyHeadersFound = true;
                        continue;
                    }

                    ptr           = rfc822comments(cmd, NULL);
                    commandNumber = tableFind(rfc821, ptr ? ptr : cmd);
                    free(ptr);

                    switch (commandNumber) {
                        case CONTENT_TRANSFER_ENCODING:
                        case CONTENT_DISPOSITION:
                        case CONTENT_TYPE:
                            anyHeadersFound = true;
                            break;
                        default:
                            if (!anyHeadersFound)
                                anyHeadersFound = usefulHeader(commandNumber, cmd);
                            continue;
                    }
                    fullline       = cli_strdup(line);
                    fulllinelength = strlen(line) + 1;
                } else if (line) {
                    /*
                     * Only commit the new length once the buffer has
                     * actually grown, so that the two stay in step when
                     * the reallocation fails
                     */
                    size_t newLength = fulllinelength + strlen(line) + 1;

                    ptr = cli_realloc(fullline, newLength);
                    if (ptr == NULL)
                        continue;
                    fullline       = ptr;
                    fulllinelength = newLength;
                    cli_strlcat(fullline, line, fulllinelength);
                } else {
                    lineAdded = false;
                }

                /*continue doesn't seem right here, but that is what is done everywhere else when a malloc fails.*/
                if (NULL == fullline) {
                    continue;
                }

                if (lineAdded) {
                    if (haveTooManyHeaderBytes(fulllinelength, m->ctx, heuristicFound)) {
                        break;
                    }
                }

                if (next_is_folded_header(t))
                    /* Add arguments to this line */
                    continue;

                lineUnlink(t->t_line);
                t->t_line = NULL;

                /* An odd number of quotes means the value is still open */
                if (count_quotes(fullline) & 1)
                    continue;

                ptr = rfc822comments(fullline, NULL);
                if (ptr) {
                    free(fullline);
                    fullline = ptr;
                }

                totalHeaderCnt++;
                if (haveTooManyEmailHeaders(totalHeaderCnt, m->ctx, heuristicFound)) {
                    break;
                }

                /*
                 * Check for a heuristic alert before deciding whether to
                 * carry on, in the same order as parseEmailFile, so that
                 * parsing stops as soon as an alert has been raised
                 */
                parseFailed = (parseEmailHeader(ret, fullline, rfc821, m->ctx, heuristicFound) < 0);
                if (*heuristicFound) {
                    break;
                }
                if (parseFailed) {
                    continue;
                }

                free(fullline);
                fullline = NULL;
            }
        } else {
            if (bodyIsEmpty) {
                if (line == NULL)
                    /* throw away leading blank lines */
                    continue;
                /*
                 * Broken message: new line in the
                 * middle of the headers, so the first
                 * line of the body is in fact
                 * the last lines of the header
                 */
                if (newline_in_header(line))
                    continue;
                bodyIsEmpty = false;
            }
            /*if(t->t_line && isuuencodebegin(t->t_line))
                puts("FIXME: add fast visa here");*/
            cli_dbgmsg("parseEmailHeaders: finished with headers, moving body\n");
            messageMoveText(ret, t, m);
            break;
        }
    }

    if (fullline) {
        /* Report an interesting header that was collected but never parsed */
        if (*fullline) {
            switch (commandNumber) {
                case CONTENT_TRANSFER_ENCODING:
                case CONTENT_DISPOSITION:
                case CONTENT_TYPE:
                    cli_dbgmsg("parseEmailHeaders: Fullline unparsed '%s'\n", fullline);
                    break;
                default:
                    break;
            }
        }
        free(fullline);
    }

    if (!anyHeadersFound) {
        /*
         * False positive in believing we have an e-mail when we don't
         */
        messageDestroy(ret);
        cli_dbgmsg("parseEmailHeaders: no headers found, assuming it isn't an email\n");
        return NULL;
    }
    if (*heuristicFound) {
        messageDestroy(ret);
        cli_dbgmsg("parseEmailHeaders: found a heuristic, delete message and stop parsing.\n");
        return NULL;
    }

    cli_dbgmsg("parseEmailHeaders: return\n");

    return ret;
}

/*
 * Handle a single header line of an email message.
 *
 * The line is split into a command and an argument at the first of the
 * candidate separators ':', '=' or ' ' that occurs anywhere in the line; the
 * pair is then handed to parseMimeHeader().
 *
 * Returns the result of parseMimeHeader(), or -1 when the line holds no
 * separator, no command, no argument, or when the working copy of the line
 * could not be allocated.
 */
static int
parseEmailHeader(message *m, const char *line, const table_t *rfc821, cli_ctx *ctx, bool *heuristicFound)
{
    const char *separator = ":= ";
    char tokenseparator[2];
    char *copy = NULL;
    char *cmd  = NULL;
    char *arg  = NULL;
    int ret    = -1;
#ifdef CL_THREAD_SAFE
    char *strptr;
#endif

    cli_dbgmsg("parseEmailHeader '%s'\n", line);

    /*
     * In RFC822 the separator between the key a value is a colon,
     * e.g.    Content-Transfer-Encoding: base64
     * However some MUA's are lapse about this and virus writers exploit
     * this hole, so we need to check all known possibilities
     */
    while ((*separator != '\0') && (strchr(line, *separator) == NULL)) {
        separator++;
    }

    if (*separator == '\0') {
        return -1;
    }

    copy = rfc2047(line);
    if (copy == NULL) {
        /* an RFC checker would return -1 here */
        copy = cli_strdup(line);
    }
    if (copy == NULL) {
        goto done;
    }

    tokenseparator[0] = *separator;
    tokenseparator[1] = '\0';

#ifdef CL_THREAD_SAFE
    cmd = strtok_r(copy, tokenseparator, &strptr);
#else
    cmd = strtok(copy, tokenseparator);
#endif

    /* Nothing useful in front of the separator */
    if ((cmd == NULL) || !(strstrip(cmd) > 0)) {
        goto done;
    }

    /* An empty delimiter set yields everything left after the command */
#ifdef CL_THREAD_SAFE
    arg = strtok_r(NULL, "", &strptr);
#else
    arg = strtok(NULL, "");
#endif

    if (arg != NULL) {
        /*
         * Found a header such as
         * Content-Type: multipart/mixed;
         * set arg to be
         * "multipart/mixed" and cmd to
         * be "Content-Type"
         */
        ret = parseMimeHeader(m, cmd, rfc821, arg, ctx, heuristicFound);
    }

done:
    FREE(copy);

    return ret;
}

#if HAVE_LIBXML2
/*
 * Key table for the root HTML document of an MHTML message; handed to
 * cli_msxml_parse_document() by parseRootMHTML().
 * NOTE(review): each entry looks like {element name, JSON key name,
 * MSXML_* flags} - confirm against the struct key_entry definition.
 */
static const struct key_entry mhtml_keys[] = {
    /* root html tags for microsoft office document */
    {"html", "RootHTML", MSXML_JSON_ROOT | MSXML_JSON_ATTRIB},

    {"head", "Head", MSXML_JSON_WRKPTR | MSXML_COMMENT_CB},
    {"meta", "Meta", MSXML_JSON_WRKPTR | MSXML_JSON_MULTI | MSXML_JSON_ATTRIB},
    {"link", "Link", MSXML_JSON_WRKPTR | MSXML_JSON_MULTI | MSXML_JSON_ATTRIB},
    {"script", "Script", MSXML_JSON_WRKPTR | MSXML_JSON_MULTI | MSXML_JSON_VALUE}};
/* Number of entries in mhtml_keys */
static size_t num_mhtml_keys = sizeof(mhtml_keys) / sizeof(struct key_entry);

/*
 * Key table for the XML embedded in HTML comments; handed to
 * cli_msxml_parse_document() by parseMHTMLComment().
 */
static const struct key_entry mhtml_comment_keys[] = {
    /* embedded xml tags (comment) for microsoft office document */
    {"o:documentproperties", "DocumentProperties", MSXML_JSON_ROOT | MSXML_JSON_ATTRIB},
    {"o:author", "Author", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:lastauthor", "LastAuthor", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:revision", "Revision", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:totaltime", "TotalTime", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:created", "Created", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:lastsaved", "LastSaved", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:pages", "Pages", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:words", "Words", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:characters", "Characters", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:company", "Company", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:lines", "Lines", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:paragraphs", "Paragraphs", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:characterswithspaces", "CharactersWithSpaces", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"o:version", "Version", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},

    {"o:officedocumentsettings", "DocumentSettings", MSXML_IGNORE_ELEM},
    {"w:worddocument", "WordDocument", MSXML_IGNORE_ELEM},
    {"w:latentstyles", "LatentStyles", MSXML_IGNORE_ELEM}};
/* Number of entries in mhtml_comment_keys */
static size_t num_mhtml_comment_keys = sizeof(mhtml_comment_keys) / sizeof(struct key_entry);
#endif

/*
 * Comment callback for the root HTML file of a related multipart message.
 *
 * Scans the comment text for "<xml>" ... "</xml>" spans and feeds each one,
 * closing tag included, to the msxml parser using the comment key table.
 * Scanning stops at the first span with no closing tag, or as soon as the
 * parser returns something other than CL_SUCCESS.
 *
 * Cannot operate without LIBXML2; in that case only debug messages are
 * emitted and CL_SUCCESS is returned.
 * This function is only used for Preclassification JSON.
 */
static cl_error_t parseMHTMLComment(const char *comment, cli_ctx *ctx, void *wrkjobj, void *cbdata)
{
    cl_error_t ret = CL_SUCCESS;

#if HAVE_LIBXML2
    const char *cursor = comment;

    UNUSEDPARAM(cbdata);
    UNUSEDPARAM(wrkjobj);

    for (;;) {
        xmlTextReaderPtr reader;
        const char *xmlend;
        const char *xmlsrt = strstr(cursor, "<xml>");

        if (xmlsrt == NULL) {
            /* no further embedded xml */
            break;
        }

        xmlend = strstr(xmlsrt, "</xml>");
        if (xmlend == NULL) {
            cli_dbgmsg("parseMHTMLComment: unbounded xml tag\n");
            break;
        }

        /* the extra 6 bytes cover the "</xml>" closing tag itself */
        reader = xmlReaderForMemory(xmlsrt, xmlend - xmlsrt + 6, "comment.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
        if (!reader) {
            cli_dbgmsg("parseMHTMLComment: cannot initialize xmlReader\n");

#if HAVE_JSON
            if (ctx->wrkproperty != NULL)
                ret = cli_json_parse_error(ctx->wrkproperty, "MHTML_ERROR_XML_READER_MEM");
#endif
            return ret; // libxml2 failed!
        }

        /* comment callback is not set to prevent recursion */
        /* TODO: should we separate the key dictionaries? */
        /* TODO: should we use the json object pointer? */
        ret = cli_msxml_parse_document(ctx, reader, mhtml_comment_keys, num_mhtml_comment_keys, MSXML_FLAG_JSON, NULL);

        xmlTextReaderClose(reader);
        xmlFreeTextReader(reader);
        if (ret != CL_SUCCESS) {
            return ret;
        }

        /* resume the search from the closing tag just handled */
        cursor = xmlend;
    }
#else
    UNUSEDPARAM(comment);
    UNUSEDPARAM(ctx);
    UNUSEDPARAM(wrkjobj);
    UNUSEDPARAM(cbdata);

    cli_dbgmsg("in parseMHTMLComment\n");
    cli_dbgmsg("parseMHTMLComment: parsing html xml-comments requires libxml2!\n");
#endif
    return ret;
}

/*
 * The related multipart root HTML file parsing wrapper.
 *
 * Converts the message m (or, when m is NULL, the text t) to a blob, reads
 * it as an HTML document and walks it with the msxml parser using the
 * mhtml_keys table.
 *
 * Attempts to leverage msxml parser, cannot operate without LIBXML2 built
 * with HTML support; without it only debug messages are emitted and OK is
 * returned.
 * This function is only used for Preclassification JSON.
 *
 * Returns OK when there is nothing to do (no ctx, no input, or the blob could
 * not be built) and when the parser finishes with CL_SUCCESS, CL_ETIMEOUT or
 * CL_BREAK; MAXREC, MAXFILES or VIRUS for the matching parser results; FAIL
 * for any other parser result or when recording a JSON parse error fails.
 */
static mbox_status
parseRootMHTML(mbox_ctx *mctx, message *m, text *t)
{
    cli_ctx *ctx = mctx->ctx;
#if HAVE_LIBXML2
#ifdef LIBXML_HTML_ENABLED
    struct msxml_ctx mxctx;
    blob *input = NULL;
    htmlDocPtr htmlDoc;
    xmlTextReaderPtr reader;
    int ret        = CL_SUCCESS;
    mbox_status rc = OK;
#if HAVE_JSON
    json_object *rhtml;
#endif

    cli_dbgmsg("in parseRootMHTML\n");

    if (ctx == NULL)
        return OK;

    if (m == NULL && t == NULL)
        return OK;

    if (m != NULL)
        input = messageToBlob(m, 0);
    else /* t != NULL */
        input = textToBlob(t, NULL, 0);

    if (input == NULL)
        return OK;

    htmlDoc = htmlReadMemory((char *)input->data, input->len, "mhtml.html", NULL, CLAMAV_MIN_XMLREADER_FLAGS | HTML_PARSE_NOWARNING);
    if (htmlDoc == NULL) {
        cli_dbgmsg("parseRootMHTML: cannot initialize read html document\n");
#if HAVE_JSON
        if (ctx->wrkproperty != NULL)
            ret = cli_json_parse_error(ctx->wrkproperty, "MHTML_ERROR_HTML_READ");
        if (ret != CL_SUCCESS)
            rc = FAIL;
#endif
        blobDestroy(input);
        return rc;
    }

#if HAVE_JSON
    if (mctx->wrkobj) {
        rhtml = cli_jsonobj(mctx->wrkobj, "RootHTML");
        if (rhtml != NULL) {
            /* MHTML-specific properties */
            cli_jsonstr(rhtml, "Encoding", (const char *)htmlGetMetaEncoding(htmlDoc));
            cli_jsonint(rhtml, "CompressMode", xmlGetDocCompressMode(htmlDoc));
        }
    }
#endif

    reader = xmlReaderWalker(htmlDoc);
    if (reader == NULL) {
        cli_dbgmsg("parseRootMHTML: cannot initialize xmlTextReader\n");
#if HAVE_JSON
        if (ctx->wrkproperty != NULL)
            ret = cli_json_parse_error(ctx->wrkproperty, "MHTML_ERROR_XML_READER_IO");
        if (ret != CL_SUCCESS)
            rc = FAIL;
#endif
        /* The document was parsed successfully, so it must be released here too */
        xmlFreeDoc(htmlDoc);
        blobDestroy(input);
        return rc;
    }

    memset(&mxctx, 0, sizeof(mxctx));
    /* no scanning callback set */
    mxctx.comment_cb = parseMHTMLComment;
    ret              = cli_msxml_parse_document(ctx, reader, mhtml_keys, num_mhtml_keys, MSXML_FLAG_JSON | MSXML_FLAG_WALK, &mxctx);
    /* Translate the parser's result into an mbox_status */
    switch (ret) {
        case CL_SUCCESS:
        case CL_ETIMEOUT:
        case CL_BREAK:
            rc = OK;
            break;

        case CL_EMAXREC:
            rc = MAXREC;
            break;

        case CL_EMAXFILES:
            rc = MAXFILES;
            break;

        case CL_VIRUS:
            rc = VIRUS;
            break;

        default:
            rc = FAIL;
    }

    xmlTextReaderClose(reader);
    xmlFreeTextReader(reader);
    xmlFreeDoc(htmlDoc);
    blobDestroy(input);
    return rc;
#else  /* LIBXML_HTML_ENABLED */
    UNUSEDPARAM(m);
    UNUSEDPARAM(t);
    cli_dbgmsg("in parseRootMHTML\n");
    cli_dbgmsg("parseRootMHTML: parsing html documents disabled in libxml2!\n");

    /* Without this the function would fall off its end with no value */
    return OK;
#endif /* LIBXML_HTML_ENABLED */
#else  /* HAVE_LIBXML2 */
    UNUSEDPARAM(m);
    UNUSEDPARAM(t);
    cli_dbgmsg("in parseRootMHTML\n");
    cli_dbgmsg("parseRootMHTML: parsing html documents requires libxml2!\n");

    return OK;
#endif /* HAVE_LIBXML2 */
}

/*
 * This is a recursive routine.
 *
 * This function parses the body of mainMessage and saves its attachments in dir
 *
 * mainMessage is the buffer to be parsed, it contains an e-mail's body, without
 * any headers. First time of calling it'll be
 * the whole message. Later it'll be parts of a multipart message
 * textIn is the plain text message being built up so far
 */
static mbox_status
parseEmailBody(message *messageIn, text *textIn, mbox_ctx *mctx, unsigned int recursion_level)
{
    mbox_status rc;
    text *aText          = textIn;
    message *mainMessage = messageIn;
    fileblob *fb;
    bool infected                  = false;
    const struct cl_engine *engine = mctx->ctx->engine;
    const int doPhishingScan       = engine->dboptions & CL_DB_PHISHING_URLS && (DCONF_PHISHING & PHISHING_CONF_ENGINE);
#if HAVE_JSON
    json_object *saveobj = mctx->wrkobj;
#endif
    bool heuristicFound = false;

    cli_dbgmsg("in parseEmailBody, %u files saved so far\n",
               mctx->files);

    /* FIXMELIMITS: this should be better integrated */
    if (engine->max_recursion_level)
        /*
         * This is approximate
         */
        if (recursion_level > engine->max_recursion_level) {
            // Note: engine->max_recursion_level is re-purposed here out of convenience.
            //       ole2 recursion does not leverage the ctx->recursion_stack stack.
            cli_dbgmsg("parseEmailBody: hit maximum recursion level (%u)\n", recursion_level);
            return MAXREC;
        }
    if (engine->maxfiles && (mctx->files >= engine->maxfiles)) {
        /*
         * FIXME: This is only approx - it may have already
         * been exceeded
         */
        cli_dbgmsg("parseEmailBody: number of files exceeded %u\n", engine->maxfiles);
        return MAXFILES;
    }

    rc = OK;

    /* Anything left to be parsed? */
    if (mainMessage && (messageGetBody(mainMessage) != NULL)) {
        mime_type mimeType;
        int subtype, inhead, htmltextPart, inMimeHead, i;
        const char *mimeSubtype;
        char *boundary;
        const text *t_line;
        /*bool isAlternative;*/
        message *aMessage;
        int multiparts     = 0;
        message **messages = NULL; /* parts of a multipart message */

        cli_dbgmsg("Parsing mail file\n");

        mimeType    = messageGetMimeType(mainMessage);
        mimeSubtype = messageGetMimeSubtype(mainMessage);
#if HAVE_JSON
        if (mctx->wrkobj != NULL) {
            mctx->wrkobj = cli_jsonobj(mctx->wrkobj, "Body");
            cli_jsonstr(mctx->wrkobj, "MimeType", getMimeTypeStr(mimeType));
            cli_jsonstr(mctx->wrkobj, "MimeSubtype", mimeSubtype);
            cli_jsonstr(mctx->wrkobj, "EncodingType", getEncTypeStr(messageGetEncoding(mainMessage)));
            cli_jsonstr(mctx->wrkobj, "Disposition", messageGetDispositionType(mainMessage));
            if (messageHasFilename(mainMessage)) {
                char *filename = messageGetFilename(mainMessage);
                cli_jsonstr(mctx->wrkobj, "Filename", filename);
                free(filename);
            } else {
                cli_jsonstr(mctx->wrkobj, "Filename", "(inline)");
            }
        }
#endif

        /* pre-process */
        subtype = tableFind(mctx->subtypeTable, mimeSubtype);
        if ((mimeType == TEXT) && (subtype == PLAIN)) {
            /*
             * This is effectively no encoding, notice that we
             * don't check that charset is us-ascii
             */
            cli_dbgmsg("text/plain: Assume no attachments\n");
            mimeType = NOMIME;
            messageSetMimeSubtype(mainMessage, "");
        } else if ((mimeType == MESSAGE) &&
                   (strcasecmp(mimeSubtype, "rfc822-headers") == 0)) {
            /*
             * RFC1892/RFC3462: section 2 text/rfc822-headers
             * incorrectly sent as message/rfc822-headers
             *
             * Parse as text/plain, i.e. no mime
             */
            cli_dbgmsg("Changing message/rfc822-headers to text/rfc822-headers\n");
            mimeType = NOMIME;
            messageSetMimeSubtype(mainMessage, "");
        } else
            cli_dbgmsg("mimeType = %d\n", (int)mimeType);

        switch (mimeType) {
            case NOMIME:
                cli_dbgmsg("Not a mime encoded message\n");
                aText = textAddMessage(aText, mainMessage);

                if (!doPhishingScan) {
                    break;
                }
                /*
                 * Fall through: some phishing mails claim they are
                 * text/plain, when they are in fact html
                 */
                /* fall through */
            case TEXT:
                /* text/plain has been preprocessed as no encoding */
                if (doPhishingScan) {
                    /*
                     * It would be better to save and scan the
                     * file and only checkURLs if it's found to be
                     * clean
                     */
                    checkURLs(mainMessage, mctx, &rc, (subtype == HTML));
                    /*
                     * There might be html sent without subtype
                     * html too, so scan them for phishing
                     */
                    if (rc == VIRUS)
                        infected = true;
                }
                break;
            case MULTIPART:
                cli_dbgmsg("Content-type 'multipart' handler\n");
                boundary = messageFindArgument(mainMessage, "boundary");

#if HAVE_JSON
                if (mctx->wrkobj != NULL)
                    cli_jsonstr(mctx->wrkobj, "Boundary", boundary);
#endif

                if (boundary == NULL) {
                    cli_dbgmsg("Multipart/%s MIME message contains no boundary header\n",
                               mimeSubtype);
                    /* Broken e-mail message */
                    mimeType = NOMIME;
                    /*
                     * The break means that we will still
                     * check if the file contains a uuencoded file
                     */
                    break;
                }

                cli_chomp(boundary);

                /* Perhaps it should assume mixed? */
                if (mimeSubtype[0] == '\0') {
                    cli_dbgmsg("Multipart has no subtype assuming alternative\n");
                    mimeSubtype = "alternative";
                    messageSetMimeSubtype(mainMessage, "alternative");
                }

                /*
                 * Get to the start of the first message
                 */
                t_line = messageGetBody(mainMessage);

                if (t_line == NULL) {
                    cli_dbgmsg("Multipart MIME message has no body\n");
                    free((char *)boundary);
                    mimeType = NOMIME;
                    break;
                }

                do
                    if (t_line->t_line) {
                        if (boundaryStart(lineGetData(t_line->t_line), boundary))
                            break;
                        /*
                         * Found a binhex file before
                         *    the first multipart
                         * TODO: check yEnc
                         */
                        if (binhexBegin(mainMessage) == t_line) {
                            if (exportBinhexMessage(mctx, mainMessage)) {
                                /* virus found */
                                rc       = VIRUS;
                                infected = true;
                                break;
                            }
                        } else if (t_line->t_next &&
                                   (encodingLine(mainMessage) == t_line->t_next)) {
                            /*
                             * We look for the next line
                             * since later on we'll skip
                             * over the important line when
                             * we think it's a blank line
                             * at the top of the message -
                             * which it would have been in
                             * an RFC compliant world
                             */
                            cli_dbgmsg("Found MIME attachment before the first MIME section \"%s\"\n",
                                       lineGetData(t_line->t_next->t_line));
                            if (messageGetEncoding(mainMessage) == NOENCODING)
                                break;
                        }
                    }
                while ((t_line = t_line->t_next) != NULL);

                if (t_line == NULL) {
                    cli_dbgmsg("Multipart MIME message contains no boundary lines (%s)\n",
                               boundary);
                    free((char *)boundary);
                    mimeType = NOMIME;
                    /*
                     * The break means that we will still
                     * check if the file contains a yEnc/binhex file
                     */
                    break;
                }
                /*
                 * Build up a table of all of the parts of this
                 * multipart message. Remember, each part may itself
                 * be a multipart message.
                 */
                inhead     = 1;
                inMimeHead = 0;

                /*
                 * Re-read this variable in case mimeSubtype has changed
                 */
                subtype = tableFind(mctx->subtypeTable, mimeSubtype);

                /*
                 * Parse the mainMessage object and create an array
                 * of objects called messages, one for each of the
                 * multiparts that mainMessage contains.
                 *
                 * This looks like parseEmailHeaders() - maybe there's
                 * some duplication of code to be cleaned up
                 *
                 * We may need to create an array rather than just
                 * save each part as it is found because not all
                 * elements will need scanning, and we don't yet know
                 * which of those elements it will be, except in
                 * the case of mixed, when all parts need to be scanned.
                 */
                for (multiparts = 0; t_line && !infected; multiparts++) {
                    int lines = 0;
                    message **m;
                    mbox_status old_rc;

                    m = cli_realloc(messages, ((multiparts + 1) * sizeof(message *)));
                    if (m == NULL)
                        break;
                    messages = m;

                    aMessage = messages[multiparts] = messageCreate();
                    if (aMessage == NULL) {
                        multiparts--;
                        /* if allocation failed the first time,
                         * there's no point in retrying, just
                         * break out */
                        break;
                    }
                    messageSetCTX(aMessage, mctx->ctx);

                    cli_dbgmsg("Now read in part %d\n", multiparts);

                    /*
                     * Ignore blank lines. There shouldn't be ANY
                     * but some viruses insert them
                     */
                    while ((t_line = t_line->t_next) != NULL)
                        if (t_line->t_line &&
                            /*(cli_chomp(t_line->t_text) > 0))*/
                            (strlen(lineGetData(t_line->t_line)) > 0))
                            break;

                    if (t_line == NULL) {
                        cli_dbgmsg("Empty part\n");
                        /*
                         * Remove this part unless there's
                         * a binhex portion somewhere in
                         * the complete message that we may
                         * throw away by mistake if the MIME
                         * encoding information is incorrect
                         */
                        if (mainMessage &&
                            (binhexBegin(mainMessage) == NULL)) {
                            messageDestroy(aMessage);
                            --multiparts;
                        }
                        continue;
                    }

                    do {
                        const char *line = lineGetData(t_line->t_line);

                        /*
                        cli_dbgmsg("multipart %d: inMimeHead %d inhead %d boundary '%s' line '%s' next '%s'\n",
                        multiparts, inMimeHead, inhead, boundary, line,
                        t_line->t_next && t_line->t_next->t_line ? lineGetData(t_line->t_next->t_line) : "(null)");
                        */

                        if (inMimeHead) { /* continuation line */
                            if (line == NULL) {
                                /*inhead =*/inMimeHead = 0;
                                continue;
                            }
                            /*
                             * Handle continuation lines
                             * because the previous line
                             * ended with a ; or this line
                             * starts with a white space
                             */
                            cli_dbgmsg("Multipart %d: About to add mime Argument '%s'\n",
                                       multiparts, line);
                            /*
                             * Handle the case when it
                             * isn't really a continuation
                             * line:
                             * Content-Type: application/octet-stream;
                             * Content-Transfer-Encoding: base64
                             */
                            parseEmailHeader(aMessage, line, mctx->rfc821Table, mctx->ctx, &heuristicFound);
                            if (heuristicFound) {
                                rc = VIRUS;
                                break;
                            }

                            while (isspace((int)*line))
                                line++;

                            if (*line == '\0') {
                                inhead = inMimeHead = 0;
                                continue;
                            }
                            inMimeHead = false;
                            messageAddArgument(aMessage, line);
                        } else if (inhead) { /* handling normal headers */
                            /*int quotes;*/
                            char *fullline, *ptr;

                            if (line == NULL) {
                                /*
                                 * empty line, should be the end of the headers,
                                 * but some base64 decoders, e.g. uudeview, are broken
                                 * and will handle this type of entry, decoding the
                                 * base64 content...
                                 * Content-Type: application/octet-stream; name=text.zip
                                 * Content-Transfer-Encoding: base64
                                 * Content-Disposition: attachment; filename="text.zip"
                                 *
                                 * Content-Disposition: attachment;
                                 *    filename=text.zip
                                 * Content-Type: application/octet-stream;
                                 *    name=text.zip
                                 * Content-Transfer-Encoding: base64
                                 *
                                 * UEsDBAoAAAAAAACgPjJ2RHw676gAAO+oAABEAAAAbWFpbF90ZXh0LWluZm8udHh0ICAgICAgICAg
                                 */
                                const text *next = t_line->t_next;

                                if (next && next->t_line) {
                                    const char *data = lineGetData(next->t_line);

                                    if ((messageGetEncoding(aMessage) == NOENCODING) &&
                                        (messageGetMimeType(aMessage) == APPLICATION) &&
                                        data && strstr(data, "base64")) {
                                        /*
                                         * Handle this nightmare (note the blank
                                         * line in the header and the incorrect
                                         * content-transfer-encoding header)
                                         *
                                         * Content-Type: application/octet-stream; name="zipped_files.EXEX-Spanska: Yes
                                         *
                                         * r-Encoding: base64
                                         * Content-Disposition: attachment; filename="zipped_files.EXE"
                                         */
                                        messageSetEncoding(aMessage, "base64");
                                        cli_dbgmsg("Ignoring fake end of headers\n");
                                        continue;
                                    }
                                    if ((strncmp(data, "Content", 7) == 0) ||
                                        (strncmp(data, "filename=", 9) == 0)) {
                                        cli_dbgmsg("Ignoring fake end of headers\n");
                                        continue;
                                    }
                                }
                                cli_dbgmsg("Multipart %d: End of header information\n",
                                           multiparts);
                                inhead = 0;
                                continue;
                            }
                            if (isspace((int)*line)) {
                                /*
                                 * The first line is a
                                 * continuation line.
                                 * This is tricky
                                 * to handle, but
                                 * all we can do is our
                                 * best
                                 */
                                cli_dbgmsg("Part %d starts with a continuation line\n",
                                           multiparts);
                                messageAddArgument(aMessage, line);
                                /*
                                 * Give it a default
                                 * MIME type since
                                 * that may be the
                                 * missing line
                                 *
                                 * Choose application to
                                 * force a save
                                 */
                                if (messageGetMimeType(aMessage) == NOMIME)
                                    messageSetMimeType(aMessage, "application");
                                continue;
                            }

                            inMimeHead = false;

                            if (strlen(line) > RFC2821LENGTH) {
                                cli_dbgmsg("parseEmailBody: line length exceds RFC2821 maximum length (1000)\n");
                                // We must skip this line because functions like rfc822comments() may accept output buffers
                                // that are [RFC2821LENGTH + 1] in size and don't have any length checks to prevent exceeding that max.
                                // E.g. See `boundaryStart()`.
                                // TODO: A larger audit would be needed to remove this limitation, though frankly I recommend
                                // fully re-writing the email parser (in Rust).
                                continue;
                            }

                            fullline = rfc822comments(line, NULL);
                            if (fullline == NULL)
                                fullline = cli_strdup(line);

                            /*quotes = count_quotes(fullline);*/

                            /*
                             * Fold next lines to the end of this
                             * if they start with a white space
                             * or if this line has an odd number of quotes:
                             * Content-Type: application/octet-stream; name="foo
                             * "
                             */
                            while (t_line && next_is_folded_header(t_line)) {
                                const char *data;
                                size_t datasz;

                                t_line = t_line->t_next;

                                data = lineGetData(t_line->t_line);

                                if (data[1] == '\0') {
                                    /*
                                     * Broken message: the
                                     * blank line at the end
                                     * of the headers isn't blank -
                                     * it contains a space
                                     */
                                    cli_dbgmsg("Multipart %d: headers not terminated by blank line\n",
                                               multiparts);
                                    inhead = false;
                                    break;
                                }

                                datasz = strlen(fullline) + strlen(data) + 1;
                                ptr    = cli_realloc(fullline, datasz);

                                if (ptr == NULL)
                                    break;

                                fullline = ptr;
                                cli_strlcat(fullline, data, datasz);

                                /*quotes = count_quotes(data);*/
                            }

                            cli_dbgmsg("Multipart %d: About to parse folded header '%s'\n",
                                       multiparts, fullline);

                            parseEmailHeader(aMessage, fullline, mctx->rfc821Table, mctx->ctx, &heuristicFound);
                            free(fullline);
                            if (heuristicFound) {
                                rc = VIRUS;
                            }
                        } else if (boundaryEnd(line, boundary)) {
                            /*
                             * Some viruses put information
                             * *after* the end of message,
                             * which presumably some broken
                             * mail clients find, so we
                             * can't assume that this
                             * is the end of the message
                             */
                            /* t_line = NULL;*/
                            break;
                        } else if (boundaryStart(line, boundary)) {
                            inhead = 1;
                            break;
                        } else {
                            if (messageAddLine(aMessage, t_line->t_line) < 0)
                                break;
                            lines++;
                        }
                    } while ((t_line = t_line->t_next) != NULL);

                    cli_dbgmsg("Part %d has %d lines, rc = %d\n",
                               multiparts, lines, (int)rc);

                    /*
                     * Only save in the array of messages if some
                     * decision will be taken on whether to scan.
                     * If all parts will be scanned then save to
                     * file straight away
                     */
                    switch (subtype) {
                        case MIXED:
                        case ALTERNATIVE:
                        case REPORT:
                        case DIGEST:
                        case APPLEDOUBLE:
                        case KNOWBOT:
                        case -1:
                            old_rc      = rc;
                            mainMessage = do_multipart(mainMessage,
                                                       messages, multiparts,
                                                       &rc, mctx, messageIn,
                                                       &aText, recursion_level);
                            if ((rc == OK_ATTACHMENTS_NOT_SAVED) && (old_rc == OK))
                                rc = OK;
                            if (messages[multiparts]) {
                                messageDestroy(messages[multiparts]);
                                messages[multiparts] = NULL;
                            }
                            --multiparts;
                            if (rc == VIRUS)
                                infected = true;
                            break;

                        case RELATED:
                        case ENCRYPTED:
                        case SIGNED:
                        case PARALLEL:
                            /* all the subtypes that we handle
                             * (all from the switch(tableFind...) below)
                             * must be listed here */
                            break;
                        default:
                            /* this is a subtype that we
                             * don't handle anyway,
                             * don't store */
                            if (messages[multiparts]) {
                                messageDestroy(messages[multiparts]);
                                messages[multiparts] = NULL;
                            }
                            --multiparts;
                    }
                }

                free((char *)boundary);

                if (haveTooManyMIMEPartsPerMessage(multiparts, mctx->ctx, &rc)) {
                    if (messages) {
                        for (i = 0; i < multiparts; i++) {
                            if (messages[i])
                                messageDestroy(messages[i]);
                        }
                        free(messages);
                        messages = NULL;
                    }
                    break;
                }

                /*
                 * Preprocess. Anything special to be done before
                 * we handle the multiparts?
                 */
                switch (subtype) {
                    case KNOWBOT:
                        /* TODO */
                        cli_dbgmsg("multipart/knowbot parsed as multipart/mixed for now\n");
                        mimeSubtype = "mixed";
                        break;
                    case -1:
                        /*
                         * According to section 7.2.6 of
                         * RFC1521, unrecognized multiparts
                         * should be treated as multipart/mixed.
                         */
                        cli_dbgmsg("Unsupported multipart format `%s', parsed as mixed\n", mimeSubtype);
                        mimeSubtype = "mixed";
                        break;
                }

                /*
                 * We've finished with the message we're parsing
                 */
                if (mainMessage && (mainMessage != messageIn)) {
                    messageDestroy(mainMessage);
                    mainMessage = NULL;
                }

                cli_dbgmsg("The message has %d parts\n", multiparts);

                if (infected || ((multiparts == 0) && (aText == NULL))) {
                    if (messages) {
                        for (i = 0; i < multiparts; i++) {
                            if (messages[i])
                                messageDestroy(messages[i]);
                        }
                        free(messages);
                        messages = NULL;
                    }
                    if (aText && (textIn == NULL))
                        textDestroy(aText);

#if HAVE_JSON
                    mctx->wrkobj = saveobj;
#endif
                    /*
                     * Nothing to do
                     */
                    switch (rc) {
                        case VIRUS:
                            return VIRUS;
                        case MAXREC:
                            return MAXREC;
                        default:
                            return OK_ATTACHMENTS_NOT_SAVED;
                    }
                }

                cli_dbgmsg("Find out the multipart type (%s)\n", mimeSubtype);

                /*
                 * We now have all the parts of the multipart message
                 * in the messages array:
                 *    message *messages[multiparts]
                 * Let's decide what to do with them all
                 */
                switch (tableFind(mctx->subtypeTable, mimeSubtype)) {
                    case RELATED:
                        cli_dbgmsg("Multipart related handler\n");
                        /*
                         * Have a look to see if there's HTML code
                         * which will need scanning
                         */

                        // It's okay if multiparts == 0

                        htmltextPart = getTextPart(messages, multiparts);

                        if (htmltextPart >= 0 && messages) {
                            if (messageGetBody(messages[htmltextPart])) {

                                aText = textAddMessage(aText, messages[htmltextPart]);
                            }
                        } else {
                            /*
                             * There isn't an HTML bit. If there's a
                             * multipart bit, it may be in there
                             * somewhere
                             */
                            for (i = 0; i < multiparts; i++) {
                                if (messageGetMimeType(messages[i]) == MULTIPART) {
                                    htmltextPart = i;
                                    break;
                                }
                            }
                        }

                        if (htmltextPart == -1) {
                            cli_dbgmsg("No HTML code found to be scanned\n");
                        } else {
#if HAVE_JSON
                            /* Send root HTML file for preclassification */
                            if (mctx->ctx->wrkproperty)
                                (void)parseRootMHTML(mctx, messages[htmltextPart], aText);
#endif
                            rc = parseEmailBody(messages[htmltextPart], aText, mctx, recursion_level + 1);
                            if ((rc == OK) && messages[htmltextPart]) {
                                messageDestroy(messages[htmltextPart]);
                                messages[htmltextPart] = NULL;
                            } else if (rc == VIRUS) {
                                infected = true;
                                break;
                            }
                        }

                        /*
                         * The message is confused about the difference
                         * between alternative and related. Badtrans.B
                         * suffers from this problem.
                         *
                         * Fall through in this case:
                         * Content-Type: multipart/related;
                         *    type="multipart/alternative"
                         */
                        /* fall through */
                    case DIGEST:
                        /*
                         * According to section 5.1.5 RFC2046, the
                         * default mime type of multipart/digest parts
                         * is message/rfc822
                         *
                         * We consider them as alternative, wrong in
                         * the strictest sense since they aren't
                         * alternatives - all parts are valid - but it's
                         * OK for our needs since it means each part
                         * will be scanned
                         */
                    case ALTERNATIVE:
                        cli_dbgmsg("Multipart alternative handler\n");

                        /*
                         * Fall through - some clients are broken and
                         * say alternative instead of mixed. The Klez
                         * virus is broken that way, and anyway we
                         * wish to scan all of the alternatives
                         */
                        /* fall through */
                    case REPORT:
                        /*
                         * According to section 1 of RFC1892, the
                         * syntax of multipart/report is the same
                         * as multipart/mixed. There are some required
                         * parameters, but there's no need for us to
                         * verify that they exist
                         */
                    case ENCRYPTED:
                        /* MUAs without encryption plugins can display as multipart/mixed,
                         * just scan it*/
                    case MIXED:
                    case APPLEDOUBLE: /* not really supported */
                        /*
                         * Look for attachments
                         *
                         * Not all formats are supported. If an
                         * unsupported format turns out to be
                         * common enough to implement, it is a simple
                         * matter to add it
                         */
                        if (aText) {
                            if (mainMessage && (mainMessage != messageIn))
                                messageDestroy(mainMessage);
                            mainMessage = NULL;
                        }

                        cli_dbgmsg("Mixed message with %d parts\n", multiparts);
                        for (i = 0; i < multiparts; i++) {
                            mainMessage = do_multipart(mainMessage,
                                                       messages, i, &rc, mctx,
                                                       messageIn, &aText, recursion_level + 1);
                            if (rc == VIRUS) {
                                infected = true;
                                break;
                            }
                            if (rc == MAXREC)
                                break;
                            if (rc == OK_ATTACHMENTS_NOT_SAVED)
                                rc = OK;
                        }

                        /* rc = parseEmailBody(NULL, NULL, mctx, recursion_level + 1); */
                        break;
                    case SIGNED:
                    case PARALLEL:
                        /*
                         * If we're here it could be because we have a
                         * multipart/mixed message, consisting of a
                         * message followed by an attachment. That
                         * message itself is a multipart/alternative
                         * message and we need to dig out the plain
                         * text part of that alternative
                         */
                        if (messages) {
                            htmltextPart = getTextPart(messages, multiparts);
                            if (htmltextPart == -1)
                                htmltextPart = 0;
                            rc = parseEmailBody(messages[htmltextPart], aText, mctx, recursion_level + 1);
                        }
                        break;
                    default:
                        cli_dbgmsg("Unepxected mime sub type\n");
                        rc = CL_EFORMAT;
                        break;
                }

                if (mainMessage && (mainMessage != messageIn))
                    messageDestroy(mainMessage);

                if (aText && (textIn == NULL)) {
                    if ((!infected) && (fb = fileblobCreate()) != NULL) {
                        cli_dbgmsg("Save non mime and/or text/plain part\n");
                        fileblobSetFilename(fb, mctx->dir, "textpart");
                        /*fileblobAddData(fb, "Received: by clamd (textpart)\n", 30);*/
                        fileblobSetCTX(fb, mctx->ctx);
                        (void)textToFileblob(aText, fb, 1);

                        fileblobDestroy(fb);
                        mctx->files++;
                    }
                    textDestroy(aText);
                }

                if (messages) {
                    for (i = 0; i < multiparts; i++) {
                        if (messages[i])
                            messageDestroy(messages[i]);
                    }
                    free(messages);
                    messages = NULL;
                }

#if HAVE_JSON
                mctx->wrkobj = saveobj;
#endif
                return rc;

            case MESSAGE:
                /*
                 * Check for forbidden encodings
                 */
                switch (messageGetEncoding(mainMessage)) {
                    case NOENCODING:
                    case EIGHTBIT:
                    case BINARY:
                        break;
                    default:
                        cli_dbgmsg("MIME type 'message' cannot be decoded\n");
                        break;
                }
                rc = FAIL;
                if ((strcasecmp(mimeSubtype, "rfc822") == 0) ||
                    (strcasecmp(mimeSubtype, "delivery-status") == 0)) {
                    message *m = parseEmailHeaders(mainMessage, mctx->rfc821Table, &heuristicFound);
                    if (m) {
                        cli_dbgmsg("Decode rfc822\n");

                        messageSetCTX(m, mctx->ctx);

                        if (mainMessage && (mainMessage != messageIn)) {
                            messageDestroy(mainMessage);
                            mainMessage = NULL;
                        } else
                            messageReset(mainMessage);
                        if (messageGetBody(m))
                            rc = parseEmailBody(m, NULL, mctx, recursion_level + 1);

                        messageDestroy(m);
                    } else if (heuristicFound) {
                        rc = VIRUS;
                    }
                    break;
                } else if (strcasecmp(mimeSubtype, "disposition-notification") == 0) {
                    /* RFC 2298 - handle like a normal email */
                    rc = OK;
                    break;
                } else if (strcasecmp(mimeSubtype, "partial") == 0) {
                    if (mctx->ctx->options->mail & CL_SCAN_MAIL_PARTIAL_MESSAGE) {
                        /* RFC1341 message split over many emails */
                        if (rfc1341(mctx, mainMessage) >= 0)
                            rc = OK;
                    } else {
                        cli_warnmsg("Partial message received from MUA/MTA - message cannot be scanned\n");
                    }
                } else if (strcasecmp(mimeSubtype, "external-body") == 0)
                    /* TODO */
                    cli_warnmsg("Attempt to send Content-type message/external-body trapped\n");
                else
                    cli_warnmsg("Unsupported message format `%s' - if you believe this file contains a virus, submit it to www.clamav.net\n", mimeSubtype);

                if (mainMessage && (mainMessage != messageIn))
                    messageDestroy(mainMessage);

                if (messages) {
                    for (i = 0; i < multiparts; i++) {
                        if (messages[i])
                            messageDestroy(messages[i]);
                    }
                    free(messages);
                    messages = NULL;
                }
#if HAVE_JSON
                mctx->wrkobj = saveobj;
#endif
                return rc;

            default:
                cli_dbgmsg("Message received with unknown mime encoding - assume application\n");
                /*
                 * Some Yahoo emails attach as
                 * Content-Type: X-unknown/unknown;
                 * instead of
                 * Content-Type: application/unknown;
                 * so let's try our best to salvage something
                 */
                /* fall through */
            case APPLICATION:
                /*cptr = messageGetMimeSubtype(mainMessage);

                if((strcasecmp(cptr, "octet-stream") == 0) ||
                   (strcasecmp(cptr, "x-msdownload") == 0)) {*/
                {
                    fb = messageToFileblob(mainMessage, mctx->dir, 1);

                    if (fb) {
                        cli_dbgmsg("Saving main message as attachment\n");
                        if (fileblobScanAndDestroy(fb) == CL_VIRUS)
                            rc = VIRUS;
                        mctx->files++;
                        if (mainMessage != messageIn) {
                            messageDestroy(mainMessage);
                            mainMessage = NULL;
                        } else
                            messageReset(mainMessage);
                    }
                } /*else
                cli_warnmsg("Discarded application not sent as attachment\n");*/
                break;

            case AUDIO:
            case VIDEO:
            case IMAGE:
                break;
        }

        if (messages) {
            /* "can't happen" */
            cli_warnmsg("messages != NULL\n");
            for (i = 0; i < multiparts; i++) {
                if (messages[i])
                    messageDestroy(messages[i]);
            }
            free(messages);
            messages = NULL;
        }
    }

    if (aText && (textIn == NULL)) {
        /* Look for a bounce in the text (non mime encoded) portion */
        const text *t;
        /* isBounceStart() is expensive, reduce the number of calls */
        bool lookahead_definately_is_bounce = false;

        for (t = aText; t && (rc != VIRUS); t = t->t_next) {
            const line_t *l = t->t_line;
            const text *lookahead, *topofbounce;
            const char *s;
            bool inheader;

            if (l == NULL) {
                continue;
            }

            if (lookahead_definately_is_bounce)
                lookahead_definately_is_bounce = false;
            else if (!isBounceStart(mctx, lineGetData(l)))
                continue;

            lookahead = t->t_next;
            if (lookahead) {
                if (isBounceStart(mctx, lineGetData(lookahead->t_line))) {
                    lookahead_definately_is_bounce = true;
                    /* don't save worthless header lines */
                    continue;
                }
            } else /* don't save a single liner */
                break;

            /*
             * We've found what looks like the start of a bounce
             * message. Only bother saving if it really is a bounce
             * message, this helps to speed up scanning of ping-pong
             * messages that have lots of bounces within bounces in
             * them
             */
            for (; lookahead; lookahead = lookahead->t_next) {
                l = lookahead->t_line;

                if (l == NULL)
                    break;
                s = lineGetData(l);
                if (strncasecmp(s, "Content-Type:", 13) == 0) {
                    /*
                     * Don't bother with text/plain or
                     * text/html
                     */
                    if (CLI_STRCASESTR(s, "text/plain") != NULL)
                        /*
                         * Don't bother to save the
                         * unuseful part, read past
                         * the headers then we'll go
                         * on to look for the next
                         * bounce message
                         */
                        continue;
                    if ((!doPhishingScan) &&
                        (CLI_STRCASESTR(s, "text/html") != NULL))
                        continue;
                    break;
                }
            }

            if (lookahead && (lookahead->t_line == NULL)) {
                cli_dbgmsg("Non mime part bounce message is not mime encoded, so it will not be scanned\n");
                t = lookahead;
                /* look for next bounce message */
                continue;
            }

            /*
             * Prescan the bounce message to see if there's likely
             * to be anything nasty.
             * This algorithm is hand crafted and may be breakable
             * so all submissions are welcome. It's best NOT to
             * remove this however you may be tempted, because it
             * significantly speeds up the scanning of multiple
             * bounces (i.e. bounces within many bounces)
             */
            for (; lookahead; lookahead = lookahead->t_next) {
                l = lookahead->t_line;

                if (l) {
                    s = lineGetData(l);
                    if ((strncasecmp(s, "Content-Type:", 13) == 0) &&
                        (strstr(s, "multipart/") == NULL) &&
                        (strstr(s, "message/rfc822") == NULL) &&
                        (strstr(s, "text/plain") == NULL))
                        break;
                }
            }
            if (lookahead == NULL) {
                cli_dbgmsg("cli_mbox: I believe it's plain text which must be clean\n");
                /* nothing here, move along please */
                break;
            }
            if ((fb = fileblobCreate()) == NULL)
                break;
            cli_dbgmsg("Save non mime part bounce message\n");
            fileblobSetFilename(fb, mctx->dir, "bounce");
            fileblobAddData(fb, (const unsigned char *)"Received: by clamd (bounce)\n", 28);
            fileblobSetCTX(fb, mctx->ctx);

            inheader    = true;
            topofbounce = NULL;
            do {
                l = t->t_line;

                if (l == NULL) {
                    if (inheader) {
                        inheader    = false;
                        topofbounce = t;
                    }
                } else {
                    s = lineGetData(l);
                    fileblobAddData(fb, (const unsigned char *)s, strlen(s));
                }
                fileblobAddData(fb, (const unsigned char *)"\n", 1);
                lookahead = t->t_next;
                if (lookahead == NULL)
                    break;
                t = lookahead;
                l = t->t_line;
                if ((!inheader) && l) {
                    s = lineGetData(l);
                    if (isBounceStart(mctx, s)) {
                        cli_dbgmsg("Found the start of another bounce candidate (%s)\n", s);
                        lookahead_definately_is_bounce = true;
                        break;
                    }
                }
            } while (!fileblobInfected(fb));

            if (fileblobScanAndDestroy(fb) == CL_VIRUS)
                rc = VIRUS;
            mctx->files++;

            if (topofbounce)
                t = topofbounce;
        }
        textDestroy(aText);
        aText = NULL;
    }

    /*
     * No attachments - scan the text portions, often files
     * are hidden in HTML code
     */
    if (mainMessage && (rc != VIRUS)) {
        text *t_line;

        /*
         * Look for uu-encoded main file
         */
        if (mainMessage->body_first != NULL &&
            (encodingLine(mainMessage) != NULL) &&
            ((t_line = bounceBegin(mainMessage)) != NULL))
            rc = (exportBounceMessage(mctx, t_line) == CL_VIRUS) ? VIRUS : OK;
        else {
            bool saveIt;

            if (messageGetMimeType(mainMessage) == MESSAGE)
                /*
                 * Quick peek, if the encapsulated
                 * message has no
                 * content encoding statement don't
                 * bother saving to scan, it's safe
                 */
                saveIt = (bool)(encodingLine(mainMessage) != NULL);
            else if (mainMessage->body_last != NULL && (t_line = encodingLine(mainMessage)) != NULL) {
                /*
                 * Some bounces include the message
                 * body without the headers.
                 * FIXME: Unfortunately this generates a
                 * lot of false positives that a bounce
                 * has been found when it hasn't.
                 */
                if ((fb = fileblobCreate()) != NULL) {
                    cli_dbgmsg("Found a bounce message with no header at '%s'\n",
                               lineGetData(t_line->t_line));
                    fileblobSetFilename(fb, mctx->dir, "bounce");
                    fileblobAddData(fb,
                                    (const unsigned char *)"Received: by clamd (bounce)\n",
                                    28);

                    fileblobSetCTX(fb, mctx->ctx);
                    if (fileblobScanAndDestroy(textToFileblob(t_line, fb, 1)) == CL_VIRUS)
                        rc = VIRUS;
                    mctx->files++;
                }
                saveIt = false;
            } else
                /*
                 * Save the entire text portion,
                 * since it it may be an HTML file with
                 * a JavaScript virus or a phish
                 */
                saveIt = true;

            if (saveIt) {
                cli_dbgmsg("Saving text part to scan, rc = %d\n",
                           (int)rc);
                if (saveTextPart(mctx, mainMessage, 1) == CL_VIRUS)
                    rc = VIRUS;

                if (mainMessage != messageIn) {
                    messageDestroy(mainMessage);
                    mainMessage = NULL;
                } else
                    messageReset(mainMessage);
            }
        }
    } /*else
        rc = OK_ATTACHMENTS_NOT_SAVED; */
      /* nothing saved */

    if (mainMessage && (mainMessage != messageIn))
        messageDestroy(mainMessage);

    if ((rc != FAIL) && infected)
        rc = VIRUS;

#if HAVE_JSON
    mctx->wrkobj = saveobj;
#endif

    cli_dbgmsg("parseEmailBody() returning %d\n", (int)rc);

    return rc;
}

/*
 * Is the current line the start of a new section?
 *
 * New sections start with --boundary
 *
 * The test is made on a private copy of the line that has had its trailing
 * spaces removed and cli_chomp() applied. If that copy cannot be allocated
 * the caller's line is examined as it stands.
 *
 * Returns 1 if the line is taken to be a boundary start, 0 otherwise
 * (including when line is NULL or empty, or when boundary is NULL)
 */
static int
boundaryStart(const char *line, const char *boundary)
{
    const char *ptr;
    char *out = NULL; /* buffer allocated by rfc822comments(), if any */
    int rc    = 0;
    char buf[RFC2821LENGTH + 1];
    char *newline;

    if (line == NULL || *line == '\0')
        return 0; /* empty line */
    if (boundary == NULL)
        return 0;

    newline = strdup(line);
    if (newline == NULL) {
        /* No memory for a copy: read the caller's line, never write to it */
        newline = (char *)line;
    } else {
        size_t len = strlen(newline);

        /* Trim trailing spaces */
        while ((len > 0) && (newline[len - 1] == ' '))
            newline[--len] = '\0';

        cli_chomp(newline);
    }

    /* cli_dbgmsg("boundaryStart: line = '%s' boundary = '%s'\n", line, boundary); */

    /* Only lines starting with '-' or with an RFC822 comment are candidates */
    if ((*newline != '-') && (*newline != '('))
        goto done;

    if (strchr(newline, '-') == NULL)
        goto done;

    /*
     * Remove any RFC822 comments. rfc822comments() may write as many as
     * strlen(newline) + 1 bytes, so the stack buffer can only be used when
     * the line is strictly shorter than it; longer lines get a heap buffer
     */
    if (strlen(newline) < sizeof(buf))
        ptr = rfc822comments(newline, buf);
    else
        ptr = out = rfc822comments(newline, NULL);

    if (ptr == NULL)
        ptr = newline; /* nothing was stripped */

    if ((*ptr++ != '-') || (*ptr == '\0'))
        goto done;

    /*
     * Gibe.B3 is broken, it has:
     *    boundary="---- =_NextPart_000_01C31177.9DC7C000"
     * but its boundaries look like
     *    ------ =_NextPart_000_01C31177.9DC7C000
     * notice the one too few '-'.
     * Presumably this is a deliberate exploitation of a bug in some mail
     * clients.
     *
     * The trouble is that this creates a lot of false positives for
     * boundary conditions, if we're too lax about matches. We do our level
     * best to avoid these false positives. For example if we have
     * boundary="1" we want to ensure that we don't break out of every line
     * that has -1 in it instead of starting --1. This needs some more work.
     *
     * Look with and without RFC822 comments stripped, I've seen some
     * samples where () are taken as comments in boundaries and some where
     * they're not. Irrespective of whatever RFC2822 says, we need to find
     * viruses in both types of mails.
     */
    if ((strstr(&ptr[1], boundary) != NULL) || (strstr(newline, boundary) != NULL)) {
        const char *k = ptr;

        /*
         * We need to ensure that we don't match --11=-=-=11 when
         * looking for --1=-=-=1 in well behaved headers, that's a
         * false positive problem mentioned above
         */
        do
            if (strcmp(++k, boundary) == 0) {
                rc = 1;
                break;
            }
        while (*k == '-');

        if (rc == 0) {
            /*
             * Try again without the comment stripping.
             * NOTE(review): this walks the caller's untrimmed line rather
             * than the trimmed copy - confirm that is intended
             */
            k = &line[1];
            do
                if (strcmp(++k, boundary) == 0) {
                    rc = 1;
                    break;
                }
            while (*k == '-');
        }
    } else if (*ptr++ != '-')
        rc = 0;
    else
        rc = (strcasecmp(ptr, boundary) == 0);

    if (rc == 1)
        cli_dbgmsg("boundaryStart: found %s in %s\n", boundary, line);

done:
    free(out);
    if (newline != line)
        free(newline);

    return rc;
}

/*
 * Is the current line the end?
 *
 * The message ends with --boundary--
 *
 * The test is made on a private copy of the line with its trailing spaces
 * removed; if that copy cannot be allocated the caller's line is examined as
 * it stands. The boundary is compared without regard to case, and anything
 * following the closing "--" is ignored.
 *
 * Returns 1 if the line is the closing boundary, 0 otherwise (including when
 * line is NULL or empty, or when boundary is NULL)
 */
static int
boundaryEnd(const char *line, const char *boundary)
{
    size_t len;
    char *newline;
    const char *p;
    int rc = 0;

    if (line == NULL || *line == '\0')
        return 0;
    if (boundary == NULL)
        return 0; /* same rule as boundaryStart() */

    newline = strdup(line);
    if (newline == NULL) {
        /* No memory for a copy: read the caller's line, never write to it */
        newline = (char *)line;
    } else {
        size_t n = strlen(newline);

        /* Trim trailing spaces */
        while ((n > 0) && (newline[n - 1] == ' '))
            newline[--n] = '\0';
    }
    p = newline;

    /* cli_dbgmsg("boundaryEnd: line = '%s' boundary = '%s'\n", newline, boundary); */

    /* Leading "--" */
    if (*p++ != '-')
        goto done;
    if (*p++ != '-')
        goto done;

    len = strlen(boundary);
    if (strncasecmp(p, boundary, len) != 0)
        goto done;

    /*
     * Use < rather than == because some broken mails have white
     * space after the boundary
     */
    if (strlen(p) < (len + 2))
        goto done;

    /* Trailing "--" straight after the boundary text */
    p += len;
    if ((p[0] == '-') && (p[1] == '-')) {
        /* cli_dbgmsg("boundaryEnd: found %s in %s\n", boundary, p); */
        rc = 1;
    }

done:
    if (newline != line)
        free(newline);

    return rc;
}

/*
 * Build the header and MIME subtype lookup tables.
 *
 * A table is only built when the pointer handed in is still NULL, so calling
 * this again with tables that already exist does nothing.
 *
 * Returns 0 on success, -1 on failure. If the RFC821 table cannot be filled
 * it is destroyed and its pointer reset to NULL. If the subtype table cannot
 * be created or filled, BOTH tables are destroyed and their pointers reset,
 * whether or not the RFC821 table already existed on entry.
 */
static int
initialiseTables(table_t **rfc821Table, table_t **subtypeTable)
{
    const struct tableinit *entry;

    /* Header names, taken from rfc821headers */
    if (*rfc821Table == NULL) {
        if ((*rfc821Table = tableCreate()) == NULL)
            return -1;

        entry = rfc821headers;
        while (entry->key) {
            if (tableInsert(*rfc821Table, entry->key, entry->value) < 0) {
                tableDestroy(*rfc821Table);
                *rfc821Table = NULL;
                return -1;
            }
            entry++;
        }
    }

    /* MIME subtypes, taken from mimeSubtypes */
    if (*subtypeTable == NULL) {
        if ((*subtypeTable = tableCreate()) == NULL) {
            tableDestroy(*rfc821Table);
            *rfc821Table = NULL;
            return -1;
        }

        entry = mimeSubtypes;
        while (entry->key) {
            if (tableInsert(*subtypeTable, entry->key, entry->value) < 0) {
                tableDestroy(*rfc821Table);
                tableDestroy(*subtypeTable);
                *rfc821Table  = NULL;
                *subtypeTable = NULL;
                return -1;
            }
            entry++;
        }
    }

    return 0;
}

/*
 * Choose which part of a multipart message to treat as the text.
 *
 * The first TEXT part whose subtype is "html" wins, since HTML text is the
 * most likely to include a scripting worm. If there is no HTML part, the
 * index of the last TEXT part seen is returned.
 *
 * NULL entries in messages[] are skipped.
 *
 * If we can't find any TEXT part, return -1
 */
static int
getTextPart(message *const messages[], size_t size)
{
    int fallback = -1; /* most recent non-HTML text part */
    size_t idx   = 0;

    while (idx < size) {
        message *const part = messages[idx];

        if ((part != NULL) && (messageGetMimeType(part) == TEXT)) {
            if (strcasecmp(messageGetMimeSubtype(part), "html") == 0)
                return (int)idx;
            fallback = (int)idx;
        }
        idx++;
    }

    return fallback;
}

/*
 * strip -
 *    Remove the trailing spaces from a buffer. Don't call this directly,
 * always call strstrip() which is a wrapper to this routine to be used with
 * NUL terminated strings. This code looks a bit strange because of its
 * heritage from code that worked on strings that weren't necessarily NUL
 * terminated.
 * TODO: rewrite for clamAV
 *
 * Returns its new length (a la strlen). Returns 0 if buf is NULL or len is
 * not positive. If len is greater than strlen(buf) + 1 nothing is stripped
 * and strlen(buf) is returned.
 *
 * What counts as strippable depends on the build: when UNIX, C_LINUX or
 * C_DARWIN is defined, trailing characters that are not isgraph() are
 * removed, stopping at a '\n' or '\r'; otherwise trailing NULs and isspace()
 * characters are removed.
 *
 * len must be int not size_t because of the >= 0 test, it is sizeof(buf)
 *    not strlen(buf)
 */
static size_t
strip(char *buf, int len)
{
    char *ptr;
    size_t i;

    if ((buf == NULL) || (len <= 0))
        return 0;

    i = strlen(buf);
    if (len > (int)(i + 1))
        return i;
    ptr = &buf[--len]; /* last byte of the buffer, normally the NUL */

#if defined(UNIX) || defined(C_LINUX) || defined(C_DARWIN) /* watch - it may be in shared text area */
    do
        if (*ptr)
            *ptr = '\0';
    /* the <ctype.h> functions need a value representable as unsigned char */
    while ((--len >= 0) && (!isgraph((unsigned char)*--ptr)) && (*ptr != '\n') && (*ptr != '\r'));
#else /* more characters can be displayed on DOS */
    do
#ifndef REAL_MODE_DOS
        if (*ptr) /* C8.0 puts into a text area */
#endif
            *ptr = '\0';
    while ((--len >= 0) && ((*--ptr == '\0') || isspace((int)(*ptr & 0xFF))));
#endif
    return ((size_t)(len + 1));
}

/*
 * strstrip:
 *    Strip a given NUL terminated string in place by handing it, together
 * with the size of the buffer it occupies (its length plus the NUL), to
 * strip().
 *
 * Returns whatever strip() returns, or 0 if s is NULL
 */
size_t
strstrip(char *s)
{
    if (s != NULL)
        return strip(s, strlen(s) + 1);

    return 0;
}

/*
 * Handle one MIME header, already split into its name (cmd) and its value
 * (arg), and record what it says in m.
 *
 * cmd is looked up in rfc821Table, after any RFC822 comments have been
 * removed from it. Only CONTENT_TYPE, CONTENT_TRANSFER_ENCODING and
 * CONTENT_DISPOSITION are acted upon; any other header is ignored. The value
 * is also used with its RFC822 comments removed, when it has any.
 *
 * arg may be NULL, meaning that the header had no value.
 *
 * *heuristicFound is reset to false on entry and is handed to
 * haveTooManyMIMEArguments() while the Content-Type arguments are added.
 *
 * Returns 0 for OK, -1 for error (a memory allocation failed)
 */
static int
parseMimeHeader(message *m, const char *cmd, const table_t *rfc821Table, const char *arg, cli_ctx *ctx, bool *heuristicFound)
{
    char *copy, *p, *buf;
    const char *ptr;
    int commandNumber;
    size_t argCnt = 0;

    *heuristicFound = false;

    /* A NULL must never be handed to %s */
    cli_dbgmsg("parseMimeHeader: cmd='%s', arg='%s'\n", cmd, arg ? arg : "(null)");

    copy = rfc822comments(cmd, NULL);
    if (copy) {
        commandNumber = tableFind(rfc821Table, copy);
        free(copy);
    } else {
        commandNumber = tableFind(rfc821Table, cmd);
    }

    /* ptr is the value to parse: comment free if need be, NULL if arg is NULL */
    copy = (arg != NULL) ? rfc822comments(arg, NULL) : NULL;
    ptr  = (copy != NULL) ? copy : arg;

    buf = NULL;

    switch (commandNumber) {
        case CONTENT_TYPE:
            /*
             * Fix for non RFC1521 compliant mailers
             * that send content-type: Text instead
             * of content-type: Text/Plain, or
             * just simply "Content-Type:"
             */
            if (arg == NULL)
                /*
                 * According to section 4 of RFC1521:
                 * "Note also that a subtype specification is
                 * MANDATORY. There are no default subtypes"
                 *
                 * We have to break this and make an assumption
                 * for the subtype because virus writers and
                 * email client writers don't get it right
                 */
                cli_dbgmsg("Empty content-type received, no subtype specified, assuming text/plain; charset=us-ascii\n");
            else if (strchr(ptr, '/') == NULL)
                /*
                 * Empty field, such as
                 *    Content-Type:
                 * which I believe is illegal according to
                 * RFC1521
                 */
                cli_dbgmsg("Invalid content-type '%s' received, no subtype specified, assuming text/plain; charset=us-ascii\n", ptr);
            else {
                int i;

                buf = cli_malloc(strlen(ptr) + 1);
                if (buf == NULL) {
                    cli_errmsg("parseMimeHeader: Unable to allocate memory for buf %llu\n", (long long unsigned)(strlen(ptr) + 1));
                    free(copy);
                    return -1;
                }
                /*
                 * Some clients are broken and
                 * put white space after the ;
                 */
                if (*arg == '/') {
                    cli_dbgmsg("Content-type '/' received, assuming application/octet-stream\n");
                    messageSetMimeType(m, "application");
                    messageSetMimeSubtype(m, "octet-stream");
                } else {
                    /*
                     * The content type could be in quotes:
                     *    Content-Type: "multipart/mixed"
                     * FIXME: this is a hack in that ignores
                     *    the quotes, it doesn't handle
                     *    them properly
                     */
                    while (isspace((const unsigned char)*ptr))
                        ptr++;
                    if (ptr[0] == '\"')
                        ptr++;

                    if (ptr[0] != '/') {
                        char *s;
#ifdef CL_THREAD_SAFE
                        char *strptr = NULL;
#endif

                        /* buf receives everything up to the first ';' */
                        s = cli_strtokbuf(ptr, 0, ";", buf);
                        /*
                         * Handle
                         * Content-Type: foo/bar multipart/mixed
                         * and
                         * Content-Type: multipart/mixed foo/bar
                         */
                        if (s && *s) {
                            /* scratch space for splitting a subtype at a space */
                            char *buf2 = cli_strdup(buf);

                            if (buf2 == NULL) {
                                free(copy);
                                free(buf);
                                return -1;
                            }
                            for (;;) {
#ifdef CL_THREAD_SAFE
                                int set = messageSetMimeType(m, strtok_r(s, "/", &strptr));

                                s = strtok_r(NULL, ";", &strptr);
#else
                                int set = messageSetMimeType(m, strtok(s, "/"));

                                s = strtok(NULL, ";");
#endif
                                if (s == NULL)
                                    break;
                                if (set) {
                                    /*
                                     * len is the length of the subtype
                                     * once stripped. It can be 0, so it
                                     * must be tested before it is used to
                                     * find the last character
                                     */
                                    size_t len = strstrip(s);

                                    if ((len > 0) && (s[len - 1] == '\"')) {
                                        /* drop the closing quote */
                                        s[len - 1] = '\0';
                                        len        = strstrip(s);
                                    }
                                    if (len) {
                                        if (strchr(s, ' '))
                                            messageSetMimeSubtype(m,
                                                                  cli_strtokbuf(s, 0, " ", buf2));
                                        else
                                            messageSetMimeSubtype(m, s);
                                    }
                                }

                                /* Move on to whatever follows the next white space */
                                while (*s && !isspace((unsigned char)*s))
                                    s++;
                                if (*s++ == '\0')
                                    break;
                                if (*s == '\0')
                                    break;
                            }
                            free(buf2);
                        }
                    }
                }

                /*
                 * Add in all the rest of the arguments.
                 * e.g. if the header is this:
                 * Content-Type:', arg='multipart/mixed; boundary=foo
                 * we find the boundary argument set it
                 */
                i = 1;
                while (cli_strtokbuf(ptr, i++, ";", buf) != NULL) {
                    cli_dbgmsg("mimeArgs = '%s'\n", buf);

                    argCnt++;
                    if (haveTooManyMIMEArguments(argCnt, ctx, heuristicFound)) {
                        break;
                    }
                    messageAddArguments(m, buf);
                }
            }
            break;
        case CONTENT_TRANSFER_ENCODING:
            /* NOTE(review): ptr is NULL when arg is NULL - confirm that messageSetEncoding() copes */
            messageSetEncoding(m, ptr);
            break;
        case CONTENT_DISPOSITION:
            /* With no value there is nothing to parse */
            if (ptr != NULL) {
                buf = cli_malloc(strlen(ptr) + 1);
                if (buf == NULL) {
                    cli_errmsg("parseMimeHeader: Unable to allocate memory for buf %llu\n", (long long unsigned)(strlen(ptr) + 1));
                    free(copy);
                    return -1;
                }
                p = cli_strtokbuf(ptr, 0, ";", buf);
                if (p && *p) {
                    messageSetDispositionType(m, p);
                    messageAddArgument(m, cli_strtokbuf(ptr, 1, ";", buf));
                }
            }
            if (!messageHasFilename(m))
                /*
                 * Handle this type of header, without
                 * a filename (e.g. some Worm.Torvil.D)
                 *    Content-ID: <nRfkHdrKsAxRU>
                 * Content-Transfer-Encoding: base64
                 * Content-Disposition: attachment
                 */
                messageAddArgument(m, "filename=unknown");
            break;
        default:
            /* not a header that matters here */
            break;
    }

    free(copy);
    free(buf);

    return 0;
}

/*
 * Save the text portion of the message so that it can be scanned.
 *
 * The argument "filename=textportion" is added to m, then m is turned into a
 * fileblob in mctx->dir; destroy_text is passed on to messageToFileblob().
 *
 * Returns the result of fileblobScanAndDestroy(), counting one more file in
 * mctx->files, or CL_ETMPFILE if the fileblob could not be made
 */
static int
saveTextPart(mbox_ctx *mctx, message *m, int destroy_text)
{
    fileblob *blob;

    messageAddArgument(m, "filename=textportion");

    blob = messageToFileblob(m, mctx->dir, destroy_text);
    if (blob == NULL)
        return CL_ETMPFILE;

    /*
     * Save main part to scan that
     */
    cli_dbgmsg("Saving main message\n");

    mctx->files++;
    return fileblobScanAndDestroy(blob);
}

/*
 * Handle RFC822 comments in headers.
 *
 * Copies in to out with the comments, i.e. the text between ( and ),
 * removed. Comments may nest. Parentheses inside double quotes are not
 * comments and are kept, as are the quotes themselves. A backslash is dropped
 * and the character after it is copied as it is (unless it is inside a
 * comment); a lone backslash at the very end is kept. White space at the
 * start of in is skipped.
 *
 * If out == NULL, return a buffer without the comments, the caller must free
 *    the returned buffer
 * If out != NULL it must have room for strlen(in) + 1 bytes, and it is what
 *    is returned
 * Return NULL on error or if the input has no comments, in which case out is
 *    left untouched.
 * See section 3.4.3 of RFC822
 * TODO: handle comments that go on to more than one line
 */
static char *
rfc822comments(const char *in, char *out)
{
    const char *iptr;
    char *optr;
    int backslash, inquote, commentlevel;

    if (in == NULL || out == in) {
        cli_errmsg("rfc822comments: Invalid parameters.\n");
        return NULL;
    }

    /* Without a '(' there can be no comment, so there is nothing to do */
    if (strchr(in, '(') == NULL) {
        return NULL;
    }

    while (isspace((const unsigned char)*in)) {
        in++;
    }

    if (out == NULL) {
        out = cli_malloc(strlen(in) + 1);
        if (out == NULL) {
            cli_errmsg("rfc822comments: Unable to allocate memory for out %llu\n", (long long unsigned)(strlen(in) + 1));
            return NULL;
        }
    }

    backslash = commentlevel = inquote = 0;
    optr                               = out;

    cli_dbgmsg("rfc822comments: contains a comment\n");

    for (iptr = in; *iptr; iptr++)
        if (backslash) {
            /* the escaped character is taken literally */
            if (commentlevel == 0)
                *optr++ = *iptr;
            backslash = 0;
        } else
            switch (*iptr) {
                case '\\':
                    backslash = 1;
                    break;
                case '\"':
                    *optr++ = '\"';
                    inquote = !inquote;
                    break;
                case '(':
                    if (inquote)
                        *optr++ = '(';
                    else
                        commentlevel++;
                    break;
                case ')':
                    if (inquote)
                        *optr++ = ')';
                    else if (commentlevel > 0)
                        commentlevel--;
                    /* an unmatched ')' is dropped */
                    break;
                default:
                    if (commentlevel == 0)
                        *optr++ = *iptr;
                    break;
            }

    if (backslash) /* last character was a single backslash */
        *optr++ = '\\';
    *optr = '\0';

    /*strstrip(out);*/

    cli_dbgmsg("rfc822comments '%s'=>'%s'\n", in, out);

    return out;
}

/*
 * Handle RFC2047 encoding, i.e. header words of the form
 *    =?charset?encoding?encoded-text?=
 * where encoding is 'q' (quoted-printable) or 'b' (base64), in either case.
 * Text outside such words is copied through unchanged; the charset is skipped
 * and not interpreted.
 *
 * Returns a malloc'd buffer that the caller must free, or NULL on error
 * (allocation failure or an unsupported encoding letter). If a word is
 * malformed or cannot be decoded, decoding stops and whatever has been
 * produced so far is returned.
 */
static char *
rfc2047(const char *in)
{
    char *out, *pout;
    size_t len;

    /* Nothing that looks like an encoded word: hand back a plain copy */
    if ((strstr(in, "=?") == NULL) || (strstr(in, "?=") == NULL))
        return cli_strdup(in);

    cli_dbgmsg("rfc2047 '%s'\n", in);
    out = cli_malloc(strlen(in) + 1);

    if (out == NULL) {
        cli_errmsg("rfc2047: Unable to allocate memory for out %llu\n", (long long unsigned)(strlen(in) + 1));
        return NULL;
    }

    pout = out;

    /* For each RFC2047 string */
    while (*in) {
        char encoding, *ptr, *enctext;
        message *m;
        blob *b;

        /* Find next RFC2047 string, copying literal text as we go */
        while (*in) {
            if ((*in == '=') && (in[1] == '?')) {
                in += 2;
                break;
            }
            *pout++ = *in++;
        }
        /* Skip over charset, find encoding */
        while ((*in != '?') && *in)
            in++;
        if (*in == '\0')
            break;
        encoding = *++in;
        /*
         * tolower() is only defined for EOF and values representable as
         * unsigned char; header bytes may have the top bit set
         */
        encoding = (char)tolower((unsigned char)encoding);

        if ((encoding != 'q') && (encoding != 'b')) {
            cli_warnmsg("Unsupported RFC2047 encoding type '%c' - if you believe this file contains a virus, submit it to www.clamav.net\n", encoding);
            free(out);
            out = NULL;
            break;
        }
        /* Skip to encoded text */
        if (*++in != '?')
            break;
        if (*++in == '\0')
            break;

        /* Work on a private copy so that it can be cut at the "?=" */
        enctext = cli_strdup(in);
        if (enctext == NULL) {
            free(out);
            out = NULL;
            break;
        }
        in = strstr(in, "?=");
        if (in == NULL) {
            free(enctext);
            break;
        }
        in += 2;
        ptr = strstr(enctext, "?=");
        if (NULL == ptr) {
            free(enctext);
            break;
        }
        *ptr = '\0';
        /*cli_dbgmsg("Need to decode '%s' with method '%c'\n", enctext, encoding);*/

        /* Decode by wrapping the text in a temporary message object */
        m = messageCreate();
        if (m == NULL) {
            free(enctext);
            break;
        }
        messageAddStr(m, enctext);

        free(enctext);
        enctext = NULL;

        switch (encoding) {
            case 'q':
                messageSetEncoding(m, "quoted-printable");
                break;
            case 'b':
                messageSetEncoding(m, "base64");
                break;
        }
        b = messageToBlob(m, 1);
        if (b == NULL) {
            messageDestroy(m);
            break;
        }
        len = blobGetDataSize(b);
        cli_dbgmsg("Decoded as '%*.*s'\n", (int)len, (int)len,
                   (const char *)blobGetData(b));
        if (len > 0) {
            /* Don't hand memcpy a possibly NULL source for an empty blob */
            memcpy(pout, blobGetData(b), len);
            /* A trailing newline from the decoder is not kept */
            if (pout[len - 1] == '\n')
                len--;
        }
        blobDestroy(b);
        messageDestroy(m);
        pout += len;
    }
    if (out == NULL)
        return NULL;

    *pout = '\0';

    cli_dbgmsg("rfc2047 returns '%s'\n", out);
    return out;
}

/*
 * Handle partial messages
 *
 * The part held in m is stored via messageSavePartial() in a
 * "clamav-partial" directory below the engine's temporary directory (or
 * cli_gettmpdir() when there is no scan context), keyed by the MD5 of the
 * message's "id" argument and its "number" argument. When the part number
 * equals the "total" argument, parts 1..total are concatenated into a file
 * named after the sanitised id inside mctx->dir.
 *
 * Returns 0 on success, CL_EMEM if the id hash can't be converted to hex,
 * and -1 on any other failure (including a missing "id" or "number").
 */
static int
rfc1341(mbox_ctx *mctx, message *m)
{
    char *arg, *id, *number, *total, *oldfilename;
    const char *tmpdir = NULL;
    int n;
    char pdir[PATH_MAX + 1];
    unsigned char md5_val[16];
    char *md5_hex;

    if ((NULL == mctx) || (NULL == m)) {
        cli_dbgmsg("rfc1341: Invalid NULL arguments\n");
        return -1;
    }

    id = (char *)messageFindArgument(m, "id");
    if (id == NULL) {
        return -1;
    }

    if (NULL != mctx->ctx) {
        tmpdir = cl_engine_get_str((const struct cl_engine *)mctx->ctx->engine, CL_ENGINE_TMPDIR, NULL);
    }
    if (NULL == tmpdir) {
        tmpdir = cli_gettmpdir();
    }

    snprintf(pdir, sizeof(pdir) - 1, "%s" PATHSEP "clamav-partial", tmpdir);

    /*
     * errno is only meaningful when mkdir() has failed, so it is examined
     * solely inside the failure branch
     */
    if (mkdir(pdir, S_IRUSR | S_IWUSR) < 0) {
        STATBUF statb;

        if (errno != EEXIST) {
            cli_errmsg("Can't create the directory '%s'\n", pdir);
            free(id);
            return -1;
        }

        /* The directory was already there: warn if others can access it */
        if (CLAMSTAT(pdir, &statb) < 0) {
            char err[128];
            cli_errmsg("Partial directory %s: %s\n", pdir,
                       cli_strerror(errno, err, sizeof(err)));
            free(id);
            return -1;
        }
        if (statb.st_mode & 077)
            cli_warnmsg("Insecure partial directory %s (mode 0%o)\n",
                        pdir,
#ifdef ACCESSPERMS
                        (int)(statb.st_mode & ACCESSPERMS)
#else
                        (int)(statb.st_mode & 0777)
#endif
            );
    }

    number = (char *)messageFindArgument(m, "number");
    if (number == NULL) {
        free(id);
        return -1;
    }

    oldfilename = messageGetFilename(m);

    /* 10 == strlen("filename=") + 1 for the terminating NUL */
    arg = cli_malloc(10 + strlen(id) + strlen(number));
    if (arg) {
        sprintf(arg, "filename=%s%s", id, number);
        messageAddArgument(m, arg);
        free(arg);
    }

    if (oldfilename) {
        cli_dbgmsg("Must reset to %s\n", oldfilename);
        free(oldfilename);
    }

    n = atoi(number);
    cl_hash_data("md5", id, strlen(id), md5_val, NULL);
    md5_hex = cli_str2hex((const char *)md5_val, 16);

    if (!md5_hex) {
        free(id);
        free(number);
        return CL_EMEM;
    }

    if (messageSavePartial(m, pdir, md5_hex, n) < 0) {
        free(md5_hex);
        free(id);
        free(number);
        return -1;
    }

    total = (char *)messageFindArgument(m, "total");
    cli_dbgmsg("rfc1341: %s, %s of %s\n", id, number, (total) ? total : "?");
    if (total) {
        int t   = atoi(total);
        DIR *dd = NULL;

        free(total);
        /*
         * If it's the last one - reassemble it
         * FIXME: this assumes that we receive the parts in order
         */
        if ((n == t) && ((dd = opendir(pdir)) != NULL)) {
            FILE *fout;
            char outname[PATH_MAX + 1];
            time_t now;

            sanitiseName(id);

            snprintf(outname, sizeof(outname) - 1, "%s" PATHSEP "%s", mctx->dir, id);

            cli_dbgmsg("outname: %s\n", outname);

            fout = fopen(outname, "wb");
            if (fout == NULL) {
                cli_errmsg("Can't open '%s' for writing\n", outname);
                free(id);
                free(number);
                free(md5_hex);
                closedir(dd);
                return -1;
            }

            time(&now);
            /* For each part number, scan the directory for its file */
            for (n = 1; n <= t; n++) {
                char filename[NAME_MAX + 1];
                struct dirent *dent;

                /* n is >= 1 here, so the conversion for %u is value-preserving */
                snprintf(filename, sizeof(filename), "_%s-%u", md5_hex, (unsigned int)n);

                while ((dent = readdir(dd))) {
                    FILE *fin;
                    char buffer[BUFSIZ], fullname[PATH_MAX + 1 + 256 + 1];
                    int nblanks;
                    STATBUF statb;
                    const char *dentry_idpart;
                    int test_fd;

                    if (dent->d_ino == 0)
                        continue;

                    if (!strcmp(".", dent->d_name) ||
                        !strcmp("..", dent->d_name))
                        continue;
                    snprintf(fullname, sizeof(fullname) - 1,
                             "%s" PATHSEP "%s", pdir, dent->d_name);
                    /* Compare from the first '_' in the entry's name onwards */
                    dentry_idpart = strchr(dent->d_name, '_');

                    if (!dentry_idpart ||
                        strcmp(filename, dentry_idpart) != 0) {
                        /*
                         * Not the part we want. When temporary files
                         * are being kept, take the chance to remove
                         * entries more than a week old.
                         */
                        if (!m->ctx->engine->keeptmp)
                            continue;

                        if ((test_fd = open(fullname, O_RDONLY | O_BINARY)) < 0)
                            continue;

                        if (FSTAT(test_fd, &statb) < 0) {
                            close(test_fd);
                            continue;
                        }

                        if (now - statb.st_mtime > (time_t)(7 * 24 * 3600)) {
                            if (cli_unlink(fullname)) {
                                /* close before unlinking, as on the other error paths */
                                fclose(fout);
                                cli_unlink(outname);
                                free(md5_hex);
                                free(id);
                                free(number);
                                closedir(dd);
                                close(test_fd);
                                return -1;
                            }
                        }

                        close(test_fd);
                        continue;
                    }

                    fin = fopen(fullname, "rb");
                    if (fin == NULL) {
                        cli_errmsg("Can't open '%s' for reading\n", fullname);
                        fclose(fout);
                        cli_unlink(outname);
                        free(md5_hex);
                        free(id);
                        free(number);
                        closedir(dd);
                        return -1;
                    }
                    nblanks = 0;
                    while (fgets(buffer, sizeof(buffer) - 1, fin) != NULL)
                        /*
                         * Ensure that trailing newlines
                         * aren't copied
                         */
                        if (buffer[0] == '\n')
                            nblanks++;
                        else {
                            /* Emit the blank lines that were held back */
                            if (nblanks)
                                do {
                                    if (putc('\n', fout) == EOF) break;
                                } while (--nblanks > 0);
                            /* nblanks is still non-zero only if putc failed */
                            if (nblanks || fputs(buffer, fout) == EOF) {
                                fclose(fin);
                                fclose(fout);
                                cli_unlink(outname);
                                free(md5_hex);
                                free(id);
                                free(number);
                                closedir(dd);
                                return -1;
                            }
                        }
                    fclose(fin);

                    /* don't unlink if leave temps */
                    if (!m->ctx->engine->keeptmp) {
                        if (cli_unlink(fullname)) {
                            fclose(fout);
                            cli_unlink(outname);
                            free(md5_hex);
                            free(id);
                            free(number);
                            closedir(dd);
                            return -1;
                        }
                    }
                    break;
                }
                /* Start from the top of the directory for the next part */
                rewinddir(dd);
            }
            closedir(dd);
            fclose(fout);
        }
    }
    free(number);
    free(id);
    free(md5_hex);

    return 0;
}

/*
 * Release what a URL check used: the blob (when there is one) and the
 * contents of the hrefs argument list.
 */
static void
hrefs_done(blob *b, tag_arguments_t *hrefs)
{
    if (b != NULL) {
        blobDestroy(b);
    }

    html_tag_arg_free(hrefs);
}

/*
 * extract URLs from static text
 *
 * Scans mem[0..len) for "http:", "https:" or "ftp:" (scheme letters matched
 * case-insensitively) and adds each URL found to hrefs as an "href"
 * argument. A URL ends at the first space, newline or tab, at the end of the
 * data, or once it would no longer fit in the local buffer.
 */
static void extract_text_urls(const unsigned char *mem, size_t len, tag_arguments_t *hrefs)
{
    char url[1024];
    size_t off;
    /* off + 10 < len keeps the fixed-offset reads below inside the data */
    for (off = 0; off + 10 < len; off++) {
        /* check whether this is the start of a URL */
        int32_t proto = cli_readint32(mem + off);
        /* convert to lowercase */
        proto |= 0x20202020;
        /*
         * 'http:', 'https:', or 'ftp:' in little-endian.
         * For "https:" the 's' is at offset 4 and the ':' at offset 5.
         */
        if ((proto == 0x70747468 &&
             (mem[off + 4] == ':' || ((mem[off + 4] | 0x20) == 's' && mem[off + 5] == ':'))) ||
            proto == 0x3a707466) {
            size_t url_len;
            for (url_len = 4; off + url_len < len && url_len < (sizeof(url) - 1); url_len++) {
                unsigned char c = mem[off + url_len];
                /* smart compilers will compile this if into
                 * a single bt + jb instruction */
                if (c == ' ' || c == '\n' || c == '\t')
                    break;
            }
            memcpy(url, mem + off, url_len);
            url[url_len] = '\0';
            html_tag_arg_add(hrefs, "href", url);
            /* resume after the URL; the loop increment skips the terminator */
            off += url_len;
        }
    }
}

/*
 * Collect the hrefs found in message m into *hrefs.
 *
 * This used to be part of checkURLs, split out, because phishingScan needs it
 * too, and phishingScan might be used in situations where checkURLs is
 * disabled (see ifdef)
 *
 * Returns the blob holding the message data, which the caller must destroy,
 * or NULL if the message is empty, larger than 100KB, can't be converted to
 * a blob, or html_normalise_mem() fails.
 */
static blob *
getHrefs(message *m, tag_arguments_t *hrefs)
{
    blob *body = messageToBlob(m, 0);
    unsigned char *data;
    size_t size;

    if (body == NULL)
        return NULL;

    size = blobGetDataSize(body);

    if (size == 0) {
        blobDestroy(body);
        return NULL;
    }

    /* TODO: make this size customisable */
    if (size > 100 * 1024) {
        cli_dbgmsg("Viruses pointed to by URLs not scanned in large message\n");
        blobDestroy(body);
        return NULL;
    }

    /* Start from an empty argument list */
    hrefs->count    = 0;
    hrefs->value    = NULL;
    hrefs->tag      = NULL;
    hrefs->contents = NULL;

    cli_dbgmsg("getHrefs: calling html_normalise_mem\n");
    data = blobGetData(body);
    if (html_normalise_mem(data, (off_t)size, NULL, hrefs, m->ctx->dconf)) {
        cli_dbgmsg("getHrefs: html_normalise_mem returned\n");

        /* Nothing found by the HTML parser: look for URLs in the raw text */
        if ((hrefs->count == 0) && hrefs->scanContents) {
            extract_text_urls(data, size, hrefs);
        }

        /* TODO: Do we need to call remove_html_comments? */
        return body;
    }

    blobDestroy(body);
    return NULL;
}

/*
 * validate URLs for phishes
 * followurls: see if URLs point to malware
 *
 * Does nothing when *rc is already VIRUS or when phishing URL scanning is
 * not enabled. When phishingScan() reports CL_VIRUS the message is marked
 * infected and *rc is set to VIRUS. is_html is unused.
 */
static void
checkURLs(message *mainMessage, mbox_ctx *mctx, mbox_status *rc, int is_html)
{
    tag_arguments_t hrefs;
    blob *b;

    UNUSEDPARAM(is_html);

    if (*rc == VIRUS)
        return;

    hrefs.scanContents = mctx->ctx->engine->dboptions & CL_DB_PHISHING_URLS && (DCONF_PHISHING & PHISHING_CONF_ENGINE);

    /*
     * Don't waste time extracting hrefs (parsing html), nobody
     * will need it
     */
    if (!hrefs.scanContents)
        return;

    hrefs.count    = 0;
    hrefs.value    = NULL;
    hrefs.tag      = NULL;
    hrefs.contents = NULL;

    b = getHrefs(mainMessage, &hrefs);
    if ((b != NULL) && hrefs.scanContents &&
        (phishingScan(mctx->ctx, &hrefs) == CL_VIRUS)) {
        /*
         * FIXME: message objects' contents are
         *    encapsulated so we should not access
         *    the members directly
         */
        mainMessage->isInfected = true;
        *rc                     = VIRUS;
        cli_dbgmsg("PH:Phishing found\n");
    }

    hrefs_done(b, &hrefs);
}

#ifdef HAVE_BACKTRACE
/*
 * SIGSEGV handler: restore the default disposition, print a backtrace
 * (also to syslog), then exit with SIGSEGV as the status. sig is unused.
 */
static void
sigsegv(int sig)
{
    (void)sig;

    signal(SIGSEGV, SIG_DFL);
    print_trace(1);
    exit(SIGSEGV);
}

/*
 * Print a backtrace of up to 10 frames of the current process through
 * cli_errmsg() and, when use_syslog is non-zero, through syslog() as well.
 */
static void
print_trace(int use_syslog)
{
    void *array[10];
    int nframes;
    char **strings;
    int i;
    pid_t pid = getpid();

    cli_errmsg("Segmentation fault, attempting to print backtrace\n");

    /* backtrace() returns the number of addresses stored, as an int */
    nframes = backtrace(array, 10);
    strings = backtrace_symbols(array, nframes);

    cli_errmsg("Backtrace of pid %d:\n", (int)pid);
    if (use_syslog)
        syslog(LOG_ERR, "Backtrace of pid %d:", (int)pid);

    if (strings == NULL) {
        /* backtrace_symbols() returns NULL on error */
        cli_errmsg("Unable to resolve backtrace symbols\n");
    } else {
        for (i = 0; i < nframes; i++) {
            cli_errmsg("%s\n", strings[i]);
            if (use_syslog)
                syslog(LOG_ERR, "bt[%llu]: %s", (unsigned long long)i, strings[i]);
        }
    }

#ifdef SAVE_TMP
    cli_errmsg("The errant mail file has been saved\n");
#endif
    /* #else TODO: dump the current email */

    /* Only the array itself is freed, not the strings it points to */
    free(strings);
}
#endif

/*
 * Decide whether a header is one that is of interest: the three content
 * headers identified by commandNumber, or one whose name is (ignoring case)
 * "From", "Received" or "De".
 *
 * See also clamav-milter
 */
static bool
usefulHeader(int commandNumber, const char *cmd)
{
    static const char *const wanted[] = {"From", "Received", "De"};
    size_t idx;

    if ((commandNumber == CONTENT_TRANSFER_ENCODING) ||
        (commandNumber == CONTENT_DISPOSITION) ||
        (commandNumber == CONTENT_TYPE)) {
        return true;
    }

    for (idx = 0; idx < sizeof(wanted) / sizeof(wanted[0]); idx++) {
        if (strcasecmp(cmd, wanted[idx]) == 0) {
            return true;
        }
    }

    return false;
}

/*
 * Like fgets but cope with end of line by "\n", "\r\n", "\n\r", "\r"
 *
 * Reads from map starting at offset *at into buffer (of buffer_len bytes),
 * skipping NUL bytes. The first character of the line terminator is stored;
 * a second, complementary one is consumed but not stored. On success *at is
 * advanced past everything consumed and buffer is returned, NUL terminated.
 *
 * Returns NULL if the map can't be read, if buffer is NULL or buffer_len is
 * zero, or if the input runs out before anything has been stored.
 */
static char *
getline_from_mbox(char *buffer, size_t buffer_len, fmap_t *map, size_t *at)
{
    size_t remaining = MIN(map->len - *at, buffer_len + 1);
    const char *base = fmap_need_off_once(map, *at, remaining);
    const char *rd   = base;
    char *wr;
    size_t budget;

    if (base == NULL) {
        cli_dbgmsg("getline_from_mbox: fmap need failed\n");
        return NULL;
    }
    if ((buffer_len == 0) || (buffer == NULL)) {
        cli_errmsg("Invalid call to getline_from_mbox(). Refer to https://docs.clamav.net/manual/Installing.html\n");
        return NULL;
    }

    wr = buffer;

    /* At most buffer_len - 1 input characters are examined per call */
    for (budget = buffer_len - 1; budget > 0; budget--) {
        char ch;

        if (remaining == 0) {
            if (wr == buffer) {
                /* EOF on first char */
                return NULL;
            }
            break;
        }
        remaining--;

        ch = *rd++;
        if (ch == '\0') {
            continue;
        }

        *wr++ = ch;

        if ((ch == '\n') || (ch == '\r')) {
            /* Swallow the other half of a two character line ending */
            const char partner = (ch == '\n') ? '\r' : '\n';

            if (remaining && (*rd == partner)) {
                rd++;
            }
            break;
        }
    }

    *at += rd - base;
    *wr = '\0';

    return buffer;
}

/*
 * Is this line a candidate for the start of a bounce message?
 *
 * Only lines of 6 to 71 characters are considered. A line starting with
 * "From " or ">From " qualifies when, from its fifth character onwards, it
 * holds at least 6 spaces and at least 11 digits. Any other line qualifies
 * when cli_compare_ftm_file() classifies it as CL_TYPE_MAIL.
 */
static bool
isBounceStart(mbox_ctx *mctx, const char *line)
{
    size_t len;
    const char *p;
    int spaces = 0;
    int digits = 0;

    if ((line == NULL) || (*line == '\0'))
        return false;

    len = strlen(line);
    if ((len < 6) || (len >= 72))
        return false;

    if ((memcmp(line, "From ", 5) != 0) &&
        (memcmp(line, ">From ", 6) != 0))
        return (bool)(cli_compare_ftm_file((const unsigned char *)line, len, mctx->ctx->engine) == CL_TYPE_MAIL);

    /* len >= 6, so line[4] is a real character and not the terminator */
    for (p = line + 4; *p != '\0'; p++) {
        if (*p == ' ')
            spaces++;
        else if (isdigit((*p) & 0xFF))
            digits++;
    }

    return (spaces >= 6) && (digits >= 11);
}

/*
 * Extract a binhexEncoded message, return if it's found to be infected as we
 *    extract it
 *
 * A message with no encoding set is first marked as "x-binhex". The file
 * counter in mctx is only incremented when the message could be decoded.
 */
static bool
exportBinhexMessage(mbox_ctx *mctx, message *m)
{
    fileblob *decoded;
    bool virusFound;

    if (messageGetEncoding(m) == NOENCODING)
        messageSetEncoding(m, "x-binhex");

    decoded = messageToFileblob(m, mctx->dir, 0);
    if (decoded == NULL) {
        cli_errmsg("Couldn't decode binhex file to %s\n", mctx->dir);
        return false;
    }

    cli_dbgmsg("Binhex file decoded to %s\n",
               fileblobGetFilename(decoded));

    virusFound = (fileblobScanAndDestroy(decoded) == CL_VIRUS);
    mctx->files++;

    return virusFound;
}

/*
 * Locate any bounce message and extract it. Return cl_status
 *
 * The lines from start onwards are walked looking for a content header.
 * Along the way the most recent "From" or "Received" header becomes the
 * point the saved message starts at.
 */
static int
exportBounceMessage(mbox_ctx *mctx, text *start)
{
    int rc = CL_CLEAN;
    text *t;
    fileblob *fb;

    /*
     * Attempt to save the original (unbounced)
     * message - clamscan will find that in the
     * directory and call us again (with any luck)
     * having found an e-mail message to handle.
     *
     * This finds a lot of false positives, the
     * search that a content type is in the
     * bounce (i.e. it's after the bounce header)
     * helps a bit.
     *
     * messageAddLine
     * optimization could help here, but needs
     * careful thought, do it with line numbers
     * would be best, since the current method in
     * messageAddLine of checking encoding first
     * must remain otherwise non bounce messages
     * won't be scanned
     */
    for (t = start; t != NULL; t = t->t_next) {
        const char *txt = lineGetData(t->t_line);
        char cmd[RFC2821LENGTH + 1];
        int header;

        if (txt == NULL)
            continue;
        if (cli_strtokbuf(txt, 0, ":", cmd) == NULL)
            continue;

        header = tableFind(mctx->rfc821Table, cmd);

        if (header == CONTENT_TRANSFER_ENCODING) {
            /* 7bit and 8bit don't count, keep looking */
            if ((strstr(txt, "7bit") != NULL) ||
                (strstr(txt, "8bit") != NULL))
                continue;
            break;
        }

        if (header == CONTENT_DISPOSITION)
            break;

        if (header == CONTENT_TYPE) {
            /* text/plain ends the search with nothing found */
            if (strstr(txt, "text/plain") != NULL)
                t = NULL;
            break;
        }

        if ((strcasecmp(cmd, "From") == 0) ||
            (strcasecmp(cmd, "Received") == 0))
            start = t;
    }

    if ((t == NULL) || ((fb = fileblobCreate()) == NULL)) {
        cli_dbgmsg("Not found a bounce message\n");
        return rc;
    }

    cli_dbgmsg("Found a bounce message\n");
    fileblobSetFilename(fb, mctx->dir, "bounce");
    fileblobSetCTX(fb, mctx->ctx);
    if (textToFileblob(start, fb, 1) != NULL) {
        rc = fileblobScanAndDestroy(fb);
    } else {
        cli_dbgmsg("Nothing new to save in the bounce message\n");
        fileblobDestroy(fb);
    }
    mctx->files++;

    return rc;
}

/*
 * Get string representation of mimetype
 *
 * Looks the value up in the mimeTypeStr table, which ends at the first entry
 * with a NULL key. Returns "UNKNOWN" if there is no match.
 */
static const char *getMimeTypeStr(mime_type mimetype)
{
    const struct tableinit *row;

    for (row = mimeTypeStr; row->key != NULL; row++) {
        if (((mime_type)row->value) == mimetype) {
            return row->key;
        }
    }

    return "UNKNOWN";
}

/*
 * Get string representation of encoding type
 *
 * Looks the value up in the encTypeStr table, which ends at the first entry
 * with a NULL key. Returns "UNKNOWN" if there is no match.
 */
static const char *getEncTypeStr(encoding_type enctype)
{
    const struct tableinit *row;

    for (row = encTypeStr; row->key != NULL; row++) {
        if (((encoding_type)row->value) == enctype) {
            return row->key;
        }
    }

    return "UNKNOWN";
}

/*
 * Handle the ith element of a number of multiparts, e.g. multipart/alternative
 *
 * What happens to messages[i] depends on its mime type:
 *  - APPLICATION, AUDIO, IMAGE, VIDEO and unrecognised types are converted
 *    to a fileblob and scanned;
 *  - NOMIME parts are checked for binhex content, then scanned the same way;
 *  - TEXT parts whose disposition is "attachment", "inline" or empty are
 *    scanned the same way (inline ones may first have their URLs checked);
 *    any other disposition makes the function return without scanning;
 *  - MESSAGE and MULTIPART parts are handed to parseEmailBody() with
 *    recursion_level + 1 (MESSAGE parts are saved to disc instead when
 *    SAVE_TO_DISC is defined).
 *
 * *rc is set to VIRUS when something infected is found, and nothing is
 * scanned if *rc is not OK on entry. On most paths messages[i] is destroyed
 * and set to NULL.
 *
 * Returns mainMessage, or NULL on the paths that discard it; a mainMessage
 * that is not messageIn is destroyed before being discarded.
 */
static message *
do_multipart(message *mainMessage, message **messages, int i, mbox_status *rc, mbox_ctx *mctx, message *messageIn, text **tptr, unsigned int recursion_level)
{
    bool addToText = false;
    const char *dtype;
#ifndef SAVE_TO_DISC
    message *body;
#endif
    message *aMessage        = messages[i];
    const int doPhishingScan = mctx->ctx->engine->dboptions & CL_DB_PHISHING_URLS && (DCONF_PHISHING & PHISHING_CONF_ENGINE);
#if HAVE_JSON
    /* saveobj remembers the current JSON work object so it can be restored after recursing */
    json_object *thisobj = NULL, *saveobj = mctx->wrkobj;

    if (mctx->wrkobj != NULL) {
        /* Attach this part's JSON object to the "Multipart" array of the work object */
        json_object *multiobj = cli_jsonarray(mctx->wrkobj, "Multipart");
        if (multiobj == NULL) {
            cli_errmsg("Cannot get multipart preclass array\n");
        } else if (NULL == (thisobj = messageGetJObj(aMessage))) {
            cli_dbgmsg("Cannot get message preclass object\n");
        } else if (CL_SUCCESS != cli_json_addowner(multiobj, thisobj, NULL, -1)) {
            cli_errmsg("Cannot assign message preclass object to multipart preclass array\n");
        }
    }
#endif

    if (aMessage == NULL) {
#if HAVE_JSON
        if (thisobj != NULL)
            cli_jsonstr(thisobj, "MimeType", "NULL");
#endif
        return mainMessage;
    }

    /* Something has already gone wrong or been found: do no more work */
    if (*rc != OK)
        return mainMessage;

    cli_dbgmsg("Mixed message part %d is of type %d\n",
               i, messageGetMimeType(aMessage));

#if HAVE_JSON
    /* Record this part's properties in its JSON object */
    if (thisobj != NULL) {
        cli_jsonstr(thisobj, "MimeType", getMimeTypeStr(messageGetMimeType(aMessage)));
        cli_jsonstr(thisobj, "MimeSubtype", messageGetMimeSubtype(aMessage));
        cli_jsonstr(thisobj, "EncodingType", getEncTypeStr(messageGetEncoding(aMessage)));
        cli_jsonstr(thisobj, "Disposition", messageGetDispositionType(aMessage));
        if (messageHasFilename(aMessage)) {
            char *filename = messageGetFilename(aMessage);
            cli_jsonstr(thisobj, "Filename", filename);
            free(filename);
        } else {
            cli_jsonstr(thisobj, "Filename", "(inline)");
        }
    }
#endif

    switch (messageGetMimeType(aMessage)) {
        case APPLICATION:
        case AUDIO:
        case IMAGE:
        case VIDEO:
            /* Nothing special to do: scanned as a fileblob after the switch */
            break;
        case NOMIME:
            cli_dbgmsg("No mime headers found in multipart part %d\n", i);
            if (mainMessage) {
                if (binhexBegin(aMessage)) {
                    cli_dbgmsg("Found binhex message in multipart/mixed mainMessage\n");

                    if (exportBinhexMessage(mctx, mainMessage))
                        *rc = VIRUS;
                }
                /* messageIn belongs to the caller, so it is never destroyed here */
                if (mainMessage != messageIn)
                    messageDestroy(mainMessage);
                mainMessage = NULL;
            } else if (aMessage) {
                if (binhexBegin(aMessage)) {
                    cli_dbgmsg("Found binhex message in multipart/mixed non mime part\n");
                    if (exportBinhexMessage(mctx, aMessage))
                        *rc = VIRUS;
                    messageReset(messages[i]);
                }
            }
            addToText = true;
            if (messageGetBody(aMessage) == NULL)
                /*
                 * No plain text version
                 */
                cli_dbgmsg("No plain text alternative\n");
            break;
        case TEXT:
            dtype = messageGetDispositionType(aMessage);
            cli_dbgmsg("Mixed message text part disposition \"%s\"\n",
                       dtype);
            if (strcasecmp(dtype, "attachment") == 0)
                break;
            if ((*dtype == '\0') || (strcasecmp(dtype, "inline") == 0)) {
                const char *cptr;

                if (mainMessage && (mainMessage != messageIn))
                    messageDestroy(mainMessage);
                mainMessage = NULL;
                cptr        = messageGetMimeSubtype(aMessage);
                cli_dbgmsg("Mime subtype \"%s\"\n", cptr);
                if ((tableFind(mctx->subtypeTable, cptr) == PLAIN) &&
                    (messageGetEncoding(aMessage) == NOENCODING)) {
                    /*
                     * Strictly speaking, a text/plain part
                     * is not an attachment. We pretend it
                     * is so that we can decode and scan it
                     */
                    if (!messageHasFilename(aMessage)) {
                        cli_dbgmsg("Adding part to main message\n");
                        addToText = true;
                    } else
                        cli_dbgmsg("Treating inline as attachment\n");
                } else {
                    const int is_html = (tableFind(mctx->subtypeTable, cptr) == HTML);
                    if (doPhishingScan)
                        checkURLs(aMessage, mctx, rc, is_html);
                    messageAddArgument(aMessage,
                                       "filename=mixedtextportion");
                }
                break;
            }
            /* Any other disposition: return without scanning or destroying the part */
            cli_dbgmsg("Text type %s is not supported\n", dtype);
            return mainMessage;
        case MESSAGE:
            /* Content-Type: message/rfc822 */
            cli_dbgmsg("Found message inside multipart (encoding type %d)\n",
                       messageGetEncoding(aMessage));
#ifndef SCAN_UNENCODED_BOUNCES
            switch (messageGetEncoding(aMessage)) {
                case NOENCODING:
                case EIGHTBIT:
                case BINARY:
                    if (encodingLine(aMessage) == NULL) {
                        /*
                         * This means that the message
                         * has no attachments
                         *
                         * The test for
                         * messageGetEncoding is needed
                         * since encodingLine won't have
                         * been set if the message
                         * itself has been encoded
                         */
                        cli_dbgmsg("Unencoded multipart/message will not be scanned\n");
                        messageDestroy(messages[i]);
                        messages[i] = NULL;
                        return mainMessage;
                    }
                    /* FALLTHROUGH */
                default:
                    cli_dbgmsg("Encoded multipart/message will be scanned\n");
            }
#endif

#ifdef SAVE_TO_DISC
            /*
             * Save this embedded message
             * to a temporary file
             */
            if (saveTextPart(mctx, aMessage, 1) == CL_VIRUS)
                *rc = VIRUS;
            messageDestroy(messages[i]);
            messages[i] = NULL;
#else
            /*
             * Scan in memory, faster but is open to DoS attacks
             * when many nested levels are involved.
             */
            body = parseEmailHeaders(aMessage, mctx->rfc821Table);

            /*
             * We've finished with the
             * original copy of the message,
             * so throw that away and
             * deal with the encapsulated
             * message as a message.
             * This can save a lot of memory
             */
            messageDestroy(messages[i]);
            messages[i]  = NULL;
#if HAVE_JSON
            /* JSON produced while recursing goes under this part's object */
            mctx->wrkobj = thisobj;
#endif
            if (body) {
                messageSetCTX(body, mctx->ctx);
                *rc = parseEmailBody(body, NULL, mctx, recursion_level + 1);
                if ((*rc == OK) && messageContainsVirus(body))
                    *rc = VIRUS;
                messageDestroy(body);
            }
#if HAVE_JSON
            mctx->wrkobj = saveobj;
#endif
#endif
            return mainMessage;
        case MULTIPART:
            /*
             * It's a multi part within a multi part
             * Run the message parser on this bit, it won't
             * be an attachment
             */
            cli_dbgmsg("Found multipart inside multipart\n");
#if HAVE_JSON
            mctx->wrkobj = thisobj;
#endif
            if (aMessage) {
                /*
                 * The headers were parsed when reading in the
                 * whole multipart section
                 */
                *rc = parseEmailBody(aMessage, *tptr, mctx, recursion_level + 1);
                cli_dbgmsg("Finished recursion, rc = %d\n", (int)*rc);
                messageDestroy(messages[i]);
                messages[i] = NULL;
            } else {
                /* aMessage was checked for NULL above, so this branch is not expected to run */
                *rc = parseEmailBody(NULL, NULL, mctx, recursion_level + 1);
                if (mainMessage && (mainMessage != messageIn)) {
                    messageDestroy(mainMessage);
                }
                mainMessage = NULL;
            }
#if HAVE_JSON
            mctx->wrkobj = saveobj;
#endif
            return mainMessage;
        default:
            cli_dbgmsg("Only text and application attachments are fully supported, type = %d\n",
                       messageGetMimeType(aMessage));
            /* fall through - we may be able to salvage something */
    }

    /* Common tail: turn the part into a fileblob and scan it */
    if (*rc != VIRUS) {
        fileblob *fb = messageToFileblob(aMessage, mctx->dir, 1);
#if HAVE_JSON
        json_object *arrobj;
#if (JSON_C_MAJOR_VERSION == 0) && (JSON_C_MINOR_VERSION < 13)
        int arrlen = 0;
#else
        size_t arrlen = 0;
#endif

        if (thisobj != NULL) {
            /* attempt to determine container size - prevents incorrect type reporting */
            if (json_object_object_get_ex(mctx->ctx->wrkproperty, "ContainedObjects", &arrobj)) {
                arrlen = json_object_array_length(arrobj);
            }
        }

#endif
        if (fb) {
            /* aMessage doesn't always have a ctx set */
            fileblobSetCTX(fb, mctx->ctx);
            if (fileblobScanAndDestroy(fb) == CL_VIRUS) {
                *rc = VIRUS;
            }
            /* Parts added to the text are not counted as files */
            if (!addToText) {
                mctx->files++;
            }
        }
#if HAVE_JSON
        if (thisobj != NULL) {
            json_object *entry = NULL;
            const char *dtype  = NULL;

            /* attempt to acquire container type */
            if (json_object_object_get_ex(mctx->ctx->wrkproperty, "ContainedObjects", &arrobj)) {
                /* If the array grew during the scan, the entry at the old length is the new one */
                if (json_object_array_length(arrobj) > arrlen) {
                    entry = json_object_array_get_idx(arrobj, arrlen);
                }
            }
            if (entry) {
                json_object_object_get_ex(entry, "FileType", &entry);
                if (entry) {
                    dtype = json_object_get_string(entry);
                }
            }
            cli_jsonint(thisobj, "ContainedObjectsIndex", (int32_t)arrlen);
            cli_jsonstr(thisobj, "ClamAVFileType", dtype ? dtype : "UNKNOWN");
        }
#endif
        if (messageContainsVirus(aMessage)) {
            *rc = VIRUS;
        }
    }
    messageDestroy(aMessage);
    messages[i] = NULL;

    return mainMessage;
}

/*
 * Count the double-quote characters (") in a NUL-terminated string.
 *
 * buf:     the string to examine; must not be NULL.
 * Returns: how many '"' bytes occur before the terminating NUL.
 */
static int
count_quotes(const char *buf)
{
    const char *cursor;
    int total = 0;

    for (cursor = buf; *cursor != '\0'; cursor++) {
        if (*cursor == '\"') {
            total++;
        }
    }

    return total;
}

/*
 * Will the next line be a folded header? See RFC2822 section 2.2.3
 *
 * t:       the header line currently being processed; t->t_next is the
 *          candidate continuation line.
 * Returns: true if the line following t should be treated as a continuation
 *          of t's header, false otherwise (including when there is no
 *          following line, or the following line has no data).
 *
 * NOTE(review): t->t_line is passed to lineGetData() without a NULL check,
 * unlike next->t_line; presumably callers only pass lines that have data -
 * confirm against the call sites.
 */
static bool
next_is_folded_header(const text *t)
{
    const text *next = t->t_next;
    const char *data;
    size_t i;

    if (next == NULL)
        return false;

    if (next->t_line == NULL)
        return false;

    data = lineGetData(next->t_line);

    /*
     * Section B.2 of RFC822 says TAB or SPACE means a continuation of the
     * previous entry.
     *
     * The cast is required: mail data is untrusted and may contain bytes
     * >= 0x80, which are negative where plain char is signed, and passing
     * a negative value (other than EOF) to a <ctype.h> function is
     * undefined behaviour.
     */
    if (isblank((unsigned char)data[0]))
        return true;

    if (strchr(data, '=') == NULL)
        /*
         * Avoid false positives with
         *    Content-Type: text/html;
         *    Content-Transfer-Encoding: quoted-printable
         */
        return false;

    /*
     * Some mailers are broken and don't fold header lines
     * correctly as per section 2.2.3 of RFC2822.
     * Generally they miss the white space at
     * the start of the fold line:
     *    Content-Type: multipart/related;
     *    type="multipart/alternative";
     *    boundary="----=_NextPart_000_006A_01C6AC47.348CB550"
     * should read:
     *    Content-Type: multipart/related;
     *     type="multipart/alternative";
     *     boundary="----=_NextPart_000_006A_01C6AC47.348CB550"
     * Since we're a virus checker not an RFC
     * verifier we need to handle these
     */
    data = lineGetData(t->t_line);

    /*
     * Walk backwards from the end of the current line, skipping trailing
     * white space, and see whether the last significant character is ';'.
     * Index 0 is never inspected, so a line consisting of a single
     * character is not treated as the start of a fold. Scanning by index
     * (rather than decrementing a pointer) avoids forming a pointer before
     * the start of the buffer when the line is empty.
     */
    for (i = strlen(data); i > 1;) {
        switch (data[--i]) {
            case ';':
                return true;
            case '\n':
            case ' ':
            case '\r':
            case '\t':
                continue; /* white space at end of line */
            default:
                return false;
        }
    }
    return false;
}

/*
 * This routine is called on the first line of the body of an email, to
 * cope with broken messages that have newlines in the middle of their
 * headers.
 *
 * line:    the NUL-terminated line to test.
 * Returns: true if the line starts with "Message-Id: " or "Date: "
 *          (exact, case-sensitive prefix match), false otherwise.
 */
static bool
newline_in_header(const char *line)
{
    /* Header prefixes that indicate the "body" line is really a header */
    static const struct {
        const char *prefix;
        size_t len;
    } header_starts[] = {
        {"Message-Id: ", sizeof("Message-Id: ") - 1},
        {"Date: ", sizeof("Date: ") - 1},
    };
    size_t idx;

    cli_dbgmsg("newline_in_header, check \"%s\"\n", line);

    for (idx = 0; idx < sizeof(header_starts) / sizeof(header_starts[0]); idx++) {
        if (strncmp(line, header_starts[idx].prefix, header_starts[idx].len) == 0) {
            return true;
        }
    }

    cli_dbgmsg("newline_in_header, returning \"%s\"\n", line);

    return false;
}