266 lines
9.4 KiB
C
266 lines
9.4 KiB
C
|
/*
 *  Extract component parts of MS XML files (e.g. MS Office 2003 XML Documents)
 *
 *  Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *  Copyright (C) 2007-2013 Sourcefire, Inc.
 *
 *  Authors: Kevin Lin
 *
 *  This program is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License version 2 as published by the
 *  Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 *  more details.
 *
 *  You should have received a copy of the GNU General Public License along with
 *  this program; if not, write to the Free Software Foundation, Inc., 51
 *  Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
|
||
|
|
||
|
#if HAVE_CONFIG_H
|
||
|
#include "clamav-config.h"
|
||
|
#endif
|
||
|
|
||
|
#include <sys/types.h>
|
||
|
#include <sys/stat.h>
|
||
|
#include <fcntl.h>
|
||
|
|
||
|
#include "clamav.h"
|
||
|
#include "others.h"
|
||
|
#include "conv.h"
|
||
|
#include "json_api.h"
|
||
|
#include "msxml.h"
|
||
|
#include "msxml_parser.h"
|
||
|
|
||
|
#if HAVE_LIBXML2
|
||
|
#include <libxml/xmlreader.h>
|
||
|
|
||
|
/*
 * Trace switch for this file: with a non-zero value every cli_msxmlmsg() call
 * is forwarded to cli_dbgmsg(); with 0 the calls expand to nothing.
 */
#define MSXML_VERBIOSE 0

#if !MSXML_VERBIOSE
#define cli_msxmlmsg(...)
#else
#define cli_msxmlmsg(...) cli_dbgmsg(__VA_ARGS__)
#endif

/* Largest number of bytes requested from the fmap for a single read window. */
#define MSXML_READBUFF SCANBUFF
|
||
|
|
||
|
// clang-format off
|
||
|
|
||
|
static const struct key_entry msxml_keys[] = {
|
||
|
{ "worddocument", "WordDocument", MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
|
||
|
{ "workbook", "Workbook", MSXML_JSON_ROOT | MSXML_JSON_ATTRIB },
|
||
|
{ "bindata", "BinaryData", MSXML_SCAN_B64 | MSXML_JSON_COUNT | MSXML_JSON_ROOT },
|
||
|
{ "documentproperties", "DocumentProperties", MSXML_JSON_ROOT },
|
||
|
{ "author", "Author", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "lastauthor", "LastAuthor", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "revision", "Revision", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "totaltime", "TotalTime", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "created", "Created", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "lastsaved", "LastSaved", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "pages", "Pages", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "words", "Words", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "characters", "Characters", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "lines", "Lines", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "paragraph", "Paragraph", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "characterswithspaces", "CharactersWithSpaces", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "version", "Version", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "allowpng", "AllowPNG", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE },
|
||
|
{ "fonts", "Fonts", MSXML_IGNORE_ELEM },
|
||
|
{ "styles", "Styles", MSXML_IGNORE_ELEM }
|
||
|
};
|
||
|
static size_t num_msxml_keys = sizeof(msxml_keys) / sizeof(struct key_entry);
|
||
|
|
||
|
// clang-format on
|
||
|
|
||
|
static inline size_t msxml_read_cb_new_window(struct msxml_cbdata *cbdata)
|
||
|
{
|
||
|
const unsigned char *new_window = NULL;
|
||
|
off_t new_mappos;
|
||
|
size_t bytes;
|
||
|
|
||
|
if ((size_t)cbdata->mappos == cbdata->map->len) {
|
||
|
cli_msxmlmsg("msxml_read_cb: fmap REALLY EOF\n");
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
new_mappos = cbdata->mappos + cbdata->winsize;
|
||
|
bytes = MIN(cbdata->map->len - new_mappos, MSXML_READBUFF);
|
||
|
if (!bytes) {
|
||
|
cbdata->window = NULL;
|
||
|
cbdata->winpos = 0;
|
||
|
cbdata->mappos = cbdata->map->len;
|
||
|
cbdata->winsize = 0;
|
||
|
|
||
|
cli_msxmlmsg("msxml_read_cb: fmap EOF\n");
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
new_window = fmap_need_off_once(cbdata->map, new_mappos, bytes);
|
||
|
if (!new_window) {
|
||
|
cli_errmsg("msxml_read_cb: cannot acquire new window for fmap\n");
|
||
|
return -1;
|
||
|
}
|
||
|
|
||
|
cbdata->window = new_window;
|
||
|
cbdata->winpos = 0;
|
||
|
cbdata->mappos = new_mappos;
|
||
|
cbdata->winsize = bytes;
|
||
|
|
||
|
cli_msxmlmsg("msxml_read_cb: acquired new window @ [%llu(+%llu)(max:%llu)]\n",
|
||
|
(long long unsigned)cbdata->mappos, (long long unsigned)(cbdata->mappos + cbdata->winsize),
|
||
|
(long long unsigned)cbdata->map->len);
|
||
|
|
||
|
return bytes;
|
||
|
}
|
||
|
|
||
|
int msxml_read_cb(void *ctx, char *buffer, int buffer_len)
|
||
|
{
|
||
|
struct msxml_cbdata *cbdata = (struct msxml_cbdata *)ctx;
|
||
|
size_t wbytes, rbytes;
|
||
|
int winret;
|
||
|
size_t len = (size_t)buffer_len;
|
||
|
|
||
|
cli_msxmlmsg("msxml_read_cb called\n");
|
||
|
|
||
|
/* initial iteration */
|
||
|
if (!cbdata->window) {
|
||
|
if ((winret = msxml_read_cb_new_window(cbdata)) <= 0)
|
||
|
return winret;
|
||
|
}
|
||
|
|
||
|
cli_msxmlmsg("msxml_read_cb: requested %zu bytes from offset %llu\n", len, (long long unsigned)(cbdata->mappos + cbdata->winpos));
|
||
|
|
||
|
wbytes = 0;
|
||
|
rbytes = cbdata->winsize - cbdata->winpos;
|
||
|
|
||
|
/* copying loop with preprocessing */
|
||
|
while (wbytes < len) {
|
||
|
const unsigned char *read_from;
|
||
|
char *write_to = buffer + wbytes;
|
||
|
enum msxml_state *state;
|
||
|
#if MSXML_VERBIOSE
|
||
|
size_t written;
|
||
|
#endif
|
||
|
|
||
|
if (!rbytes) {
|
||
|
if ((winret = msxml_read_cb_new_window(cbdata)) < 0)
|
||
|
return winret;
|
||
|
if (winret == 0) {
|
||
|
cli_msxmlmsg("msxml_read_cb: propagating fmap EOF [%llu]\n", (long long unsigned)wbytes);
|
||
|
return (int)wbytes;
|
||
|
}
|
||
|
|
||
|
rbytes = cbdata->winsize;
|
||
|
}
|
||
|
|
||
|
#if MSXML_VERBIOSE
|
||
|
written = MIN(rbytes, len - wbytes);
|
||
|
cli_msxmlmsg("msxml_read_cb: copying from window [%llu(+%llu)] %llu->~%llu\n",
|
||
|
(long long unsigned)(cbdata->winsize - rbytes), (long long unsigned)cbdata->winsize,
|
||
|
(long long unsigned)cbdata->winpos, (long long unsigned)(cbdata->winpos + written));
|
||
|
#endif
|
||
|
|
||
|
read_from = cbdata->window + cbdata->winpos;
|
||
|
state = &(cbdata->state);
|
||
|
|
||
|
while ((rbytes > 0) && (wbytes < len)) {
|
||
|
switch (*state) {
|
||
|
case MSXML_STATE_NORMAL:
|
||
|
if ((*read_from) == '&')
|
||
|
*state = MSXML_STATE_ENTITY_START_1;
|
||
|
break;
|
||
|
case MSXML_STATE_ENTITY_START_1:
|
||
|
if ((*read_from) == '#')
|
||
|
*state = MSXML_STATE_ENTITY_START_2;
|
||
|
else
|
||
|
*state = MSXML_STATE_NORMAL;
|
||
|
break;
|
||
|
case MSXML_STATE_ENTITY_START_2:
|
||
|
if ((*read_from) == 'x')
|
||
|
*state = MSXML_STATE_ENTITY_HEX;
|
||
|
else if (((*read_from) >= '0') && ((*read_from) <= '9'))
|
||
|
*state = MSXML_STATE_ENTITY_DEC;
|
||
|
else
|
||
|
*state = MSXML_STATE_NORMAL;
|
||
|
break;
|
||
|
case MSXML_STATE_ENTITY_HEX:
|
||
|
if ((((*read_from) >= '0') && ((*read_from) <= '9')) ||
|
||
|
(((*read_from) >= 'a') && ((*read_from) <= 'f')) ||
|
||
|
(((*read_from) >= 'A') && ((*read_from) <= 'F'))) {
|
||
|
} else
|
||
|
*state = MSXML_STATE_ENTITY_CLOSE;
|
||
|
break;
|
||
|
case MSXML_STATE_ENTITY_DEC:
|
||
|
if (((*read_from) >= '0') && ((*read_from) <= '9')) {
|
||
|
} else
|
||
|
*state = MSXML_STATE_ENTITY_CLOSE;
|
||
|
break;
|
||
|
default:
|
||
|
cli_errmsg("unknown *state: %d\n", *state);
|
||
|
}
|
||
|
|
||
|
if (*state == MSXML_STATE_ENTITY_CLOSE) {
|
||
|
if ((*read_from) != ';') {
|
||
|
cli_msxmlmsg("msxml_read_cb: detected unterminated character entity @ winoff %d\n",
|
||
|
(int)(read_from - cbdata->window));
|
||
|
(*write_to++) = ';';
|
||
|
wbytes++;
|
||
|
}
|
||
|
*state = MSXML_STATE_NORMAL;
|
||
|
if (wbytes >= len)
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
*(write_to++) = *(read_from++);
|
||
|
rbytes--;
|
||
|
wbytes++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
cbdata->winpos = cbdata->winsize - rbytes;
|
||
|
return (int)wbytes;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
/**
 * Scan the current fmap of `ctx` as an MS XML document.
 *
 * With libxml2 available, an xmlTextReader is created on top of
 * msxml_read_cb() and passed, together with the msxml_keys table, to
 * cli_msxml_parse_document(); the reader is closed and freed afterwards.
 * Without libxml2 the function only logs that the scan is unavailable.
 *
 * @param ctx  Scan context; its fmap supplies the document bytes.
 * @return  CL_ENULLARG when ctx is NULL (libxml2 builds only); the result of
 *          cli_msxml_parse_document() on a normal run; when the reader cannot
 *          be created, the result of cli_json_parse_error() if HAVE_JSON is
 *          set, otherwise CL_SUCCESS; CL_SUCCESS when built without libxml2.
 */
cl_error_t cli_scanmsxml(cli_ctx *ctx)
{
#if HAVE_LIBXML2
    struct msxml_cbdata io_state;
    xmlTextReaderPtr xml_reader = NULL;
    cl_error_t status           = CL_SUCCESS;

    cli_dbgmsg("in cli_scanmsxml()\n");

    if (NULL == ctx) {
        return CL_ENULLARG;
    }

    /* Start with no window and map position 0; only the map is filled in. */
    memset(&io_state, 0, sizeof(io_state));
    io_state.map = ctx->fmap;

    xml_reader = xmlReaderForIO(msxml_read_cb, NULL, &io_state, "msxml.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
    if (NULL != xml_reader) {
        status = cli_msxml_parse_document(ctx, xml_reader, msxml_keys, num_msxml_keys, 1, NULL);

        xmlTextReaderClose(xml_reader);
        xmlFreeTextReader(xml_reader);
        return status;
    }

    /* libxml2 could not set up the reader. */
    cli_dbgmsg("cli_scanmsxml: cannot initialize xmlReader\n");

#if HAVE_JSON
    status = cli_json_parse_error(ctx->wrkproperty, "OOXML_ERROR_XML_READER_IO");
#endif
    return status; // libxml2 failed!
#else
    UNUSEDPARAM(ctx);
    cli_dbgmsg("in cli_scanmsxml()\n");
    cli_dbgmsg("cli_scanmsxml: scanning msxml documents requires libxml2!\n");

    return CL_SUCCESS;
#endif
}
|