denyhosts/clamav/libclamav/rtf.c

692 lines
26 KiB
C

/*
* Extract embedded objects from RTF files.
*
* Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
* Copyright (C) 2007-2013 Sourcefire, Inc.
*
* Authors: Török Edvin
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <ctype.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "others.h"
#include "rtf.h"
#include "clamav.h"
#include "table.h"
#include "scanners.h"
#include "vba_extract.h"
/* States of the RTF tokenizer that cli_scanrtf() switches on. */
enum parse_state { PARSE_MAIN, /* plain data and group delimiters '{' '}' '\\' */
PARSE_CONTROL_, /* a backslash was consumed; the next byte selects word vs. symbol */
PARSE_CONTROL_WORD, /* collecting the alphabetic part of a control word */
PARSE_CONTROL_SYMBOL, /* one-byte control symbol; it is skipped */
PARSE_CONTROL_WORD_PARAM, /* collecting the decimal parameter that follows a control word */
PARSE_INTERPRET_CONTROLWORD }; /* control word finished; it is looked up in the action table */
enum rtf_action {
RTF_OBJECT,
RTF_OBJECT_DATA
};
struct rtf_state;
/* Callback invoked before the first data chunk of a group is processed; receives the scan
 * context and the temporary directory. A non-zero return aborts the scan in cli_scanrtf(). */
typedef int (*rtf_callback_begin)(struct rtf_state*, cli_ctx* ctx, const char* tmpdir);
/* Callback invoked with each run of plain data (pointer + length). A non-zero return aborts the scan. */
typedef int (*rtf_callback_process)(struct rtf_state*, const unsigned char* data, const size_t len);
/* Callback invoked when the group that owns cb_data ends (or during cleanup). */
typedef int (*rtf_callback_end)(struct rtf_state*, cli_ctx*);
/* Parser state for one RTF group; saved and restored by push_state()/pop_state(). */
struct rtf_state {
rtf_callback_begin cb_begin; /* must be non-null if you want cb_process, and cb_end to be called, also it must change cb_data to non-null */
rtf_callback_process cb_process; /* called with each data run while cb_begin is set */
rtf_callback_end cb_end; /* called only when cb_data is non-null */
void* cb_data; /* data set up by cb_begin, used by cb_process, and cleaned up by cb_end. typically state data */
size_t default_elements; /* number of groups opened while in the default state that were NOT pushed on the stack */
size_t controlword_cnt; /* number of bytes currently stored in controlword */
int64_t controlword_param; /* numeric parameter accumulated in PARSE_CONTROL_WORD_PARAM */
enum parse_state parse_state; /* current tokenizer state */
int controlword_param_sign; /* 1, or -1 when the parameter was introduced by '-' */
int encounteredTopLevel; /* encountered top-level control words that we care about */
char controlword[33]; /* at most 32 bytes of control word plus the terminating NUL */
};
/* Pristine state: no callbacks, no callback data, PARSE_MAIN. It is copied into live
 * states by init_rtf_state(), push_state() and pop_state(), and compare_state() is used
 * against it to recognise a "default" state that need not be stored on the stack.
 * The initializers are positional and follow the field order of struct rtf_state. */
static const struct rtf_state base_state = {
NULL, NULL, NULL, NULL, 0, 0, 0, PARSE_MAIN, 0, 0, " "};
/* Growable stack of saved non-default parser states. */
struct stack {
struct rtf_state* states; /* heap array of saved states */
size_t elements; /* incremented on every push_state(), decremented on every pop_state() */
size_t stack_cnt; /* number of entries of states[] currently in use */
size_t stack_size; /* number of entries states[] has room for */
int warned; /* set once the "pop from empty stack" warning has been printed */
};
/* Control words the scanner reacts to, with the action each one triggers.
 * load_actions() inserts these into the lookup table. Note that the tokenizer in
 * cli_scanrtf() appends a terminating whitespace byte to the control word before the
 * lookup, so the keys here must match the stored text exactly (including any space). */
static const struct rtf_action_mapping {
const char* controlword;
const enum rtf_action action;
} rtf_action_mapping[] =
{
{"object", RTF_OBJECT},
{"objdata ", RTF_OBJECT_DATA}};
/* Number of entries in rtf_action_mapping[]. */
static const size_t rtf_action_mapping_cnt = sizeof(rtf_action_mapping) / sizeof(rtf_action_mapping[0]);
/* Sub-states of rtf_object_process() while it walks the decoded object data stream. */
enum rtf_objdata_state { WAIT_MAGIC, /* consuming the rtf_data_magic bytes */
WAIT_DESC_LEN, /* reading the 4-byte little-endian description length */
WAIT_DESC, /* reading the description text */
WAIT_ZERO, /* skipping 8 bytes that follow the description */
WAIT_DATA_SIZE, /* reading the 4-byte little-endian payload size */
DUMP_DATA, /* writing the payload to the temporary file */
DUMP_DISCARD }; /* drop remaining output; never assigned in this file as far as visible */
/* Byte sequence expected at the start of decoded object data. A mismatch is only
 * logged by rtf_object_process(); it does not stop processing. */
static const unsigned char rtf_data_magic[] = {0x01, 0x05, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00}; /* is this a magic number, or does it mean something */
/* Length in bytes of rtf_data_magic. */
static const size_t rtf_data_magic_len = sizeof(rtf_data_magic);
/* Per-object state allocated by rtf_object_begin() and released by rtf_object_end(). */
struct rtf_object_data {
char* name; /* path of the temporary file, set by cli_gentempfd(); NULL when none */
int fd; /* descriptor of the temporary file; -1 when none */
int partial; /* high nibble of a byte whose second hex digit has not arrived yet */
int has_partial; /* non-zero while 'partial' holds a pending nibble */
enum rtf_objdata_state internal_state; /* current position in the object data stream */
char* desc_name; /* up to 64 bytes of the object description plus NUL; NULL when not allocated */
const char* tmpdir; /* directory in which the temporary file is created */
cli_ctx* ctx; /* scan context handed to decode_and_scan() */
size_t desc_len; /* description length; reused as the remaining payload size from WAIT_DATA_SIZE on */
size_t bread; /* bytes consumed in the current sub-state; in DUMP_DATA reused as a flag (1 = scan with cli_scan_ole10) */
};
/* Size of the decode buffer in rtf_object_process() and of each chunk requested
 * from the file map in cli_scanrtf(). */
#define BUFF_SIZE 8192
/* generated by contrib/phishing/generate_tables.c */
/* Maps a byte value to the value of the hex digit it represents:
 * '0'-'9' -> 0-9, 'A'-'F' and 'a'-'f' -> 10-15. Every other byte maps to 0,
 * so callers must test with isxdigit() first. */
static const short int hextable[256] = {
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
static void init_rtf_state(struct rtf_state* state)
{
*state = base_state;
state->parse_state = PARSE_MAIN;
state->controlword_cnt = 0;
}
/*
 * Tell whether two states agree on everything that matters for stacking:
 * tokenizer state, seen top-level words, the three callbacks and their data.
 *
 * Returns 1 when all of them match, 0 otherwise. default_elements and the
 * control-word buffer are deliberately not part of the comparison.
 */
static int compare_state(const struct rtf_state* a, const struct rtf_state* b)
{
    if (a->parse_state != b->parse_state)
        return 0;
    if (a->encounteredTopLevel != b->encounteredTopLevel)
        return 0;
    if (a->cb_begin != b->cb_begin)
        return 0;
    if (a->cb_process != b->cb_process)
        return 0;
    if (a->cb_end != b->cb_end)
        return 0;
    return a->cb_data == b->cb_data;
}
/*
 * Enter a new RTF group: save the current state and start the group with a
 * fresh one.
 *
 * A state that matches base_state is not stored; only its default_elements
 * counter is incremented, and pop_state() undoes that. Any other state is
 * copied onto the stack (growing it by 128 entries when full) and *state is
 * reset to base_state, keeping only encounteredTopLevel.
 *
 * Returns 0 on success, CL_EMEM if the stack could not be grown.
 */
static int push_state(struct stack* stack, struct rtf_state* state)
{
    int toplevel;

    stack->elements++;
    if (compare_state(state, &base_state)) {
        state->default_elements++;
        return 0; /* this is default state, don't push it, we'll know when we pop it that it was the default one,
                     we store in the state how many default elements we have on the stack */
    }
    if (stack->stack_cnt >= stack->stack_size) {
        /* grow stack; the recorded capacity is only updated once the
         * reallocation has succeeded, so it never overstates the buffer */
        const size_t new_size    = stack->stack_size + 128;
        struct rtf_state* states = cli_realloc2(stack->states, new_size * sizeof(*stack->states));
        if (!states) {
            /* NOTE(review): whether cli_realloc2 releases the old buffer on
             * failure is not visible here; the caller's cleanup still touches
             * stack->states - confirm the ownership contract. */
            return CL_EMEM;
        }
        stack->states     = states;
        stack->stack_size = new_size;
    }
    stack->states[stack->stack_cnt++] = *state;
    toplevel                          = state->encounteredTopLevel;
    *state                            = base_state;
    state->encounteredTopLevel        = toplevel;
    state->default_elements           = 0;
    return 0;
}
/*
 * Leave an RTF group and restore the state that was active before it.
 *
 * Three cases, tried in order:
 *   - the group was entered from a default state that was never stored:
 *     reset to base_state, keep encounteredTopLevel, decrement the counter;
 *   - a saved state exists: restore the most recent one;
 *   - nothing is saved: warn once and fall back to base_state.
 *
 * Always returns 0.
 */
static int pop_state(struct stack* stack, struct rtf_state* state)
{
    stack->elements--;

    if (state->default_elements) {
        const size_t remaining = state->default_elements - 1;
        const int seen         = state->encounteredTopLevel;

        *state                     = base_state;
        state->encounteredTopLevel = seen;
        state->default_elements    = remaining;
        return 0; /* this is a default 'state'*/
    }

    if (stack->stack_cnt) {
        stack->stack_cnt--;
        *state = stack->states[stack->stack_cnt];
        return 0;
    }

    if (!stack->warned) {
        cli_dbgmsg("Warning: attempt to pop from empty stack!\n");
        stack->warned = 1;
    }
    *state = base_state; /* lets assume we give it a base state */
    return 0;
}
/*
 * Insert every entry of rtf_action_mapping[] into table t.
 *
 * Returns 0 when all insertions succeed, -1 as soon as tableInsert() reports -1.
 */
static int load_actions(table_t* t)
{
    const struct rtf_action_mapping* entry     = rtf_action_mapping;
    const struct rtf_action_mapping* const end = rtf_action_mapping + rtf_action_mapping_cnt;

    while (entry != end) {
        if (tableInsert(t, entry->controlword, entry->action) == -1)
            return -1;
        entry++;
    }
    return 0;
}
/*
 * cb_begin callback for object data: allocate the per-object bookkeeping
 * structure and attach it to state->cb_data.
 *
 * ctx and tmpdir are stored for later use by rtf_object_process().
 * Returns 0 on success, CL_EMEM if the allocation fails.
 */
static int rtf_object_begin(struct rtf_state* state, cli_ctx* ctx, const char* tmpdir)
{
    struct rtf_object_data* obj;

    obj = cli_malloc(sizeof(*obj));
    if (obj == NULL) {
        cli_errmsg("rtf_object_begin: Unable to allocate memory for object data\n");
        return CL_EMEM;
    }

    /* caller-supplied context */
    obj->ctx    = ctx;
    obj->tmpdir = tmpdir;

    /* no temporary file and no description buffer yet */
    obj->name      = NULL;
    obj->desc_name = NULL;
    obj->fd        = -1;

    /* hex decoder and stream parser start from scratch */
    obj->has_partial    = 0;
    obj->partial        = 0;
    obj->internal_state = WAIT_MAGIC;
    obj->bread          = 0;

    state->cb_data = obj;
    return 0;
}
/*
 * Scan the temporary file that holds a dumped embedded object, then close it
 * and release its name.
 *
 * When data->bread is 1 the file is handed to cli_scan_ole10(), otherwise to
 * cli_magic_scan_desc(). The file is unlinked unless the engine is configured
 * to keep temporary files.
 *
 * Returns the scan result, or CL_EUNLINK if removing the file failed.
 */
static cl_error_t decode_and_scan(struct rtf_object_data* data, cli_ctx* ctx)
{
    cl_error_t ret = CL_CLEAN;

    cli_dbgmsg("RTF:Scanning embedded object: %s\n", data->name);
    /* -1 is the "no file" sentinel used throughout this file; 0 is a valid
     * descriptor and must not be mistaken for "nothing to scan" */
    if (data->fd >= 0) {
        if (data->bread == 1) {
            cli_dbgmsg("Decoding ole object\n");
            ret = cli_scan_ole10(data->fd, ctx);
        } else {
            ret = cli_magic_scan_desc(data->fd, data->name, ctx, NULL, LAYER_ATTRIBUTES_NONE);
        }
        close(data->fd);
        data->fd = -1;
    }
    if (data->name) {
        if (!ctx->engine->keeptmp)
            if (cli_unlink(data->name)) ret = CL_EUNLINK;
        free(data->name);
        data->name = NULL;
    }
    return ret;
}
static int rtf_object_process(struct rtf_state* state, const unsigned char* input, const size_t len)
{
struct rtf_object_data* data = state->cb_data;
unsigned char outdata[BUFF_SIZE];
const unsigned char* out_data;
size_t out_cnt = 0;
size_t i;
int ret;
if (!data || !len)
return 0;
if (data->has_partial) {
for (i = 0; i < len && !isxdigit(input[i]); i++)
;
if (i < len) {
outdata[out_cnt++] = data->partial | hextable[input[i++]];
data->has_partial = 0;
} else
return 0;
} else
i = 0;
for (; i < len; i++) {
if (isxdigit(input[i])) {
const unsigned char byte = hextable[input[i++]] << 4;
while (i < len && !isxdigit(input[i]))
i++;
if (i == len) {
data->partial = byte;
data->has_partial = 1;
break;
}
outdata[out_cnt++] = byte | hextable[input[i]];
}
}
out_data = outdata;
while (out_data && out_cnt) {
switch (data->internal_state) {
case WAIT_MAGIC: {
cli_dbgmsg("RTF: waiting for magic\n");
for (i = 0; i < out_cnt && data->bread < rtf_data_magic_len; i++, data->bread++)
if (rtf_data_magic[data->bread] != out_data[i]) {
cli_dbgmsg("Warning: rtf objdata magic number not matched, expected:%d, got: %d, at pos:%lu\n", rtf_data_magic[i], out_data[i], (unsigned long int)data->bread);
}
out_cnt -= i;
if (data->bread == rtf_data_magic_len) {
out_data += i;
data->bread = 0;
data->internal_state = WAIT_DESC_LEN;
}
break;
}
case WAIT_DESC_LEN: {
if (data->bread == 0)
data->desc_len = 0;
for (i = 0; i < out_cnt && data->bread < 4; i++, data->bread++)
data->desc_len |= ((size_t)out_data[i]) << (data->bread * 8);
out_cnt -= i;
if (data->bread == 4) {
out_data += i;
data->bread = 0;
if (data->desc_len > 64) {
cli_dbgmsg("Description length too big (%lu), showing only 64 bytes of it\n", (unsigned long int)data->desc_len);
data->desc_name = cli_malloc(65);
} else
data->desc_name = cli_malloc(data->desc_len + 1);
if (!data->desc_name) {
cli_errmsg("rtf_object_process: Unable to allocate memory for data->desc_name\n");
return CL_EMEM;
}
data->internal_state = WAIT_DESC;
cli_dbgmsg("RTF: description length:%lu\n", (unsigned long int)data->desc_len);
}
break;
}
case WAIT_DESC: {
cli_dbgmsg("RTF: in WAIT_DESC\n");
for (i = 0; i < out_cnt && data->bread < data->desc_len && data->bread < 64; i++, data->bread++)
data->desc_name[data->bread] = out_data[i];
out_cnt -= i;
out_data += i;
if (data->bread < data->desc_len && data->bread < 64) {
cli_dbgmsg("RTF: waiting for more data(1)\n");
return 0; /* wait for more data */
}
data->desc_name[data->bread] = '\0';
if (data->desc_len - data->bread > out_cnt) {
data->desc_len -= out_cnt;
cli_dbgmsg("RTF: waiting for more data(2)\n");
return 0; /* wait for more data */
}
out_cnt -= data->desc_len - data->bread;
if (data->bread >= data->desc_len) {
out_data += data->desc_len - data->bread;
data->bread = 0;
cli_dbgmsg("Preparing to dump rtf embedded object, description:%s\n", data->desc_name);
free(data->desc_name);
data->desc_name = NULL;
data->internal_state = WAIT_ZERO;
}
break;
}
case WAIT_ZERO: {
if (out_cnt < 8 - data->bread) {
out_cnt = 0;
data->bread += out_cnt;
} else {
out_cnt -= 8 - data->bread;
data->bread = 8;
}
if (data->bread == 8) {
out_data += 8;
data->bread = 0;
cli_dbgmsg("RTF: next state: wait_data_size\n");
data->internal_state = WAIT_DATA_SIZE;
}
break;
}
case WAIT_DATA_SIZE: {
cli_dbgmsg("RTF: in WAIT_DATA_SIZE\n");
if (data->bread == 0)
data->desc_len = 0;
for (i = 0; i < out_cnt && data->bread < 4; i++, data->bread++)
data->desc_len |= ((size_t)out_data[i]) << (8 * data->bread);
out_cnt -= i;
if (data->bread == 4) {
out_data += i;
data->bread = 0;
cli_dbgmsg("Dumping rtf embedded object of size:%lu\n", (unsigned long int)data->desc_len);
if ((ret = cli_gentempfd(data->tmpdir, &data->name, &data->fd)))
return ret;
data->internal_state = DUMP_DATA;
cli_dbgmsg("RTF: next state: DUMP_DATA\n");
}
break;
}
case DUMP_DATA: {
size_t out_want = (out_cnt < data->desc_len) ? out_cnt : data->desc_len;
if (!data->bread) {
if (out_data[0] != 0xd0 || out_data[1] != 0xcf) {
/* this is not an ole2 doc, but some ole (stream?) to be
* decoded by cli_decode_ole_object*/
char out[4];
data->bread = 1; /* flag to indicate this needs to be scanned with cli_decode_ole_object*/
cli_writeint32(out, data->desc_len);
if (cli_writen(data->fd, out, 4) != 4)
return CL_EWRITE;
} else
data->bread = 2;
}
data->desc_len -= out_want;
if (cli_writen(data->fd, out_data, out_want) != out_want) {
return CL_EWRITE;
}
out_data += out_want;
out_cnt -= out_want;
if (!data->desc_len) {
int rc;
if ((rc = decode_and_scan(data, data->ctx)))
return rc;
data->bread = 0;
data->internal_state = WAIT_MAGIC;
}
break;
}
case DUMP_DISCARD:
default:
out_cnt = 0;
;
}
}
return 0;
}
/*
 * cb_end callback for object data: scan whatever has been dumped so far and
 * release everything rtf_object_begin()/rtf_object_process() allocated.
 *
 * state->cb_data is set to NULL afterwards, so a second call is a no-op.
 * Returns 0 when there was nothing to do, otherwise the result of
 * decode_and_scan().
 */
static int rtf_object_end(struct rtf_state* state, cli_ctx* ctx)
{
    struct rtf_object_data* data = state->cb_data;
    int rc                       = 0;

    if (!data)
        return 0;
    /* -1 means "no temporary file"; descriptor 0 is valid and must be handled too */
    if (data->fd >= 0) {
        rc = decode_and_scan(data, ctx);
    }
    /* free(NULL) is a no-op, so no guards are needed */
    free(data->name);
    free(data->desc_name);
    free(data);
    state->cb_data = NULL;
    return rc;
}
/*
 * Apply the action attached to a recognised control word.
 *
 * RTF_OBJECT only records that an object group was seen. RTF_OBJECT_DATA
 * installs the object callbacks, but only if RTF_OBJECT was recorded before.
 * Any other action value is ignored.
 */
static void rtf_action(struct rtf_state* state, long action)
{
    const int object_seen = 1 << RTF_OBJECT;

    if (action == RTF_OBJECT) {
        state->encounteredTopLevel |= object_seen;
        return;
    }

    if (action == RTF_OBJECT_DATA && (state->encounteredTopLevel & object_seen)) {
        state->cb_begin   = rtf_object_begin;
        state->cb_process = rtf_object_process;
        state->cb_end     = rtf_object_end;
    }
}
/*
 * Unwind every state still saved on the stack, giving each one that owns
 * callback data the chance to release it through its cb_end callback.
 *
 * Does nothing when stack is NULL or has no storage. Return values of the
 * callbacks are ignored.
 */
static void cleanup_stack(struct stack* stack, struct rtf_state* state, cli_ctx* ctx)
{
    if (stack == NULL || stack->states == NULL)
        return;

    while (stack->stack_cnt != 0) {
        pop_state(stack, state);
        if (state->cb_data == NULL || state->cb_end == NULL)
            continue;
        state->cb_end(state, ctx);
    }
}
/*
 * Common teardown for cli_scanrtf(). Relies on that function's locals:
 * state, ret, ctx, actiontable, stack and tempname.
 *
 * If the current state still owns callback data, its cb_end is run and its
 * result REPLACES ret. Then the action table, any stacked states, the
 * temporary directory and the stack storage are released.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and its
 * if/else cannot pair with a conditional at the call site.
 */
#define SCAN_CLEANUP                            \
    do {                                        \
        if (state.cb_data && state.cb_end)      \
            ret = state.cb_end(&state, ctx);    \
        tableDestroy(actiontable);              \
        cleanup_stack(&stack, &state, ctx);     \
        if (!ctx->engine->keeptmp)              \
            cli_rmdirs(tempname);               \
        else                                    \
            rmdir(tempname);                    \
        free(tempname);                         \
        free(stack.states);                     \
    } while (0)
/*
 * Scan an RTF document for embedded objects.
 *
 * The file map in ctx is read in chunks of up to BUFF_SIZE bytes and run
 * through a tokenizer that tracks group nesting ('{' / '}') and control words.
 * When an action-table control word installs callbacks on the current state,
 * the plain data of that group is passed to them; the object callbacks dump
 * the data to a temporary directory created under ctx->sub_tmpdir and scan it.
 *
 * Returns CL_CLEAN (or the result of the last cb_end run during cleanup) when
 * the whole input was consumed, CL_EMEM / CL_ETMPDIR on setup failures, or
 * the non-zero code returned by a callback or by push_state()/pop_state().
 */
int cli_scanrtf(cli_ctx* ctx)
{
    char* tempname;
    const unsigned char* ptr;
    const unsigned char* ptr_end;
    int ret = CL_CLEAN;
    struct rtf_state state;
    struct stack stack;
    size_t bread;
    table_t* actiontable;
    uint8_t main_symbols[256];
    size_t offset = 0;

    cli_dbgmsg("in cli_scanrtf()\n");

    /* bytes that end a run of plain data in PARSE_MAIN */
    memset(main_symbols, 0, 256);
    main_symbols['{']  = 1;
    main_symbols['}']  = 1;
    main_symbols['\\'] = 1;

    stack.stack_cnt  = 0;
    stack.stack_size = 16;
    stack.elements   = 0;
    stack.warned     = 0;
    stack.states     = cli_malloc(stack.stack_size * sizeof(*stack.states));
    if (!stack.states) {
        cli_errmsg("ScanRTF: Unable to allocate memory for stack states\n");
        return CL_EMEM;
    }

    if (!(tempname = cli_gentemp_with_prefix(ctx->sub_tmpdir, "rtf-tmp"))) {
        /* the stack storage was already allocated; do not leak it */
        free(stack.states);
        return CL_EMEM;
    }

    if (mkdir(tempname, 0700)) {
        cli_dbgmsg("ScanRTF -> Can't create temporary directory %s\n", tempname);
        free(stack.states);
        free(tempname);
        return CL_ETMPDIR;
    }

    actiontable = tableCreate();
    if (!actiontable) {
        /* never hand a NULL table to load_actions()/tableDestroy() */
        cli_errmsg("ScanRTF: Unable to allocate the rtf action table\n");
        free(stack.states);
        if (!ctx->engine->keeptmp)
            cli_rmdirs(tempname);
        free(tempname);
        return CL_EMEM;
    }
    if ((ret = load_actions(actiontable))) {
        cli_dbgmsg("RTF: Unable to load rtf action table\n");
        free(stack.states);
        if (!ctx->engine->keeptmp)
            cli_rmdirs(tempname);
        free(tempname);
        tableDestroy(actiontable);
        return ret;
    }

    init_rtf_state(&state);

    for (offset = 0; (ptr = fmap_need_off_once_len(ctx->fmap, offset, BUFF_SIZE, &bread)) && bread; offset += bread) {
        ptr_end = ptr + bread;
        while (ptr < ptr_end) {
            switch (state.parse_state) {
                case PARSE_MAIN:
                    switch (*ptr++) {
                        case '{':
                            if ((ret = push_state(&stack, &state))) {
                                cli_dbgmsg("RTF:Push failure!\n");
                                SCAN_CLEANUP;
                                return ret;
                            }
                            break;
                        case '}':
                            /* the group ends: let its callback finish first */
                            if (state.cb_data && state.cb_end)
                                if ((ret = state.cb_end(&state, ctx))) {
                                    SCAN_CLEANUP;
                                    return ret;
                                }
                            if ((ret = pop_state(&stack, &state))) {
                                cli_dbgmsg("RTF:pop failure!\n");
                                SCAN_CLEANUP;
                                return ret;
                            }
                            break;
                        case '\\':
                            state.parse_state = PARSE_CONTROL_;
                            break;
                        default:
                            ptr--;
                            {
                                /* take the run of plain data up to the next
                                 * delimiter (or the end of the chunk) */
                                size_t i;
                                size_t left = ptr_end - ptr;
                                size_t use  = left;
                                for (i = 1; i < left; i++)
                                    if (main_symbols[ptr[i]]) {
                                        use = i;
                                        break;
                                    }
                                if (state.cb_begin) {
                                    /* cb_begin runs lazily, once per group */
                                    if (!state.cb_data)
                                        if ((ret = state.cb_begin(&state, ctx, tempname))) {
                                            SCAN_CLEANUP;
                                            return ret;
                                        }
                                    if ((ret = state.cb_process(&state, ptr, use))) {
                                        if (state.cb_end) {
                                            state.cb_end(&state, ctx);
                                        }
                                        SCAN_CLEANUP;
                                        return ret;
                                    }
                                }
                                ptr += use;
                            }
                    }
                    break;
                case PARSE_CONTROL_:
                    /* decide without consuming the byte */
                    if (isalpha(*ptr)) {
                        state.parse_state     = PARSE_CONTROL_WORD;
                        state.controlword_cnt = 0;
                    } else
                        state.parse_state = PARSE_CONTROL_SYMBOL;
                    break;
                case PARSE_CONTROL_SYMBOL:
                    ptr++; /* Do nothing */
                    state.parse_state = PARSE_MAIN;
                    break;
                case PARSE_CONTROL_WORD:
                    if (state.controlword_cnt == 32) {
                        cli_dbgmsg("Invalid control word: maximum size exceeded:%s\n", state.controlword);
                        state.parse_state = PARSE_MAIN;
                    } else if (isalpha(*ptr))
                        state.controlword[state.controlword_cnt++] = *ptr++;
                    else {
                        if (isspace(*ptr)) {
                            /* the delimiting whitespace becomes part of the
                             * looked-up word */
                            state.controlword[state.controlword_cnt++] = *ptr++;
                            state.parse_state                          = PARSE_INTERPRET_CONTROLWORD;
                        } else if (isdigit(*ptr)) {
                            state.parse_state            = PARSE_CONTROL_WORD_PARAM;
                            state.controlword_param      = 0;
                            state.controlword_param_sign = 1;
                        } else if (*ptr == '-') {
                            ptr++;
                            state.parse_state            = PARSE_CONTROL_WORD_PARAM;
                            state.controlword_param      = 0;
                            state.controlword_param_sign = -1;
                        } else {
                            state.parse_state = PARSE_INTERPRET_CONTROLWORD;
                        }
                    }
                    break;
                case PARSE_CONTROL_WORD_PARAM:
                    if (isdigit(*ptr)) {
                        /* refuse digits that would overflow int64_t */
                        if (((state.controlword_param) > INT64_MAX / 10) ||
                            (state.controlword_param * 10 > INT64_MAX - (*ptr - '0'))) {
                            cli_dbgmsg("Invalid control word param: maximum size exceeded.\n");
                            state.parse_state = PARSE_MAIN;
                        } else {
                            state.controlword_param = state.controlword_param * 10 + (*ptr - '0');
                            ptr++;
                        }
                    } else if (isalpha(*ptr)) {
                        ptr++;
                    } else {
                        if (state.controlword_param_sign < 0)
                            state.controlword_param = -state.controlword_param;
                        state.parse_state = PARSE_INTERPRET_CONTROLWORD;
                    }
                    break;
                case PARSE_INTERPRET_CONTROLWORD: {
                    int action;

                    state.controlword[state.controlword_cnt] = '\0';
                    action                                   = tableFind(actiontable, state.controlword);
                    if (action != -1) {
                        if (state.cb_data && state.cb_end) { /* premature end of previous block */
                            state.cb_end(&state, ctx);
                            state.cb_begin = NULL;
                            state.cb_end   = NULL;
                            state.cb_data  = NULL;
                        }
                        rtf_action(&state, action);
                    }
                    state.parse_state = PARSE_MAIN;
                    break;
                }
            }
        }
    }

    SCAN_CLEANUP;
    return ret;
}