denyhosts/clamav/libclamav/textnorm.c

159 lines
5.6 KiB
C

/*
* Generic text normalizer.
*
* Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
* Copyright (C) 2008-2013 Sourcefire, Inc.
*
* Authors: Török Edvin
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include "clamav.h"
#include "textnorm.h"
#include "bignum.h"
/* Attaches the output buffer @out (capacity @out_len bytes) to @state and
 * clears the normalizer's progress: nothing written yet, and no whitespace
 * run in progress. @out itself is stored as given and not inspected.
 * Returns CL_ENULLARG if @state is NULL, CL_SUCCESS otherwise. */
int text_normalize_init(struct text_norm_state *state, unsigned char *out, size_t out_len)
{
    if (NULL == state) {
        return CL_ENULLARG;
    }

    /* progress counters first, then the buffer binding */
    state->space_written = 0;
    state->out_pos       = 0;
    state->out_len       = out_len;
    state->out           = out;

    return CL_SUCCESS;
}
/* Rewinds @state so that the next normalization writes from the start of the
 * output buffer again. The buffer pointer and its capacity are left untouched.
 * A NULL @state is ignored, matching the NULL check done by
 * text_normalize_init(). */
void text_normalize_reset(struct text_norm_state *state)
{
    if (!state) {
        return;
    }
    state->out_pos       = 0;
    state->space_written = 0;
}
/* What the normalizer does with a single input byte. */
enum normalize_action {
    NORMALIZE_COPY          = 0, /* emit the byte unchanged */
    NORMALIZE_SKIP          = 1, /* drop the byte */
    NORMALIZE_AS_WHITESPACE = 2, /* treat the byte as whitespace */
    NORMALIZE_ADD_32        = 3  /* emit the byte value plus 32 */
};
/* Short aliases for the enum normalize_action values, so that each row of
 * the lookup table below fits on one line (16 entries per row). */
#define IGN NORMALIZE_SKIP
#define WSP NORMALIZE_AS_WHITESPACE
#define A32 NORMALIZE_ADD_32
#define NOP NORMALIZE_COPY
/*
* Action lookup table, indexed by the value of the input byte (0x00 - 0xff).
*
* whitespace: \t, \n, \f, \v, \r, [ ]
* nop: all characters 0x20 < c < 0x80, that are not A32 and WSP
* tolowercase: all uppercase characters ('A' - 'Z', 0x41 - 0x5a)
* ignore: control characters < 0x20 that are not whitespace, and all > 0x7f
*/
static const enum normalize_action char_action[256] = {
IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, WSP, WSP, WSP, WSP, WSP, IGN, IGN, /* 0x00 - 0x0f; 0x09 - 0x0d are \t \n \v \f \r */
IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0x10 - 0x1f */
WSP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, /* 0x20 - 0x2f */
NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, /* 0x30 - 0x3f */
NOP, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, /* 0x40 - 0x4f; '@', then 'A' - 'O' */
A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, NOP, NOP, NOP, NOP, NOP, /* 0x50 - 0x5f; 'P' - 'Z', then 0x5b - 0x5f */
NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, /* 0x60 - 0x6f */
NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, /* 0x70 - 0x7f */
IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0x80 - 0x8f */
IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0x90 - 0x9f */
IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xa0 - 0xaf */
IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xb0 - 0xbf */
IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xc0 - 0xcf */
IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xd0 - 0xdf */
IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, /* 0xe0 - 0xef */
IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN}; /* 0xf0 - 0xff */
/* Normalizes the text at @buf of length @buf_len, @buf can include \0 characters.
* Stores the normalized text in @state's buffer.
* Returns how many bytes it consumed of the input. */
size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len)
{
size_t i;
const unsigned char *out_end = state->out + state->out_len;
unsigned char *p = state->out + state->out_pos;
for (i = 0; i < buf_len && p < out_end; i++) {
unsigned char c = buf[i];
switch (char_action[c]) {
case NORMALIZE_SKIP:
continue;
case NORMALIZE_AS_WHITESPACE:
/* convert consecutive whitespaces to a single space */
if (!state->space_written) {
*p++ = ' ';
}
state->space_written = 1;
continue;
case NORMALIZE_ADD_32:
/* aka uppercase to lowercase */
c += 32;
/* fall through */
case NORMALIZE_COPY:
state->space_written = 0;
*p++ = c;
}
}
state->out_pos = p - state->out;
return i;
}
/* Normalizes the contents of @map, starting at @offset, into @state's buffer.
 * The map is read in chunks of at most map->pgsz bytes, and at most
 * state->out_len input bytes are consumed in total.
 * Returns the number of input bytes consumed from the map; the amount of
 * output produced is tracked in state->out_pos. Returns 0 when @state or
 * @map is NULL, or when @offset is at or beyond the end of the map. */
size_t text_normalize_map(struct text_norm_state *state, fmap_t *map, size_t offset)
{
    const unsigned char *map_loc;
    unsigned int map_pgsz;
    uint64_t map_len;
    size_t buff_len;
    size_t acc;
    size_t acc_total = 0;
    size_t acc_len;

    if (!state || !map) {
        return 0;
    }

    map_len  = map->len;
    map_pgsz = map->pgsz;
    buff_len = state->out_len;

    /* Checking the offset explicitly keeps map_len - offset from wrapping
     * around when the caller passes an offset beyond the end of the map. */
    while (offset < map_len) {
        /* Break out if the input budget (one buffer's worth) is used up. */
        acc_len = MIN_3(map_pgsz, map_len - offset, buff_len - acc_total);
        if (!acc_len) {
            break;
        }

        /* If map_loc is NULL, then there's nothing left to do but recover. */
        map_loc = fmap_need_off_once(map, offset, acc_len);
        if (!map_loc) {
            break;
        }
        offset += acc_len;

        /* Nothing was consumed, so the output buffer has no room left. */
        acc = text_normalize_buffer(state, map_loc, acc_len);
        if (!acc) {
            break;
        }
        acc_total += acc;
    }

    return acc_total;
}