denyhosts/clamscan/libclamav/regex_suffix.c
2022-10-22 18:41:00 +08:00

468 lines
14 KiB
C

/*
* Parse a regular expression, and extract a static suffix.
*
* Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
* Copyright (C) 2007-2013 Sourcefire, Inc.
*
* Authors: Török Edvin
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "clamav.h"
#include "others.h"
#include "jsparse/textbuf.h"
#include "regex_suffix.h"
#define MODULE "regex_suffix: "
enum node_type {
root = 0,
concat,
alternate, /* | */
optional, /* ?, * */
leaf, /* a character */
leaf_class /* character class */
/* (x)+ is transformed into (x)*(x) */
};
struct node {
enum node_type type; /* must be first field */
struct node *parent;
union {
struct {
struct node *left;
struct node *right;
} children;
uint8_t *leaf_class_bitmap;
uint8_t leaf_char;
} u;
};
/* --- Prototypes --*/
static cl_error_t build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex);
/* -----------------*/
static uint8_t dot_bitmap[32] = {
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
static struct node *make_node(enum node_type type, struct node *left, struct node *right)
{
struct node *n;
if (type == concat) {
if (left == NULL)
return right;
if (right == NULL)
return left;
}
n = cli_malloc(sizeof(*n));
if (!n) {
cli_errmsg("make_node: Unable to allocate memory for new node\n");
return NULL;
}
n->type = type;
n->parent = NULL;
n->u.children.left = left;
n->u.children.right = right;
if (left)
left->parent = n;
if (right)
right->parent = n;
return n;
}
static struct node *dup_node(struct node *p)
{
struct node *node_left, *node_right;
struct node *d;
if (!p)
return NULL;
d = cli_malloc(sizeof(*d));
if (!d) {
cli_errmsg("dup_node: Unable to allocate memory for duplicate node\n");
return NULL;
}
d->type = p->type;
d->parent = NULL;
switch (p->type) {
case leaf:
d->u.leaf_char = p->u.leaf_char;
break;
case leaf_class:
d->u.leaf_class_bitmap = cli_malloc(32);
if (!d->u.leaf_class_bitmap) {
cli_errmsg("make_node: Unable to allocate memory for leaf class\n");
free(d);
return NULL;
}
memcpy(d->u.leaf_class_bitmap, p->u.leaf_class_bitmap, 32);
break;
default:
node_left = dup_node(p->u.children.left);
node_right = dup_node(p->u.children.right);
d->u.children.left = node_left;
d->u.children.right = node_right;
if (node_left)
node_left->parent = d;
if (node_right)
node_right->parent = d;
break;
}
return d;
}
static struct node *make_charclass(uint8_t *bitmap)
{
struct node *v = cli_malloc(sizeof(*v));
if (!v) {
cli_errmsg("make_charclass: Unable to allocate memory for character class\n");
return NULL;
}
v->type = leaf_class;
v->parent = NULL;
v->u.leaf_class_bitmap = bitmap;
return v;
}
static struct node *make_leaf(char c)
{
struct node *v = cli_malloc(sizeof(*v));
if (!v)
return NULL;
v->type = leaf;
v->parent = NULL;
v->u.leaf_char = c;
return v;
}
static void destroy_tree(struct node *n)
{
if (!n)
return;
switch (n->type) {
case concat:
case alternate:
case optional:
destroy_tree(n->u.children.left);
destroy_tree(n->u.children.right);
break;
case leaf_class:
if (n->u.leaf_class_bitmap != dot_bitmap)
free(n->u.leaf_class_bitmap);
break;
case root:
case leaf:
break;
}
free(n);
}
static uint8_t *parse_char_class(const char *pat, size_t *pos)
{
unsigned char range_start = 0;
int hasprev = 0;
uint8_t *bitmap = cli_malloc(32);
if (!bitmap) {
cli_errmsg("parse_char_class: Unable to allocate memory for bitmap\n");
return NULL;
}
if (pat[*pos] == '^') {
memset(bitmap, 0xFF, 32); /*match chars not in brackets*/
++*pos;
} else
memset(bitmap, 0x00, 32);
do {
/* literal ] can be first character, so test for it at the end of the loop, for example: []] */
if (pat[*pos] == '-' && hasprev) {
/* it is a range*/
unsigned char range_end;
unsigned int c;
assert(range_start);
++*pos;
if (pat[*pos] == '[')
if (pat[*pos + 1] == '.') {
/* collating sequence not handled */
free(bitmap);
/* we are parsing the regex for a
* filter, be conservative and
* tell the filter that anything could
* match here */
while (pat[*pos] != ']') ++*pos;
++*pos;
while (pat[*pos] != ']') ++*pos;
return dot_bitmap;
} else
range_end = pat[*pos];
else
range_end = pat[*pos];
for (c = range_start + 1; c <= range_end; c++)
bitmap[c >> 3] ^= 1 << (c & 0x7);
hasprev = 0;
} else if (pat[*pos] == '[' && pat[*pos] == ':') {
/* char class */
free(bitmap);
while (pat[*pos] != ']') ++*pos;
++*pos;
while (pat[*pos] != ']') ++*pos;
return dot_bitmap;
} else {
bitmap[pat[*pos] >> 3] ^= 1 << (pat[*pos] & 0x7);
range_start = pat[*pos];
++*pos;
hasprev = 1;
}
} while (pat[*pos] != ']');
return bitmap;
}
static struct node *parse_regex(const char *p, size_t *last)
{
struct node *v = NULL;
struct node *right;
struct node *tmp;
while (p[*last] != '$' && p[*last] != '\0') {
switch (p[*last]) {
case '|':
++*last;
right = parse_regex(p, last);
v = make_node(alternate, v, right);
if (!v)
return NULL;
break;
case '*':
case '?':
v = make_node(optional, v, NULL);
if (!v)
return NULL;
++*last;
break;
case '+':
/* (x)* */
tmp = make_node(optional, v, NULL);
if (!tmp)
return NULL;
/* (x) */
right = dup_node(v);
if (!right) {
free(tmp);
return NULL;
}
/* (x)*(x) => (x)+ */
v = make_node(concat, tmp, right);
if (!v)
return NULL;
++*last;
break;
case '(':
++*last;
right = parse_regex(p, last);
if (!right)
return NULL;
++*last;
v = make_node(concat, v, right);
break;
case ')':
return v;
case '.':
right = make_charclass(dot_bitmap);
if (!right)
return NULL;
v = make_node(concat, v, right);
if (!v)
return NULL;
++*last;
break;
case '[':
++*last;
right = make_charclass(parse_char_class(p, last));
if (!right)
return NULL;
v = make_node(concat, v, right);
if (!v)
return NULL;
++*last;
break;
case '\\':
/* next char is escaped, advance pointer
* and let fall-through handle it */
++*last;
default:
right = make_leaf(p[*last]);
v = make_node(concat, v, right);
if (!v)
return NULL;
++*last;
break;
}
}
return v;
}
#define BITMAP_HASSET(b, i) (b[i >> 3] & (1 << (i & 7)))
static cl_error_t build_suffixtree_ascend(struct node *n, struct text_buffer *buf, struct node *prev, suffix_callback cb, void *cbdata, struct regex_list *regex)
{
size_t i, cnt;
while (n) {
struct node *q = n;
switch (n->type) {
case root:
textbuffer_putc(buf, '\0');
if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
return CL_EMEM;
return CL_SUCCESS;
case leaf:
textbuffer_putc(buf, n->u.leaf_char);
n = n->parent;
break;
case leaf_class:
cnt = 0;
for (i = 0; i < 255; i++)
if (BITMAP_HASSET(n->u.leaf_class_bitmap, i))
cnt++;
if (cnt > 16) {
textbuffer_putc(buf, '\0');
if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
return CL_EMEM;
return CL_SUCCESS;
}
/* handle small classes by expanding */
for (i = 0; i < 255; i++) {
if (BITMAP_HASSET(n->u.leaf_class_bitmap, i)) {
size_t pos;
pos = buf->pos;
textbuffer_putc(buf, (char)i);
if (build_suffixtree_ascend(n->parent, buf, n, cb, cbdata, regex) != CL_SUCCESS)
return CL_EMEM;
buf->pos = pos;
}
}
return 0;
case concat:
if (prev != n->u.children.left) {
if (build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) != CL_SUCCESS)
return CL_EMEM;
/* we're done here, descend will call
* ascend if needed */
return CL_SUCCESS;
} else {
n = n->parent;
}
break;
case alternate:
n = n->parent;
break;
case optional:
textbuffer_putc(buf, '\0');
if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
return CL_EMEM;
return CL_SUCCESS;
}
prev = q;
}
return CL_SUCCESS;
}
static cl_error_t build_suffixtree_descend(struct node *n, struct text_buffer *buf, suffix_callback cb, void *cbdata, struct regex_list *regex)
{
size_t pos;
while (n && n->type == concat) {
n = n->u.children.right;
}
if (!n)
return CL_SUCCESS;
/* find out end of the regular expression,
* if it ends with a static pattern */
switch (n->type) {
case alternate:
/* save pos as restart point */
pos = buf->pos;
if (build_suffixtree_descend(n->u.children.left, buf, cb, cbdata, regex) != CL_SUCCESS)
return CL_EMEM;
buf->pos = pos;
if (build_suffixtree_descend(n->u.children.right, buf, cb, cbdata, regex) != CL_SUCCESS)
return CL_EMEM;
buf->pos = pos;
break;
case optional:
textbuffer_putc(buf, '\0');
if (cb(cbdata, buf->data, buf->pos - 1, regex) != CL_SUCCESS)
return CL_EMEM;
return CL_SUCCESS;
case leaf:
case leaf_class:
if (build_suffixtree_ascend(n, buf, NULL, cb, cbdata, regex) != CL_SUCCESS)
return CL_EMEM;
return CL_SUCCESS;
default:
break;
}
return CL_SUCCESS;
}
cl_error_t cli_regex2suffix(const char *pattern, regex_t *preg, suffix_callback cb, void *cbdata)
{
struct regex_list regex;
struct text_buffer buf;
struct node root_node;
struct node *n;
size_t last = 0;
int rc;
assert(pattern);
regex.preg = preg;
rc = cli_regcomp(regex.preg, pattern, REG_EXTENDED);
if (rc) {
size_t buflen = cli_regerror(rc, regex.preg, NULL, 0);
char *errbuf = cli_malloc(buflen);
if (errbuf) {
cli_regerror(rc, regex.preg, errbuf, buflen);
cli_errmsg(MODULE "Error compiling regular expression %s: %s\n", pattern, errbuf);
free(errbuf);
} else {
cli_errmsg(MODULE "Error compiling regular expression: %s\n", pattern);
}
return rc;
}
regex.nxt = NULL;
regex.pattern = cli_strdup(pattern);
n = parse_regex(pattern, &last);
if (!n)
return REG_ESPACE;
memset(&buf, 0, sizeof(buf));
memset(&root_node, 0, sizeof(buf));
n->parent = &root_node;
rc = build_suffixtree_descend(n, &buf, cb, cbdata, &regex);
free(regex.pattern);
free(buf.data);
destroy_tree(n);
return rc;
}