2022-10-22 18:41:00 +08:00
/*
* Copyright ( C ) 2013 - 2022 Cisco Systems , Inc . and / or its affiliates . All rights reserved .
* Copyright ( C ) 2007 - 2013 Sourcefire , Inc .
*
* Authors : Tomasz Kojm
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* You should have received a copy of the GNU General Public License
* along with this program ; if not , write to the Free Software
* Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston ,
* MA 02110 - 1301 , USA .
*/
# if HAVE_CONFIG_H
# include "clamav-config.h"
# endif
# include <stdio.h>
# include <string.h>
# include <stdlib.h>
# include <sys/types.h>
# ifdef HAVE_UNISTD_H
# include <unistd.h>
# endif
# include "clamav.h"
# include "filetypes.h"
# include "others.h"
# include "readdb.h"
# include "matcher-ac.h"
# include "str.h"
# include "textdet.h"
# include "default.h"
# include "iowrap.h"
# include "mbr.h"
# include "gpt.h"
# include "ooxml.h"
# include "htmlnorm.h"
# include "entconv.h"
# include "mpool.h"
# define UNZIP_PRIVATE
# include "unzip.h"
# include "is_tar.h"
// clang-format off
/*
 * Lookup table pairing the textual name of every known file type with its
 * cli_file_t code.  cli_ftcode() and cli_ftname() scan it linearly from the
 * top and stop at the first match, so the order of the entries matters when
 * a code or a name appears more than once.  The table is terminated by an
 * entry whose name is NULL.
 */
static const struct ftmap_s {
/* textual type name; NULL only in the terminating entry */
const char * name ;
/* file type code associated with the name */
cli_file_t code ;
} ftmap [ ] = {
{ " CL_TYPE_TEXT_ASCII " , CL_TYPE_TEXT_ASCII } ,
{ " CL_TYPE_TEXT_UTF8 " , CL_TYPE_TEXT_UTF8 } ,
{ " CL_TYPE_TEXT_UTF16LE " , CL_TYPE_TEXT_UTF16LE } ,
{ " CL_TYPE_TEXT_UTF16BE " , CL_TYPE_TEXT_UTF16BE } ,
{ " CL_TYPE_BINARY_DATA " , CL_TYPE_BINARY_DATA } ,
{ " CL_TYPE_IGNORED " , CL_TYPE_IGNORED } ,
{ " CL_TYPE_ANY " , CL_TYPE_ANY } ,
{ " CL_TYPE_MSEXE " , CL_TYPE_MSEXE } ,
{ " CL_TYPE_ELF " , CL_TYPE_ELF } ,
{ " CL_TYPE_MACHO " , CL_TYPE_MACHO } ,
{ " CL_TYPE_MACHO_UNIBIN " , CL_TYPE_MACHO_UNIBIN } ,
{ " CL_TYPE_POSIX_TAR " , CL_TYPE_POSIX_TAR } ,
{ " CL_TYPE_OLD_TAR " , CL_TYPE_OLD_TAR } ,
{ " CL_TYPE_CPIO_OLD " , CL_TYPE_CPIO_OLD } ,
{ " CL_TYPE_CPIO_ODC " , CL_TYPE_CPIO_ODC } ,
{ " CL_TYPE_CPIO_NEWC " , CL_TYPE_CPIO_NEWC } ,
{ " CL_TYPE_CPIO_CRC " , CL_TYPE_CPIO_CRC } ,
{ " CL_TYPE_GZ " , CL_TYPE_GZ } ,
{ " CL_TYPE_ZIP " , CL_TYPE_ZIP } ,
{ " CL_TYPE_BZ " , CL_TYPE_BZ } ,
{ " CL_TYPE_RAR " , CL_TYPE_RAR } ,
{ " CL_TYPE_ARJ " , CL_TYPE_ARJ } ,
{ " CL_TYPE_MSSZDD " , CL_TYPE_MSSZDD } ,
{ " CL_TYPE_MSOLE2 " , CL_TYPE_MSOLE2 } ,
{ " CL_TYPE_MSCAB " , CL_TYPE_MSCAB } ,
{ " CL_TYPE_MSCHM " , CL_TYPE_MSCHM } ,
{ " CL_TYPE_SIS " , CL_TYPE_SIS } ,
{ " CL_TYPE_SCRENC " , CL_TYPE_SCRENC } ,
{ " CL_TYPE_GRAPHICS " , CL_TYPE_GRAPHICS } ,
{ " CL_TYPE_GIF " , CL_TYPE_GIF } ,
{ " CL_TYPE_PNG " , CL_TYPE_PNG } ,
{ " CL_TYPE_JPEG " , CL_TYPE_JPEG } ,
{ " CL_TYPE_TIFF " , CL_TYPE_TIFF } ,
{ " CL_TYPE_RIFF " , CL_TYPE_RIFF } ,
{ " CL_TYPE_BINHEX " , CL_TYPE_BINHEX } ,
{ " CL_TYPE_TNEF " , CL_TYPE_TNEF } ,
{ " CL_TYPE_CRYPTFF " , CL_TYPE_CRYPTFF } ,
{ " CL_TYPE_PDF " , CL_TYPE_PDF } ,
{ " CL_TYPE_UUENCODED " , CL_TYPE_UUENCODED } ,
{ " CL_TYPE_HTML_UTF16 " , CL_TYPE_HTML_UTF16 } ,
{ " CL_TYPE_SCRIPT " , CL_TYPE_SCRIPT } ,
{ " CL_TYPE_RTF " , CL_TYPE_RTF } ,
{ " CL_TYPE_HTML " , CL_TYPE_HTML } ,
{ " CL_TYPE_MAIL " , CL_TYPE_MAIL } ,
{ " CL_TYPE_SFX " , CL_TYPE_SFX } ,
{ " CL_TYPE_ZIPSFX " , CL_TYPE_ZIPSFX } ,
{ " CL_TYPE_RARSFX " , CL_TYPE_RARSFX } ,
{ " CL_TYPE_CABSFX " , CL_TYPE_CABSFX } ,
{ " CL_TYPE_ARJSFX " , CL_TYPE_ARJSFX } ,
{ " CL_TYPE_NULSFT " , CL_TYPE_NULSFT } ,
{ " CL_TYPE_AUTOIT " , CL_TYPE_AUTOIT } ,
{ " CL_TYPE_ISHIELD_MSI " , CL_TYPE_ISHIELD_MSI } ,
{ " CL_TYPE_7Z " , CL_TYPE_7Z } ,
{ " CL_TYPE_7ZSFX " , CL_TYPE_7ZSFX } ,
{ " CL_TYPE_SWF " , CL_TYPE_SWF } ,
{ " CL_TYPE_ISO9660 " , CL_TYPE_ISO9660 } ,
{ " CL_TYPE_JAVA " , CL_TYPE_JAVA } ,
{ " CL_TYPE_DMG " , CL_TYPE_DMG } ,
{ " CL_TYPE_MBR " , CL_TYPE_MBR } ,
{ " CL_TYPE_GPT " , CL_TYPE_GPT } ,
{ " CL_TYPE_APM " , CL_TYPE_APM } ,
{ " CL_TYPE_XAR " , CL_TYPE_XAR } ,
{ " CL_TYPE_PART_ANY " , CL_TYPE_PART_ANY } ,
{ " CL_TYPE_PART_HFSPLUS " , CL_TYPE_PART_HFSPLUS } ,
{ " CL_TYPE_XZ " , CL_TYPE_XZ } ,
{ " CL_TYPE_OOXML_WORD " , CL_TYPE_OOXML_WORD } ,
{ " CL_TYPE_OOXML_PPT " , CL_TYPE_OOXML_PPT } ,
{ " CL_TYPE_OOXML_XL " , CL_TYPE_OOXML_XL } ,
{ " CL_TYPE_INTERNAL " , CL_TYPE_INTERNAL } ,
{ " CL_TYPE_XDP " , CL_TYPE_XDP } ,
{ " CL_TYPE_XML_WORD " , CL_TYPE_XML_WORD } ,
{ " CL_TYPE_XML_XL " , CL_TYPE_XML_XL } ,
{ " CL_TYPE_HWP3 " , CL_TYPE_HWP3 } ,
{ " CL_TYPE_XML_HWP " , CL_TYPE_XML_HWP } ,
{ " CL_TYPE_HWPOLE2 " , CL_TYPE_HWPOLE2 } ,
{ " CL_TYPE_OOXML_HWP " , CL_TYPE_OOXML_HWP } ,
{ " CL_TYPE_PS " , CL_TYPE_PS } ,
{ " CL_TYPE_MHTML " , CL_TYPE_MHTML } ,
{ " CL_TYPE_LNK " , CL_TYPE_LNK } ,
{ " CL_TYPE_EGG " , CL_TYPE_EGG } ,
{ " CL_TYPE_EGGSFX " , CL_TYPE_EGGSFX } ,
/* terminator: the lookup loops stop when they reach a NULL name */
{ NULL , CL_TYPE_IGNORED }
} ;
// clang-format on
cli_file_t cli_ftcode ( const char * name )
{
unsigned int i ;
for ( i = 0 ; ftmap [ i ] . name ; i + + )
if ( ! strcmp ( ftmap [ i ] . name , name ) )
return ftmap [ i ] . code ;
return CL_TYPE_ERROR ;
}
/**
 * Translate a file type code into its textual name.
 *
 * ftmap[] is searched in table order and the name of the first entry carrying
 * the requested code is returned.
 *
 * @param code  file type code to look up.
 * @return pointer to the name stored in ftmap[], or NULL when the code is not listed.
 */
const char *cli_ftname(cli_file_t code)
{
    const struct ftmap_s *entry;

    /* The table ends at the first entry whose name is NULL. */
    for (entry = ftmap; entry->name != NULL; entry++) {
        if (entry->code == code) {
            return entry->name;
        }
    }

    return NULL;
}
void cli_ftfree ( const struct cl_engine * engine )
{
struct cli_ftype * ftypes = engine - > ftypes , * pt ;
while ( ftypes ) {
pt = ftypes ;
ftypes = ftypes - > next ;
MPOOL_FREE ( engine - > mempool , pt - > magic ) ;
MPOOL_FREE ( engine - > mempool , pt - > tname ) ;
MPOOL_FREE ( engine - > mempool , pt ) ;
}
ftypes = engine - > ptypes ;
while ( ftypes ) {
pt = ftypes ;
ftypes = ftypes - > next ;
MPOOL_FREE ( engine - > mempool , pt - > magic ) ;
MPOOL_FREE ( engine - > mempool , pt - > tname ) ;
MPOOL_FREE ( engine - > mempool , pt ) ;
}
}
cli_file_t cli_compare_ftm_partition ( const unsigned char * buf , size_t buflen , const struct cl_engine * engine )
{
struct cli_ftype * ptype = engine - > ptypes ;
while ( ptype ) {
if ( ptype - > offset + ptype - > length < = buflen ) {
if ( ! memcmp ( buf + ptype - > offset , ptype - > magic , ptype - > length ) ) {
cli_dbgmsg ( " Recognized %s partition \n " , ptype - > tname ) ;
return ptype - > type ;
}
}
ptype = ptype - > next ;
}
cli_dbgmsg ( " Partition type is potentially unsupported \n " ) ;
return CL_TYPE_PART_ANY ;
}
cli_file_t cli_compare_ftm_file ( const unsigned char * buf , size_t buflen , const struct cl_engine * engine )
{
struct cli_ftype * ftype = engine - > ftypes ;
while ( ftype ) {
if ( ftype - > offset + ftype - > length < = buflen ) {
if ( ! memcmp ( buf + ftype - > offset , ftype - > magic , ftype - > length ) ) {
cli_dbgmsg ( " Recognized %s file \n " , ftype - > tname ) ;
return ftype - > type ;
}
}
ftype = ftype - > next ;
}
return cli_texttype ( buf , buflen ) ;
}
/* organize by length, cannot exceed SIZEOF_LOCAL_HEADER */
// clang-format off
/*
 * Entry names looked for right after a zip local file header by
 * cli_determine_fmap_type().  Entries are tried in table order.  An entry
 * whose type is not CL_TYPE_ZIP makes the caller return that type at once;
 * an entry typed CL_TYPE_ZIP only marks the archive as "likely OOXML", which
 * later triggers a cli_ooxml_filetype() check.  The table is terminated by
 * an entry whose name is NULL.
 */
const struct ooxml_ftcodes {
/* bytes compared (memcmp) against the data following the local header */
const char * entry ;
/* number of bytes of `entry` that are compared; must not exceed OOXML_DETECT_MAXLEN */
size_t len ;
/* resulting type; CL_TYPE_ZIP means "hint only, keep looking" */
cli_file_t type ;
} ooxml_detect [ ] = {
{ " xl/ " , 3 , CL_TYPE_OOXML_XL } ,
{ " ppt/ " , 4 , CL_TYPE_OOXML_PPT } ,
{ " word/ " , 5 , CL_TYPE_OOXML_WORD } ,
{ " BinData " , 7 , CL_TYPE_ZIP } , /* HWP */
{ " mimetype " , 8 , CL_TYPE_ZIP } , /* HWP */
{ " Contents " , 8 , CL_TYPE_ZIP } , /* HWP */
{ " docProps/ " , 9 , CL_TYPE_ZIP } , /* MS */
{ " customXml/ " , 10 , CL_TYPE_ZIP } , /* MS */
{ " version.xml " , 11 , CL_TYPE_ZIP } , /* HWP */
{ " settings.xml " , 12 , CL_TYPE_ZIP } , /* HWP */
{ " _.rels/.rels " , 12 , CL_TYPE_ZIP } , /* MS */
{ " [ContentTypes].xml " , 18 , CL_TYPE_ZIP } , /* MS */
{ " [Content_Types].xml " , 19 , CL_TYPE_ZIP } , /* MS */
{ " Preview/PrvText.txt " , 19 , CL_TYPE_ZIP } , /* HWP */
{ " Contents/content.hpf " , 20 , CL_TYPE_OOXML_HWP } ,
{ " META-INF/container.xml " , 22 , CL_TYPE_ZIP } , /* HWP */
/* terminator: the scan loop stops when it reaches a NULL entry */
{ NULL , 0 , CL_TYPE_ANY }
} ;
// clang-format on
/* Must equal the largest `len` value listed in ooxml_detect[]. */
#define OOXML_DETECT_MAXLEN 22

/*
 * OOXML_FTIDENTIFIED(type)
 *
 * Helper for cli_determine_fmap_type().  When `type` is one of the OOXML
 * types (XL, PPT, WORD, HWP) a debug message is logged and the macro
 * RETURNS that type FROM THE ENCLOSING FUNCTION.  CL_TYPE_ZIP is ignored
 * silently; any other value is only logged as unexpected and execution
 * continues after the macro.
 *
 * The argument is evaluated exactly once and stored in a local, so it is safe
 * to pass expressions with side effects or low-precedence operators.
 */
#define OOXML_FTIDENTIFIED(type)                                                      \
    do {                                                                              \
        const cli_file_t ooxml_ft_ = (type);                                          \
        if (ooxml_ft_ != CL_TYPE_ZIP) {                                               \
            switch (ooxml_ft_) {                                                      \
                case CL_TYPE_OOXML_XL:                                                \
                    cli_dbgmsg(" Recognized OOXML XL file \n ");                      \
                    return CL_TYPE_OOXML_XL;                                          \
                case CL_TYPE_OOXML_PPT:                                               \
                    cli_dbgmsg(" Recognized OOXML PPT file \n ");                     \
                    return CL_TYPE_OOXML_PPT;                                         \
                case CL_TYPE_OOXML_WORD:                                              \
                    cli_dbgmsg(" Recognized OOXML WORD file \n ");                    \
                    return CL_TYPE_OOXML_WORD;                                        \
                case CL_TYPE_OOXML_HWP:                                               \
                    cli_dbgmsg(" Recognized OOXML HWP file \n ");                     \
                    return CL_TYPE_OOXML_HWP;                                         \
                default:                                                              \
                    cli_dbgmsg(" unexpected ooxml_filetype return: %i \n ", ooxml_ft_); \
            }                                                                         \
        }                                                                             \
    } while (0)
/**
 * Determine the type of the file (or partition) behind an fmap.
 *
 * The leading bytes of the map are copied into a local buffer and matched
 * against the engine's magic signatures.  Depending on that first verdict
 * further checks are made:
 *   - CL_TYPE_BINARY_DATA: is_tar() is consulted for old-style / POSIX tar.
 *   - CL_TYPE_ZIP: up to 32 windows of the file are searched for zip local
 *     file headers and the entry names following them are compared with
 *     ooxml_detect[] to recognise OOXML / HWP documents.
 *   - CL_TYPE_MBR: cli_mbr_check() either confirms the MBR, retypes the data
 *     as GPT, or demotes it to binary data.
 *   - any type in [CL_TYPE_TEXT_ASCII, CL_TYPE_BINARY_DATA]: the file type
 *     signatures of engine->root[0] are run on the raw bytes, then on a
 *     UTF-16 -> ASCII decoding, then (if entity conversion is enabled in
 *     dconf) on a BOM-driven normalisation, in order to detect HTML.
 *
 * @param map       map to inspect; dereferenced unconditionally.
 * @param engine    engine providing signatures and configuration.
 * @param basetype  CL_TYPE_PART_ANY selects partition typing; every other
 *                  value selects file typing.
 * @return the detected type, or CL_TYPE_ERROR when engine is NULL or the
 *         required data cannot be mapped or copied.
 */
cli_file_t cli_determine_fmap_type(fmap_t *map, const struct cl_engine *engine, cli_file_t basetype)
{
    unsigned char buffer[MAGIC_BUFFER_SIZE];
    const unsigned char *buff;
    unsigned char *decoded;
    int bread;
    cli_file_t scan_ret;
    cli_file_t ret = CL_TYPE_BINARY_DATA;
    struct cli_matcher *root;
    struct cli_ac_data mdata;

    if (!engine) {
        cli_errmsg(" cli_determine_fmap_type: engine == NULL \n ");
        return CL_TYPE_ERROR;
    }

    if (basetype == CL_TYPE_PART_ANY) {
        bread = MIN(map->len, CL_PART_MBUFF_SIZE);
    } else {
        bread = MIN(map->len, CL_FILE_MBUFF_SIZE);
    }
    if (bread > MAGIC_BUFFER_SIZE) {
        /* Save anyone who tampered with the header */
        bread = MAGIC_BUFFER_SIZE;
    }

    buff = fmap_need_off_once(map, 0, bread);
    if (!buff) {
        return CL_TYPE_ERROR;
    }
    if (CL_SUCCESS != cli_memcpy(buffer, buff, bread)) {
        cli_errmsg(" cli_determine_fmap_type: fileread error! \n ");
        return CL_TYPE_ERROR;
    }
    /*
     * Work on the private copy from here on: further requests are made on the
     * same map below (fmap_need_off_once(), cli_ooxml_filetype()).
     * NOTE(review): presumably a pointer obtained from fmap_need_off_once()
     * is only guaranteed until the next fmap request - confirm in fmap.h.
     */
    buff = buffer;

    if (basetype == CL_TYPE_PART_ANY) { /* typing a partition */
        ret = cli_compare_ftm_partition(buff, bread, engine);
    } else { /* typing a file */
        ret = cli_compare_ftm_file(buff, bread, engine);

        if (ret == CL_TYPE_BINARY_DATA) {
            switch (is_tar(buff, bread)) {
                case 1:
                    cli_dbgmsg(" Recognized old fashioned tar file \n ");
                    return CL_TYPE_OLD_TAR;
                case 2:
                    cli_dbgmsg(" Recognized POSIX tar file \n ");
                    return CL_TYPE_POSIX_TAR;
                default:
                    break;
            }
        } else if (ret == CL_TYPE_ZIP && bread > 2 * (SIZEOF_LOCAL_HEADER + 5)) {
            const char lhdr_magic[4]    = {0x50, 0x4b, 0x03, 0x04};
            const unsigned char *zbuff  = buff;  /* start of the current window */
            uint32_t zread              = bread; /* size of the current window */
            uint64_t zoff               = bread; /* file offset just past the current window */
            const unsigned char *znamep = buff;  /* where the header search resumes */
            int32_t zlen                = bread; /* bytes left in the window from znamep */
            int lhc                     = 0;     /* local headers inspected so far */
            int zi, i, likely_ooxml = 0;
            cli_file_t ret2;

            for (zi = 0; zi < 32; zi++) {
                const unsigned char *lhdr =
                    (const unsigned char *)cli_memstr((const char *)znamep, zlen, lhdr_magic, 4);

                /* Stays NULL unless a header with a complete name window is found,
                 * which forces the next window to be mapped below. */
                znamep = NULL;

                if (NULL != lhdr) {
                    /* Bytes of the window from the header start onward.  The
                     * check is done before advancing so that no pointer past
                     * the end of the window is ever formed. */
                    int32_t avail = (int32_t)(zread - (uint32_t)(lhdr - zbuff));

                    if (avail > SIZEOF_LOCAL_HEADER + OOXML_DETECT_MAXLEN) {
                        znamep = lhdr + SIZEOF_LOCAL_HEADER;
                        zlen   = avail - SIZEOF_LOCAL_HEADER;

                        for (i = 0; ooxml_detect[i].entry; i++) {
                            if (0 == memcmp(znamep, ooxml_detect[i].entry, ooxml_detect[i].len)) {
                                if (ooxml_detect[i].type != CL_TYPE_ZIP) {
                                    OOXML_FTIDENTIFIED(ooxml_detect[i].type);
                                    /* returns any unexpected type detection */
                                    return ooxml_detect[i].type;
                                }

                                likely_ooxml = 1;
                            }
                        }

                        /* only check first three readable zip headers */
                        if (++lhc > 2) {
                            /* if likely, check full archive */
                            if (likely_ooxml) {
                                cli_dbgmsg(" Likely OOXML, checking additional zip headers \n ");
                                if ((ret2 = cli_ooxml_filetype(NULL, map)) != CL_TYPE_ANY) {
                                    /* either an error or retyping has occurred, return error or just CL_TYPE_ZIP? */
                                    OOXML_FTIDENTIFIED(ret2);
                                    /* falls-through to additional filetyping */
                                }
                            }
                            break;
                        }
                    }
                }

                if (znamep == NULL) {
                    if (map->len - zoff > SIZEOF_LOCAL_HEADER) {
                        /* Step back so a header (plus name) straddling the
                         * previous window boundary is seen whole. */
                        zoff -= SIZEOF_LOCAL_HEADER + OOXML_DETECT_MAXLEN + 1;
                        zread = MIN(MAGIC_BUFFER_SIZE, map->len - zoff);
                        zbuff = fmap_need_off_once(map, zoff, zread);
                        if (zbuff == NULL) {
                            cli_dbgmsg(" cli_determine_fmap_type: error mapping data for OOXML check \n ");
                            return CL_TYPE_ERROR;
                        }
                        zoff += zread;
                        znamep = zbuff;
                        zlen   = zread;
                    } else {
                        break; /* end of data */
                    }
                }
            }
        } else if (ret == CL_TYPE_MBR) {
            /* given filetype sig type 0 */
            int iret = cli_mbr_check(buff, bread, map->len);
            if (iret == CL_TYPE_GPT) {
                cli_dbgmsg(" Recognized GUID Partition Table file \n ");
                return CL_TYPE_GPT;
            } else if (iret == CL_CLEAN) {
                return CL_TYPE_MBR;
            }

            /* re-detect type */
            cli_dbgmsg(" Recognized binary data \n ");
            ret = CL_TYPE_BINARY_DATA;
        }
    }

    if (ret >= CL_TYPE_TEXT_ASCII && ret <= CL_TYPE_BINARY_DATA) {
        /* HTML files may contain special characters and could be
         * misidentified as BINARY_DATA by cli_compare_ftm_file()
         */
        root = engine->root[0];
        if (!root)
            return ret;

        if (cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, root->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN))
            return ret;

        scan_ret = (cli_file_t)cli_ac_scanbuff(buff, bread, NULL, NULL, NULL, root, &mdata, 0, ret, NULL, AC_SCAN_FT, NULL);

        cli_ac_freedata(&mdata);

        if (scan_ret >= CL_TYPENO &&
            /* Omit SFX archive types selected. We'll detect these in scanraw() */
            ((scan_ret != CL_TYPE_ZIPSFX) &&
             (scan_ret != CL_TYPE_ARJSFX) &&
             (scan_ret != CL_TYPE_RARSFX) &&
             (scan_ret != CL_TYPE_EGGSFX) &&
             (scan_ret != CL_TYPE_CABSFX) &&
             (scan_ret != CL_TYPE_7ZSFX))) {
            ret = scan_ret;
        } else {
            if (cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, root->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN))
                return ret;

            decoded = (unsigned char *)cli_utf16toascii((char *)buff, bread);
            if (decoded) {
                scan_ret = (cli_file_t)cli_ac_scanbuff(decoded, bread / 2, NULL, NULL, NULL, root, &mdata, 0, CL_TYPE_TEXT_ASCII, NULL, AC_SCAN_FT, NULL);
                free(decoded);
                if (scan_ret == CL_TYPE_HTML)
                    ret = CL_TYPE_HTML_UTF16;
            }
            cli_ac_freedata(&mdata);

            if ((((struct cli_dconf *)engine->dconf)->phishing & PHISHING_CONF_ENTCONV) && ret != CL_TYPE_HTML_UTF16) {
                const char *encoding;

                /* check if we can autodetect this encoding.
                 * If we can't, don't try to detect the HTML sig, since
                 * we just tried that above, and failed */
                if ((encoding = encoding_detect_bom(buff, bread))) {
                    unsigned char decodedbuff[(MAGIC_BUFFER_SIZE + 1) * 2];
                    m_area_t in_area, out_area;

                    memset(decodedbuff, 0, sizeof(decodedbuff));

                    /* buff aliases the local copy, so no const needs to be cast away */
                    in_area.buffer = buffer;
                    in_area.length = bread;
                    in_area.offset = 0;

                    out_area.buffer = decodedbuff;
                    out_area.length = sizeof(decodedbuff);
                    out_area.offset = 0;

                    /* in htmlnorm we simply skip over \0 chars, allowing HTML parsing in any unicode
                     * (multibyte characters will not be exactly handled, but that is not a problem).
                     * However when detecting whether a file is HTML or not, we need exact conversion.
                     * (just eliminating zeros and matching would introduce false positives) */
                    if (encoding_normalize_toascii(&in_area, encoding, &out_area) >= 0 && out_area.length > 0) {
                        if (cli_ac_initdata(&mdata, root->ac_partsigs, root->ac_lsigs, root->ac_reloff_num, CLI_DEFAULT_AC_TRACKLEN))
                            return ret;

                        /* out_area.length > 0 is already guaranteed by the enclosing condition */
                        scan_ret = (cli_file_t)cli_ac_scanbuff(decodedbuff, out_area.length, NULL, NULL, NULL, root, &mdata, 0, 0, NULL, AC_SCAN_FT, NULL); /* FIXME: can we use CL_TYPE_TEXT_ASCII instead of 0? */
                        if (scan_ret == CL_TYPE_HTML) {
                            cli_dbgmsg(" cli_determine_fmap_type: detected HTML signature in Unicode file \n ");
                            /* htmlnorm is able to handle any unicode now, since it skips null chars */
                            ret = CL_TYPE_HTML;
                        }

                        cli_ac_freedata(&mdata);
                    }
                }
            }
        }
    }

    return ret;
}