/* * Fuzzy hash implementations, matching, and signature support * * Copyright (C) 2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved. * * Authors: Micah Snyder, Mickey Sola, Scott Hutton * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ use std::{ collections::HashMap, convert::{TryFrom, TryInto}, ffi::CStr, mem::ManuallyDrop, os::raw::c_char, panic, slice, }; use image::{imageops::FilterType::Lanczos3, DynamicImage, ImageBuffer, Luma, Pixel, Rgb}; use log::{debug, error, warn}; use num_traits::{NumCast, ToPrimitive, Zero}; use rustdct::DctPlanner; use thiserror::Error; use transpose::transpose; use crate::{ffi_error, ffi_util::FFIError, rrf_call, sys, validate_str_param}; /// FuzzyHashError enumerates all possible errors returned by this library.
#[derive(Error, Debug)] pub enum FuzzyHashError { #[error("Invalid format")] Format, #[error("Unknown algorithm: {0}")] UnknownAlgorithm(String), #[error("Failed to convert hamming distance to unsigned 32bit integer: {0}")] FormatHammingDistance(String), #[error("Invalid hamming distance: {0}")] InvalidHammingDistance(u32), #[error("Invalid hash: {0}")] FormatHashBytes(String), #[error("Failed to load image: {0}")] ImageLoad(image::ImageError), #[error("Failed to load image due to bug in image decoder")] ImageLoadPanic(), #[error("Invalid parameter: {0}")] InvalidParameter(String), #[error("{0} parmeter is NULL")] NullParam(&'static str), } #[derive(PartialEq, Eq, Hash, Debug)] pub struct ImageFuzzyHash { bytes: [u8; 8], } #[derive(PartialEq, Eq, Hash, Debug)] pub enum FuzzyHash { Image(ImageFuzzyHash), } impl TryFrom<&str> for ImageFuzzyHash { type Error = &'static str; fn try_from(value: &str) -> Result { if value.len() != 16 { return Err("Image fuzzy hash must be 16 characters in length"); } let mut hashbytes = [0; 8]; if hex::decode_to_slice(value, &mut hashbytes).is_ok() { Ok(ImageFuzzyHash { bytes: hashbytes }) } else { Err("Failed to decode image fuzzy hash bytes from hex to bytes") } } } impl std::fmt::Display for FuzzyHash { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { FuzzyHash::Image(hash_bytes) => { write!(f, "{}", hex::encode(hash_bytes.bytes)) } } } } #[derive(Debug, Default)] pub struct FuzzyHashMap { hashmap: HashMap>, } #[derive(Debug, Copy, Clone)] pub struct FuzzyHashMeta { lsigid: u32, subsigid: u32, #[cfg(feature = "not_ready")] hamming_distance: u32, } /// Initialize the hashmap #[no_mangle] pub extern "C" fn fuzzy_hashmap_new() -> sys::fuzzyhashmap_t { Box::into_raw(Box::new(FuzzyHashMap::default())) as sys::fuzzyhashmap_t } /// Free the hashmap #[no_mangle] pub extern "C" fn fuzzy_hash_free_hashmap(fuzzy_hashmap: sys::fuzzyhashmap_t) { if fuzzy_hashmap.is_null() { warn!("Attempted to free a NULL hashmap 
pointer. Please report this at: https://github.com/Cisco-Talos/clamav/issues"); } else { let _ = unsafe { Box::from_raw(fuzzy_hashmap as *mut FuzzyHashMap) }; } } /// C interface for FuzzyHashMap::check(). /// Handles all the unsafe ffi stuff. /// /// # Safety /// /// No parameters may be NULL #[export_name = "fuzzy_hash_check"] pub unsafe extern "C" fn _fuzzy_hash_check( fuzzy_hashmap: sys::fuzzyhashmap_t, mdata: *mut sys::cli_ac_data, image_fuzzy_hash: sys::image_fuzzy_hash_t, ) -> bool { let hash_bytes = image_fuzzy_hash.hash; let hashmap = ManuallyDrop::new(Box::from_raw(fuzzy_hashmap as *mut FuzzyHashMap)); debug!( "Checking image fuzzy hash '{}' for signature match", hex::encode(hash_bytes) ); if let Some(meta_vec) = hashmap.check(hash_bytes) { for meta in meta_vec { sys::lsig_increment_subsig_match(mdata, meta.lsigid, meta.subsigid); } } true } /// C interface for FuzzyHashMap::load_subsignature(). /// Handles all the unsafe ffi stuff. /// /// # Safety /// /// `hexsig` and `err` must not be NULL #[export_name = "fuzzy_hash_load_subsignature"] pub unsafe extern "C" fn _fuzzy_hash_load_subsignature( fuzzy_hashmap: sys::fuzzyhashmap_t, hexsig: *const c_char, lsig_id: u32, subsig_id: u32, err: *mut *mut FFIError, ) -> bool { let hexsig = validate_str_param!(hexsig); let mut hashmap = ManuallyDrop::new(Box::from_raw(fuzzy_hashmap as *mut FuzzyHashMap)); rrf_call!( err = err, hashmap.load_subsignature(hexsig, lsig_id, subsig_id) ) } /// C interface for fuzzy_hash_calculate_image(). /// Handles all the unsafe ffi stuff. 
/// /// # Safety /// /// `file_bytes` and `hash_out` must not be NULL #[export_name = "fuzzy_hash_calculate_image"] pub unsafe extern "C" fn _fuzzy_hash_calculate_image( file_bytes: *const u8, file_size: usize, hash_out: *mut u8, hash_out_len: usize, err: *mut *mut FFIError, ) -> bool { if hash_out.is_null() { return ffi_error!(err = err, FuzzyHashError::NullParam("hash_out")); } let buffer = if file_bytes.is_null() { return ffi_error!(err = err, FuzzyHashError::NullParam("file_bytes")); } else { slice::from_raw_parts(file_bytes, file_size) }; let hash_result = fuzzy_hash_calculate_image(buffer); let hash_bytes = match hash_result { Ok(hash) => hash, Err(error) => return ffi_error!(err = err, error), }; if hash_out_len < hash_bytes.len() { return ffi_error!( err = err, FuzzyHashError::InvalidParameter(format!( "hash_bytes output parameter too small to hold the hash: {} < {}", hash_out_len, hash_bytes.len() )) ); } hash_out.copy_from(hash_bytes.as_ptr(), hash_bytes.len()); true } impl FuzzyHashMap { /// Check for fuzzy hash matches. /// /// In this initial version, we're just doing a simple hash lookup and the /// hamming distance is not considered. /// /// TODO: In a future version, replace this with an implementation that can find /// any hashes within the signature meta.hamming_distance. pub fn check(&self, hash: [u8; 8]) -> Option<&Vec> { let hash = FuzzyHash::Image(ImageFuzzyHash { bytes: hash }); self.hashmap.get(&hash) } /// Load a fuzzy hash subsignature /// Parse a fuzzy hash logical sig subsignature. /// Add the fuzzy hash to the matcher so it can be matched. 
pub fn load_subsignature( &mut self, hexsig: &str, lsig_id: u32, subsig_id: u32, ) -> Result<(), FuzzyHashError> { let mut hexsig_split = hexsig.split('#'); let algorithm = match hexsig_split.next() { Some(x) => x, None => return Err(FuzzyHashError::Format), }; let hash = match hexsig_split.next() { Some(x) => x, None => return Err(FuzzyHashError::Format), }; let distance: u32 = match hexsig_split.next() { Some(x) => match x.parse::() { Ok(n) => n, Err(_) => { return Err(FuzzyHashError::FormatHammingDistance(x.to_string())); } }, None => 0, }; // TODO: Support non-zero distance if distance != 0 { error!( "Non-zero hamming distances for image fuzzy hashes are not supported in this version." ); return Err(FuzzyHashError::InvalidHammingDistance(distance)); } match algorithm { "fuzzy_img" => { // Convert the hash string to an image fuzzy hash bytes struct let image_fuzzy_hash = hash .try_into() .map_err(|e| FuzzyHashError::FormatHashBytes(format!("{}: {}", e, hash)))?; let fuzzy_hash = FuzzyHash::Image(image_fuzzy_hash); let meta: FuzzyHashMeta = FuzzyHashMeta { lsigid: lsig_id, subsigid: subsig_id, #[cfg(feature = "not_ready")] hamming_distance: distance, }; // If the hash key does not exist in the hashmap, insert an empty vec. // Then add the current meta struct to the entry. self.hashmap .entry(fuzzy_hash) .or_insert_with(Vec::new) .push(meta); Ok(()) } _ => { error!("Unknown fuzzy hash algorithm: {}", algorithm); Err(FuzzyHashError::UnknownAlgorithm(algorithm.to_string())) } } } } /// Given a buffer and size, generate an image fuzzy hash /// /// This algorithm attempts to reproduce the results of the `phash()` function /// from the Python `imagehash` package. /// /// # Notes /// /// 1) I found that `image.grayscale() uses different RGB coefficients than /// the python `image.convert("L"). 
///    The docs for PIL.Image.convert() state:
///
///    ```text
///    When translating a color image to greyscale (mode "L"),
///    the library uses the ITU-R 601-2 luma transform::
///
///        L = R * 299/1000 + G * 587/1000 + B * 114/1000
///    ```
///
///    You can get near-identical** grayscale results by making a clone (or forking)
///    the image-rs crate, and changing the coefficients to match those above:
///
///    ```text
///    diff --git a/src/color.rs b/src/color.rs
///    index 78b5c587..92c99337 100644
///    --- a/src/color.rs
///    +++ b/src/color.rs
///    @@ -462,7 +462,7 @@ where
///     }
///
///     /// Coefficients to transform from sRGB to a CIE Y (luminance) value.
///    -const SRGB_LUMA: [f32; 3] = [0.2126, 0.7152, 0.0722];
///    +const SRGB_LUMA: [f32; 3] = [0.299, 0.587, 0.114];
///
///     #[inline]
///     fn rgb_to_luma(rgb: &[T]) -> T {
///    ```
///
///    **Note that I say "near-identical" because rounding
///    appears to be slightly different and values are sometimes off-by-one.
///
///    This change doesn't appear to be required to match the phash_simple()
///    function, but to match the phash() function where the median is used instead
///    of the mean -- this change is required.
///
/// 2) scipy.fftpack.dct behaves differently on two-dimensional arrays than
///    single-dimensional arrays.
///    See https://docs.scipy.org/doc/scipy/reference/generated/scipy.fftpack.dct.html:
///
///    ```text
///    Note the optional "axis" argument:
///      Axis along which the dct is computed; the default is over the last axis
///      (i.e., axis=-1).
///    ```
///
///    For the Python `imagehash` package:
///    - The `phash_simple()` function is doing a DCT-2 transform on a 2-dimensional
///      32x32 array which means, just on the 2nd axis (just the rows).
///    - The `phash()` function is doing a 2D DCT-2 transform, by running the DCT-2 on
///      both the X and Y axes, which is the same as transposing before or after each
///      DCT-2 call.
///
/// 3) I observed that the DCT2 results from Python are consistently 2x greater
///    than those from Rust.
If I multiply every value by 2 after running the DCT, /// then the results are the same. /// /// 4) We need to get a subset of the 2-D array representing the lower /// frequencies of the image, the same way the Python implementation does it. /// /// The way the python implementation does this is with this line: /// ```python /// dctlowfreq = dct[:hash_size, :hash_size] /// ``` /// /// You can't actually do that with a Python array of arrays... this is numpy /// 2-D array manipulation magic, where you can index 2-D arrays in slices. /// It works like this: /// ```ipython3 /// In [0]: x = [[0, 1, 2, 3, 4], [4, 5, 6, 7, 8], [8, 9, 10, 11, 12], [12, 13, 14, 15, 16], [16, 17, 18, 19, 20]] /// In [1]: h = 3 /// In [2]: n = np.asarray(x) /// In [3]: lf = n[:h, 1:h+1] /// In [4]: n /// array([[ 0, 1, 2, 3, 4], /// [ 4, 5, 6, 7, 8], /// [ 8, 9, 10, 11, 12], /// [12, 13, 14, 15, 16], /// [16, 17, 18, 19, 20]]) /// /// In [5]: lf /// array([[ 0, 1, 2], /// [ 4, 5, 6], /// [ 8, 9, 10]]) /// ``` /// /// We can do something similar, manually, to get the low-frequency selection. /// /// param: hash_out is an output variable /// param: hash_out_len indicates the size of the hash_out buffer pub fn fuzzy_hash_calculate_image(buffer: &[u8]) -> Result, FuzzyHashError> { // Load image and attempt to catch panics in case the decoders encounter unexpected issues let result = panic::catch_unwind(|| -> Result { let image = image::load_from_memory(buffer).map_err(FuzzyHashError::ImageLoad)?; Ok(image) }); let og_image = match result { Ok(image) => image?, Err(_) => return Err(FuzzyHashError::ImageLoadPanic()), }; // Drop the alpha channel (if exists). let buff_rgb8 = og_image.to_rgb8(); // Convert image to grayscale. let buff_luma8 = grayscale(&buff_rgb8); // Convert back to a DynamicImage type so we can resize it. let image_gs = DynamicImage::ImageLuma8(buff_luma8); // Shrink to a 32x32 (1024 pixel) image. 
let image_small = image::DynamicImage::resize_exact(&image_gs, 32, 32, Lanczos3); // Convert the data to a Vec of floats. let mut imgbuff_f32 = image_small.to_luma32f().into_raw(); // // Compute a 2D DCT-2 in-place. // let dct2 = DctPlanner::new().plan_dct2(32); // Use a scratch space so we can transpose and run DCT's without allocating any extra space. // We'll switch back and forth between the buffer for the original small image (buffer1) and the scratch buffer (buffer2). let buffer1: &mut [f32] = imgbuff_f32.as_mut_slice(); let buffer2: &mut [f32] = &mut [0.0; 1024]; // Transpose the image so we can run DCT on the X axis (columns) first. transpose(buffer1, buffer2, 32, 32); // Run DCT2 on the columns. for (row_in, row_out) in buffer2.chunks_mut(32).zip(buffer1.chunks_mut(32)) { dct2.process_dct2_with_scratch(row_in, row_out); } // Multiply each value x2, to match results from scipy.fftpack.dct() implementation. // Note: Unsure why this is required, but it is. buffer2.iter_mut().for_each(|f| *f *= 2.0); // Transpose the image back so we can run DCT on the Y axis (rows). transpose(buffer2, buffer1, 32, 32); // Run DCT2 on the rows. for (row_in, row_out) in buffer1.chunks_mut(32).zip(buffer2.chunks_mut(32)) { dct2.process_dct2_with_scratch(row_in, row_out); } // Multiply each value x2, to match results from scipy.fftpack.dct() implementation. // Note: Unsure why this is required, but it is. buffer1.iter_mut().for_each(|f| *f *= 2.0); // // Construct a DCT low frequency vector using the top-left most 8x8 values of the 32x32 DCT array. // let dct_low_freq = buffer1 // 2D array is 32-elements wide. .chunks(32) // Grab the first 8 rows. .take(8) // But only take the first 8 elements (columns) from each row. .flat_map(|chunk| chunk.chunks(8).take(1)) // Flatten the 8x8 selection down to a vector of floats. .flatten() .copied() .collect::>(); // Calculate average (median) of the DCT low frequency vector. 
let mut dct_low_freq_copy = dct_low_freq.clone(); dct_low_freq_copy.sort_by(|a, b| a.partial_cmp(b).unwrap()); let median: f32 = (dct_low_freq_copy[31] + dct_low_freq_copy[32]) / 2.0; // Construct hash vector by reducing DCT values to 1 or 0 by comparing terms vs median. let hashvec: Vec = dct_low_freq .into_iter() .map(|x| if x > median { 1 } else { 0 }) .collect(); // Construct hash vec from bits. let hash_bytes: Vec = hashvec .chunks(8) .map(|chunk| { let chunk = chunk.to_owned(); chunk .iter() .rev() .enumerate() .fold(None, |accum, (n, val)| { accum.or(Some(0)).map(|accum| accum | ((*val as u8) << n)) }) }) .take_while(|x| x.is_some()) .flatten() .collect(); debug!("Image hash: {}", hex::encode(&hash_bytes)); Ok(hash_bytes) } /// Use these instead: /// L = R * 299/1000 + G * 587/1000 + B * 114/1000 const SRGB_LUMA: [f32; 3] = [299.0 / 1000.0, 587.0 / 1000.0, 114.0 / 1000.0]; #[inline] fn rgb_to_luma(rgb: &[u8]) -> u8 { let l = SRGB_LUMA[0] * rgb[0].to_f32().unwrap() + SRGB_LUMA[1] * rgb[1].to_f32().unwrap() + SRGB_LUMA[2] * rgb[2].to_f32().unwrap(); NumCast::from(l.round()).unwrap() } /// Convert the supplied image to grayscale. Alpha channel is discarded. /// /// This is a customized implemententation of the grayscale feature from the `image` crate. /// This allows us to: /// - use RGB->LUMA constants that match those used by the Python Pillow package. /// - round the luma floating point value to the nearest integer rather than truncating. /// /// See also: https://github.com/image-rs/image/issues/1554 fn grayscale(image: &ImageBuffer, Vec>) -> ImageBuffer, Vec> { let (width, height) = image.dimensions(); let mut out = ImageBuffer::new(width, height); for y in 0..height { for x in 0..width { let pixel = image.get_pixel(x, y); let mut pix = Luma([Zero::zero()]); let gray = pix.channels_mut(); let rgb = pixel.channels(); gray[0] = rgb_to_luma(rgb); let pixel = Luma::from_slice(gray); //.into_color(); // no-op for luma->luma out.put_pixel(x, y, *pixel); } } out }