Update the libclamav library to version 1.0.0

2023-01-14 18:28:39 +08:00
parent b879ee0b2e
commit 45fe15f472
8531 changed files with 1222046 additions and 177272 deletions


@@ -0,0 +1 @@
{"files":{"CHANGELOG.md":"18e89ab18be7d22dd835924a38b1198651e870dfd0a2904a344120d341c4bc0e","Cargo.toml":"dd189fa732c00b7d1b036b43220253ef6ff2a0b1ff5ca7cf13f0669113a2055e","LICENSE":"1738b51502ae831fb59ffbeb22ebdd90bf17e5c72fe57c00b47552415f133fd8","README.md":"cfb01477df60cd0780ba59cbbc388f3f71d83bf51bab77e004fc7b811aff0efd","appveyor.yml":"b41eae9798a9bb250f6046509d9bbd6e63bac9ad2655d342b3d9c8975584f0c0","benches/benches.rs":"e277857c44afdc08b2acf35fc05be6226529c588eb9da397382b0a74c58615ab","dev":"5bd26dc2c86f777627abe96c5992b6c45e6b5dea52f42b7107fa5c106abe2ab4","src/lib.rs":"26a960216567e5dea46033b3383a69e7da498095c092e195e92c05faef52f915","tests/lib.rs":"de2b1181c379a0f55de7b86021a9afb77dbe81053a6acf99623bec3663f9b7c4"},"package":"8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"}


@@ -0,0 +1,123 @@
# Change Log
This project attempts to adhere to [Semantic Versioning](http://semver.org).
## [Unreleased]
## [0.8.0] - (2018-08-19)
### Added
- Normalized versions of Levenshtein and Damerau-Levenshtein (thanks [@gentoid](https://github.com/gentoid))
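A quick sketch of the two new functions; the expected values mirror the crate's own tests and README below (illustrative only, not part of the upstream changelog):
```rust
use strsim::{normalized_levenshtein, normalized_damerau_levenshtein};

fn main() {
    // Scores fall in [0.0, 1.0]; 1.0 means the strings are identical.
    assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
    assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
}
```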
## [0.7.0] - (2018-01-17)
### Changed
- Faster Levenshtein implementation (thanks [@wdv4758h](https://github.com/wdv4758h))
### Removed
- Remove the "against_vec" functions. They are one-liners now, so they don't
seem to add enough value to justify making the API larger. I didn't find
anybody using them when I skimmed through a GitHub search. If you do use them,
you can change the calls to something like:
```rust
let distances = strings.iter().map(|a| jaro(target, a)).collect();
```
## [0.6.0] - (2016-12-26)
### Added
- Add optimal string alignment distance
### Fixed
- Fix Damerau-Levenshtein implementation (previous implementation was actually
optimal string alignment; see this [Damerau-Levenshtein explanation])
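A short illustration of the distinction, using inputs taken from the crate's own tests (not part of the upstream changelog):
```rust
use strsim::{damerau_levenshtein, osa_distance};

fn main() {
    // Optimal string alignment may edit each substring only once: 3 edits.
    assert_eq!(3, osa_distance("ca", "abc"));
    // Unrestricted Damerau-Levenshtein can transpose "ca" into "ac" and then
    // insert 'b' between the transposed characters: 2 edits.
    assert_eq!(2, damerau_levenshtein("ca", "abc"));
}
```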
## [0.5.2] - (2016-11-21)
### Changed
- Remove Cargo generated documentation in favor of a [docs.rs] link
## [0.5.1] - (2016-08-23)
### Added
- Add Cargo generated documentation
### Fixed
- Fix panic when Jaro or Jaro-Winkler is given two strings that both have a
length of one
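A small check of the fixed behavior, with inputs taken from the crate's tests (illustrative only):
```rust
use strsim::{jaro, jaro_winkler};

fn main() {
    // Two single-character inputs used to hit the panic; they now simply
    // score 0.0 when the characters differ.
    assert_eq!(0.0, jaro("a", "b"));
    assert_eq!(0.0, jaro_winkler("a", "b"));
}
```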
## [0.5.0] - (2016-08-11)
### Changed
- Make Hamming faster (thanks @IBUzPE9) when the two strings have the same
length but slower when they have different lengths
## [0.4.1] - (2016-04-18)
### Added
- Add Vagrant setup for development
- Add AppVeyor configuration for Windows CI
### Fixed
- Fix metrics when given strings with multibyte characters (thanks @WanzenBug)
## [0.4.0] - (2015-06-10)
### Added
- For each metric, add a function that takes a vector of strings and returns a
vector of results (thanks @ovarene)
## [0.3.0] - (2015-04-30)
### Changed
- Remove usage of unstable Rust features
## [0.2.5] - (2015-04-24)
### Fixed
- Remove unnecessary `Float` import from doc tests
## [0.2.4] - (2015-04-15)
### Fixed
- Remove unused `core` feature flag
## [0.2.3] - (2015-04-01)
### Fixed
- Remove now unnecessary `Float` import
## [0.2.2] - (2015-03-29)
### Fixed
- Remove usage of `char_at` (marked as unstable)
## [0.2.1] - (2015-02-20)
### Fixed
- Update bit vector import to match Rust update
## [0.2.0] - (2015-02-19)
### Added
- Implement Damerau-Levenshtein
- Add tests in docs
## [0.1.1] - (2015-02-10)
### Added
- Configure Travis for CI
- Add rustdoc comments
### Fixed
- Limit Jaro-Winkler return value to a maximum of 1.0
- Fix float comparisons in tests
## [0.1.0] - (2015-02-09)
### Added
- Implement Hamming, Jaro, Jaro-Winkler, and Levenshtein
[Unreleased]: https://github.com/dguo/strsim-rs/compare/0.8.0...HEAD
[0.8.0]: https://github.com/dguo/strsim-rs/compare/0.7.0...0.8.0
[0.7.0]: https://github.com/dguo/strsim-rs/compare/0.6.0...0.7.0
[0.6.0]: https://github.com/dguo/strsim-rs/compare/0.5.2...0.6.0
[0.5.2]: https://github.com/dguo/strsim-rs/compare/0.5.1...0.5.2
[0.5.1]: https://github.com/dguo/strsim-rs/compare/0.5.0...0.5.1
[0.5.0]: https://github.com/dguo/strsim-rs/compare/0.4.1...0.5.0
[0.4.1]: https://github.com/dguo/strsim-rs/compare/0.4.0...0.4.1
[0.4.0]: https://github.com/dguo/strsim-rs/compare/0.3.0...0.4.0
[0.3.0]: https://github.com/dguo/strsim-rs/compare/0.2.5...0.3.0
[0.2.5]: https://github.com/dguo/strsim-rs/compare/0.2.4...0.2.5
[0.2.4]: https://github.com/dguo/strsim-rs/compare/0.2.3...0.2.4
[0.2.3]: https://github.com/dguo/strsim-rs/compare/0.2.2...0.2.3
[0.2.2]: https://github.com/dguo/strsim-rs/compare/0.2.1...0.2.2
[0.2.1]: https://github.com/dguo/strsim-rs/compare/0.2.0...0.2.1
[0.2.0]: https://github.com/dguo/strsim-rs/compare/0.1.1...0.2.0
[0.1.1]: https://github.com/dguo/strsim-rs/compare/0.1.0...0.1.1
[0.1.0]: https://github.com/dguo/strsim-rs/compare/fabad4...0.1.0
[docs.rs]: https://docs.rs/strsim/
[Damerau-Levenshtein explanation]:
http://scarcitycomputing.blogspot.com/2013/04/damerau-levenshtein-edit-distance.html


@@ -0,0 +1,28 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g. crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)
[package]
name = "strsim"
version = "0.8.0"
authors = ["Danny Guo <dannyguo91@gmail.com>"]
description = "Implementations of string similarity metrics.\nIncludes Hamming, Levenshtein, OSA, Damerau-Levenshtein, Jaro, and Jaro-Winkler.\n"
homepage = "https://github.com/dguo/strsim-rs"
documentation = "https://docs.rs/strsim/"
readme = "README.md"
keywords = ["string", "similarity", "Hamming", "Levenshtein", "Jaro"]
license = "MIT"
repository = "https://github.com/dguo/strsim-rs"
[badges.appveyor]
repository = "dguo/strsim-rs"
[badges.travis-ci]
repository = "dguo/strsim-rs"


@@ -0,0 +1,23 @@
The MIT License (MIT)
Copyright (c) 2015 Danny Guo
Copyright (c) 2016 Titus Wormer <tituswormer@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


@@ -0,0 +1,69 @@
# strsim-rs [![Crates.io](https://img.shields.io/crates/v/strsim.svg)](https://crates.io/crates/strsim) [![Crates.io](https://img.shields.io/crates/l/strsim.svg?maxAge=2592000)](https://github.com/dguo/strsim-rs/blob/master/LICENSE) [![Linux build status](https://travis-ci.org/dguo/strsim-rs.svg?branch=master)](https://travis-ci.org/dguo/strsim-rs) [![Windows build status](https://ci.appveyor.com/api/projects/status/ggue6i785618a39w?svg=true)](https://ci.appveyor.com/project/dguo/strsim-rs)
[Rust](https://www.rust-lang.org) implementations of [string similarity metrics]:
- [Hamming]
- [Levenshtein] - distance & normalized
- [Optimal string alignment]
- [Damerau-Levenshtein] - distance & normalized
- [Jaro and Jaro-Winkler] - this implementation of Jaro-Winkler does not limit the common prefix length
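The uncapped prefix boost is easiest to see with a pair taken from this crate's test suite (a small illustration; the rough 0.985 figure for a capped implementation is an estimate, not a value from this crate):
```rust
use strsim::jaro_winkler;

fn main() {
    // The 26-character common prefix keeps boosting the score until it is
    // clamped at 1.0; an implementation that caps the prefix at the usual 4
    // characters would return roughly 0.985 instead.
    assert!((1.0 - jaro_winkler("thequickbrownfoxjumpedoverx",
                                "thequickbrownfoxjumpedovery")).abs() < 0.001);
}
```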
### Installation
```toml
# Cargo.toml
[dependencies]
strsim = "0.8.0"
```
### [Documentation](https://docs.rs/strsim/)
You can change the version in the URL to see the documentation for an older
version; the [changelog](https://github.com/dguo/strsim-rs/blob/master/CHANGELOG.md)
lists what changed in each release.
### Usage
```rust
extern crate strsim;
use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance,
damerau_levenshtein, normalized_damerau_levenshtein, jaro,
jaro_winkler};
fn main() {
match hamming("hamming", "hammers") {
Ok(distance) => assert_eq!(3, distance),
Err(why) => panic!("{:?}", why)
}
assert_eq!(3, levenshtein("kitten", "sitting"));
assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
assert_eq!(3, osa_distance("ac", "cba"));
assert_eq!(2, damerau_levenshtein("ac", "cba"));
assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
0.001);
assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
0.001);
}
```
### Development
If you don't want to install Rust itself but have [Docker] installed, you can
run `$ ./dev` for a development CLI.
Benchmarks require a nightly toolchain; run them with `cargo +nightly bench`.
### License
[MIT](https://github.com/dguo/strsim-rs/blob/master/LICENSE)
[string similarity metrics]:http://en.wikipedia.org/wiki/String_metric
[Damerau-Levenshtein]:http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
[Jaro and Jaro-Winkler]:http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
[Levenshtein]:http://en.wikipedia.org/wiki/Levenshtein_distance
[Hamming]:http://en.wikipedia.org/wiki/Hamming_distance
[Optimal string alignment]:https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance
[Docker]:https://docs.docker.com/engine/installation/


@@ -0,0 +1,13 @@
install:
- ps: Start-FileDownload 'https://static.rust-lang.org/dist/rust-beta-x86_64-pc-windows-gnu.exe'
- rust-beta-x86_64-pc-windows-gnu.exe /VERYSILENT /NORESTART /DIR="C:\Program Files (x86)\Rust"
- SET PATH=%PATH%;C:\Program Files (x86)\Rust\bin
- rustc -V
- cargo -V
build: false
test_script:
- cargo build
- cargo test --verbose


@@ -0,0 +1,84 @@
//! Benchmarks for strsim.
#![feature(test)]
extern crate strsim;
mod benches {
use super::*;
extern crate test;
use self::test::Bencher;
#[bench]
fn bench_hamming(bencher: &mut Bencher) {
let a = "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGG";
let b = "CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGC";
bencher.iter(|| {
strsim::hamming(&a, &b).unwrap();
})
}
#[bench]
fn bench_jaro(bencher: &mut Bencher) {
let a = "Philosopher Friedrich Nietzsche";
let b = "Philosopher Jean-Paul Sartre";
bencher.iter(|| {
strsim::jaro(&a, &b);
})
}
#[bench]
fn bench_jaro_winkler(bencher: &mut Bencher) {
let a = "Philosopher Friedrich Nietzsche";
let b = "Philosopher Jean-Paul Sartre";
bencher.iter(|| {
strsim::jaro_winkler(&a, &b);
})
}
#[bench]
fn bench_levenshtein(bencher: &mut Bencher) {
let a = "Philosopher Friedrich Nietzsche";
let b = "Philosopher Jean-Paul Sartre";
bencher.iter(|| {
strsim::levenshtein(&a, &b);
})
}
#[bench]
fn bench_normalized_levenshtein(bencher: &mut Bencher) {
let a = "Philosopher Friedrich Nietzsche";
let b = "Philosopher Jean-Paul Sartre";
bencher.iter(|| {
strsim::normalized_levenshtein(&a, &b);
})
}
#[bench]
fn bench_osa_distance(bencher: &mut Bencher) {
let a = "Philosopher Friedrich Nietzsche";
let b = "Philosopher Jean-Paul Sartre";
bencher.iter(|| {
strsim::osa_distance(&a, &b);
})
}
#[bench]
fn bench_damerau_levenshtein(bencher: &mut Bencher) {
let a = "Philosopher Friedrich Nietzsche";
let b = "Philosopher Jean-Paul Sartre";
bencher.iter(|| {
strsim::damerau_levenshtein(&a, &b);
})
}
#[bench]
fn bench_normalized_damerau_levenshtein(bencher: &mut Bencher) {
let a = "Philosopher Friedrich Nietzsche";
let b = "Philosopher Jean-Paul Sartre";
bencher.iter(|| {
strsim::normalized_damerau_levenshtein(&a, &b);
})
}
}


@@ -0,0 +1,41 @@
#!/usr/bin/env python3
# ./dev --help
import argparse
import os
from subprocess import run
import sys
parser = argparse.ArgumentParser(prog='./dev')
subparsers = parser.add_subparsers(metavar='<command>', title='commands')
command = [
'docker', 'run', '-it', '--rm', '-v', os.getcwd() + ':/src:cached',
'-w=/src', 'rust:1.21.0'
]
def cargo(args, remaining):
sys.exit(run(command + ['cargo'] + remaining or []).returncode)
parser_cargo = subparsers.add_parser('cargo', help='run a cargo command')
parser_cargo.set_defaults(func=cargo)
def sh(args, remaining):
sys.exit(run(command + ['bash']).returncode)
parser_sh = subparsers.add_parser('sh', help='bring up a shell')
parser_sh.set_defaults(func=sh)
def test(args, remaining):
sys.exit(run(command + ['cargo', 'test']).returncode)
parser_test = subparsers.add_parser('test', help='run tests')
parser_test.set_defaults(func=test)
if len(sys.argv) > 1:
args, remaining = parser.parse_known_args()
try:
args.func(args, remaining)
except FileNotFoundError:
sys.exit('Please install Docker.')
else:
parser.print_help()


@@ -0,0 +1,786 @@
//! This library implements string similarity metrics.
use std::char;
use std::cmp::{max, min};
use std::collections::HashMap;
#[derive(Debug, PartialEq)]
pub enum StrSimError {
DifferentLengthArgs
}
pub type HammingResult = Result<usize, StrSimError>;
/// Calculates the number of positions in the two strings where the characters
/// differ. Returns an error if the strings have different lengths.
///
/// ```
/// use strsim::hamming;
///
/// match hamming("hamming", "hammers") {
/// Ok(distance) => assert_eq!(3, distance),
/// Err(why) => panic!("{:?}", why)
/// }
/// ```
pub fn hamming(a: &str, b: &str) -> HammingResult {
let (mut ita, mut itb, mut count) = (a.chars(), b.chars(), 0);
loop {
match (ita.next(), itb.next()){
(Some(x), Some(y)) => if x != y { count += 1 },
(None, None) => return Ok(count),
_ => return Err(StrSimError::DifferentLengthArgs),
}
}
}
/// Calculates the Jaro similarity between two strings. The returned value
/// is between 0.0 and 1.0 (higher value means more similar).
///
/// ```
/// use strsim::jaro;
///
/// assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
/// 0.001);
/// ```
pub fn jaro(a: &str, b: &str) -> f64 {
if a == b { return 1.0; }
let a_len = a.chars().count();
let b_len = b.chars().count();
// The check for lengths of one here is to prevent integer overflow when
// calculating the search range.
if a_len == 0 || b_len == 0 || (a_len == 1 && b_len == 1) {
return 0.0;
}
let search_range = (max(a_len, b_len) / 2) - 1;
let mut b_consumed = Vec::with_capacity(b_len);
for _ in 0..b_len {
b_consumed.push(false);
}
let mut matches = 0.0;
let mut transpositions = 0.0;
let mut b_match_index = 0;
for (i, a_char) in a.chars().enumerate() {
let min_bound =
// prevent integer wrapping
if i > search_range {
max(0, i - search_range)
} else {
0
};
let max_bound = min(b_len - 1, i + search_range);
if min_bound > max_bound {
continue;
}
for (j, b_char) in b.chars().enumerate() {
if min_bound <= j && j <= max_bound && a_char == b_char &&
!b_consumed[j] {
b_consumed[j] = true;
matches += 1.0;
if j < b_match_index {
transpositions += 1.0;
}
b_match_index = j;
break;
}
}
}
if matches == 0.0 {
0.0
} else {
(1.0 / 3.0) * ((matches / a_len as f64) +
(matches / b_len as f64) +
((matches - transpositions) / matches))
}
}
/// Like Jaro but gives a boost to strings that have a common prefix.
///
/// ```
/// use strsim::jaro_winkler;
///
/// assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
/// 0.001);
/// ```
pub fn jaro_winkler(a: &str, b: &str) -> f64 {
let jaro_distance = jaro(a, b);
// Don't limit the length of the common prefix
let prefix_length = a.chars()
.zip(b.chars())
.take_while(|&(a_char, b_char)| a_char == b_char)
.count();
let jaro_winkler_distance =
jaro_distance + (0.1 * prefix_length as f64 * (1.0 - jaro_distance));
if jaro_winkler_distance <= 1.0 {
jaro_winkler_distance
} else {
1.0
}
}
/// Calculates the minimum number of insertions, deletions, and substitutions
/// required to change one string into the other.
///
/// ```
/// use strsim::levenshtein;
///
/// assert_eq!(3, levenshtein("kitten", "sitting"));
/// ```
pub fn levenshtein(a: &str, b: &str) -> usize {
if a == b { return 0; }
let a_len = a.chars().count();
let b_len = b.chars().count();
if a_len == 0 { return b_len; }
if b_len == 0 { return a_len; }
let mut cache: Vec<usize> = (1..b_len+1).collect();
let mut result = 0;
let mut distance_a;
let mut distance_b;
for (i, a_char) in a.chars().enumerate() {
result = i;
distance_b = i;
for (j, b_char) in b.chars().enumerate() {
let cost = if a_char == b_char { 0 } else { 1 };
distance_a = distance_b + cost;
distance_b = cache[j];
result = min(result + 1, min(distance_a, distance_b + 1));
cache[j] = result;
}
}
result
}
/// Calculates a normalized score of the Levenshtein algorithm between 0.0 and
/// 1.0 (inclusive), where 1.0 means the strings are the same.
///
/// ```
/// use strsim::normalized_levenshtein;
///
/// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
/// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
/// assert!(normalized_levenshtein("", "second").abs() < 0.00001);
/// assert!(normalized_levenshtein("first", "").abs() < 0.00001);
/// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001);
/// ```
pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
if a.is_empty() && b.is_empty() {
return 1.0;
}
1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
}
/// Like Levenshtein but allows for adjacent transpositions. Each substring can
/// only be edited once.
///
/// ```
/// use strsim::osa_distance;
///
/// assert_eq!(3, osa_distance("ab", "bca"));
/// ```
pub fn osa_distance(a: &str, b: &str) -> usize {
let a_len = a.chars().count();
let b_len = b.chars().count();
if a == b { return 0; }
else if a_len == 0 { return b_len; }
else if b_len == 0 { return a_len; }
let mut prev_two_distances: Vec<usize> = Vec::with_capacity(b_len + 1);
let mut prev_distances: Vec<usize> = Vec::with_capacity(b_len + 1);
let mut curr_distances: Vec<usize> = Vec::with_capacity(b_len + 1);
let mut prev_a_char = char::MAX;
let mut prev_b_char = char::MAX;
for i in 0..(b_len + 1) {
prev_two_distances.push(i);
prev_distances.push(i);
curr_distances.push(0);
}
for (i, a_char) in a.chars().enumerate() {
curr_distances[0] = i + 1;
for (j, b_char) in b.chars().enumerate() {
let cost = if a_char == b_char { 0 } else { 1 };
curr_distances[j + 1] = min(curr_distances[j] + 1,
min(prev_distances[j + 1] + 1,
prev_distances[j] + cost));
if i > 0 && j > 0 && a_char != b_char &&
a_char == prev_b_char && b_char == prev_a_char {
curr_distances[j + 1] = min(curr_distances[j + 1],
prev_two_distances[j - 1] + 1);
}
prev_b_char = b_char;
}
prev_two_distances.clone_from(&prev_distances);
prev_distances.clone_from(&curr_distances);
prev_a_char = a_char;
}
curr_distances[b_len]
}
/// Like optimal string alignment, but substrings can be edited an unlimited
/// number of times, and the triangle inequality holds.
///
/// ```
/// use strsim::damerau_levenshtein;
///
/// assert_eq!(2, damerau_levenshtein("ab", "bca"));
/// ```
pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
if a == b { return 0; }
let a_chars: Vec<char> = a.chars().collect();
let b_chars: Vec<char> = b.chars().collect();
let a_len = a_chars.len();
let b_len = b_chars.len();
if a_len == 0 { return b_len; }
if b_len == 0 { return a_len; }
let mut distances = vec![vec![0; b_len + 2]; a_len + 2];
let max_distance = a_len + b_len;
distances[0][0] = max_distance;
for i in 0..(a_len + 1) {
distances[i + 1][0] = max_distance;
distances[i + 1][1] = i;
}
for j in 0..(b_len + 1) {
distances[0][j + 1] = max_distance;
distances[1][j + 1] = j;
}
// Maps each character of `a` processed so far to the row index of its last
// occurrence in `a`.
let mut chars: HashMap<char, usize> = HashMap::new();
for i in 1..(a_len + 1) {
// Column index of the most recent match found in the current row (0 if none).
let mut db = 0;
for j in 1..(b_len + 1) {
// Row index of the last occurrence of b[j - 1] in `a` (0 if it never appeared).
let k = match chars.get(&b_chars[j - 1]) {
Some(value) => value.clone(),
None => 0
};
// Column of the previous match in this row (0 if none).
let l = db;
let mut cost = 1;
if a_chars[i - 1] == b_chars[j - 1] {
cost = 0;
db = j;
}
let substitution_cost = distances[i][j] + cost;
let insertion_cost = distances[i][j + 1] + 1;
let deletion_cost = distances[i + 1][j] + 1;
// Transpose the pair matched at (k, l) with the current pair; the middle
// terms count the characters between the two pairs that must be removed or inserted.
let transposition_cost = distances[k][l] + (i - k - 1) + 1 +
(j - l - 1);
distances[i + 1][j + 1] = min(substitution_cost,
min(insertion_cost,
min(deletion_cost,
transposition_cost)));
}
chars.insert(a_chars[i - 1], i);
}
distances[a_len + 1][b_len + 1]
}
/// Calculates a normalized score of the Damerau-Levenshtein algorithm between
/// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same.
///
/// ```
/// use strsim::normalized_damerau_levenshtein;
///
/// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
/// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
/// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
/// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
/// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
/// ```
pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 {
if a.is_empty() && b.is_empty() {
return 1.0;
}
1.0 - (damerau_levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn hamming_empty() {
match hamming("", "") {
Ok(distance) => { assert_eq!(0, distance); },
Err(why) => { panic!("{:?}", why); }
}
}
#[test]
fn hamming_same() {
match hamming("hamming", "hamming") {
Ok(distance) => { assert_eq!(0, distance); },
Err(why) => { panic!("{:?}", why); }
}
}
#[test]
fn hamming_diff() {
match hamming("hamming", "hammers") {
Ok(distance) => { assert_eq!(3, distance); },
Err(why) => { panic!("{:?}", why); }
}
}
#[test]
fn hamming_diff_multibyte() {
match hamming("hamming", "h香mmüng") {
Ok(distance) => { assert_eq!(2, distance); },
Err(why) => { panic!("{:?}", why); }
}
}
#[test]
fn hamming_unequal_length() {
match hamming("ham", "hamming") {
Ok(_) => { panic!(); },
Err(why) => { assert_eq!(why, StrSimError::DifferentLengthArgs); }
}
}
#[test]
fn hamming_names() {
match hamming("Friedrich Nietzs", "Jean-Paul Sartre") {
Ok(distance) => { assert_eq!(14, distance); },
Err(why) => { panic!("{:?}", why); }
}
}
#[test]
fn jaro_both_empty() {
assert_eq!(1.0, jaro("", ""));
}
#[test]
fn jaro_first_empty() {
assert_eq!(0.0, jaro("", "jaro"));
}
#[test]
fn jaro_second_empty() {
assert_eq!(0.0, jaro("distance", ""));
}
#[test]
fn jaro_same() {
assert_eq!(1.0, jaro("jaro", "jaro"));
}
#[test]
fn jaro_multibyte() {
assert!((0.818 - jaro("testabctest", "testöঙ香test")) < 0.001);
assert!((0.818 - jaro("testöঙ香test", "testabctest")) < 0.001);
}
#[test]
fn jaro_diff_short() {
assert!((0.767 - jaro("dixon", "dicksonx")).abs() < 0.001);
}
#[test]
fn jaro_diff_one_character() {
assert_eq!(0.0, jaro("a", "b"));
}
#[test]
fn jaro_diff_one_and_two() {
assert!((0.83 - jaro("a", "ab")).abs() < 0.01);
}
#[test]
fn jaro_diff_two_and_one() {
assert!((0.83 - jaro("ab", "a")).abs() < 0.01);
}
#[test]
fn jaro_diff_no_transposition() {
assert!((0.822 - jaro("dwayne", "duane")).abs() < 0.001);
}
#[test]
fn jaro_diff_with_transposition() {
assert!((0.944 - jaro("martha", "marhta")).abs() < 0.001);
}
#[test]
fn jaro_names() {
assert!((0.392 - jaro("Friedrich Nietzsche",
"Jean-Paul Sartre")).abs() < 0.001);
}
#[test]
fn jaro_winkler_both_empty() {
assert_eq!(1.0, jaro_winkler("", ""));
}
#[test]
fn jaro_winkler_first_empty() {
assert_eq!(0.0, jaro_winkler("", "jaro-winkler"));
}
#[test]
fn jaro_winkler_second_empty() {
assert_eq!(0.0, jaro_winkler("distance", ""));
}
#[test]
fn jaro_winkler_same() {
assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler"));
}
#[test]
fn jaro_winkler_multibyte() {
assert!((0.89 - jaro_winkler("testabctest", "testöঙ香test")).abs() <
0.001);
assert!((0.89 - jaro_winkler("testöঙ香test", "testabctest")).abs() <
0.001);
}
#[test]
fn jaro_winkler_diff_short() {
assert!((0.813 - jaro_winkler("dixon", "dicksonx")).abs() < 0.001);
assert!((0.813 - jaro_winkler("dicksonx", "dixon")).abs() < 0.001);
}
#[test]
fn jaro_winkler_diff_one_character() {
assert_eq!(0.0, jaro_winkler("a", "b"));
}
#[test]
fn jaro_winkler_diff_no_transposition() {
assert!((0.840 - jaro_winkler("dwayne", "duane")).abs() < 0.001);
}
#[test]
fn jaro_winkler_diff_with_transposition() {
assert!((0.961 - jaro_winkler("martha", "marhta")).abs() < 0.001);
}
#[test]
fn jaro_winkler_names() {
assert!((0.562 - jaro_winkler("Friedrich Nietzsche",
"Fran-Paul Sartre")).abs() < 0.001);
}
#[test]
fn jaro_winkler_long_prefix() {
assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
0.001);
}
#[test]
fn jaro_winkler_more_names() {
assert!((0.868 - jaro_winkler("Thorkel", "Thorgier")).abs() < 0.001);
}
#[test]
fn jaro_winkler_length_of_one() {
assert!((0.738 - jaro_winkler("Dinsdale", "D")).abs() < 0.001);
}
#[test]
fn jaro_winkler_very_long_prefix() {
assert!((1.0 - jaro_winkler("thequickbrownfoxjumpedoverx",
"thequickbrownfoxjumpedovery")).abs() <
0.001);
}
#[test]
fn levenshtein_empty() {
assert_eq!(0, levenshtein("", ""));
}
#[test]
fn levenshtein_same() {
assert_eq!(0, levenshtein("levenshtein", "levenshtein"));
}
#[test]
fn levenshtein_diff_short() {
assert_eq!(3, levenshtein("kitten", "sitting"));
}
#[test]
fn levenshtein_diff_with_space() {
assert_eq!(5, levenshtein("hello, world", "bye, world"));
}
#[test]
fn levenshtein_diff_multibyte() {
assert_eq!(3, levenshtein("öঙ香", "abc"));
assert_eq!(3, levenshtein("abc", "öঙ香"));
}
#[test]
fn levenshtein_diff_longer() {
let a = "The quick brown fox jumped over the angry dog.";
let b = "Lorem ipsum dolor sit amet, dicta latine an eam.";
assert_eq!(37, levenshtein(a, b));
}
#[test]
fn levenshtein_first_empty() {
assert_eq!(7, levenshtein("", "sitting"));
}
#[test]
fn levenshtein_second_empty() {
assert_eq!(6, levenshtein("kitten", ""));
}
#[test]
fn normalized_levenshtein_diff_short() {
assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
}
#[test]
fn normalized_levenshtein_for_empty_strings() {
assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
}
#[test]
fn normalized_levenshtein_first_empty() {
assert!(normalized_levenshtein("", "second").abs() < 0.00001);
}
#[test]
fn normalized_levenshtein_second_empty() {
assert!(normalized_levenshtein("first", "").abs() < 0.00001);
}
#[test]
fn normalized_levenshtein_identical_strings() {
assert!((normalized_levenshtein("identical", "identical") - 1.0).abs() < 0.00001);
}
#[test]
fn osa_distance_empty() {
assert_eq!(0, osa_distance("", ""));
}
#[test]
fn osa_distance_same() {
assert_eq!(0, osa_distance("damerau", "damerau"));
}
#[test]
fn osa_distance_first_empty() {
assert_eq!(7, osa_distance("", "damerau"));
}
#[test]
fn osa_distance_second_empty() {
assert_eq!(7, osa_distance("damerau", ""));
}
#[test]
fn osa_distance_diff() {
assert_eq!(3, osa_distance("ca", "abc"));
}
#[test]
fn osa_distance_diff_short() {
assert_eq!(3, osa_distance("damerau", "aderua"));
}
#[test]
fn osa_distance_diff_reversed() {
assert_eq!(3, osa_distance("aderua", "damerau"));
}
#[test]
fn osa_distance_diff_multibyte() {
assert_eq!(3, osa_distance("öঙ香", "abc"));
assert_eq!(3, osa_distance("abc", "öঙ香"));
}
#[test]
fn osa_distance_diff_unequal_length() {
assert_eq!(6, osa_distance("damerau", "aderuaxyz"));
}
#[test]
fn osa_distance_diff_unequal_length_reversed() {
assert_eq!(6, osa_distance("aderuaxyz", "damerau"));
}
#[test]
fn osa_distance_diff_comedians() {
assert_eq!(5, osa_distance("Stewart", "Colbert"));
}
#[test]
fn osa_distance_many_transpositions() {
assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk"));
}
#[test]
fn osa_distance_diff_longer() {
let a = "The quick brown fox jumped over the angry dog.";
let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
assert_eq!(36, osa_distance(a, b));
}
#[test]
fn osa_distance_beginning_transposition() {
assert_eq!(1, osa_distance("foobar", "ofobar"));
}
#[test]
fn osa_distance_end_transposition() {
assert_eq!(1, osa_distance("specter", "spectre"));
}
#[test]
fn osa_distance_restricted_edit() {
assert_eq!(4, osa_distance("a cat", "an abct"));
}
#[test]
fn damerau_levenshtein_empty() {
assert_eq!(0, damerau_levenshtein("", ""));
}
#[test]
fn damerau_levenshtein_same() {
assert_eq!(0, damerau_levenshtein("damerau", "damerau"));
}
#[test]
fn damerau_levenshtein_first_empty() {
assert_eq!(7, damerau_levenshtein("", "damerau"));
}
#[test]
fn damerau_levenshtein_second_empty() {
assert_eq!(7, damerau_levenshtein("damerau", ""));
}
#[test]
fn damerau_levenshtein_diff() {
assert_eq!(2, damerau_levenshtein("ca", "abc"));
}
#[test]
fn damerau_levenshtein_diff_short() {
assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
}
#[test]
fn damerau_levenshtein_diff_reversed() {
assert_eq!(3, damerau_levenshtein("aderua", "damerau"));
}
#[test]
fn damerau_levenshtein_diff_multibyte() {
assert_eq!(3, damerau_levenshtein("öঙ香", "abc"));
assert_eq!(3, damerau_levenshtein("abc", "öঙ香"));
}
#[test]
fn damerau_levenshtein_diff_unequal_length() {
assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz"));
}
#[test]
fn damerau_levenshtein_diff_unequal_length_reversed() {
assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau"));
}
#[test]
fn damerau_levenshtein_diff_comedians() {
assert_eq!(5, damerau_levenshtein("Stewart", "Colbert"));
}
#[test]
fn damerau_levenshtein_many_transpositions() {
assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk"));
}
#[test]
fn damerau_levenshtein_diff_longer() {
let a = "The quick brown fox jumped over the angry dog.";
let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
assert_eq!(36, damerau_levenshtein(a, b));
}
#[test]
fn damerau_levenshtein_beginning_transposition() {
assert_eq!(1, damerau_levenshtein("foobar", "ofobar"));
}
#[test]
fn damerau_levenshtein_end_transposition() {
assert_eq!(1, damerau_levenshtein("specter", "spectre"));
}
#[test]
fn damerau_levenshtein_unrestricted_edit() {
assert_eq!(3, damerau_levenshtein("a cat", "an abct"));
}
#[test]
fn normalized_damerau_levenshtein_diff_short() {
assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
}
#[test]
fn normalized_damerau_levenshtein_for_empty_strings() {
assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
}
#[test]
fn normalized_damerau_levenshtein_first_empty() {
assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
}
#[test]
fn normalized_damerau_levenshtein_second_empty() {
assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
}
#[test]
fn normalized_damerau_levenshtein_identical_strings() {
assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
}
}


@@ -0,0 +1,49 @@
extern crate strsim;
use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance, damerau_levenshtein,
normalized_damerau_levenshtein, jaro, jaro_winkler};
#[test]
fn hamming_works() {
match hamming("hamming", "hammers") {
Ok(distance) => assert_eq!(3, distance),
Err(why) => panic!("{:?}", why)
}
}
#[test]
fn levenshtein_works() {
assert_eq!(3, levenshtein("kitten", "sitting"));
}
#[test]
fn normalized_levenshtein_works() {
assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
}
#[test]
fn osa_distance_works() {
assert_eq!(3, osa_distance("ac", "cba"));
}
#[test]
fn damerau_levenshtein_works() {
assert_eq!(2, damerau_levenshtein("ac", "cba"));
}
#[test]
fn normalized_damerau_levenshtein_works() {
assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
}
#[test]
fn jaro_works() {
assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
0.001);
}
#[test]
fn jaro_winkler_works() {
assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
0.001);
}