更新libclamav库1.0.0版本
This commit is contained in:
74
clamav/libclamav_rust/.cargo/vendor/memchr/scripts/make-byte-frequency-table
vendored
Normal file
74
clamav/libclamav_rust/.cargo/vendor/memchr/scripts/make-byte-frequency-table
vendored
Normal file
@@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# This does simple normalized frequency analysis on UTF-8 encoded text. The
|
||||
# result of the analysis is translated to a ranked list, where every byte is
|
||||
# assigned a rank. This list is written to src/freqs.rs.
|
||||
#
|
||||
# Currently, the frequencies are generated from the following corpuses:
|
||||
#
|
||||
# * The CIA world fact book
|
||||
# * The source code of rustc
|
||||
# * Septuaginta
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
from collections import Counter
|
||||
import sys
|
||||
|
||||
preamble = '''
|
||||
// NOTE: The following code was generated by "scripts/frequencies.py", do not
|
||||
// edit directly
|
||||
'''.lstrip()
|
||||
|
||||
|
||||
def eprint(*args, **kwargs):
|
||||
kwargs['file'] = sys.stderr
|
||||
print(*args, **kwargs)
|
||||
|
||||
|
||||
def main():
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument('corpus', metavar='FILE', nargs='+')
|
||||
args = p.parse_args()
|
||||
|
||||
# Get frequency counts of each byte.
|
||||
freqs = Counter()
|
||||
for i in range(0, 256):
|
||||
freqs[i] = 0
|
||||
|
||||
eprint('reading entire corpus into memory')
|
||||
corpus = []
|
||||
for fpath in args.corpus:
|
||||
corpus.append(open(fpath, 'rb').read())
|
||||
|
||||
eprint('computing byte frequencies')
|
||||
for c in corpus:
|
||||
for byte in c:
|
||||
freqs[byte] += 1.0 / float(len(c))
|
||||
|
||||
eprint('writing Rust code')
|
||||
# Get the rank of each byte. A lower rank => lower relative frequency.
|
||||
rank = [0] * 256
|
||||
for i, (byte, _) in enumerate(freqs.most_common()):
|
||||
# print(byte)
|
||||
rank[byte] = 255 - i
|
||||
|
||||
# Forcefully set the highest rank possible for bytes that start multi-byte
|
||||
# UTF-8 sequences. The idea here is that a continuation byte will be more
|
||||
# discerning in a homogenous haystack.
|
||||
for byte in range(0xC0, 0xFF + 1):
|
||||
rank[byte] = 255
|
||||
|
||||
# Now write Rust.
|
||||
olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
|
||||
for byte in range(256):
|
||||
olines.append(' %3d, // %r' % (rank[byte], chr(byte)))
|
||||
olines.append('];')
|
||||
|
||||
print(preamble)
|
||||
print('\n'.join(olines))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
Reference in New Issue
Block a user