75 lines
2.0 KiB
Python
75 lines
2.0 KiB
Python
#!/usr/bin/env python
|
|
|
|
# This does simple normalized frequency analysis on UTF-8 encoded text. The
# result of the analysis is translated to a ranked list, where every byte is
# assigned a rank. This list is printed to stdout as Rust source code, which
# is meant to be saved as src/freqs.rs.
#
# Currently, the frequencies are generated from the following corpora:
#
#   * The CIA world fact book
#   * The source code of rustc
#   * Septuaginta
|
|
|
|
from __future__ import absolute_import, division, print_function
|
|
|
|
import argparse
|
|
from collections import Counter
|
|
import sys
|
|
|
|
# Header comment emitted at the top of the generated Rust source. The leading
# newline of the literal (kept for readability) is removed by lstrip(); the
# trailing newline is kept.
preamble = '''
// NOTE: The following code was generated by "scripts/frequencies.py", do not
// edit directly
'''.lstrip()
|
|
|
|
|
|
def eprint(*args, **kwargs):
    """Print to standard error.

    Accepts the same positional and keyword arguments as the built-in
    ``print`` function. Any ``file`` keyword supplied by the caller is
    overridden, so the output always goes to ``sys.stderr``.
    """
    # Looked up at call time, so a redirected sys.stderr is honoured.
    kwargs['file'] = sys.stderr
    print(*args, **kwargs)
|
|
|
|
|
|
def main():
    """Rank every byte value by its frequency in the given corpus files.

    The corpus file paths are taken from the command line (one or more
    ``FILE`` arguments). Each file is read as raw bytes, and every byte
    occurrence contributes ``1 / len(file)`` to that byte's score, so each
    non-empty file carries the same total weight regardless of its size.

    The scores are turned into ranks in the range 0..255, where a lower rank
    means a lower relative frequency. The ranks are printed to stdout as a
    Rust ``BYTE_FREQUENCIES`` array, preceded by the module-level
    ``preamble``. Progress messages go to stderr via ``eprint``.

    Argument errors terminate the process through ``argparse``; an unreadable
    file propagates the exception raised by ``open``.
    """
    p = argparse.ArgumentParser()
    p.add_argument('corpus', metavar='FILE', nargs='+')
    args = p.parse_args()

    # Get frequency counts of each byte. Every byte value is seeded with zero
    # so that bytes absent from the corpus still receive a rank.
    freqs = Counter()
    for i in range(256):
        freqs[i] = 0

    eprint('reading entire corpus into memory')
    corpus = []
    for fpath in args.corpus:
        # Use a context manager so each handle is closed as soon as it is read.
        with open(fpath, 'rb') as f:
            corpus.append(f.read())

    eprint('computing byte frequencies')
    for c in corpus:
        if not c:
            # An empty file has no bytes to count (and no valid weight).
            continue
        # Loop-invariant: every byte of this file carries the same weight.
        weight = 1.0 / float(len(c))
        # bytearray yields integers on both Python 2 and Python 3, whereas
        # iterating the raw read() result yields 1-char strings on Python 2.
        for byte in bytearray(c):
            freqs[byte] += weight

    eprint('writing Rust code')
    # Get the rank of each byte. A lower rank => lower relative frequency.
    # most_common() lists the most frequent byte first, so it gets rank 255.
    rank = [0] * 256
    for i, (byte, _) in enumerate(freqs.most_common()):
        rank[byte] = 255 - i

    # Forcefully set the highest rank possible for bytes that start multi-byte
    # UTF-8 sequences. The idea here is that a continuation byte will be more
    # discerning in a homogenous haystack.
    for byte in range(0xC0, 0xFF + 1):
        rank[byte] = 255

    # Now write Rust.
    olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
    for byte in range(256):
        olines.append(' %3d, // %r' % (rank[byte], chr(byte)))
    olines.append('];')

    print(preamble)
    print('\n'.join(olines))
|
|
|
|
|
|
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|