#!/usr/bin/env python3
# http://inamidst.com/voynich/related
# Created by Sean B. Palmer

import math
from collections import Counter

def chomp(octets):
    # Strip a single trailing LF (0x0A) from a line of bytes
    if octets:
        if octets[-1] == 10:
            return octets[:-1]
    return octets

def voynich_words():
    # Yield each line of the transcription file as a bytes "word"
    with open("voynich101_comma.txt", "rb") as f:
        for line in f:
            yield chomp(line)

frequency = Counter()

def create_model():
    # Build a unigram model mapping each word to its relative frequency,
    # and tally per-character frequencies into the global Counter as a side effect
    counts = Counter()
    total_count = 0
    model = {}
    for word in voynich_words():
        for char in word:
            frequency[char] += 1
        counts[word] += 1
        total_count += 1
    unique_words = len(counts)
    for (word, count) in counts.most_common(unique_words):
        probability = float(count) / total_count
        model[word] = probability
    return model

model = create_model()

def entropy(words):
    # Per-word cross-entropy (bits) under the unigram model, with linear
    # interpolation smoothing: a word's probability is L1 * model[word]
    # plus a uniform share of Lunk over a nominal vocabulary of V items
    L1 = 0.95
    Lunk = 1 - L1
    unk = 0
    V = 1000000
    W = 0
    H = 0
    for word in words:
        W += 1
        P = Lunk / V
        if word in model:
            P += L1 * model[word]
        else:
            unk += 1
        H += -math.log(P, 2)
    # H / W is entropy
    # coverage is (W - unk) / W
    return H / W

# First 48 bytes of the frequent-character list, one transcription character per byte
with open("voynich101_frequent.txt", "rb") as f:
    top48 = f.read(48)

print("original\treplacement\tdifference")
normal = entropy(voynich_words())
for a in top48:
    a = bytes([a])
    for b in top48:
        b = bytes([b])
        # Replace every occurrence of character a with character b,
        # then measure how far the per-word entropy drops relative to the baseline
        words = (word.replace(a, b) for word in voynich_words())
        e = entropy(words)
        difference = normal - e
        a2 = ord(a)
        b2 = ord(b)
        # f = frequency[a2]
        print("%s\t%s\t%s" % (a2, b2, difference))
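
# Usage sketch (assumptions, not from the original source: the script is
# saved as related.py, and "voynich101_comma.txt" and
# "voynich101_frequent.txt" are present in the working directory):
#
#     python3 related.py > related.tsv
#
# Each output row holds the byte values of the original and replacement
# characters followed by the entropy difference; a larger difference means
# merging the two characters compresses the word distribution more, i.e.
# the characters behave more like variants of one another.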