Hi, I have a question about reading books with Python.
I have an assignment that wants me to count the number of unique words in the English translation of Hamlet. The file I am using is taken from the Gutenberg library: Index of /files/1787
I am seriously lost because this code returns 4797 for me, but the solution tells me 3348.
I am not sure where my mistake could be.
I would be immensely grateful for any help.
import os
import pandas as pd
import numpy as np
from collections import Counter
def count_words_fast(text):
text = text.lower()
skips = [".", ",", ";", ":", "'", '"', "\n", "!", "?", "(", ")"]
for ch in skips:
text = text.replace(ch, "")
word_counts = Counter(text.split(" "))
return word_counts
def word_stats(word_counts):
num_unique = len(word_counts)
counts = word_counts.values()
return (num_unique, counts)
def word_count_distribution(text):
word_counts = count_words_fast(text)
count_distribution = Counter(word_counts.values())
return count_distribution
def more_frequent(distribution):
counts = list(distribution.keys())
frequency_of_counts = list(distribution.values())
cumulative_frequencies = np.cumsum(frequency_of_counts)
more_frequent = 1 - cumulative_frequencies / cumulative_frequencies[-1]
return dict(zip(counts, more_frequent))
def read_book(title_path):
"""Read a book and return it as a string."""
with open(title_path, "r", encoding="utf8") as current_file:
text=current_file.read()
text=text.replace("\n", " ").replace("\r", " ")
return text
hamlets = pd.DataFrame(columns = ["language","text"])
book_dir = "Books1"
title_num = 1
for language in os.listdir(book_dir):
for author in os.listdir(book_dir + "/" + language):
for title in os.listdir(book_dir + "/" + language + "/" + author):
if title == "Hamlet":
book_dir="./Books1"
inputfile = "Books1/"+language+"/"+author+"/"+title+".txt"
text = read_book(inputfile)
hamlets.loc[title_num] = language, text
title_num += 1
#print(hamlets)
book_dir="./Books1"
text=read_book("./Books1/English/shakespeare/Hamlet.txt")
hamlets.loc[title_num] = language, text
counted_text = count_words_fast(text)
data = pd.DataFrame({
"word": list(counted_text.keys()),
"count": list(counted_text.values())
})
data.head(10)
language, text = hamlets.iloc[0]
counted_text = count_words_fast(text)
data = pd.DataFrame({
"word": list(counted_text.keys()),
"count": list(counted_text.values())
})
data["length"] = data["word"].apply(len)
#data.loc[data["count"] > 10, "frequency"] = "frequent"
#data.loc[data["count"] <= 10, "frequency"] = "infrequent"
data.loc[data["count"] == 1, "frequency"] = "unique"
print(data)