### Coding Review

To succeed at Computer Science you need to be able to understand (and communicate using) code. Below are three snippets of Python code, each more complicated than the last. Can you explain how they work and what they do without running them, and without searching online references?

Some text processing:

import time
import statistics
import math

# Ask for a book (a plain-text file) and summarise how its words are used.
book_name = input("What's the name of the book? ")

# Translation table that deletes the punctuation , . ! ? ' " : from a word in
# a single pass.
PUNCTUATION = str.maketrans("", "", ",.!?'\":")

# stats maps each cleaned word to the list of lines it was found on, with one
# entry per use, so len(stats[word]) is the number of times the word occurs.
stats = {}
total_words = 0

# The with-block guarantees the file is closed, even if reading fails.
with open(book_name, encoding="utf-8") as book:
    for line in book:
        for word in line.split():
            # Drop punctuation and fold case so "The", "the" and "the," all
            # count as the same word.
            word_clean = word.translate(PUNCTUATION).casefold()

            # A token made only of punctuation cleans down to "" - skip it.
            if word_clean:
                stats.setdefault(word_clean, []).append(line)
                total_words += 1

# most is a (word, use count) pair; the sentinel count of -infinity loses to
# any real word.
most = ("::::::", -math.inf)
unique = []

for word, lines_with_word in stats.items():
    length = len(lines_with_word)
    # Compare with the count stored in the pair, not with the pair itself.
    if length > most[1]:
        most = (word, length)
    # Words used exactly once are the "unique" ones.
    if length == 1:
        unique.append(word)

print("Summarizing [{}]".format(book_name))
print("There are {} total words.".format(total_words))
print("There are {} unique words.".format(len(unique)))
print("There are {} distinct words.".format(len(stats)))
# With no words at all there is no most-used word to report.
if stats:
    print("Most word used is [{}] with {} uses".format(most[0], most[1]))


A simple simulation:

import random

# Upper bound on the number of generations a single trial simulates.
MAX = 20

def generation(size):
    """Simulate one generation of ``size`` amoebas and return the new count.

    One ``random.random()`` draw is made per amoeba. A draw above 0.25 means
    the amoeba splits and contributes two amoebas to the next generation;
    otherwise it dies and contributes none.
    """
    splits = sum(1 for _ in range(size) if random.random() > 0.25)
    # every split turns one amoeba into two
    return splits * 2

def trial():
    """Grow a colony from a single amoeba for at most MAX generations.

    The colony is advanced one generation at a time until it dies out or MAX
    generations have been simulated. A one-line summary is printed.

    Returns a tuple ``(generations simulated, whether that number equals MAX,
    final amoeba count)``.
    """
    colony = 1
    generations_run = 0

    while True:
        # stop once the colony is gone or the generation limit is reached
        if colony <= 0 or generations_run >= MAX:
            break
        colony = generation(colony)
        generations_run += 1

    reached_max = generations_run == MAX
    print("Reached generation {} with {} amoebas".format(generations_run, colony))
    return (generations_run, reached_max, colony)

def mean(data):
    """Return the arithmetic mean of the numbers in ``data``.

    Args:
        data: an iterable of numbers.

    Returns:
        The sum of the values divided by how many there are, or ``0.0`` when
        ``data`` is empty, so the result can always be formatted as a number.
    """
    values = list(data)
    if not values:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return sum(values) / len(values)

NUM_TRIALS = 1000

# Run every trial first; each outcome is the 3-tuple returned by trial():
# (generations simulated, reached MAX?, final amoeba count).
outcomes = [trial() for _ in range(NUM_TRIALS)]

# Trials that stopped before MAX contribute their generation count ...
data = [generations for generations, reached_max, _ in outcomes if not reached_max]
# ... and trials that reached MAX contribute their final population.
population = [final_count for _, reached_max, final_count in outcomes if reached_max]

failure_percent = len(data) / NUM_TRIALS * 100
print("If the amoebas did not survive {:.2f}% of the time.  On failures, there were {:.2f} generations.".format(failure_percent, mean(data)))
print("If the amoebas did survive, the average population was {:.2f}.".format(mean(population)))


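Something a little more challenging (note that we are using Shannon's entropy calculation: $entropy = -\sum_{i=1}^n P(x_i)\log_2 P(x_i)$, where $P(x_i)$ is the probability of letter $x_i$, i.e. the number of times it occurs divided by the length of the password, and the sum runs over the $n$ distinct letters of the password):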

import math
import time

# Column indices into a password-file line once it has been split on ":".
# USER_COL is not referenced in the visible code; presumably it is the
# user-name field - confirm against the rest of the file.
USER_COL = 0
# PASS_COL is the password field read by the calculate_* functions.
PASS_COL = 1

def report(min_complexity, max_complexity, most_common, least_common):
    """Print the minimum and maximum password complexity, in bits.

    Each value is printed on its own tab-indented line with four decimals.
    NOTE: ``most_common`` and ``least_common`` are accepted but not printed.
    """
    rows = (
        ("\tMinimum: {complexity:8.4f} bits", min_complexity),
        ("\tMaximum: {complexity:8.4f} bits", max_complexity),
    )
    for template, value in rows:
        print(template.format(complexity=value))

def calculate_most_and_least_common(lines):
    """Find the most and the least frequently used password in ``lines``.

    Args:
        lines: a non-empty iterable of colon-separated strings; the password
            is the field at index PASS_COL after splitting on ":".

    Returns:
        A ``(most_common, least_common)`` tuple of passwords, each stripped
        of surrounding whitespace and lower-cased. When several passwords
        share the same count, the one that appears first in ``lines`` wins.

    Raises:
        ValueError: if ``lines`` contains no entries.
        IndexError: if a line has no field at index PASS_COL.
    """
    from collections import Counter

    # Tally every normalised password; Counter remembers first-seen order.
    counts = Counter(
        line.split(":")[PASS_COL].strip().lower() for line in lines
    )
    if not counts:
        raise ValueError("lines must contain at least one password")

    # max()/min() return the first key holding the extreme count, so ties are
    # resolved in favour of the earliest password in the file.
    most_common = max(counts, key=counts.get)
    least_common = min(counts, key=counts.get)
    return most_common, least_common

def calculate_passwd_complexity(lines):
    """Compute the lowest and highest Shannon entropy among the passwords.

    Args:
        lines: an iterable of colon-separated strings; the password is the
            field at index PASS_COL after splitting on ":". Passwords are
            stripped of surrounding whitespace and lower-cased first.

    Returns:
        A ``(min_complexity, max_complexity)`` tuple in bits. When ``lines``
        is empty the result is ``(math.inf, -math.inf)``.

    Raises:
        IndexError: if a line has no field at index PASS_COL.
    """
    from collections import Counter

    min_complexity = math.inf  # any real entropy is smaller than this
    max_complexity = -math.inf  # any real entropy is larger than this

    for line in lines:
        passwd = line.split(":")[PASS_COL].strip().lower()
        length = len(passwd)

        # Shannon entropy: -sum(p * log2(p)) over the *distinct* characters,
        # where p is the character's share of the password. Counting each
        # distinct character once keeps repeated characters from being added
        # several times, and a single counting pass replaces a
        # passwd.count() call per character.
        complexity = 0.0
        for occurrences in Counter(passwd).values():
            probability = occurrences / length
            # p * log2(p) is never positive, so subtracting it keeps the
            # running total at or above +0.0 (no "-0.0" results).
            complexity -= probability * math.log2(probability)

        min_complexity = min(min_complexity, complexity)
        max_complexity = max(max_complexity, complexity)

    return min_complexity, max_complexity

file_name = input("Enter the name of the password file: ")

passwd_file = open(file_name, encoding="utf-8")