Coding Review

To succeed at Computer Science you need to be able to understand (and communicate using) code. Below are three snippets of Python code, each more complicated than the last. Can you explain how they work and what they do without running them, and without searching online references?

Some text processing:

import time
import statistics
import math

# Punctuation characters deleted from every token before it is counted.
_PUNCTUATION = str.maketrans("", "", ",.!?'\":")


def clean_word(word):
    """Normalize one whitespace-delimited token for counting.

    Deletes the characters , . ! ? ' " : wherever they occur, trims any
    surrounding whitespace and case-folds what is left.

    Returns:
        The cleaned word, or "" when nothing remains (for example when the
        token consisted only of punctuation).
    """
    return word.translate(_PUNCTUATION).strip().lower().casefold()


def collect_word_stats(lines):
    """Index every word found in *lines* by its cleaned form.

    Args:
        lines: iterable of text lines (an open text file works).

    Returns:
        A ``(stats, total_words)`` tuple.  ``stats`` maps each cleaned word
        to a list that holds the source line once per occurrence, so
        ``len(stats[word])`` is the number of times the word was used.
        ``total_words`` counts every non-empty cleaned word.
    """
    stats = {}
    total_words = 0
    for line in lines:
        for word in line.split():
            word_clean = clean_word(word)
            if word_clean:
                stats.setdefault(word_clean, []).append(line)
                total_words += 1
    return stats, total_words


def most_used(stats):
    """Return ``(word, count)`` for the most frequently used word.

    Ties go to the word that was seen first.  When *stats* is empty the
    placeholder ``("::::::", -math.inf)`` is returned.
    """
    most = ("::::::", -math.inf)
    for word, occurrences in stats.items():
        if len(occurrences) > most[1]:
            most = (word, len(occurrences))
    return most


def main():
    """Ask for a book file name and print a summary of its words."""
    book_name = input("What's the name of the book? ")
    # The context manager closes the file even if reading fails part-way.
    with open(book_name) as book:
        stats, total_words = collect_word_stats(book)

    most = most_used(stats)
    # "Unique" words are those used exactly once in the whole book;
    # "distinct" words are all the different words, however often used.
    unique = [word for word, occurrences in stats.items() if len(occurrences) == 1]

    print("Summarizing [{}]".format(book_name))
    print("There are {} total words.".format(total_words))
    print("There are {} unique words.".format(len(unique)))
    print("There are {} distinct words.".format(len(stats)))
    print("Most word used is [{}] with {} uses".format(most[0], most[1]))


if __name__ == "__main__":
    main()

A simple simulation:

import random

MAX = 20  # generation cap for a single trial
NUM_TRIALS = 1000  # number of independent lineages simulated by main()


def generation(size):
    """Advance a population of *size* amoebas by one generation.

    Each amoeba independently either dies (contributing nothing) or, when
    ``random.random() > 0.25``, splits into two.

    Returns:
        The size of the next generation: an even number between 0 and
        ``2 * size``.
    """
    return sum(2 for _ in range(size) if random.random() > 0.25)


def trial():
    """Simulate one lineage that starts from a single amoeba.

    Generations are simulated until the population reaches zero or MAX
    generations have elapsed; a one-line summary is then printed.

    Returns:
        A tuple ``(iteration, reached_max, population)``: the number of
        generations simulated, whether that number equals MAX, and the
        final population size.
    """
    population = 1
    iteration = 0
    while population > 0 and iteration < MAX:
        population = generation(population)
        iteration += 1
    print("Reached generation {} with {} amoebas".format(iteration, population))
    return (iteration, iteration == MAX, population)


def mean(data):
    """Return the arithmetic mean of *data*.

    An empty *data* yields NaN instead of raising ZeroDivisionError, so a
    run in which one of the outcome groups is empty can still be reported.
    """
    if len(data) == 0:
        return float("nan")
    return sum(data) / len(data)


def split_outcomes(results):
    """Separate trial results into extinct and surviving lineages.

    Args:
        results: iterable of ``(iteration, reached_max, population)``
            tuples as returned by ``trial()``.

    Returns:
        A tuple ``(data, population)``: the generation counts of lineages
        that died out, and the final sizes of lineages still alive.
    """
    data = []
    population = []
    for iterations, _reached_max, pop in results:
        # Classify on the final population rather than the reached-MAX
        # flag: a lineage whose last amoeba dies in generation MAX itself
        # reached MAX but did not survive.
        if pop == 0:
            data.append(iterations)
        else:
            population.append(pop)
    return data, population


def main():
    """Run NUM_TRIALS trials and print the survival statistics."""
    data, population = split_outcomes(trial() for _ in range(NUM_TRIALS))
    print("If the amoebas did not survive {:.2f}% of the time. On failures, there were {:.2f} generations.".format(len(data) / NUM_TRIALS * 100, mean(data)))
    print("If the amoebas did survive, the average population was {:.2f}.".format(mean(population)))


if __name__ == "__main__":
    main()

Something a little more challenging (note that we are using Shannon's entropy calculation: $\text{entropy} = -\sum_{i=1}^{n} P(x_i)\log_2 P(x_i)$, where $P(x)$ is the probability of letter $x$):

import math
import time

USER_COL = 0
PASS_COL = 1


def _parse_record(line):
    """Split one colon-separated line into ``(username, passwd)``.

    The password is stripped of surrounding whitespace and lower-cased; the
    username is returned exactly as it appears.  Returns None for a line
    that has no password column (a blank line, or one without a colon).
    """
    parts = line.split(":")
    if len(parts) <= max(USER_COL, PASS_COL):
        return None
    return parts[USER_COL], parts[PASS_COL].strip().lower()


def password_entropy(passwd):
    """Return the complexity score of *passwd* in bits (never negative).

    For every character of the password, its relative frequency p within
    the password is computed and ``p * log2(p)`` is accumulated; the score
    is the magnitude of that sum.  An empty password scores 0.0.
    """
    # NOTE(review): the sum runs over every character *position*, so a
    # character occurring k times contributes k terms.  Textbook Shannon
    # entropy sums once per distinct symbol.  The original behavior is
    # kept here -- confirm which definition is intended.
    total = 0.0
    length = len(passwd)
    for char in passwd:
        probability = passwd.count(char) / length
        total += probability * math.log2(probability)
    # total is <= 0; abs() flips the sign and also turns -0.0 into 0.0 so
    # both the minimum and the maximum print without a stray minus sign.
    return abs(total)


def report(min_complexity, max_complexity, most_common, least_common,
           password_table=None):
    """Print the complexity range and the most/least common passwords.

    Args:
        min_complexity: lowest complexity score, in bits.
        max_complexity: highest complexity score, in bits.
        most_common: the password shared by the most users.
        least_common: the password shared by the fewest users.
        password_table: dict mapping password -> list of usernames.  When
            omitted, the module-level ``passwords`` dict is used.

    At most the first ten usernames are listed for each password.
    """
    table = passwords if password_table is None else password_table
    print("Password complexity")
    print("\tMinimum: {complexity:8.4f} bits".format(complexity=min_complexity))
    print("\tMaximum: {complexity:8.4f} bits".format(complexity=max_complexity))
    print("\tMost common: {password}".format(password=most_common))
    for user in table[most_common][0:10]:
        print("\t\t{username}".format(username=user))
    print("\tLeast common: {password}".format(password=least_common))
    for user in table[least_common][0:10]:
        print("\t\t{username}".format(username=user))


def calculate_most_and_least_common(lines, password_table=None):
    """Find the passwords used by the most and by the fewest users.

    Args:
        lines: the raw lines of the password file.
        password_table: dict mapping password -> list of usernames.  When
            omitted, the module-level ``passwords`` dict is used.

    Returns:
        A tuple ``(most_common, least_common)``.  Ties are resolved in
        favor of the password that appears first in *lines*.

    Raises:
        ValueError: if *lines* contains no parsable record.
    """
    table = passwords if password_table is None else password_table
    candidates = [
        record[1]
        for record in map(_parse_record, lines)
        if record is not None
    ]
    if not candidates:
        raise ValueError("no username:password records to search")

    # The first password in the file is a good default for both answers.
    most_common = least_common = candidates[0]
    for passwd in candidates:
        if len(table[passwd]) > len(table[most_common]):
            most_common = passwd
        if len(table[passwd]) < len(table[least_common]):
            least_common = passwd
    return most_common, least_common


def calculate_passwd_complexity(lines):
    """Group usernames by password and track the complexity range.

    Lines without a password column are skipped.

    Args:
        lines: the raw lines of the password file.

    Returns:
        A tuple ``(passwords, min_complexity, max_complexity)`` where
        ``passwords`` maps each lower-cased password to the list of
        usernames using it.  With no parsable lines the range stays at its
        initial ``(math.inf, -math.inf)``.
    """
    min_complexity = math.inf   # any real score is smaller than this
    max_complexity = -math.inf  # any real score is larger than this
    passwords = {}
    for line in lines:
        record = _parse_record(line)
        if record is None:
            continue
        username, passwd = record
        complexity = password_entropy(passwd)
        min_complexity = min(min_complexity, complexity)
        max_complexity = max(max_complexity, complexity)
        passwords.setdefault(passwd, []).append(username)
    return passwords, min_complexity, max_complexity


if __name__ == "__main__":
    file_name = input("Enter the name of the password file: ")
    # The context manager closes the file as soon as it has been read.
    with open(file_name, encoding="utf-8") as passwd_file:
        lines = passwd_file.readlines()

    passwords, min_complexity, max_complexity = calculate_passwd_complexity(lines)
    most_common, least_common = calculate_most_and_least_common(lines, passwords)
    report(min_complexity, max_complexity, most_common, least_common, passwords)