Project Answers
File Processing
Pacman Warnings
Print out all log lines in the pacman-install.log file that contain “warning”.
The first 4 lines of output should look like this:
[2010-05-28 16:04] warning: /etc/sensors3.conf saved as /etc/sensors3.conf.pacorig
[2010-05-28 16:06] warning: /etc/mime.types saved as /etc/mime.types.pacorig
[2010-05-28 16:07] warning: /etc/mysql/my.cnf saved as /etc/mysql/my.cnf.pacorig
[2010-05-28 16:07] WARNING: The host 'Lab1' could not be looked up with resolveip.
Answer
with open('pacman-install.log') as pacman_log:
    for line in pacman_log:
        if ' warning' in line.lower():
            print(line.rstrip())
SSH Users
Using the gzipped file sshd.log.gz, write a program, ssh_users.py, that prints all users that logged in successfully.
To find the user login, start by searching for the phrase “session opened for user “.
The output should look like this:
trey
diane
melanie
Answer
import gzip

users = set()
with gzip.open('sshd.log.gz', mode='rt') as log_file:
    for line in log_file:
        if 'session opened for user ' in line:
            user = line.strip().split()[10]
            if user not in users:
                users.add(user)
                print(user)
Hint
You can use the open function in the gzip module to read this gzipped file.
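For example, gzip.open with mode='rt' decompresses and decodes on the fly, yielding ordinary strings, and in a typical sshd "session opened" line the username lands at index 10 after splitting on whitespace (the log line below is illustrative, not taken from sshd.log.gz):
import gzip

# Illustrative line in the style of sshd logs (not an actual line from the file):
line = "Jun  4 22:14:15 myhost sshd[3405]: pam_unix(sshd:session): session opened for user trey by (uid=0)"
print(line.split()[10])  # trey

# gzip.open in text mode yields str lines, just like a regular open() call:
with gzip.open('sshd.log.gz', mode='rt') as log_file:
    first_line = next(log_file)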
SSH Login Days
Using the same gzipped file, sshd.log.gz, create a program, ssh_login_days.py, that prints each day that each user successfully logged in, in chronological order.
The output should look like this:
Jun 04 trey
Jun 04 diane
Jun 05 diane
Jun 05 trey
Jun 05 melanie
Jun 07 melanie
Jun 07 trey
Answer
import gzip

seen = set()
with gzip.open('sshd.log.gz', mode='rt') as log_file:
    for line in log_file:
        if 'session opened for user ' in line:
            fields = line.split()
            day = ' '.join(fields[:2] + [fields[10]])
            if day not in seen:
                print(day)
                seen.add(day)
Tree Parser
Create a program that will accept a file like this:
a
    b
    c
        d
    e
        f
g
    h
        i
        j
k
    l
        m
And will output the leaf nodes of that tree like this:
/ a / b
/ a / c / d
/ a / e / f
/ g / h / i
/ g / h / j
/ k / l / m
Hint
You may want to use your DoublyLinkedNode class from your classes.py file.
Answers
from pathlib import Path
import sys


class Node:
    def __init__(self, name, *, ancestors=[]):
        self.ancestors = list(ancestors)
        self.name = name
    def ancestors_and_self(self):
        """Return iterable with our ordered ancestors and our own node."""
        return [*self.ancestors, self]
    def make_child(self, *args, **kwargs):
        """Create and return a child node of the current node."""
        return type(self)(*args, ancestors=self.ancestors_and_self(), **kwargs)
    def __str__(self):
        """Return a slash-delimited ancestors hierarchy for this node."""
        if self.ancestors:
            return f"{self.ancestors[-1]} / {self.name}"
        else:
            return self.name
    def __repr__(self):
        return self.name


class DoublyLinkedNode(Node):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.children = []
    def make_child(self, *args, **kwargs):
        """Create and return a child node of the current node."""
        node = super().make_child(*args, **kwargs)
        self.children.append(node)
        return node
    def leaves(self):
        """Return all leaves of the current node and all descendent nodes."""
        if self.is_leaf():
            return [self]
        else:
            return [
                node
                for child in self.children
                for node in child.leaves()
            ]
    def is_leaf(self):
        """Return True if this node is a leaf (node which has no children)."""
        return not self.children


class LevelNode(DoublyLinkedNode):
    def __init__(self, name, level, *args, **kwargs):
        self.level = level
        super().__init__(name, *args, **kwargs)
    def find_parent(self, level):
        """Return first ancestor node that is below the given level."""
        return next((
            node
            for node in reversed(self.ancestors_and_self())
            if node.level < level
        ))


def get_indentation(line):
    return len(line) - len(line.lstrip(' '))


node = root = LevelNode('', -1)
data = Path(sys.argv[1]).read_text()
for line in data.splitlines():
    indent_level = get_indentation(line)
    parent = node.find_parent(indent_level)
    node = parent.make_child(line.strip(), indent_level)

for node in root.leaves():
    print(str(node).strip())
CSV Merging Exercises
The Readings Files
The readings files contain an ID column and two year columns.
For readings1.csv, the year columns are Year 1 and Year 2:
ID,Year 1,Year 2
purple,405,679
blue,589,562
yellow,888,957
orange,508,708
brown,643,1227
green,320,191
red,458,1398
pink,668,1661
black,278,768
For readings2.csv, the year columns are Year 3 and Year 4:
ID,Year 3,Year 4
purple,2094,3326
blue,2892,359
yellow,1038,2737
orange,889,810
brown,2396,1021
green,937,3064
red,2444,3218
pink,465,621
black,559,3243
Same Order Merge
Write a program to merge the two CSV files that have the same first column by combining the columns from the two files (the columns from the first file should be first) and removing the extra ID column from the second file.
Assume the identifiers (the ID values) in the given reading files are the same (same number of rows and same values in each row).
Given this readings1.csv file:
ID,Year 1,Year 2
purple,405,679
blue,589,562
yellow,888,957
orange,508,708
brown,643,1227
green,320,191
red,458,1398
pink,668,1661
black,278,768
And this readings2.csv file:
ID,Year 3,Year 4
purple,2094,3326
blue,2892,359
yellow,1038,2737
orange,889,810
brown,2396,1021
green,937,3064
red,2444,3218
pink,465,621
black,559,3243
The merged readings-combined.csv CSV file should look like this:
ID,Year 1,Year 2,Year 3,Year 4
purple,405,679,2094,3326
blue,589,562,2892,359
yellow,888,957,1038,2737
orange,508,708,889,810
brown,643,1227,2396,1021
green,320,191,937,3064
red,458,1398,2444,3218
pink,668,1661,465,621
black,278,768,559,3243
Answers
import csv
import sys

[filename1, filename2, merged_filename] = sys.argv[1:]

with open(filename1, newline="") as csv1:
    rows1 = list(csv.reader(csv1))
with open(filename2, newline="") as csv2:
    rows2 = list(csv.reader(csv2))

with open(merged_filename, mode="wt", newline="") as merged_csv:
    writer = csv.writer(merged_csv)
    for row1, row2 in zip(rows1, rows2):
        writer.writerow(row1 + row2[1:])
Different Order Merge
Write a program to merge the two CSV files by the value of their first column by combining the columns from the two files (the columns from the first file should be first).
Assume the identifiers (the ID values) in the given reading files are the same (the values in the first column are the same but are not in the same order).
Given this readings1.csv file:
ID,Year 1,Year 2
orange,508,708
green,320,191
yellow,888,957
red,458,1398
black,278,768
pink,668,1661
brown,643,1227
blue,589,562
purple,405,679
And this readings2.csv file:
ID,Year 3,Year 4
green,937,3064
black,559,3243
blue,2892,359
pink,465,621
red,2444,3218
orange,889,810
brown,2396,1021
yellow,1038,2737
purple,2094,3326
The merged readings-combined.csv CSV file should look similar to this (note that the rows may occur in any order):
ID,Year 1,Year 2,Year 3,Year 4
purple,405,679,2094,3326
blue,589,562,2892,359
yellow,888,957,1038,2737
orange,508,708,889,810
brown,643,1227,2396,1021
green,320,191,937,3064
red,458,1398,2444,3218
pink,668,1661,465,621
black,278,768,559,3243
Answers
import csv
import sys

[filename1, filename2, merged_filename] = sys.argv[1:]

with open(filename1, newline="") as csv1:
    row_groups = {
        row[0]: row
        for row in csv.reader(csv1)
    }
with open(filename2, newline="") as csv2:
    for row in csv.reader(csv2):
        row_groups[row[0]] += row[1:]

with open(merged_filename, mode="wt", newline="") as merged_csv:
    writer = csv.writer(merged_csv)
    writer.writerows(row_groups.values())
Alternative solution, using *:
import csv
import sys

[filename1, filename2, merged_filename] = sys.argv[1:]

with open(filename1, newline="") as csv1:
    row_groups = {
        id: [id, *data]
        for id, *data in csv.reader(csv1)
    }
with open(filename2, newline="") as csv2:
    for id, *data in csv.reader(csv2):
        row_groups[id] += data

with open(merged_filename, mode="wt", newline="") as merged_csv:
    writer = csv.writer(merged_csv)
    writer.writerows(row_groups.values())
Job Analysis
These exercises require the file jobs.csv.
Group Parents and Children
Print out each parent job followed by each child job that corresponds to that parent.
The output should look something like this:
Lot Level Full job number Shared digits Unique number
Parent Lot 000JE0173V5.0E00#SBHPY_TS JE0173V5.0E 00
Child Lot 000JE0173V5.0E01#SBHPY_TS JE0173V5.0E 01
Child Lot 000JE0173V5.0E09#SBHPY JE0173V5.0E 09
Parent Lot 000JE01963X.0E00#SBKNR_TS JE01963X.0E 00
Child Lot 000JE01963X.0E09#SBKNR JE01963X.0E 09
Child Lot 000JE01963X.0E01#SBKNR_TS JE01963X.0E 01
Alternatively you could output those rows/columns into a delimited data file.
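The answers below pull the shared digits and the unique number out of each job ID with plain string slicing; using the first job number from the sample output above as a quick check:
job_id = '000JE0173V5.0E00#SBHPY_TS'
print(job_id[3:14])   # JE0173V5.0E  (shared digits)
print(job_id[14:16])  # 00           (unique number; '00' marks a parent lot)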
Answers
import csv

parents = set()
children = {}

with open('files/jobs.csv') as job_file:
    reader = csv.DictReader(job_file, delimiter='\t')
    for row in reader:
        job_id = row['JOB (lot number_TS)']
        shared = job_id[3:14]
        n = job_id[14:16]
        if n == '00':
            parents.add((job_id, shared, n))
        else:
            children.setdefault(shared, set()).add((job_id, n))

print("Lot Level\tFull job number\t\t\tShared digits\tUnique number")
for parent_id, shared, parent_n in parents:
    print(f"Parent Lot\t{parent_id.ljust(25)}\t{shared}\t{parent_n}")
    for child_id, child_n in children[shared]:
        print(f"Child Lot\t{child_id.ljust(25)}\t{shared}\t{child_n}")
Unclosed Parents
Identify parent jobs that should be closed because one or more of their child jobs are closed but the parent is not.
A job has been “closed” if the TRANSACTION_TYPE_NAME column is Job close variance at any point.
The output should look something like this:
Lot Level Full job number Shared digits Unique Closed?
Parent Lot 000JE0173V5.0E00#SBHPY_TS JE0173V5.0E 00 True
Child Lot 000JE0173V5.0E01#SBHPY_TS JE0173V5.0E 01 True
Child Lot 000JE0173V5.0E09#SBHPY JE0173V5.0E 09 False
Parent Lot 000JE01963X.0E00#SBKNR_TS JE01963X.0E 00 True
Child Lot 000JE01963X.0E01#SBKNR_TS JE01963X.0E 01 True
Child Lot 000JE01963X.0E09#SBKNR JE01963X.0E 09 False
If any parent jobs are not closed but they have at least one child job which is, an error should be printed out like this:
Error: 000JE0173V5.0E00#SBHPY_TS should be closed
Answers
import csv

parents = set()
children = {}
closed = set()

with open('files/jobs.csv') as job_file:
    reader = csv.DictReader(job_file, delimiter='\t')
    for row in reader:
        job_id = row['JOB (lot number_TS)']
        shared = job_id[3:14]
        n = job_id[14:16]
        if row['TRANSACTION_TYPE_NAME'] == "Job close variance":
            closed.add(job_id)
        if n == '00':
            parents.add((job_id, shared, n))
        else:
            children.setdefault(shared, set()).add((job_id, n))

print("Lot Level\tFull job number\t\t\tShared digits\tUnique\tClosed?")
for parent_id, shared, parent_n in parents:
    print(
        f"Parent Lot\t{parent_id.ljust(25)}\t{shared}\t{parent_n}\t"
        f"{parent_id in closed}"
    )
    for child_id, child_n in children[shared]:
        print(
            f"Child Lot\t{child_id.ljust(25)}\t{shared}\t{child_n}\t"
            f"{child_id in closed}"
        )

for parent_id, shared, parent_n in parents:
    if (parent_id not in closed and
            any(child_id in closed for child_id, child_n in children[shared])):
        print(f"Error: {parent_id} should be closed")
CSV Comparison Exercises
These exercises require the files restaurants2017.csv and restaurants2018.csv.
Find New/Missing Rows
Open the two restaurant files and print out:
Each row in the first file where information on that ZIP code is only in restaurants2017.csv
Each row in the second file where information on that ZIP code is only in restaurants2018.csv
Answers
import csv

filename1 = 'restaurants2017.csv'
filename2 = 'restaurants2018.csv'

with open(filename1) as file1:
    reader1 = csv.reader(file1)
    next(reader1)
    rows1 = list(reader1)
with open(filename2) as file2:
    reader2 = csv.reader(file2)
    next(reader2)
    rows2 = list(reader2)

zips1 = set()
for zip_code, *rest in rows1:
    zips1.add(zip_code)
zips2 = set()
for zip_code, *rest in rows2:
    zips2.add(zip_code)

print(zips1 - zips2)
print(zips2 - zips1)
Alternative solution:
import csv

filename1 = 'restaurants2017.csv'
filename2 = 'restaurants2018.csv'

def get_zip_codes(restaurant_file):
    return {
        zip_code
        for zip_code, *rest in csv.reader(restaurant_file)
    }

with open(filename1) as file1:
    zips1 = get_zip_codes(file1)
with open(filename2) as file2:
    zips2 = get_zip_codes(file2)

print('missing ZIP codes', *(zips1 - zips2))
print('new ZIP codes', *(zips2 - zips1))
Copy Reviews
For ZIP codes where the restaurant and owner are the same between restaurants2017.csv and restaurants2018.csv, copy the review column (if any) to restaurants2018.csv.
Answers
import csv

filename1 = 'restaurants2017.csv'
filename2 = 'restaurants2018.csv'

def get_csv_rows(filename):
    with open(filename) as csv_file:
        reader = csv.reader(csv_file)
        headers = next(reader)
        return headers, list(reader)

headers, rows1 = get_csv_rows(filename1)
headers, rows2 = get_csv_rows(filename2)

restaurants = {}

# Storing the rows1 data
for row in rows1:
    zip_code, name, owner, *rest = row
    restaurants[zip_code, name, owner] = row

# Putting rows1 reviews into rows2
for new_row in rows2:
    zip_code, name, owner, *rest = new_row
    if (zip_code, name, owner) in restaurants:
        old_row = restaurants[zip_code, name, owner]
        new_row[-1] = old_row[-1]

with open(filename2, mode='wt') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(headers)
    writer.writerows(rows2)
Alternative solution:
import csv

filename1 = 'restaurants2017.csv'
filename2 = 'restaurants2018.csv'

def get_csv_rows(filename):
    with open(filename) as csv_file:
        return list(csv.reader(csv_file))

headers1, *rows1 = get_csv_rows(filename1)
headers2, *rows2 = get_csv_rows(filename2)

restaurants = {
    (zip_code, name, owner): review
    for zip_code, name, owner, rating, review in rows1
}

# Putting reviews into rows2
for row in rows2:
    zip_code, name, owner, *rest = row
    if (zip_code, name, owner) in restaurants:
        row[-1] = restaurants[zip_code, name, owner]

with open(filename2, mode='wt') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(headers2)
    writer.writerows(rows2)
Order The Rows
Reorder the rows in restaurants2018.csv by their ZIP code.
Answers
import csv

filename = 'restaurants2018.csv'

def get_zip(row):
    """First column is ZIP code."""
    return row[0]

with open(filename, mode='rt') as csv_file:
    reader = csv.reader(csv_file)
    headers = next(reader)
    rows = sorted(reader, key=get_zip)

with open(filename, mode='wt') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(headers)
    writer.writerows(rows)
Email CSV Analysis
These exercises involve analysis of an emails_sent.csv file, which you must download.
Background
The file emails_sent.csv contains information on every email that was sent through my automated emailing software since the beginning of 2018.
The CSV file contains three columns:
User ID
Date and time the email was sent
Subject line of the email
These emails are sent to individuals who interact with me in a number of different ways:
People who subscribed to my general mailing list
People who signed up for a free preview of my Python Morsels subscription service
People who signed up for the paid version of my Python Morsels subscription service
The following information can be gleaned from this data:
All users who subscribed to my newsletter receive an email with the subject line
Welcome to Trey's Weekly Python Chat! I have some things to share with you...
All users who sign up for a free preview of Python Morsels receive an email with the subject line
Prepare yourself for a preview of Python Morsels
All users who sign up for the paid version of Python Morsels receive an email with the subject line
Prepare yourself for Python Morsels
Initial Statistics
Create a Python program that reads the emails_sent.csv file and prints the following information from the CSV file:
How many total unique users are there?
How many users subscribed to my newsletter during the captured time frame?
How many users signed up for the free preview of Python Morsels?
How many users signed up for a paid Python Morsels subscription?
To figure out what to filter your data on to answer each of these questions, see the notes on subject lines in the background section above.
Please ask for help if you get stuck or find yourself struggling.
Note
You may need to specify encoding="utf-8" in these exercises when reading this CSV file on a Windows machine.
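For example, that just means adding the keyword argument to the open call used in the answer below (only needed if the file doesn't decode cleanly with your platform's default encoding):
with open("emails_sent.csv", mode="rt", newline="", encoding="utf-8") as csv_file:
    reader = csv.reader(csv_file)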
Answers
import csv

welcome_subject = "Welcome to Trey's Weekly Python Chat! I have some things to share with you..."
preview_subject = "Prepare yourself for a preview of Python Morsels"
paid_subject = "Prepare yourself for Python Morsels"

with open("emails_sent.csv", mode="rt", newline="") as csv_file:
    reader = csv.reader(csv_file)
    users = set()
    subscribed = set()
    previewed = set()
    paid = set()
    for uid, time, subject in reader:
        users.add(uid)
        if subject == welcome_subject:
            subscribed.add(uid)
        if subject == preview_subject:
            previewed.add(uid)
        if subject == paid_subject:
            paid.add(uid)

print("Total users:", len(users))
print("Subscribed to newsletter:", len(subscribed))
print("Signed up for Python Morsels preview:", len(previewed))
print("Paid for Python Morsels:", len(paid))
Free to paid conversions
What percentage of users who signed up for a free Python Morsels subscription eventually also had a paid subscription?
Answers
import csv

with open('emails_sent.csv', mode='rt', newline='') as csv_file:
    reader = csv.reader(csv_file)
    free_users = set()
    paid = 0
    for uid, time, subject in reader:
        if "Prepare yourself for a preview of Python Morsels" == subject:
            free_users.add(uid)
        if uid in free_users and "Prepare yourself for Python Morsels" == subject:
            paid += 1

percent = round(paid/len(free_users)*100, 1)
print(f"{paid}/{len(free_users)} stayed subscribed ({percent}%)")
Days to Convert
Of those users who signed up for Trey’s newsletter and then later signed up for a paid Python Morsels subscription, how many days did it take them to convert from “subscribed to newsletter” to “paying for Python Morsels”?
Print out for each applicable user, the number of days it took between signing up for Trey’s newsletter and paying for Python Morsels.
Also print out the average number of days it took to convert.
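Subtracting two datetime objects gives a timedelta, whose days attribute holds the whole-day count; a quick illustration with made-up timestamps in the same format as the CSV's date column:
from datetime import datetime

fmt = "%Y-%m-%d %H:%M:%S"
signup = datetime.strptime("2018-01-03 09:15:00", fmt)    # hypothetical timestamps
upgraded = datetime.strptime("2018-01-20 18:40:00", fmt)  # for illustration only
delta = upgraded - signup
print(delta)       # 17 days, 9:25:00
print(delta.days)  # 17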
Answers
import csv
from datetime import datetime, timedelta

def days_between(first, second):
    time1 = datetime.strptime(first, "%Y-%m-%d %H:%M:%S")
    time2 = datetime.strptime(second, "%Y-%m-%d %H:%M:%S")
    return time2 - time1

with open('emails_sent.csv', mode='rt') as csv_file:
    reader = csv.reader(csv_file)
    conversion_days = {}
    newsletter = {}
    for uid, time, subject in reader:
        if "Welcome to Trey's" in subject:
            newsletter[uid] = time
        if uid in newsletter and "Prepare yourself for Python Morsels" == subject:
            conversion_days[uid] = days_between(newsletter[uid], time)

total_days = timedelta(0)
for days in conversion_days.values():
    total_days += days
    print(days)
print(f"Average conversion took {total_days/len(conversion_days)}")
More CSV Exercises
DDOS CSV File
Write a program, ddos_reformat.py, that will convert the text file ddos_attack_record.txt to a comma-delimited file named ddos_attack_record.csv. Maintain the column order.
The first 3 lines of ddos_attack_record.txt look like this:
TCP 192.168.2.401:54599 86.22.213.44:30739 ESTABLISHED
TCP 192.168.2.401:54633 177.19.180.123:9433 PIN_WAIT_2
TCP 192.168.2.401:54651 179.79.57.231:29820 ESTABLISHED
The first 3 lines of ddos_attack_record.csv should look like this:
TCP,192.168.2.401:54599,86.22.213.44:30739,ESTABLISHED
TCP,192.168.2.401:54633,177.19.180.123:9433,PIN_WAIT_2
TCP,192.168.2.401:54651,179.79.57.231:29820,ESTABLISHED
Answer
import csv

with open('ddos_attack_record.txt') as log_file:
    with open('ddos_attack_record.csv', mode='wt', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for line in log_file:
            writer.writerow(line.split())
Split Logs
Write a program, oobelib_split.py, which reads the pipe-delimited file oobelib.csv and creates a separate CSV file for each of the debug, info, warn, and error entries in oobelib.csv. Make sure to include all relevant information from each line in its respective file.
Name the files oobelib-debug.csv, oobelib-error.csv, oobelib-info.csv, and oobelib-warning.csv.
For example, the first 3 lines of oobelib-error.csv would be:
04/14/16 12:45:43:937,[ERROR],,,,OPMWrapper,,,16423,Failed in getting value for key in OPMGetValueForKey
04/14/16 14:03:29:118,[ERROR],,,,SLCoreService,,,47967,Could not find license from which to get user data
04/14/16 14:03:29:118,[ERROR],,,,SLCoreService,,,47967,No value for key [PersonGUID] in user dictionary.
Answer
import csv

with open('oobelib.csv') as csv_file:
    log_file = csv.reader(csv_file, delimiter='|')
    debug = csv.writer(open('oobelib-debug.csv', mode='w'))
    info = csv.writer(open('oobelib-info.csv', mode='w'))
    warning = csv.writer(open('oobelib-warning.csv', mode='w'))
    error = csv.writer(open('oobelib-error.csv', mode='w'))
    for row in log_file:
        if len(row) < 2:
            continue
        if row[1] == '[DEBUG]':
            debug.writerow(row)
        elif row[1] == '[INFO]':
            info.writerow(row)
        elif row[1] == '[WARN]':
            warning.writerow(row)
        elif row[1] == '[ERROR]':
            error.writerow(row)
Pipe to Comma
Write a program, pipe_to_comma.py, that reads a pipe-delimited file and outputs a comma-delimited file.
The program should take a pipe-delimited filename as input and add a .csv extension to create the CSV output filename.
Test the program with the oobelib.csv file.
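For example, running it on oobelib.csv produces oobelib.csv.csv (the program simply appends .csv to whatever filename it's given):
$ python pipe_to_comma.py oobelib.csv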
Answer
"""Convert a pipe-delimited file to a comma-delimited file"""
import csv
import sys
def pipe_to_comma(filename):
"""Convert filename from pipe-delimited to comma-delimited"""
comma_file = filename + '.csv'
with open(filename) as in_file, open(comma_file, mode='w') as out_file:
pipe_reader = csv.reader(in_file, delimiter='|')
comma_writer = csv.writer(out_file)
for row in pipe_reader:
comma_writer.writerow(row)
if __name__ == "__main__":
pipe_to_comma(sys.argv[1])
Code Review Pairings
Write a program that matches up code reviewers such that each person in a list of people is reviewing 2 other people's code and no two people are reviewing each other's code.
Example input file:
Amber
Christopher
Katie
Russell
Charlotte
Elias
Chiang
Example output file:
Name,To Review,To Review,Reviewed By,Reviewed By
Chiang,Katie,Christopher,Elias,Russell
Katie,Christopher,Amber,Chiang,Russell
Christopher,Amber,Charlotte,Chiang,Katie
Amber,Charlotte,Elias,Katie,Christopher
Charlotte,Elias,Russell,Christopher,Amber
Elias,Russell,Chiang,Amber,Charlotte
Russell,Chiang,Katie,Charlotte,Elias
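One way to satisfy both constraints, and the approach taken in the answer below, is a round-robin: shuffle the names and have each person review the next two people in the (circular) list; with five or more people nobody ends up reviewing someone who reviews them back. A minimal sketch of that idea, using the example names above:
names = ["Amber", "Christopher", "Katie", "Russell", "Charlotte", "Elias", "Chiang"]

# Each person is assigned the next two names, wrapping around the end of the list.
to_review = {
    name: [names[(i + 1) % len(names)], names[(i + 2) % len(names)]]
    for i, name in enumerate(names)
}
print(to_review["Amber"])   # ['Christopher', 'Katie']
print(to_review["Chiang"])  # ['Amber', 'Christopher']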
Here’s a script to test the output file you generate:
import csv
import sys

def fail(message):
    print(message)
    exit(1)

def main(input_file):
    reviewers_for = {}
    reviewees_for = {}
    reader = csv.reader(input_file)
    next(reader)
    for name, reviewer1, reviewer2, reviewee1, reviewee2 in reader:
        reviewers = set((reviewer1, reviewer2))
        reviewees = set((reviewee1, reviewee2))
        if name in reviewers:
            fail(f"{name} is their own reviewer.")
        if name in reviewees:
            fail(f"{name} is their own reviewee.")
        if reviewers & reviewees:
            fail(f"{name} has overlapping reviewers and reviewees.")
        reviewers_for[name] = reviewers
        reviewees_for[name] = reviewees
    for name, reviewees in reviewees_for.items():
        for other in reviewees:
            if name not in reviewers_for[other]:
                fail(f"{other} is {name}'s reviewee, but {name} isn't {other}'s reviewer")
    print("Looks good.")

if __name__ == '__main__':
    with open(sys.argv[1]) as input_file:
        main(input_file)
Answers
import csv
from collections import defaultdict
import click
import random


def with_next(sequence, n):
    """Return iterable yielding item with next n items (circularly)."""
    sequences = (sequence[i:] + sequence[:i] for i in range(n+1))
    return zip(*sequences)


def match_reviewers(names):
    reviewers = {}
    reviewees = defaultdict(list)
    for name, reviewer1, reviewer2 in with_next(names, n=2):
        reviewers[name] = [reviewer1, reviewer2]
        reviewees[reviewer1].append(name)
        reviewees[reviewer2].append(name)
    return reviewers, reviewees


@click.command()
@click.argument('names_file', type=click.File('rt'))
@click.argument('output', type=click.File('wt'))
def main(names_file, output):
    names = names_file.read().splitlines()
    random.shuffle(names)
    reviewers, reviewees = match_reviewers(names)
    writer = csv.writer(output)
    writer.writerow(['Name'] + ['To Review']*2 + ['Reviewed By']*2)
    writer.writerows(
        [name] + reviewers[name] + reviewees[name]
        for name in names
    )


if __name__ == '__main__':
    main()
Pipe-delimited Log File Converter
Write a program, oobelib_convert.py, which reads the oobelib.log file and outputs a modified version of it which can be read by the csv module (maintain the | delimiter). Name the new file oobelib.csv.
The first 3 lines of the oobelib.log input file look like this:
04/14/16 12:45:43:694 | [INFO] | | | | OOBELib | | | 16423 | __OOBELIB_LOG_FILE__
04/14/16 12:45:43:694 | [INFO] | | | | OOBELib | | | 16423 | *************OOBELib Session Starts*************
04/14/16 12:45:43:694 | [INFO] | | | | OOBELib | | | 16423 | Version 9.0.0.2,7.0
We want the first 3 lines of the oobelib.csv output file to look like this:
04/14/16 12:45:43:694|[INFO]||||OOBELib|||16423|__OOBELIB_LOG_FILE__
04/14/16 12:45:43:694|[INFO]||||OOBELib|||16423|*************OOBELib Session Starts*************
04/14/16 12:45:43:694|[INFO]||||OOBELib|||16423|Version 9.0.0.2,7.0
Answer
import csv

with open('oobelib.log') as log_file:
    with open('oobelib.csv', mode='w') as csv_file:
        writer = csv.writer(csv_file, delimiter='|')
        for line in log_file:
            row = line.rstrip().split(' | ')
            writer.writerow(row)
Track RPMs
Given this file of drive data for hard drives that should each spin at 7200 RPM, write a program, rotation_rate.py, that does the following:
Reads a csv file containing timed records of rotations (in degrees) and temperatures of different make/models of hard drives.
Returns a dictionary containing the make and model as keys, and a list of RPMs per reading as values. The rpm values should be for each individual reading, not the cumulative rpm over time.
Time between readings can be assumed to be .001 second.
Note
You may use the following formula for rpm: (current_degrees - last_degrees) / 360 / .001 * 60
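For example, plugging a hypothetical reading of 43.2 degrees of rotation in a single .001-second interval (starting from 0 degrees) into that formula gives exactly the nominal rate:
rpm = (43.2 - 0) / 360 / .001 * 60  # hypothetical reading, for illustration only
print(rpm)  # 7200.0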
Example usage:
$ python rotation_rate.py
{('Z', '1'): [7116.66, 7266.66, 7233.33], ('Z', '2'): [7183.33, 7300.00, 7250.00], ('Z', '3'): [7150.00, 7366.66, 7083.33]}
The output should look like this file.
Answers
import csv
import sys

in_filename, out_filename = sys.argv[1:]

DEGREES = 360
SECONDS = 60
TIME = .001

def drive_rows_to_degrees_dict(csv_rows):
    drive_degrees = {}
    for make, model, _, reading_type, value in csv_rows:
        if reading_type != 'degrees of rotation':
            continue
        if (make, model) not in drive_degrees:
            drive_degrees[make, model] = []
        drive_degrees[make, model].append(float(value))
    return drive_degrees

def degrees_to_rpms(degrees_list):
    rpms = []
    last_degrees = 0
    for degrees in degrees_list:
        degree_delta = degrees - last_degrees
        rpms.append(degree_delta * SECONDS / (DEGREES * TIME))
        last_degrees = degrees
    return rpms

def dict_to_rows(rpm_dict):
    return sorted(
        [make, model] + rpms
        for (make, model), rpms in rpm_dict.items()
    )

with open(in_filename) as in_file:
    reader = csv.reader(in_file)
    next(reader)
    drive_degrees = drive_rows_to_degrees_dict(reader)

drive_rpms = {
    drive: degrees_to_rpms(degrees)
    for drive, degrees in drive_degrees.items()
}

with open(out_filename, mode='w') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(['Make', 'Model', 'RPM'])
    writer.writerows(dict_to_rows(drive_rpms))
Parsing Ping Data
Create a program parse_pings.py which should do the following:
Read each line of this ping output file and extract the values for bytes, IP, icmp_seq, ttl, and time.
Create the CSV file and write one line per row.
The CSV file should be in this format:
bytes,IP,ICMP Sequence,TTL,time (ms)
64,182.162.94.6,7,45,175
Run your program like this:
$ python parse_pings.py pings.txt pings.csv
For reference, the CSV file you produce should look like this file.
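Going by the sample CSV row above, each matching line in pings.txt presumably looks something like this (reconstructed here for illustration):
64 bytes from 182.162.94.6: icmp_seq=7 ttl=45 time=175 ms
Splitting such a line on whitespace yields the eight fields the first answer below unpacks; the regular-expression answer matches the same structure.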
Answer
With string splitting/stripping:
import csv
import sys

in_file, out_file = sys.argv[1:]

with open(in_file) as ping_file:
    with open(out_file, mode='w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(('bytes', 'IP', 'ICMP Sequence', 'TTL', 'time (ms)'))
        for line in ping_file:
            if 'ttl' in line:
                bytes, _, _, ip, seq, ttl, time, _ = line.split()
                ip = ip.rstrip(':')
                seq = seq.split('=')[1]
                ttl = ttl.split('=')[1]
                time = time.split('=')[1]
                writer.writerow((bytes, ip, seq, ttl, time))
With regular expressions:
import csv
import re
import sys

in_file, out_file = sys.argv[1:]

IP_LINE_RE = re.compile(r'''
    ^
    ( \d+ ) [ ] bytes [ ] from [ ]
    ( \S+ ): [ ]
    icmp_seq = ( \d+ ) [ ]
    ttl = ( \d+ ) [ ]
    time = ( \S+ ) [ ] ms
    $
''', re.VERBOSE)

with open(in_file) as ping_file:
    with open(out_file, mode='w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(('bytes', 'IP', 'ICMP Sequence', 'TTL', 'time (ms)'))
        for line in ping_file:
            match = IP_LINE_RE.search(line)
            if match:
                bytes, ip, seq, ttl, time = match.groups()
                writer.writerow((bytes, ip, seq, ttl, time))
Hard Drive Exercises
These exercises require this ZIP file of hard drive data collected from the Backblaze storage racks in 2016.
Annualized Failure Rate
Calculate the annualized failure rate of all drives in 2016.
You can calculate the annualized drive failure rate by:
FAILURES / DRIVE_DAYS * 365
Where:
FAILURES is the total number of drive failures experienced (a drive failure occurs when the failure column is 1)
DRIVE_DAYS is the total number of drives in the system on each day (number of data rows in all CSV files)
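As a quick sanity check of that formula with made-up numbers (not figures from the actual data): 1,200 failures over 20,000,000 drive days works out to roughly a 2.19% annualized rate:
failures = 1_200          # hypothetical totals, for illustration only
drive_days = 20_000_000
print(f"{100 * failures / drive_days * 365:.2f}%")  # 2.19%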
Answers
from glob import glob
import csv

drive_days = 0
failures = 0

for in_filename in glob('*-*-*.csv'):
    with open(in_filename) as in_file:
        reader = csv.reader(in_file)
        next(reader)
        for serial, model, capacity, failure in reader:
            drive_days += 1
            if failure == '1':
                failures += 1

print(f'Total {100.0*365*failures/drive_days:.2f}')
Top 10 Models
What were the 10 most common hard drive models in Backblaze's system in 2016?
Answers
from glob import glob
from collections import Counter
import csv

counts = Counter()

for in_filename in glob('*-*-*.csv'):
    with open(in_filename) as in_file:
        reader = csv.reader(in_file)
        next(reader)  # skip the header row
        counts += Counter(
            model
            for serial, model, capacity, failure in reader
        )

for m, c in counts.most_common(10):
    print(c, m)
Failure Rate Per Drive
Calculate the annualized failure rate for each drive model.
Only include models with at least 4TB of storage and 10,000 drive days in 2016.
Note that Hitachi changed its name to HGST.
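The answer below sidesteps the naming change by keying everything on the last whitespace-separated word of the model string, which drops any manufacturer prefix; for example (the model strings here are illustrative):
print("Hitachi HDS722020ALA330".rsplit(" ", 1)[-1])  # HDS722020ALA330
print("ST4000DM000".rsplit(" ", 1)[-1])              # ST4000DM000 (no prefix, unchanged)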
Answers
from collections import Counter
from glob import glob
import csv

CAPACITIES = {}
DAYS = Counter()
FAILURES = Counter()

for in_filename in glob('*-*-*.csv'):
    with open(in_filename) as in_file:
        reader = csv.reader(in_file)
        next(reader)
        for serial, model, capacity, failure in reader:
            model = model.rsplit(" ", 1)[-1]
            DAYS[model] += 1
            if failure == '1':
                FAILURES[model] += 1
            if model not in CAPACITIES:
                CAPACITIES[model] = int(capacity)/1000**3

RATES = Counter({
    model: 100.0*365*FAILURES[model]/days
    for model, days in DAYS.items()
    if CAPACITIES[model] >= 4000 and days > 10000
})

for model, failure_rate in RATES.most_common():
    print(
        f'{model}: {failure_rate:.2f}% ' +
        f'({DAYS[model]} drive days, {CAPACITIES[model]}GB)'
    )
Read & Truncate Original CSV Files
Download the four 2016 Q1 through Q4 data ZIP files from the Backblaze Hard Drive Test Data page.
Create a Python script that:
Opens these ZIP files
Reads the CSV files within them
Writes the CSV files back to disk with the same name, writing only the second through fifth columns (serial_number, model, capacity_bytes, failure)
Each CSV file should end up being about 3.1MB in size, for a total of about 1.1GB of CSV files.
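One wrinkle, reflected in the answer below: ZipFile.open returns a binary file object, while csv.reader needs text, so each member file has to be wrapped, for example with io.TextIOWrapper (the archive and member names here are illustrative):
import csv
import io
from zipfile import ZipFile

with ZipFile('data_Q1_2016.zip') as zip_file:
    with zip_file.open('2016-01-01.csv') as binary_file:  # member name will vary
        rows = list(csv.reader(io.TextIOWrapper(binary_file, encoding='utf-8')))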
Answers
import csv
import glob
import io
import os
import re
from zipfile import ZipFile

CSV_FILE_RE = re.compile(r'\b\d{4}-\d{2}-\d{2}\.csv$')

def write_out_file(rows, out_filename):
    with open(out_filename, mode='wt', newline='') as out_file:
        writer = csv.writer(out_file)
        for row in rows:
            writer.writerow(row[1:5])
    print(f"Wrote {out_filename}")

def process_zip_file(zip_filename):
    with ZipFile(zip_filename) as zip_file:
        csv_filenames = [
            filename
            for filename in zip_file.namelist()
            if CSV_FILE_RE.search(filename)
        ]
        for filename in csv_filenames:
            basename = os.path.basename(filename)
            with zip_file.open(filename) as in_file:
                # Zip members open in binary mode, so wrap them for csv.reader
                text_file = io.TextIOWrapper(in_file, encoding='utf-8')
                write_out_file(csv.reader(text_file), basename)

for zip_filename in glob.glob('data_Q*_2016.zip'):
    process_zip_file(zip_filename)
Web Download Exercises
All Data Between Days
These exercises use the data from https://www.githubarchive.org/
Create a script that downloads all data from the Github archive between two dates.
Example:
$ python fetch_github_data.py 2017-02-28 2017-03-02
Fetching 2017-02-28-0.json.gz
Fetching 2017-02-28-1.json.gz
Fetching 2017-02-28-2.json.gz
Fetching 2017-02-28-3.json.gz
Fetching 2017-02-28-4.json.gz
Fetching 2017-02-28-5.json.gz
Fetching 2017-02-28-6.json.gz
Fetching 2017-02-28-7.json.gz
Fetching 2017-02-28-8.json.gz
Fetching 2017-02-28-9.json.gz
Fetching 2017-02-28-10.json.gz
Fetching 2017-02-28-11.json.gz
Fetching 2017-02-28-12.json.gz
Fetching 2017-02-28-13.json.gz
Fetching 2017-02-28-14.json.gz
Fetching 2017-02-28-15.json.gz
Fetching 2017-02-28-16.json.gz
Fetching 2017-02-28-17.json.gz
Fetching 2017-02-28-18.json.gz
Fetching 2017-02-28-19.json.gz
Fetching 2017-02-28-20.json.gz
Fetching 2017-02-28-21.json.gz
Fetching 2017-02-28-22.json.gz
Fetching 2017-02-28-23.json.gz
Fetching 2017-03-01-0.json.gz
Fetching 2017-03-01-1.json.gz
Fetching 2017-03-01-2.json.gz
Fetching 2017-03-01-3.json.gz
Fetching 2017-03-01-4.json.gz
Fetching 2017-03-01-5.json.gz
Fetching 2017-03-01-6.json.gz
Fetching 2017-03-01-7.json.gz
Fetching 2017-03-01-8.json.gz
Fetching 2017-03-01-9.json.gz
Fetching 2017-03-01-10.json.gz
Fetching 2017-03-01-11.json.gz
Fetching 2017-03-01-12.json.gz
Fetching 2017-03-01-13.json.gz
Fetching 2017-03-01-14.json.gz
Fetching 2017-03-01-15.json.gz
Fetching 2017-03-01-16.json.gz
Fetching 2017-03-01-17.json.gz
Fetching 2017-03-01-18.json.gz
Fetching 2017-03-01-19.json.gz
Fetching 2017-03-01-20.json.gz
Fetching 2017-03-01-21.json.gz
Fetching 2017-03-01-22.json.gz
Fetching 2017-03-01-23.json.gz
Fetching 2017-03-02-0.json.gz
Fetching 2017-03-02-1.json.gz
Fetching 2017-03-02-2.json.gz
Fetching 2017-03-02-3.json.gz
Fetching 2017-03-02-4.json.gz
Fetching 2017-03-02-5.json.gz
Fetching 2017-03-02-6.json.gz
Fetching 2017-03-02-7.json.gz
Fetching 2017-03-02-8.json.gz
Fetching 2017-03-02-9.json.gz
Fetching 2017-03-02-10.json.gz
Fetching 2017-03-02-11.json.gz
Fetching 2017-03-02-12.json.gz
Fetching 2017-03-02-13.json.gz
Fetching 2017-03-02-14.json.gz
Fetching 2017-03-02-15.json.gz
Fetching 2017-03-02-16.json.gz
Fetching 2017-03-02-17.json.gz
Fetching 2017-03-02-18.json.gz
Fetching 2017-03-02-19.json.gz
Fetching 2017-03-02-20.json.gz
Fetching 2017-03-02-21.json.gz
Fetching 2017-03-02-22.json.gz
Fetching 2017-03-02-23.json.gz
Answers
from datetime import datetime, timedelta
from shutil import copyfileobj
from urllib.request import urlopen
import sys

date1, date2 = sys.argv[1:]
start_date = datetime.strptime(date1, '%Y-%m-%d')
end_date = datetime.strptime(date2, '%Y-%m-%d')

def daterange(start, end):
    days = (end - start).days + 1
    return [
        start + timedelta(days=day_delta)
        for day_delta in range(days)
    ]

for date in daterange(start_date, end_date):
    date_string = date.strftime('%Y-%m-%d')
    for hour in range(0, 24):
        filename = f'{date_string}-{hour}.json.gz'
        print(f"Fetching {filename}")
        url = 'http://data.githubarchive.org/' + filename
        with open(filename, mode='wb') as out_file:
            response = urlopen(url)
            copyfileobj(response, out_file)