Python Programming

Lecture 12 Downloading Data

12.1 Downloading Data (1)

.CSV


from pathlib import Path
import csv

# Read the whole file as text and split it into lines; csv.reader accepts
# any iterable of lines, so no open file handle is needed.
path = Path('sitka_weather_07-2021_simple.csv')
lines = path.read_text().splitlines()
reader = csv.reader(lines)

# The first row returned by the reader holds the column names.
header_row = next(reader)
print(header_row)

# Show each column name next to its position, so later code can pick
# columns out of a row by index.
for position, name in enumerate(header_row):
    print(f"{position} {name}")
        

['STATION', 'NAME', 'DATE', 'TAVG', 'TMAX', 'TMIN']
0 STATION
1 NAME
2 DATE
3 TAVG
4 TMAX
5 TMIN

Extracting and Reading Data


from pathlib import Path
import csv

path = Path('sitka_weather_07-2021_simple.csv')
lines = path.read_text().splitlines()

reader = csv.reader(lines)
# Consume the header row so that only data rows remain in the reader.
header_row = next(reader)

# Column 4 is TMAX (see the header listing); convert each value to int.
highs = [int(row[4]) for row in reader]

print(highs)

[61, 60, 66, 60, 65, 59, 58, 58, 57, 60, 60, 60, 57, 58, 60, 61, 63, 63, 
70, 64, 59, 63, 61, 58, 59, 64, 62, 70, 70, 73, 66]


from matplotlib import pyplot as plt

# Plot the high temperatures.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old 'seaborn' name, so use whichever this install provides.
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
else:
    plt.style.use('seaborn')
fig, ax = plt.subplots()
ax.plot(highs, c='red')

# Format plot.
ax.set_title("Daily high temperatures, July 2021", fontsize=24)
ax.set_xlabel('', fontsize=16)
ax.set_ylabel("Temperature (F)", fontsize=16)
ax.tick_params(labelsize=16)

# Save the figure to disk; uncomment plt.show() to view it interactively.
#plt.show()
plt.savefig('simple.jpg',dpi=300)

The datetime Module



from datetime import datetime

# strptime() parses text into a datetime using the given format codes;
# strftime() goes the other way and formats a datetime as text.
date_format = '%Y-%m-%d'
first_date = datetime.strptime('2021-7-1', date_format)
readable = first_date.strftime('%B %d %Y')

# One value per line: the object's type, the formatted text, the default repr.
print(type(first_date), readable, first_date, sep='\n')

<class 'datetime.datetime'>
July 01 2021
2021-07-01 00:00:00


%A Weekday name, such as Monday
%B Month name, such as January
%m Month, as a number (01 to 12)
%d Day of the month, as a number (01 to 31) 
%Y Four-digit year, such as 2015
%y Two-digit year, such as 15
%H Hour, in 24-hour format (00 to 23)
%I Hour, in 12-hour format (01 to 12)
%p am or pm
%M Minutes (00 to 59)
%S Seconds (00 to 61)

from pathlib import Path
import csv
from datetime import datetime
from matplotlib import pyplot as plt

path = Path('sitka_weather_07-2021_simple.csv')
reader = csv.reader(path.read_text().splitlines())
header_row = next(reader)  # skip the header so only data rows remain

# Build two parallel lists: one datetime and one integer high per data row.
dates, highs = [], []
for row in reader:
    # Column 2 holds the date as text, column 4 the daily high.
    dates.append(datetime.strptime(row[2], "%Y-%m-%d"))
    highs.append(int(row[4]))

# Plot the highs against their dates.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old 'seaborn' name, so use whichever this install provides.
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
else:
    plt.style.use('seaborn')
fig, ax = plt.subplots()
ax.plot(dates, highs, c='red')

# Format plot.
ax.set_title("Daily high temperatures, July 2021", fontsize=24)
ax.set_xlabel('', fontsize=16)
# Rotate the date labels on the x-axis so they do not overlap.
fig.autofmt_xdate()
ax.set_ylabel("Temperature (F)", fontsize=16)
ax.tick_params(labelsize=16)

plt.savefig('simple.jpg',dpi=300)


from pathlib import Path
import csv
from datetime import datetime
from matplotlib import pyplot as plt

path = Path('sitka_weather_2021_simple.csv')
reader = csv.reader(path.read_text().splitlines())
header_row = next(reader)  # skip the header so only data rows remain

# Three parallel lists: the date, the high and the low for every data row.
dates, highs, lows = [], [], []
for row in reader:
    # Column 2 is the date text, column 4 the high, column 5 the low.
    dates.append(datetime.strptime(row[2], "%Y-%m-%d"))
    highs.append(int(row[4]))
    lows.append(int(row[5]))
        

# Plot data.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old 'seaborn' name, so use whichever this install provides.
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
else:
    plt.style.use('seaborn')
fig, ax = plt.subplots()
# alpha makes both lines semi-transparent so the shaded band stays visible.
ax.plot(dates, highs, c='red', alpha=0.5)
ax.plot(dates, lows, c='blue', alpha=0.5)
# Shade the region between the daily high and the daily low.
ax.fill_between(dates, highs, lows, facecolor='blue', alpha=0.1)

# Format plot.
ax.set_title("Daily high and low temperatures - 2021", fontsize=24)
ax.set_xlabel('', fontsize=16)
# Rotate the date labels on the x-axis so they do not overlap.
fig.autofmt_xdate()
ax.set_ylabel("Temperature (F)", fontsize=16)
ax.tick_params(labelsize=16)

plt.savefig('simple.jpg',dpi=300)


from pathlib import Path
import csv
from datetime import datetime
from matplotlib import pyplot as plt

path = Path('death_valley_2021_simple.csv')
lines = path.read_text().splitlines()
reader = csv.reader(lines)
header_row = next(reader)
dates, highs, lows = [], [], []

# This script reads the high from column 3 and the low from column 4
# (the Sitka scripts use columns 4 and 5).
for row in reader:
    # Parse the date on its own: if the date text itself is malformed there
    # is no valid current_date to report, so show the raw text and use
    # `continue` to skip straight to the next row.
    try:
        current_date = datetime.strptime(row[2], "%Y-%m-%d")
    except ValueError:
        print(row[2], 'invalid date')
        continue

    # int('') raises ValueError when a temperature reading is missing.
    try:
        high = int(row[3])
        low = int(row[4])
    except ValueError:
        print(current_date, 'missing data')
    else:
        # Only rows with a complete set of values reach the lists, which
        # keeps the three lists the same length.
        dates.append(current_date)
        highs.append(high)
        lows.append(low)
  
2021-05-04 00:00:00 missing data

12.2 Downloading Data (2)

.JSON


from pathlib import Path
from datetime import datetime
import json

# Load the JSON file; the loop below treats the result as a sequence of
# dictionaries, one per record.
path = Path('btc_close_2017.json')
btc_data = json.loads(path.read_text())

# Parallel lists, one entry per record.
date, close, months = [], [], []

for record in btc_data:
    # 'date' is text in YYYY-MM-DD form; turn it into a datetime.
    date.append(datetime.strptime(record['date'], "%Y-%m-%d"))
    months.append(int(record['month']))
    # Go through float() first so that text like '1234.56' can be converted,
    # then int() drops the fractional part.
    close.append(int(float(record['close'])))


import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old 'seaborn' name, so use whichever this install provides.
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
else:
    plt.style.use('seaborn')
fig, ax = plt.subplots()

# Draw the closing prices as a thin line with a small marker on every point.
ax.plot(date,close, linewidth=0.5, c='red')
ax.scatter(date,close, s=5, c='red')
ax.set_title('Close',fontsize=10)
# Rotate the date labels on the x-axis so they do not overlap.
fig.autofmt_xdate()

plt.savefig('close.jpg',dpi=300)

Iterator (迭代器)

Many types of objects in Python qualify as being iterable (可迭代的), such as lists, tuples, and dictionaries. In Python, the mechanism for iteration is based upon the following conventions:

  • An iterator is an object that manages an iteration through a series of values. If variable, i, identifies an iterator object, then each call to the built-in function, next(i), produces a subsequent element from the underlying series, with StopIteration exception raised to indicate that there are no further elements.
  • Usage: 1. abstract sequence; 2. save memory

import itertools
# count(start, step) is an endless iterator: 0, 2, 4, 6, ...
nums = itertools.count(0, 2)

# Each call to next() produces the following element; take the first three.
for _ in range(3):
    print(next(nums))


0
2
4



import itertools

# count() never stops by itself, so the loop needs its own exit condition.
nums = itertools.count(0, 2)
i = next(nums)
while i <= 6:
    print(i)
    # Fetch the next even number; the loop ends once it exceeds 6.
    i = next(nums)

0
2
4
6



import itertools

# cycle() repeats 'A', 'B', 'C' endlessly, so count the items with
# enumerate() and stop when the 7th one arrives (six get printed).
cycle_strings = itertools.cycle('ABC')
for i, string in enumerate(cycle_strings, start=1):
    if i == 7:
        break
    print(string)

A
B
C
A
B
C



import itertools
# repeat(obj, times) is a finite iterator: it yields 'hello' exactly 3 times,
# so the for loop ends on its own.
three_hellos = itertools.repeat('hello', 3)
for item in three_hellos:
    print(item)

hello
hello
hello

import itertools
# repeat('hello', 3) yields exactly three items, so it can be exhausted.
nums = itertools.repeat('hello', 3)

# The first three calls each return 'hello'.
print(next(nums))
print(next(nums))
print(next(nums))
# Fourth call: nothing is left, so next() raises StopIteration. It is left
# uncaught here on purpose to show the error.
print(next(nums))

hello
hello
hello
Traceback (most recent call last):
    print(next(nums))
StopIteration

groupby()


from itertools import groupby

# groupby() starts a new group whenever the value changes, so the two
# separate runs of 'a' are reported as two separate groups.
for key, value_iter in groupby('aaabbbaaccd'):
    members = list(value_iter)  # value_iter is itself an iterator
    print(f"{key} : {members}")

a : ['a', 'a', 'a']
b : ['b', 'b', 'b']
a : ['a', 'a']
c : ['c', 'c']
d : ['d']

from itertools import groupby

# Group by string length. Equal lengths are NOT adjacent in this list,
# so every element ends up in a group of its own.
data = ['a', 'bb', 'ccc', 'dd', 'eee', 'f']
for key, value_iter in groupby(data, key=len):
    print(key, ':', [*value_iter])

1 : ['a']
2 : ['bb']
3 : ['ccc']
2 : ['dd']
3 : ['eee']
1 : ['f']

from itertools import groupby
# Same grouping by length, but now equal-length strings sit next to each
# other, so they are collected into shared groups.
data = ['a', 'bb', 'cc', 'ddd', 'eee', 'f']
for key, value_iter in groupby(data, len):
    same_length = list(value_iter)
    print(f"{key} : {same_length}")

1 : ['a']
2 : ['bb', 'cc']
3 : ['ddd', 'eee']
1 : ['f']

zip()


>>> a = [1,2,3]
>>> b = [4,5,6]
>>> c = [4,5,6,7,8]
>>> zipped = zip(a,b)
>>> zipped
<zip object at 0x...>  # iterator
>>> list(zipped)
[(1, 4), (2, 5), (3, 6)] 
>>> list(zip(a,c))
[(1, 4), (2, 5), (3, 6)]

>>> a = [1,2,3]
>>> b = [4,5,6]
>>> c = [4,5,6,7,8]
>>> zipped = zip(a,b)
>>> list(zip(*zipped)) 
[(1, 2, 3), (4, 5, 6)]
>>> zipped = zip(a,b)
>>> x,y = zip(*zipped)
>>> print(x)
(1, 2, 3)

from itertools import groupby

# Compute the mean closing price for each month.
# zip(months, close) pairs every close with its month, and groupby() then
# collects consecutive pairs that share the same month (w[0]).
# NOTE: groupby() only merges *adjacent* equal keys, so this relies on
# `months` already being in order.
xy_map = []
for x, y in groupby(zip(months, close), lambda w: w[0]):
    # y yields (month, close) pairs; keep only the closing prices.
    y_list = [second for first, second in y]
    xy_map.append([x, sum(y_list) / len(y_list)])

# Transpose [[month, mean], ...] into (months...) and (means...). This is
# done once, after the loop, rather than on every iteration.
x_unique, y_mean = zip(*xy_map)
    

import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old 'seaborn' name, so use whichever this install provides.
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
else:
    plt.style.use('seaborn')
fig, ax = plt.subplots()

# Draw the monthly means as a line with a marker on each month.
ax.plot(x_unique, y_mean, linewidth=1)
ax.scatter(x_unique, y_mean, s=20)
ax.set_title('Close',fontsize=10)

plt.savefig('close.jpg',dpi=300)

Summary

  • Downloading Data
    • Reading: Python Crash Course, Chapter 16