import pandas as pd
df = pd.read_csv('sitka_weather_07-2021_simple.csv')
df.index
RangeIndex(start=0, stop=31, step=1)
df.columns
Index(['STATION', 'NAME', 'DATE', 'TAVG', 'TMAX', 'TMIN'], dtype='object')
df.head()
STATION NAME DATE TAVG TMAX TMIN
0 USW00025333 SITKA AIRPORT, AK US 2021-07-01 NaN 61 53
1 USW00025333 SITKA AIRPORT, AK US 2021-07-02 NaN 60 52
2 USW00025333 SITKA AIRPORT, AK US 2021-07-03 NaN 66 54
3 USW00025333 SITKA AIRPORT, AK US 2021-07-04 NaN 60 55
4 USW00025333 SITKA AIRPORT, AK US 2021-07-05 NaN 65 55
Extracting and Reading Data
df["TMAX"]
0 61
1 60
...
29 73
30 66
Name: TMAX, dtype: int64
df.loc[0]
STATION USW00025333
NAME SITKA AIRPORT, AK US
DATE 2021-07-01
TAVG NaN
TMAX 61
TMIN 53
Name: 0, dtype: object
df.loc[:,'TAVG':'TMAX'] #查看连续多列
df.loc[:,['TAVG','TMAX']] #查看特定多列
df.loc[2:4,:] # 查看连续多行 不是左闭右开
df.loc[[2,4],:] # 查看特定多行
Modify Data
# calculation
df.loc[:, 'difference'] = df["TMAX"] - df["TMIN"]
# example
df.loc[:, 'x'] = 1
df.loc[:, 'y'] = range(31)
Delete Data
# 删除返回值
df_x = df.drop('difference', axis = 1) # 删除单列
df_y = df.drop(1, axis = 0) # 删除单行
Filter Data
df_60 = df[df["TMAX"] >= 60]
from matplotlib import pyplot as plt
# Plot the high temperatures.
fig, ax = plt.subplots()
ax.plot(df["TMAX"], c='red')
# Format plot.
ax.set_title("Daily high temperatures, July 2021", fontsize=24)
ax.set_xlabel('', fontsize=16)
ax.set_ylabel("Temperature (F)", fontsize=16)
ax.tick_params(labelsize=16)
plt.savefig('simple.jpg',bbox_inches='tight',dpi=300)
datetime
# convert string to time format
df.loc[:,'date'] = pd.to_datetime(df['DATE'])
first_date = pd.to_datetime('2021-7-1',format = '%Y-%m-%d')
%A Weekday name, such as Monday; %B Month name, such as January;
%m Month, as a number (01 to 12)
%d Day of the month, as a number (01 to 31)
%Y Four-digit year, such as 2015; %y Two-digit year, such as 15
%M Minutes (00 to 59); %S Seconds (00 to 61)
# filtering data by date
df_t = df[(df['date']>="20210715") & (df['date']<="20210720")]
df_t
14 USW00025333 SITKA AIRPORT, AK US 2021-07-15 NaN 60 50 2021-07-15
...
19 USW00025333 SITKA AIRPORT, AK US 2021-07-20 NaN 64 56 2021-07-20
# calculate periods
periods = df.loc[20, "date"] - df.loc[1, "date"]
periods.days
19
df.loc[:,'date'] = pd.to_datetime(df['DATE'])
fig, ax = plt.subplots()
ax.plot(df["date"], df["TMAX"], c='red')
ax.set_title("Daily high temperatures, July 2021", fontsize=18)
ax.set_xlabel('', fontsize=16)
fig.autofmt_xdate()
ax.set_ylabel("Temperature (F)", fontsize=16)
ax.tick_params(labelsize=10)
plt.savefig('simple.jpg',bbox_inches='tight',dpi=300)
import pandas as pd
df = pd.read_csv('sitka_weather_2021_simple.csv')
df.loc[:,'date'] = pd.to_datetime(df['DATE'])
# Plot data.
fig, ax = plt.subplots()
ax.plot(df["date"], df["TMAX"], c='red', alpha=0.5)
ax.plot(df["date"], df["TMIN"], c='blue', alpha=0.5)
ax.fill_between(df["date"], df["TMAX"], df["TMIN"], facecolor='blue', alpha=0.1)
# Format plot.
ax.set_title("Daily high and low temperatures - 2021", fontsize=18)
ax.set_xlabel('', fontsize=16)
fig.autofmt_xdate()
ax.set_ylabel("Temperature (F)", fontsize=16)
ax.tick_params(labelsize=10)
plt.savefig('simple.jpg',bbox_inches='tight',dpi=300)
import pandas as pd
from matplotlib import pyplot as plt
frame = pd.read_csv('death_valley_2021_simple.csv')
frame.info()
RangeIndex: 365 entries, 0 to 364
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 STATION 365 non-null object
1 NAME 365 non-null object
2 DATE 365 non-null object
3 TMAX 364 non-null float64
4 TMIN 365 non-null int64
5 TOBS 360 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 17.2+ KB
frame[frame["TMAX"].isna()]
123 USC00042319 DEATH VALLEY NATIONAL PARK, CA US 2021-05-04 NaN 72 89.0
frame_DT = frame[["DATE", "TMAX", "TMIN"]]
frame_cleaned = frame_DT.dropna()
frame_cleaned.info()
Index: 364 entries, 0 to 364
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 DATE 364 non-null object
1 TMAX 364 non-null float64
2 TMIN 364 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 11.4+ KB
.JSON
from pathlib import Path
import pandas as pd
import json
path = Path('btc_close_2017.json')
contents = path.read_text()
btc_data = json.loads(contents)
date = [];close=[]; months=[]
for btc_dict in btc_data:
months.append(int(btc_dict['month']))
close.append(float(btc_dict['close']))
date.append(btc_dict['date'])
df = pd.DataFrame({"close": close, "months": months, "date": date})
df.loc[:,'date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(df['date'],df['close'], linewidth=0.5, c='red')
ax.scatter(df['date'],df['close'], s=5, c='red')
ax.set_title('Close',fontsize=10)
fig.autofmt_xdate()
plt.savefig('close.jpg',dpi=300)
monthly_avg = df.groupby('months')['close'].mean()
# mean(), sum(), max(), min(), std()
def f(arr):
return arr.max()-arr.min()
monthly_avg = df.groupby('months')['close'].apply(f)
monthly_avg = df.groupby('months')['close'].mean()
monthly_avg
months
1 6285.870968
2 7315.714286
...
11 51436.166667
12 91936.416667
Name: close, dtype: float64
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(monthly_avg.index, monthly_avg.values, linewidth=1)
ax.scatter(monthly_avg.index, monthly_avg.values, s=20)
ax.set_title('Close',fontsize=10)
plt.savefig('monthly_avg.jpg',dpi=300)
from pathlib import Path
import pandas as pd
import json
path = Path('btc_close_2017.json')
contents = path.read_text()
btc_data = json.loads(contents)
date = [];close=[]; months=[]
for btc_dict in btc_data:
months.append(int(btc_dict['month']))
close.append(float(btc_dict['close']))
date.append(btc_dict['date'])
df = pd.DataFrame({"close": close, "months": months, "date": date})
df.loc[:,'date'] = pd.to_datetime(df['date'], format = '%Y-%m-%d')
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(df['date'],df['close'], linewidth=0.5, c='red')
fig.autofmt_xdate()
plt.savefig('close.jpg',dpi=300)
Data Combine
df_C
date pop
0 2024 1408975000
1 2023 1410710000
... ... ...
63 1961 660330000
64 1960 667070000
df_I
date pop
0 2024 1408975000
1 2023 1410710000
... ... ...
63 1961 446564729
64 1960 435990338
df_wide=pd.merge(df_C,df_I,on='date',\
suffixes=["_C","_I"])
df_wide
date pop_C pop_I
0 2024 1408975000 1450935791
... ... ... ...
63 1961 660330000 446564729
64 1960 667070000 435990338
df_long = pd.concat([df_C, df_I], \
ignore_index=True)
df_long
date pop
0 2024 1408975000
... ... ...
128 1961 446564729
129 1960 435990338