During the course of doing data analysis and modeling, a significant amount of time is spent on data preparation: loading, cleaning, transforming, and rearranging. Such tasks are often reported to take up 80% or more of an analyst's time.
In pandas, we've adopted a convention used in the R programming language by referring to missing data as NA, which stands for not available. In statistics applications, NA data may either be data that does not exist or that exists but was not observed.
When cleaning up data for analysis, it is often important to do analysis on the missing data itself to identify data collection problems or potential biases in the data caused by missing data.
import pandas as pd
from numpy import nan as NA
data = {'state': ['Ohio', 'Ohio', NA, 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2001, 2002, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, NA]}
frame = pd.DataFrame(data)
>>> frame
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 NaN 2001 3.6
3 Nevada 2002 2.4
4 Nevada 2002 2.9
5 Nevada 2003 NaN
>>> frame.info()
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
state 5 non-null object
year 6 non-null int64
pop 5 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 272.0+ bytes
>>> frame.isnull()
state year pop
0 False False False
1 False False False
2 True False False
3 False False False
4 False False False
5 False False True
>>> frame.dropna()
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
3 Nevada 2002 2.4
4 Nevada 2002 2.9
>>> frame['debt']=NA
>>> frame
state year pop debt
0 Ohio 2000 1.5 NaN
1 Ohio 2001 1.7 NaN
2 NaN 2001 3.6 NaN
3 Nevada 2002 2.4 NaN
4 Nevada 2002 2.9 NaN
5 Nevada 2003 NaN NaN
>>> frame=frame.dropna(axis=1,how="all")
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 NaN 2001 3.6
3 Nevada 2002 2.4
4 Nevada 2002 2.9
5 Nevada 2003 NaN
# only keep rows with at least 3 non-NaN values
>>> frame.dropna(thresh=3)
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
3 Nevada 2002 2.4
4 Nevada 2002 2.9
Filling in missing data
>>> frame.fillna(0)
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 0 2001 3.6
3 Nevada 2002 2.4
4 Nevada 2002 2.9
5 Nevada 2003 0.0
>>> frame.fillna({'state':'Ohio',
...: 'pop': 2})
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2001 3.6
3 Nevada 2002 2.4
4 Nevada 2002 2.9
5 Nevada 2003 2.0
>>> data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
....: 'k2': [1,3,2,3,3,4,4], 'v1':[0,1,2,3,4,5,5]})
>>> data
k1 k2 v1
0 one 1 0
1 two 3 1
2 one 2 2
3 two 3 3
4 one 3 4
5 two 4 5
6 two 4 5
>>> data.duplicated()
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
>>> data.drop_duplicates()
k1 k2 v1
0 one 1 0
1 two 3 1
2 one 2 2
3 two 3 3
4 one 3 4
5 two 4 5
>>> data.drop_duplicates(['k1','k2'])
k1 k2 v1
0 one 1 0
1 two 3 1
2 one 2 2
4 one 3 4
5 two 4 5
# keep="last", False
>>> data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
....: 'Pastrami', 'corned beef', 'Bacon',
....: 'pastrami', 'honey ham', 'nova lox'],
....: 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
>>> data
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0
>>> lowercased = data['food'].apply(
...: lambda x: x.lower())
>>> lowercased
0 bacon
1 pulled pork
2 bacon
3 pastrami
4 corned beef
5 bacon
6 pastrami
7 honey ham
8 nova lox
meat_to_animal = {'bacon': 'pig','pulled pork': 'pig',
'pastrami': 'cow','corned beef': 'cow',
'honey ham': 'pig','nova lox': 'salmon'}
>>> data['animal'] = lowercased.map(meat_to_animal)
>>> data
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
Replacing Values
>>> data.replace("pig","pig-1")
food ounces animal
0 bacon 4.0 pig-1
1 pulled pork 3.0 pig-1
2 bacon 12.0 pig-1
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig-1
6 pastrami 3.0 cow
7 honey ham 5.0 pig-1
8 nova lox 6.0 salmon
>>> data.replace(["pig","cow"],"pig-2")
food ounces animal
0 bacon 4.0 pig-2
1 pulled pork 3.0 pig-2
2 bacon 12.0 pig-2
3 Pastrami 6.0 pig-2
4 corned beef 7.5 pig-2
5 Bacon 8.0 pig-2
6 pastrami 3.0 pig-2
7 honey ham 5.0 pig-2
8 nova lox 6.0 salmon
many to many, by dict
>>> data.replace({"pig":"pig-1","cow":"cow-1"})
Detecting and Filtering Outliers
>>> data[data['ounces']>=8]=NA
>>> data
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 NaN NaN NaN
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 NaN NaN NaN
...
>>> data = data.dropna()
Computing Dummy Variables
>>> dummies = pd.get_dummies(data['animal'])
>>> data.join(dummies)
food ounces animal cow pig salmon
0 bacon 4.0 pig 0 1 0
1 pulled pork 3.0 pig 0 1 0
3 Pastrami 6.0 cow 1 0 0
4 corned beef 7.5 cow 1 0 0
6 pastrami 3.0 cow 1 0 0
7 honey ham 5.0 pig 0 1 0
8 nova lox 6.0 salmon 0 0 1
import pandas as pd
db =pd.read_excel('db_top.xlsx')
# pd.read_csv('...csv',encoding='utf-8')
# pd.read_excel('db_top.xlsx', sheet_name="Sheet1")
# pd.read_excel('db_top.xlsx', sheet_name=0, index_col=0\
header=1, usecols=[0,2])
# 载入excel文件,sheet=0为第一个sheet,index_col指定某一列为index
# header=1指定某一行为header,uscecols提取哪几列
Character Encoding: ASCII, Unicode, UTF-8, , GBK
>>> db.columns
Index(['id', '全名', '导演', '类型', '摘要', '评分', '时长', '时间',
'年份', '演员', '投票数','发行信息', '中文名', '外文名'],dtype='object')
>>> genre = db[['中文名', '类型']].copy()
>>> genre
中文名 类型
0 星际穿越 剧情,科幻,悬疑,家庭,冒险
1 辛德勒的名单 剧情,历史,战争
2 唐伯虎点秋香 喜剧,爱情,古装
3 致命ID 剧情,悬疑,惊悚
4 指环王3:王者无敌 剧情,动作,奇幻,冒险
.. ... ...
245 地球上的星星 剧情,家庭,儿童
246 记忆碎片 剧情,悬疑,惊悚,犯罪
247 蝴蝶效应 剧情,科幻,悬疑,惊悚
248 海上钢琴师 剧情,音乐
249 三傻大闹宝莱坞 剧情,喜剧,爱情,歌舞
all_g_list = []
for x in genre['类型']:
all_g_list.append(x.split(','))
genre["类型"]=all_g_list
# 单个电影
x = pd.DataFrame()
x['类型'] = genre.iloc[0]['类型']
x["中文名"] = genre.iloc[0]['中文名']
x['d']=1
x= pd.pivot_table(x,index='中文名', columns='类型',values='d')
>>> x
类型 冒险 剧情 家庭 悬疑 科幻
中文名
星际穿越 1 1 1 1 1
# 所有电影
genre_list = pd.DataFrame()
for i in range(len(genre)):
x = pd.DataFrame()
x['类型'] = genre.iloc[i]['类型']
x["中文名"] = genre.iloc[i]['中文名']
x['d']=1
x = pd.pivot_table(x,index='中文名', columns='类型',values='d')
genre_list=pd.concat([genre_list,x])
genre_list.fillna(0)
# genre_list.to_csv('g.csv',encoding='utf-8-sig')
We can run regression with those dummy variables and scores of movies.
pip install seaborn
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
director = db['导演'].value_counts()
d = director.head(10)
sns.barplot(y=(d.index), x=(d.values), palette="RdBu_r")
plt.savefig('director.pdf',dpi=300, bbox='tight')
director.to_excel(excel_writer='director.xlsx', sheet_name="rank",\
encoding='utf-8')
# director.to_excel(excel_writer='director.xlsx', index=False,\
columns=[...], na_rep=0, inf_rep=0)
Multiple Sheets
excelpath = '...'
writer = pd.ExcelWriter(excelpath, engine="xlsxwriter")
df1.to_excel(writer, sheet_name="first")
df2.to_excel(writer, sheet_name="second")
df3.to_excel(writer, sheet_name="third")
writer.save()
# string
with open('pi_digits.txt') as file_object:
contents = file_object.read()
print(contents.rstrip())
# pi_digits.txt
3.1415926535
8979323846
2643383279
# Reading Line by Line
filename = 'pi_digits.txt'
with open(filename) as file_object:
for line in file_object:
print(line)
3.1415926535
8979323846
2643383279
with open(filename) as file_object:
for line in file_object:
print(line.rstrip())
3.1415926535
8979323846
2643383279
Making a List of Lines from a File
filename = 'pi_digits.txt'
with open(filename) as file_object:
lines = file_object.readlines()
filename = 'programming.txt'
with open(filename, 'w') as file_object:
file_object.write("I love programming.")
The second argument, 'w', tells Python that we want to open the file in write mode. You can open a file 198 Chapter 10 in read mode ('r'), write mode ('w'), append mode ('a'), or a mode that allows you to read and write to the file ('r+'). If you omit the mode argument, Python opens the file in read-only mode by default.
Python can only write strings to a text file. If you want to store numerical data in a text file, you'll have to convert the data to string format first using the str() function.
filename = 'programming.txt'
with open(filename, 'w') as file_object:
file_object.write("I love programming.")
file_object.write("I love creating new games.")
I love programming.I love creating new games.
filename = 'programming.txt'
with open(filename, 'w') as file_object:
file_object.write("I love programming.\n")
file_object.write("I love creating new games.\n")
I love programming.
I love creating new games.
filename = 'programming.txt'
message.py
with open(filename, 'a') as file_object:
file_object.write("I also love finding meaning in large datasets.\n")
file_object.write("I love creating apps that can run in a browser.\n")
I love programming.
I love creating new games.
I also love finding meaning in large datasets.
I love creating apps that can run in a browser.
pip install (安装文件的绝对路径)
xcode-select --install
pip install wordcloud
from wordcloud import WordCloud
string = 'Importance of relative word frequencies for font-size.\
With relative_scaling=0, only word-ranks are considered. With \
relative_scaling=1, a word that is twice as frequent will have \
twice the size. If you want to consider the word frequencies \
and not only their rank, relative_scaling around .5 often looks good.'
font = r'C:\Windows\Fonts\Arial.TTF'
wc = WordCloud(font_path=font, # Chinese
background_color='white',
width=1000,
height=800,
).generate(string)
wc.to_file('s1.png') # save figure
from matplotlib import pyplot as plt
plt.imshow(wc) # show figure by plt
plt.axis('off')
plt.show()
from wordcloud import WordCloud
filename = 'Harry Potter.txt'
with open(filename, encoding="utf-8") as f_obj:
contents = f_obj.read()
#font = r'C:\Windows\Fonts\Arial.TTF'
font = "/System/Library/Fonts/STHeiti Medium.ttc"
wc = WordCloud(font_path=font,
background_color='white',
width=1000,
height=800,
).generate(contents)
wc.to_file('s2.png')
from wordcloud import WordCloud
filename = 'sanguo.txt'
with open(filename , encoding="utf-8") as f_obj:
contents = f_obj.read()
#font = r'C:\Windows\Fonts\Arial.TTF'
font = "/System/Library/Fonts/STHeiti Medium.ttc"
wc = WordCloud(font_path=font,
background_color='white',
width=1000,
height=800,
).generate(contents)
wc.to_file('s3.png')
from wordcloud import WordCloud
import jieba
filename = 'sanguo.txt'
with open(filename , encoding="utf-8") as f_obj:
contents = f_obj.read()
s = jieba.lcut(contents)
txt = " ".join(s)
#font = r'C:\Windows\Fonts\Arial.TTF'
font = "/System/Library/Fonts/STHeiti Medium.ttc"
wc = WordCloud(font_path=font,
background_color='white',
width=1000,
height=800,
).generate(txt)
wc.to_file('s4.png')
from wordcloud import WordCloud
import jieba
filename = 'sanguo.txt'
with open(filename, encoding="utf-8") as f_obj:
contents = f_obj.read()
def sw(filename):
with open(filename, encoding="utf-8") as f_obj:
x = f_obj.readlines()
y = [word.strip() for word in x]
return y
name_list = ['baidu_stopwords.txt', 'cn_stopwords.txt',
'hit_stopwords.txt','scu_stopwords.txt']
stop_word = []
for x in name_list:
stop_word.extend(sw(x))
stop_word = list(set(stop_word))
s = jieba.lcut(contents)
result = [word for word in s if word not in stop_word]
s = [word for word in result if len(word)>1]
txt = " ".join(s)
#font = r'C:\Windows\Fonts\Arial.TTF'
font = "/System/Library/Fonts/STHeiti Medium.ttc"
wc = WordCloud(font_path=font,
background_color='white',
width=1000,
height=800,
).generate(txt)
wc.to_file('s5.png')