df = pd.read_csv("ks-projects-201801.csv")
relative path
df = pd.read_csv("data/ks-projects-201801.csv") # Linux or OSX
df = pd.read_csv("data\ks-projects-201801.csv") # Windows
absolute path
file_path = '/home/ehmatthes/data/ks-projects-201801.csv'
df = pd.read_csv(file_path) # Linux or OSX
file_path = r'C:\Users\ehmatthes\data\ks-projects-201801.csv'
df = pd.read_csv(file_path) # Windows
>>> df.shape
(378661, 15)
>>> df.info()
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
ID 378661 non-null int64
name 378657 non-null object
category 378661 non-null object
main_category 378661 non-null object
currency 378661 non-null object
deadline 378661 non-null object
goal 378661 non-null float64
launched 378661 non-null object
pledged 378661 non-null float64
state 378661 non-null object
backers 378661 non-null int64
country 378661 non-null object
usd pledged 374864 non-null float64
usd_pledged_real 378661 non-null float64
usd_goal_real 378661 non-null float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB
Filtering with boolean array
#查看第一行
>>> df.iloc[1,:]
ID 1000003930
name Greeting From Earth: ZGAC Arts Capsule For ET
category Narrative Film
main_category Film & Video
currency USD
deadline 2017-11-01
goal 30000
launched 2017-09-02 04:43:57
pledged 2421
state failed
backers 15
country US
usd pledged 100
usd_pledged_real 2421
usd_goal_real 30000
Name: 1, dtype: object
#根据条件筛选特定行:美国本土项目,筹资金额大于等于30000美刀
>>> df_us = df[df['country']=='US']
>>> df_us30000 = df_us[df_us['goal']>=30000]
#[43312 rows x 15 columns]
Basic Data Description
>>> df_us[['backers','goal','pledged']].describe()
backers goal pledged
count 292627.000000 2.926270e+05 2.926270e+05
mean 113.078615 4.403497e+04 9.670193e+03
std 985.723400 1.108372e+06 9.932942e+04
min 0.000000 1.000000e-02 0.000000e+00
25% 2.000000 2.000000e+03 4.100000e+01
50% 14.000000 5.250000e+03 7.250000e+02
75% 60.000000 1.500000e+04 4.370000e+03
max 219382.000000 1.000000e+08 2.033899e+07
Arithmetics (+,-,/,//,*,**)
>>> df_us['percentage'] = df_us['pledged']/df_us['goal']
#去掉warning
df_us.copy().loc[:,'percentage'] = df_us['pledged']/df_us['goal']
When sorting a DataFrame, you can use the data in one or more columns as the sort keys. To do so, pass one or more column names to the by option of sort_values.
>>> df_us.sort_values(by='goal', ascending=False)
>>> df_us.head().sort_values(by=['main_category','goal'],
...: ascending=[True,False])
ID ... percentage
2 1.000004e+09 ... 0.004889
1 1.000004e+09 ... 0.080700
4 1.000011e+09 ... 0.065795
5 1.000014e+09 ... 1.047500
3 1.000008e+09 ... 0.000200
>>> df_us.head().rank()
ID name category ... usd_pledged_real usd_goal_real percentage
1 1.0 2.0 3.5 ... 4.0 3.0 4.0
2 2.0 5.0 3.5 ... 2.0 4.0 2.0
3 3.0 4.0 2.0 ... 1.0 1.0 1.0
4 4.0 1.0 1.0 ... 3.0 2.0 3.0
5 5.0 3.0 5.0 ... 5.0 5.0 5.0
>>> df_s=pd.DataFrame({'sales':[100,200,300,200]})
sales
0 100
1 200
2 300
3 200
>>> df_s.rank()
sales
0 1.0
1 2.5
2 4.0
3 2.5
>>> df_s.rank(method='first') #数值
sales
0 1.0
1 2.0
2 4.0
3 3.0
min and max
>>> df_s.rank(method='min')
sales
0 1.0
1 2.0
2 4.0
3 2.0
>>> df_s.rank(method='max')
sales
0 1.0
1 3.0
2 4.0
3 3.0
>>> df_us['main_category'].value_counts()
Film & Video 51922
Music 43238
Publishing 31726
Games 24636
Art 22311
Design 21690
Technology 21556
Food 19941
Fashion 16584
Comics 8910
Theater 8709
Photography 7988
Crafts 6648
Journalism 3540
Dance 3228
Name: main_category, dtype: int64
>>> df_us['main_category'].value_counts(
...: normalize = True)
Film & Video 0.177434
Music 0.147758
Publishing 0.108418
Games 0.084189
Art 0.076244
Design 0.074122
Technology 0.073664
Food 0.068145
Fashion 0.056673
Comics 0.030448
Theater 0.029761
Photography 0.027298
Crafts 0.022718
Journalism 0.012097
Dance 0.011031
Name: main_category, dtype: float64
>>> df_us['main_category'].isin(['Music','Games'])
>>> df_us[df_us['main_category'].isin(['Music','Games'])==True]
from datetime import datetime
>>> now = datetime.now()
>>> now
datetime.datetime(2020, 10, 29, 11, 50, 46, 908483)
>>> now.year, now.month, now.day
(2020, 10, 29)
>>> delta = datetime(2011,1,7) - datetime(2008, 6, 24)
>>> delta.days
926
>>> from datetime import timedelta #only for day, hour, second
>>> datetime(2011,1,7) + timedelta(12)
datetime.datetime(2011, 1, 19, 0, 0)
Converting between String and Datetime
>>> date_1 = df_us.loc[1,'launched']
>>> date_1
'2017-09-02 04:43:57'
>>> stamp=datetime.strptime(date_1,'%Y-%m-%d %H:%M:%S')
datetime.datetime(2017, 9, 2, 4, 43, 57)
>>> type(stamp)
datetime.datetime
>>> stamp.strftime('%d-%m-%Y')
'02-09-2017'
%Y Four-digit year
%y two-digit year
%m two-digit month
%B Month name
%d two-digit day
%W week number
%H Hour in 24-h
%I Hour in 12-h
%S Second
%F shortcut Y-M-D
%D shortcut m/d/y
Parse
>>> from dateutil.parser import parse
>>> parse('2017-09-02 04:43:57')
datetime.datetime(2017, 9, 2, 4, 43, 57)
>>> stamp_0 =pd.to_datetime('2017-09-02 04:43:57')
>>> stamp_0
Timestamp('2017-09-02 04:43:57')
Calculating the time period
>>> df_us['launched_time'] = pd.to_datetime(df_us['launched'])
>>> df_us['deadline_time'] = pd.to_datetime(df_us['deadline'])
>>> df_us['period'] = df_us['deadline_time']-df_us['launched_time']
>>> df_us['period']
1 59 days 19:16:03
2 44 days 23:39:10
3 29 days 20:35:49
4 55 days 15:24:57
5 34 days 10:21:33
378656 29 days 21:24:30
378657 26 days 20:24:46
378658 45 days 04:19:30
378659 30 days 05:46:07
378660 27 days 14:52:13
Name: period, Length: 292627, dtype: timedelta64[ns]
Converting the time to numbers
>>> df_us['period_num'] = df_us['period']/timedelta(1)
>>> df_us['period_num']
1 59.802813
2 44.985532
3 29.858206
4 55.642326
5 34.431632
378656 29.892014
378657 26.850532
378658 45.180208
378659 30.240359
378660 27.619595
Name: period_num, Length: 292627, dtype: float64
Filtering by date
>>> df_us[(df_us['launched_time']>='20150101')]
>>> df_us[(df_us['launched_time']>='20150101')& (df_us['launched_time']<='20151231')]
ID ... period_num
4 1000011046 ... 55.642326
15 1000064368 ... 29.909109
17 1000068480 ... 29.110486
38 1000134913 ... 34.092025
49 1000197321 ... 40.050185
... ... ...
378600 999687927 ... 59.057986
378616 99977040 ... 29.821157
378635 99987261 ... 29.057477
378639 999884445 ... 29.455394
378644 999934908 ... 21.685255
[53467 rows x 19 columns]
Index by Timestamps
>>> df_ustime = df_us.set_index(['launched_time'])
>>> df_ustime
ID ... period_num
launched_time ...
2017-09-02 04:43:57 1000003930 ... 59.802813
2013-01-12 00:20:50 1000004038 ... 44.985532
2012-03-17 03:24:11 1000007540 ... 29.858206
2015-07-04 08:35:03 1000011046 ... 55.642326
2016-02-26 13:38:27 1000014025 ... 34.431632
... ... ...
2014-09-17 02:35:30 999976400 ... 29.892014
2011-06-22 03:35:14 999977640 ... 26.850532
2010-07-01 19:40:30 999986353 ... 45.180208
2016-01-13 18:13:53 999987933 ... 30.240359
2011-07-19 09:07:47 999988282 ... 27.619595
[292627 rows x 19 columns]
>>> df_ustime.loc['20170902'] # index重复
>>> df_ustime.loc['20170902':'20170930']
Summarizing and Computing
>>> df_us.count(axis=1) #每行非空个数
>>> df_us.count(axis=0) #每列非空个数
>>> df_us['usd_pledged_real'].sum()
>>> df_us['usd_pledged_real'].mean()
.max() .min()
.median() .mode()
.var() .std()
.quantile() #0-1
Correlation and Covariance
>>> data = pd.DataFrame({'Q1':[1,3,4,3,4],
...: 'Q2':[2,3,1,2,3],
...: 'Q3':[1,5,2,4,4]})
>>> data['Q1'].corr(data['Q3'])
0.4969039949999533
>>> data['Q1'].cov(data['Q3'])
1
>>> data.cov()
>>> data.corr()
Export .csv file
>>> out_col = ['deadline_time','period_num','launched_time']
>>> df_us.to_csv('out.csv',index=False,columns=out_col)
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("ks-projects-201801.csv")
df_state = df['state'].value_counts()
print(df_state)
df_state_r=df_state['failed':'canceled']
df_state_r['others']=df_state['undefined':].sum()
explode=[0.05,0,0,0]
plt.pie(df_state_r.values, labels=df_state_r.index,\
explode=explode, autopct='%.0f%%',shadow=True)
plt.title("State", loc='center')
plt.savefig('state.jpg',dpi=300)
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("ks-projects-201801.csv")
df_category = df['main_category'].value_counts()
df_category_r=df_category['Film & Video':'Fashion']
df_category_r['others']=df_category['Theater':'Dance'].sum()
plt.pie(df_category_r.values, labels=df_category_r.index,\
autopct='%.0f%%',shadow=True)
plt.title("Category", loc='center')
plt.savefig('category.jpg',dpi=300)
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("ks-projects-201801.csv")
df_pledged = df.sort_values(by='pledged',ascending=False).head(100)
plt.style.use('ggplot')
plt.plot(df_pledged['pledged'].values)
plt.title("Pledged", loc='center')
plt.xlabel("Top 100 projects", fontsize=10)
plt.ylabel("Amount", fontsize=10)
plt.savefig('pledged.jpg',dpi=300)
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("ks-projects-201801.csv")
df['launched_time'] = pd.to_datetime(df['launched'])
df['deadline_time'] = pd.to_datetime(df['deadline'])
def year_data(y):
return df[(df['launched_time']>=str(y)) & (df['launched_time']< str(y+1))]
df_year_count=[]
df_year=list(range(2010,2018))
for y in range(2010,2018):
df_year_count.append(year_data(y)['ID'].count())
plt.bar(df_year,df_year_count)
plt.title("2010-2017 Projects", loc='center')
plt.xlabel("Year", fontsize=10)
plt.ylabel("Amount", fontsize=10)
plt.savefig('year.jpg',dpi=300)
def year_data(y):
return df[(df['launched_time']>=str(y)) & (df['launched_time']< str(y+1))]
df_year_count=[]
df_year=list(range(2010,2018))
for y in range(2010,2018):
df_year_count.append(year_data(y)['ID'].count())
df_year_s=[]
for y in range(2010,2018):
df_year_s.append(year_data(y)[year_data(y)['state']=='successful']['ID'].count())
plt.bar(df_year,df_year_count, label='total')
plt.bar(df_year,df_year_s,label='successful')
plt.title("2010-2017 Projects", loc='center')
plt.xlabel("Year", fontsize=10)
plt.ylabel("Amount", fontsize=10)
plt.legend(loc="upper left")
plt.savefig('year_percentage.jpg',dpi=300)