Pandas数据分析常用操作整理
每次用python处理数据,老是记不清函数名😓,总得回去翻以前写的代码,实在太麻烦了,就在这里统一整理了一些pandas的常用操作,方便查阅。
【更新中】
读写csv/excel
- 从csv文件读取- 1 
 2
 3
 4
 5
 6- frame = pd.read_csv("./1.csv") 
 '''
 encoding=utf8/gbk
 header=1
 '''
- 从excel文件读取- 1 - member_frame=pd.read_excel("./附件二.xlsx", sheet_name="会员信息") 
- 写入csv文件- 1 
 2
 3
 4- # numpy array写入文件 
 np.savetxt("距离.csv", res_frame, delimiter=",",fmt="%f")
 # pandas dataframe
 road1_frame.to_csv("road1.csv",index=False,sep=',')
Dataframe相关操作
- 排序- 1 
 2
 3- frame.sort(["time", "id"]) # 可多字段排序 
 frame.sort(["time"],ascending=True) # 逆序
- 按条件筛选行- 1 
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17- # 单列 
 frame.loc[frame["grade"] == "B"]
 # 多列
 frame.loc[(frame["grade"] == "B") | (frame["loan_amnt"] > 5000)]
 # 限定显示字段
 frame.loc[frame["grade"] == "B", ["member_id", "term" , "loan_amnt", "grade","sub_grade", "int_rate"]]
 # 多种取值
 frame.loc[frame["grade"].isin(['A','B'])]
 # 取反(加~)
 frame.loc[~frame["grade"].isin(['A','B'])]
 # 包含子串
 frame.loc[frame["name"].str.contains("张")]
- dataframe遍历行- 1 
 2- for index, task in task_frame.iterrows(): 
 task_frame.loc[index, '任务密度']=task["num"]/1000
- series带index遍历行- 1 
 2- for i, v in tmp.iteritems(): # 或 items() 
 print(i, v)
- 统计某一栏取值分布- 1 
 2- road1_frame['hour'].value_counts().sort_index() # 按索引排序 
 frame["终端号"].value_counts().sort_values(ascending=False) # 按值排序- 1 
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18- 06 11 
 07 110
 08 208
 09 186
 10 106
 11 69
 12 86
 13 78
 14 81
 15 61
 17 117
 18 168
 19 153
 20 147
 21 93
 22 49
 23 8
 Name: hour, dtype: int64
- unique方法- 1 - frame["终端号"].unique() - 1 
 2- array(["'00064298", "'00066298", "'00092298", ..., "'00156242", 
 "'00010986", "'00119246"], dtype=object)
- 随机抽样- 1 
 2- terminal_samples = frame.sample(frac=0.1, replace=False) # 指定比例 无放回 
 terminal_samples = frame.sample(n=100, replace=True) # 指定数量 有放回
- 新建空Dataframe- 1 
 2- hour_distri_frame = pd.DataFrame(columns=["终端号"] + [x for x in range(24)]) # 设置列名 
 hour_distri_frame = hour_distri_frame.set_index(["终端号"], drop=True) # 设置索引字段- | | 0 | 1 | 2 | 3 | 4 | … | 19 | 20 | 21 | 22 | 23 | 
 | ——-: | —-: | —-: | —-: | —-: | —-: | —-: | —-: | —-: | —-: | —-: | —-: |
 | 终端号 | | | | | | | | | | | |- 0 rows × 24 columns 
- 时间转换- datetime转timestamp - 1 - frame['time'] = frame['deal_date'].apply(lambda x:time.mktime(x.timetuple())) 
- 转字符串 - 1 - frame['date'].apply(lambda x:x.strftime('%Y/%m/%d')) 
 
- 插入一行- append- 1 
 2
 3
 4
 5- # 字典形式 
 res = res.append([{'score':10.0}], ignore_index=True)
 # frame合并
 insertRow = pd.DataFrame([[0.,0.,0.,0.,0.]],columns = ['date','spring','summer','autumne','winter'])
 res = res.append(insertRow,ignore_index=True)
- concat - 1 
 2- newline = pd.DataFrame([{'obj1':obj1,'obj2':obj2,'relation':rel,'count':0, 'desc':desc}]) 
 rel50group_df=pd.concat([rel50group_df,newline],ignore_index = True)
- 使用index - 1 - df.loc[3]= ['china',123] 
 


