一、读取arff文件
1、使用scipy.io中arff读取文件
import pandas as pd
from scipy.io import arff
# Load the ARFF file; the call's two results are unpacked into the records
# (`data`) and the accompanying metadata (`meta`).
data,meta = arff.loadarff("../Dry_Bean_Dataset/Dry_Bean_Dataset.arff")
2、转化为DataFrame
# Wrap the loaded records in a DataFrame for inspection.
df =pd.DataFrame(data)
# (rows, columns)
print(df.shape)
# First few rows.
print(df.head())
# Number of missing values in each column.
print(df.isnull().sum())
# Summary statistics, transposed so that each column becomes one row.
print(df.describe().T)
# Distinct labels found in the Class column.
print(df.Class.unique())
3、使用LabelEncoder对类别Class进行编码
from sklearn.preprocessing import LabelEncoder
# Fit the encoder exactly once.  Re-fitting it on the already encoded column
# (as a loop over df.columns would do) replaces lb_encoder.classes_ with the
# integers 0..n-1, so the original labels could no longer be recovered with
# inverse_transform.
lb_encoder = LabelEncoder()
df.Class = lb_encoder.fit_transform(df.Class)
# Number of samples per encoded class.
print(df.Class.value_counts())
4、使用sns与plt绘制不同类别的数量
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of how many rows carry each Class value.
sns.countplot(x=df.Class)
plt.title('Class')
plt.show()
2.填补缺失值
1、查找空值
# Count the missing values in each column and list the columns from the most
# to the fewest missing values.
DataDF.isnull().sum().sort_values(ascending=False)
2、删除有缺失值的行或者列
# Remove rows or columns that contain missing values.
drop_row_data = train_data.dropna()  # drop every row with a missing value
drop_col_data = train_data.dropna(axis=1)  # drop every column with at least one missing value
3、常规填补法
均值填充(适用于数值型属性;类别型属性应改用众数):
# Embarked holds the string codes 'S'/'C'/'Q', which have no arithmetic mean,
# so the gaps are filled with the most frequent value (the mode).  For a
# numeric column the same pattern works with .mean() in place of .mode()[0].
all_features.Embarked = all_features.Embarked.fillna(all_features.Embarked.mode()[0])
邻近值填充(用前一个或后一个有效值填充),在时间序列分析中更常见
# Series.fillna(method=...) is deprecated in recent pandas; ffill() and bfill()
# are the direct replacements.
print(DataDF.UnitPrice.ffill())  # forward fill: carry the previous valid value forward
print(DataDF.UnitPrice.bfill())  # backward fill: use the next valid value
4、使用与之相关的属性,相近的求均值填补
# Box plot of Age for each Pclass value.  The plot call and plt.show() must be
# separate statements.
sns.boxplot(x=train_data["Pclass"], y=train_data["Age"])
plt.show()
(此处原为 Age 随 Pclass 变化的箱线图,图片在转存时丢失。)
'''
通过数据可视化,发现Age与Pclass相关性较大
'''
def add_age(cols):
    """Return the age for one row, imputing it from the passenger class if missing.

    Args:
        cols: A two-item row: the age first, the passenger class second
            (for example one row of ``train_data[['Age', 'Pclass']]``).

    Returns:
        The age itself when it is present; otherwise the mean ``Age`` of the
        rows of the module-level ``train_data`` whose ``Pclass`` equals this
        row's class (NaN when no such row has a known age).
    """
    # Unpack by position without label-based indexing: cols[0] on a
    # label-indexed Series is deprecated in recent pandas.
    age, pclass = tuple(cols)[:2]
    if not pd.isnull(age):
        return age
    # One lookup keyed on the row's own class replaces a separate branch per
    # class value.
    same_class = train_data['Pclass'] == pclass
    return train_data.loc[same_class, 'Age'].mean()
# Run add_age on every (Age, Pclass) row and store the result as the Age column.
train_data['Age']=train_data[['Age','Pclass']].apply(add_age,axis=1)
5、具体分析
# Extract the title: the run of letters that follows a space and precedes a
# period in each name.  The pattern is a raw string so that "\." reaches the
# regex engine as an escaped dot instead of being an invalid string escape.
df_data['Title'] = df_data.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# Cross-tabulate title against sex (rows: Sex, columns: Title).
pd.crosstab(df_data['Title'], df_data['Sex']).transpose()
| Sex \ Title | Capt | Col | Countess | Don | Dona | Dr | Jonkheer | Lady | Major | Master | Miss | Mlle | Mme | Mr | Mrs | Ms | Rev | Sir |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| female | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 260 | 2 | 1 | 0 | 187 | 2 | 0 | 0 |
| male | 1 | 4 | 0 | 1 | 0 | 7 | 1 | 0 | 2 | 61 | 0 | 0 | 0 | 757 | 0 | 0 | 8 | 1 |
"""
根据统计结果填补Age
"""
# Extract the title (the letters right before a period) from every name.
# str.extract is vectorised over the whole column, so neither a loop nor a
# placeholder column is needed.  expand=False yields a Series, and the raw
# string keeps "\." a valid escape.
train_data['Initial'] = train_data.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
# Notebook display: title counts per sex, shaded by magnitude.
pd.crosstab(train_data.Initial, train_data.Sex).T.style.background_gradient()

# Fold the rare titles into the common groups.
title_groups = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
    'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
}
# Assign the result back: replace(..., inplace=True) on train_data['Initial']
# may act on a temporary copy and leave train_data unchanged.
train_data['Initial'] = train_data['Initial'].replace(title_groups)

# Fill each missing Age with the value chosen for the passenger's title group.
age_by_title = {'Mr': 33, 'Mrs': 36, 'Master': 5, 'Miss': 22, 'Other': 46}
for title, age in age_by_title.items():
    missing = train_data.Age.isnull() & (train_data.Initial == title)
    train_data.loc[missing, 'Age'] = age
3.将文本转换为数字
1、使用LabelEncoder对特征进行编码
from sklearn.preprocessing import LabelEncoder
# Fit the encoder exactly once.  Re-fitting it on the already encoded column
# (as a loop over df.columns would do) replaces lb_encoder.classes_ with the
# integers 0..n-1, so the original labels could no longer be recovered with
# inverse_transform.
lb_encoder = LabelEncoder()
df.Class = lb_encoder.fit_transform(df.Class)
# Number of samples per encoded class.
print(df.Class.value_counts())
2、直接替换
如果属性只有几个类别,可以直接替换
# Encode the Embarked values 'S'/'C'/'Q' as 0/1/2.  Series.map turns any value
# that is not a key of the dict into NaN.
train_data.Embarked=train_data.Embarked.map({'S':0, 'C':1, 'Q':2})
或者
# Assign the result back instead of calling replace(..., inplace=True) on the
# column accessor, which may modify a temporary copy and leave all_features
# unchanged.  Values that are not keys of the dict are kept as they are.
all_features.Sex = all_features.Sex.replace({'male': 1, 'female': 0})
all_features.Embarked = all_features.Embarked.replace({'S': 0, 'C': 1, 'Q': 2})
3、one-hot编码
4. 缩放和规范化
缩放(scaling)改变的是数据的取值范围;
规范化(normalization)改变的是数据分布的形状。
- in scaling, you're changing the range of your data, while
- in normalization, you're changing the shape of the distribution of your data.
缩放:
# Wrap the goal column in a DataFrame.
original_goal_data = pd.DataFrame(kickstarters_2017.goal)
# Rescale the "goal" column.  NOTE(review): minmax_scaling is not imported in
# this snippet -- presumably mlxtend.preprocessing.minmax_scaling; confirm.
scaled_goal_data = minmax_scaling(original_goal_data,columns=["goal"])
规范化(Box-Cox 变换):
# Wrap the raw pledged column in a DataFrame.
original_pledged = pd.DataFrame(kickstarters_2017.pledged)
# Boolean mask selecting the rows whose pledged amount is greater than zero.
index_positive_pledges = kickstarters_2017.pledged > 0
positive_pledges_only = kickstarters_2017.pledged.loc[index_positive_pledges]
# Box-Cox transform of the positive values; element [0] of the result is kept
# as the transformed data and re-indexed like its input.
# NOTE(review): `stats` is presumably scipy.stats, whose boxcox requires
# strictly positive input (hence the > 0 filter) -- confirm the import.
normalized_values = pd.Series(stats.boxcox(positive_pledges_only)[0],
name='pledged', index=positive_pledges_only.index)
5.日期处理
1、将日期规范化后存入date_parsed列中
# Parse the 'date' strings as month/day/four-digit-year and store the result
# in the 'date_parsed' column.
landslides['date_parsed'] = pd.to_datetime(landslides['date'], format="%m/%d/%Y")
注意!format 中 %Y 表示四位年份(如 2007),%y 表示两位年份(如 07),必须与数据中的实际格式一致。
2、从时间属性值获取日期/月份/年份
# Day-of-month component of each parsed date.
day_of_month_earthquakes = earthquakes['date_parsed'].dt.day
6、数据统一
对于相同属性的相同值,由于空格、大小写等原因导致不一致。
1、提取出属性中出现的值,并使用unique()获取互不相同的值
# Distinct values that occur in the 'Country' column.
countries = professors['Country'].unique()
2、删除头尾空格,把所有大写字符转化为小写
# Lower-case every country name ...
professors['Country'] = professors['Country'].str.lower()
# ... then remove leading and trailing whitespace.
professors['Country'] = professors['Country'].str.strip()
Python strip() 方法用于移除字符串头尾指定的字符(默认为空格或换行符)或字符序列。
Python lower() 方法转换字符串中所有大写字符为小写。
3、模糊处理
def replace_matches_in_column(df, column, string_to_match, min_ratio=47):
    """Replace values of ``df[column]`` that closely match ``string_to_match``.

    The distinct values of the column are scored against ``string_to_match``
    with ``fuzzywuzzy.fuzz.token_sort_ratio``. Among the ten best candidates,
    every value whose score is at least ``min_ratio`` is overwritten in place
    with ``string_to_match``.

    Args:
        df: DataFrame to modify in place.
        column: Name of the column to clean.
        string_to_match: Canonical spelling that close matches are replaced with.
        min_ratio: Minimum score a candidate needs to be replaced.
    """
    strings = df[column].unique()
    matches = fuzzywuzzy.process.extract(string_to_match, strings, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
    # Each match starts with (value, score); keep the values that score high enough.
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]
    rows_with_matches = df[column].isin(close_matches)
    df.loc[rows_with_matches, column] = string_to_match


# The ten entries of `countries` that are closest to "usa", with their scores.
matches = fuzzywuzzy.process.extract("usa", countries, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
# The helper has to be defined before this call, hence the definition above.
replace_matches_in_column(df=professors, column='Country', string_to_match="usa", min_ratio=70)
文章出处登录后可见!
已经登录?立即刷新