# modules we'll use
import pandas as pd
import numpy as np
# helpful modules
import fuzzywuzzy
from fuzzywuzzy import process
import chardet
# read in all our data
professors = pd.read_csv("./pakistan_intellectual_capital.csv")
# set seed for reproducibility
np.random.seed(0)
/opt/conda/lib/python3.6/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
我们快速的看过资料
professors.head()
我们现在先看Country
这个column中,有没有哪边有资料不一致的问题
# get all the unique values in the 'Country' column
countries = professors['Country'].unique()
# sort them alphabetically and then take a closer look
countries.sort()
countries
array([' Germany', ' New Zealand', ' Sweden', ' USA', 'Australia',
'Austria', 'Canada', 'China', 'Finland', 'France', 'Greece',
'HongKong', 'Ireland', 'Italy', 'Japan', 'Macau', 'Malaysia',
'Mauritius', 'Netherland', 'New Zealand', 'Norway', 'Pakistan',
'Portugal', 'Russian Federation', 'Saudi Arabia', 'Scotland',
'Singapore', 'South Korea', 'SouthKorea', 'Spain', 'Sweden',
'Thailand', 'Turkey', 'UK', 'USA', 'USofA', 'Urbana', 'germany'],
dtype=object)
从资料中我们可以看出有一些不一致的问题,例如:Germany
、germany
, New Zealand
、New Zealand
第一步是要先将所有资料转为小写,并且将空格移除掉
在文本数据中,大写和空格的问题非常常见,这样处理过後,可以修复80%以上的不一致问题
# convert to lower case
professors['Country'] = professors['Country'].str.lower()
# remove trailing white spaces
professors['Country'] = professors['Country'].str.strip()
然後我们来处理更困难的不一致问题
然後我们现在来看一下前面处理过後的资料
# get all the unique values in the 'Country' column
countries = professors['Country'].unique()
# sort them alphabetically and then take a closer look
countries.sort()
countries
array(['australia', 'austria', 'canada', 'china', 'finland', 'france',
'germany', 'greece', 'hongkong', 'ireland', 'italy', 'japan',
'macau', 'malaysia', 'mauritius', 'netherland', 'new zealand',
'norway', 'pakistan', 'portugal', 'russian federation',
'saudi arabia', 'scotland', 'singapore', 'south korea',
'southkorea', 'spain', 'sweden', 'thailand', 'turkey', 'uk',
'urbana', 'usa', 'usofa'], dtype=object)
现在又有出现其他的不一致,例如说south korea
、southkorea
我们将会使用fuzzywuzzy套件来帮助我们确定那些字符彼此是接近的,只要数据集够小,就可以很好的纠正错误。
Fuzzy mathcing:自动寻找非常相似的字串的过程,称为Fuzzy matching。这个方法没办法100%的解决问题,但是至少可以节省一些时间
Fuzzywuzzy会返回两个字串之间的比例,比例越接近100,就代表说两个字串越相向
# get the top 10 closest matches to "south korea"
matches = fuzzywuzzy.process.extract("south korea", countries, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
# take a look at them
matches
[('south korea', 100),
('southkorea', 48),
('saudi arabia', 43),
('norway', 35),
('ireland', 33),
('portugal', 32),
('singapore', 30),
('netherland', 29),
('macau', 25),
('usofa', 25)]
从上面的结果可以看出,south korea
和southkorea
是非常接近的,我们将相似程度>47的资料替换成south korea
我们将这个步骤写成一个function,比较方便後续的处理
# function to replace rows in the provided column of the provided dataframe
# that match the provided string above the provided ratio with the provided string
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
# get a list of unique strings
strings = df[column].unique()
# get the top 10 closest matches to our input string
matches = fuzzywuzzy.process.extract(string_to_match, strings,
limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
# only get matches with a ratio > 90
close_matches = [matches[0] for matches in matches if matches[1] >= min_ratio]
# get the rows of all the close matches in our dataframe
rows_with_matches = df[column].isin(close_matches)
# replace all rows with close matches with the input matches
df.loc[rows_with_matches, column] = string_to_match
# let us know the function's done
print("All done!")
# use the function we just wrote to replace close matches to "south korea" with "south korea"
replace_matches_in_column(df=professors, column='Country', string_to_match="south korea")
All done!
# get all the unique values in the 'Country' column
countries = professors['Country'].unique()
# sort them alphabetically and then take a closer look
countries.sort()
countries
array(['australia', 'austria', 'canada', 'china', 'finland', 'france',
'germany', 'greece', 'hongkong', 'ireland', 'italy', 'japan',
'macau', 'malaysia', 'mauritius', 'netherland', 'new zealand',
'norway', 'pakistan', 'portugal', 'russian federation',
'saudi arabia', 'scotland', 'singapore', 'south korea', 'spain',
'sweden', 'thailand', 'turkey', 'uk', 'urbana', 'usa', 'usofa'],
dtype=object)
将资料替换过後再过来检查,确认资料已经被替换了
<<: Day 28 [Python ML、资料清理] 处理字元编码
部属.net core程序到IIS上 这里用local IIS测试 预先需要先安装 .NET Cor...
大家午安 ~ 昨天我们已经启用 VPC Flow Log 并且存放到 S3,今天我们会设定 AWS ...
曾有一个刚大学毕业的朋友(以下称朋友)问我关於求职的问题 朋友: 我非本科系毕业,又没经验要怎麽找软...
上一天我们提到了 Domain Layer 会包含以下三个组件:CoEditor, ContextM...
Https 连线网页使用 SSL 加密凭证可以让使用者在网页输入的资料更加安全,减少被截取内容的风险...