机器读中文:根据名字判断性别
标杆:机器读中文:根据名字判断性别
朴素贝叶斯
该模型预测结果的准确度为:0.8277
具体代码及步骤:利用朴素贝叶斯对名字进行性别预测
高频单字划分法(Python)
该模型预测结果的准确度为:0.70509
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from collections import Counter
# Number of most frequent characters kept for each gender.
n_top = 250


def _to_text(value):
    """Return ``value`` as text, decoding UTF-8 byte strings when needed.

    Lets the same code run where names arrive as byte strings (Python 2
    ``str``) and where they are already text (Python 3 ``str``).
    """
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def top_chars(names, n):
    """Return the set of the ``n`` most frequent characters in ``names``.

    ``names`` is any iterable of name strings; characters are counted
    across all names together.
    """
    counts = Counter(ch for name in names for ch in _to_text(name))
    return {ch for ch, _ in counts.most_common(n)}


def gender_specific_chars(male_names, female_names, n=n_top):
    """Return ``(male_only, female_only)`` sets of frequent characters.

    Each set starts as the ``n`` most frequent characters for that gender;
    characters present in both top lists are removed from both.
    """
    male = top_chars(male_names, n)
    female = top_chars(female_names, n)
    # The original script subtracted `intersects` without ever defining it.
    intersects = male & female
    return male - intersects, female - intersects


def predict_gender(name, top_chars_male, top_chars_female):
    """Predict the gender label for a single name.

    Returns 1 if the name contains any male-only character, otherwise 0 if
    it contains any female-only character, otherwise a random 0/1 drawn
    with ``np.random.choice``.
    """
    name_chars = set(_to_text(name))
    if not name_chars.isdisjoint(top_chars_male):
        return 1
    if not name_chars.isdisjoint(top_chars_female):
        return 0
    return np.random.choice(2)


def run_top_chars_baseline():
    """Read the data files, predict, and write my_top250_prediction.csv."""
    # Load the data; `sep` is passed by keyword because recent pandas
    # versions no longer accept it positionally.
    train = pd.read_table('train.txt', sep=',')
    test = pd.read_table('test.txt', sep=',')
    submit = pd.read_csv('sample_submit.csv')

    # Names of rows labelled 1 (male) and 0 (female).
    male_names = train.loc[train['gender'] == 1, 'name']
    female_names = train.loc[train['gender'] == 0, 'name']

    top_chars_male, top_chars_female = gender_specific_chars(
        male_names, female_names, n_top)

    preds = [predict_gender(name, top_chars_male, top_chars_female)
             for name in test['name']]

    # Write the predictions to my_top250_prediction.csv.
    submit['gender'] = np.array(preds)
    submit.to_csv('my_top250_prediction.csv', index=False)


if __name__ == '__main__':
    run_top_chars_baseline()
基于TF的GBDT模型(Python)
该模型预测结果的准确度为:0.83361
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.ensemble import GradientBoostingClassifier
# Feature columns fed to the classifier:
#   1_f / 2_f: frequency of the name's 1st / 2nd character among female names
#   1_m / 2_m: frequency of the name's 1st / 2nd character among male names
FEATURE_COLUMNS = ['1_f', '1_m', '2_f', '2_m']


def _to_text(value):
    """Return ``value`` as text, decoding UTF-8 byte strings when needed.

    Lets the same code run where names arrive as byte strings (Python 2
    ``str``) and where they are already text (Python 3 ``str``).
    """
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def char_counts(names):
    """Return a ``Counter`` of every character over all of ``names``."""
    return Counter(ch for name in names for ch in _to_text(name))


def encode_name(name, counts_female, counts_male, f_cnt, m_cnt):
    """Encode one name as ``[1_f, 1_m, 2_f, 2_m]``.

    Each value is the character's total count in that gender's training
    names divided by the number of training names of that gender
    (``f_cnt`` / ``m_cnt``). Only the first two characters are used;
    missing positions stay 0. A zero denominator yields 0 instead of
    raising.
    """
    row = [0., 0., 0., 0.]
    # Slicing to two characters keeps longer names from writing past the
    # four feature slots (the original training loop raised IndexError).
    for j, ch in enumerate(_to_text(name)[:2]):
        # `* 1.` forces float division on Python 2 as well.
        row[2 * j] = counts_female[ch] * 1. / f_cnt if f_cnt else 0.
        row[2 * j + 1] = counts_male[ch] * 1. / m_cnt if m_cnt else 0.
    return row


def encode_names(names, counts_female, counts_male, f_cnt, m_cnt):
    """Encode every name in ``names`` into a DataFrame of FEATURE_COLUMNS."""
    rows = [encode_name(name, counts_female, counts_male, f_cnt, m_cnt)
            for name in names]
    return pd.DataFrame(rows, columns=FEATURE_COLUMNS)


def run_tf_gbdt():
    """Read the data files, fit the GBDT, write my_TF_GBDT_prediction.csv."""
    # Load the data; `sep` is passed by keyword because recent pandas
    # versions no longer accept it positionally.
    train = pd.read_table('train.txt', sep=',')
    test = pd.read_table('test.txt', sep=',')
    submit = pd.read_csv('sample_submit.csv')

    # Rows labelled 1 (male) and 0 (female), and how many of each.
    train_male = train[train['gender'] == 1]
    m_cnt = len(train_male)
    train_female = train[train['gender'] == 0]
    f_cnt = len(train_female)

    # Total occurrences of each character in male / female names.
    counts_male = char_counts(train_male['name'])
    counts_female = char_counts(train_female['name'])

    # Character-frequency features for the training and test names.
    train_encoded = encode_names(
        train['name'], counts_female, counts_male, f_cnt, m_cnt)
    test_encoded = encode_names(
        test['name'], counts_female, counts_male, f_cnt, m_cnt)

    # Train the GBDT model and predict the test set.
    clf = GradientBoostingClassifier()
    clf.fit(train_encoded, train['gender'])
    preds = clf.predict(test_encoded)

    # Write the predictions to my_TF_GBDT_prediction.csv.
    submit['gender'] = np.array(preds)
    submit.to_csv('my_TF_GBDT_prediction.csv', index=False)


if __name__ == '__main__':
    run_tf_gbdt()