Benchmark: Post-Earthquake Building Repair Recommendations
Naive Bayes Classification Model (Python 3)
This model's predictions achieve a MAP@2 of 0.70645.
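MAP@2 is mean average precision at 2: the submission gives two ranked guesses per building, a sample scores 1.0 if the first guess is the true label, 0.5 if only the second guess is, and 0.0 otherwise, and the score is averaged over all samples. A minimal sketch of that computation follows; the helper name map_at_2 and the exact scoring rule are assumptions based on the usual single-relevant-label definition of MAP@k, not taken from the competition page.

import numpy as np

def map_at_2(y_true, y_pred_top2):
    # y_pred_top2: array of shape (n_samples, 2), most likely class first (assumed layout)
    y_true = np.asarray(y_true)
    y_pred_top2 = np.asarray(y_pred_top2)
    hit1 = y_pred_top2[:, 0] == y_true   # correct on the first guess  -> 1.0
    hit2 = y_pred_top2[:, 1] == y_true   # correct on the second guess -> 0.5
    return np.where(hit1, 1.0, np.where(hit2, 0.5, 0.0)).mean()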
# -*- coding: utf-8 -*-
# Imports
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import BernoulliNB
# Pipeline stages
# Compute the before/after difference for a pair of features
class GetDiff:
    def __init__(self, features=['floors', 'height']):
        self.features = features

    def fit(self, data, y=None):
        return self

    def transform(self, data):
        d = data.copy()
        for feature in self.features:
            # e.g. floors_diff = floors_before - floors_after
            d['%s_diff' % feature] = d['%s_before' % feature] - d['%s_after' % feature]
        return d
# Standardize numeric features (z-score, using statistics learned on the training set)
class Scale:
    def __init__(self, features=['floors_diff', 'height_diff', 'age', 'area']):
        self.features = features

    def fit(self, data, y=None):
        self.scaling_params = dict()
        for feature in self.features:
            self.scaling_params[feature] = [np.mean(data[feature]), np.std(data[feature])]
        return self

    def transform(self, data):
        d = data.copy()
        for feature in self.features:
            mean, std = self.scaling_params[feature]
            d[feature] = (d[feature] - mean) / std
        return d
# One-hot encode the categorical features
class OneHotTransform:
    def __init__(self, features=None):
        if features is None:
            self.features = ['roof_type',
                             'ground_floor_type',
                             'land_condition',
                             'foundation_type',
                             'position',
                             'district_id']
        else:
            self.features = features
        self.encoders = dict()

    def fit(self, data, y=None):
        # Learn the category set of each feature on the training data
        for feature in self.features:
            le = LabelEncoder()
            le.fit(data[feature])
            self.encoders[feature] = le
        return self

    def transform(self, data):
        d = data.copy()
        for feature in self.features:
            le = self.encoders[feature]
            # Fix the category set to the one learned in fit() so that
            # train and test always produce the same columns
            ohe = OneHotEncoder(categories=[np.arange(len(le.classes_))])
            onehot_array = ohe.fit_transform(le.transform(d[feature]).reshape(-1, 1))
            onehot_columns = ['%s_%s' % (feature, c) for c in le.classes_]
            # Keep the original index so the concat below aligns row by row
            onehot_df = pd.DataFrame(data=onehot_array.toarray(), columns=onehot_columns, index=d.index)
            d = pd.concat([d, onehot_df], axis=1)
        return d.drop(self.features, axis=1)
# Keep only the features used by the model
class Selector:
    def __init__(self, features=['floors_diff', 'height_diff', 'age', 'area']):
        self.features = features

    def fit(self, data, y=None):
        return self

    def transform(self, data):
        return data[self.features].copy()
if __name__ == '__main__':
    # Load the data
    train = pd.read_csv('train.csv')
    submit = pd.read_csv('sample_submit.csv')
    test = pd.read_csv('test.csv')

    # Features fed into the pipeline
    model_features = ['floors_after',
                      'height_after',
                      'floors_diff',
                      'height_diff',
                      'age',
                      'area',
                      'roof_type',
                      'ground_floor_type',
                      'land_condition',
                      'foundation_type',
                      'position',
                      'district_id']

    # Preprocessing pipeline
    process_pipe = Pipeline([('get_diff', GetDiff(['floors', 'height'])),
                             ('scale_features', Scale(['floors_diff', 'height_diff', 'age', 'area'])),
                             ('selector', Selector(model_features)),
                             ('one_hot', OneHotTransform())])

    # Fit the pipeline on the training set, then transform both sets
    process_pipe = process_pipe.fit(train)
    train_processed = process_pipe.transform(train)
    test_processed = process_pipe.transform(test)
    # Train the classifier on the one-hot features only
    cols_drop = ['floors_after', 'height_after', 'floors_diff', 'height_diff', 'age', 'area']
    # Class priors taken from the empirical label frequencies (labels are assumed to be 0-3)
    nb = BernoulliNB(class_prior=[(train['y'] == i).mean() for i in range(4)])
    nb.fit(train_processed.drop(cols_drop, axis=1), train['y'])
    pred = nb.predict_proba(test_processed.drop(cols_drop, axis=1))

    # Output: the two most probable classes per row, most probable first
    # (predict_proba columns are ordered 0-3, matching the labels)
    pred_df = pd.DataFrame(data=pred.argsort()[:, -2:][:, ::-1], columns=['y1', 'y2'])
    pred_df.to_csv('my_nb_predictions.csv', index=False)
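Before submitting, the reported MAP@2 can be sanity-checked on a local hold-out split. The sketch below reuses train_processed and cols_drop from the script and the map_at_2 helper sketched above; the 80/20 split and random_state are illustrative choices, and since the preprocessing pipeline was fitted on the full training set this only gives a rough estimate.

# Local hold-out evaluation of the Naive Bayes benchmark (illustrative sketch)
from sklearn.model_selection import train_test_split

X = train_processed.drop(cols_drop, axis=1)
X_tr, X_val, y_tr, y_val = train_test_split(X, train['y'], test_size=0.2, random_state=0)
nb_val = BernoulliNB(class_prior=[(y_tr == i).mean() for i in range(4)])
proba = nb_val.fit(X_tr, y_tr).predict_proba(X_val)
# Map the top-2 probability columns back to class labels before scoring
top2 = nb_val.classes_[proba.argsort()[:, -2:][:, ::-1]]
print('hold-out MAP@2:', map_at_2(y_val.values, top2))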
LightGBM Classification Model (Python 3)
This model's predictions achieve a MAP@2 of 0.78856.
# -*- coding: utf-8 -*-
# Imports
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier
# Pipeline stages
# Compute the before/after difference for a pair of features
class GetDiff:
    def __init__(self, features=['floors', 'height']):
        self.features = features

    def fit(self, data, y=None):
        return self

    def transform(self, data):
        d = data.copy()
        for feature in self.features:
            # e.g. floors_diff = floors_before - floors_after
            d['%s_diff' % feature] = d['%s_before' % feature] - d['%s_after' % feature]
        return d
# Standardize numeric features (z-score, using statistics learned on the training set)
class Scale:
    def __init__(self, features=['floors_diff', 'height_diff', 'age', 'area']):
        self.features = features

    def fit(self, data, y=None):
        self.scaling_params = dict()
        for feature in self.features:
            self.scaling_params[feature] = [np.mean(data[feature]), np.std(data[feature])]
        return self

    def transform(self, data):
        d = data.copy()
        for feature in self.features:
            mean, std = self.scaling_params[feature]
            d[feature] = (d[feature] - mean) / std
        return d
# One-hot encode the categorical features
class OneHotTransform:
    def __init__(self, features=None):
        if features is None:
            self.features = ['roof_type',
                             'ground_floor_type',
                             'land_condition',
                             'foundation_type',
                             'position',
                             'district_id']
        else:
            self.features = features
        self.encoders = dict()

    def fit(self, data, y=None):
        # Learn the category set of each feature on the training data
        for feature in self.features:
            le = LabelEncoder()
            le.fit(data[feature])
            self.encoders[feature] = le
        return self

    def transform(self, data):
        d = data.copy()
        for feature in self.features:
            le = self.encoders[feature]
            # Fix the category set to the one learned in fit() so that
            # train and test always produce the same columns
            ohe = OneHotEncoder(categories=[np.arange(len(le.classes_))])
            onehot_array = ohe.fit_transform(le.transform(d[feature]).reshape(-1, 1))
            onehot_columns = ['%s_%s' % (feature, c) for c in le.classes_]
            # Keep the original index so the concat below aligns row by row
            onehot_df = pd.DataFrame(data=onehot_array.toarray(), columns=onehot_columns, index=d.index)
            d = pd.concat([d, onehot_df], axis=1)
        return d.drop(self.features, axis=1)
# Keep only the features used by the model
class Selector:
    def __init__(self, features=['floors_diff', 'height_diff', 'age', 'area']):
        self.features = features

    def fit(self, data, y=None):
        return self

    def transform(self, data):
        return data[self.features].copy()
if __name__ == '__main__':
    # Load the data
    train = pd.read_csv('train.csv')
    submit = pd.read_csv('sample_submit.csv')
    test = pd.read_csv('test.csv')

    # Features fed into the pipeline
    model_features = ['floors_after',
                      'height_after',
                      'floors_diff',
                      'height_diff',
                      'age',
                      'area',
                      'roof_type',
                      'ground_floor_type',
                      'land_condition',
                      'foundation_type',
                      'position',
                      'district_id']

    # Preprocessing pipeline
    process_pipe = Pipeline([('get_diff', GetDiff(['floors', 'height'])),
                             ('scale_features', Scale(['floors_diff', 'height_diff', 'age', 'area'])),
                             ('selector', Selector(model_features)),
                             ('one_hot', OneHotTransform())])

    # Fit the pipeline on the training set, then transform both sets
    process_pipe = process_pipe.fit(train)
    train_processed = process_pipe.transform(train)
    test_processed = process_pipe.transform(test)
    # Train the classifier (gradient-boosted trees on all processed features)
    clf = LGBMClassifier(max_depth=10, n_estimators=1000, learning_rate=0.1)
    pred = clf.fit(train_processed, train['y']).predict_proba(test_processed)

    # Output: the two most probable classes per row, most probable first
    # (predict_proba columns are ordered 0-3, matching the labels)
    pred_df = pd.DataFrame(data=pred.argsort()[:, -2:][:, ::-1], columns=['y1', 'y2'])
    pred_df.to_csv('my_lgbm_predictions.csv', index=False)
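The fixed n_estimators=1000 could instead be chosen by early stopping on a validation split. A minimal sketch, assuming a recent lightgbm version that provides the lightgbm.early_stopping callback; the 80/20 split, the multi_logloss metric, and stopping_rounds=50 are illustrative choices, not part of the benchmark.

# Choose the number of trees by early stopping on a hold-out split (illustrative sketch)
import lightgbm
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(train_processed, train['y'],
                                             test_size=0.2, random_state=0)
clf_es = LGBMClassifier(max_depth=10, n_estimators=1000, learning_rate=0.1)
clf_es.fit(X_tr, y_tr,
           eval_set=[(X_val, y_val)],
           eval_metric='multi_logloss',
           callbacks=[lightgbm.early_stopping(stopping_rounds=50)])
print('best iteration:', clf_es.best_iteration_)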