地震后建筑修复建议

标杆模型:地震后建筑修复建议

朴素贝叶斯分类模型(Python3)

该模型预测结果的MAP@2为:0.70645

# -*- coding: utf-8 -*-

# 引用模块
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import BernoulliNB


# 定义Pipeline中的各个环节

# 对两个特征取差值
class GetDiff:
    """Pipeline step that adds ``<feature>_diff`` columns.

    For every name in ``features`` the step computes
    ``<feature>_before - <feature>_after`` and stores the result in a new
    column named ``<feature>_diff``.

    Parameters
    ----------
    features : iterable of str
        Base feature names. The frame passed to ``transform`` must contain
        both ``<feature>_before`` and ``<feature>_after`` for each name.
    """

    def __init__(self, features=('floors', 'height')):
        # Copy into a fresh list so instances never share the default value
        # (or the caller's list) and cannot leak mutations into each other.
        self.features = list(features)

    def fit(self, data, y=None):
        """Return ``self`` unchanged; this step learns nothing from ``data``."""
        return self

    def transform(self, data):
        """Return a copy of ``data`` with one ``*_diff`` column per feature."""
        d = data.copy()
        for feature in self.features:
            before = d['%s_before' % feature]
            after = d['%s_after' % feature]
            d['%s_diff' % feature] = before - after
        return d
    
# 对数值特征进行标准化
class Scale:
    """Pipeline step that standardizes numeric columns.

    ``fit`` records the mean and standard deviation of every listed column;
    ``transform`` subtracts that mean and divides by that standard deviation.

    Parameters
    ----------
    features : iterable of str
        Names of the columns to standardize.

    Attributes
    ----------
    scaling_params : dict
        Set by ``fit``. Maps each feature name to ``[mean, std]``.
    """

    def __init__(self, features=('floors_diff', 'height_diff', 'age', 'area')):
        # Copy into a fresh list so instances never share the default value.
        self.features = list(features)

    def fit(self, data, y=None):
        """Compute per-column mean and standard deviation from ``data``.

        Returns ``self`` so the call can be chained.
        """
        self.scaling_params = dict()
        for feature in self.features:
            mean = np.mean(data[feature])
            std = np.std(data[feature])
            # A constant column has zero spread; dividing by it would turn
            # the whole column into NaN. Use 1.0 so it is only centred.
            if std == 0:
                std = 1.0
            self.scaling_params[feature] = [mean, std]
        return self

    def transform(self, data):
        """Return a copy of ``data`` with the listed columns standardized.

        Must be called after ``fit``, which creates ``scaling_params``.
        """
        d = data.copy()
        for feature in self.features:
            mean, std = self.scaling_params[feature]
            d[feature] = (d[feature] - mean) / std
        return d
    
    
# 对分类特征进行独热编码
class OneHotTransform:
    """Pipeline step that one-hot encodes categorical columns.

    ``fit`` learns one ``LabelEncoder`` per listed column. ``transform``
    replaces each listed column with one indicator column per category seen
    during ``fit``, named ``<feature>_<category>``.

    Parameters
    ----------
    features : sequence of str, optional
        Columns to encode. When omitted, the six categorical columns listed
        in ``__init__`` are used.

    Attributes
    ----------
    encoders : dict
        Maps each feature name to its fitted ``LabelEncoder``.
    """

    def __init__(self, features=None):
        # ``is None`` rather than ``== None``: the equality test is evaluated
        # element-wise for array-like inputs and then fails inside ``if``.
        if features is None:
            self.features = ['roof_type',
                             'ground_floor_type',
                             'land_condition',
                             'foundation_type',
                             'position',
                             'district_id']
        else:
            self.features = features
        self.encoders = dict()

    def fit(self, data, y=None):
        """Fit one ``LabelEncoder`` per feature on ``data``; return ``self``."""
        for feature in self.features:
            le = LabelEncoder()
            le.fit(data[feature])
            self.encoders[feature] = le
        return self

    def transform(self, data):
        """Return a copy of ``data`` with the features one-hot encoded.

        The original categorical columns are dropped from the result.
        """
        d = data.copy()
        for feature in self.features:
            le = self.encoders[feature]
            codes = le.transform(d[feature]).reshape(-1, 1)
            # Pin the categories to every code learned in ``fit``. With
            # ``categories='auto'`` the encoder only emits columns for codes
            # present in *this* frame, so a frame missing a category would
            # not line up with the column names built from ``le.classes_``.
            ohe = OneHotEncoder(categories=[np.arange(len(le.classes_))])
            onehot_array = ohe.fit_transform(codes)
            onehot_columns = ['%s_%s' % (feature, c) for c in le.classes_]
            # Reuse ``d``'s index so the column-wise concat below aligns row
            # by row even when ``data`` does not have a default RangeIndex.
            onehot_df = pd.DataFrame(data=onehot_array.toarray(),
                                     columns=onehot_columns,
                                     index=d.index)
            d = pd.concat([d, onehot_df], axis=1)
        return d.drop(self.features, axis=1)
        
    
# 选择特征 
class Selector:
    """Pipeline step that keeps only the listed columns.

    Parameters
    ----------
    features : iterable of str
        Names of the columns to keep, in output order.
    """

    def __init__(self, features=('floors_diff', 'height_diff', 'age', 'area')):
        # Copy into a fresh list: instances never share the default value,
        # and a tuple argument is not mistaken by pandas for a single key.
        self.features = list(features)

    def fit(self, data, y=None):
        """Return ``self`` unchanged; this step learns nothing from ``data``."""
        return self

    def transform(self, data):
        """Return a copy of ``data`` restricted to the selected columns."""
        return data[self.features].copy()
    
    
if __name__ == '__main__':
    # Load the training and test sets from the working directory.
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')

    # Columns kept by the selector step: numeric features first, then the
    # categorical ones that the one-hot step encodes by default.
    model_features = ['floors_after',
                      'height_after',
                      'floors_diff',
                      'height_diff',
                      'age',
                      'area',
                      'roof_type',
                      'ground_floor_type',
                      'land_condition',
                      'foundation_type',
                      'position',
                      'district_id']

    # Define the preprocessing pipeline.
    process_pipe = Pipeline([
        ('get_diff', GetDiff(['floors', 'height'])),
        ('scale_features', Scale(['floors_diff', 'height_diff', 'age', 'area'])),
        ('selector', Selector(model_features)),
        ('one_hot', OneHotTransform()),
    ])

    # Fit the pipeline on the training set, then process both sets with it.
    process_pipe = process_pipe.fit(train)
    train_processed = process_pipe.transform(train)
    test_processed = process_pipe.transform(test)

    # Train the classifier. The numeric columns are dropped so that only the
    # one-hot indicator columns are fed to the Bernoulli model.
    cols_drop = ['floors_after', 'height_after', 'floors_diff', 'height_diff', 'age', 'area']
    # Use the empirical class frequencies as priors. They are derived from
    # the labels actually present (sorted, matching the order of
    # ``nb.classes_``) instead of assuming the labels are exactly 0..3.
    labels = np.sort(train['y'].unique())
    nb = BernoulliNB(class_prior=[(train['y'] == label).mean() for label in labels])
    nb.fit(train_processed.drop(cols_drop, axis=1), train['y'])
    pred = nb.predict_proba(test_processed.drop(cols_drop, axis=1))

    # Write the two most probable classes per row, most probable first.
    # argsort yields column positions of ``pred``; map them through
    # ``classes_`` so the file holds labels even if they are not 0..n-1.
    top2 = pred.argsort()[:, -2:][:, ::-1]
    pred_df = pd.DataFrame(data=nb.classes_[top2], columns=['y1', 'y2'])
    pred_df.to_csv('my_nb_predicitions.csv', index=False)



LightGBM分类模型(Python3)

该模型预测结果的MAP@2为:0.78856

# -*- coding: utf-8 -*-

# 引用模块
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier


# 定义Pipeline中的各个环节

# 对两个特征取差值
class GetDiff:
    """Pipeline step that adds ``<feature>_diff`` columns.

    For every name in ``features`` the step computes
    ``<feature>_before - <feature>_after`` and stores the result in a new
    column named ``<feature>_diff``.

    Parameters
    ----------
    features : iterable of str
        Base feature names. The frame passed to ``transform`` must contain
        both ``<feature>_before`` and ``<feature>_after`` for each name.
    """

    def __init__(self, features=('floors', 'height')):
        # Copy into a fresh list so instances never share the default value
        # (or the caller's list) and cannot leak mutations into each other.
        self.features = list(features)

    def fit(self, data, y=None):
        """Return ``self`` unchanged; this step learns nothing from ``data``."""
        return self

    def transform(self, data):
        """Return a copy of ``data`` with one ``*_diff`` column per feature."""
        d = data.copy()
        for feature in self.features:
            before = d['%s_before' % feature]
            after = d['%s_after' % feature]
            d['%s_diff' % feature] = before - after
        return d
    
# 对数值特征进行标准化
class Scale:
    """Pipeline step that standardizes numeric columns.

    ``fit`` records the mean and standard deviation of every listed column;
    ``transform`` subtracts that mean and divides by that standard deviation.

    Parameters
    ----------
    features : iterable of str
        Names of the columns to standardize.

    Attributes
    ----------
    scaling_params : dict
        Set by ``fit``. Maps each feature name to ``[mean, std]``.
    """

    def __init__(self, features=('floors_diff', 'height_diff', 'age', 'area')):
        # Copy into a fresh list so instances never share the default value.
        self.features = list(features)

    def fit(self, data, y=None):
        """Compute per-column mean and standard deviation from ``data``.

        Returns ``self`` so the call can be chained.
        """
        self.scaling_params = dict()
        for feature in self.features:
            mean = np.mean(data[feature])
            std = np.std(data[feature])
            # A constant column has zero spread; dividing by it would turn
            # the whole column into NaN. Use 1.0 so it is only centred.
            if std == 0:
                std = 1.0
            self.scaling_params[feature] = [mean, std]
        return self

    def transform(self, data):
        """Return a copy of ``data`` with the listed columns standardized.

        Must be called after ``fit``, which creates ``scaling_params``.
        """
        d = data.copy()
        for feature in self.features:
            mean, std = self.scaling_params[feature]
            d[feature] = (d[feature] - mean) / std
        return d
    
    
# 对分类特征进行独热编码
class OneHotTransform:
    """Pipeline step that one-hot encodes categorical columns.

    ``fit`` learns one ``LabelEncoder`` per listed column. ``transform``
    replaces each listed column with one indicator column per category seen
    during ``fit``, named ``<feature>_<category>``.

    Parameters
    ----------
    features : sequence of str, optional
        Columns to encode. When omitted, the six categorical columns listed
        in ``__init__`` are used.

    Attributes
    ----------
    encoders : dict
        Maps each feature name to its fitted ``LabelEncoder``.
    """

    def __init__(self, features=None):
        # ``is None`` rather than ``== None``: the equality test is evaluated
        # element-wise for array-like inputs and then fails inside ``if``.
        if features is None:
            self.features = ['roof_type',
                             'ground_floor_type',
                             'land_condition',
                             'foundation_type',
                             'position',
                             'district_id']
        else:
            self.features = features
        self.encoders = dict()

    def fit(self, data, y=None):
        """Fit one ``LabelEncoder`` per feature on ``data``; return ``self``."""
        for feature in self.features:
            le = LabelEncoder()
            le.fit(data[feature])
            self.encoders[feature] = le
        return self

    def transform(self, data):
        """Return a copy of ``data`` with the features one-hot encoded.

        The original categorical columns are dropped from the result.
        """
        d = data.copy()
        for feature in self.features:
            le = self.encoders[feature]
            codes = le.transform(d[feature]).reshape(-1, 1)
            # Pin the categories to every code learned in ``fit``. With
            # ``categories='auto'`` the encoder only emits columns for codes
            # present in *this* frame, so a frame missing a category would
            # not line up with the column names built from ``le.classes_``.
            ohe = OneHotEncoder(categories=[np.arange(len(le.classes_))])
            onehot_array = ohe.fit_transform(codes)
            onehot_columns = ['%s_%s' % (feature, c) for c in le.classes_]
            # Reuse ``d``'s index so the column-wise concat below aligns row
            # by row even when ``data`` does not have a default RangeIndex.
            onehot_df = pd.DataFrame(data=onehot_array.toarray(),
                                     columns=onehot_columns,
                                     index=d.index)
            d = pd.concat([d, onehot_df], axis=1)
        return d.drop(self.features, axis=1)
        
    
# 选择特征 
class Selector:
    """Pipeline step that keeps only the listed columns.

    Parameters
    ----------
    features : iterable of str
        Names of the columns to keep, in output order.
    """

    def __init__(self, features=('floors_diff', 'height_diff', 'age', 'area')):
        # Copy into a fresh list: instances never share the default value,
        # and a tuple argument is not mistaken by pandas for a single key.
        self.features = list(features)

    def fit(self, data, y=None):
        """Return ``self`` unchanged; this step learns nothing from ``data``."""
        return self

    def transform(self, data):
        """Return a copy of ``data`` restricted to the selected columns."""
        return data[self.features].copy()


if __name__ == '__main__':
    # Load the training and test sets from the working directory.
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')

    # Columns kept by the selector step: numeric features first, then the
    # categorical ones that the one-hot step encodes by default.
    model_features = ['floors_after',
                      'height_after',
                      'floors_diff',
                      'height_diff',
                      'age',
                      'area',
                      'roof_type',
                      'ground_floor_type',
                      'land_condition',
                      'foundation_type',
                      'position',
                      'district_id']

    # Define the preprocessing pipeline.
    process_pipe = Pipeline([
        ('get_diff', GetDiff(['floors', 'height'])),
        ('scale_features', Scale(['floors_diff', 'height_diff', 'age', 'area'])),
        ('selector', Selector(model_features)),
        ('one_hot', OneHotTransform()),
    ])

    # Fit the pipeline on the training set, then process both sets with it.
    process_pipe = process_pipe.fit(train)
    train_processed = process_pipe.transform(train)
    test_processed = process_pipe.transform(test)

    # Train the classifier on every processed column and predict class
    # probabilities for the test set.
    clf = LGBMClassifier(max_depth=10, n_estimators=1000, learning_rate=0.1)
    clf.fit(train_processed, train['y'])
    pred = clf.predict_proba(test_processed)

    # Write the two most probable classes per row, most probable first.
    # argsort yields column positions of ``pred``; map them through
    # ``classes_`` so the file holds labels even if they are not 0..n-1.
    top2 = pred.argsort()[:, -2:][:, ::-1]
    pred_df = pd.DataFrame(data=clf.classes_[top2], columns=['y1', 'y2'])
    # NOTE(review): the file name says "lr" although the model is LightGBM;
    # kept unchanged in case something downstream expects this exact name.
    pred_df.to_csv('my_lr_predicitions.csv', index=False)