In [238]:
import pandas as pd
pd.set_option('display.max_columns', 50000)
import numpy as np
import csv
import json
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [239]:
def log1p(vec):
    return np.log1p(abs(vec))

def expm1(x):
    return np.expm1(x)

def clipExp(vec):
    return np.clip(expm1(vec), 0, None)

In [240]:
df = pd.read_csv(filepath_or_buffer="./dataset/train.csv")
df.head()

Unnamed: 0,Deal_title,Lead_name,Industry,Deal_value,Weighted_amount,Date_of_creation,Pitch,Contact_no,Lead_revenue,Fund_category,Geography,Location,POC_name,Designation,Lead_POC_email,Hiring_candidate_role,Lead_source,Level_of_meeting,Last_lead_update,Internal_POC,Resource,Internal_rating,Success_probability
0,TitleM5DZY,"Davis, Perkins and Bishop Inc",Restaurants,320506$,2067263.7$,2020-03-29,Product_2,607.447.7883,50 - 100 Million,Category 2,USA,"Killeen-Temple, TX",Charlene Werner,Executive Vice President,charlenewerner@davis.com,Community pharmacist,Website,Level 3,No track,"Davis,Sharrice A",,3,73.6
1,TitleKIW18,Bender PLC LLC,Construction Services,39488$,240876.8$,2019-07-10,Product_2,892-938-9493,500 Million - 1 Billion,Category 4,India,Ratlam,rakhi,Chairman/CEO/President,terrylogan@bender.com,Recruitment consultant,Others,Level 1,Did not hear back after Level 1,"Brown,Maxine A",No,5,58.9
2,TitleFXSDN,Carter-Henry and Sons,Hospitals/Clinics,359392$,2407926.4$,2019-07-27,Product_1,538.748.2271,500 Million - 1 Billion,Category 4,USA,"Albany-Schenectady-Troy, NY",Ariel Hamilton,SVP/General Counsel,arielhamilton@carterhenry.com,Health service manager,Marketing Event,Level 1,?,"Georgakopoulos,Vasilios T",No,4,68.8
3,TitlePSK4Y,Garcia Ltd Ltd,Real Estate,76774$,468321.4$,2021-01-30,Product_2,(692)052-1389x75188,500 Million - 1 Billion,Category 3,USA,"Mount Vernon-Anacortes, WA",Erin Wilson,CEO/Co-Founder/Chairman,erinwilson@garcia.com,"Therapist, speech and language",Contact Email,Level 2,Did not hear back after Level 1,"Brown,Maxine A",We have all the requirements,1,64.5
4,Title904GV,Lee and Sons PLC,Financial Services,483896$,,2019-05-22,Product_2,001-878-814-6134x015,50 - 100 Million,Category 3,India,Shimoga,kavita,Executive Vice President,mr.christopher@lee.com,Media planner,Website,Level 2,Up-to-date,"Thomas,Lori E",No,4,62.4


In [241]:
df.describe()

Unnamed: 0,Internal_rating,Success_probability
count,7007.0,7007.0
mean,3.009562,64.745133
std,1.418666,17.931635
min,1.0,-5.0
25%,2.0,60.6
50%,3.0,65.3
75%,4.0,69.6
max,5.0,107.34


In [242]:
df.dtypes

Deal_title                object
Lead_name                 object
Industry                  object
Deal_value                object
Weighted_amount           object
Date_of_creation          object
Pitch                     object
Contact_no                object
Lead_revenue              object
Fund_category             object
Geography                 object
Location                  object
POC_name                  object
Designation               object
Lead_POC_email            object
Hiring_candidate_role     object
Lead_source               object
Level_of_meeting          object
Last_lead_update          object
Internal_POC              object
Resource                  object
Internal_rating            int64
Success_probability      float64
dtype: object

In [243]:
def dropInitialRedundantFeatures(df):
    redundantColumns = ["Lead_name","Contact_no","Location","POC_name","Lead_POC_email","Internal_POC","Designation","Hiring_candidate_role"]
    df = df.drop(redundantColumns,axis=1)
    return df
df = dropInitialRedundantFeatures(df)

In [244]:
def handleAmountColumns(df):
    
    def myAmountPreProcess(amount):
        if not pd.isnull(amount) or not pd.isna(amount):
            amount = amount.replace("$", "")
            
        return amount
        
    df["Deal_value"] = df.apply(lambda row : myAmountPreProcess(row["Deal_value"]),axis=1)
    
    df["Weighted_amount"] = df.apply(lambda row : myAmountPreProcess(row["Weighted_amount"]),axis=1)
    
    
    df["Deal_value"] = pd.to_numeric(df["Deal_value"])
    df["Weighted_amount"] = pd.to_numeric(df["Weighted_amount"])
    
    df["Deal_value"] = df["Deal_value"].fillna(df["Deal_value"].median())
    df["Weighted_amount"] = df["Weighted_amount"].fillna(df["Weighted_amount"].median())
    
    return df
df = handleAmountColumns(df)

In [245]:
def handleDatesColumns(df):
    
    df['Date_of_creation']  = pd.to_datetime(df['Date_of_creation'],format="%Y-%m-%d")
    
    df["creation_year"] = df["Date_of_creation"].dt.year
    
    df = df.drop(["Date_of_creation"],axis=1)
    
    return df

df = handleDatesColumns(df)

In [246]:
df.head()

Unnamed: 0,Deal_title,Industry,Deal_value,Weighted_amount,Pitch,Lead_revenue,Fund_category,Geography,Lead_source,Level_of_meeting,Last_lead_update,Resource,Internal_rating,Success_probability,creation_year
0,TitleM5DZY,Restaurants,320506.0,2067263.7,Product_2,50 - 100 Million,Category 2,USA,Website,Level 3,No track,,3,73.6,2020
1,TitleKIW18,Construction Services,39488.0,240876.8,Product_2,500 Million - 1 Billion,Category 4,India,Others,Level 1,Did not hear back after Level 1,No,5,58.9,2019
2,TitleFXSDN,Hospitals/Clinics,359392.0,2407926.4,Product_1,500 Million - 1 Billion,Category 4,USA,Marketing Event,Level 1,?,No,4,68.8,2019
3,TitlePSK4Y,Real Estate,76774.0,468321.4,Product_2,500 Million - 1 Billion,Category 3,USA,Contact Email,Level 2,Did not hear back after Level 1,We have all the requirements,1,64.5,2021
4,Title904GV,Financial Services,483896.0,1548238.95,Product_2,50 - 100 Million,Category 3,India,Website,Level 2,Up-to-date,No,4,62.4,2019


In [247]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7007 entries, 0 to 7006
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Deal_title           7007 non-null   object 
 1   Industry             7006 non-null   object 
 2   Deal_value           7007 non-null   float64
 3   Weighted_amount      7007 non-null   float64
 4   Pitch                7007 non-null   object 
 5   Lead_revenue         7007 non-null   object 
 6   Fund_category        7007 non-null   object 
 7   Geography            6035 non-null   object 
 8   Lead_source          7007 non-null   object 
 9   Level_of_meeting     7007 non-null   object 
 10  Last_lead_update     6374 non-null   object 
 11  Resource             6858 non-null   object 
 12  Internal_rating      7007 non-null   int64  
 13  Success_probability  7007 non-null   float64
 14  creation_year        7007 non-null   int64  
dtypes: float64(3), int64(2), object(10)
me

In [248]:
print(df["Industry"].value_counts())
print("-"*50)

print(df["Pitch"].value_counts())
print("-"*50)

print(df["Lead_revenue"].value_counts())
print("-"*50)

print(df["Fund_category"].value_counts())
print("-"*50)

print(df["Geography"].value_counts())
print("-"*50)

print(df["Lead_source"].value_counts())
print("-"*50)

print(df["Level_of_meeting"].value_counts())
print("-"*50)

print(df["Last_lead_update"].value_counts())
print("-"*50)

print(df["Resource"].value_counts())
print("-"*50)

Banks                        1480
Insurance                     439
Financial Services            397
Real Estate                   209
Investment Bank/Brokerage     200
                             ... 
Multimedia Hardware             1
Internet                        1
Web Development                 1
BSPs (Broadband)                1
Religious Groups                1
Name: Industry, Length: 171, dtype: int64
--------------------------------------------------
Product_1    3504
Product_2    3503
Name: Pitch, dtype: int64
--------------------------------------------------
500 Million - 1 Billion    2377
100 - 500 Million          2351
50 - 100 Million           2279
Name: Lead_revenue, dtype: int64
--------------------------------------------------
Category 3    1791
Category 2    1785
Category 1    1745
Category 4    1686
Name: Fund_category, dtype: int64
--------------------------------------------------
USA      3061
India    2974
Name: Geography, dtype: int64
---------------------

In [249]:
def handleMissingValues(df):
    
    df["Industry"] = df["Industry"].fillna(df["Industry"].mode()[0]) # most occurred value
    
    df["Resource"] = df["Resource"].fillna(df["Resource"].mode()[0]) # most occurred value
    
    df["Last_lead_update"] = df["Last_lead_update"].fillna(df["Last_lead_update"].mode()[0]) # most occurred value
        
    df["Geography"] = df["Geography"].fillna(df["Geography"].mode()[0]) # most occurred value
    
    return df
df = handleMissingValues(df)

In [250]:
df.isnull().sum()

Deal_title             0
Industry               0
Deal_value             0
Weighted_amount        0
Pitch                  0
Lead_revenue           0
Fund_category          0
Geography              0
Lead_source            0
Level_of_meeting       0
Last_lead_update       0
Resource               0
Internal_rating        0
Success_probability    0
creation_year          0
dtype: int64

In [251]:
# def labelEncoding(df, onlyTransform: False, label_object = {}):
#     categorical_columns = [
#         'Industry',
#         'Lead_revenue',
#         'Last_lead_update'
#     ]
#     for col in categorical_columns:
#         labelencoder = None
#         if onlyTransform is False:
#             labelencoder = LabelEncoder()
#             labelencoder.fit(df[col])
#             label_object[col] = labelencoder
#         else:
#             labelencoder = label_object[col]
            
#         df[col] = labelencoder.transform(df[col])
#     return df, label_object

# df, label_object = labelEncoding(df, onlyTransform= False, label_object= {})
# df.head()

In [252]:
def oneHotEncoding(df, onlyTransform=False, one_hot_object = {}):
    categorical_columns = [
        "Industry",
        "Pitch",
        "Lead_revenue",
        "Fund_category",
        "Geography",
        "Lead_source",
        "Level_of_meeting",
        "Last_lead_update",
        "Resource"
    ]
    for col in categorical_columns:
        
        col_data  = df[col].values.reshape(-1,1)
        
        if onlyTransform is False:
            onehot_encoder = OneHotEncoder()
            onehot_encoder = onehot_encoder.fit(col_data)
            one_hot_object[col] = onehot_encoder
        
        onehot_encoded = one_hot_object[col].transform(col_data).toarray()
        columns_list = []
        for i in range(onehot_encoded.shape[1]):
            columns_list.append(col+"_"+str(int(i)))
        dfOneHot = pd.DataFrame(onehot_encoded, columns = columns_list) 
        df = pd.concat([df, dfOneHot], axis=1)
        

    df = df.drop(categorical_columns,axis=1)
    return df, one_hot_object


df, one_hot_object = oneHotEncoding(df, onlyTransform=False, one_hot_object = {})
df.head()

Unnamed: 0,Deal_title,Deal_value,Weighted_amount,Internal_rating,Success_probability,creation_year,Industry_0,Industry_1,Industry_2,Industry_3,Industry_4,Industry_5,Industry_6,Industry_7,Industry_8,Industry_9,Industry_10,Industry_11,Industry_12,Industry_13,Industry_14,Industry_15,Industry_16,Industry_17,Industry_18,Industry_19,Industry_20,Industry_21,Industry_22,Industry_23,Industry_24,Industry_25,Industry_26,Industry_27,Industry_28,Industry_29,Industry_30,Industry_31,Industry_32,Industry_33,Industry_34,Industry_35,Industry_36,Industry_37,Industry_38,Industry_39,Industry_40,Industry_41,Industry_42,Industry_43,Industry_44,Industry_45,Industry_46,Industry_47,Industry_48,Industry_49,Industry_50,Industry_51,Industry_52,Industry_53,Industry_54,Industry_55,Industry_56,Industry_57,Industry_58,Industry_59,Industry_60,Industry_61,Industry_62,Industry_63,Industry_64,Industry_65,Industry_66,Industry_67,Industry_68,Industry_69,Industry_70,Industry_71,Industry_72,Industry_73,Industry_74,Industry_75,Industry_76,Industry_77,Industry_78,Industry_79,Industry_80,Industry_81,Industry_82,Industry_83,Industry_84,Industry_85,Industry_86,Industry_87,Industry_88,Industry_89,Industry_90,Industry_91,Industry_92,Industry_93,Industry_94,Industry_95,Industry_96,Industry_97,Industry_98,Industry_99,Industry_100,Industry_101,Industry_102,Industry_103,Industry_104,Industry_105,Industry_106,Industry_107,Industry_108,Industry_109,Industry_110,Industry_111,Industry_112,Industry_113,Industry_114,Industry_115,Industry_116,Industry_117,Industry_118,Industry_119,Industry_120,Industry_121,Industry_122,Industry_123,Industry_124,Industry_125,Industry_126,Industry_127,Industry_128,Industry_129,Industry_130,Industry_131,Industry_132,Industry_133,Industry_134,Industry_135,Industry_136,Industry_137,Industry_138,Industry_139,Industry_140,Industry_141,Industry_142,Industry_143,Industry_144,Industry_145,Industry_146,Industry_147,Industry_148,Industry_149,Industry_150,Industry_151,Industry_152,Industry_153,Industry_154,Industry_155,Industry_156,Industry_157,Industry_158,Industry_159,Industry_160,Industry_161,Industry_162,Industry_163,Industry_164,Industry_165,Industry_166,Industry_167,Industry_168,Industry_169,Industry_170,Pitch_0,Pitch_1,Lead_revenue_0,Lead_revenue_1,Lead_revenue_2,Fund_category_0,Fund_category_1,Fund_category_2,Fund_category_3,Geography_0,Geography_1,Lead_source_0,Lead_source_1,Lead_source_2,Lead_source_3,Level_of_meeting_0,Level_of_meeting_1,Level_of_meeting_2,Last_lead_update_0,Last_lead_update_1,Last_lead_update_2,Last_lead_update_3,Last_lead_update_4,Last_lead_update_5,Last_lead_update_6,Last_lead_update_7,Last_lead_update_8,Last_lead_update_9,Last_lead_update_10,Resource_0,Resource_1,Resource_2,Resource_3,Resource_4,Resource_5
0,TitleM5DZY,320506.0,2067263.7,3,73.6,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,TitleKIW18,39488.0,240876.8,5,58.9,2019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,TitleFXSDN,359392.0,2407926.4,4,68.8,2019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,TitlePSK4Y,76774.0,468321.4,1,64.5,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Title904GV,483896.0,1548238.95,4,62.4,2019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [253]:
df['Success_probability'] = np.log1p(abs(df['Success_probability']))

y = pd.DataFrame(df["Success_probability"])
df = df.drop("Success_probability", axis=1)

df = df.drop("Deal_title",axis=1)

X = pd.DataFrame(df)
print(X.shape)
print(y.shape)

(7007, 210)
(7007, 1)


In [254]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.1)


X_train = X
y_train = y

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(7007, 210)
(7007, 1)
(701, 210)
(701, 1)


In [255]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler



# scalerY = StandardScaler()
# scalerY = scalerY.fit(y_train)
# y_train = scalerY.transform(y_train)
# y_train = pd.DataFrame(y_train)

# y_test = scalerY.transform(y_test)
# y_test = pd.DataFrame(y_test)




scalerX = StandardScaler()
scalerX = scalerX.fit(X_train)
X_train = scalerX.transform(X_train)
X_train = pd.DataFrame(X_train)

X_test = scalerX.transform(X_test)
X_test = pd.DataFrame(X_test)

In [256]:
pca_components_per = 0.06
from sklearn.decomposition import PCA
pca = PCA(pca_components_per)
pca = pca.fit(X_train)

X_train = pca.transform(X_train)

X_test = pca.transform(X_test)

In [257]:
from catboost import CatBoostRegressor
params = {'depth': 10, 'n_estimators': 1000, 'learning_rate': 0.07,'loss_function':'RMSE'}
cat = CatBoostRegressor(random_state = 1, **params)

In [258]:

cat.fit(X_train,y_train, eval_set=(X_test,y_test),use_best_model=True,plot=True)
y_test_predict  = cat.predict(X_test)

from sklearn import metrics
actual = np.expm1(y_test)
predicted = np.expm1(y_test_predict)
score = 100*max(0, 1-metrics.mean_squared_error(actual, predicted))
score

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.3933526	test: 0.4074913	best: 0.4074913 (0)	total: 30.4ms	remaining: 30.4s
1:	learn: 0.3919059	test: 0.4055589	best: 0.4055589 (1)	total: 59.9ms	remaining: 29.9s
2:	learn: 0.3907690	test: 0.4048175	best: 0.4048175 (2)	total: 89.8ms	remaining: 29.8s
3:	learn: 0.3891912	test: 0.4032682	best: 0.4032682 (3)	total: 119ms	remaining: 29.5s
4:	learn: 0.3881618	test: 0.4019274	best: 0.4019274 (4)	total: 148ms	remaining: 29.4s
5:	learn: 0.3870440	test: 0.4011347	best: 0.4011347 (5)	total: 177ms	remaining: 29.3s
6:	learn: 0.3858629	test: 0.4001063	best: 0.4001063 (6)	total: 206ms	remaining: 29.2s
7:	learn: 0.3850129	test: 0.3994154	best: 0.3994154 (7)	total: 235ms	remaining: 29.1s
8:	learn: 0.3842092	test: 0.3979431	best: 0.3979431 (8)	total: 266ms	remaining: 29.3s
9:	learn: 0.3829454	test: 0.3961574	best: 0.3961574 (9)	total: 297ms	remaining: 29.4s
10:	learn: 0.3822526	test: 0.3948946	best: 0.3948946 (10)	total: 327ms	remaining: 29.4s
11:	learn: 0.3815246	test: 0.3942619	best: 0.3942

97:	learn: 0.3320152	test: 0.3374269	best: 0.3374269 (97)	total: 3.02s	remaining: 27.8s
98:	learn: 0.3313092	test: 0.3366551	best: 0.3366551 (98)	total: 3.04s	remaining: 27.7s
99:	learn: 0.3309687	test: 0.3362704	best: 0.3362704 (99)	total: 3.08s	remaining: 27.7s
100:	learn: 0.3306168	test: 0.3361237	best: 0.3361237 (100)	total: 3.11s	remaining: 27.7s
101:	learn: 0.3302111	test: 0.3354966	best: 0.3354966 (101)	total: 3.14s	remaining: 27.6s
102:	learn: 0.3299664	test: 0.3351188	best: 0.3351188 (102)	total: 3.17s	remaining: 27.6s
103:	learn: 0.3298044	test: 0.3350066	best: 0.3350066 (103)	total: 3.2s	remaining: 27.6s
104:	learn: 0.3294089	test: 0.3342630	best: 0.3342630 (104)	total: 3.23s	remaining: 27.5s
105:	learn: 0.3290799	test: 0.3339849	best: 0.3339849 (105)	total: 3.26s	remaining: 27.5s
106:	learn: 0.3283973	test: 0.3331279	best: 0.3331279 (106)	total: 3.29s	remaining: 27.5s
107:	learn: 0.3278378	test: 0.3326522	best: 0.3326522 (107)	total: 3.32s	remaining: 27.4s
108:	learn: 0.327

191:	learn: 0.2977387	test: 0.3025435	best: 0.3025435 (191)	total: 5.89s	remaining: 24.8s
192:	learn: 0.2972451	test: 0.3018566	best: 0.3018566 (192)	total: 5.93s	remaining: 24.8s
193:	learn: 0.2969718	test: 0.3016247	best: 0.3016247 (193)	total: 5.96s	remaining: 24.8s
194:	learn: 0.2966424	test: 0.3011672	best: 0.3011672 (194)	total: 5.99s	remaining: 24.7s
195:	learn: 0.2963673	test: 0.3007048	best: 0.3007048 (195)	total: 6.02s	remaining: 24.7s
196:	learn: 0.2958920	test: 0.3003465	best: 0.3003465 (196)	total: 6.05s	remaining: 24.7s
197:	learn: 0.2956811	test: 0.3001499	best: 0.3001499 (197)	total: 6.08s	remaining: 24.6s
198:	learn: 0.2952755	test: 0.2999777	best: 0.2999777 (198)	total: 6.11s	remaining: 24.6s
199:	learn: 0.2950391	test: 0.2993725	best: 0.2993725 (199)	total: 6.14s	remaining: 24.6s
200:	learn: 0.2947729	test: 0.2991973	best: 0.2991973 (200)	total: 6.17s	remaining: 24.5s
201:	learn: 0.2943471	test: 0.2987113	best: 0.2987113 (201)	total: 6.2s	remaining: 24.5s
202:	learn:

283:	learn: 0.2709811	test: 0.2766143	best: 0.2766143 (283)	total: 8.72s	remaining: 22s
284:	learn: 0.2707481	test: 0.2764363	best: 0.2764363 (284)	total: 8.75s	remaining: 21.9s
285:	learn: 0.2703909	test: 0.2761132	best: 0.2761132 (285)	total: 8.78s	remaining: 21.9s
286:	learn: 0.2700635	test: 0.2759260	best: 0.2759260 (286)	total: 8.81s	remaining: 21.9s
287:	learn: 0.2698013	test: 0.2754122	best: 0.2754122 (287)	total: 8.84s	remaining: 21.8s
288:	learn: 0.2695704	test: 0.2749322	best: 0.2749322 (288)	total: 8.87s	remaining: 21.8s
289:	learn: 0.2692305	test: 0.2747270	best: 0.2747270 (289)	total: 8.9s	remaining: 21.8s
290:	learn: 0.2689942	test: 0.2745449	best: 0.2745449 (290)	total: 8.94s	remaining: 21.8s
291:	learn: 0.2686733	test: 0.2741931	best: 0.2741931 (291)	total: 8.97s	remaining: 21.7s
292:	learn: 0.2682819	test: 0.2740568	best: 0.2740568 (292)	total: 9s	remaining: 21.7s
293:	learn: 0.2678735	test: 0.2738038	best: 0.2738038 (293)	total: 9.03s	remaining: 21.7s
294:	learn: 0.26

381:	learn: 0.2434671	test: 0.2488756	best: 0.2488756 (381)	total: 11.8s	remaining: 19s
382:	learn: 0.2431480	test: 0.2484775	best: 0.2484775 (382)	total: 11.8s	remaining: 19s
383:	learn: 0.2429742	test: 0.2484602	best: 0.2484602 (383)	total: 11.8s	remaining: 19s
384:	learn: 0.2427860	test: 0.2482133	best: 0.2482133 (384)	total: 11.8s	remaining: 18.9s
385:	learn: 0.2424858	test: 0.2481145	best: 0.2481145 (385)	total: 11.9s	remaining: 18.9s
386:	learn: 0.2421472	test: 0.2477985	best: 0.2477985 (386)	total: 11.9s	remaining: 18.9s
387:	learn: 0.2418945	test: 0.2475856	best: 0.2475856 (387)	total: 11.9s	remaining: 18.8s
388:	learn: 0.2416192	test: 0.2471844	best: 0.2471844 (388)	total: 12s	remaining: 18.8s
389:	learn: 0.2414516	test: 0.2471079	best: 0.2471079 (389)	total: 12s	remaining: 18.8s
390:	learn: 0.2412821	test: 0.2469070	best: 0.2469070 (390)	total: 12s	remaining: 18.7s
391:	learn: 0.2410522	test: 0.2465413	best: 0.2465413 (391)	total: 12.1s	remaining: 18.7s
392:	learn: 0.2409151	

474:	learn: 0.2213610	test: 0.2241738	best: 0.2241738 (474)	total: 14.6s	remaining: 16.2s
475:	learn: 0.2211007	test: 0.2238260	best: 0.2238260 (475)	total: 14.7s	remaining: 16.1s
476:	learn: 0.2209324	test: 0.2235443	best: 0.2235443 (476)	total: 14.7s	remaining: 16.1s
477:	learn: 0.2207594	test: 0.2232156	best: 0.2232156 (477)	total: 14.7s	remaining: 16.1s
478:	learn: 0.2206472	test: 0.2232080	best: 0.2232080 (478)	total: 14.7s	remaining: 16s
479:	learn: 0.2204644	test: 0.2229155	best: 0.2229155 (479)	total: 14.8s	remaining: 16s
480:	learn: 0.2202612	test: 0.2227065	best: 0.2227065 (480)	total: 14.8s	remaining: 16s
481:	learn: 0.2201260	test: 0.2225636	best: 0.2225636 (481)	total: 14.8s	remaining: 15.9s
482:	learn: 0.2197939	test: 0.2224408	best: 0.2224408 (482)	total: 14.9s	remaining: 15.9s
483:	learn: 0.2195875	test: 0.2223002	best: 0.2223002 (483)	total: 14.9s	remaining: 15.9s
484:	learn: 0.2193762	test: 0.2221395	best: 0.2221395 (484)	total: 14.9s	remaining: 15.9s
485:	learn: 0.21

573:	learn: 0.2008404	test: 0.2032467	best: 0.2032467 (573)	total: 17.7s	remaining: 13.1s
574:	learn: 0.2006733	test: 0.2030154	best: 0.2030154 (574)	total: 17.7s	remaining: 13.1s
575:	learn: 0.2004597	test: 0.2027504	best: 0.2027504 (575)	total: 17.8s	remaining: 13.1s
576:	learn: 0.2003270	test: 0.2026043	best: 0.2026043 (576)	total: 17.8s	remaining: 13s
577:	learn: 0.2001515	test: 0.2025157	best: 0.2025157 (577)	total: 17.8s	remaining: 13s
578:	learn: 0.1999354	test: 0.2022173	best: 0.2022173 (578)	total: 17.9s	remaining: 13s
579:	learn: 0.1996120	test: 0.2019809	best: 0.2019809 (579)	total: 17.9s	remaining: 12.9s
580:	learn: 0.1994112	test: 0.2018239	best: 0.2018239 (580)	total: 17.9s	remaining: 12.9s
581:	learn: 0.1992002	test: 0.2014414	best: 0.2014414 (581)	total: 17.9s	remaining: 12.9s
582:	learn: 0.1990718	test: 0.2013983	best: 0.2013983 (582)	total: 18s	remaining: 12.9s
583:	learn: 0.1989750	test: 0.2013648	best: 0.2013648 (583)	total: 18s	remaining: 12.8s
584:	learn: 0.198885

670:	learn: 0.1832886	test: 0.1872322	best: 0.1872322 (670)	total: 20.7s	remaining: 10.1s
671:	learn: 0.1831384	test: 0.1872034	best: 0.1872034 (671)	total: 20.7s	remaining: 10.1s
672:	learn: 0.1829240	test: 0.1869961	best: 0.1869961 (672)	total: 20.8s	remaining: 10.1s
673:	learn: 0.1827446	test: 0.1868973	best: 0.1868973 (673)	total: 20.8s	remaining: 10.1s
674:	learn: 0.1825409	test: 0.1867952	best: 0.1867952 (674)	total: 20.8s	remaining: 10s
675:	learn: 0.1823115	test: 0.1865782	best: 0.1865782 (675)	total: 20.9s	remaining: 9.99s
676:	learn: 0.1821585	test: 0.1863427	best: 0.1863427 (676)	total: 20.9s	remaining: 9.96s
677:	learn: 0.1820051	test: 0.1863117	best: 0.1863117 (677)	total: 20.9s	remaining: 9.93s
678:	learn: 0.1817711	test: 0.1861507	best: 0.1861507 (678)	total: 20.9s	remaining: 9.9s
679:	learn: 0.1816066	test: 0.1861190	best: 0.1861190 (679)	total: 21s	remaining: 9.87s
680:	learn: 0.1813868	test: 0.1857565	best: 0.1857565 (680)	total: 21s	remaining: 9.84s
681:	learn: 0.181

766:	learn: 0.1672385	test: 0.1731867	best: 0.1731867 (766)	total: 23.7s	remaining: 7.2s
767:	learn: 0.1670389	test: 0.1730055	best: 0.1730055 (767)	total: 23.7s	remaining: 7.17s
768:	learn: 0.1668742	test: 0.1728104	best: 0.1728104 (768)	total: 23.8s	remaining: 7.13s
769:	learn: 0.1666620	test: 0.1726079	best: 0.1726079 (769)	total: 23.8s	remaining: 7.1s
770:	learn: 0.1665170	test: 0.1725564	best: 0.1725564 (770)	total: 23.8s	remaining: 7.07s
771:	learn: 0.1663020	test: 0.1722930	best: 0.1722930 (771)	total: 23.8s	remaining: 7.04s
772:	learn: 0.1662105	test: 0.1722230	best: 0.1722230 (772)	total: 23.9s	remaining: 7.01s
773:	learn: 0.1660887	test: 0.1721147	best: 0.1721147 (773)	total: 23.9s	remaining: 6.98s
774:	learn: 0.1658925	test: 0.1719211	best: 0.1719211 (774)	total: 23.9s	remaining: 6.95s
775:	learn: 0.1657593	test: 0.1718322	best: 0.1718322 (775)	total: 24s	remaining: 6.92s
776:	learn: 0.1656215	test: 0.1716418	best: 0.1716418 (776)	total: 24s	remaining: 6.89s
777:	learn: 0.16

864:	learn: 0.1522088	test: 0.1566630	best: 0.1566630 (864)	total: 26.8s	remaining: 4.18s
865:	learn: 0.1520622	test: 0.1564018	best: 0.1564018 (865)	total: 26.8s	remaining: 4.15s
866:	learn: 0.1519496	test: 0.1563103	best: 0.1563103 (866)	total: 26.8s	remaining: 4.12s
867:	learn: 0.1517238	test: 0.1561406	best: 0.1561406 (867)	total: 26.9s	remaining: 4.08s
868:	learn: 0.1515933	test: 0.1560803	best: 0.1560803 (868)	total: 26.9s	remaining: 4.05s
869:	learn: 0.1514464	test: 0.1559206	best: 0.1559206 (869)	total: 26.9s	remaining: 4.02s
870:	learn: 0.1512259	test: 0.1557279	best: 0.1557279 (870)	total: 27s	remaining: 3.99s
871:	learn: 0.1510778	test: 0.1556176	best: 0.1556176 (871)	total: 27s	remaining: 3.96s
872:	learn: 0.1509944	test: 0.1555506	best: 0.1555506 (872)	total: 27s	remaining: 3.93s
873:	learn: 0.1508146	test: 0.1553638	best: 0.1553638 (873)	total: 27s	remaining: 3.9s
874:	learn: 0.1506941	test: 0.1553154	best: 0.1553154 (874)	total: 27.1s	remaining: 3.87s
875:	learn: 0.15059

956:	learn: 0.1393981	test: 0.1439999	best: 0.1439999 (956)	total: 29.8s	remaining: 1.34s
957:	learn: 0.1392568	test: 0.1438979	best: 0.1438979 (957)	total: 29.9s	remaining: 1.31s
958:	learn: 0.1391616	test: 0.1438245	best: 0.1438245 (958)	total: 29.9s	remaining: 1.28s
959:	learn: 0.1390629	test: 0.1437133	best: 0.1437133 (959)	total: 29.9s	remaining: 1.25s
960:	learn: 0.1389598	test: 0.1436754	best: 0.1436754 (960)	total: 29.9s	remaining: 1.22s
961:	learn: 0.1388687	test: 0.1434569	best: 0.1434569 (961)	total: 30s	remaining: 1.18s
962:	learn: 0.1387087	test: 0.1433072	best: 0.1433072 (962)	total: 30s	remaining: 1.15s
963:	learn: 0.1386285	test: 0.1432940	best: 0.1432940 (963)	total: 30s	remaining: 1.12s
964:	learn: 0.1383989	test: 0.1430824	best: 0.1430824 (964)	total: 30.1s	remaining: 1.09s
965:	learn: 0.1382868	test: 0.1430048	best: 0.1430048 (965)	total: 30.1s	remaining: 1.06s
966:	learn: 0.1381061	test: 0.1427764	best: 0.1427764 (966)	total: 30.1s	remaining: 1.03s
967:	learn: 0.13

0

In [259]:
df_test = pd.read_csv("./dataset/test.csv")
df_test.head()

Unnamed: 0,Deal_title,Lead_name,Industry,Deal_value,Weighted_amount,Date_of_creation,Pitch,Contact_no,Lead_revenue,Fund_category,Geography,Location,POC_name,Designation,Lead_POC_email,Hiring_candidate_role,Lead_source,Level_of_meeting,Last_lead_update,Internal_POC,Resource,Internal_rating
0,TitleAD16O,Bonilla Ltd Inc,Investment Bank/Brokerage,200988$,,2020-04-15,Product_1,167.332.2751x989,100 - 500 Million,Category 4,India,Bhubaneshwar,sonia,Chairman/CEO/President,maureenthomas@bonilla.com,"Designer, fashion/clothing",Marketing Event,Level 1,more than a month,"Massiah,Gerard F",No,-1.0
1,TitleOW6CR,"Williams, Rogers and Roach PLC",Electronics,409961$,2541758.2$,2021-01-23,Product_1,001-486-903-0711x7831,100 - 500 Million,Category 3,USA,"Coeur d'Alene, ID",Daniel Bell,CEO/Co-Founder/Chairman,danielbell@williams.com,Horticultural consultant,Marketing Event,Level 2,Up-to-date,"Smith,Keenan H",Yes,1.0
2,TitleVVJQ5,"Wood, Vaughn and Morales Ltd",Banks,434433$,3041031.0$,2020-07-19,Product_1,(393)104-2610x9723,100 - 500 Million,Category 1,USA,"Portland-South Portland, ME",Andrew Davis,Chairman/Chief Innovation Officer,andrewdavis@wood.com,Information officer,Marketing Event,Level 2,Did not hear back after Level 1,"Gilley,Janine",Deliverable,5.0
3,TitleUS8NA,Durham-Crawford Inc,Music,218952$,1521716.4$,2020-02-27,Product_2,(817)040-4599,100 - 500 Million,Category 1,India,Bareilly,shital,CEO/Chairman/President,charlesrivera@durhamcrawford.com,Commercial/residential surveyor,Contact Email,Level 3,more than a month,"Morsy,Omar A",No,5.0
4,Title5VGWW,"Simpson, Duncan and Long LLC",Real Estate,392835$,2455218.75$,2020-10-25,Product_1,718-032-5726x76098,500 Million - 1 Billion,Category 3,USA,"Trenton, NJ",Shelly Stephenson,CEO/Co-Founder/Chairman,shellystephenson@simpson.com,Wellsite geologist,Others,Level 3,More than 2 weeks,"Morsy,Omar A",Deliverable,2.0


In [260]:
df_test = dropInitialRedundantFeatures(df_test)

In [261]:
df_test = handleAmountColumns(df_test)

In [262]:
df_test = handleDatesColumns(df_test)

In [263]:
df_test = handleMissingValues(df_test)

In [264]:
df_test, one_hot_object = oneHotEncoding(df_test, onlyTransform=True, one_hot_object = one_hot_object)

In [265]:
df_test.head()

Unnamed: 0,Deal_title,Deal_value,Weighted_amount,Internal_rating,creation_year,Industry_0,Industry_1,Industry_2,Industry_3,Industry_4,Industry_5,Industry_6,Industry_7,Industry_8,Industry_9,Industry_10,Industry_11,Industry_12,Industry_13,Industry_14,Industry_15,Industry_16,Industry_17,Industry_18,Industry_19,Industry_20,Industry_21,Industry_22,Industry_23,Industry_24,Industry_25,Industry_26,Industry_27,Industry_28,Industry_29,Industry_30,Industry_31,Industry_32,Industry_33,Industry_34,Industry_35,Industry_36,Industry_37,Industry_38,Industry_39,Industry_40,Industry_41,Industry_42,Industry_43,Industry_44,Industry_45,Industry_46,Industry_47,Industry_48,Industry_49,Industry_50,Industry_51,Industry_52,Industry_53,Industry_54,Industry_55,Industry_56,Industry_57,Industry_58,Industry_59,Industry_60,Industry_61,Industry_62,Industry_63,Industry_64,Industry_65,Industry_66,Industry_67,Industry_68,Industry_69,Industry_70,Industry_71,Industry_72,Industry_73,Industry_74,Industry_75,Industry_76,Industry_77,Industry_78,Industry_79,Industry_80,Industry_81,Industry_82,Industry_83,Industry_84,Industry_85,Industry_86,Industry_87,Industry_88,Industry_89,Industry_90,Industry_91,Industry_92,Industry_93,Industry_94,Industry_95,Industry_96,Industry_97,Industry_98,Industry_99,Industry_100,Industry_101,Industry_102,Industry_103,Industry_104,Industry_105,Industry_106,Industry_107,Industry_108,Industry_109,Industry_110,Industry_111,Industry_112,Industry_113,Industry_114,Industry_115,Industry_116,Industry_117,Industry_118,Industry_119,Industry_120,Industry_121,Industry_122,Industry_123,Industry_124,Industry_125,Industry_126,Industry_127,Industry_128,Industry_129,Industry_130,Industry_131,Industry_132,Industry_133,Industry_134,Industry_135,Industry_136,Industry_137,Industry_138,Industry_139,Industry_140,Industry_141,Industry_142,Industry_143,Industry_144,Industry_145,Industry_146,Industry_147,Industry_148,Industry_149,Industry_150,Industry_151,Industry_152,Industry_153,Industry_154,Industry_155,Industry_156,Industry_157,Industry_158,Industry_159,Industry_160,Industry_161,Industry_162,Industry_163,Industry_164,Industry_165,Industry_166,Industry_167,Industry_168,Industry_169,Industry_170,Pitch_0,Pitch_1,Lead_revenue_0,Lead_revenue_1,Lead_revenue_2,Fund_category_0,Fund_category_1,Fund_category_2,Fund_category_3,Geography_0,Geography_1,Lead_source_0,Lead_source_1,Lead_source_2,Lead_source_3,Level_of_meeting_0,Level_of_meeting_1,Level_of_meeting_2,Last_lead_update_0,Last_lead_update_1,Last_lead_update_2,Last_lead_update_3,Last_lead_update_4,Last_lead_update_5,Last_lead_update_6,Last_lead_update_7,Last_lead_update_8,Last_lead_update_9,Last_lead_update_10,Resource_0,Resource_1,Resource_2,Resource_3,Resource_4,Resource_5
0,TitleAD16O,200988.0,1565433.1,-1.0,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,TitleOW6CR,409961.0,2541758.2,1.0,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,TitleVVJQ5,434433.0,3041031.0,5.0,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,TitleUS8NA,218952.0,1521716.4,5.0,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,Title5VGWW,392835.0,2455218.75,2.0,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [266]:
Deal_title = df_test["Deal_title"]
Deal_title_df = pd.DataFrame(data = Deal_title,columns=["Deal_title"])
df_test = df_test.drop(["Deal_title"],axis=1)
df_test.head()

Unnamed: 0,Deal_value,Weighted_amount,Internal_rating,creation_year,Industry_0,Industry_1,Industry_2,Industry_3,Industry_4,Industry_5,Industry_6,Industry_7,Industry_8,Industry_9,Industry_10,Industry_11,Industry_12,Industry_13,Industry_14,Industry_15,Industry_16,Industry_17,Industry_18,Industry_19,Industry_20,Industry_21,Industry_22,Industry_23,Industry_24,Industry_25,Industry_26,Industry_27,Industry_28,Industry_29,Industry_30,Industry_31,Industry_32,Industry_33,Industry_34,Industry_35,Industry_36,Industry_37,Industry_38,Industry_39,Industry_40,Industry_41,Industry_42,Industry_43,Industry_44,Industry_45,Industry_46,Industry_47,Industry_48,Industry_49,Industry_50,Industry_51,Industry_52,Industry_53,Industry_54,Industry_55,Industry_56,Industry_57,Industry_58,Industry_59,Industry_60,Industry_61,Industry_62,Industry_63,Industry_64,Industry_65,Industry_66,Industry_67,Industry_68,Industry_69,Industry_70,Industry_71,Industry_72,Industry_73,Industry_74,Industry_75,Industry_76,Industry_77,Industry_78,Industry_79,Industry_80,Industry_81,Industry_82,Industry_83,Industry_84,Industry_85,Industry_86,Industry_87,Industry_88,Industry_89,Industry_90,Industry_91,Industry_92,Industry_93,Industry_94,Industry_95,Industry_96,Industry_97,Industry_98,Industry_99,Industry_100,Industry_101,Industry_102,Industry_103,Industry_104,Industry_105,Industry_106,Industry_107,Industry_108,Industry_109,Industry_110,Industry_111,Industry_112,Industry_113,Industry_114,Industry_115,Industry_116,Industry_117,Industry_118,Industry_119,Industry_120,Industry_121,Industry_122,Industry_123,Industry_124,Industry_125,Industry_126,Industry_127,Industry_128,Industry_129,Industry_130,Industry_131,Industry_132,Industry_133,Industry_134,Industry_135,Industry_136,Industry_137,Industry_138,Industry_139,Industry_140,Industry_141,Industry_142,Industry_143,Industry_144,Industry_145,Industry_146,Industry_147,Industry_148,Industry_149,Industry_150,Industry_151,Industry_152,Industry_153,Industry_154,Industry_155,Industry_156,Industry_157,Industry_158,Industry_159,Industry_160,Industry_161,Industry_162,Industry_163,Industry_164,Industry_165,Industry_166,Industry_167,Industry_168,Industry_169,Industry_170,Pitch_0,Pitch_1,Lead_revenue_0,Lead_revenue_1,Lead_revenue_2,Fund_category_0,Fund_category_1,Fund_category_2,Fund_category_3,Geography_0,Geography_1,Lead_source_0,Lead_source_1,Lead_source_2,Lead_source_3,Level_of_meeting_0,Level_of_meeting_1,Level_of_meeting_2,Last_lead_update_0,Last_lead_update_1,Last_lead_update_2,Last_lead_update_3,Last_lead_update_4,Last_lead_update_5,Last_lead_update_6,Last_lead_update_7,Last_lead_update_8,Last_lead_update_9,Last_lead_update_10,Resource_0,Resource_1,Resource_2,Resource_3,Resource_4,Resource_5
0,200988.0,1565433.1,-1.0,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,409961.0,2541758.2,1.0,2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,434433.0,3041031.0,5.0,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,218952.0,1521716.4,5.0,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,392835.0,2455218.75,2.0,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [267]:
x_test_submit = scalerX.transform(df_test)
x_test_submit = pd.DataFrame(x_test_submit)
x_test_submit.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209
0,-0.337326,-0.003192,-2.826493,0.680141,-0.020696,-0.041419,-0.071863,-0.020696,-0.037805,-0.029275,-0.04784,-0.098256,-0.016897,-0.106099,-0.075772,-0.011947,-0.031623,-0.057387,-0.072859,-0.037805,-0.011947,-0.517693,-0.054827,-0.029275,-0.091359,-0.117236,-0.016897,-0.039653,-0.029275,-0.070853,-0.033809,-0.020696,-0.041419,-0.066662,-0.044744,-0.14174,-0.035862,-0.043113,-0.023899,-0.035862,-0.011947,-0.144338,-0.070853,-0.086468,-0.057387,-0.016897,-0.072859,-0.043113,-0.031623,-0.04784,-0.245072,-0.119713,-0.073843,-0.023899,-0.053502,-0.050749,-0.011947,-0.087301,-0.162838,-0.044744,-0.146893,-0.09601,-0.020696,-0.029275,-0.086468,-0.067733,-0.258533,-0.011947,-0.020696,-0.035862,5.833952,-0.04784,-0.04784,-0.035862,-0.088127,-0.052144,-0.058625,-0.066662,-0.064466,-0.114068,-0.089757,-0.053502,-0.033809,-0.011947,-0.031623,-0.041419,-0.016897,-0.026722,-0.073843,-0.011947,-0.097513,-0.023899,-0.016897,-0.033809,-0.098993,-0.079493,-0.146893,-0.016897,-0.11848,-0.063341,-0.12154,-0.016897,-0.065573,-0.054827,-0.029275,-0.052144,-0.052144,-0.075772,-0.053502,-0.046317,-0.049316,-0.029275,-0.052144,-0.065573,-0.029275,-0.128028,-0.016897,-0.020696,-0.175341,-0.065573,-0.053502,-0.011947,-0.035862,-0.086468,-0.066662,-0.033809,-0.072859,-0.011947,-0.016897,-0.020696,-0.035862,-0.020696,-0.020696,-0.031623,-0.090562,-0.026722,-0.090562,-0.075772,-0.035862,-0.049316,-0.127451,-0.057387,-0.062195,-0.011947,-0.104016,-0.043113,-0.023899,-0.099726,-0.026722,-0.058625,-0.035862,-0.049316,-0.011947,-0.04784,-0.016897,-0.031623,-0.078579,-0.016897,-0.041419,-0.066662,-0.029275,-0.020696,-0.054827,-0.039653,-0.041419,-0.011947,-0.011947,-0.020696,-0.026722,-0.011947,-0.061028,-0.020696,-0.016897,-0.044744,-0.011947,0.999857,-0.999857,1.407279,-0.694278,-0.716513,-0.575867,-0.584656,-0.585975,1.776511,1.164511,-1.164511,-0.576087,1.685692,-0.574109,-0.565978,1.381548,-0.694278,-0.703326,-0.294438,-0.295866,-0.301816,-0.292717,-0.468236,-0.293578,-0.293004,-0.306298,-0.29099,-0.31239,3.252971,-0.434553,-0.438723,2.067935,-0.441957,-0.445413,-0.438492
1,1.111653,1.098342,-1.416617,2.345324,-0.020696,-0.041419,-0.071863,-0.020696,-0.037805,-0.029275,-0.04784,-0.098256,-0.016897,-0.106099,-0.075772,-0.011947,-0.031623,-0.057387,-0.072859,-0.037805,-0.011947,-0.517693,-0.054827,-0.029275,-0.091359,-0.117236,-0.016897,-0.039653,-0.029275,-0.070853,-0.033809,-0.020696,-0.041419,-0.066662,-0.044744,-0.14174,-0.035862,-0.043113,-0.023899,-0.035862,-0.011947,-0.144338,-0.070853,11.565034,-0.057387,-0.016897,-0.072859,-0.043113,-0.031623,-0.04784,-0.245072,-0.119713,-0.073843,-0.023899,-0.053502,-0.050749,-0.011947,-0.087301,-0.162838,-0.044744,-0.146893,-0.09601,-0.020696,-0.029275,-0.086468,-0.067733,-0.258533,-0.011947,-0.020696,-0.035862,-0.17141,-0.04784,-0.04784,-0.035862,-0.088127,-0.052144,-0.058625,-0.066662,-0.064466,-0.114068,-0.089757,-0.053502,-0.033809,-0.011947,-0.031623,-0.041419,-0.016897,-0.026722,-0.073843,-0.011947,-0.097513,-0.023899,-0.016897,-0.033809,-0.098993,-0.079493,-0.146893,-0.016897,-0.11848,-0.063341,-0.12154,-0.016897,-0.065573,-0.054827,-0.029275,-0.052144,-0.052144,-0.075772,-0.053502,-0.046317,-0.049316,-0.029275,-0.052144,-0.065573,-0.029275,-0.128028,-0.016897,-0.020696,-0.175341,-0.065573,-0.053502,-0.011947,-0.035862,-0.086468,-0.066662,-0.033809,-0.072859,-0.011947,-0.016897,-0.020696,-0.035862,-0.020696,-0.020696,-0.031623,-0.090562,-0.026722,-0.090562,-0.075772,-0.035862,-0.049316,-0.127451,-0.057387,-0.062195,-0.011947,-0.104016,-0.043113,-0.023899,-0.099726,-0.026722,-0.058625,-0.035862,-0.049316,-0.011947,-0.04784,-0.016897,-0.031623,-0.078579,-0.016897,-0.041419,-0.066662,-0.029275,-0.020696,-0.054827,-0.039653,-0.041419,-0.011947,-0.011947,-0.020696,-0.026722,-0.011947,-0.061028,-0.020696,-0.016897,-0.044744,-0.011947,0.999857,-0.999857,1.407279,-0.694278,-0.716513,-0.575867,-0.584656,1.706558,-0.562901,-0.858729,0.858729,-0.576087,1.685692,-0.574109,-0.565978,-0.723826,1.440345,-0.703326,-0.294438,-0.295866,-0.301816,-0.292717,-0.468236,-0.293578,-0.293004,-0.306298,-0.29099,3.201123,-0.307411,-0.434553,-0.438723,-0.483574,-0.441957,-0.445413,2.280545
2,1.281337,1.661644,1.403136,0.680141,-0.020696,-0.041419,-0.071863,-0.020696,-0.037805,-0.029275,-0.04784,-0.098256,-0.016897,-0.106099,-0.075772,-0.011947,-0.031623,-0.057387,-0.072859,-0.037805,-0.011947,1.931648,-0.054827,-0.029275,-0.091359,-0.117236,-0.016897,-0.039653,-0.029275,-0.070853,-0.033809,-0.020696,-0.041419,-0.066662,-0.044744,-0.14174,-0.035862,-0.043113,-0.023899,-0.035862,-0.011947,-0.144338,-0.070853,-0.086468,-0.057387,-0.016897,-0.072859,-0.043113,-0.031623,-0.04784,-0.245072,-0.119713,-0.073843,-0.023899,-0.053502,-0.050749,-0.011947,-0.087301,-0.162838,-0.044744,-0.146893,-0.09601,-0.020696,-0.029275,-0.086468,-0.067733,-0.258533,-0.011947,-0.020696,-0.035862,-0.17141,-0.04784,-0.04784,-0.035862,-0.088127,-0.052144,-0.058625,-0.066662,-0.064466,-0.114068,-0.089757,-0.053502,-0.033809,-0.011947,-0.031623,-0.041419,-0.016897,-0.026722,-0.073843,-0.011947,-0.097513,-0.023899,-0.016897,-0.033809,-0.098993,-0.079493,-0.146893,-0.016897,-0.11848,-0.063341,-0.12154,-0.016897,-0.065573,-0.054827,-0.029275,-0.052144,-0.052144,-0.075772,-0.053502,-0.046317,-0.049316,-0.029275,-0.052144,-0.065573,-0.029275,-0.128028,-0.016897,-0.020696,-0.175341,-0.065573,-0.053502,-0.011947,-0.035862,-0.086468,-0.066662,-0.033809,-0.072859,-0.011947,-0.016897,-0.020696,-0.035862,-0.020696,-0.020696,-0.031623,-0.090562,-0.026722,-0.090562,-0.075772,-0.035862,-0.049316,-0.127451,-0.057387,-0.062195,-0.011947,-0.104016,-0.043113,-0.023899,-0.099726,-0.026722,-0.058625,-0.035862,-0.049316,-0.011947,-0.04784,-0.016897,-0.031623,-0.078579,-0.016897,-0.041419,-0.066662,-0.029275,-0.020696,-0.054827,-0.039653,-0.041419,-0.011947,-0.011947,-0.020696,-0.026722,-0.011947,-0.061028,-0.020696,-0.016897,-0.044744,-0.011947,0.999857,-0.999857,1.407279,-0.694278,-0.716513,1.736512,-0.584656,-0.585975,-0.562901,-0.858729,0.858729,-0.576087,1.685692,-0.574109,-0.565978,-0.723826,1.440345,-0.703326,-0.294438,-0.295866,-0.301816,3.416268,-0.468236,-0.293578,-0.293004,-0.306298,-0.29099,-0.31239,-0.307411,-0.434553,2.279343,-0.483574,-0.441957,-0.445413,-0.438492
3,-0.212767,-0.052515,1.403136,0.680141,-0.020696,-0.041419,-0.071863,-0.020696,-0.037805,-0.029275,-0.04784,-0.098256,-0.016897,-0.106099,-0.075772,-0.011947,-0.031623,-0.057387,-0.072859,-0.037805,-0.011947,-0.517693,-0.054827,-0.029275,-0.091359,-0.117236,-0.016897,-0.039653,-0.029275,-0.070853,-0.033809,-0.020696,-0.041419,-0.066662,-0.044744,-0.14174,-0.035862,-0.043113,-0.023899,-0.035862,-0.011947,-0.144338,-0.070853,-0.086468,-0.057387,-0.016897,-0.072859,-0.043113,-0.031623,-0.04784,-0.245072,-0.119713,-0.073843,-0.023899,-0.053502,-0.050749,-0.011947,-0.087301,-0.162838,-0.044744,-0.146893,-0.09601,-0.020696,-0.029275,-0.086468,-0.067733,-0.258533,-0.011947,-0.020696,-0.035862,-0.17141,-0.04784,-0.04784,-0.035862,-0.088127,-0.052144,-0.058625,-0.066662,-0.064466,-0.114068,-0.089757,-0.053502,-0.033809,-0.011947,-0.031623,24.143667,-0.016897,-0.026722,-0.073843,-0.011947,-0.097513,-0.023899,-0.016897,-0.033809,-0.098993,-0.079493,-0.146893,-0.016897,-0.11848,-0.063341,-0.12154,-0.016897,-0.065573,-0.054827,-0.029275,-0.052144,-0.052144,-0.075772,-0.053502,-0.046317,-0.049316,-0.029275,-0.052144,-0.065573,-0.029275,-0.128028,-0.016897,-0.020696,-0.175341,-0.065573,-0.053502,-0.011947,-0.035862,-0.086468,-0.066662,-0.033809,-0.072859,-0.011947,-0.016897,-0.020696,-0.035862,-0.020696,-0.020696,-0.031623,-0.090562,-0.026722,-0.090562,-0.075772,-0.035862,-0.049316,-0.127451,-0.057387,-0.062195,-0.011947,-0.104016,-0.043113,-0.023899,-0.099726,-0.026722,-0.058625,-0.035862,-0.049316,-0.011947,-0.04784,-0.016897,-0.031623,-0.078579,-0.016897,-0.041419,-0.066662,-0.029275,-0.020696,-0.054827,-0.039653,-0.041419,-0.011947,-0.011947,-0.020696,-0.026722,-0.011947,-0.061028,-0.020696,-0.016897,-0.044744,-0.011947,-1.000143,1.000143,1.407279,-0.694278,-0.716513,1.736512,-0.584656,-0.585975,-0.562901,1.164511,-1.164511,1.735849,-0.593228,-0.574109,-0.565978,-0.723826,-0.694278,1.421816,-0.294438,-0.295866,-0.301816,-0.292717,-0.468236,-0.293578,-0.293004,-0.306298,-0.29099,-0.31239,3.252971,-0.434553,-0.438723,2.067935,-0.441957,-0.445413,-0.438492
4,0.992905,1.000705,-0.711679,0.680141,-0.020696,-0.041419,-0.071863,-0.020696,-0.037805,-0.029275,-0.04784,-0.098256,-0.016897,-0.106099,-0.075772,-0.011947,-0.031623,-0.057387,-0.072859,-0.037805,-0.011947,-0.517693,-0.054827,-0.029275,-0.091359,-0.117236,-0.016897,-0.039653,-0.029275,-0.070853,-0.033809,-0.020696,-0.041419,-0.066662,-0.044744,-0.14174,-0.035862,-0.043113,-0.023899,-0.035862,-0.011947,-0.144338,-0.070853,-0.086468,-0.057387,-0.016897,-0.072859,-0.043113,-0.031623,-0.04784,-0.245072,-0.119713,-0.073843,-0.023899,-0.053502,-0.050749,-0.011947,-0.087301,-0.162838,-0.044744,-0.146893,-0.09601,-0.020696,-0.029275,-0.086468,-0.067733,-0.258533,-0.011947,-0.020696,-0.035862,-0.17141,-0.04784,-0.04784,-0.035862,-0.088127,-0.052144,-0.058625,-0.066662,-0.064466,-0.114068,-0.089757,-0.053502,-0.033809,-0.011947,-0.031623,-0.041419,-0.016897,-0.026722,-0.073843,-0.011947,-0.097513,-0.023899,-0.016897,-0.033809,-0.098993,-0.079493,-0.146893,-0.016897,-0.11848,-0.063341,-0.12154,-0.016897,-0.065573,-0.054827,-0.029275,-0.052144,-0.052144,-0.075772,-0.053502,-0.046317,-0.049316,-0.029275,-0.052144,-0.065573,-0.029275,-0.128028,-0.016897,-0.020696,5.703185,-0.065573,-0.053502,-0.011947,-0.035862,-0.086468,-0.066662,-0.033809,-0.072859,-0.011947,-0.016897,-0.020696,-0.035862,-0.020696,-0.020696,-0.031623,-0.090562,-0.026722,-0.090562,-0.075772,-0.035862,-0.049316,-0.127451,-0.057387,-0.062195,-0.011947,-0.104016,-0.043113,-0.023899,-0.099726,-0.026722,-0.058625,-0.035862,-0.049316,-0.011947,-0.04784,-0.016897,-0.031623,-0.078579,-0.016897,-0.041419,-0.066662,-0.029275,-0.020696,-0.054827,-0.039653,-0.041419,-0.011947,-0.011947,-0.020696,-0.026722,-0.011947,-0.061028,-0.020696,-0.016897,-0.044744,-0.011947,0.999857,-0.999857,-0.710591,-0.694278,1.395648,-0.575867,-0.584656,1.706558,-0.562901,-0.858729,0.858729,-0.576087,-0.593228,1.741829,-0.565978,-0.723826,-0.694278,1.421816,-0.294438,-0.295866,-0.301816,-0.292717,-0.468236,3.406247,-0.293004,-0.306298,-0.29099,-0.31239,-0.307411,-0.434553,2.279343,-0.483574,-0.441957,-0.445413,-0.438492


In [268]:
x_test_submit = pca.transform(x_test_submit)

In [269]:
prediction = cat.predict(x_test_submit)
pred2 = np.expm1(prediction)
# prediction = scalerY.inverse_transform(prediction)

In [270]:
df_to_submit = pd.concat([Deal_title_df,pd.DataFrame(data = pred2,columns=["Success_probability"])], axis=1)

In [271]:
df_to_submit.head()

Unnamed: 0,Deal_title,Success_probability
0,TitleAD16O,66.122888
1,TitleOW6CR,63.409764
2,TitleVVJQ5,69.395749
3,TitleUS8NA,72.491548
4,Title5VGWW,73.490286


In [272]:
df_to_submit.to_csv(path_or_buf="./df_to_submit"+".csv",float_format='%.5f',index=False)