yshr10ic
train.csv
とtest.csv
を用いて簡単なEDAを実施してみました# mount drive
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
!pip install matplotlib-venn > /dev/null
# カレントディレクトリを変更
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/probspace/kiva/')
print(os.getcwd())
/content/drive/My Drive/Colab Notebooks/probspace/kiva
class Config():
root_path = './'
input_path = os.path.join(root_path, 'data')
import matplotlib.pylab as plt
from matplotlib_venn import venn2
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.options.display.float_format = '{:.5f}'.format
train_df = pd.read_csv(os.path.join(Config.input_path, 'train.csv'))
print(train_df.shape)
display(train_df)
(91333, 18)
LOAN_ID | ORIGINAL_LANGUAGE | DESCRIPTION | DESCRIPTION_TRANSLATED | LOAN_AMOUNT | IMAGE_ID | ACTIVITY_NAME | SECTOR_NAME | LOAN_USE | COUNTRY_CODE | COUNTRY_NAME | TOWN_NAME | CURRENCY_POLICY | CURRENCY_EXCHANGE_COVERAGE_RATE | CURRENCY | TAGS | REPAYMENT_INTERVAL | DISTRIBUTION_MODEL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1733169 | English | Teodora is a 50-year-old married woman from th... | Teodora is a 50-year-old married woman from th... | 100 | 3115271 | Weaving | Arts | to purchase materials like nipa palm, bamboo ... | PH | Philippines | Maribojoc, Bohol | shared | 0.10000 | PHP | #Elderly | monthly | field_partner |
1 | 1546998 | English | Diego is 32 years old and lives in the municip... | Diego is 32 years old and lives in the municip... | 1350 | 2870403 | Barber Shop | Services | to buy two hair clippers, a new barber chair, ... | CO | Colombia | Apartadó | shared | 0.10000 | COP | user_favorite, user_favorite | monthly | field_partner |
2 | 1808517 | Spanish | Osman, es un joven de 27 años de edad, soltero... | Osman is a young man, 27 years old, single, an... | 225 | 3215705 | Farming | Agriculture | to purchase sacks of fertilizers to care for a... | HN | Honduras | Nueva Frontera, Santa Barbara. | shared | 0.10000 | HNL | NaN | bullet | field_partner |
3 | 1452940 | English | His name is Nino, 31 years old, married to Che... | His name is Nino, 31 years old, married to Che... | 350 | 2745031 | Motorcycle Transport | Transportation | to pay for fuel, tires and change oil for his ... | PH | Philippines | Silang, Cavite | shared | 0.10000 | PHP | user_favorite | monthly | field_partner |
4 | 1778420 | English | Pictured above is Teresa, often described as a... | Pictured above is Teresa, often described as a... | 625 | 3083800 | Farming | Agriculture | to purchase hybrid seeds and fertilizer to imp... | KE | Kenya | Mumias | shared | 0.10000 | KES | #Eco-friendly, #Sustainable Ag, #Parent, #Elde... | bullet | field_partner |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
91328 | 1688789 | Spanish | Rider tiene 20 años de edad, vive en San Javie... | Rider is 20 years old. He lives in San Javier,... | 775 | 3054018 | Poultry | Agriculture | to buy chickens to raise and sell. | EC | Ecuador | San Javier | shared | 0.10000 | USD | volunteer_like, #Animals, #Supporting Family | monthly | field_partner |
91329 | 1878119 | English | Carmelita works hard to support four children.... | Carmelita works hard to support four children.... | 100 | 3311100 | Personal Housing Expenses | Housing | to build a sanitary toilet for her family | PH | Philippines | Danao Cebu | standard | nan | PHP | volunteer_like | monthly | field_partner |
91330 | 1639680 | English | Orn, 60 years of age, appears in the photo. Sh... | Orn, 60 years of age, appears in the photo. Sh... | 1500 | 2990352 | Grocery Store | Food | to pay for additional groceries to stock the s... | KH | Cambodia | Takeo province | shared | 0.10000 | USD | user_favorite, #Elderly, user_favorite | monthly | field_partner |
91331 | 1495391 | Spanish | Walter, a sus 27 años de edad, vive en unión l... | At 27 years of age, Walter is in a live-in rel... | 1750 | 2805390 | Farming | Agriculture | to buy agricultural supplies, such as fertiliz... | CO | Colombia | El Carmen de Viboral | shared | 0.10000 | COP | #Sustainable Ag, #Eco-friendly, user_favorite | monthly | field_partner |
91332 | 1602898 | English | Greetings from Uganda! This is Godfrey. He is ... | Greetings from Uganda! This is Godfrey. He is ... | 275 | 2943724 | Education provider | Education | to purchase a water-filtration system to provi... | UG | Uganda | Isingiro | shared | 0.10000 | UGX | #Health and Sanitation, user_favorite, #School... | irregular | field_partner |
91333 rows × 18 columns
test_df = pd.read_csv(os.path.join(Config.input_path, 'test.csv'))
print(test_df.shape)
display(test_df)
(91822, 17)
LOAN_ID | ORIGINAL_LANGUAGE | DESCRIPTION | DESCRIPTION_TRANSLATED | IMAGE_ID | ACTIVITY_NAME | SECTOR_NAME | LOAN_USE | COUNTRY_CODE | COUNTRY_NAME | TOWN_NAME | CURRENCY_POLICY | CURRENCY_EXCHANGE_COVERAGE_RATE | CURRENCY | TAGS | REPAYMENT_INTERVAL | DISTRIBUTION_MODEL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2041445 | English | Marcela is 69 years old and married with ten c... | Marcela is 69 years old and married with ten c... | 4051101 | General Store | Retail | to buy items to sell like canned goods and per... | PH | Philippines | Cauayan, Negros Occidental | standard | nan | PHP | NaN | monthly | field_partner |
1 | 1944435 | English | Roselia is 48 years old and has five children.... | Roselia is 48 years old and has five children.... | 3410523 | Pigs | Agriculture | to buy feeds and other supplies to raise her pig | PH | Philippines | Guihulngan, Negros Oriental | standard | nan | PHP | #Animals, #Repeat Borrower, #Schooling, #Woman... | monthly | field_partner |
2 | 2083354 | English | Ma. Marebil is a single woman, 40 years old wi... | Ma. Marebil is a single woman, 40 years old wi... | 4146690 | Clothing Sales | Clothing | to buy additional stock of clothes and dresses... | PH | Philippines | Santa Barbara, Iloilo | standard | nan | PHP | #Parent, #Single Parent, #Woman-Owned Business | monthly | field_partner |
3 | 1993565 | English | Good day, lenders! Meet one of KBMI’s clients,... | Good day, lenders! Meet one of KBMI’s clients,... | 3945982 | Food | Food | to buy more foods to grow her business. | ID | Indonesia | Pandeglang | shared | 0.10000 | IDR | #Woman-Owned Business, #Schooling, #Elderly, #... | monthly | field_partner |
4 | 2064272 | English | Rosemarie is a married woman with two children... | Rosemarie is a married woman with two children... | 4114040 | Food | Food | to buy ingredients for her food production bus... | PH | Philippines | Sogod Cebu | standard | nan | PHP | NaN | monthly | field_partner |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
91817 | 1993862 | English | Marjorie is a resident of Tubigon, Bohol. She ... | Marjorie is a resident of Tubigon, Bohol. She ... | 3946629 | Fishing | Food | to buy fishing nets. | PH | Philippines | Tubigon, Bohol | shared | 0.00000 | PHP | #Parent, #Biz Durable Asset | monthly | field_partner |
91818 | 2015070 | English | Hello, Kiva community! Meet Janeth, a mother e... | Hello, Kiva community! Meet Janeth, a mother e... | 4006025 | Home Energy | Personal Use | to buy a solar lantern to provide adequate lig... | KE | Kenya | Nandi Hills | shared | 0.00000 | KES | #Technology, #Eco-friendly, #Parent | monthly | field_partner |
91819 | 1950349 | French | Agé de 32 ans, Komi est marié .C'est un bouche... | Komi is 32 years old and married. He is a reno... | 3423123 | Butcher Shop | Food | to buy two cows. | TG | Togo | Vakpossito | shared | 0.00000 | XOF | #Biz Durable Asset, user_favorite, #Animals | monthly | field_partner |
91820 | 1921580 | Russian | Калбубу, 56 лет, вдова, есть взрослые дети. У ... | Kalbubu is 56 years old, a widow, and she has ... | 3373358 | Dairy | Agriculture | to buy dairy cows to increase her headcount of... | KG | Kyrgyzstan | Min-Bulak village, Talas region | shared | 0.00000 | KGS | #Animals, #Widowed, #Biz Durable Asset, #Woman... | irregular | field_partner |
91821 | 1976733 | English | Hinrilyn is 31 years old and has two children.... | Hinrilyn is 31 years old and has two children.... | 3841884 | Pigs | Agriculture | to buy feeds and other supplies to raise her l... | PH | Philippines | Coron, Palawan | standard | nan | PHP | user_favorite, #Animals, #Parent, user_favorit... | monthly | field_partner |
91822 rows × 17 columns
display(
pd.DataFrame({
'train': train_df.isnull().sum(),
'test': test_df.isnull().sum()
}).reset_index()
)
index | train | test | |
---|---|---|---|
0 | ACTIVITY_NAME | 0 | 0.00000 |
1 | COUNTRY_CODE | 0 | 0.00000 |
2 | COUNTRY_NAME | 0 | 0.00000 |
3 | CURRENCY | 0 | 0.00000 |
4 | CURRENCY_EXCHANGE_COVERAGE_RATE | 9272 | 43561.00000 |
5 | CURRENCY_POLICY | 0 | 0.00000 |
6 | DESCRIPTION | 0 | 0.00000 |
7 | DESCRIPTION_TRANSLATED | 1 | 0.00000 |
8 | DISTRIBUTION_MODEL | 0 | 0.00000 |
9 | IMAGE_ID | 0 | 0.00000 |
10 | LOAN_AMOUNT | 0 | nan |
11 | LOAN_ID | 0 | 0.00000 |
12 | LOAN_USE | 0 | 0.00000 |
13 | ORIGINAL_LANGUAGE | 0 | 0.00000 |
14 | REPAYMENT_INTERVAL | 0 | 0.00000 |
15 | SECTOR_NAME | 0 | 0.00000 |
16 | TAGS | 17986 | 30518.00000 |
17 | TOWN_NAME | 2760 | 2423.00000 |
print(f"[tr] {train_df.duplicated().sum()}")
print(f"[te] {test_df.duplicated().sum()}")
[tr] 0 [te] 0
all_df = pd.concat([train_df, test_df])
all_df.drop(columns=['LOAN_AMOUNT'], inplace=True)
print(f"{all_df.duplicated().sum()}")
0
print(f"[tr] len: {len(train_df)}, nunique: {train_df['LOAN_ID'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['LOAN_ID'].nunique()}")
[tr] len: 91333, nunique: 91333 [te] len: 91822, nunique: 91822
venn2([set(train_df['LOAN_ID']), set(test_df['LOAN_ID'])], set_labels=('train', 'test'));
display(
pd.DataFrame({
'train': train_df['ORIGINAL_LANGUAGE'].value_counts() / len(train_df),
'test': test_df['ORIGINAL_LANGUAGE'].value_counts() / len(test_df)
}).reset_index()
)
index | train | test | |
---|---|---|---|
0 | English | 0.70005 | 0.70159 |
1 | Spanish | 0.20766 | 0.17450 |
2 | French | 0.05667 | 0.08319 |
3 | Russian | 0.02770 | 0.03325 |
4 | Portuguese | 0.00792 | 0.00747 |
print(f"[tr] len: {len(train_df)}, nunique: {train_df['DESCRIPTION'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['DESCRIPTION'].nunique()}")
[tr] len: 91333, nunique: 91307 [te] len: 91822, nunique: 91803
print(f"[tr] duplicated: {train_df['DESCRIPTION'].duplicated().sum()}")
print(f"[te] duplicated: {test_df['DESCRIPTION'].duplicated().sum()}")
[tr] duplicated: 26 [te] duplicated: 19
display(train_df.loc[train_df['DESCRIPTION'].str.startswith('Hellen is 41')])
LOAN_ID | ORIGINAL_LANGUAGE | DESCRIPTION | DESCRIPTION_TRANSLATED | LOAN_AMOUNT | IMAGE_ID | ACTIVITY_NAME | SECTOR_NAME | LOAN_USE | COUNTRY_CODE | COUNTRY_NAME | TOWN_NAME | CURRENCY_POLICY | CURRENCY_EXCHANGE_COVERAGE_RATE | CURRENCY | TAGS | REPAYMENT_INTERVAL | DISTRIBUTION_MODEL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7196 | 1722176 | English | Hellen is 41 years old. She is married to Aurh... | Hellen is 41 years old. She is married to Aurh... | 225 | 3096942 | Cattle | Agriculture | to buy feed and vaccines. | KE | Kenya | Nyandarua South | shared | 0.10000 | KES | #Animals, #Woman-Owned Business, #Parent | monthly | field_partner |
7869 | 1874377 | English | Hellen is 41 years old. She is married to Aurh... | Hellen is 41 years old. She is married to Aurh... | 325 | 3297889 | Cattle | Agriculture | to buy animal feeds and vaccines. | KE | Kenya | Nyandarua South | shared | 0.00000 | KES | #Woman-Owned Business, #Animals, #Repeat Borro... | monthly | field_partner |
display(train_df.loc[train_df['DESCRIPTION_TRANSLATED'].isnull()])
LOAN_ID | ORIGINAL_LANGUAGE | DESCRIPTION | DESCRIPTION_TRANSLATED | LOAN_AMOUNT | IMAGE_ID | ACTIVITY_NAME | SECTOR_NAME | LOAN_USE | COUNTRY_CODE | COUNTRY_NAME | TOWN_NAME | CURRENCY_POLICY | CURRENCY_EXCHANGE_COVERAGE_RATE | CURRENCY | TAGS | REPAYMENT_INTERVAL | DISTRIBUTION_MODEL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
72195 | 1478847 | English | Tursun is 62 years old and she comes from the ... | NaN | 800 | 2783623 | Livestock | Agriculture | to buy some livestock and start a new business. | TJ | Tajikistan | Yavan | shared | 0.10000 | TJS | #Animals, #Elderly | irregular | field_partner |
print(f"[tr] len: {len(train_df)}, nunique: {train_df['DESCRIPTION_TRANSLATED'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['DESCRIPTION_TRANSLATED'].nunique()}")
[tr] len: 91333, nunique: 91309 [te] len: 91822, nunique: 91808
print(f"[tr] duplicated: {train_df['DESCRIPTION_TRANSLATED'].duplicated().sum()}")
print(f"[te] duplicated: {test_df['DESCRIPTION_TRANSLATED'].duplicated().sum()}")
[tr] duplicated: 23 [te] duplicated: 14
print('[tr]')
display(train_df.describe()['LOAN_AMOUNT'])
[tr]
count 91333.00000 mean 738.14996 std 1013.75287 min 25.00000 25% 225.00000 50% 450.00000 75% 825.00000 max 10000.00000 Name: LOAN_AMOUNT, dtype: float64
plt.figure(figsize=(20, 8))
train_df['LOAN_AMOUNT'].hist();
train_df['LOAN_AMOUNT_log1p'] = np.log1p(train_df['LOAN_AMOUNT'])
plt.figure(figsize=(20, 8))
train_df['LOAN_AMOUNT_log1p'].hist();
print(f"[tr] len: {len(train_df)}, nunique: {train_df['IMAGE_ID'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['IMAGE_ID'].nunique()}")
[tr] len: 91333, nunique: 91333 [te] len: 91822, nunique: 91822
venn2([set(train_df['IMAGE_ID']), set(test_df['IMAGE_ID'])], set_labels=('train', 'test'));
duplicated_image_ids = list(set(train_df['IMAGE_ID']) & set(test_df['IMAGE_ID']))
display(train_df.loc[train_df['IMAGE_ID']==duplicated_image_ids[0]])
display(test_df.loc[test_df['IMAGE_ID']==duplicated_image_ids[0]])
LOAN_ID | ORIGINAL_LANGUAGE | DESCRIPTION | DESCRIPTION_TRANSLATED | LOAN_AMOUNT | IMAGE_ID | ACTIVITY_NAME | SECTOR_NAME | LOAN_USE | COUNTRY_CODE | COUNTRY_NAME | TOWN_NAME | CURRENCY_POLICY | CURRENCY_EXCHANGE_COVERAGE_RATE | CURRENCY | TAGS | REPAYMENT_INTERVAL | DISTRIBUTION_MODEL | LOAN_AMOUNT_log1p | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
75564 | 1669059 | English | Jennifer works hard to support five children. ... | Jennifer works hard to support five children. ... | 200 | 3028992 | Personal Housing Expenses | Housing | to build a sanitary toilet for her family. | PH | Philippines | Banga, Aklan | shared | 0.10000 | PHP | #Health and Sanitation, #Eco-friendly, user_fa... | monthly | field_partner | 5.30330 |
LOAN_ID | ORIGINAL_LANGUAGE | DESCRIPTION | DESCRIPTION_TRANSLATED | IMAGE_ID | ACTIVITY_NAME | SECTOR_NAME | LOAN_USE | COUNTRY_CODE | COUNTRY_NAME | TOWN_NAME | CURRENCY_POLICY | CURRENCY_EXCHANGE_COVERAGE_RATE | CURRENCY | TAGS | REPAYMENT_INTERVAL | DISTRIBUTION_MODEL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
68693 | 1944221 | English | Jennifer works hard to support five children. ... | Jennifer works hard to support five children. ... | 3028992 | Personal Housing Expenses | Housing | to build a sanitary toilet for her family. | PH | Philippines | Banga, Aklan | standard | nan | PHP | NaN | monthly | field_partner |
display(train_df.loc[train_df['IMAGE_ID']==duplicated_image_ids[1]])
display(test_df.loc[test_df['IMAGE_ID']==duplicated_image_ids[1]])
LOAN_ID | ORIGINAL_LANGUAGE | DESCRIPTION | DESCRIPTION_TRANSLATED | LOAN_AMOUNT | IMAGE_ID | ACTIVITY_NAME | SECTOR_NAME | LOAN_USE | COUNTRY_CODE | COUNTRY_NAME | TOWN_NAME | CURRENCY_POLICY | CURRENCY_EXCHANGE_COVERAGE_RATE | CURRENCY | TAGS | REPAYMENT_INTERVAL | DISTRIBUTION_MODEL | LOAN_AMOUNT_log1p | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
19299 | 1806961 | English | Elizabeth is 47 years old, married and blessed... | Elizabeth is 47 years old, married and blessed... | 400 | 3213314 | Livestock | Agriculture | to buy feeds, vitamins and injections for her ... | PH | Philippines | Macate, Bambang, Nueva Vizcaya | shared | 0.10000 | PHP | #Animals | monthly | field_partner | 5.99396 |
LOAN_ID | ORIGINAL_LANGUAGE | DESCRIPTION | DESCRIPTION_TRANSLATED | IMAGE_ID | ACTIVITY_NAME | SECTOR_NAME | LOAN_USE | COUNTRY_CODE | COUNTRY_NAME | TOWN_NAME | CURRENCY_POLICY | CURRENCY_EXCHANGE_COVERAGE_RATE | CURRENCY | TAGS | REPAYMENT_INTERVAL | DISTRIBUTION_MODEL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
37366 | 1914116 | English | Elizabeth is known in the area as a hardworkin... | Elizabeth is known in the area as a hardworkin... | 3213314 | Pigs | Agriculture | to buy feeds, vitamins, and injections for her... | PH | Philippines | Macate, Bambang, Nueva Vizcaya | shared | 0.00000 | PHP | NaN | monthly | field_partner |
print(f"[tr] len: {len(train_df)}, nunique: {train_df['ACTIVITY_NAME'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['ACTIVITY_NAME'].nunique()}")
[tr] len: 91333, nunique: 161 [te] len: 91822, nunique: 161
print('[tr]')
display(train_df['ACTIVITY_NAME'].value_counts())
print('-'*50)
print('[te]')
display(test_df['ACTIVITY_NAME'].value_counts())
[tr]
Farming 11631 Personal Housing Expenses 7993 General Store 7504 Food Production/Sales 4174 Pigs 3790 ... Bicycle Sales 3 Utilities 3 Patchwork 3 Musical Instruments 3 Balut-Making 1 Name: ACTIVITY_NAME, Length: 161, dtype: int64
-------------------------------------------------- [te]
Farming 13626 General Store 10318 Pigs 6129 Personal Housing Expenses 4703 Food Production/Sales 4304 ... Balut-Making 2 Music Discs & Tapes 2 Bookbinding 2 Patchwork 1 Utilities 1 Name: ACTIVITY_NAME, Length: 161, dtype: int64
venn2([set(train_df['ACTIVITY_NAME'].unique()), set(test_df['ACTIVITY_NAME'].unique())], set_labels=('train', 'test'));
print(f"[tr] len: {len(train_df)}, nunique: {train_df['SECTOR_NAME'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['SECTOR_NAME'].nunique()}")
[tr] len: 91333, nunique: 15 [te] len: 91822, nunique: 15
display(
pd.DataFrame({
'train': train_df['SECTOR_NAME'].value_counts() / len(train_df),
'test': test_df['SECTOR_NAME'].value_counts() / len(test_df)
}).reset_index().sort_values(['train'], ascending=False)
)
index | train | test | |
---|---|---|---|
0 | Agriculture | 0.27334 | 0.31646 |
6 | Food | 0.20476 | 0.22642 |
11 | Retail | 0.16593 | 0.19337 |
8 | Housing | 0.08962 | 0.05245 |
12 | Services | 0.06012 | 0.04634 |
2 | Clothing | 0.04787 | 0.04273 |
10 | Personal Use | 0.04515 | 0.02991 |
4 | Education | 0.03868 | 0.03375 |
1 | Arts | 0.02165 | 0.01812 |
7 | Health | 0.01870 | 0.01563 |
13 | Transportation | 0.01631 | 0.00933 |
3 | Construction | 0.00910 | 0.00814 |
9 | Manufacturing | 0.00736 | 0.00632 |
5 | Entertainment | 0.00072 | 0.00051 |
14 | Wholesale | 0.00069 | 0.00052 |
venn2([set(train_df['SECTOR_NAME'].unique()), set(test_df['SECTOR_NAME'].unique())], set_labels=('train', 'test'));
print(f"[tr] len: {len(train_df)}, nunique: {train_df['LOAN_USE'].nunique()}")
print(f"[te] len: {len(test_df)}, nunique: {test_df['LOAN_USE'].nunique()}")
[tr] len: 91333, nunique: 63062 [te] len: 91822, nunique: 54099
print(f"[tr] duplicated: {train_df['LOAN_USE'].duplicated().sum()}")
print(f"[te] duplicated: {test_df['LOAN_USE'].duplicated().sum()}")
[tr] duplicated: 28271 [te] duplicated: 37723
print('[tr]')
display(train_df['LOAN_USE'].value_counts())
print('-'*50)
print('[te]')
display(test_df['LOAN_USE'].value_counts())
[tr]
to build a sanitary toilet for her family 3068 to build a sanitary toilet for her family. 1581 to buy a water filter to provide safe drinking water for their family. 895 to buy a water filter to provide safe drinking water for her family. 573 to buy ingredients for her food production business 449 ... to build facilities for sourcing water at home in order to improve her family's access to clean water. 1 to complete the budget needed for maintenance of his small coffee farm. 1 to buy corn, sugar, and milk. 1 to buy items like canned goods, personal care products, etc. to sell in her general store. 1 to buy more laying hens and corn to feed them. 1 Name: LOAN_USE, Length: 63062, dtype: int64
-------------------------------------------------- [te]
to build a sanitary toilet for her family 1771 to buy ingredients for her food production business 1057 to buy items like canned goods, personal care products, etc. to sell in her general store. 1028 to buy feed and other supplies to raise her livestock. 786 to buy feed and other supplies to raise her pigs. 618 ... to pay for her university studies, get good knowledge and find a well-paid job. 1 to acquire two bundles of used shoes wholesale, which will support her initial capital. 1 to buy water pumps and fertilizer for his farm. 1 to buy silk and poly silk thread, thimbles and dowels. She will also pay the wages of other craftsmen. 1 to purchase an additional basket of fish to sell. 1 Name: LOAN_USE, Length: 54099, dtype: int64
print('[tr]')
display(train_df['COUNTRY_CODE'].value_counts())
print('-'*50)
print('[te]')
display(test_df['COUNTRY_CODE'].value_counts())
[tr]
PH 21265 KE 12510 UG 5551 CO 4758 SV 4100 ... PA 28 PR 16 LS 11 IL 10 AM 1 Name: COUNTRY_CODE, Length: 61, dtype: int64
-------------------------------------------------- [te]
PH 28185 KE 12444 TJ 4388 EC 3818 UG 3088 ... PA 37 PR 24 LS 13 IL 11 AM 1 Name: COUNTRY_CODE, Length: 61, dtype: int64
venn2([set(train_df['COUNTRY_CODE'].unique()), set(test_df['COUNTRY_CODE'].unique())], set_labels=('train', 'test'));
print('[tr]')
display(train_df['COUNTRY_NAME'].value_counts())
print('-'*50)
print('[te]')
display(test_df['COUNTRY_NAME'].value_counts())
[tr]
Philippines 21265 Kenya 12510 Uganda 5551 Colombia 4758 El Salvador 4100 ... Panama 28 Puerto Rico 16 Lesotho 11 Israel 10 Armenia 1 Name: COUNTRY_NAME, Length: 61, dtype: int64
-------------------------------------------------- [te]
Philippines 28185 Kenya 12444 Tajikistan 4388 Ecuador 3818 Uganda 3088 ... Panama 37 Puerto Rico 24 Lesotho 13 Israel 11 Armenia 1 Name: COUNTRY_NAME, Length: 61, dtype: int64
venn2([set(train_df['COUNTRY_NAME'].unique()), set(test_df['COUNTRY_NAME'].unique())], set_labels=('train', 'test'));
train_df['COUNTRY'] = train_df['COUNTRY_CODE'] + '_' + train_df['COUNTRY_NAME']
test_df['COUNTRY'] = test_df['COUNTRY_CODE'] + '_' + test_df['COUNTRY_NAME']
venn2([set(train_df['COUNTRY'].unique()), set(test_df['COUNTRY'].unique())], set_labels=('train', 'test'));
display(
pd.DataFrame({
'train': train_df['CURRENCY_POLICY'].value_counts() / len(train_df),
'test': test_df['CURRENCY_POLICY'].value_counts() / len(test_df)
}).reset_index()
)
index | train | test | |
---|---|---|---|
0 | shared | 0.89848 | 0.52559 |
1 | standard | 0.10152 | 0.47441 |
display(
pd.DataFrame({
'train': train_df['CURRENCY_EXCHANGE_COVERAGE_RATE'].value_counts() / len(train_df),
'test': test_df['CURRENCY_EXCHANGE_COVERAGE_RATE'].value_counts() / len(test_df)
}).reset_index()
)
index | train | test | |
---|---|---|---|
0 | 0.00000 | 0.04661 | 0.32313 |
1 | 0.10000 | 0.85187 | 0.20247 |
print('[tr]')
display(train_df['CURRENCY'].value_counts())
print('-'*50)
print('[te]')
display(test_df['CURRENCY'].value_counts())
[tr]
PHP 21265 KES 12510 USD 11331 UGX 5551 COP 4758 TJS 3648 KHR 2768 XOF 2548 INR 2123 VND 1774 PEN 1691 PYG 1668 LRD 1642 MGA 1583 PKR 1547 WST 1352 HNL 1239 NGN 1183 KGS 980 RWF 855 JOD 784 GHS 778 GTQ 742 IDR 699 NIO 645 MZN 510 EGP 501 HTG 453 BOB 436 MXN 433 GEL 375 ALL 330 SBD 324 FJD 270 TOP 260 CRC 259 BRL 221 MDL 201 DOP 162 TRY 152 XAF 139 EUR 132 MWK 130 SLL 122 ZMW 98 NPR 62 PGK 43 THB 33 ILS 11 LSL 11 AMD 1 Name: CURRENCY, dtype: int64
-------------------------------------------------- [te]
PHP 28185 KES 12444 USD 12098 TJS 4386 XOF 3429 UGX 3088 MGA 2664 KHR 2511 COP 1965 PYG 1798 VND 1549 LRD 1462 KGS 1207 IDR 1064 NIO 1039 RWF 1010 WST 880 PEN 847 HNL 711 NGN 701 GTQ 691 JOD 674 GHS 533 SLL 495 HTG 457 EGP 454 MZN 438 ZMW 436 TOP 429 MXN 375 BOB 356 SBD 353 CRC 344 ALL 336 PKR 316 INR 308 BRL 263 FJD 259 GEL 248 MDL 224 XAF 166 MWK 127 EUR 116 DOP 93 TRY 92 PGK 63 NPR 59 THB 54 LSL 13 ILS 11 AMD 1 Name: CURRENCY, dtype: int64
venn2([set(train_df['CURRENCY'].unique()), set(test_df['CURRENCY'].unique())], set_labels=('train', 'test'));
train_df['COUNTRY_CURRENCY'] = train_df['COUNTRY_CODE'] + '_' + train_df['CURRENCY']
test_df['COUNTRY_CURRENCY'] = test_df['COUNTRY_CODE'] + '_' + test_df['CURRENCY']
venn2([set(train_df['COUNTRY_CURRENCY'].unique()), set(test_df['COUNTRY_CURRENCY'].unique())], set_labels=('train', 'test'));
# trainにのみ存在しているCOUNTRYとCURRENCYの組み合わせ
print(set(train_df['COUNTRY_CURRENCY']) - set(test_df['COUNTRY_CURRENCY']))
{'BO_USD', 'IN_USD', 'PS_ILS'}
print('tr:', train_df.loc[train_df['COUNTRY_CURRENCY'].str.startswith('PS'), 'COUNTRY_CURRENCY'].unique())
print('te:', test_df.loc[test_df['COUNTRY_CURRENCY'].str.startswith('PS'), 'COUNTRY_CURRENCY'].unique())
print('-'*50)
print('tr:', train_df.loc[train_df['COUNTRY_CURRENCY'].str.startswith('IN'), 'COUNTRY_CURRENCY'].unique())
print('te:', test_df.loc[test_df['COUNTRY_CURRENCY'].str.startswith('IN'), 'COUNTRY_CURRENCY'].unique())
print('-'*50)
print('tr:', train_df.loc[train_df['COUNTRY_CURRENCY'].str.startswith('BO'), 'COUNTRY_CURRENCY'].unique())
print('te:', test_df.loc[test_df['COUNTRY_CURRENCY'].str.startswith('BO'), 'COUNTRY_CURRENCY'].unique())
tr: ['PS_USD' 'PS_ILS'] te: ['PS_USD'] -------------------------------------------------- tr: ['IN_INR' 'IN_USD'] te: ['IN_INR'] -------------------------------------------------- tr: ['BO_BOB' 'BO_USD'] te: ['BO_BOB']
print(f'PS_ILS: {len(train_df.loc[train_df["COUNTRY_CURRENCY"]=="PS_ILS"])}')
print(f'IN_USD: {len(train_df.loc[train_df["COUNTRY_CURRENCY"]=="IN_USD"])}')
print(f'BO_USD: {len(train_df.loc[train_df["COUNTRY_CURRENCY"]=="BO_USD"])}')
PS_ILS: 1 IN_USD: 1 BO_USD: 17
print('[tr]')
display(train_df['TAGS'].value_counts())
print('-'*50)
print('[te]')
display(test_df['TAGS'].value_counts())
[tr]
user_favorite 3961 #Woman-Owned Business 2814 #Repeat Borrower 1761 #Elderly 1746 #Animals 1586 ... user_favorite, #Technology, #Sustainable Ag, #Eco-friendly, #Repeat Borrower, user_favorite 1 #Trees, user_favorite, user_favorite, #Vegan, #Sustainable Ag 1 user_favorite, #Biz Durable Asset, user_favorite, #US immigrant, user_favorite, user_favorite, #Woman-Owned Business 1 user_favorite, #Parent, user_favorite, #Biz Durable Asset, #Woman-Owned Business 1 #Biz Durable Asset, #Animals, user_favorite, user_favorite 1 Name: TAGS, Length: 21367, dtype: int64
-------------------------------------------------- [te]
user_favorite 7471 user_favorite, user_favorite 2100 #Parent, #Woman-Owned Business 1876 #Woman-Owned Business, #Parent 1555 #Woman-Owned Business 1026 ... #Single Parent, user_favorite, user_favorite, #Parent, user_favorite, #Schooling, #Woman-Owned Business 1 #Woman-Owned Business, volunteer_pick, #Repeat Borrower 1 #Elderly, user_favorite, #Woman-Owned Business, user_favorite, user_favorite, user_favorite, #Supporting Family, user_favorite 1 #Elderly, #Single 1 #Single, #Supporting Family, #Eco-friendly, user_favorite, user_favorite, #Health and Sanitation, #Vegan, user_favorite 1 Name: TAGS, Length: 20112, dtype: int64
train_tags_df = train_df['TAGS'].str.get_dummies(sep=',').add_prefix('tag_')
print(f'[tr] tags nunique: {train_tags_df.shape[1]}')
test_tags_df = test_df['TAGS'].str.get_dummies(sep=',').add_prefix('tag_')
print(f'[te] tags nunique: {test_tags_df.shape[1]}')
[tr] tags nunique: 58 [te] tags nunique: 60
train_tags_df
tag_ #Animals | tag_ #Biz Durable Asset | tag_ #Eco-friendly | tag_ #Elderly | tag_ #Fabrics | tag_ #Female Education | tag_ #First Loan | tag_ #Health and Sanitation | tag_ #Job Creator | tag_ #Orphan | tag_ #Parent | tag_ #Refugee | tag_ #Repair Renew Replace | tag_ #Repeat Borrower | tag_ #Schooling | tag_ #Single | tag_ #Single Parent | tag_ #Supporting Family | tag_ #Sustainable Ag | tag_ #Technology | tag_ #Trees | tag_ #US immigrant | tag_ #Unique | tag_ #Vegan | tag_ #Widowed | tag_ #Woman-Owned Business | tag_ user_favorite | tag_ volunteer_like | tag_ volunteer_pick | tag_#Animals | tag_#Biz Durable Asset | tag_#Eco-friendly | tag_#Elderly | tag_#Fabrics | tag_#Female Education | tag_#First Loan | tag_#Health and Sanitation | tag_#Job Creator | tag_#Orphan | tag_#Parent | tag_#Refugee | tag_#Repair Renew Replace | tag_#Repeat Borrower | tag_#Schooling | tag_#Single | tag_#Single Parent | tag_#Supporting Family | tag_#Sustainable Ag | tag_#Technology | tag_#Trees | tag_#US immigrant | tag_#Unique | tag_#Vegan | tag_#Widowed | tag_#Woman-Owned Business | tag_user_favorite | tag_volunteer_like | tag_volunteer_pick | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
91328 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
91329 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
91330 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
91331 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
91332 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
91333 rows × 58 columns
venn2([set(train_tags_df.columns), set(test_tags_df.columns)], set_labels=('train', 'test'));
# testにのみ存在しているTAGS
print(set(test_tags_df.columns) - set(train_tags_df.columns))
{'tag_ #US Black-Owned Business', 'tag_#US Black-Owned Business'}
train_tags_df.columns
Index(['tag_ #Animals', 'tag_ #Biz Durable Asset', 'tag_ #Eco-friendly', 'tag_ #Elderly', 'tag_ #Fabrics', 'tag_ #Female Education', 'tag_ #First Loan', 'tag_ #Health and Sanitation', 'tag_ #Job Creator', 'tag_ #Orphan', 'tag_ #Parent', 'tag_ #Refugee', 'tag_ #Repair Renew Replace', 'tag_ #Repeat Borrower', 'tag_ #Schooling', 'tag_ #Single', 'tag_ #Single Parent', 'tag_ #Supporting Family', 'tag_ #Sustainable Ag', 'tag_ #Technology', 'tag_ #Trees', 'tag_ #US immigrant', 'tag_ #Unique', 'tag_ #Vegan', 'tag_ #Widowed', 'tag_ #Woman-Owned Business', 'tag_ user_favorite', 'tag_ volunteer_like', 'tag_ volunteer_pick', 'tag_#Animals', 'tag_#Biz Durable Asset', 'tag_#Eco-friendly', 'tag_#Elderly', 'tag_#Fabrics', 'tag_#Female Education', 'tag_#First Loan', 'tag_#Health and Sanitation', 'tag_#Job Creator', 'tag_#Orphan', 'tag_#Parent', 'tag_#Refugee', 'tag_#Repair Renew Replace', 'tag_#Repeat Borrower', 'tag_#Schooling', 'tag_#Single', 'tag_#Single Parent', 'tag_#Supporting Family', 'tag_#Sustainable Ag', 'tag_#Technology', 'tag_#Trees', 'tag_#US immigrant', 'tag_#Unique', 'tag_#Vegan', 'tag_#Widowed', 'tag_#Woman-Owned Business', 'tag_user_favorite', 'tag_volunteer_like', 'tag_volunteer_pick'], dtype='object')
display(
pd.DataFrame({
'train': train_df['REPAYMENT_INTERVAL'].value_counts() / len(train_df),
'test': test_df['REPAYMENT_INTERVAL'].value_counts() / len(test_df)
}).reset_index()
)
index | train | test | |
---|---|---|---|
0 | monthly | 0.90192 | 0.86572 |
1 | bullet | 0.07146 | 0.08050 |
2 | irregular | 0.02662 | 0.05378 |
display(
pd.DataFrame({
'train': train_df['DISTRIBUTION_MODEL'].value_counts() / len(train_df),
'test': test_df['DISTRIBUTION_MODEL'].value_counts() / len(test_df)
}).reset_index()
)
index | train | test | |
---|---|---|---|
0 | field_partner | 0.99536 | 0.99585 |
1 | direct | 0.00464 | 0.00415 |
goukaisei
神EDAありがとうございます!!!