kotrying
# Library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install japanize_matplotlib
import japanize_matplotlib
%matplotlib inline
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.simplefilter('ignore')
# Mount Google Drive (Colab only) unless the mount point already exists.
from google.colab import drive

drive_mount_point = '/content/drive'
if not os.path.isdir(drive_mount_point):
    drive.mount(drive_mount_point)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting japanize_matplotlib Downloading japanize-matplotlib-1.1.3.tar.gz (4.1 MB) K |████████████████████████████████| 4.1 MB 5.1 MB/s ent already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from japanize_matplotlib) (3.2.2) Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.7/dist-packages (from matplotlib->japanize_matplotlib) (1.21.6) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->japanize_matplotlib) (3.0.9) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->japanize_matplotlib) (2.8.2) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->japanize_matplotlib) (0.11.0) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->japanize_matplotlib) (1.4.4) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from kiwisolver>=1.0.1->matplotlib->japanize_matplotlib) (4.1.1) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib->japanize_matplotlib) (1.15.0) Building wheels for collected packages: japanize-matplotlib Building wheel for japanize-matplotlib (setup.py) ... ?25latplotlib: filename=japanize_matplotlib-1.1.3-py3-none-any.whl size=4120275 sha256=9b1d2daa9fdeae4c56750925777085fa2172df4f0066449e60ea2c71cffcbccd Stored in directory: /root/.cache/pip/wheels/83/97/6b/e9e0cde099cc40f972b8dd23367308f7705ae06cd6d4714658 Successfully built japanize-matplotlib Installing collected packages: japanize-matplotlib Successfully installed japanize-matplotlib-1.1.3 Mounted at /content/drive
構成
MyDrive
├<pollen_counts>
├<notebook>
│ └eda.ipynb
├<input>
│ ├train.csv
│ ├submission.csv
│ └test.csv
└<output>
# Config: project root on Google Drive, its input/output folders, the three
# CSV files of the competition and the global random seed.
DRIVE_PATH = "/content/drive/MyDrive/ML/PROBSPACE/pollen_counts"
INPUT, OUTPUT = (
    os.path.join(DRIVE_PATH, folder) for folder in ("input", "output")
)
TRAIN_FILE, TEST_FILE, SUB_FILE = (
    os.path.join(INPUT, csv_name)
    for csv_name in ("train.csv", "test.csv", "submission.csv")
)
seed = 42
# Display / plot style
# Allow up to 200 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 200)
# Background colour of every Axes (hex RGB string written without a leading '#').
plt.rcParams['axes.facecolor'] = 'EEFFFE'
# Data: read the training set, the test set and the sample submission, then
# preview the first three rows of each.
train, test, sub = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_FILE, TEST_FILE, SUB_FILE)
)
for frame in (train, test, sub):
    display(frame.head(3))
datetime | precipitation_utsunomiya | precipitation_chiba | precipitation_tokyo | temperature_utsunomiya | temperature_chiba | temperature_tokyo | winddirection_utsunomiya | winddirection_chiba | winddirection_tokyo | windspeed_utsunomiya | windspeed_chiba | windspeed_tokyo | pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017020101 | 0.0 | 0.0 | 0.0 | -1.0 | 4.1 | 2.9 | 16 | 1 | 2 | 2.7 | 2.5 | 1.3 | 0.0 | 8.0 | 0.0 |
1 | 2017020102 | 0.0 | 0.0 | 0.0 | -1.1 | 4.2 | 2.6 | 1 | 1 | 1 | 3.3 | 1.5 | 0.9 | 0.0 | 24.0 | 4.0 |
2 | 2017020103 | 0.0 | 0.0 | 0.0 | -0.7 | 4.2 | 2.4 | 1 | 15 | 16 | 4.0 | 1.7 | 0.6 | 4.0 | 32.0 | 12.0 |
datetime | precipitation_utsunomiya | precipitation_chiba | precipitation_tokyo | temperature_utsunomiya | temperature_chiba | temperature_tokyo | winddirection_utsunomiya | winddirection_chiba | winddirection_tokyo | windspeed_utsunomiya | windspeed_chiba | windspeed_tokyo | pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020040101 | 0.0 | 0.0 | 0.0 | 9.5 | 10.5 | 9.0 | 14 | 2 | 14 | 2.1 | 2.3 | 1.2 | 0 | 0 | 0 |
1 | 2020040102 | 0.0 | 0.0 | 0.0 | 9.2 | 10.3 | 9.0 | 2 | 16 | 14 | 1.4 | 2.7 | 0.8 | 0 | 0 | 0 |
2 | 2020040103 | 0.0 | 0.0 | 0.0 | 9.2 | 10.2 | 9.1 | 16 | 16 | 12 | 3.3 | 2.5 | 0.5 | 0 | 0 | 0 |
datetime | pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|---|
0 | 2020040101 | 0 | 0 | 0 |
1 | 2020040102 | 0 | 0 | 0 |
2 | 2020040103 | 0 | 0 | 0 |
# Print column dtypes and non-null counts of the training data.
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12168 entries, 0 to 12167 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 12168 non-null int64 1 precipitation_utsunomiya 12168 non-null float64 2 precipitation_chiba 12168 non-null float64 3 precipitation_tokyo 12168 non-null object 4 temperature_utsunomiya 12168 non-null float64 5 temperature_chiba 12168 non-null object 6 temperature_tokyo 12168 non-null object 7 winddirection_utsunomiya 12168 non-null int64 8 winddirection_chiba 12168 non-null object 9 winddirection_tokyo 12168 non-null object 10 windspeed_utsunomiya 12168 non-null float64 11 windspeed_chiba 12168 non-null object 12 windspeed_tokyo 12168 non-null object 13 pollen_utsunomiya 12168 non-null float64 14 pollen_chiba 12168 non-null float64 15 pollen_tokyo 12168 non-null float64 dtypes: float64(7), int64(2), object(7) memory usage: 1.5+ MB
# Print column dtypes and non-null counts of the test data.
test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 336 entries, 0 to 335 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 336 non-null int64 1 precipitation_utsunomiya 336 non-null float64 2 precipitation_chiba 336 non-null float64 3 precipitation_tokyo 336 non-null float64 4 temperature_utsunomiya 336 non-null float64 5 temperature_chiba 336 non-null float64 6 temperature_tokyo 336 non-null float64 7 winddirection_utsunomiya 336 non-null int64 8 winddirection_chiba 336 non-null int64 9 winddirection_tokyo 336 non-null int64 10 windspeed_utsunomiya 336 non-null float64 11 windspeed_chiba 336 non-null float64 12 windspeed_tokyo 336 non-null float64 13 pollen_utsunomiya 336 non-null int64 14 pollen_chiba 336 non-null int64 15 pollen_tokyo 336 non-null int64 dtypes: float64(9), int64(7) memory usage: 42.1 KB
欠損はなし
trainに複数のobjectを含む
# Show only the columns of `train` that were loaded with object dtype.
train.select_dtypes(include=object)
precipitation_tokyo | temperature_chiba | temperature_tokyo | winddirection_chiba | winddirection_tokyo | windspeed_chiba | windspeed_tokyo | |
---|---|---|---|---|---|---|---|
0 | 0.0 | 4.1 | 2.9 | 1 | 2 | 2.5 | 1.3 |
1 | 0.0 | 4.2 | 2.6 | 1 | 1 | 1.5 | 0.9 |
2 | 0.0 | 4.2 | 2.4 | 15 | 16 | 1.7 | 0.6 |
3 | 0.0 | 4.4 | 1.8 | 15 | 1 | 3.1 | 1.4 |
4 | 0.0 | 4.1 | 1.5 | 14 | 14 | 3.4 | 0.9 |
... | ... | ... | ... | ... | ... | ... | ... |
12163 | 0 | 11.5 | 9.4 | 2 | 16 | 2.6 | 0.7 |
12164 | 0 | 11.3 | 8.9 | 15 | 14 | 1.7 | 1.3 |
12165 | 0 | 11.3 | 8.8 | 15 | 15 | 2.7 | 0.9 |
12166 | 0 | 10.9 | 8.9 | 16 | 1 | 2.9 | 0.6 |
12167 | 0 | 10.7 | 8.9 | 1 | 16 | 2.7 | 0.4 |
12168 rows × 7 columns
# Frequency of every distinct raw value in the Tokyo precipitation column.
train['precipitation_tokyo'].value_counts()
0.0 9952 0 1328 0.5 340 1.0 159 1.5 92 2.0 59 2.5 44 3.0 26 4.5 24 3.5 23 4.0 22 1 18 5.5 13 5.0 11 6.0 7 2 5 6.5 4 4 4 8.0 4 7.0 4 10.0 4 7.5 3 11.0 3 8.5 2 欠測 2 14.5 2 10.5 2 14.0 2 21.5 2 9.0 2 12.5 1 17.5 1 23.5 1 3 1 18.0 1 Name: precipitation_tokyo, dtype: int64
# For each object-dtype column, print the timestamps of the rows that hold the
# '欠測' label instead of a number.
missing_label = '欠測'
for col in train.select_dtypes(object).columns:
    is_missing = train[col].isin([missing_label])
    print(train.loc[is_missing, 'datetime'])
11795 2020031612 11796 2020031613 Name: datetime, dtype: int64 11146 2020021811 11147 2020021812 11148 2020021813 Name: datetime, dtype: int64 11794 2020031611 11795 2020031612 11796 2020031613 Name: datetime, dtype: int64 11146 2020021811 11147 2020021812 11148 2020021813 Name: datetime, dtype: int64 11794 2020031611 11795 2020031612 11796 2020031613 Name: datetime, dtype: int64 11146 2020021811 11147 2020021812 11148 2020021813 Name: datetime, dtype: int64 11794 2020031611 11795 2020031612 11796 2020031613 Name: datetime, dtype: int64
trainには数値だがobject型のデータが存在 -> 計6個の「欠測」ラベル (2020/02/18 11-13時、2020/03/16 11-13時)
precipitation_tokyoを見ると0と0.0のように区別されている場合がある
# object ('欠測') -> float
# The '欠測' label makes several numeric columns load as object dtype.
# Replace it with NaN, fill the gaps with a LightGBM-driven IterativeImputer
# and write the repaired numeric columns back into `train`.
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (must run before importing IterativeImputer)
from sklearn.impute import IterativeImputer

# Columns to repair, captured once so that every later step uses the same list.
object_cols = list(train.select_dtypes(object).columns)
wd_cols = [c for c in object_cols if c.startswith('winddirection_')]
measure_cols = [c for c in object_cols if c not in wd_cols]

# Explicit float cast: numeric strings such as '0' and '0.0' become the same
# number, and any other unexpected label fails loudly here.
train_df = train.replace('欠測', np.nan).astype(float)

# NOTE(review): IterativeImputer scales its stopping tolerance by the largest
# absolute value in the matrix. The integer-encoded `datetime` column (~2e9)
# dominates that value, so the early-stopping criterion is expected to be met
# after the first round regardless of `max_iter`.
lgb_imp = IterativeImputer(
    # n_estimators is the documented scikit-learn-API name for the tree count.
    estimator=LGBMRegressor(n_estimators=100, random_state=seed),
    max_iter=10,
    initial_strategy='mean',
    imputation_order='ascending',
    verbose=1,
    random_state=seed,
)
train_df = pd.DataFrame(
    lgb_imp.fit_transform(train_df),
    columns=train_df.columns,
    index=train_df.index,
)

# Wind direction is an integer code in 0..16; a regression estimate can land
# outside that range after rounding, so clamp it before casting to int.
train_df[wd_cols] = train_df[wd_cols].round().clip(0, 16).astype(int)
# The remaining measurements are reported with one decimal place.
train_df[measure_cols] = train_df[measure_cols].round(1)

# Only the formerly object-typed columns are written back to `train`.
train[object_cols] = train_df[object_cols]
train
[IterativeImputer] Completing matrix with shape (12168, 16) [IterativeImputer] Change: 8.54139755011368, scaled tolerance: 2020033.124 [IterativeImputer] Early stopping criterion reached.
datetime | precipitation_utsunomiya | precipitation_chiba | precipitation_tokyo | temperature_utsunomiya | temperature_chiba | temperature_tokyo | winddirection_utsunomiya | winddirection_chiba | winddirection_tokyo | windspeed_utsunomiya | windspeed_chiba | windspeed_tokyo | pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017020101 | 0.0 | 0.0 | 0.0 | -1.0 | 4.1 | 2.9 | 16 | 1 | 2 | 2.7 | 2.5 | 1.3 | 0.0 | 8.0 | 0.0 |
1 | 2017020102 | 0.0 | 0.0 | 0.0 | -1.1 | 4.2 | 2.6 | 1 | 1 | 1 | 3.3 | 1.5 | 0.9 | 0.0 | 24.0 | 4.0 |
2 | 2017020103 | 0.0 | 0.0 | 0.0 | -0.7 | 4.2 | 2.4 | 1 | 15 | 16 | 4.0 | 1.7 | 0.6 | 4.0 | 32.0 | 12.0 |
3 | 2017020104 | 0.0 | 0.0 | 0.0 | -1.1 | 4.4 | 1.8 | 1 | 15 | 1 | 4.1 | 3.1 | 1.4 | 0.0 | 12.0 | 0.0 |
4 | 2017020105 | 0.0 | 0.0 | 0.0 | -1.2 | 4.1 | 1.5 | 2 | 14 | 14 | 3.7 | 3.4 | 0.9 | 0.0 | 32.0 | 4.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
12163 | 2020033120 | 0.0 | 0.0 | 0.0 | 10.0 | 11.5 | 9.4 | 16 | 2 | 16 | 2.4 | 2.6 | 0.7 | 118.0 | 0.0 | 12.0 |
12164 | 2020033121 | 0.0 | 0.0 | 0.0 | 10.1 | 11.3 | 8.9 | 15 | 15 | 14 | 2.4 | 1.7 | 1.3 | 73.0 | 4.0 | 4.0 |
12165 | 2020033122 | 0.0 | 0.0 | 0.0 | 9.8 | 11.3 | 8.8 | 3 | 15 | 15 | 1.2 | 2.7 | 0.9 | 8.0 | 0.0 | 20.0 |
12166 | 2020033123 | 0.5 | 0.0 | 0.0 | 9.7 | 10.9 | 8.9 | 16 | 16 | 1 | 0.5 | 2.9 | 0.6 | 24.0 | 4.0 | 0.0 |
12167 | 2020033124 | 0.0 | 0.0 | 0.0 | 9.7 | 10.7 | 8.9 | 16 | 1 | 16 | 1.0 | 2.7 | 0.4 | 16.0 | 4.0 | 12.0 |
12168 rows × 16 columns
# Stack train and test into one frame and derive calendar features from the
# integer timestamp `datetime`, whose last two digits are the hour (1..24 in
# the data shown) and whose leading digits are the date.
df = pd.concat([train, test], ignore_index=True)
stamp = df['datetime'].astype(str)
date_part = pd.to_datetime(stamp.str[:-2])  # date only; the hour is kept separately
df['time'] = date_part
df['year'] = date_part.dt.year
df['month'] = date_part.dt.month
df['day'] = date_part.dt.day
df['hour'] = stamp.str[-2:].astype(int)
df['weekday'] = date_part.dt.weekday
df.head(3)
datetime | precipitation_utsunomiya | precipitation_chiba | precipitation_tokyo | temperature_utsunomiya | temperature_chiba | temperature_tokyo | winddirection_utsunomiya | winddirection_chiba | winddirection_tokyo | windspeed_utsunomiya | windspeed_chiba | windspeed_tokyo | pollen_utsunomiya | pollen_chiba | pollen_tokyo | time | year | month | day | hour | weekday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017020101 | 0.0 | 0.0 | 0.0 | -1.0 | 4.1 | 2.9 | 16 | 1 | 2 | 2.7 | 2.5 | 1.3 | 0.0 | 8.0 | 0.0 | 2017-02-01 | 2017 | 2 | 1 | 1 | 2 |
1 | 2017020102 | 0.0 | 0.0 | 0.0 | -1.1 | 4.2 | 2.6 | 1 | 1 | 1 | 3.3 | 1.5 | 0.9 | 0.0 | 24.0 | 4.0 | 2017-02-01 | 2017 | 2 | 1 | 2 | 2 |
2 | 2017020103 | 0.0 | 0.0 | 0.0 | -0.7 | 4.2 | 2.4 | 1 | 15 | 16 | 4.0 | 1.7 | 0.6 | 4.0 | 32.0 | 12.0 | 2017-02-01 | 2017 | 2 | 1 | 3 | 2 |
# Time series of the pollen count at each site, one panel per site.
plot_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
color = ['red', 'green', 'blue']
ncols = len(plot_col)
# Draw on the Axes returned by plt.subplots. Re-selecting panels with
# plt.subplot() can replace them with fresh axes on older Matplotlib, which
# silently drops the sharex/sharey linkage requested here.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
for ax, col, line_color in zip(axes, plot_col, color):
    ax.plot(df['time'], df[col], alpha=1, color=line_color, label=col)
    ax.set_xlabel(col)
    ax.legend()
    ax.grid()
plt.show()
# Rows that fall in July through January.
df.loc[df['month'].isin([7, 8, 9, 10, 11, 12, 1])]
datetime | precipitation_utsunomiya | precipitation_chiba | precipitation_tokyo | temperature_utsunomiya | temperature_chiba | temperature_tokyo | winddirection_utsunomiya | winddirection_chiba | winddirection_tokyo | windspeed_utsunomiya | windspeed_chiba | windspeed_tokyo | pollen_utsunomiya | pollen_chiba | pollen_tokyo | time | year | month | day | hour | weekday |
---|
# Summary statistics of the three pollen targets and the months in which
# counts above the 75th / 99th percentile occur.
target_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']

# `df` also contains the test rows, whose pollen columns appear to hold only
# 0 placeholders. Restrict to the training period (same cut-off as the
# grouped means further down) so they do not distort the statistics.
train_part = df[df.datetime <= 2020033124]

for col in target_col:
    print('='*10+col+'='*10)
    print(train_part[col].describe())
    for q, title in ((0.75, '<75%より多く花粉を含む(月)>'),
                     (0.99, '<99%より多く花粉を含む(月)>')):
        print(title)
        above = train_part[train_part[col] > train_part[col].quantile(q)]
        above.month.value_counts().plot(kind='barh', figsize=(5, 3))
        plt.show()
    print()
==========pollen_utsunomiya========== count 12504.000000 mean 84.292306 std 338.311803 min 0.000000 25% 0.000000 50% 16.000000 75% 57.000000 max 12193.000000 Name: pollen_utsunomiya, dtype: float64 <75%より多く花粉を含む(月)>
<99%より多く花粉を含む(月)>
==========pollen_chiba========== count 12504.000000 mean 28.822297 std 99.154223 min 0.000000 25% 0.000000 50% 8.000000 75% 24.000000 max 4141.000000 Name: pollen_chiba, dtype: float64 <75%より多く花粉を含む(月)>
<99%より多く花粉を含む(月)>
==========pollen_tokyo========== count 12504.000000 mean 25.973289 std 73.793359 min 0.000000 25% 0.000000 50% 4.000000 75% 20.000000 max 2209.000000 Name: pollen_tokyo, dtype: float64 <75%より多く花粉を含む(月)>
<99%より多く花粉を含む(月)>
各観測拠点でスケールが違う(宇都宮>千葉>東京)
7-1月のデータは存在しない
3月の花粉量がとても多い
# Histogram of the pollen count at each site, zoomed in on the range -1..500.
plot_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
color = ['red', 'green', 'blue']
ncols = len(plot_col)
# Draw on the Axes returned by plt.subplots. Re-selecting panels with
# plt.subplot() can replace them with fresh axes on older Matplotlib, which
# silently drops the sharex/sharey linkage requested here.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
for ax, col, bar_color in zip(axes, plot_col, color):
    ax.hist(df[col], bins=200, alpha=1, color=bar_color, label=col)
    ax.set_xlabel(col)
    ax.legend()
    ax.grid()
    ax.set_xlim(-1, 500)
plt.show()
# Mean pollen count per year, using February and March rows only.
df[df.month.isin([2, 3])].groupby('year')[target_col].mean().style.background_gradient()
pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|
year | |||
2017 | 66.026836 | 34.845339 | 29.654661 |
2018 | 252.665230 | 63.105603 | 70.619253 |
2019 | 193.033046 | 63.388649 | 32.174569 |
2020 | 25.973870 | 17.153249 | 12.936441 |
テストデータを含む2020年の花粉量は過去3年より少ない
# Mean pollen count per month over the training period (up to 2020-03-31 24h).
df[df.datetime <= 2020033124].groupby('month')[target_col].mean().style.background_gradient()
pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|
month | |||
2 | 25.908712 | 25.114015 | 14.764773 |
3 | 229.262769 | 61.629368 | 55.248320 |
4 | 106.966667 | 34.286111 | 37.822685 |
5 | 32.176523 | 16.237455 | 17.732527 |
6 | 0.203704 | 0.177778 | 0.044444 |
# Mean pollen count per hour of day over the training period.
df[df.datetime <= 2020033124].groupby('hour')[target_col].mean().style.background_gradient()
pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|
hour | |||
1 | 87.151874 | 24.960552 | 25.763314 |
2 | 104.230769 | 25.680473 | 24.100592 |
3 | 106.236686 | 25.019724 | 25.228797 |
4 | 107.980276 | 22.122288 | 26.536489 |
5 | 101.126233 | 25.763314 | 26.230769 |
6 | 79.039448 | 26.059172 | 27.944773 |
7 | 83.715976 | 31.885602 | 36.161736 |
8 | 90.887574 | 39.297830 | 41.773176 |
9 | 84.433925 | 44.159763 | 31.749507 |
10 | 80.832347 | 37.818540 | 28.428008 |
11 | 81.007890 | 34.001972 | 26.106509 |
12 | 83.358974 | 30.307692 | 24.307692 |
13 | 90.591716 | 30.011834 | 23.234714 |
14 | 88.502959 | 28.656805 | 21.566075 |
15 | 85.682446 | 31.741617 | 21.467456 |
16 | 83.394477 | 29.189349 | 20.660750 |
17 | 80.633136 | 27.506903 | 21.510848 |
18 | 80.648915 | 30.041420 | 26.364892 |
19 | 83.248521 | 28.358974 | 28.581854 |
20 | 80.625247 | 27.968442 | 28.899408 |
21 | 84.974359 | 26.153846 | 29.323471 |
22 | 78.532544 | 28.822485 | 25.189349 |
23 | 77.319527 | 27.686391 | 23.710059 |
24 | 74.721893 | 27.621302 | 25.731755 |
宇都宮は2-5時、千葉は8-10時頃、東京は7-8時頃に高い花粉量を観測
# Mean pollen count per weekday (0 = Monday) over the training period.
df[df.datetime <= 2020033124].groupby('weekday')[target_col].mean().style.background_gradient()
pollen_utsunomiya | pollen_chiba | pollen_tokyo | |
---|---|---|---|
weekday | |||
0 | 66.211806 | 23.478588 | 29.365162 |
1 | 77.891782 | 27.149306 | 26.777778 |
2 | 90.565972 | 39.984375 | 28.638889 |
3 | 82.493634 | 25.085069 | 29.924769 |
4 | 90.062215 | 24.257991 | 23.444635 |
5 | 121.541096 | 37.296804 | 23.267694 |
6 | 77.171233 | 30.037100 | 25.523402 |
宇都宮は土曜日、千葉は水曜日や土曜日に花粉量が多く、東京は金土曜日の花粉量が少ない
# Correlation heatmap of all numeric columns over the training period.
fig, ax = plt.subplots(figsize=(12, 10))
train_rows = df[df.datetime <= 2020033124]
corr_matrix = train_rows.drop(columns=['datetime', 'time']).corr()
sns.heatmap(corr_matrix, vmax=1, vmin=-1, center=0, ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7f0ed9ac4390>
降水量
# Precipitation over time at each site. Every year is drawn as its own line so
# the months without data are not bridged by a connecting segment.
plot_col = ['precipitation_utsunomiya', 'precipitation_chiba', 'precipitation_tokyo']
color = ['red', 'green', 'blue']
ncols = len(plot_col)
# Draw on the Axes returned by plt.subplots. Re-selecting panels with
# plt.subplot() can replace them with fresh axes on older Matplotlib, which
# silently drops the sharex/sharey linkage requested here.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
for ax, col, line_color in zip(axes, plot_col, color):
    for _, yearly in df.groupby('year', sort=False):
        ax.plot(yearly['time'], yearly[col], alpha=1, color=line_color)
    ax.set_xlabel(col)
    ax.grid()
plt.show()
# Pollen count against precipitation at the same site (x limited to -1..11).
plot_col = ['precipitation_utsunomiya', 'precipitation_chiba', 'precipitation_tokyo']
color = ['red', 'green', 'blue']
ncols = len(plot_col)
# Draw on the Axes returned by plt.subplots. Re-selecting panels with
# plt.subplot() can replace them with fresh axes on older Matplotlib, which
# silently drops the sharex/sharey linkage requested here.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
for ax, tcol, col, dot_color in zip(axes, target_col, plot_col, color):
    ax.scatter(df[col], df[tcol], alpha=1, color=dot_color)
    ax.set_xlabel(col)
    ax.set_ylabel(tcol)
    ax.grid()
    ax.set_xlim(-1, 11)
plt.show()
降水量が多い時は花粉量が少ない
気温
# Temperature at each site against the row index, one line per year with
# later years drawn more opaque, plus a 100-row rolling mean in black.
plot_col = ['temperature_utsunomiya', 'temperature_chiba', 'temperature_tokyo']
color = ['red', 'green', 'blue']
ncols = len(plot_col)
# Draw on the Axes returned by plt.subplots. Re-selecting panels with
# plt.subplot() can replace them with fresh axes on older Matplotlib, which
# silently drops the sharex/sharey linkage requested here.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
for ax, col, line_color in zip(axes, plot_col, color):
    for j, (_, yearly) in enumerate(df.groupby('year', sort=False)):
        # Matplotlib rejects alpha > 1, so cap it if more than four years appear.
        alpha = min(1.0, 0.25 * (j + 1))
        ax.plot(yearly.index, yearly[col], alpha=alpha, color=line_color)
        ax.plot(yearly.index, yearly[col].rolling(100).mean(), alpha=alpha, color='black')
    ax.set_xlabel(col)
    ax.grid()
plt.show()
# Pollen count against temperature at the same site.
plot_col = ['temperature_utsunomiya', 'temperature_chiba', 'temperature_tokyo']
color = ['red', 'green', 'blue']
ncols = len(plot_col)
# Draw on the Axes returned by plt.subplots. Re-selecting panels with
# plt.subplot() can replace them with fresh axes on older Matplotlib, which
# silently drops the sharex/sharey linkage requested here.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
for ax, tcol, col, dot_color in zip(axes, target_col, plot_col, color):
    ax.scatter(df[col], df[tcol], alpha=1, color=dot_color)
    ax.set_xlabel(col)
    ax.set_ylabel(tcol)
    ax.grid()
plt.show()
風向
# Wind-direction code -> compass name. The list position is the code:
# 0 is calm, 1..16 run clockwise from north-north-east round to north.
winddirection = dict(enumerate([
    '静穏',    # 0: calm
    '北北東',  # 1: NNE
    '北東',    # 2: NE
    '東北東',  # 3: ENE
    '東',      # 4: E
    '東南東',  # 5: ESE
    '南東',    # 6: SE
    '南南東',  # 7: SSE
    '南',      # 8: S
    '南南西',  # 9: SSW
    '南西',    # 10: SW
    '西南西',  # 11: WSW
    '西',      # 12: W
    '西北西',  # 13: WNW
    '北西',    # 14: NW
    '北北西',  # 15: NNW
    '北',      # 16: N
]))
# Copy of `df` in which the wind-direction codes of all three sites are
# replaced by their compass names; `df` itself keeps the numeric codes.
wd_cols = ['winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo']
df_wd = df.copy()
df_wd[wd_cols] = df_wd[wd_cols].apply(lambda codes: codes.map(winddirection))
df_wd.head(3)
datetime | precipitation_utsunomiya | precipitation_chiba | precipitation_tokyo | temperature_utsunomiya | temperature_chiba | temperature_tokyo | winddirection_utsunomiya | winddirection_chiba | winddirection_tokyo | windspeed_utsunomiya | windspeed_chiba | windspeed_tokyo | pollen_utsunomiya | pollen_chiba | pollen_tokyo | time | year | month | day | hour | weekday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2017020101 | 0.0 | 0.0 | 0.0 | -1.0 | 4.1 | 2.9 | 北 | 北北東 | 北東 | 2.7 | 2.5 | 1.3 | 0.0 | 8.0 | 0.0 | 2017-02-01 | 2017 | 2 | 1 | 1 | 2 |
1 | 2017020102 | 0.0 | 0.0 | 0.0 | -1.1 | 4.2 | 2.6 | 北北東 | 北北東 | 北北東 | 3.3 | 1.5 | 0.9 | 0.0 | 24.0 | 4.0 | 2017-02-01 | 2017 | 2 | 1 | 2 | 2 |
2 | 2017020103 | 0.0 | 0.0 | 0.0 | -0.7 | 4.2 | 2.4 | 北北東 | 北北西 | 北 | 4.0 | 1.7 | 0.6 | 4.0 | 32.0 | 12.0 | 2017-02-01 | 2017 | 2 | 1 | 3 | 2 |
# How often each wind direction occurs at each site.
plot_col = ['winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo']
ncols = len(plot_col)
# Only the x axis (the frequency) is shared. The bars are ordered by frequency,
# which differs per site, so a shared categorical y axis would put the wrong
# direction labels on some panels. The Axes returned by plt.subplots are used
# directly instead of re-selecting panels with plt.subplot().
fig, axes = plt.subplots(1, ncols, sharex=True, figsize=(30, 5))
for ax, col in zip(axes, plot_col):
    df_wd[col].value_counts().plot(kind='barh', ax=ax)
    # The bar length is a number of observations, not a pollen amount.
    ax.set_xlabel('count')
    ax.set_ylabel(col)
    ax.grid()
plt.show()
# Mean pollen count per wind direction at each site (training period only).
plot_col = ['winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo']
ncols = len(plot_col)
train_wd = df_wd[df_wd.datetime <= 2020033124]
# Only the x axis (the pollen scale) is shared: the set of directions present
# may differ per site, so each panel keeps its own category labels. The Axes
# returned by plt.subplots are used directly instead of re-selecting panels
# with plt.subplot().
fig, axes = plt.subplots(1, ncols, sharex=True, figsize=(30, 5))
for ax, tcol, col in zip(axes, target_col, plot_col):
    # Select the target before aggregating: averaging the whole frame would
    # also hit the string-valued direction columns, which pandas >= 2.0
    # rejects instead of silently dropping.
    train_wd.groupby(col)[tcol].mean().plot(kind='barh', ax=ax)
    ax.set_xlabel(tcol)
    ax.set_ylabel(col)
    ax.grid()
plt.show()
宇都宮は北風のとき花粉量が多く、千葉は南風のとき花粉量が少なく、東京は西風のとき花粉量が多い
静穏は東京のみ多く存在し、花粉量は多い
風速
# Wind speed at each site against the row index, one line per year with later
# years drawn more opaque, plus a 100-row rolling mean in black.
plot_col = ['windspeed_utsunomiya', 'windspeed_chiba', 'windspeed_tokyo']
color = ['red', 'green', 'blue']
ncols = len(plot_col)
# Draw on the Axes returned by plt.subplots. Re-selecting panels with
# plt.subplot() can replace them with fresh axes on older Matplotlib, which
# silently drops the sharex/sharey linkage requested here.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
for ax, col, line_color in zip(axes, plot_col, color):
    for j, (_, yearly) in enumerate(df.groupby('year', sort=False)):
        # Matplotlib rejects alpha > 1, so cap it if more than four years appear.
        alpha = min(1.0, 0.25 * (j + 1))
        ax.plot(yearly.index, yearly[col], alpha=alpha, color=line_color)
        ax.plot(yearly.index, yearly[col].rolling(100).mean(), alpha=alpha, color='black')
    ax.set_xlabel(col)
    ax.grid()
plt.show()
# Pollen count against wind speed at the same site.
plot_col = ['windspeed_utsunomiya', 'windspeed_chiba', 'windspeed_tokyo']
color = ['red', 'green', 'blue']
ncols = len(plot_col)
# Draw on the Axes returned by plt.subplots. Re-selecting panels with
# plt.subplot() can replace them with fresh axes on older Matplotlib, which
# silently drops the sharex/sharey linkage requested here.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5))
for ax, tcol, col, dot_color in zip(axes, target_col, plot_col, color):
    ax.scatter(df[col], df[tcol], alpha=1, color=dot_color)
    ax.set_xlabel(col)
    ax.set_ylabel(tcol)
    ax.grid()
plt.show()
!pip install folium
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: folium in /usr/local/lib/python3.7/dist-packages (0.12.1.post1) Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from folium) (2.23.0) Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from folium) (1.21.6) Requirement already satisfied: jinja2>=2.9 in /usr/local/lib/python3.7/dist-packages (from folium) (2.11.3) Requirement already satisfied: branca>=0.3.0 in /usr/local/lib/python3.7/dist-packages (from folium) (0.5.0) Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2>=2.9->folium) (2.0.1) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->folium) (2022.6.15) Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->folium) (3.0.4) Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->folium) (1.24.3) Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->folium) (2.10)
観測拠点は以下の3つ
# Interactive map with one marker per observation site.
import folium

# Site labels and coordinates are hard-coded in this cell.
cities = pd.DataFrame({
    'train': ['宇都宮(宇都宮市中央生涯学習センター)', '千葉(千葉県環境研究センター)', '府中(東京都多摩小平保健所)'],
    'latitude': [36.559444, 35.525312, 35.730062],
    'longitude': [139.882689, 140.068465, 139.516689],
})

# Named `site_map` rather than `map` so the builtin map() is not shadowed for
# the rest of the notebook.
site_map = folium.Map(width='75%', height='75%', location=[36, 140], zoom_start=8)
for site in cities.itertuples(index=False):
    folium.Marker(
        location=[site.latitude, site.longitude],
        popup=site.train,
    ).add_to(site_map)
site_map
地理情報は提供されていないが、位置的な関係を考慮して風向と風速から、時間差での影響をモデルに組み込む必要があるかもしれない
例えばそれぞれの観測拠点で多く見られる北北東の風や南、南西の風が吹いた場合、ある程度の時間を置いて別の拠点へと風が到達するかもしれない