EDA

# Library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install japanize_matplotlib
import japanize_matplotlib
%matplotlib inline
import seaborn as sns
from tqdm.auto import tqdm

import warnings
warnings.simplefilter('ignore')

# Mount Google Drive unless the mount point directory already exists.
from google.colab import drive

mount_point = '/content/drive'
if not os.path.isdir(mount_point):
    drive.mount(mount_point)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting japanize_matplotlib
  Downloading japanize-matplotlib-1.1.3.tar.gz (4.1 MB)
K     |████████████████████████████████| 4.1 MB 5.1 MB/s 
Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from japanize_matplotlib) (3.2.2)
Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.7/dist-packages (from matplotlib->japanize_matplotlib) (1.21.6)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->japanize_matplotlib) (3.0.9)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->japanize_matplotlib) (2.8.2)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->japanize_matplotlib) (0.11.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->japanize_matplotlib) (1.4.4)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from kiwisolver>=1.0.1->matplotlib->japanize_matplotlib) (4.1.1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib->japanize_matplotlib) (1.15.0)
Building wheels for collected packages: japanize-matplotlib
  Building wheel for japanize-matplotlib (setup.py) ... done
  Created wheel for japanize-matplotlib: filename=japanize_matplotlib-1.1.3-py3-none-any.whl size=4120275 sha256=9b1d2daa9fdeae4c56750925777085fa2172df4f0066449e60ea2c71cffcbccd
  Stored in directory: /root/.cache/pip/wheels/83/97/6b/e9e0cde099cc40f972b8dd23367308f7705ae06cd6d4714658
Successfully built japanize-matplotlib
Installing collected packages: japanize-matplotlib
Successfully installed japanize-matplotlib-1.1.3
Mounted at /content/drive

構成

MyDrive/ML/PROBSPACE
├<pollen_counts>
   ├<notebook>
   │ └eda.ipynb
   ├<input>
   │ ├train.csv
   │ ├submission.csv
   │ └test.csv
   └<output>
# Config: locations of the competition files on the mounted Drive.
DRIVE_PATH = "/content/drive/MyDrive/ML/PROBSPACE/pollen_counts"
INPUT, OUTPUT = (os.path.join(DRIVE_PATH, folder) for folder in ("input", "output"))

TRAIN_FILE, TEST_FILE, SUB_FILE = (
    os.path.join(INPUT, name) for name in ("train.csv", "test.csv", "submission.csv")
)

# Random seed passed to the estimators used in this notebook.
seed = 42

# Display / plot style: show up to 200 columns and set the axes background colour.
pd.options.display.max_columns = 200
plt.rcParams.update({'axes.facecolor': 'EEFFFE'})
# Data: read the train, test and submission CSV files into DataFrames.
train, test, sub = (pd.read_csv(path) for path in (TRAIN_FILE, TEST_FILE, SUB_FILE))

データの中身を見る

# Show the first three rows of each table.
for frame in (train, test, sub):
    display(frame.head(3))
datetime precipitation_utsunomiya precipitation_chiba precipitation_tokyo temperature_utsunomiya temperature_chiba temperature_tokyo winddirection_utsunomiya winddirection_chiba winddirection_tokyo windspeed_utsunomiya windspeed_chiba windspeed_tokyo pollen_utsunomiya pollen_chiba pollen_tokyo
0 2017020101 0.0 0.0 0.0 -1.0 4.1 2.9 16 1 2 2.7 2.5 1.3 0.0 8.0 0.0
1 2017020102 0.0 0.0 0.0 -1.1 4.2 2.6 1 1 1 3.3 1.5 0.9 0.0 24.0 4.0
2 2017020103 0.0 0.0 0.0 -0.7 4.2 2.4 1 15 16 4.0 1.7 0.6 4.0 32.0 12.0
datetime precipitation_utsunomiya precipitation_chiba precipitation_tokyo temperature_utsunomiya temperature_chiba temperature_tokyo winddirection_utsunomiya winddirection_chiba winddirection_tokyo windspeed_utsunomiya windspeed_chiba windspeed_tokyo pollen_utsunomiya pollen_chiba pollen_tokyo
0 2020040101 0.0 0.0 0.0 9.5 10.5 9.0 14 2 14 2.1 2.3 1.2 0 0 0
1 2020040102 0.0 0.0 0.0 9.2 10.3 9.0 2 16 14 1.4 2.7 0.8 0 0 0
2 2020040103 0.0 0.0 0.0 9.2 10.2 9.1 16 16 12 3.3 2.5 0.5 0 0 0
datetime pollen_utsunomiya pollen_chiba pollen_tokyo
0 2020040101 0 0 0
1 2020040102 0 0 0
2 2020040103 0 0 0
# Print column dtypes, non-null counts and memory usage of the training table.
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12168 entries, 0 to 12167
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   datetime                  12168 non-null  int64  
 1   precipitation_utsunomiya  12168 non-null  float64
 2   precipitation_chiba       12168 non-null  float64
 3   precipitation_tokyo       12168 non-null  object 
 4   temperature_utsunomiya    12168 non-null  float64
 5   temperature_chiba         12168 non-null  object 
 6   temperature_tokyo         12168 non-null  object 
 7   winddirection_utsunomiya  12168 non-null  int64  
 8   winddirection_chiba       12168 non-null  object 
 9   winddirection_tokyo       12168 non-null  object 
 10  windspeed_utsunomiya      12168 non-null  float64
 11  windspeed_chiba           12168 non-null  object 
 12  windspeed_tokyo           12168 non-null  object 
 13  pollen_utsunomiya         12168 non-null  float64
 14  pollen_chiba              12168 non-null  float64
 15  pollen_tokyo              12168 non-null  float64
dtypes: float64(7), int64(2), object(7)
memory usage: 1.5+ MB
# Print column dtypes, non-null counts and memory usage of the test table.
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336 entries, 0 to 335
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   datetime                  336 non-null    int64  
 1   precipitation_utsunomiya  336 non-null    float64
 2   precipitation_chiba       336 non-null    float64
 3   precipitation_tokyo       336 non-null    float64
 4   temperature_utsunomiya    336 non-null    float64
 5   temperature_chiba         336 non-null    float64
 6   temperature_tokyo         336 non-null    float64
 7   winddirection_utsunomiya  336 non-null    int64  
 8   winddirection_chiba       336 non-null    int64  
 9   winddirection_tokyo       336 non-null    int64  
 10  windspeed_utsunomiya      336 non-null    float64
 11  windspeed_chiba           336 non-null    float64
 12  windspeed_tokyo           336 non-null    float64
 13  pollen_utsunomiya         336 non-null    int64  
 14  pollen_chiba              336 non-null    int64  
 15  pollen_tokyo              336 non-null    int64  
dtypes: float64(9), int64(7)
memory usage: 42.1 KB

欠損はなし
trainに複数のobjectを含む

# Show only the columns of the training table that were read as object dtype.
train.select_dtypes(include=object)
precipitation_tokyo temperature_chiba temperature_tokyo winddirection_chiba winddirection_tokyo windspeed_chiba windspeed_tokyo
0 0.0 4.1 2.9 1 2 2.5 1.3
1 0.0 4.2 2.6 1 1 1.5 0.9
2 0.0 4.2 2.4 15 16 1.7 0.6
3 0.0 4.4 1.8 15 1 3.1 1.4
4 0.0 4.1 1.5 14 14 3.4 0.9
... ... ... ... ... ... ... ...
12163 0 11.5 9.4 2 16 2.6 0.7
12164 0 11.3 8.9 15 14 1.7 1.3
12165 0 11.3 8.8 15 15 2.7 0.9
12166 0 10.9 8.9 16 1 2.9 0.6
12167 0 10.7 8.9 1 16 2.7 0.4

12168 rows × 7 columns

# Frequency of each distinct raw value in the Tokyo precipitation column.
train['precipitation_tokyo'].value_counts()
0.0     9952
0       1328
0.5      340
1.0      159
1.5       92
2.0       59
2.5       44
3.0       26
4.5       24
3.5       23
4.0       22
1         18
5.5       13
5.0       11
6.0        7
2          5
6.5        4
4          4
8.0        4
7.0        4
10.0       4
7.5        3
11.0       3
8.5        2
欠測         2
14.5       2
10.5       2
14.0       2
21.5       2
9.0        2
12.5       1
17.5       1
23.5       1
3          1
18.0       1
Name: precipitation_tokyo, dtype: int64
# For every object-typed column, print the timestamps whose cell holds the
# missing-observation marker.
for name in train.select_dtypes(object):
    print(train.loc[train[name] == '欠測', 'datetime'])
11795    2020031612
11796    2020031613
Name: datetime, dtype: int64
11146    2020021811
11147    2020021812
11148    2020021813
Name: datetime, dtype: int64
11794    2020031611
11795    2020031612
11796    2020031613
Name: datetime, dtype: int64
11146    2020021811
11147    2020021812
11148    2020021813
Name: datetime, dtype: int64
11794    2020031611
11795    2020031612
11796    2020031613
Name: datetime, dtype: int64
11146    2020021811
11147    2020021812
11148    2020021813
Name: datetime, dtype: int64
11794    2020031611
11795    2020031612
11796    2020031613
Name: datetime, dtype: int64

trainには数値だがobject型の列が存在 -> 原因は「欠測」ラベル。該当時刻は計6つ (2020/02/18 11-13時、2020/03/16 11-13時) で、7列合計で延べ20セル
precipitation_tokyoを見ると0と0.0のように同じ値が区別されている場合がある (object型の列で数値と文字列が混在しているためと思われる)

# object (missing-observation marker) -> float
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Columns that were read as object because some cells hold the marker string.
obj_cols = list(train.select_dtypes(object).columns)
wind_dir_cols = ['winddirection_chiba', 'winddirection_tokyo']
one_decimal_cols = ['precipitation_tokyo', 'temperature_chiba', 'temperature_tokyo',
                    'windspeed_chiba', 'windspeed_tokyo']

# Turn the marker into NaN and cast to float; the cast fails loudly if any
# other non-numeric string is present instead of silently passing it on.
train_df = train.replace('欠測', np.nan).astype(float)

# Impute over the weather measurements only. IterativeImputer scales its
# stopping tolerance by the largest absolute value in the matrix; with the raw
# `datetime` integers (~2e9) included, the logged scaled tolerance was ~2e6 and
# the loop stopped after a single round. The pollen counts (up to ~1.2e4)
# would dominate the scale in the same way, so they are left out as well.
weather_prefixes = ('precipitation_', 'temperature_', 'winddirection_', 'windspeed_')
weather_cols = [c for c in train_df.columns if c.startswith(weather_prefixes)]

lgb_imp = IterativeImputer(
    # n_estimators is the scikit-learn API name for the number of boosting rounds.
    estimator=LGBMRegressor(n_estimators=100, random_state=seed),
    max_iter=10,
    initial_strategy='mean',
    imputation_order='ascending',
    verbose=1,
    random_state=seed,
)
train_df[weather_cols] = lgb_imp.fit_transform(train_df[weather_cols])

# Wind direction is an integer code in 0-16: round and keep it inside that range.
train_df[wind_dir_cols] = train_df[wind_dir_cols].round().clip(0, 16).astype(int)
train_df[one_decimal_cols] = train_df[one_decimal_cols].round(1)

# Write the cleaned numeric columns back over the original object columns.
train[obj_cols] = train_df[obj_cols]
train
[IterativeImputer] Completing matrix with shape (12168, 16)
[IterativeImputer] Change: 8.54139755011368, scaled tolerance: 2020033.124 
[IterativeImputer] Early stopping criterion reached.
datetime precipitation_utsunomiya precipitation_chiba precipitation_tokyo temperature_utsunomiya temperature_chiba temperature_tokyo winddirection_utsunomiya winddirection_chiba winddirection_tokyo windspeed_utsunomiya windspeed_chiba windspeed_tokyo pollen_utsunomiya pollen_chiba pollen_tokyo
0 2017020101 0.0 0.0 0.0 -1.0 4.1 2.9 16 1 2 2.7 2.5 1.3 0.0 8.0 0.0
1 2017020102 0.0 0.0 0.0 -1.1 4.2 2.6 1 1 1 3.3 1.5 0.9 0.0 24.0 4.0
2 2017020103 0.0 0.0 0.0 -0.7 4.2 2.4 1 15 16 4.0 1.7 0.6 4.0 32.0 12.0
3 2017020104 0.0 0.0 0.0 -1.1 4.4 1.8 1 15 1 4.1 3.1 1.4 0.0 12.0 0.0
4 2017020105 0.0 0.0 0.0 -1.2 4.1 1.5 2 14 14 3.7 3.4 0.9 0.0 32.0 4.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12163 2020033120 0.0 0.0 0.0 10.0 11.5 9.4 16 2 16 2.4 2.6 0.7 118.0 0.0 12.0
12164 2020033121 0.0 0.0 0.0 10.1 11.3 8.9 15 15 14 2.4 1.7 1.3 73.0 4.0 4.0
12165 2020033122 0.0 0.0 0.0 9.8 11.3 8.8 3 15 15 1.2 2.7 0.9 8.0 0.0 20.0
12166 2020033123 0.5 0.0 0.0 9.7 10.9 8.9 16 16 1 0.5 2.9 0.6 24.0 4.0 0.0
12167 2020033124 0.0 0.0 0.0 9.7 10.7 8.9 16 1 16 1.0 2.7 0.4 16.0 4.0 12.0

12168 rows × 16 columns

datetimeの分割

# Stack train and test, then derive calendar features from the YYYYMMDDHH integer.
df = pd.concat([train, test], ignore_index=True)
stamp = df['datetime'].astype(str)

# `time` holds the date part only; the trailing two digits are the hour.
df['time'] = pd.to_datetime(stamp.str.slice(stop=-2))
for part in ('year', 'month', 'day'):
    df[part] = getattr(df['time'].dt, part)
df['hour'] = stamp.str.slice(start=-2).astype(int)
df['weekday'] = df['time'].dt.weekday
df.head(3)
datetime precipitation_utsunomiya precipitation_chiba precipitation_tokyo temperature_utsunomiya temperature_chiba temperature_tokyo winddirection_utsunomiya winddirection_chiba winddirection_tokyo windspeed_utsunomiya windspeed_chiba windspeed_tokyo pollen_utsunomiya pollen_chiba pollen_tokyo time year month day hour weekday
0 2017020101 0.0 0.0 0.0 -1.0 4.1 2.9 16 1 2 2.7 2.5 1.3 0.0 8.0 0.0 2017-02-01 2017 2 1 1 2
1 2017020102 0.0 0.0 0.0 -1.1 4.2 2.6 1 1 1 3.3 1.5 0.9 0.0 24.0 4.0 2017-02-01 2017 2 1 2 2
2 2017020103 0.0 0.0 0.0 -0.7 4.2 2.4 1 15 16 4.0 1.7 0.6 4.0 32.0 12.0 2017-02-01 2017 2 1 3 2

各観測拠点での花粉量を見る

# Pollen counts over time, one panel per observation site.
plot_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
color = ['red','green','blue']
ncols = len(plot_col)
# Draw on the axes returned by plt.subplots. Re-creating them with plt.subplot
# can, depending on the Matplotlib version, replace the shared axes and drop
# the sharex/sharey linkage.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5), squeeze=False)
for ax, col, c in zip(axes.ravel(), plot_col, color):
    ax.plot(df.time, df[col], alpha=1, color=c, label=col)
    ax.set_xlabel(col)
    ax.legend()
    ax.grid(True)
plt.show()
# Rows whose month falls in July through January.
off_season_months = [7, 8, 9, 10, 11, 12, 1]
df.loc[df['month'].isin(off_season_months)]
datetime precipitation_utsunomiya precipitation_chiba precipitation_tokyo temperature_utsunomiya temperature_chiba temperature_tokyo winddirection_utsunomiya winddirection_chiba winddirection_tokyo windspeed_utsunomiya windspeed_chiba windspeed_tokyo pollen_utsunomiya pollen_chiba pollen_tokyo time year month day hour weekday
# Summary statistics per site, plus the months in which readings exceed the
# 75th and 99th percentiles.
target_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']

# Use the training period only: the test rows appear to carry all-zero
# placeholder targets, which would bias the statistics and quantiles.
# NOTE(review): confirm the test pollen columns are placeholders.
labelled = df[df.datetime <= 2020033124]

for col in target_col:
    print('='*10+col+'='*10)
    print(labelled[col].describe())
    for q, title in ((0.75, '<75%より多く花粉を含む(月)>'), (0.99, '<99%より多く花粉を含む(月)>')):
        print(title)
        threshold = labelled[col].quantile(q)
        months_above = labelled.loc[labelled[col] > threshold, 'month']
        months_above.value_counts().plot(kind='barh', figsize=(5, 3))
        plt.show()
    print()
==========pollen_utsunomiya==========
count    12504.000000
mean        84.292306
std        338.311803
min          0.000000
25%          0.000000
50%         16.000000
75%         57.000000
max      12193.000000
Name: pollen_utsunomiya, dtype: float64
<75%より多く花粉を含む(月)>
<99%より多く花粉を含む(月)>
==========pollen_chiba==========
count    12504.000000
mean        28.822297
std         99.154223
min          0.000000
25%          0.000000
50%          8.000000
75%         24.000000
max       4141.000000
Name: pollen_chiba, dtype: float64
<75%より多く花粉を含む(月)>
<99%より多く花粉を含む(月)>
==========pollen_tokyo==========
count    12504.000000
mean        25.973289
std         73.793359
min          0.000000
25%          0.000000
50%          4.000000
75%         20.000000
max       2209.000000
Name: pollen_tokyo, dtype: float64
<75%より多く花粉を含む(月)>
<99%より多く花粉を含む(月)>

各観測拠点でスケールが違う(宇都宮>千葉>東京)
7-1月のデータは存在しない
3月の花粉量がとても多い

# Histogram of pollen counts per site, zoomed to the 0-500 range.
plot_col = ['pollen_utsunomiya', 'pollen_chiba', 'pollen_tokyo']
color = ['red','green','blue']
ncols = len(plot_col)
# Draw on the axes returned by plt.subplots. Re-creating them with plt.subplot
# can, depending on the Matplotlib version, replace the shared axes and drop
# the sharex/sharey linkage.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5), squeeze=False)
for ax, col, c in zip(axes.ravel(), plot_col, color):
    ax.hist(df[col], bins=200, alpha=1, color=c, label=col)
    ax.set_xlabel(col)
    ax.legend()
    ax.grid(True)
    ax.set_xlim(-1, 500)
plt.show()

花粉量と時間との関係を見る

# Mean pollen per site for each year, restricted to February and March rows.
feb_mar = df.loc[df['month'].isin([2, 3])]
feb_mar.groupby('year')[target_col].mean().style.background_gradient()
  pollen_utsunomiya pollen_chiba pollen_tokyo
year      
2017 66.026836 34.845339 29.654661
2018 252.665230 63.105603 70.619253
2019 193.033046 63.388649 32.174569
2020 25.973870 17.153249 12.936441

2020年(2-3月)の花粉量は過去3年より少ない(この集計は2-3月のみが対象で、4月のテスト期間は含まれない)

# Mean pollen per site for each month, using rows up to the end of March 2020.
train_period = df.loc[df['datetime'] <= 2020033124]
train_period.groupby('month')[target_col].mean().style.background_gradient()
  pollen_utsunomiya pollen_chiba pollen_tokyo
month      
2 25.908712 25.114015 14.764773
3 229.262769 61.629368 55.248320
4 106.966667 34.286111 37.822685
5 32.176523 16.237455 17.732527
6 0.203704 0.177778 0.044444
# Mean pollen per site for each hour of day, using rows up to the end of March 2020.
hourly_mean = df.query('datetime <= 2020033124').groupby('hour')[target_col].mean()
hourly_mean.style.background_gradient()
  pollen_utsunomiya pollen_chiba pollen_tokyo
hour      
1 87.151874 24.960552 25.763314
2 104.230769 25.680473 24.100592
3 106.236686 25.019724 25.228797
4 107.980276 22.122288 26.536489
5 101.126233 25.763314 26.230769
6 79.039448 26.059172 27.944773
7 83.715976 31.885602 36.161736
8 90.887574 39.297830 41.773176
9 84.433925 44.159763 31.749507
10 80.832347 37.818540 28.428008
11 81.007890 34.001972 26.106509
12 83.358974 30.307692 24.307692
13 90.591716 30.011834 23.234714
14 88.502959 28.656805 21.566075
15 85.682446 31.741617 21.467456
16 83.394477 29.189349 20.660750
17 80.633136 27.506903 21.510848
18 80.648915 30.041420 26.364892
19 83.248521 28.358974 28.581854
20 80.625247 27.968442 28.899408
21 84.974359 26.153846 29.323471
22 78.532544 28.822485 25.189349
23 77.319527 27.686391 23.710059
24 74.721893 27.621302 25.731755

宇都宮は2-5時、千葉は8-10時頃、東京は7-8時頃に高い花粉量を観測

# Mean pollen per site for each weekday (0 = Monday), rows up to the end of March 2020.
in_train_period = df['datetime'] <= 2020033124
df[in_train_period].groupby('weekday')[target_col].mean().style.background_gradient()
  pollen_utsunomiya pollen_chiba pollen_tokyo
weekday      
0 66.211806 23.478588 29.365162
1 77.891782 27.149306 26.777778
2 90.565972 39.984375 28.638889
3 82.493634 25.085069 29.924769
4 90.062215 24.257991 23.444635
5 121.541096 37.296804 23.267694
6 77.171233 30.037100 25.523402

宇都宮は土曜日、千葉は水曜日や土曜日に花粉量が多く、東京は金土曜日の花粉量が少ない

花粉量と天気情報(降水量・気温・風向・風速)との関係を見る

# Correlation heatmap of all columns except the two timestamp columns,
# computed on rows up to the end of March 2020.
fig, ax = plt.subplots(figsize=(12, 10))
corr = df[df.datetime <= 2020033124].drop(columns=['datetime', 'time']).corr()
sns.heatmap(corr, vmax=1, vmin=-1, center=0, ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7f0ed9ac4390>

降水量

# Precipitation over time, one panel per site and one line per year.
plot_col = ['precipitation_utsunomiya', 'precipitation_chiba', 'precipitation_tokyo']
color = ['red','green','blue']
ncols = len(plot_col)
# Draw on the axes returned by plt.subplots. Re-creating them with plt.subplot
# can, depending on the Matplotlib version, replace the shared axes and drop
# the sharex/sharey linkage.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5), squeeze=False)
for ax, col, c in zip(axes.ravel(), plot_col, color):
    # Group once per panel instead of filtering the frame twice per year.
    for _, yearly in df.groupby('year'):
        ax.plot(yearly['time'], yearly[col], alpha=1, color=c)
    ax.set_xlabel(col)
    ax.grid(True)
plt.show()
# Pollen count against precipitation at the same site, x axis limited to -1..11.
plot_col = ['precipitation_utsunomiya', 'precipitation_chiba', 'precipitation_tokyo']
color = ['red','green','blue']
ncols = len(plot_col)
# Draw on the axes returned by plt.subplots. Re-creating them with plt.subplot
# can, depending on the Matplotlib version, replace the shared axes and drop
# the sharex/sharey linkage.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5), squeeze=False)
for ax, tcol, col, c in zip(axes.ravel(), target_col, plot_col, color):
    ax.scatter(df[col], df[tcol], alpha=1, color=c)
    ax.set_xlabel(col)
    ax.set_ylabel(tcol)
    ax.grid(True)
    ax.set_xlim(-1, 11)
plt.show()

降水量が多い時は花粉量が少ない

気温

# Temperature by row index, one panel per site. Each year is drawn with
# increasing opacity, together with its 100-row rolling mean in black.
plot_col = ['temperature_utsunomiya', 'temperature_chiba', 'temperature_tokyo']
color = ['red','green','blue']
ncols = len(plot_col)
years = df['year'].unique()
# Draw on the axes returned by plt.subplots. Re-creating them with plt.subplot
# can, depending on the Matplotlib version, replace the shared axes and drop
# the sharex/sharey linkage.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5), squeeze=False)
for ax, col, c in zip(axes.ravel(), plot_col, color):
    for j, y in enumerate(years):
        yearly = df.loc[df['year'] == y, col]
        # Scale by the number of years so alpha never exceeds 1
        # (equals 0.25 * (j + 1) for the four years present).
        alpha = (j + 1) / len(years)
        ax.plot(yearly.index, yearly, alpha=alpha, color=c)
        ax.plot(yearly.index, yearly.rolling(100).mean(), alpha=alpha, color='black')
    ax.set_xlabel(col)
    ax.grid(True)
plt.show()
# Pollen count against temperature at the same site.
plot_col = ['temperature_utsunomiya', 'temperature_chiba', 'temperature_tokyo']
color = ['red','green','blue']
ncols = len(plot_col)
# Draw on the axes returned by plt.subplots. Re-creating them with plt.subplot
# can, depending on the Matplotlib version, replace the shared axes and drop
# the sharex/sharey linkage.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5), squeeze=False)
for ax, tcol, col, c in zip(axes.ravel(), target_col, plot_col, color):
    ax.scatter(df[col], df[tcol], alpha=1, color=c)
    ax.set_xlabel(col)
    ax.set_ylabel(tcol)
    ax.grid(True)
plt.show()

風向

# Wind-direction code -> compass label. Code 0 is calm; codes 1-16 are the
# 16 compass points listed clockwise from north-north-east to north.
winddirection = dict(enumerate([
    '静穏',
    '北北東',
    '北東',
    '東北東',
    '東',
    '東南東',
    '南東',
    '南南東',
    '南',
    '南南西',
    '南西',
    '西南西',
    '西',
    '西北西',
    '北西',
    '北北西',
    '北',
]))

# Copy of df in which the three wind-direction columns hold labels instead of codes.
df_wd = df.copy()
wind_cols = ['winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo']
df_wd[wind_cols] = df_wd[wind_cols].apply(lambda codes: codes.map(winddirection))
df_wd.head(3)
datetime precipitation_utsunomiya precipitation_chiba precipitation_tokyo temperature_utsunomiya temperature_chiba temperature_tokyo winddirection_utsunomiya winddirection_chiba winddirection_tokyo windspeed_utsunomiya windspeed_chiba windspeed_tokyo pollen_utsunomiya pollen_chiba pollen_tokyo time year month day hour weekday
0 2017020101 0.0 0.0 0.0 -1.0 4.1 2.9 北 北北東 北東 2.7 2.5 1.3 0.0 8.0 0.0 2017-02-01 2017 2 1 1 2
1 2017020102 0.0 0.0 0.0 -1.1 4.2 2.6 北北東 北北東 北北東 3.3 1.5 0.9 0.0 24.0 4.0 2017-02-01 2017 2 1 2 2
2 2017020103 0.0 0.0 0.0 -0.7 4.2 2.4 北北東 北北西 北 4.0 1.7 0.6 4.0 32.0 12.0 2017-02-01 2017 2 1 3 2
# How often each wind direction occurs, one panel per site.
plot_col = ['winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo']
ncols = len(plot_col)
# The y axis is NOT shared: each panel orders its direction labels by its own
# frequency, and a shared categorical axis would show one panel's tick labels
# on every panel. The axes returned by plt.subplots are used directly rather
# than re-created with plt.subplot.
fig, axes = plt.subplots(1, ncols, sharex=True, figsize=(30, 5), squeeze=False)
for ax, col in zip(axes.ravel(), plot_col):
    df_wd[col].value_counts().plot(kind='barh', ax=ax)
    # The bars are occurrence counts, not pollen amounts.
    ax.set_xlabel('count')
    ax.set_ylabel(col)
    ax.grid(True)
plt.show()
# Mean pollen count per wind direction, one panel per site (rows up to the
# end of March 2020 only).
plot_col = ['winddirection_utsunomiya', 'winddirection_chiba', 'winddirection_tokyo']
ncols = len(plot_col)
labelled = df_wd[df_wd.datetime <= 2020033124]
# The y axis is NOT shared: the direction labels are categorical and a shared
# axis would show one panel's tick labels on every panel. The axes returned by
# plt.subplots are used directly rather than re-created with plt.subplot.
fig, axes = plt.subplots(1, ncols, sharex=True, figsize=(30, 5), squeeze=False)
for ax, tcol, col in zip(axes.ravel(), target_col, plot_col):
    # Select the target before averaging: the other wind-direction columns
    # hold strings, and averaging every column raises on pandas >= 2.0.
    labelled.groupby(col)[tcol].mean().plot(kind='barh', ax=ax)
    ax.set_xlabel(tcol)
    ax.set_ylabel(col)
    ax.grid(True)
plt.show()

宇都宮は北風のとき花粉量が多く、千葉は南風のとき花粉量が少なく、東京は西風のとき花粉量が多い
静穏は東京のみ多く存在し、花粉量は多い

風速

# Wind speed by row index, one panel per site. Each year is drawn with
# increasing opacity, together with its 100-row rolling mean in black.
plot_col = ['windspeed_utsunomiya', 'windspeed_chiba', 'windspeed_tokyo']
color = ['red','green','blue']
ncols = len(plot_col)
years = df['year'].unique()
# Draw on the axes returned by plt.subplots. Re-creating them with plt.subplot
# can, depending on the Matplotlib version, replace the shared axes and drop
# the sharex/sharey linkage.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5), squeeze=False)
for ax, col, c in zip(axes.ravel(), plot_col, color):
    for j, y in enumerate(years):
        yearly = df.loc[df['year'] == y, col]
        # Scale by the number of years so alpha never exceeds 1
        # (equals 0.25 * (j + 1) for the four years present).
        alpha = (j + 1) / len(years)
        ax.plot(yearly.index, yearly, alpha=alpha, color=c)
        ax.plot(yearly.index, yearly.rolling(100).mean(), alpha=alpha, color='black')
    ax.set_xlabel(col)
    ax.grid(True)
plt.show()
# Pollen count against wind speed at the same site.
plot_col = ['windspeed_utsunomiya', 'windspeed_chiba', 'windspeed_tokyo']
color = ['red','green','blue']
ncols = len(plot_col)
# Draw on the axes returned by plt.subplots. Re-creating them with plt.subplot
# can, depending on the Matplotlib version, replace the shared axes and drop
# the sharex/sharey linkage.
fig, axes = plt.subplots(1, ncols, sharey=True, sharex=True, figsize=(30, 5), squeeze=False)
for ax, tcol, col, c in zip(axes.ravel(), target_col, plot_col, color):
    ax.scatter(df[col], df[tcol], alpha=1, color=c)
    ax.set_xlabel(col)
    ax.set_ylabel(tcol)
    ax.grid(True)
plt.show()

風向と風速と地理関係

!pip install folium
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: folium in /usr/local/lib/python3.7/dist-packages (0.12.1.post1)
Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from folium) (2.23.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from folium) (1.21.6)
Requirement already satisfied: jinja2>=2.9 in /usr/local/lib/python3.7/dist-packages (from folium) (2.11.3)
Requirement already satisfied: branca>=0.3.0 in /usr/local/lib/python3.7/dist-packages (from folium) (0.5.0)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2>=2.9->folium) (2.0.1)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->folium) (2022.6.15)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->folium) (3.0.4)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->folium) (1.24.3)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->folium) (2.10)

観測拠点は以下の3つ

  • 宇都宮(宇都宮市中央生涯学習センター)
  • 千葉(千葉県環境研究センター)
  • 府中(東京都多摩小平保健所)
import folium

# The three observation sites with their coordinates. The 'train' column holds
# the site label and 'longtude' the longitude; both keys are kept as-is.
cities = pd.DataFrame({
    'train': ['宇都宮(宇都宮市中央生涯学習センター)', '千葉(千葉県環境研究センター)', '府中(東京都多摩小平保健所)'],
    'latitude': [36.559444, 35.525312, 35.730062],
    'longtude': [139.882689, 140.068465, 139.516689],
})

# Named station_map rather than `map` so the builtin map() is not shadowed
# for the rest of the notebook.
station_map = folium.Map(width='75%', height='75%', location=[36, 140], zoom_start=8)

# One marker per site; the popup shows the site label.
for label, lat, lon in zip(cities['train'], cities['latitude'], cities['longtude']):
    folium.Marker(location=[lat, lon], popup=label).add_to(station_map)
station_map
Make this Notebook Trusted to load map: File -> Trust Notebook

地理情報は提供されていないが、位置関係を考慮して、風向と風速から時間差での影響をモデルに組み込む必要があるかもしれない
例えば、それぞれの観測拠点で多く見られる北北東の風や南・南西の風が吹いた場合、ある程度の時間を置いて別の拠点へと風が到達するかもしれない

添付データ

  • eda.ipynb?X-Amz-Expires=10800&X-Amz-Date=20241121T091552Z&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIP7GCBGMWPMZ42PQ
  • Favicon
    new user
    コメントするには 新規登録 もしくは ログイン が必要です。