Skip to content

Commit 87eef60

Browse files
committed
Timeseries data generation - TimeGAN
1 parent 3eab5a1 commit 87eef60

File tree

4 files changed

+109
-5
lines changed

4 files changed

+109
-5
lines changed

README.md

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ We well know GANs for success in the realistic image generation. However, they c
1010
* Arxiv article: ["Tabular GANs for uneven distribution"](https://arxiv.org/abs/2010.00638)
1111
* Medium post: [GANs for tabular data](https://towardsdatascience.com/review-of-gans-for-tabular-data-a30a2199342)
1212

13-
### How to use library
13+
## How to use library
1414

1515
* Installation: `pip install tabgan`
1616
* To generate new data to train by sampling and then filtering by adversarial training
@@ -88,7 +88,38 @@ print("OriginalGenerator metric", fit_predict(clf, new_train1, new_target1, X_te
8888
new_train1, new_target1 = GANGenerator().generate_data_pipe(X_train, y_train, X_test, )
8989
print("GANGenerator metric", fit_predict(clf, new_train1, new_target1, X_test, y_test))
9090
```
91+
## Timeseries GAN generation TimeGAN
9192

93+
You can easily adapt the code to generate multidimensional time-series data.
94+
The approach extracts the day, month and year from the _date_ column and generates them as ordinary features. The example below shows how to use it:
95+
```python
96+
import pandas as pd
97+
import numpy as np
98+
from tabgan.utils import get_year_mnth_dt_from_date,make_two_digit,collect_dates
99+
from tabgan.sampler import OriginalGenerator, GANGenerator
100+
101+
102+
train_size = 100
103+
train = pd.DataFrame(
104+
np.random.randint(-10, 150, size=(train_size, 4)), columns=list("ABCD")
105+
)
106+
min_date = pd.to_datetime('2019-01-01')
107+
max_date = pd.to_datetime('2021-12-31')
108+
d = (max_date - min_date).days + 1
109+
110+
train['Date'] = min_date + pd.to_timedelta(np.random.randint(d, size=train_size), unit='d')
111+
train = get_year_mnth_dt_from_date(train, 'Date')
112+
113+
new_train, new_target = GANGenerator(gen_x_times=1.1, cat_cols=['year'], bot_filter_quantile=0.001,
114+
top_filter_quantile=0.999,
115+
is_post_process=True, pregeneration_frac=2, only_generated_data=False).\
116+
generate_data_pipe(train.drop('Date', axis=1), None,
117+
train.drop('Date', axis=1)
118+
)
119+
new_train = collect_dates(new_train)
120+
```
121+
122+
## Experiments
92123
### Datasets and experiment design
93124

94125
**Running experiment**

pip_desc.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,38 @@ if __name__ == "__main__":
9797
print("GANGenerator metric", fit_predict(clf, new_train1, new_target1, X_test, y_test))
9898
```
9999

100+
## Timeseries GAN generation TimeGAN
101+
102+
You can easily adapt the code to generate multidimensional time-series data.
103+
The approach extracts the day, month and year from the _date_ column and generates them as ordinary features. The example below shows how to use it:
104+
```python
105+
import pandas as pd
106+
import numpy as np
107+
from tabgan.utils import get_year_mnth_dt_from_date,make_two_digit,collect_dates
108+
from tabgan.sampler import OriginalGenerator, GANGenerator
109+
110+
111+
train_size = 100
112+
train = pd.DataFrame(
113+
np.random.randint(-10, 150, size=(train_size, 4)), columns=list("ABCD")
114+
)
115+
min_date = pd.to_datetime('2019-01-01')
116+
max_date = pd.to_datetime('2021-12-31')
117+
d = (max_date - min_date).days + 1
118+
119+
train['Date'] = min_date + pd.to_timedelta(np.random.randint(d, size=train_size), unit='d')
120+
train = get_year_mnth_dt_from_date(train, 'Date')
121+
122+
new_train, new_target = GANGenerator(gen_x_times=1.1, cat_cols=['year'], bot_filter_quantile=0.001,
123+
top_filter_quantile=0.999,
124+
is_post_process=True, pregeneration_frac=2, only_generated_data=False).\
125+
generate_data_pipe(train.drop('Date', axis=1), None,
126+
train.drop('Date', axis=1)
127+
)
128+
new_train = collect_dates(new_train)
129+
```
130+
131+
## Experiments
100132
### Datasets and experiment design
101133

102134
**Running experiment**

src/tabgan/sampler.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from _ctgan.synthesizer import _CTGANSynthesizer as CTGAN
1515
from tabgan.abc_sampler import Sampler, SampleData
1616
from tabgan.adversarial_model import AdversarialModel
17-
from tabgan.utils import setup_logging
17+
from tabgan.utils import setup_logging, get_year_mnth_dt_from_date, collect_dates
1818

1919
warnings.filterwarnings("ignore", category=FutureWarning)
2020

@@ -317,12 +317,13 @@ def get_columns_if_exists(df, col) -> pd.DataFrame:
317317

318318
if __name__ == "__main__":
319319
setup_logging(logging.DEBUG)
320+
train_size = 100
320321
train = pd.DataFrame(
321-
np.random.randint(-10, 150, size=(100, 4)), columns=list("ABCD")
322+
np.random.randint(-10, 150, size=(train_size, 4)), columns=list("ABCD")
322323
)
323324
logging.info(train)
324-
target = pd.DataFrame(np.random.randint(0, 2, size=(100, 1)), columns=list("Y"))
325-
test = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=list("ABCD"))
325+
target = pd.DataFrame(np.random.randint(0, 2, size=(train_size, 1)), columns=list("Y"))
326+
test = pd.DataFrame(np.random.randint(0, 100, size=(train_size, 4)), columns=list("ABCD"))
326327
_sampler(OriginalGenerator(gen_x_times=15), train, target, test)
327328
_sampler(
328329
GANGenerator(gen_x_times=10, only_generated_data=False,
@@ -336,3 +337,18 @@ def get_columns_if_exists(df, col) -> pd.DataFrame:
336337
None,
337338
train,
338339
)
340+
min_date = pd.to_datetime('2019-01-01')
341+
max_date = pd.to_datetime('2021-12-31')
342+
343+
d = (max_date - min_date).days + 1
344+
345+
train['Date'] = min_date + pd.to_timedelta(pd.np.random.randint(d, size=train_size), unit='d')
346+
train = get_year_mnth_dt_from_date(train, 'Date')
347+
348+
new_train, new_target = GANGenerator(gen_x_times=1.1, cat_cols=['year'], bot_filter_quantile=0.001,
349+
top_filter_quantile=0.999,
350+
is_post_process=True, pregeneration_frac=2, only_generated_data=False).\
351+
generate_data_pipe(train.drop('Date', axis=1), None,
352+
train.drop('Date', axis=1)
353+
)
354+
new_train = collect_dates(new_train)

src/tabgan/utils.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import logging
22
import sys
33

4+
import pandas as pd
5+
46

57
def setup_logging(loglevel):
68
"""Setup basic logging
@@ -15,4 +17,27 @@ def setup_logging(loglevel):
1517
)
1618

1719

20+
def make_two_digit(num_as_str: str) -> str:
    """Left-pad a numeric string with zeros to a width of at least two.

    Intended for rendering month and day components as ``MM`` / ``DD``.

    Args:
        num_as_str: String form of a number, e.g. ``"7"`` or ``"12"``.

    Returns:
        The input padded with leading zeros up to two characters. Strings
        that are already two or more characters long are returned unchanged.
    """
    # str.zfill pads only when the string is shorter than the requested
    # width, so longer inputs no longer receive a stray leading "0".
    return num_as_str.zfill(2)
25+
26+
27+
def get_year_mnth_dt_from_date(df: pd.DataFrame, date_col='Date') -> pd.DataFrame:
    """Split a date column into separate ``year``, ``month`` and ``day`` columns.

    The frame is modified in place: ``date_col`` is converted with
    ``pd.to_datetime`` and the three component columns are added (or
    overwritten if they already exist).

    Args:
        df: Frame containing the date column.
        date_col: Name of the column holding the dates.

    Returns:
        The same ``df`` object, with the extra columns attached.
    """
    df[date_col] = pd.to_datetime(df[date_col])
    # Each component is read through the ``.dt`` accessor of the converted column.
    for component in ('year', 'month', 'day'):
        df[component] = getattr(df[date_col].dt, component)
    return df
33+
34+
35+
def collect_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Rebuild a ``Date`` string column from ``year``, ``month`` and ``day``.

    The frame is modified in place: a ``Date`` column of the form
    ``<year>-<month>-<day>`` is written, with month and day passed through
    ``make_two_digit``, and the three component columns are then dropped.

    Args:
        df: Frame containing ``year``, ``month`` and ``day`` columns.

    Returns:
        The same ``df`` object, with ``Date`` added and the components removed.
    """
    year_part = df['year'].astype(str)
    month_part = df['month'].astype(str).apply(make_two_digit)
    day_part = df['day'].astype(str).apply(make_two_digit)

    df["Date"] = year_part + '-' + month_part + '-' + day_part
    df.drop(['year', 'month', 'day'], axis=1, inplace=True)
    return df
41+
42+
1843
TEMP_TARGET = "_temp_target"

0 commit comments

Comments
 (0)