@@ -122,19 +122,41 @@ def __init__(
122122 predict_mode : bool = False ,
123123 ):
124124 """
125- Timeseries dataset
125+ Timeseries dataset holding data for models.
126+
127+ Each sample is a subsequence of a full time series. The subsequence consists of encoder and decoder/prediction
128+ timepoints for a given time series. This class constructs an index which defines which subsequences exist and
129+ can be sampled from (``index`` attribute). The samples in the index are defined by the various parameters
130+ to the class (encoder and prediction lengths, minimum prediction length, randomize length and predict keywords).
131+ How samples are
132+ sampled into batches for training is determined by the DataLoader. The class provides the
133+ :py:meth:`~TimeSeriesDataSet.to_dataloader` method to convert the dataset into a dataloader.
134+
135+ Large datasets:
136+
137+ Currently the class is limited to in-memory operations. If you have extremely large data,
138+ however, you can pass prefitted encoders and scalers to it and a subset of sequences to the class to
139+ construct a valid dataset (plus, likely the EncoderNormalizer should be used to normalize targets).
140+ When fitting a network, you would then need to create a custom DataLoader that rotates through the datasets.
141+ There are currently no in-built methods to do this.
126142
127143 Args:
128144 data: dataframe with sequence data - each row can be identified with ``time_idx`` and the ``group_ids``
129- time_idx: integer column denoting the time index
145+ time_idx: integer column denoting the time index. This column is used to determine the sequence of samples.
146+ If there are no missing observations, the time index should increase by ``+1`` for each subsequent sample.
147+ The first time_idx for each series does not necessarily have to be ``0`` but any value is allowed.
130148 target: column denoting the target or list of columns denoting the target - categorical or continuous.
131- group_ids: list of column names identifying a timeseries
149+ group_ids: list of column names identifying a time series. This means that the ``group_ids`` identify
150+ a sample together with the ``time_idx``. If you have only one timeseries, set this to the
151+ name of a column that is constant.
132152 weight: column name for weights or list of column names corresponding to each target
133153 max_encoder_length: maximum length to encode
134154 min_encoder_length: minimum allowed length to encode. Defaults to max_encoder_length.
135- min_prediction_idx: minimum time index from where to start predictions
136- max_prediction_length: maximum prediction length (choose this not too short as it can help convergence)
137- min_prediction_length: minimum prediction length. Defaults to max_prediction_length
155+ min_prediction_idx: minimum ``time_idx`` from where to start predictions. This parameter can be useful to
156+ create a validation or test set.
157+ max_prediction_length: maximum prediction/decoder length (choose this not too short as it can help
158+ convergence)
159+ min_prediction_length: minimum prediction/decoder length. Defaults to max_prediction_length
138160 static_categoricals: list of categorical variables that do not change over time,
139161 entries can be also lists which are then encoded together
140162 (e.g. useful for product categories)
@@ -154,23 +176,39 @@ def __init__(
154176 dropout_categoricals: list of categorical variables that are unknown when making a forecast without
155177 observed history
156178 constant_fill_strategy: dictionary of column names with constants to fill in missing values if there are
157- gaps in the sequence
158- (otherwise forward fill strategy is used)
159- allow_missings: if to allow missing timesteps that are automatically filled up
160- add_relative_time_idx: if to add a relative time index as feature
161- add_target_scales: if to add scales for target to static real features
179+ gaps in the sequence (by default forward fill strategy is used). The values will be only used if
180+ ``allow_missings=True``. A common use case is to denote that demand was 0 if the sample is not in
181+ the dataset.
182+ allow_missings: if to allow missing timesteps that are automatically filled up. Missing values
183+ refer to gaps in the ``time_idx``, e.g. if a specific timeseries has only samples for
184+ 1, 2, 4, 5, the sample for 3 will be generated on-the-fly.
185+ Allow missings does not deal with ``NA`` values. You should fill NA values before
186+ passing the dataframe to the TimeSeriesDataSet.
187+ add_relative_time_idx: if to add a relative time index as feature (i.e. for each sampled sequence, the index
188+ will range from -encoder_length to prediction_length)
189+ add_target_scales: if to add scales for target to static real features (i.e. add the center and scale
190+ of the unnormalized timeseries as features)
162191 add_encoder_length: if to add decoder length to list of static real variables. Defaults to "auto",
163192 i.e. yes if ``min_encoder_length != max_encoder_length``.
164- target_normalizer: transformer that takes group_ids, target and time_idx to return normalized target
165- categorical_encoders: dictionary of scikit learn label transformers or None
166- scalers: dictionary of scikit learn scalers or None
193+ target_normalizer: transformer that takes group_ids, target and time_idx to return normalized targets.
194+ You can choose from the classes in :py:mod:`~pytorch_forecasting.encoders`.
195+ By default an appropriate normalizer is chosen automatically.
196+ categorical_encoders: dictionary of scikit learn label transformers. If you have unobserved categories in
197+ the future, you can use the :py:class:`~pytorch_forecasting.encoders.NaNLabelEncoder` with
198+ ``add_nan=True``. Defaults effectively to sklearn's ``LabelEncoder()``. Prefitted encoders will not
199+ be fit again.
200+ scalers: dictionary of scikit learn scalers. Defaults to sklearn's ``StandardScaler()``.
201+ Prefitted encoders will not be fit again.
167202 randomize_length: None or False if not to randomize lengths. Tuple of beta distribution concentrations
168203 from which
169204 probabilities are sampled that are used to sample new sequence lengths with a binomial
170205 distribution.
171206 If True, defaults to (0.2, 0.05), i.e. ~1/4 of samples around minimum encoder length.
172207 Defaults to False otherwise.
173- predict_mode: if to only iterate over each timeseries once (only the last provided samples)
208+ predict_mode: if to only iterate over each timeseries once (only the last provided samples).
209+ Effectively, this will choose for each time series identified by ``group_ids``
210+ the last ``max_prediction_length`` samples of each time series as
211+ prediction samples and everything previous up to ``max_encoder_length`` samples as encoder samples.
174212 """
175213 super ().__init__ ()
176214 self .max_encoder_length = max_encoder_length
@@ -1090,14 +1128,19 @@ def to_dataloader(
10901128 """
10911129 Get dataloader from dataset.
10921130
1131+
1132+
10931133 Args:
10941134 train (bool, optional): if dataloader is used for training or prediction
10951135 Will shuffle and drop last batch if True. Defaults to True.
10961136 batch_size (int): batch size for training model. Defaults to 64.
10971137 batch_sampler (Union[Sampler, str]): batch sampler or string. One of
10981138
10991139 * "synchronized": ensure that samples in decoder are aligned in time. Does not support missing
1100- values in dataset.
1140+ values in the dataset. This only makes sense if the underlying algorithm makes use of values aligned
1141+ in time.
1142+ * PyTorch Sampler instance: any PyTorch sampler, e.g. the WeightedRandomSampler()
1143+ * None: samples are taken randomly from the time series.
11011144
11021145 **kwargs: additional arguments to ``DataLoader()``
11031146
0 commit comments