wildboar.datasets#
Dataset loading utilities.
See the dataset section in the User Guide for more details and examples.
Examples
>>> from wildboar.datasets import load_dataset
>>> X, y = load_dataset("GunPoint")
>>> X.shape
(200, 150)
Submodules#
Package Contents#
Classes#
Bundle — Base class for handling dataset bundles.
JSONRepository — A repository is a collection of bundles.
NpBundle — Bundle of numpy binary files.
Repository — A repository is a collection of bundles.
Functions#
clear_cache — Clear the cache by deleting cached datasets.
get_bundles — Get all bundles in the repository.
get_repository — Get repository by name.
install_repository — Install repository.
list_bundles — Get a list of all bundle names in the specified repository.
list_collections — List the collections of the repository.
list_datasets — List the datasets in the repository.
list_repositories — List the keys of all installed repositories.
load_dataset — Load a dataset from a repository.
load_datasets — Load all datasets as a generator.
load_gun_point — Load the GunPoint dataset.
load_synthetic_control — Load the Synthetic_Control dataset.
load_two_lead_ecg — Load the TwoLeadECG dataset.
refresh_repositories — Refresh the installed repositories.
(name lost in extraction; likely set_cache_dir — confirm against the package) — Change the global cache directory.
- class wildboar.datasets.Bundle(*, key, version, name, tag=None, arrays=None, description=None, collections=None)[source]#
Base class for handling dataset bundles.
- Parameters:
- key : str
A unique key of the bundle.
- version : str
The version of the bundle.
- name : str
Human-readable name of the bundle.
- tag : str, optional
A bundle tag.
- arrays : list
The arrays of the dataset.
- description : str
Description of the bundle.
- collections : dict, optional
A dict of named dataset collections.
- get_collection(collection)[source]#
Get a dataset collection.
- Parameters:
- collectionstr, optional
The name of the collection.
- Returns:
- list
List of datasets in the collection.
- get_filename(version=None, tag=None, ext=None)[source]#
Get the cache name of the bundle.
- Parameters:
- versionstr, optional
The bundle version.
- tagstr, optional
The tag.
- extstr, optional
The extension of the file.
- Returns:
- str
The filename.
- list(archive, collection=None)[source]#
List all datasets in this bundle.
- Parameters:
- archiveZipFile
The bundle file.
- collectionstr, optional
The collection name.
- Returns:
- list
A sorted list of datasets in the bundle.
- load(name, archive)[source]#
Load a dataset from the bundle.
- Parameters:
- namestr
Name of the dataset.
- archiveZipFile
The zip-file bundle.
- Returns:
- x : ndarray
Data samples.
- y : ndarray
Data labels.
- n_training_samples : int
Number of samples that are for training. The value is <= x.shape[0].
- extras : dict, optional
Extra numpy arrays.
- class wildboar.datasets.JSONRepository(url)[source]#
A repository is a collection of bundles.
- get_bundle(key)[source]#
Get a bundle with the specified key.
- Parameters:
- keystr
Key of the bundle.
- Returns:
- Bundle, optional
A bundle or None.
- class wildboar.datasets.NpBundle(*, key, version, name, tag=None, arrays=None, description=None, collections=None)[source]#
Bundle of numpy binary files.
- get_collection(collection)[source]#
Get a dataset collection.
- Parameters:
- collectionstr, optional
The name of the collection.
- Returns:
- list
List of datasets in the collection.
- get_filename(version=None, tag=None, ext=None)[source]#
Get the cache name of the bundle.
- Parameters:
- versionstr, optional
The bundle version.
- tagstr, optional
The tag.
- extstr, optional
The extension of the file.
- Returns:
- str
The filename.
- list(archive, collection=None)[source]#
List all datasets in this bundle.
- Parameters:
- archiveZipFile
The bundle file.
- collectionstr, optional
The collection name.
- Returns:
- list
A sorted list of datasets in the bundle.
- load(name, archive)[source]#
Load a dataset from the bundle.
- Parameters:
- namestr
Name of the dataset.
- archiveZipFile
The zip-file bundle.
- Returns:
- xndarray
Data samples.
- yndarray
Data labels.
- n_training_samplesint
Number of samples that are for training. The value is <= x.shape[0].
- extrasdict, optional
Extra numpy arrays.
- class wildboar.datasets.Repository[source]#
A repository is a collection of bundles.
- get_bundle(key)[source]#
Get a bundle with the specified key.
- Parameters:
- keystr
Key of the bundle.
- Returns:
- Bundle, optional
A bundle or None.
- abstract property download_url[source]#
The url template for downloading bundles.
- Returns:
- str
The download url.
- wildboar.datasets.clear_cache(repository=None, *, cache_dir=None, keep_last_version=True)[source]#
Clear the cache by deleting cached datasets.
- Parameters:
- repositorystr, optional
The name of the repository to clear cache.
if None, clear cache of all repositories.
- cache_dirstr, optional
The cache directory.
- keep_last_versionbool, optional
If true, keep the latest version of each repository.
- wildboar.datasets.get_bundles(repository, *, refresh=False, timeout=None)[source]#
Get all bundles in the repository.
- Parameters:
- repositorystr
Name of the repository.
- refreshbool, optional
Refresh the repository.
Added in version 1.1.
- timeoutfloat, optional
Timeout for json request.
Added in version 1.1.
- Returns:
- dict
A dict of key Bundle pairs.
- wildboar.datasets.get_repository(repository)[source]#
Get repository by name.
- Parameters:
- repositorystr
Repository name.
- Returns:
- Repository
A repository.
- wildboar.datasets.install_repository(repository, *, refresh=True, timeout=None, cache_dir=None)[source]#
Install repository.
- Parameters:
- repositorystr or Repository
A repository.
- refreshbool, optional
Refresh the repository.
Added in version 1.1.
- timeoutfloat, optional
Timeout for json request.
Added in version 1.1.
- cache_dirstr, optional
Cache directory.
Added in version 1.1.
- wildboar.datasets.list_bundles(repository, *, refresh=False, timeout=None)[source]#
Get a list of all bundle names in the specified repository.
- Parameters:
- repositorystr
The name of the repository.
- refreshbool, optional
Refresh the repository.
Added in version 1.1.
- timeoutfloat, optional
Timeout for json request.
Added in version 1.1.
- Returns:
- list
A list of bundle names.
Examples
>>> from wildboar.datasets import list_bundles
>>> list_bundles("wildboar")
["ucr", "ucr-tiny", ...]
- wildboar.datasets.list_collections(repository)[source]#
List the collections of the repository.
- Parameters:
- repositorystr or Bundle, optional
The data repository
if str load a named bundle, format {repository}/{bundle}.
- Returns:
- collections
A list of collections.
Examples
>>> from wildboar.datasets import list_collections
>>> list_collections("wildboar/ucr")
["bake-off", ...]
- wildboar.datasets.list_datasets(repository='wildboar/ucr', *, collection=None, cache_dir=None, create_cache_dir=True, progress=True, force=False, refresh=False, timeout=None)[source]#
List the datasets in the repository.
- Parameters:
- repositorystr or Bundle, optional
The data repository
if str load a named bundle, format {repository}/{bundle}.
- collectionstr, optional
A collection of named datasets.
- cache_dirstr, optional
The directory where downloaded files are cached (default=’wildboar_cache’).
- create_cache_dirbool, optional
Create cache directory if missing (default=True).
- progressbool, optional
Show a progress bar while downloading a bundle.
- forcebool, optional
Force re-download of cached bundle.
- refreshbool, optional
Refresh the repository.
Added in version 1.1.
- timeoutfloat, optional
Timeout for json request.
Added in version 1.1.
- Returns:
- set
A set of dataset names.
- wildboar.datasets.list_repositories(*, refresh=False, timeout=None, cache_dir=None)[source]#
List the key of all installed repositories.
- Parameters:
- refreshbool, optional
Refresh all repositories.
Added in version 1.1.
- timeoutfloat, optional
Timeout for json request.
Added in version 1.1.
- cache_dirstr, optional
Cache directory.
Added in version 1.1.
- Returns:
- repositories
A list of installed repositories.
Examples
>>> from wildboar.datasets import list_repositories
>>> list_repositories()
["wildboar", ...]
We can also refresh the repositories, to load any newly added but still pending repositories.
>>> list_repositories(refresh=True)
- wildboar.datasets.load_dataset(name, *, repository='wildboar/ucr', dtype=float, preprocess=None, contiguous=True, merge_train_test=True, cache_dir=None, create_cache_dir=True, progress=True, return_extras=False, force=False, refresh=False, timeout=None)[source]#
Load a dataset from a repository.
- Parameters:
- name : str
The name of the dataset to load.
- repository : str, optional
The data repository formatted as {repository}/{bundle}[:{version}][:{tag}]. Read more in the User guide.
- dtype : dtype, optional
The data type of x (train and test).
- preprocess : str, list or callable, optional
Preprocess the dataset.
If str, use the named preprocess function (see preprocess._PREPROCESS.keys() for valid keys).
If callable, a function taking a np.ndarray and returning the preprocessed dataset.
If list, a list of callables or str.
- contiguous : bool, optional
Ensure that the returned dataset is memory contiguous.
- merge_train_test : bool, optional
Merge the existing training and testing partitions.
- cache_dir : str, optional
The directory where downloaded files are cached.
- create_cache_dir : bool, optional
Create cache directory if missing (default=True).
- progress : bool, optional
Show a progress bar while downloading a bundle.
- return_extras : bool, optional
Return optional extras.
Added in version 1.1.
- force : bool, optional
Force re-download of already cached bundle.
Added in version 1.0.4.
- refresh : bool, optional
Refresh the repository.
Added in version 1.1.
- timeout : float, optional
Timeout for json request.
Added in version 1.1.
- Returns:
- x : ndarray, optional
The samples if merge_train_test=True.
- y : ndarray, optional
The labels if merge_train_test=True.
- x_train : ndarray, optional
The training samples if merge_train_test=False.
- x_test : ndarray, optional
The testing samples if merge_train_test=False.
- y_train : ndarray, optional
The training labels if merge_train_test=False.
- y_test : ndarray, optional
The testing labels if merge_train_test=False.
- extras : dict, optional
The optional extras if return_extras=True.
Examples
Load a dataset from the default repository:
>>> x, y = load_dataset("SyntheticControl")
>>> x.shape
(600, 60)
or, if the original training and testing splits are to be preserved:
>>> x_train, x_test, y_train, y_test = load_dataset(
...     "SyntheticControl", merge_train_test=False
... )
or, for a specific version of the dataset:
>>> x_train, x_test, y_train, y_test = load_dataset(
...     "SyntheticControl",
...     repository='wildboar/ucr-tiny:1.0.1',
...     merge_train_test=False,
... )
- wildboar.datasets.load_datasets(repository='wildboar/ucr', *, collection=None, cache_dir=None, create_cache_dir=True, progress=True, force=False, filter=None, **kwargs)[source]#
Load all datasets as a generator.
- Parameters:
- repositorystr
The repository string.
- collectionstr, optional
A collection of named datasets.
- cache_dirstr, optional
The cache directory for downloaded dataset repositories.
- create_cache_dirbool, optional
Create the cache directory if it does not exist.
- progressbool, optional
If progress indicator is shown while downloading the repository.
- forcebool, optional
Force re-download of cached repository.
- filter : str, dict, list or callable, optional
Filter the datasets.
If callable, only yield those datasets for which the callable returns True; the callable has the signature f(dataset, x, y) -> bool.
If dict, filter based on the keys and values, where keys are attributes and values are comparison specs.
If list, filter based on a conjunction of attribute comparisons.
If str, filter based on an attribute comparison.
Read more in the User guide.
Warning
If the parameter merge_train_test is False, the filter is applied on the training part of the data.
- kwargs : dict
Optional arguments to load_dataset.
- Yields:
- name : str
The dataset name.
- dataset : list
Depends on the kwargs.
If merge_train_test=True (default), dataset is a tuple of (x, y).
If merge_train_test=False, dataset is a tuple of (x_train, x_test, y_train, y_test).
If return_extras=True, the last element of the tuple contains optional extras (or None).
Examples
Load all datasets in a repository:
>>> for dataset, (x, y) in load_datasets(repository='wildboar/ucr-tiny'):
...     print(dataset, x.shape, y.shape)
...
Beef (60, 470) (60,)
Coffee (56, 286) (56,)
GunPoint (200, 150) (200,)
SyntheticControl (600, 60) (600,)
TwoLeadECG (1162, 82) (1162,)
Print the names of datasets with more than 200 samples
>>> for dataset, (x, y) in load_datasets(
...     repository='wildboar/ucr-tiny', filter={"n_samples": ">200"}
... ):
...     print(dataset)
SyntheticControl
TwoLeadECG
>>> for dataset, (x, y) in load_datasets(
...     repository='wildboar/ucr-tiny', filter="n_samples>200"
... ):
...     print(dataset)
SyntheticControl
TwoLeadECG
- wildboar.datasets.load_gun_point(merge_train_test=True)[source]#
Load the GunPoint dataset.
- Parameters:
- merge_train_testbool, optional
Merge the existing training and testing partitions.
- Returns:
- x : ndarray, optional
The samples if merge_train_test=True.
- y : ndarray, optional
The labels if merge_train_test=True.
- x_train : ndarray, optional
The training samples if merge_train_test=False.
- x_test : ndarray, optional
The testing samples if merge_train_test=False.
- y_train : ndarray, optional
The training labels if merge_train_test=False.
- y_test : ndarray, optional
The testing labels if merge_train_test=False.
- extras : dict, optional
The optional extras if return_extras=True.
See also
load_dataset
load a named dataset
- wildboar.datasets.load_synthetic_control(merge_train_test=True)[source]#
Load the Synthetic_Control dataset.
- Parameters:
- merge_train_testbool, optional
Merge the existing training and testing partitions.
- Returns:
- x : ndarray, optional
The samples if merge_train_test=True.
- y : ndarray, optional
The labels if merge_train_test=True.
- x_train : ndarray, optional
The training samples if merge_train_test=False.
- x_test : ndarray, optional
The testing samples if merge_train_test=False.
- y_train : ndarray, optional
The training labels if merge_train_test=False.
- y_test : ndarray, optional
The testing labels if merge_train_test=False.
- extras : dict, optional
The optional extras if return_extras=True.
See also
load_dataset
load a named dataset
- wildboar.datasets.load_two_lead_ecg(merge_train_test=True)[source]#
Load the TwoLeadECG dataset.
- Parameters:
- merge_train_testbool, optional
Merge the existing training and testing partitions.
- Returns:
- x : ndarray, optional
The samples if merge_train_test=True.
- y : ndarray, optional
The labels if merge_train_test=True.
- x_train : ndarray, optional
The training samples if merge_train_test=False.
- x_test : ndarray, optional
The testing samples if merge_train_test=False.
- y_train : ndarray, optional
The training labels if merge_train_test=False.
- y_test : ndarray, optional
The testing labels if merge_train_test=False.
- extras : dict, optional
The optional extras if return_extras=True.
See also
load_dataset
load a named dataset
- wildboar.datasets.refresh_repositories(repository=None, *, timeout=None, cache_dir=None)[source]#
Refresh the installed repositories.
- Parameters:
- repositorystr, optional
The repository. None means all repositories.
- timeoutfloat, optional
Timeout for request.
Added in version 1.1.
- cache_dirstr, optional
Cache directory.
Added in version 1.1.