wildboar.datasets#

Dataset loading utilities.

See the dataset section in the User Guide for more details and examples.

Examples

>>> from wildboar.datasets import load_dataset
>>> X, y = load_dataset("GunPoint")
>>> X.shape
(200, 60)

Package Contents#

Classes#

Bundle

Base class for handling dataset bundles.

JSONRepository

A repository is a collection of bundles.

NpBundle

Bundle of numpy binary files.

Repository

A repository is a collection of bundles.

Functions#

clear_cache([repository, cache_dir, keep_last_version])

Clear the cache by deleting cached datasets.

get_bundles(repository, *[, refresh, timeout])

Get all bundles in the repository.

get_repository(repository)

Get repository by name.

install_repository(repository, *[, refresh, timeout, ...])

Install repository.

list_bundles(repository, *[, refresh, timeout])

Get a list of all bundle names in the specified repository.

list_collections(repository)

List the collections of the repository.

list_datasets([repository, collection, cache_dir, ...])

List the datasets in the repository.

list_repositories(*[, refresh, timeout, cache_dir])

List the keys of all installed repositories.

load_dataset(name, *[, repository, dtype, preprocess, ...])

Load a dataset from a repository.

load_datasets([repository, collection, cache_dir, ...])

Load all datasets as a generator.

load_gun_point([merge_train_test])

Load the GunPoint dataset.

load_synthetic_control([merge_train_test])

Load the Synthetic_Control dataset.

load_two_lead_ecg([merge_train_test])

Load the TwoLeadECG dataset.

refresh_repositories([repository, timeout, cache_dir])

Refresh the installed repositories.

set_cache_dir([cache_dir])

Change the global cache directory.

class wildboar.datasets.Bundle(*, key, version, name, tag=None, arrays=None, description=None, collections=None)[source]#

Base class for handling dataset bundles.

Parameters:
key : str

A unique key of the bundle.

version : str

The version of the bundle.

name : str

Human-readable name of the bundle.

tag : str, optional

A bundle tag.

arrays : list

The arrays of the dataset.

description : str

Description of the bundle.

collections : dict, optional

A dictionary of named dataset collections.

get_collection(collection)[source]#

Get a dataset collection.

Parameters:
collection : str, optional

The name of the collection.

Returns:
list

List of datasets in the collection.

get_filename(version=None, tag=None, ext=None)[source]#

Get the cache name of the bundle.

Parameters:
version : str, optional

The bundle version.

tag : str, optional

The tag.

ext : str, optional

The extension of the file.

Returns:
str

The filename.

list(archive, collection=None)[source]#

List all datasets in this bundle.

Parameters:
archive : ZipFile

The bundle file.

collection : str, optional

The collection name.

Returns:
list

A sorted list of datasets in the bundle.

load(name, archive)[source]#

Load a dataset from the bundle.

Parameters:
name : str

Name of the dataset.

archive : ZipFile

The zip-file bundle.

Returns:
x : ndarray

Data samples.

y : ndarray

Data labels.

n_training_samples : int

Number of samples reserved for training. The value is <= x.shape[0].

extras : dict, optional

Extra numpy arrays.

class wildboar.datasets.JSONRepository(url)[source]#

A repository is a collection of bundles.
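Examples

A repository can be created from a JSON endpoint; a sketch, where the URL is hypothetical:

>>> from wildboar.datasets import JSONRepository
>>> repo = JSONRepository("https://example.com/repository.json")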

get_bundle(key)[source]#

Get a bundle with the specified key.

Parameters:
key : str

Key of the bundle.

Returns:
Bundle, optional

A bundle or None.

get_bundles()[source]#

Get all bundles.

Returns:
dict

A dictionary of key and bundle.

refresh(timeout=None)[source]#

Refresh the repository.

property download_url[source]#

The URL template for downloading bundles.

Returns:
str

The download URL.

property name[source]#

Name of the repository.

Returns:
str

The name of the repository.

property version[source]#

The repository version.

Returns:
str

The version of the repository.

property wildboar_requires[source]#

The minimum required wildboar version.

Returns:
str

The minimum version.

class wildboar.datasets.NpBundle(*, key, version, name, tag=None, arrays=None, description=None, collections=None)[source]#

Bundle of numpy binary files.
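Examples

A bundle can be declared with the constructor parameters listed for Bundle; a sketch, where the key, name and description are illustrative:

>>> from wildboar.datasets import NpBundle
>>> bundle = NpBundle(
...     key="my-bundle",
...     version="1.0",
...     name="My bundle",
...     description="Datasets stored as numpy binary files",
... )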

get_collection(collection)[source]#

Get a dataset collection.

Parameters:
collection : str, optional

The name of the collection.

Returns:
list

List of datasets in the collection.

get_filename(version=None, tag=None, ext=None)[source]#

Get the cache name of the bundle.

Parameters:
version : str, optional

The bundle version.

tag : str, optional

The tag.

ext : str, optional

The extension of the file.

Returns:
str

The filename.

list(archive, collection=None)[source]#

List all datasets in this bundle.

Parameters:
archive : ZipFile

The bundle file.

collection : str, optional

The collection name.

Returns:
list

A sorted list of datasets in the bundle.

load(name, archive)[source]#

Load a dataset from the bundle.

Parameters:
name : str

Name of the dataset.

archive : ZipFile

The zip-file bundle.

Returns:
x : ndarray

Data samples.

y : ndarray

Data labels.

n_training_samples : int

Number of samples reserved for training. The value is <= x.shape[0].

extras : dict, optional

Extra numpy arrays.

class wildboar.datasets.Repository[source]#

A repository is a collection of bundles.
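Examples

Concrete repositories implement the abstract properties and get_bundles. A minimal sketch of a hypothetical static subclass:

>>> from wildboar.datasets import Repository
>>> class StaticRepository(Repository):
...     @property
...     def wildboar_requires(self):
...         return "1.1"
...     @property
...     def name(self):
...         return "static"
...     @property
...     def version(self):
...         return "1.0"
...     @property
...     def download_url(self):
...         # hypothetical URL template
...         return "https://example.com/{key}-{version}.zip"
...     def get_bundles(self):
...         return {}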

get_bundle(key)[source]#

Get a bundle with the specified key.

Parameters:
key : str

Key of the bundle.

Returns:
Bundle, optional

A bundle or None.

abstract get_bundles()[source]#

Get all bundles.

Returns:
dict

A dictionary of key and bundle.

refresh(timeout=None)[source]#

Refresh the repository.

abstract property download_url[source]#

The URL template for downloading bundles.

Returns:
str

The download URL.

abstract property name[source]#

Name of the repository.

Returns:
str

The name of the repository.

abstract property version[source]#

The repository version.

Returns:
str

The version of the repository.

abstract property wildboar_requires[source]#

The minimum required wildboar version.

Returns:
str

The minimum version.

wildboar.datasets.clear_cache(repository=None, *, cache_dir=None, keep_last_version=True)[source]#

Clear the cache by deleting cached datasets.

Parameters:
repository : str, optional

The name of the repository to clear the cache for.

  • If None, clear the cache of all repositories.

cache_dir : str, optional

The cache directory.

keep_last_version : bool, optional

If True, keep the latest version of each repository.
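Examples

For example, deleting all cached datasets of a repository except the latest version; a sketch:

>>> from wildboar.datasets import clear_cache
>>> clear_cache("wildboar", keep_last_version=True)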

wildboar.datasets.get_bundles(repository, *, refresh=False, timeout=None)[source]#

Get all bundles in the repository.

Parameters:
repository : str

Name of the repository.

refresh : bool, optional

Refresh the repository.

Added in version 1.1.

timeout : float, optional

Timeout for the JSON request.

Added in version 1.1.

Returns:
dict

A dict mapping keys to Bundle objects.
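Examples

A sketch, assuming each Bundle exposes the name attribute from its constructor:

>>> from wildboar.datasets import get_bundles
>>> bundles = get_bundles("wildboar")
>>> for key, bundle in bundles.items():
...     print(key, bundle.name)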

wildboar.datasets.get_repository(repository)[source]#

Get repository by name.

Parameters:
repository : str

Repository name.

Returns:
Repository

A repository.
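Examples

A sketch:

>>> from wildboar.datasets import get_repository
>>> repo = get_repository("wildboar")
>>> print(repo.version)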

wildboar.datasets.install_repository(repository, *, refresh=True, timeout=None, cache_dir=None)[source]#

Install repository.

Parameters:
repository : str or Repository

A repository.

refresh : bool, optional

Refresh the repository.

Added in version 1.1.

timeout : float, optional

Timeout for the JSON request.

Added in version 1.1.

cache_dir : str, optional

Cache directory.

Added in version 1.1.
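Examples

A repository can be installed from a URL; a sketch, where the URL is hypothetical:

>>> from wildboar.datasets import install_repository
>>> install_repository("https://example.com/repository.json")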

wildboar.datasets.list_bundles(repository, *, refresh=False, timeout=None)[source]#

Get a list of all bundle names in the specified repository.

Parameters:
repository : str

The name of the repository.

refresh : bool, optional

Refresh the repository.

Added in version 1.1.

timeout : float, optional

Timeout for the JSON request.

Added in version 1.1.

Returns:
list

A list of bundle names.

Examples

>>> from wildboar.datasets import list_bundles
>>> list_bundles("wildboar")
["ucr", "ucr-tiny", ...]
wildboar.datasets.list_collections(repository)[source]#

List the collections of the repository.

Parameters:
repository : str or Bundle, optional

The data repository.

  • If str, load a named bundle, format {repository}/{bundle}.

Returns:
collections

A list of collections.

Examples

>>> from wildboar.datasets import list_collections
>>> list_collections("wildboar/ucr")
["bake-off", ...]
wildboar.datasets.list_datasets(repository='wildboar/ucr', *, collection=None, cache_dir=None, create_cache_dir=True, progress=True, force=False, refresh=False, timeout=None)[source]#

List the datasets in the repository.

Parameters:
repository : str or Bundle, optional

The data repository.

  • If str, load a named bundle, format {repository}/{bundle}.

collection : str, optional

A collection of named datasets.

cache_dir : str, optional

The directory where downloaded files are cached (default='wildboar_cache').

create_cache_dir : bool, optional

Create the cache directory if it is missing (default=True).

progress : bool, optional

Show a progress bar while downloading a bundle.

force : bool, optional

Force re-download of a cached bundle.

refresh : bool, optional

Refresh the repository.

Added in version 1.1.

timeout : float, optional

Timeout for the JSON request.

Added in version 1.1.

Returns:
set

A set of dataset names.
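Examples

A sketch, using the "bake-off" collection from the list_collections example above:

>>> from wildboar.datasets import list_datasets
>>> datasets = list_datasets("wildboar/ucr", collection="bake-off")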

wildboar.datasets.list_repositories(*, refresh=False, timeout=None, cache_dir=None)[source]#

List the keys of all installed repositories.

Parameters:
refresh : bool, optional

Refresh all repositories.

Added in version 1.1.

timeout : float, optional

Timeout for the JSON request.

Added in version 1.1.

cache_dir : str, optional

Cache directory.

Added in version 1.1.

Returns:
repositories

A list of installed repositories.

Examples

>>> from wildboar.datasets import list_repositories
>>> list_repositories()
["wildboar", ...]

We can also refresh the repositories to pick up any newly added but still pending repositories.

>>> list_repositories(refresh=True)

wildboar.datasets.load_dataset(name, *, repository='wildboar/ucr', dtype=float, preprocess=None, contiguous=True, merge_train_test=True, cache_dir=None, create_cache_dir=True, progress=True, return_extras=False, force=False, refresh=False, timeout=None)[source]#

Load a dataset from a repository.

Parameters:
name : str

The name of the dataset to load.

repository : str, optional

The data repository formatted as {repository}/{bundle}[:{version}][:{tag}]. Read more in the User guide.

dtype : dtype, optional

The data type of x (train and test).

preprocess : str, list or callable, optional

Preprocess the dataset

  • If str, use the named preprocess function (see preprocess._PREPROCESS.keys() for valid keys).

  • If callable, a function that takes an np.ndarray and returns the preprocessed dataset.

  • If list, a list of callables or str.

contiguous : bool, optional

Ensure that the returned dataset is memory contiguous.

merge_train_test : bool, optional

Merge the existing training and testing partitions.

cache_dir : str, optional

The directory where downloaded files are cached.

create_cache_dir : bool, optional

Create the cache directory if it is missing (default=True).

progress : bool, optional

Show a progress bar while downloading a bundle.

return_extras : bool, optional

Return optional extras.

Added in version 1.1.

force : bool, optional

Force re-download of an already cached bundle.

Added in version 1.0.4.

refresh : bool, optional

Refresh the repository.

Added in version 1.1.

timeout : float, optional

Timeout for the JSON request.

Added in version 1.1.

Returns:
x : ndarray, optional

The samples if merge_train_test=True.

y : ndarray, optional

The labels if merge_train_test=True.

x_train : ndarray, optional

The training samples if merge_train_test=False.

x_test : ndarray, optional

The testing samples if merge_train_test=False.

y_train : ndarray, optional

The training labels if merge_train_test=False.

y_test : ndarray, optional

The testing labels if merge_train_test=False.

extras : dict, optional

The optional extras if return_extras=True.

Examples

Load a dataset from the default repository

>>> x, y = load_dataset("SyntheticControl")
>>> x.shape
(600, 60)

or if the original training and testing splits are to be preserved

>>> x_train, x_test, y_train, y_test = load_dataset(
...     "SyntheticControl", merge_train_test=False
... )
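or with a preprocessor applied on load (a sketch; "standardize" is assumed to be among the valid named preprocess keys)

>>> x, y = load_dataset("SyntheticControl", preprocess="standardize")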

or for a specific version of the dataset

>>> x_train, x_test, y_train, y_test = load_dataset(
...     "SyntheticControl",
...     repository='wildboar/ucr-tiny:1.0.1',
...     merge_train_test=False,
... )

wildboar.datasets.load_datasets(repository='wildboar/ucr', *, collection=None, cache_dir=None, create_cache_dir=True, progress=True, force=False, filter=None, **kwargs)[source]#

Load all datasets as a generator.

Parameters:
repository : str

The repository string.

collection : str, optional

A collection of named datasets.

cache_dir : str, optional

The cache directory for downloaded dataset repositories.

create_cache_dir : bool, optional

Create the cache directory if it does not exist.

progress : bool, optional

Show a progress indicator while downloading the repository.

force : bool, optional

Force re-download of a cached repository.

filter : str, dict, list or callable, optional

Filter the datasets

  • If callable, only yield datasets for which the callable returns True; the signature is f(dataset, x, y) -> bool.

  • If dict, filter based on the keys and values, where keys are attributes and values are comparison specs.

  • If list, filter based on a conjunction of attribute comparisons.

  • If str, filter based on an attribute comparison.

Read more in the User guide.

Warning

If the parameter merge_train_test is False, the filter is applied to the training part of the data.

kwargs : dict

Optional arguments to load_dataset.

Yields:
name : str

The dataset name.

dataset : list

Depends on the kwargs.

  • If merge_train_test=True (default), dataset is a tuple of (x, y).

  • If merge_train_test=False, dataset is a tuple of (x_train, x_test, y_train, y_test).

  • If return_extras=True, the last element of the tuple contains the optional extras (or None).

Examples

Load all datasets in a repository:

>>> for dataset, (x, y) in load_datasets(repository='wildboar/ucr-tiny'):
...     print(dataset, x.shape, y.shape)
...
Beef (60, 470) (60,)
Coffee (56, 286) (56,)
GunPoint (200, 150) (200,)
SyntheticControl (600, 60) (600,)
TwoLeadECG (1162, 82) (1162,)

Print the names of datasets with more than 200 samples:

>>> for dataset, (x, y) in load_datasets(
...    repository='wildboar/ucr-tiny', filter={"n_samples": ">200"}
... ):
...     print(dataset)
SyntheticControl
TwoLeadECG
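The filter can also be a callable with the signature f(dataset, x, y) -> bool; a sketch, using an illustrative lambda:

>>> import numpy as np
>>> for dataset, (x, y) in load_datasets(
...     repository='wildboar/ucr-tiny',
...     filter=lambda dataset, x, y: np.unique(y).size > 2,
... ):
...     print(dataset)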
The dict filter can equivalently be expressed as a string:

>>> for dataset, (x, y) in load_datasets(
...    repository='wildboar/ucr-tiny', filter="n_samples>200"
... ):
...     print(dataset)
SyntheticControl
TwoLeadECG

wildboar.datasets.load_gun_point(merge_train_test=True)[source]#

Load the GunPoint dataset.

Parameters:
merge_train_test : bool, optional

Merge the existing training and testing partitions.

Returns:
x : ndarray, optional

The samples if merge_train_test=True.

y : ndarray, optional

The labels if merge_train_test=True.

x_train : ndarray, optional

The training samples if merge_train_test=False.

x_test : ndarray, optional

The testing samples if merge_train_test=False.

y_train : ndarray, optional

The training labels if merge_train_test=False.

y_test : ndarray, optional

The testing labels if merge_train_test=False.

extras : dict, optional

The optional extras if return_extras=True.
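Examples

The merged shape matches the GunPoint entry listed under load_datasets above:

>>> from wildboar.datasets import load_gun_point
>>> x, y = load_gun_point()
>>> x.shape
(200, 150)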

See also

load_dataset

load a named dataset

wildboar.datasets.load_synthetic_control(merge_train_test=True)[source]#

Load the Synthetic_Control dataset.

Parameters:
merge_train_test : bool, optional

Merge the existing training and testing partitions.

Returns:
x : ndarray, optional

The samples if merge_train_test=True.

y : ndarray, optional

The labels if merge_train_test=True.

x_train : ndarray, optional

The training samples if merge_train_test=False.

x_test : ndarray, optional

The testing samples if merge_train_test=False.

y_train : ndarray, optional

The training labels if merge_train_test=False.

y_test : ndarray, optional

The testing labels if merge_train_test=False.

extras : dict, optional

The optional extras if return_extras=True.
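Examples

The merged shape matches the SyntheticControl entry listed under load_datasets above:

>>> from wildboar.datasets import load_synthetic_control
>>> x, y = load_synthetic_control()
>>> x.shape
(600, 60)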

See also

load_dataset

load a named dataset

wildboar.datasets.load_two_lead_ecg(merge_train_test=True)[source]#

Load the TwoLeadECG dataset.

Parameters:
merge_train_test : bool, optional

Merge the existing training and testing partitions.

Returns:
x : ndarray, optional

The samples if merge_train_test=True.

y : ndarray, optional

The labels if merge_train_test=True.

x_train : ndarray, optional

The training samples if merge_train_test=False.

x_test : ndarray, optional

The testing samples if merge_train_test=False.

y_train : ndarray, optional

The training labels if merge_train_test=False.

y_test : ndarray, optional

The testing labels if merge_train_test=False.

extras : dict, optional

The optional extras if return_extras=True.
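Examples

The merged shape matches the TwoLeadECG entry listed under load_datasets above:

>>> from wildboar.datasets import load_two_lead_ecg
>>> x, y = load_two_lead_ecg()
>>> x.shape
(1162, 82)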

See also

load_dataset

load a named dataset

wildboar.datasets.refresh_repositories(repository=None, *, timeout=None, cache_dir=None)[source]#

Refresh the installed repositories.

Parameters:
repository : str, optional

The repository. None means all repositories.

timeout : float, optional

Timeout for the request.

Added in version 1.1.

cache_dir : str, optional

Cache directory.

Added in version 1.1.
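Examples

A sketch:

>>> from wildboar.datasets import refresh_repositories
>>> refresh_repositories("wildboar", timeout=10)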

wildboar.datasets.set_cache_dir(cache_dir=None)[source]#

Change the global cache directory.

If called without arguments, the cache directory is reset to the default directory.

Parameters:
cache_dir : str, optional

The cache directory root.
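Examples

A sketch; the directory is illustrative:

>>> from wildboar.datasets import set_cache_dir
>>> set_cache_dir("/tmp/wildboar_cache")  # use a custom cache root
>>> set_cache_dir()  # reset to the default directory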