wildboar.datasets#

Submodules#

Package Contents#

Classes#

Bundle

Base class for handling dataset bundles

JSONRepository

A repository is a collection of bundles

NpBundle

Bundle of NumPy binary files

Repository

A repository is a collection of bundles

Functions#

clear_cache([repository, cache_dir, keep_last_version])

Clear the cache by deleting cached datasets

get_bundles(repository, *[, refresh, timeout])

Get all bundles in the repository

get_repository(repository)

Get repository by name

install_repository(repository, *[, refresh, timeout, ...])

Install repository

list_bundles(repository, *[, refresh, timeout])

Get a list of all bundle names in the specified repository.

list_collections(repository)

List the collections of the repository

list_datasets([repository, collection, cache_dir, ...])

List the datasets in the repository

list_repositories(*[, refresh, timeout, cache_dir])

List the keys of all installed repositories

load_dataset(name, *[, repository, dtype, preprocess, ...])

Load a dataset from a repository

load_datasets([repository, collection, cache_dir, ...])

Load all datasets as a generator

load_gun_point([merge_train_test])

Load the GunPoint dataset

load_synthetic_control([merge_train_test])

Load the Synthetic_Control dataset

load_two_lead_ecg([merge_train_test])

Load the TwoLeadECG dataset

refresh_repositories([repository, timeout, cache_dir])

Refresh the installed repositories

set_cache_dir([cache_dir])

Change the global cache directory. If called without arguments, the cache directory is reset to the default directory.

class wildboar.datasets.Bundle(*, key, version, name, tag=None, arrays=None, description=None, collections=None)[source]#

Base class for handling dataset bundles

name[source]#

Human-readable name of the bundle

Type:

str

description[source]#

Description of the bundle

Type:

str

label_index[source]#

Index of the class label(s)

Type:

int or array-like

Parameters:
  • key (str) – A unique key of the bundle

  • version (str) – The version of the bundle

  • name (str) – Human-readable name of the bundle

  • description (str) – Description of the bundle

  • arrays (list) – The arrays of the dataset

get_collection(collection)[source]#
get_filename(version=None, tag=None, ext=None)[source]#
list(archive, collection=None)[source]#

List all datasets in this bundle

Parameters:
  • archive (ZipFile) – The bundle file

  • collection (str, optional) – The collection name

Returns:

dataset_names – A sorted list of datasets in the bundle

Return type:

list

load(name, archive)[source]#

Load a dataset from the bundle

Parameters:
  • name (str) – Name of the dataset

  • archive (ZipFile) – The zip-file bundle

Returns:

  • x (ndarray) – Data samples

  • y (ndarray) – Data labels

  • n_training_samples (int) – Number of samples that are for training. The value is <= x.shape[0]

  • extras (dict, optional) – Extra numpy arrays

class wildboar.datasets.JSONRepository(url)[source]#

Bases: Repository

A repository is a collection of bundles

property download_url[source]#

The url template for downloading bundles

Returns:

The download url

Return type:

str

property identifier[source]#
property name[source]#

Name of the repository

Returns:

The name of the repository

Return type:

str

property version[source]#

The repository version

Returns:

The version of the repository

Return type:

str

property wildboar_requires[source]#

The minimum required wildboar version

Returns:

The minimum required wildboar version

Return type:

str

supported_version = '1.1'[source]#
get_bundles()[source]#

Get all bundles

Returns:

A dictionary mapping each key to its bundle

Return type:

dict

class wildboar.datasets.NpBundle(*, key, version, name, tag=None, arrays=None, description=None, collections=None)[source]#

Bases: Bundle

Bundle of NumPy binary files

Parameters:
  • key (str) – A unique key of the bundle

  • version (str) – The version of the bundle

  • name (str) – Human-readable name of the bundle

  • description (str) – Description of the bundle

  • arrays (list) – The arrays of the dataset

class wildboar.datasets.Repository[source]#

A repository is a collection of bundles

property active[source]#
abstract property download_url[source]#

The url template for downloading bundles

Returns:

The download url

Return type:

str

abstract property identifier[source]#
abstract property name[source]#

Name of the repository

Returns:

The name of the repository

Return type:

str

abstract property version[source]#

The repository version

Returns:

The version of the repository

Return type:

str

abstract property wildboar_requires[source]#

The minimum required wildboar version

Returns:

The minimum required wildboar version

Return type:

str

__eq__(o)[source]#

Return self==value.

__hash__() int[source]#

Return hash(self).

clear_cache(cache_dir, keep_last_version=True)[source]#
get_bundle(key)[source]#

Get a bundle with the specified key

Parameters:

key (str) – Key of the bundle

Returns:

bundle – A bundle or None

Return type:

Bundle, optional

abstract get_bundles()[source]#

Get all bundles

Returns:

A dictionary mapping each key to its bundle

Return type:

dict

list_datasets(bundle, *, cache_dir, collection=None, version=None, tag=None, create_cache_dir=True, progress=True, force=False)[source]#
load_dataset(bundle, dataset, *, cache_dir, version=None, tag=None, create_cache_dir=True, progress=True, force=False)[source]#
refresh(timeout=None)[source]#

Refresh the repository

wildboar.datasets.clear_cache(repository=None, *, cache_dir=None, keep_last_version=True)[source]#

Clear the cache by deleting cached datasets

Parameters:
  • repository (str, optional) –

    The name of the repository to clear cache.

    • if None, clear cache of all repositories

  • cache_dir (str, optional) – The cache directory

  • keep_last_version (bool, optional) – If true, keep the latest version of each repository.

wildboar.datasets.get_bundles(repository, *, refresh=False, timeout=None)[source]#

Get all bundles in the repository

Parameters:
  • repository (str) – Name of the repository

  • refresh (bool, optional) –

    Refresh the repository

    New in version 1.1.

  • timeout (float, optional) –

    Timeout for json request

    New in version 1.1.

Returns:

A dict of key-Bundle pairs

Return type:

dict

wildboar.datasets.get_repository(repository)[source]#

Get repository by name

Parameters:

repository (str) – Repository name

Returns:

repository – A repository

Return type:

Repository

wildboar.datasets.install_repository(repository, *, refresh=True, timeout=None, cache_dir=None)[source]#

Install repository

Parameters:
  • repository (str or Repository) – A repository

  • refresh (bool, optional) –

    Refresh the repository

    New in version 1.1.

  • timeout (float, optional) –

    Timeout for json request

    New in version 1.1.

  • cache_dir (str, optional) –

    Cache directory

    New in version 1.1.

wildboar.datasets.list_bundles(repository, *, refresh=False, timeout=None)[source]#

Get a list of all bundle names in the specified repository.

Parameters:
  • repository (str) – The name of the repository

  • refresh (bool, optional) –

    Refresh the repository

    New in version 1.1.

  • timeout (float, optional) –

    Timeout for json request

    New in version 1.1.

Returns:

bundles – The names of the bundles in the repository

Return type:

list

wildboar.datasets.list_collections(repository)[source]#

List the collections of the repository

Parameters:

repository (str or Bundle, optional) –

The data repository

  • if str load a named bundle, format {repository}/{bundle}

Returns:

A list of collections

Return type:

list

wildboar.datasets.list_datasets(repository='wildboar/ucr', *, collection=None, cache_dir=None, create_cache_dir=True, progress=True, force=False, refresh=False, timeout=None)[source]#

List the datasets in the repository

Parameters:
  • repository (str or Bundle, optional) –

    The data repository

    • if str load a named bundle, format {repository}/{bundle}

  • collection (str, optional) – A collection of named datasets.

  • progress (bool, optional) – Show a progress bar while downloading a bundle.

  • cache_dir (str, optional) – The directory where downloaded files are cached (default=’wildboar_cache’)

  • create_cache_dir (bool, optional) – Create cache directory if missing (default=True)

  • force (bool, optional) – Force re-download of cached bundle

  • refresh (bool, optional) –

    Refresh the repository

    New in version 1.1.

  • timeout (float, optional) –

    Timeout for json request

    New in version 1.1.

Returns:

dataset – A set of dataset names

Return type:

set

wildboar.datasets.list_repositories(*, refresh=False, timeout=None, cache_dir=None)[source]#

List the keys of all installed repositories

Parameters:
  • refresh (bool, optional) –

    Refresh all repositories

    New in version 1.1.

  • timeout (float, optional) –

    Timeout for json request

    New in version 1.1.

  • cache_dir (str, optional) –

    Cache directory

    New in version 1.1.

wildboar.datasets.load_dataset(name, *, repository='wildboar/ucr', dtype=float, preprocess=None, contiguous=True, merge_train_test=True, cache_dir=None, create_cache_dir=True, progress=True, return_extras=False, force=False, refresh=False, timeout=None)[source]#

Load a dataset from a repository

Parameters:
  • name (str) – The name of the dataset to load.

  • repository (str, optional) – The data repository formatted as {repository}/{bundle}[:{version}][:{tag}]

  • dtype (dtype, optional) – The data type of x (train and test)

  • contiguous (bool, optional) – Ensure that the returned dataset is memory contiguous.

  • preprocess (str, list or callable, optional) –

    Preprocess the dataset

    • if str, use named preprocess function (see preprocess._PREPROCESS.keys() for valid keys)

    • if callable, function taking a np.ndarray and returns the preprocessed dataset

    • if list, a list of callable or str

  • merge_train_test (bool, optional) – Merge the existing training and testing partitions.

  • progress (bool, optional) – Show a progress bar while downloading a bundle.

  • cache_dir (str, optional) – The directory where downloaded files are cached

  • create_cache_dir (bool, optional) – Create cache directory if missing (default=True)

  • return_extras (bool, optional) –

    Return optional extras

    New in version 1.1.

  • force (bool, optional) –

    Force re-download of already cached bundle

    New in version 1.0.4.

  • refresh (bool, optional) –

    Refresh the repository

    New in version 1.1.

  • timeout (float, optional) –

    Timeout for json request

    New in version 1.1.

Returns:

  • x (ndarray, optional) – The data samples

  • y (ndarray, optional) – The labels

  • x_train (ndarray, optional) – The training samples if merge_train_test=False

  • x_test (ndarray, optional) – The testing samples if merge_train_test=False

  • y_train (ndarray, optional) – The training labels if merge_train_test=False

  • y_test (ndarray, optional) – The testing labels if merge_train_test=False

  • extras (dict, optional) – The optional extras if return_extras=True

Examples

Load a dataset from the default repository

>>> x, y = load_dataset("SyntheticControl")

or if original training and testing splits are to be preserved

>>> x_train, x_test, y_train, y_test = load_dataset(
...     "SyntheticControl", merge_train_test=False
... )

or for a specific version of the dataset

>>> x_train, x_test, y_train, y_test = load_dataset(
...     "Wafer", repository='wildboar/ucr-tiny:1.0'
... )
wildboar.datasets.load_datasets(repository='wildboar/ucr', *, collection=None, cache_dir=None, create_cache_dir=True, progress=True, force=False, filter=None, **kwargs)[source]#

Load all datasets as a generator

Parameters:
  • repository (str) – The repository string

  • collection (str, optional) – A collection of named datasets.

  • progress (bool, optional) – If progress indicator is shown while downloading the repository.

  • cache_dir (str, optional) – The cache directory for downloaded dataset repositories.

  • create_cache_dir (bool, optional) – Create the cache directory if it does not exist.

  • force (bool, optional) – Force re-download of cached repository

  • filter (str, dict, list or callable, optional) –

    Filter the datasets

    • if callable, only yield those datasets for which the callable returns True. f(dataset, x, y) -> bool

    • if dict, filter based on the keys and values, where keys are attributes and values comparison specs

    • if list, filter based on conjunction of attribute comparisons

    • if str, filter based on attribute comparison

    The format of an attribute comparison is [attribute][comparison spec].

    Valid attributes are: dataset, n_samples, n_timestep, n_dims and n_labels.

    The comparison spec is a string of two parts, comparison operator (<, <=, >, >= or =) and a number, e.g., “<100”, “<= 200”, or “>300”

  • kwargs (dict) – Optional arguments to load_dataset

Yields:
  • x (array-like) – Data samples

  • y (array-like) – Data labels

Examples

>>> from wildboar.datasets import load_datasets
>>> for dataset, (x, y) in load_datasets(repository='wildboar/ucr'):
...     print(dataset, x.shape, y.shape)

Print the names of datasets with more than 200 samples

>>> for dataset, (x, y) in load_datasets(
...    repository='wildboar/ucr', filter={"n_samples": ">200"}
... ):
...     print(dataset)

or, equivalently, using a string filter

>>> for dataset, (x, y) in load_datasets(
...    repository='wildboar/ucr', filter="n_samples>200"
... ):
...     print(dataset)
wildboar.datasets.load_gun_point(merge_train_test=True)[source]#

Load the GunPoint dataset

See also

load_dataset

load a named dataset

wildboar.datasets.load_synthetic_control(merge_train_test=True)[source]#

Load the Synthetic_Control dataset

See also

load_dataset

load a named dataset

wildboar.datasets.load_two_lead_ecg(merge_train_test=True)[source]#

Load the TwoLeadECG dataset

See also

load_dataset

load a named dataset

wildboar.datasets.refresh_repositories(repository=None, *, timeout=None, cache_dir=None)[source]#

Refresh the installed repositories

Parameters:
  • repository (str, optional) – The repository. None means all repositories.

  • timeout (float, optional) –

    Timeout for request

    New in version 1.1.

  • cache_dir (str, optional) –

    Cache directory

    New in version 1.1.

wildboar.datasets.set_cache_dir(cache_dir=None)[source]#

Change the global cache directory. If called without arguments, the cache directory is reset to the default directory.

Parameters:

cache_dir (str, optional) – The cache directory root