
"""Base Dataset classes."""
import json
import warnings
from pathlib import Path
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    List,
    Sequence,
    Tuple,
    TypeVar,
    Union,
)

import numpy as np
from joblib import Parallel, delayed
from numpy.random import RandomState, permutation
from tqdm import tqdm

from ..inputs import load, read_abc_string
from ..music import Music
from ..outputs import save
from .utils import (
    check_md5,
    check_sha256,
    check_size,
    download_url,
    extract_archive,
)

if TYPE_CHECKING:
    from tensorflow.data import Dataset as TFDataset
    from torch.utils.data import Dataset as TorchDataset


RemoteDatasetType = TypeVar("RemoteDatasetType", bound="RemoteDataset")
FolderDatasetType = TypeVar("FolderDatasetType", bound="FolderDataset")


class DatasetInfo:
    """A container for dataset information."""

    def __init__(
        self,
        name: str = None,
        description: str = None,
        homepage: str = None,
        license: str = None,
    ):  # pylint: disable=redefined-builtin
        self.name = name
        self.description = description
        self.homepage = homepage
        self.license = license

    def __repr__(self) -> str:
        to_join = []
        for attr in ("name", "description", "homepage", "license"):
            if getattr(self, attr) is not None:
                to_join.append(attr + "=" + repr(getattr(self, attr)))
        return "DatasetInfo(" + ", ".join(to_join) + ")"

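# A minimal sketch (not from the original source) of how a dataset subclass
# would fill in its `_info`; the values below are hypothetical.
#
#     _info = DatasetInfo(
#         name="Example Dataset",
#         description="A collection of example songs.",
#         homepage="https://www.example.com",
#         license="CC BY-SA 4.0",
#     )
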
class Dataset:
    """Base class for MusPy datasets.

    To build a custom dataset, it should inherit this class and
    override the methods ``__getitem__`` and ``__len__`` as well as
    the class attribute ``_info``. ``__getitem__`` should return the
    ``i``-th data sample as a :class:`muspy.Music` object. ``__len__``
    should return the size of the dataset. ``_info`` should be a
    :class:`muspy.DatasetInfo` instance storing the dataset
    information.

    """

    _info: DatasetInfo = DatasetInfo()
    _citation: str = ""

    def __getitem__(self, index) -> Music:
        raise NotImplementedError

    def __len__(self) -> int:
        raise NotImplementedError

    @classmethod
    def info(cls):
        """Return the dataset information."""
        return cls._info

    @classmethod
    def citation(cls):
        """Return the citation information."""
        return cls._citation

    def save(
        self,
        root: Union[str, Path],
        kind: str = "json",
        n_jobs: int = 1,
        ignore_exceptions: bool = True,
        verbose: bool = True,
        **kwargs,
    ):
        """Save all the music objects to a directory.

        Parameters
        ----------
        root : str or Path
            Root directory to save the data.
        kind : {'json', 'yaml'}, default: 'json'
            File format to save the data.
        n_jobs : int, default: 1
            Maximum number of concurrently running jobs. If equal to
            1, disable multiprocessing.
        ignore_exceptions : bool, default: True
            Whether to ignore errors and skip failed conversions. This
            can be helpful if some source files are known to be
            corrupted.
        verbose : bool, default: True
            Whether to be verbose.
        **kwargs
            Keyword arguments to pass to :func:`muspy.save`.

        """
        if kind not in ("json", "yaml"):
            raise TypeError("`kind` must be either 'json' or 'yaml'.")
        root = Path(root).expanduser().resolve()
        root.mkdir(exist_ok=True)

        def _saver(idx):
            prefix = "0" * (n_digits - len(str(idx)))
            filename = root / (prefix + str(idx) + "." + kind)
            if ignore_exceptions:
                try:
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        save(filename, self[idx], kind, **kwargs)
                except Exception:  # pylint: disable=broad-except
                    return False
                return True
            save(filename, self[idx], kind, **kwargs)
            return True

        n_digits = len(str(len(self)))

        if verbose:
            print("Converting and saving the dataset...")

        if n_jobs == 1:
            count = 0
            for idx in tqdm(range(len(self))):  # type: ignore
                if _saver(idx):
                    count += 1
        else:
            # TODO: This is slow as `self` is passed between workers.
            results = Parallel(n_jobs=n_jobs, backend="threading", verbose=5)(
                delayed(_saver)(idx) for idx in range(len(self))
            )
            count = results.count(True)

        if verbose:
            print(f"Successfully saved {count} out of {len(self)} files.")

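    # A usage sketch (not from the original source): save every Music object
    # of a hypothetical `dataset` as zero-padded JSON files under
    # "~/data/converted", using four worker threads.
    #
    #     dataset.save("~/data/converted", kind="json", n_jobs=4)
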
    def split(
        self,
        filename: Union[str, Path] = None,
        splits: Sequence[float] = None,
        random_state: Any = None,
    ) -> Dict[str, List[int]]:
        """Split the dataset and return the split indices.

        Parameters
        ----------
        filename : str or Path, optional
            If given and exists, path to the file to read the split
            from. If None or not exists, path to save the split.
        splits : float or list of float, optional
            Ratios for train-test-validation splits. If None, return
            the full dataset as a whole. If float, return train and
            test splits. If list of two floats, return train and test
            splits. If list of three floats, return train, test and
            validation splits.
        random_state : int, array_like or RandomState, optional
            Random state used to create the splits. If int or
            array_like, the value is passed to
            :class:`numpy.random.RandomState`, and the created
            RandomState object is used to create the splits. If
            RandomState, it will be used to create the splits.

        """
        if filename is not None and Path(filename).is_file():
            with open(str(filename)) as f:
                return json.load(f)

        if not isinstance(splits, (float, list, tuple)):
            raise TypeError("`splits` must be of type float, list or tuple.")

        if isinstance(splits, float):
            if splits <= 0:
                raise ValueError("`splits` must be positive.")
            if splits >= 1:
                raise ValueError("`splits` must be less than 1.")
            splits = [splits, 1 - splits]

        if isinstance(splits, (list, tuple)):
            if sum(splits) != 1:
                raise ValueError("`splits` must sum to 1.")
            if len(splits) < 2 or len(splits) > 3:
                raise ValueError("`splits` must have length 2 or 3.")

        if random_state is None:
            rand_indices = permutation(len(self))
        else:
            if not isinstance(random_state, RandomState):
                random_state = RandomState(random_state)
            rand_indices = random_state.permutation(len(self))

        boundaries = np.cumsum([0.0] + list(splits))
        names = ("train", "test", "validation")
        indices = {}
        for idx, (start, end) in enumerate(
            zip(boundaries[:-1], boundaries[1:])
        ):
            start_idx = int(start * len(self))
            end_idx = int(end * len(self))
            indices[names[idx]] = rand_indices[start_idx:end_idx]

        if filename is not None:
            indices_ = {key: value.tolist() for key, value in indices.items()}
            with open(str(filename), "w") as f:
                f.write(json.dumps(indices_))

        return indices

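    # A usage sketch (not from the original source): create a reproducible
    # 8:1:1 split for a hypothetical `dataset` and cache the indices to
    # "splits.json" so that later runs reuse the same split.
    #
    #     indices = dataset.split(
    #         filename="splits.json", splits=[0.8, 0.1, 0.1], random_state=42
    #     )
    #     train_indices = indices["train"]
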
    def to_pytorch_dataset(
        self,
        factory: Callable = None,
        representation: str = None,
        split_filename: Union[str, Path] = None,
        splits: Sequence[float] = None,
        random_state: Any = None,
        **kwargs: Any,
    ) -> Union["TorchDataset", Dict[str, "TorchDataset"]]:
        """Return the dataset as a PyTorch dataset.

        Parameters
        ----------
        factory : Callable, optional
            Function to be applied to the Music objects. The input is
            a Music object, and the output is an array or a tensor.
        representation : str, optional
            Target representation. See
            :func:`muspy.to_representation()` for available
            representations.
        split_filename : str or Path, optional
            If given and exists, path to the file to read the split
            from. If None or not exists, path to save the split.
        splits : float or list of float, optional
            Ratios for train-test-validation splits. If None, return
            the full dataset as a whole. If float, return train and
            test splits. If list of two floats, return train and test
            splits. If list of three floats, return train, test and
            validation splits.
        random_state : int, array_like or RandomState, optional
            Random state used to create the splits. If int or
            array_like, the value is passed to
            :class:`numpy.random.RandomState`, and the created
            RandomState object is used to create the splits. If
            RandomState, it will be used to create the splits.

        Returns
        -------
        :class:`torch.utils.data.Dataset` or dict of \
        :class:`torch.utils.data.Dataset`
            Converted PyTorch dataset(s).

        """
        if representation is None and factory is None:
            raise TypeError(
                "One of `representation` and `factory` must be given."
            )
        if representation is not None and factory is not None:
            raise TypeError(
                "Only one of `representation` and `factory` can be given."
            )

        try:
            # pylint: disable=import-outside-toplevel
            from torch.utils.data import Dataset as TorchDataset
        except ImportError as err:
            raise ImportError("Optional package pytorch is required.") from err

        class TorchMusicFactoryDataset(TorchDataset):
            """A PyTorch dataset built from a Music dataset.

            Parameters
            ----------
            dataset : :class:`muspy.Dataset`
                Dataset object to base on.
            factory : Callable
                Function to be applied to the Music objects. The input
                is a Music object, and the output is an array or a
                tensor.

            """

            def __init__(
                self,
                dataset: Dataset,
                factory: Callable,
                subset: str = "Full",
                indices: Sequence[int] = None,
            ):
                self.dataset = dataset
                self.factory = factory
                self.subset = subset
                self.indices = indices
                if self.indices is not None:
                    self.indices = sorted(
                        idx for idx in self.indices if idx < len(self.dataset)
                    )

            def __repr__(self) -> str:
                return (
                    f"TorchMusicFactoryDataset(dataset={self.dataset}, "
                    f"factory={self.factory}, subset={self.subset})"
                )

            def __getitem__(self, index):
                if self.indices is None:
                    return self.factory(self.dataset[index])
                return self.factory(self.dataset[self.indices[index]])

            def __len__(self) -> int:
                if self.indices is None:
                    return len(self.dataset)
                return len(self.indices)

        class TorchRepresentationDataset(TorchMusicFactoryDataset):
            """A PyTorch music dataset.

            Parameters
            ----------
            dataset : :class:`muspy.Dataset`
                Dataset object to base on.
            representation : str
                Target representation. See
                :func:`muspy.to_representation()` for available
                representations.

            """

            def __init__(
                self,
                dataset: Dataset,
                representation: str,
                subset: str = "Full",
                indices: Sequence[int] = None,
                **kwargs: Any,
            ):
                self.representation = representation

                def factory(music):
                    return music.to_representation(representation, **kwargs)

                super().__init__(
                    dataset, factory=factory, subset=subset, indices=indices
                )

            def __repr__(self) -> str:
                return (
                    f"TorchRepresentationDataset(dataset={self.dataset}, "
                    f"representation={self.representation}, "
                    f"subset={self.subset})"
                )

        # No split
        if splits is None:
            if representation is not None:
                return TorchRepresentationDataset(
                    self, representation, **kwargs
                )
            return TorchMusicFactoryDataset(self, factory)  # type: ignore

        datasets: Dict[str, "TorchDataset"] = {}
        indices_list = self.split(split_filename, splits, random_state)
        for key, value in indices_list.items():
            if representation is not None:
                datasets[key] = TorchRepresentationDataset(
                    self, representation, key, value, **kwargs
                )
            else:
                datasets[key] = TorchMusicFactoryDataset(
                    self, factory, key, value  # type: ignore
                )

        return datasets

    def to_tensorflow_dataset(
        self,
        factory: Callable = None,
        representation: str = None,
        split_filename: Union[str, Path] = None,
        splits: Sequence[float] = None,
        random_state: Any = None,
        **kwargs: Any,
    ) -> Union["TFDataset", Dict[str, "TFDataset"]]:
        """Return the dataset as a TensorFlow dataset.

        Parameters
        ----------
        factory : Callable, optional
            Function to be applied to the Music objects. The input is
            a Music object, and the output is an array or a tensor.
        representation : str, optional
            Target representation. See
            :func:`muspy.to_representation()` for available
            representations.
        split_filename : str or Path, optional
            If given and exists, path to the file to read the split
            from. If None or not exists, path to save the split.
        splits : float or list of float, optional
            Ratios for train-test-validation splits. If None, return
            the full dataset as a whole. If float, return train and
            test splits. If list of two floats, return train and test
            splits. If list of three floats, return train, test and
            validation splits.
        random_state : int, array_like or RandomState, optional
            Random state used to create the splits. If int or
            array_like, the value is passed to
            :class:`numpy.random.RandomState`, and the created
            RandomState object is used to create the splits. If
            RandomState, it will be used to create the splits.

        Returns
        -------
        :class:`tensorflow.data.Dataset` or dict of \
        :class:`tensorflow.data.Dataset`
            Converted TensorFlow dataset(s).

        """
        if representation is None and factory is None:
            raise TypeError(
                "One of `representation` and `factory` must be given."
            )
        if representation is not None and factory is not None:
            raise TypeError(
                "Only one of `representation` and `factory` can be given."
            )

        try:
            # pylint: disable=import-outside-toplevel
            import tensorflow as tf
            from tensorflow.data import Dataset as TFDataset
        except ImportError as err:
            raise ImportError(
                "Optional package tensorflow is required."
            ) from err

        if representation is not None:

            def _gen(indices):
                for idx in indices:
                    yield self[idx].to_representation(representation, **kwargs)

        else:

            def _gen(indices):
                for idx in indices:
                    yield factory(self[idx])

        # TODO: `from_generator` is slow.

        # No split
        if splits is None:
            indices = np.arange(len(self))
            return TFDataset.from_generator(_gen, tf.float32, args=[indices])

        datasets: Dict[str, TFDataset] = {}
        indices_list = self.split(split_filename, splits, random_state)
        for key, value in indices_list.items():
            indices = np.array(value)
            datasets[key] = TFDataset.from_generator(
                _gen, tf.float32, args=[indices]
            )

        return datasets


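# A hedged example (not part of the original module): convert a hypothetical
# `dataset` to PyTorch datasets using the piano-roll representation and wrap
# the training split in a DataLoader.
#
#     from torch.utils.data import DataLoader
#
#     torch_datasets = dataset.to_pytorch_dataset(
#         representation="pianoroll", splits=[0.8, 0.1, 0.1]
#     )
#     train_loader = DataLoader(torch_datasets["train"], batch_size=8)

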
class RemoteDataset(Dataset):
    """Base class for remote MusPy datasets.

    This class extends :class:`muspy.Dataset` to support remote
    datasets. To build a custom remote dataset, please refer to the
    documentation of :class:`muspy.Dataset` for details. In addition,
    set the class attribute ``_sources`` to the URLs to the source
    files (see Notes).

    Attributes
    ----------
    root : str or Path
        Root directory of the dataset.

    Parameters
    ----------
    download_and_extract : bool, default: False
        Whether to download and extract the dataset.
    overwrite : bool, default: False
        Whether to overwrite existing file(s).
    cleanup : bool, default: False
        Whether to remove the source archive(s).
    verbose : bool, default: True
        Whether to be verbose.

    Raises
    ------
    RuntimeError:
        If ``download_and_extract`` is False but file
        ``{root}/.muspy.success`` does not exist (see below).

    Important
    ---------
    :meth:`muspy.RemoteDataset.exists` depends solely on a special
    file named ``.muspy.success`` in directory ``{root}/``. This file
    serves as an indicator for the existence and integrity of the
    dataset. It will automatically be created if the dataset is
    successfully downloaded and extracted by
    :meth:`muspy.RemoteDataset.download_and_extract`. If the dataset
    is downloaded manually, make sure to create the ``.muspy.success``
    file in directory ``{root}/`` to prevent errors.

    Notes
    -----
    The class attribute ``_sources`` is a dictionary storing the
    following information of each source file.

    - filename (str): Name to save the file.
    - url (str): URL to the file.
    - archive (bool): Whether the file is an archive.
    - md5 (str, optional): Expected MD5 checksum of the file.
    - sha256 (str, optional): Expected SHA256 checksum of the file.

    Here is an example::

        _sources = {
            "example": {
                "filename": "example.tar.gz",
                "url": "https://www.example.com/example.tar.gz",
                "archive": True,
                "md5": None,
                "sha256": None,
            }
        }

    See Also
    --------
    :class:`muspy.Dataset` : Base class for MusPy datasets.

    """

    _sources: Dict[str, dict] = {}

    def __init__(
        self,
        root: Union[str, Path],
        download_and_extract: bool = False,
        overwrite: bool = False,
        cleanup: bool = False,
        verbose: bool = True,
    ):
        super().__init__()
        self.root = Path(root).expanduser().resolve()
        self.root.mkdir(exist_ok=True)

        if download_and_extract:
            self.download_and_extract(
                overwrite=overwrite, cleanup=cleanup, verbose=verbose
            )

        if not self.exists():
            raise RuntimeError(
                "Dataset not found. You can download it by passing "
                "`download_and_extract=True`."
            )

    def __repr__(self) -> str:
        return f"{type(self).__name__}(root={self.root})"

    def __getitem__(self, index) -> Music:
        raise NotImplementedError

    def __len__(self) -> int:
        raise NotImplementedError

    def exists(self) -> bool:
        """Return True if the dataset exists, otherwise False."""
        if not (self.root / ".muspy.success").is_file():
            return False
        return True

    def source_exists(self) -> bool:
        """Return True if all the sources exist, otherwise False."""
        for source in self._sources.values():
            filename = self.root / source["filename"]
            if not filename.is_file():
                return False
            if "size" in source and not check_size(filename, source["size"]):
                return False
            if "md5" in source and not check_md5(filename, source["md5"]):
                return False
            if "sha256" in source and not check_sha256(
                filename, source["sha256"]
            ):
                return False
        return True

    def download(
        self: RemoteDatasetType, overwrite: bool = False, verbose: bool = True
    ) -> RemoteDatasetType:
        """Download the dataset source(s).

        Parameters
        ----------
        overwrite : bool, default: False
            Whether to overwrite existing file(s).
        verbose : bool, default: True
            Whether to be verbose.

        Returns
        -------
        Object itself.

        """
        if self.exists():
            if verbose:
                print(
                    "Skip downloading as the `.muspy.success` file is found."
                )
            return self

        for source in self._sources.values():
            download_url(
                source["url"],
                self.root / source["filename"],
                overwrite=overwrite,
                size=source.get("size"),
                md5=source.get("md5"),
                sha256=source.get("sha256"),
                verbose=verbose,
            )
        return self

    def extract(
        self: RemoteDatasetType, cleanup: bool = False, verbose: bool = True
    ) -> RemoteDatasetType:
        """Extract the downloaded archive(s).

        Parameters
        ----------
        cleanup : bool, default: False
            Whether to remove the source archive after extraction.
        verbose : bool, default: True
            Whether to be verbose.

        Returns
        -------
        Object itself.

        """
        if self.exists():
            if verbose:
                print("Skip extracting as the `.muspy.success` file is found.")
            return self

        for source in self._sources.values():
            filename = self.root / source["filename"]
            if source["archive"]:
                extract_archive(
                    filename, self.root, cleanup=cleanup, verbose=verbose
                )
        (self.root / ".muspy.success").touch(exist_ok=True)
        return self

    def download_and_extract(
        self: RemoteDatasetType,
        overwrite: bool = False,
        cleanup: bool = False,
        verbose: bool = True,
    ) -> RemoteDatasetType:
        """Download source datasets and extract the downloaded archives.

        Parameters
        ----------
        overwrite : bool, default: False
            Whether to overwrite existing file(s).
        cleanup : bool, default: False
            Whether to remove the source archive(s).
        verbose : bool, default: True
            Whether to be verbose.

        Returns
        -------
        Object itself.

        """
        return self.download(overwrite=overwrite, verbose=verbose).extract(
            cleanup=cleanup, verbose=verbose
        )


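# A minimal sketch (assumption, not from the original source) of a custom
# remote dataset; the class name, URL and local path are hypothetical, and
# `__getitem__`/`__len__` would still need to be implemented.
#
#     class ExampleRemoteDataset(RemoteDataset):
#         _info = DatasetInfo(name="Example Dataset")
#         _sources = {
#             "example": {
#                 "filename": "example.tar.gz",
#                 "url": "https://www.example.com/example.tar.gz",
#                 "archive": True,
#             }
#         }
#
#     dataset = ExampleRemoteDataset(
#         "~/data/example", download_and_extract=True, cleanup=True
#     )

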
def _get_filenames(root, extensions: List[str], recursive: bool = True):
    filenames = []
    for ext in extensions:
        if recursive:
            filenames.extend(root.rglob(f"*.{ext}"))
        else:
            filenames.extend(root.glob(f"*.{ext}"))
    return filenames


class MusicDataset(Dataset):
    """Class for datasets of MusPy JSON/YAML files.

    Parameters
    ----------
    root : str or Path
        Root directory of the dataset.
    kind : {'json', 'yaml'}, optional
        File formats to include in the dataset. Defaults to include
        both JSON and YAML files.

    Attributes
    ----------
    root : Path
        Root directory of the dataset.
    filenames : list of Path
        Path to the files, relative to `root`.

    See Also
    --------
    :class:`muspy.Dataset` : Base class for MusPy datasets.

    """

    def __init__(self, root: Union[str, Path], kind: str = None):
        if kind is not None and kind not in ("json", "yaml"):
            raise ValueError(f"Unknown value for `kind`: {kind}.")
        self.root = Path(root).expanduser().resolve()
        self.root.mkdir(exist_ok=True)
        if kind is None:
            extensions = ["json", "json.gz", "yaml", "yaml.gz"]
        elif kind == "json":
            extensions = ["json", "json.gz"]
        else:
            extensions = ["yaml", "yaml.gz"]
        self.filenames = _get_filenames(self.root, extensions)

    def __repr__(self) -> str:
        return f"{type(self).__name__}(root={self.root})"

    def __getitem__(self, index) -> Music:
        return load(self.root / self.filenames[index])

    def __len__(self) -> int:
        return len(self.filenames)


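# A usage sketch (not from the original source); the directory is
# hypothetical and is expected to contain MusPy JSON/YAML files.
#
#     dataset = MusicDataset("~/data/my_music", kind="json")
#     print(len(dataset))  # number of files found under the root
#     music = dataset[0]   # first file loaded as a Music object

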
class RemoteMusicDataset(MusicDataset, RemoteDataset):
    """Base class for remote datasets of MusPy JSON/YAML files.

    Parameters
    ----------
    root : str or Path
        Root directory of the dataset.
    download_and_extract : bool, default: False
        Whether to download and extract the dataset.
    overwrite : bool, default: False
        Whether to overwrite existing file(s).
    cleanup : bool, default: False
        Whether to remove the source archive(s).
    kind : {'json', 'yaml'}, optional
        File formats to include in the dataset. Defaults to include
        both JSON and YAML files.
    verbose : bool, default: True
        Whether to be verbose.

    Attributes
    ----------
    root : Path
        Root directory of the dataset.
    filenames : list of Path
        Path to the files, relative to `root`.

    See Also
    --------
    :class:`muspy.MusicDataset` :
        Class for datasets of MusPy JSON/YAML files.
    :class:`muspy.RemoteDataset` : Base class for remote MusPy datasets.

    """

    def __init__(
        self,
        root: Union[str, Path],
        download_and_extract: bool = False,
        overwrite: bool = False,
        cleanup: bool = False,
        kind: str = None,
        verbose: bool = True,
    ):
        RemoteDataset.__init__(
            self,
            root,
            download_and_extract=download_and_extract,
            overwrite=overwrite,
            cleanup=cleanup,
            verbose=verbose,
        )
        MusicDataset.__init__(self, root, kind=kind)


class FolderDataset(Dataset):
    """Class for datasets storing files in a folder.

    This class extends :class:`muspy.Dataset` to support folder
    datasets. To build a custom folder dataset, please refer to the
    documentation of :class:`muspy.Dataset` for details. In addition,
    set the class attribute ``_extension`` to the extension to look
    for when building the dataset and set ``read`` to a callable that
    takes as input a filename of a source file and returns the
    converted Music object.

    Attributes
    ----------
    root : str or Path
        Root directory of the dataset.

    Parameters
    ----------
    convert : bool, default: False
        Whether to convert the dataset to MusPy JSON/YAML files. If
        False, will check if converted data exists. If so, disable
        on-the-fly mode. If not, enable on-the-fly mode and issue a
        warning.
    kind : {'json', 'yaml'}, default: 'json'
        File format to save the data.
    n_jobs : int, default: 1
        Maximum number of concurrently running jobs. If equal to 1,
        disable multiprocessing.
    ignore_exceptions : bool, default: True
        Whether to ignore errors and skip failed conversions. This
        can be helpful if some source files are known to be corrupted.
    use_converted : bool, optional
        Force to disable on-the-fly mode and use converted data.
        Defaults to True if converted data exist, otherwise False.

    Important
    ---------
    :meth:`muspy.FolderDataset.converted_exists` depends solely on a
    special file named ``.muspy.success`` in the folder
    ``{root}/_converted/``, which serves as an indicator for the
    existence and integrity of the converted dataset. If the converted
    dataset is built by :meth:`muspy.FolderDataset.convert`, the
    ``.muspy.success`` file will be created as well. If the converted
    dataset is created manually, make sure to create the
    ``.muspy.success`` file in the folder ``{root}/_converted/`` to
    prevent errors.

    Notes
    -----
    Two modes are available for this dataset. When the on-the-fly mode
    is enabled, a data sample is converted to a music object on the
    fly when being indexed. When the on-the-fly mode is disabled, a
    data sample is loaded from the precomputed converted data.

    See Also
    --------
    :class:`muspy.Dataset` : Base class for MusPy datasets.

    """

    _extension: str = ""

    def __init__(
        self,
        root: Union[str, Path],
        convert: bool = False,
        kind: str = "json",
        n_jobs: int = 1,
        ignore_exceptions: bool = True,
        use_converted: bool = None,
    ):
        self.root = Path(root).expanduser().resolve()
        self.kind = kind

        # A pointer to the callable used to produce the Music object
        self._factory: Callable = lambda: None

        # A pointer to the list of filenames used when indexing
        self._filenames: list = []

        self.raw_filenames: list = []
        self.converted_filenames: list = []

        if convert:
            self.convert(kind, n_jobs, ignore_exceptions)

        if use_converted is None:
            use_converted = self.converted_exists()

        if use_converted:
            self.use_converted()
        else:
            self.on_the_fly()

        if not self._filenames:
            raise ValueError("Nothing found in the directory.")

        (self.root / ".muspy.success").touch()

    @property
    def converted_dir(self):
        """Path to the root directory of the converted dataset."""
        return self.root / "_converted"

    def __repr__(self) -> str:
        return f"{type(self).__name__}(root={self.root})"

    def __getitem__(self, index) -> Music:
        return self._factory(self._filenames[index])

    def __len__(self) -> int:
        return len(self._filenames)

    def read(self, filename: Any) -> Music:
        """Read a file into a Music object."""
        raise NotImplementedError

    def load(self, filename: Union[str, Path]) -> Music:
        """Load a file into a Music object."""
        return load(self.root / filename)

    def exists(self) -> bool:
        """Return True if the dataset exists, otherwise False."""
        if not (self.root / ".muspy.success").is_file():
            return False
        return True

    def converted_exists(self) -> bool:
        """Return True if the saved dataset exists, otherwise False."""
        if not (self.converted_dir / ".muspy.success").is_file():
            return False
        return True

    def get_converted_filenames(self):
        """Return a list of converted filenames."""
        return sorted(self.converted_dir.rglob("*." + self.kind))

    def use_converted(self: FolderDatasetType) -> FolderDatasetType:
        """Disable on-the-fly mode and use converted data.

        Returns
        -------
        Object itself.

        """
        if not self.converted_exists():
            raise RuntimeError(
                "Converted data not found. Run `convert()` to convert "
                "the dataset."
            )
        if not self.converted_filenames:
            self.converted_filenames = self.get_converted_filenames()
        self._filenames = self.converted_filenames
        self._use_converted = True
        self._factory = self.load
        return self

    def get_raw_filenames(self):
        """Return a list of raw filenames."""
        return sorted(
            (
                filename
                for filename in self.root.rglob("*." + self._extension)
                if not str(filename.relative_to(self.root)).startswith(
                    "_converted/"
                )
            )
        )

    def on_the_fly(self: FolderDatasetType) -> FolderDatasetType:
        """Enable on-the-fly mode and convert the data on the fly.

        Returns
        -------
        Object itself.

        """
        if not self.raw_filenames:
            self.raw_filenames = self.get_raw_filenames()
        self._filenames = self.raw_filenames
        self._use_converted = False
        self._factory = self.read
        return self

    def convert(
        self: FolderDatasetType,
        kind: str = "json",
        n_jobs: int = 1,
        ignore_exceptions: bool = True,
        verbose: bool = True,
        **kwargs,
    ) -> FolderDatasetType:
        """Convert and save the Music objects.

        The converted files will be named by their indices and saved
        to ``{root}/_converted``. The original filenames can be found
        in the ``raw_filenames`` attribute. For example, the file at
        ``raw_filenames[i]`` will be converted and saved to
        ``{i}.json``.

        Parameters
        ----------
        kind : {'json', 'yaml'}, default: 'json'
            File format to save the data.
        n_jobs : int, default: 1
            Maximum number of concurrently running jobs. If equal to
            1, disable multiprocessing.
        ignore_exceptions : bool, default: True
            Whether to ignore errors and skip failed conversions. This
            can be helpful if some source files are known to be
            corrupted.
        verbose : bool, default: True
            Whether to be verbose.
        **kwargs
            Keyword arguments to pass to :func:`muspy.save`.

        Returns
        -------
        Object itself.

        """
        if self.converted_exists():
            if verbose:
                print("Skip conversion as the `.muspy.success` file is found.")
            return self

        self.on_the_fly()
        self.converted_dir.mkdir(exist_ok=True)
        self.save(
            self.converted_dir,
            kind=kind,
            n_jobs=n_jobs,
            ignore_exceptions=ignore_exceptions,
            verbose=verbose,
            **kwargs,
        )
        (self.converted_dir / ".muspy.success").touch(exist_ok=True)
        self.use_converted()
        self.kind = kind
        return self


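# A minimal sketch (assumption, not from the original source) of a custom
# folder dataset; the class name, directory and use of muspy.read_midi are
# hypothetical.
#
#     import muspy
#
#     class MIDIFolderDataset(FolderDataset):
#         _extension = "mid"
#
#         def read(self, filename):
#             return muspy.read_midi(filename)
#
#     # `convert=True` caches each file as JSON under {root}/_converted
#     dataset = MIDIFolderDataset("~/data/midi", convert=True)

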
class RemoteFolderDataset(FolderDataset, RemoteDataset):
    """Base class for remote datasets storing files in a folder.

    Attributes
    ----------
    root : str or Path
        Root directory of the dataset.

    Parameters
    ----------
    download_and_extract : bool, default: False
        Whether to download and extract the dataset.
    overwrite : bool, default: False
        Whether to overwrite existing file(s).
    cleanup : bool, default: False
        Whether to remove the source archive(s).
    convert : bool, default: False
        Whether to convert the dataset to MusPy JSON/YAML files. If
        False, will check if converted data exists. If so, disable
        on-the-fly mode. If not, enable on-the-fly mode and issue a
        warning.
    kind : {'json', 'yaml'}, default: 'json'
        File format to save the data.
    n_jobs : int, default: 1
        Maximum number of concurrently running jobs. If equal to 1,
        disable multiprocessing.
    ignore_exceptions : bool, default: True
        Whether to ignore errors and skip failed conversions. This
        can be helpful if some source files are known to be corrupted.
    use_converted : bool, optional
        Force to disable on-the-fly mode and use converted data.
        Defaults to True if converted data exist, otherwise False.
    verbose : bool, default: True
        Whether to be verbose.

    See Also
    --------
    :class:`muspy.FolderDataset` :
        Class for datasets storing files in a folder.
    :class:`muspy.RemoteDataset` : Base class for remote MusPy datasets.

    """

    def __init__(
        self,
        root: Union[str, Path],
        download_and_extract: bool = False,
        overwrite: bool = False,
        cleanup: bool = False,
        convert: bool = False,
        kind: str = "json",
        n_jobs: int = 1,
        ignore_exceptions: bool = True,
        use_converted: bool = None,
        verbose: bool = True,
    ):
        RemoteDataset.__init__(
            self,
            root,
            download_and_extract=download_and_extract,
            overwrite=overwrite,
            cleanup=cleanup,
            verbose=verbose,
        )
        FolderDataset.__init__(
            self,
            root,
            convert=convert,
            kind=kind,
            n_jobs=n_jobs,
            ignore_exceptions=ignore_exceptions,
            use_converted=use_converted,
        )

    def read(self, filename: str) -> Music:
        """Read a file into a Music object."""
        raise NotImplementedError


class ABCFolderDataset(FolderDataset):
    """Class for datasets storing ABC files in a folder.

    See Also
    --------
    :class:`muspy.FolderDataset` :
        Class for datasets storing files in a folder.

    """

    _extension = "abc"

    def read(self, filename: Tuple[str, Tuple[int, int]]) -> Music:
        """Read a file into a Music object."""
        filename_, (start, end) = filename
        data = []
        with open(filename_) as f:
            for idx, line in enumerate(f):
                if start <= idx < end and not line.startswith("%"):
                    data.append(line)
        return read_abc_string("".join(data))  # type: ignore

    def on_the_fly(self: FolderDatasetType) -> FolderDatasetType:
        """Enable on-the-fly mode and convert the data on the fly.

        Returns
        -------
        Object itself.

        """
        if not self.raw_filenames:
            filenames = sorted(
                (
                    filename
                    for filename in self.root.rglob("*." + self._extension)
                    if not str(filename.relative_to(self.root)).startswith(
                        "_converted/"
                    )
                )
            )
            self.raw_filenames = []
            for filename in filenames:
                idx = 0
                start = 0
                with open(filename, errors="ignore") as f:
                    # Detect parts in a file
                    for idx, line in enumerate(f):
                        if line.startswith("X:"):
                            if start:
                                self.raw_filenames.append(
                                    (filename, (start, idx))
                                )
                            start = idx
                    # Append the last part
                    if start:
                        self.raw_filenames.append((filename, (start, idx)))

        self._filenames = self.raw_filenames
        self._use_converted = False
        self._factory = self.read
        return self


class RemoteABCFolderDataset(ABCFolderDataset, RemoteDataset):
    """Base class for remote datasets storing ABC files in a folder.

    See Also
    --------
    :class:`muspy.ABCFolderDataset` :
        Class for datasets storing ABC files in a folder.
    :class:`muspy.RemoteDataset` : Base class for remote MusPy datasets.

    """

    def __init__(
        self,
        root: Union[str, Path],
        download_and_extract: bool = False,
        overwrite: bool = False,
        cleanup: bool = False,
        convert: bool = False,
        kind: str = "json",
        n_jobs: int = 1,
        ignore_exceptions: bool = True,
        use_converted: bool = None,
        verbose: bool = True,
    ):
        RemoteDataset.__init__(
            self,
            root,
            download_and_extract=download_and_extract,
            overwrite=overwrite,
            cleanup=cleanup,
            verbose=verbose,
        )
        ABCFolderDataset.__init__(
            self,
            root,
            convert=convert,
            kind=kind,
            n_jobs=n_jobs,
            ignore_exceptions=ignore_exceptions,
            use_converted=use_converted,
        )
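

# A hedged sketch (not part of the original module) of a concrete remote
# ABC dataset; the class name, URL and local path are hypothetical.
#
#     class ExampleABCDataset(RemoteABCFolderDataset):
#         _info = DatasetInfo(name="Example ABC Dataset")
#         _sources = {
#             "example": {
#                 "filename": "tunes.zip",
#                 "url": "https://www.example.com/tunes.zip",
#                 "archive": True,
#             }
#         }
#
#     dataset = ExampleABCDataset("~/data/abc", download_and_extract=True)
#     music = dataset[0]  # one tune, parsed from the ABC source on the fly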