# Source code for cpmpy.tools.xcsp3.dataset

"""
PyTorch-style Dataset for XCSP3 competition instances.

Simply create a dataset instance (configured for the targeted competition year/track) and start iterating over its contents:

.. code-block:: python

    from cpmpy.tools.xcsp3 import XCSP3Dataset, read_xcsp3

    for filename, metadata in XCSP3Dataset(year=2024, track="COP", download=True): # auto download dataset and iterate over its instances
        # Do whatever you want here, e.g. reading to a CPMpy model and solving it:
        model = read_xcsp3(filename)
        model.solve()
        print(model.status())

The `metadata` contains useful information about the current problem instance.

Since the dataset is PyTorch compatible, it can be used with a DataLoader:

.. code-block:: python

    from cpmpy.tools.xcsp3 import XCSP3Dataset, read_xcsp3

    # Initialize the dataset
    dataset = XCSP3Dataset(year=2024, track="COP", download=True)

    from torch.utils.data import DataLoader

    # Wrap dataset in a DataLoader
    data_loader = DataLoader(dataset, batch_size=10, shuffle=False)

    # Iterate over the dataset
    for batch in data_loader:
        # Your code here
"""

import pathlib
from typing import Tuple, Any
import xml.etree.ElementTree as ET
from urllib.request import urlretrieve
from urllib.error import HTTPError, URLError
import zipfile

class XCSP3Dataset(object):  # torch.utils.data.Dataset compatible
    """
    XCSP3 Dataset in a PyTorch compatible format.

    Instances live in ``<root>/<year>/<track>/`` as ``*.xml.lzma`` files and
    are indexed in sorted filename order.

    Arguments:
        root (str): Root directory containing the XCSP3 instances (if 'download', instances will be downloaded to this location)
        year (int): Competition year (2022, 2023 or 2024)
        track (str): Track to load (e.g., "COP", "CSP", "MiniCOP"). Required, despite the ``None`` default.
        transform (callable, optional): Optional transform to be applied on the instance data (the file path of each problem instance)
        target_transform (callable, optional): Optional transform to be applied on the metadata (the metadata dictionary of each problem instance)
        download (bool): If True, downloads the dataset from the internet and puts it in `root` directory

    Raises:
        ValueError: If `year` does not start with '20', if `track` is missing,
            if the track folder is absent and `download` is False, or if the
            download / extraction fails to produce the requested track.
    """

    # Base location of the per-year competition archives.
    _BASE_URL = "https://www.cril.univ-artois.fr/~lecoutre/compets/"
    # File extension of the compressed problem instances.
    _INSTANCE_SUFFIX = ".xml.lzma"

    def __init__(self, root: str = ".", year: int = 2023, track: str = None, transform=None, target_transform=None, download: bool = False):
        """
        Initialize the XCSP3 Dataset.
        """
        # Validate before building any path: joining a Path with `track=None`
        # would otherwise fail with an unhelpful TypeError.
        if not str(year).startswith('20'):
            raise ValueError("Year must start with '20'")
        if not track:
            raise ValueError("Track must be specified, e.g. COP, CSP, MiniCOP, ...")

        self.root = pathlib.Path(root)
        self.year = year
        self.transform = transform
        self.target_transform = target_transform
        self.track = track
        self.track_dir = self.root / str(year) / track

        # Create root directory if it doesn't exist
        self.root.mkdir(parents=True, exist_ok=True)

        if not self.track_dir.exists():
            if not download:
                raise ValueError(f"Dataset for year {year} and track {track} not found. Please set download=True to download the dataset.")
            self._download()

    def _download(self) -> None:
        """Fetch the archive for ``self.year`` and unpack ``self.track`` from it."""
        print(f"Downloading XCSP3 {self.year} instances...")
        year_suffix = str(self.year)[2:]  # Drop the starting '20'
        archive_name = f"instancesXCSP{year_suffix}.zip"
        zip_path = self.root / archive_name
        try:
            try:
                urlretrieve(self._BASE_URL + archive_name, str(zip_path))
            except (HTTPError, URLError) as e:
                raise ValueError(f"No dataset available for year {self.year}. Error: {str(e)}") from e
            self._extract_track(zip_path)
        finally:
            # Clean up the zip file, also when the download or extraction failed
            if zip_path.exists():
                zip_path.unlink()

    def _extract_track(self, zip_path: pathlib.Path) -> None:
        """Copy the files of ``self.track`` from the archive into ``self.track_dir``."""
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            entries = zip_ref.infolist()

            # The main folder (e.g., "024_V3") is the first path component of
            # the first entry that has one.
            main_folder = next(
                (info.filename.split('/')[0] for info in entries if '/' in info.filename),
                None,
            )
            if main_folder is None:
                raise ValueError("Could not find main folder in zip file")

            # Track names are the second-level folders under the main folder
            tracks = set()
            for info in entries:
                parts = info.filename.split('/')
                if len(parts) > 2 and parts[0] == main_folder:
                    tracks.add(parts[1])

            if self.track not in tracks:
                raise ValueError(f"Track '{self.track}' not found in dataset. Available tracks: {sorted(tracks)}")

            # Only create the track folder once we know there is something to put in it
            self.track_dir.mkdir(parents=True, exist_ok=True)

            prefix = f"{main_folder}/{self.track}/"
            for info in entries:
                # Directory entries (names ending in '/') carry no data; writing
                # them would leave an empty file named after the folder.
                if info.is_dir() or not info.filename.startswith(prefix):
                    continue
                # Keep only the base name: flattens the archive layout and
                # prevents writing outside of track_dir.
                filename = pathlib.Path(info.filename).name
                with zip_ref.open(info) as source, open(self.track_dir / filename, 'wb') as target:
                    target.write(source.read())

    def _instance_files(self) -> list:
        """Return the instance paths of this track, sorted for deterministic indexing."""
        return sorted(self.track_dir.glob("*" + self._INSTANCE_SUFFIX))

    def __len__(self) -> int:
        """Return the total number of instances."""
        return len(self._instance_files())

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        """
        Get a single XCSP3 instance filename and metadata.

        Args:
            index (int): Index of the instance to retrieve

        Returns:
            Tuple[Any, Any]: A tuple containing:
                - The filename of the instance (or the result of `transform` applied to it)
                - Metadata dictionary with 'year', 'track', 'name' and 'path'
                  (or the result of `target_transform` applied to it)

        Raises:
            IndexError: If `index` is negative or not smaller than ``len(self)``.
        """
        # List the directory once and reuse it for both the bounds check and the lookup
        files = self._instance_files()
        if index < 0 or index >= len(files):
            raise IndexError("Index out of range")

        file_path = files[index]
        filename = str(file_path)
        if self.transform:
            # does not need to remain a filename...
            filename = self.transform(filename)

        # Basic metadata about the instance
        metadata = {
            'year': self.year,
            'track': self.track,
            # Path.stem only drops the last suffix (".lzma"), so strip the full
            # ".xml.lzma" extension explicitly.
            'name': file_path.name[:-len(self._INSTANCE_SUFFIX)],
            'path': filename,
        }
        if self.target_transform:
            metadata = self.target_transform(metadata)

        return filename, metadata
if __name__ == "__main__":
    # Manual smoke test: build the 2024 MiniCOP dataset (with download
    # enabled), then report its size and show the first instance.
    demo = XCSP3Dataset(year=2024, track="MiniCOP", download=True)
    instance_count = len(demo)
    print("Dataset size:", instance_count)
    first_instance = demo[0]
    print("Instance 0:", first_instance)