Source code for cpmpy.tools.opb.dataset

"""
PyTorch-style Dataset for PB competition instances in restricted OPB PB24 format.

Simply create a dataset instance (configured for the targeted competition year/track) and start iterating over its contents:

.. code-block:: python

    from cpmpy.tools.opb import OPBDataset, read_opb

    for filename, metadata in OPBDataset(year=2016, track="DEC-LIN", download=True): # auto download dataset and iterate over its instances
        # Do whatever you want here, e.g. reading to a CPMpy model and solving it:
        model = read_opb(filename)
        model.solve()
        print(model.status())

The `metadata` contains usefull information about the current problem instance.

Since the dataset is PyTorch compatible, it can be used with a DataLoader:

.. code-block:: python

    from cpmpy.tools.opb import OPBDataset, read_opb

    # Initialize the dataset
    dataset = OPBDataset(year=2016, track="DEC-LIN", download=True):

    from torch.utils.data import DataLoader

    # Wrap dataset in a DataLoader
    data_loader = DataLoader(dataset, batch_size=10, shuffle=False)

    # Iterate over the dataset
    for batch in data_loader:
        # Your code here
"""

import os
import pathlib
from typing import Tuple, Any
from urllib.request import urlretrieve
from urllib.error import HTTPError, URLError
import tarfile

[docs]class OPBDataset(object): # torch.utils.data.Dataset compatible """ OPB PB24 Dataset in a PyTorch compatible format. Arguments: root (str): Root directory containing the OPB instances (if 'download', instances will be downloaded to this location) year (int): Competition year (2006, 2007, 2009, 2010, 2011, 2012, 2015, 2016 or 2024) track (str, optional): Filter instances by track type (e.g., "DEC-LIN", "DEC-NLC", "OPT-LIN", "OPT-NLC") transform (callable, optional): Optional transform to be applied on the instance data (the file path of each problem instance) target_transform (callable, optional): Optional transform to be applied on the metadata (the metadata dictionary of each problem instance) download (bool): If True, downloads the dataset from the internet and puts it in `root` directory """ def __init__(self, root: str = ".", year: int = 2023, track: str = None, transform=None, target_transform=None, download: bool = False): """ Initialize the OPB Dataset. """ self.root = pathlib.Path(root) self.year = year self.transform = transform self.target_transform = target_transform self.track = track self.dataset_dir = self.root / str(year) / track if not str(year).startswith('20'): raise ValueError("Year must start with '20'") # Create root directory if it doesn't exist self.root.mkdir(parents=True, exist_ok=True) if not self.dataset_dir.exists(): if not download: raise ValueError(f"Dataset for year {year} and track {track} not found. Please set download=True to download the dataset.") else: print(f"Downloading OPB {year} instances...") url = f"https://www.cril.univ-artois.fr/PB24/benchs/" year_suffix = str(year)[2:] # Drop the starting '20' url_path = url + f"normalized-PB{year_suffix}.tar" tar_path = self.root / f"normalized-extraPB{year_suffix}.tar" try: urlretrieve(url_path, str(tar_path)) except (HTTPError, URLError) as e: raise ValueError(f"No dataset available for year {year}. Error: {str(e)}") # Extract only the specific track folder from the tar with tarfile.open(tar_path, "r:*") as tar_ref: # r:* handles .tar, .tar.gz, .tar.bz2, etc. # Get the main folder name main_folder = None for name in tar_ref.getnames(): if "/" in name: main_folder = name.split("/")[0] break if main_folder is None: raise ValueError(f"Could not find main folder in tar file") # Extract only files from the specified track # Get all unique track names from tar tracks = set() for member in tar_ref.getmembers(): parts = member.name.split("/") if len(parts) > 2 and parts[0] == main_folder: tracks.add(parts[1]) # Check if requested track exists if track not in tracks: raise ValueError(f"Track '{track}' not found in dataset. Available tracks: {sorted(tracks)}") # Create track folder in root directory self.dataset_dir.mkdir(parents=True, exist_ok=True) # Extract files for the specified track prefix = f"{main_folder}/{track}/" for member in tar_ref.getmembers(): if member.name.startswith(prefix) and member.isfile(): # Path relative to main_folder/track relative_path = member.name[len(prefix):] # Flatten: replace "/" with "_" to encode subfolders (some instances have clashing names) flat_name = relative_path.replace("/", "_") target_path = self.dataset_dir / flat_name with tar_ref.extractfile(member) as source, open(target_path, "wb") as target: target.write(source.read()) # Clean up the tar file tar_path.unlink() def __len__(self) -> int: """Return the total number of instances.""" return len(list(self.dataset_dir.glob("*.opb.xz"))) def __getitem__(self, index: int) -> Tuple[Any, Any]: """ Get a single OPB instance filename and metadata. Args: index (int): Index of the instance to retrieve Returns: Tuple[Any, Any]: A tuple containing: - The filename of the instance - Metadata dictionary with file name, track, year etc. """ if index < 0 or index >= len(self): raise IndexError("Index out of range") # Get all compressed XML files and sort for deterministic behavior files = sorted(list(self.dataset_dir.glob("*.opb.xz"))) file_path = files[index] filename = str(file_path) if self.transform: # does not need to remain a filename... filename = self.transform(filename) # Basic metadata about the instance metadata = { 'year': self.year, 'track': self.track, 'author': str(file_path).split(os.sep)[-1].split("_")[0], 'name': file_path.stem.replace('.xml.lzma', ''), 'path': filename, } if self.target_transform: metadata = self.target_transform(metadata) return filename, metadata
if __name__ == "__main__": dataset = OPBDataset(year=2024, track="DEC-LIN", download=True) print("Dataset size:", len(dataset)) print("Instance 0:", dataset[0])