# Source code for cpmpy.tools.datasets.xcsp3

"""
XCSP3 is an XML-based format designed to represent instances of combinatorial constrained problems through Constraint Programming (CP) models. 
Origin: https://xcsp.org/instances/
"""

from __future__ import annotations

import io
import lzma
import os
import pathlib
import re
import shutil
import zipfile
from typing import Any, Optional, Dict, Callable

from cpmpy.tools.datasets.core import FileDataset


class XCSP3Dataset(FileDataset):  # torch.utils.data.Dataset compatible
    """
    XCSP3 Dataset in a PyTorch compatible format.

    - Origin: https://xcsp.org/instances/
    - Reference: Audemard, G., Boussemart, F., Lecoutre, C., Piette, C., Tabary, S.
      XCSP3: An Integrated Format for Benchmarking Combinatorial Constrained Problems.
      arXiv:2009.00514, 2020.

    Arguments:
        root (str): Root directory containing the XCSP3 instances
            (if 'download', instances will be downloaded to this location)
        year (int): Competition year (2022, 2023 or 2024)
        track (str, optional): Filter instances by track type (e.g., "COP", "CSP", "MiniCOP")
        transform (callable, optional): Optional transform to be applied on the instance data
            (the file path of each problem instance)
        target_transform (callable, optional): Optional transform to be applied on the metadata
            (the metadata dictionary of each problem instance)
        download (bool): If True, downloads the dataset from the internet and puts it in `root` directory
    """

    name = "xcsp3"
    description = "XCSP3 competition benchmark instances for constraint satisfaction and optimization."
    homepage = "https://xcsp.org/instances/"
    citation = [
        "Audemard, G., Boussemart, F., Lecoutre, C., Piette, C., Tabary, S. XCSP3: An Integrated Format for Benchmarking Combinatorial Constrained Problems. arXiv:2009.00514, 2020.",
    ]

    def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP",
                 transform: Optional[Callable] = None,
                 target_transform: Optional[Callable] = None,
                 download: bool = False, **kwargs: Any):
        """
        Initialize the XCSP3 Dataset.

        The instances are looked up in (and, when requested, downloaded to)
        ``<root>/xcsp3/<year>/<track>``. All remaining arguments are forwarded
        unchanged to ``FileDataset.__init__``.

        Raises:
            ValueError: If ``year`` does not start with ``20`` or ``track`` is empty/None.
        """
        # Validate before building the path: joining a path with `track=None`
        # would otherwise fail with an unrelated TypeError.
        if not str(year).startswith('20'):
            raise ValueError("Year must start with '20'")
        if not track:
            raise ValueError("Track must be specified, e.g. COP, CSP, MiniCOP, ...")

        self.root = pathlib.Path(root)
        self.year = year
        self.track = track

        dataset_dir = self.root / self.name / str(year) / track

        super().__init__(
            dataset_dir=dataset_dir,
            transform=transform, target_transform=target_transform,
            download=download, extension=".xml.lzma",
            **kwargs
        )

    def categories(self) -> Dict[str, Any]:
        """Return the year and track this dataset instance was created for."""
        return {
            "year": self.year,
            "track": self.track
        }

    def collect_instance_metadata(self, file: pathlib.Path) -> Dict[str, Any]:
        """
        Extract instance type (CSP/COP) from XCSP3 XML root element.

        Only the beginning of the file is read. The ``type`` and ``format``
        attributes of the ``<instance ...>`` opening tag are returned as
        ``instance_type`` and ``xcsp_format``; a key is omitted when the
        attribute is not found. This is best-effort: an unreadable or corrupt
        file yields an empty dictionary instead of raising.
        """
        result: Dict[str, Any] = {}
        header = ""
        root_tag = None

        try:
            with self.open(file) as f:
                # Scan a bounded number of lines until the root opening tag is
                # complete. Stopping at the first '>' is not enough: an XML
                # declaration (`<?xml ... ?>`) or a comment may precede the root.
                for _ in range(50):
                    line = f.readline()
                    if not line:
                        break
                    header += line
                    root_tag = re.search(r'<instance\b[^>]*>', header)
                    if root_tag:
                        break
        except (OSError, EOFError, lzma.LZMAError, UnicodeDecodeError):
            # Missing, truncated, corrupt or wrongly encoded file: no metadata.
            return result

        # Prefer the root tag; fall back to everything read when it was not
        # found within the scanned lines.
        scope = root_tag.group(0) if root_tag else header

        # `\b` keeps e.g. a `subtype="..."` attribute from being taken for `type`.
        match = re.search(r'\btype\s*=\s*"([^"]+)"', scope)
        if match:
            result["instance_type"] = match.group(1)
        match = re.search(r'\bformat\s*=\s*"([^"]+)"', scope)
        if match:
            result["xcsp_format"] = match.group(1)

        return result

    def download(self):
        """
        Download the competition archive for ``self.year`` and extract ``self.track``.

        The zip is fetched into ``self.root`` through ``self._download_file``
        (inherited; its exact contract is not visible here). Every file below
        ``<main folder>/<track>/`` is written flat into ``self.dataset_dir``
        and the zip is removed after a successful extraction.

        Raises:
            ValueError: If the archive for the year cannot be downloaded, the
                archive has no top-level folder, or the track is not in it.
        """
        url = "https://www.cril.univ-artois.fr/~lecoutre/compets/"
        target = f"instancesXCSP{str(self.year)[2:]}.zip"
        target_download_path = self.root / target

        print(f"Downloading XCSP3 {self.year} instances from www.cril.univ-artois.fr")

        try:
            target_download_path = self._download_file(url, target, destination=str(target_download_path))
        except ValueError as e:
            raise ValueError(f"No dataset available for year {self.year}. Error: {str(e)}") from e

        # Normalise so that the cleanup below works for both str and Path results.
        archive_path = pathlib.Path(target_download_path)

        # Extract only the specific track folder from the zip
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            entries = zip_ref.infolist()

            # Get the main folder name (e.g., "024_V3")
            main_folder = next(
                (info.filename.split('/')[0] for info in entries if '/' in info.filename),
                None,
            )
            if main_folder is None:
                raise ValueError("Could not find main folder in zip file")

            # Get all unique track names from zip
            tracks = set()
            for info in entries:
                parts = info.filename.split('/')
                if len(parts) > 2 and parts[0] == main_folder:
                    tracks.add(parts[1])

            # Check if requested track exists
            if self.track not in tracks:
                raise ValueError(f"Track '{self.track}' not found in dataset. Available tracks: {sorted(tracks)}")

            # Create track folder in root directory, parents=True ensures recursive creation
            self.dataset_dir.mkdir(parents=True, exist_ok=True)

            # Extract files for the specified track
            prefix = f"{main_folder}/{self.track}/"
            for info in entries:
                if not info.filename.startswith(prefix):
                    continue
                # Directory entries carry no data; writing them out would leave
                # empty files named after the folder in the dataset directory.
                if info.is_dir():
                    continue
                # Drop the main_folder/track prefix (and any sub-folders).
                filename = pathlib.Path(info.filename).name
                with zip_ref.open(info) as source, open(self.dataset_dir / filename, 'wb') as out_file:
                    # Stream in chunks rather than loading the member in memory.
                    shutil.copyfileobj(source, out_file)

        # Clean up the zip file
        archive_path.unlink()

    @classmethod
    def open(cls, instance: os.PathLike) -> io.TextIOBase:
        """
        Open an instance file for reading as UTF-8 text.

        Paths ending in ``.lzma`` are decompressed transparently; any other
        path is opened as a plain text file.
        """
        if str(instance).endswith(".lzma"):
            return lzma.open(instance, mode='rt', encoding='utf-8')
        # Explicit encoding so plain files decode the same way as compressed
        # ones, independent of the platform default.
        return open(instance, mode='r', encoding='utf-8')
if __name__ == "__main__":
    # Manual smoke test: fetch the 2024 MiniCOP track, then report how many
    # instances are available and show the first one.
    dataset = XCSP3Dataset(
        year=2024,
        track="MiniCOP",
        download=True,
    )

    instance_count = len(dataset)
    print("Dataset size:", instance_count)

    first_instance = dataset[0]
    print("Instance 0:", first_instance)