# Source code for cpmpy.tools.datasets.xcsp3

"""
XCSP3 is an XML-based format designed to represent instances of combinatorial constrained problems through Constraint Programming (CP) models. 
Origin: https://xcsp.org/instances/
"""

from __future__ import annotations

import os
import lzma
import zipfile
import pathlib
import io
from typing import Any, Optional, Dict, Callable

from cpmpy.tools.datasets.core import FileDataset



# [docs]
class XCSP3Dataset(FileDataset):  # torch.utils.data.Dataset compatible

    """
    XCSP3 Dataset in a PyTorch compatible format.

    - Origin: https://xcsp.org/instances/
    - Reference: Audemard, G., Boussemart, F., Lecoutre, C., Piette, C., Tabary, S. XCSP3: An Integrated Format for Benchmarking Combinatorial Constrained Problems. arXiv:2009.00514, 2020.

    The dataset directory handed to the base class is
    ``<root>/xcsp3/<year>/<track>``; instance files are expected to carry the
    ``.xml.lzma`` extension.

    Arguments:
        root (str): Root directory containing the XCSP3 instances (if 'download', instances will be downloaded to this location). Defaults to the current directory.
        year (int): Competition year (e.g. 2022, 2023 or 2024). Must start with "20"; its last two digits select the archive that is downloaded.
        track (str): Track whose instances make up the dataset (e.g., "COP", "CSP", "MiniCOP"). Must be non-empty; defaults to "CSP".
        transform (callable, optional): Optional transform to be applied on the instance data (the file path of each problem instance)
        target_transform (callable, optional): Optional transform to be applied on the metadata (the metadata dictionary of each problem instance)
        download (bool): If True, downloads the dataset from the internet and puts it in `root` directory
        **kwargs: Additional keyword arguments forwarded unchanged to the base class constructor.
    """

    # Short identifier; also used as the first path component below `root`.
    name = "xcsp3"
    # One-line human-readable summary of the dataset.
    description = "XCSP3 competition benchmark instances for constraint satisfaction and optimization."
    # Landing page of the instance collection.
    homepage = "https://xcsp.org/instances/"
    # Publication(s) to cite when using these instances.
    citation = [
        "Audemard, G., Boussemart, F., Lecoutre, C., Piette, C., Tabary, S. XCSP3: An Integrated Format for Benchmarking Combinatorial Constrained Problems. arXiv:2009.00514, 2020.",
    ]


    def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP",
                 transform: Optional[Callable] = None, target_transform: Optional[Callable] = None,
                 download: bool = False, **kwargs: Any):
        """
        Initialize the XCSP3 Dataset.
        """

        self.root = pathlib.Path(root)
        self.year = year
        self.track = track

        dataset_dir = self.root / self.name / str(year) / track
        
        if not str(year).startswith('20'):
            raise ValueError("Year must start with '20'")
        if not track:
            raise ValueError("Track must be specified, e.g. COP, CSP, MiniCOP, ...")
        
        super().__init__(
            dataset_dir=dataset_dir,
            transform=transform, target_transform=target_transform, 
            download=download, extension=".xml.lzma",
            **kwargs
        )


# [docs]
    def categories(self) -> Dict[str, Any]:
        return {
            "year": self.year,
            "track": self.track
        }



# [docs]
    def collect_instance_metadata(self, file: pathlib.Path) -> Dict[str, Any]:
        """
        Extract instance type (CSP/COP) from XCSP3 XML root element.
        """
        import re
        result: Dict[str, Any] = {}
        try:
            with self.open(file) as f:
                # Read only the first few lines to find the root element
                header = ""
                for _ in range(10):
                    line = f.readline()
                    if not line:
                        break
                    header += line
                    if ">" in line:
                        break
                match = re.search(r'type\s*=\s*"([^"]+)"', header)
                if match:
                    result["instance_type"] = match.group(1)
                match = re.search(r'format\s*=\s*"([^"]+)"', header)
                if match:
                    result["xcsp_format"] = match.group(1)
        except Exception:
            pass
        return result



# [docs]
    def download(self):

        url = "https://www.cril.univ-artois.fr/~lecoutre/compets/"
        target = f"instancesXCSP{str(self.year)[2:]}.zip"
        target_download_path = self.root / target

        print(f"Downloading XCSP3 {self.year} instances from www.cril.univ-artois.fr")

        try:
            target_download_path = self._download_file(url, target, destination=str(target_download_path))
        except ValueError as e:
            raise ValueError(f"No dataset available for year {self.year}. Error: {str(e)}")

        # Extract only the specific track folder from the zip
        with zipfile.ZipFile(target_download_path, 'r') as zip_ref:
            # Get the main folder name (e.g., "024_V3")
            main_folder = None
            for name in zip_ref.namelist():
                if '/' in name:
                    main_folder = name.split('/')[0]
                    break
            
            if main_folder is None:
                raise ValueError("Could not find main folder in zip file")

            # Extract only files from the specified track
            # Get all unique track names from zip
            tracks = set()
            for file_info in zip_ref.infolist():
                parts = file_info.filename.split('/')
                if len(parts) > 2 and parts[0] == main_folder:
                    tracks.add(parts[1])
            
            # Check if requested track exists
            if self.track not in tracks:
                raise ValueError(f"Track '{self.track}' not found in dataset. Available tracks: {sorted(tracks)}")
            
            # Create track folder in root directory, parents=True ensures recursive creation
            self.dataset_dir.mkdir(parents=True, exist_ok=True)
            
            # Extract files for the specified track
            prefix = f"{main_folder}/{self.track}/"
            for file_info in zip_ref.infolist():
                if file_info.filename.startswith(prefix):
                    # Extract file to track_dir, removing main_folder/track prefix
                    filename = pathlib.Path(file_info.filename).name
                    with zip_ref.open(file_info) as source, open(self.dataset_dir / filename, 'wb') as out_file:
                        out_file.write(source.read())

        # Clean up the zip file
        target_download_path.unlink()




# [docs]
    @classmethod
    def open(cls, instance: os.PathLike) -> io.TextIOBase:
        return lzma.open(instance, mode='rt', encoding='utf-8') if str(instance).endswith(".lzma") else open(instance)




if __name__ == "__main__":
    # Manual smoke run: fetch the 2024 MiniCOP track, then report how many
    # instances were found and show the first one.
    demo = XCSP3Dataset(download=True, track="MiniCOP", year=2024)
    size = len(demo)
    print("Dataset size:", size)
    first = demo[0]
    print("Instance 0:", first)