# Source code for prefgraph.datasets._yoochoose

"""Yoochoose RecSys 2015 click-stream dataset loader.

Loads click session data from the RecSys 2015 challenge and reconstructs
menu-choice observations: items clicked in a session = menu, purchased
item = choice.

Data download:
  The dataset is available from the RecSys 2015 challenge archives.
  Place yoochoose-clicks.dat and yoochoose-buys.dat in the data directory.

Source: https://recsys.acm.org/recsys15/challenge/
"""

from __future__ import annotations

import os
from pathlib import Path

import numpy as np

from prefgraph.core.session import MenuChoiceLog


# Presumably the inactivity gap (in minutes) used to split click streams into
# sessions. NOTE(review): not referenced in the visible part of this module --
# confirm whether it is still needed.
SESSION_GAP_MINUTES = 30
# load_yoochoose skips sessions whose menu (clicked items plus the purchased
# item) contains fewer items than this.
MIN_MENU_SIZE = 2
# load_yoochoose skips sessions whose menu contains more items than this.
MAX_MENU_SIZE = 50


def _find_data_dir(data_dir: str | Path | None) -> Path:
    """Find Yoochoose data directory via cascade."""
    candidates = []
    if data_dir is not None:
        candidates.append(Path(data_dir))

    env = os.environ.get("PYREVEALED_DATA_DIR")
    if env:
        candidates.append(Path(env) / "yoochoose")

    candidates.extend([
        Path.home() / ".prefgraph" / "data" / "yoochoose",
        Path(__file__).resolve().parents[3] / "datasets" / "yoochoose" / "data",
    ])

    for d in candidates:
        if d.is_dir() and (
            (d / "yoochoose-clicks.dat").exists() or
            (d / "yoochoose-clicks.csv").exists()
        ):
            return d

    searched = "\n  ".join(str(c) for c in candidates)
    raise FileNotFoundError(
        f"Yoochoose data not found. Searched:\n  {searched}\n\n"
        "Download from the RecSys 2015 challenge:\n"
        "  https://recsys.acm.org/recsys15/challenge/\n\n"
        "Required files: yoochoose-clicks.dat, yoochoose-buys.dat\n"
        "Place in one of the directories above."
    )


def load_yoochoose(
    data_dir: str | Path | None = None,
    min_sessions: int = 5,
    max_users: int | None = 5000,
    remap_items: bool = True,
) -> dict[str, MenuChoiceLog]:
    """Load Yoochoose click-stream data as menu-choice observations.

    Each session in which exactly one distinct item was purchased becomes one
    observation: the items clicked in the session (plus the purchased item)
    form the menu, and the purchased item is the choice. Sessions whose menu
    size falls outside ``MIN_MENU_SIZE``..``MAX_MENU_SIZE`` are dropped.

    Yoochoose sessions are anonymous, so there are no real user IDs. Sessions
    are instead grouped by the category recorded on their clicks and cut into
    consecutive batches of up to ``2 * min_sessions`` sessions. Every batch
    holding at least ``min_sessions`` sessions becomes one synthetic "user".

    Args:
        data_dir: Path to directory containing yoochoose-clicks.dat and
            yoochoose-buys.dat (``.csv`` variants are accepted as well).
        min_sessions: Minimum purchase sessions per synthetic user; must be
            at least 1.
        max_users: Cap on the number of users returned (default 5000).
            ``None`` disables the cap.
        remap_items: Remap item IDs to 0..N-1 per user.

    Returns:
        Dict mapping synthetic user ID (``"user_0"``, ``"user_1"``, ...) to
        the ``MenuChoiceLog`` built from that user's sessions. Empty when no
        session passes the filters.

    Raises:
        ValueError: If ``min_sessions`` is smaller than 1.
        ImportError: If pandas is not installed.
        FileNotFoundError: If the data directory, the clicks file or the buys
            file cannot be found.
    """
    # Fail fast: a zero step would otherwise surface as an opaque range()
    # error only after the (large) input files have been read.
    if min_sessions < 1:
        raise ValueError(f"min_sessions must be >= 1, got {min_sessions}")

    try:
        import pandas as pd
    except ImportError:
        raise ImportError(
            "pandas is required for dataset loaders. "
            "Install with: pip install 'prefgraph[datasets]'"
        ) from None

    data_path = _find_data_dir(data_dir)

    def _resolve_data_file(stem: str) -> Path:
        """Return ``<stem>.dat`` if present, else ``<stem>.csv``."""
        for suffix in (".dat", ".csv"):
            candidate = data_path / f"{stem}{suffix}"
            if candidate.exists():
                return candidate
        raise FileNotFoundError(
            f"Yoochoose file not found: expected {stem}.dat or {stem}.csv "
            f"in {data_path}"
        )

    # Load clicks. The timestamp column is never used, so it is skipped
    # instead of being date-parsed for every row. The category column is
    # forced to str: with inferred dtypes the same category can be read as
    # an int in one part of the file and as a str in another, which would
    # split it into two groups below.
    clicks_file = _resolve_data_file("yoochoose-clicks")
    print(f" Loading Yoochoose clicks from {clicks_file}...")
    clicks = pd.read_csv(
        clicks_file,
        names=["session_id", "timestamp", "item_id", "category"],
        usecols=["session_id", "item_id", "category"],
        dtype={"category": str},
    )
    print(f" Raw clicks: {len(clicks):,}")

    # Load buys (only the session and item columns are needed).
    buys_file = _resolve_data_file("yoochoose-buys")
    buys = pd.read_csv(
        buys_file,
        names=["session_id", "timestamp", "item_id", "price", "quantity"],
        usecols=["session_id", "item_id"],
    )
    print(f" Raw buys: {len(buys):,}")

    # Keep sessions with exactly 1 unique purchased item, so that the
    # "choice" of a session is unambiguous.
    items_per_session = buys.groupby("session_id")["item_id"].nunique()
    single_buy_sessions = set(items_per_session[items_per_session == 1].index)

    # Purchased item per retained session.
    session_purchases = (
        buys[buys["session_id"].isin(single_buy_sessions)]
        .groupby("session_id")["item_id"]
        .first()
        .to_dict()
    )

    # Clicks belonging to retained sessions, and the set of clicked items
    # (the menu) for each of them.
    clicks_with_buys = clicks[clicks["session_id"].isin(single_buy_sessions)]
    session_menus = (
        clicks_with_buys
        .groupby("session_id")["item_id"]
        .apply(set)
        .to_dict()
    )

    # Build (menu, choice) records.
    records = []
    for session_id, clicked in session_menus.items():
        choice = session_purchases.get(session_id)
        if choice is None:
            continue
        # The purchased item may never have been clicked; make sure the
        # choice is always part of its own menu.
        menu = frozenset(clicked | {choice})
        if not MIN_MENU_SIZE <= len(menu) <= MAX_MENU_SIZE:
            continue
        records.append({
            "session_id": session_id,
            "menu": menu,
            "choice": choice,
        })

    print(f" Valid sessions: {len(records):,}")

    if not records:
        # A DataFrame built from an empty list has no columns, so the
        # grouping below would fail with KeyError.
        print(" Built 0 MenuChoiceLog objects")
        return {}

    # Each Yoochoose session is a single observation, which on its own is
    # not useful for revealed-preference analysis. Sessions are therefore
    # pooled into synthetic users: grouped by click category, then split
    # into consecutive batches.
    df = pd.DataFrame(records)
    click_categories = (
        clicks_with_buys[clicks_with_buys["session_id"].isin(df["session_id"])]
        .groupby("session_id")["category"]
        .first()
        .to_dict()
    )
    df["category"] = df["session_id"].map(click_categories)

    chunk_size = min_sessions * 2

    def _iter_user_chunks():
        """Yield batches of session records, one batch per synthetic user."""
        for _, group in df.groupby("category"):
            if len(group) < min_sessions:
                continue
            rows = group.to_dict("records")
            for start in range(0, len(rows), chunk_size):
                chunk = rows[start:start + chunk_size]
                # The trailing batch of a category may be too short.
                if len(chunk) >= min_sessions:
                    yield chunk

    user_logs: dict[str, MenuChoiceLog] = {}
    for chunk in _iter_user_chunks():
        # Checked before adding, so max_users=0 really returns no users.
        if max_users is not None and len(user_logs) >= max_users:
            break

        menus = [row["menu"] for row in chunk]
        choices = [row["choice"] for row in chunk]

        if remap_items:
            # Dense per-user IDs, assigned in sorted order of the raw IDs.
            all_items = set().union(*menus)
            item_map = {item: idx for idx, item in enumerate(sorted(all_items))}
            menus = [frozenset(item_map[i] for i in m) for m in menus]
            choices = [item_map[c] for c in choices]

        user_logs[f"user_{len(user_logs)}"] = MenuChoiceLog(
            menus=menus, choices=choices
        )

    print(f" Built {len(user_logs)} MenuChoiceLog objects")
    return user_logs