"""Dataset loaders for PrefGraph.
Provides sklearn-style dataset loading functions that return BehaviorPanel
objects ready for analysis.
Example:
>>> from prefgraph.datasets import load_dunnhumby
>>> panel = load_dunnhumby()
>>> print(panel.summary())
Note:
Data files are NOT bundled with PrefGraph (too large for PyPI).
You must download them separately. Each loader provides instructions
when the data is not found.
"""
from prefgraph.datasets._demo import load_demo
from prefgraph.datasets._generators import (
generate_random_budgets,
generate_random_menus,
generate_random_production,
generate_random_intertemporal,
)
from prefgraph.datasets._dunnhumby import load_dunnhumby
from prefgraph.datasets._open_ecommerce import load_open_ecommerce
from prefgraph.datasets._uci_retail import load_uci_retail
# These loaders require pandas at module level. Import lazily so that
# `import prefgraph` does not crash without pandas installed.
# The 4 offenders: _retailrocket, _rees46, _taobao, _tenrec all have
# bare `import pandas as pd` at file scope. Wrapping them here defers
# the pandas import until the loader function is actually called.
from prefgraph.datasets._instacart import load_instacart
from prefgraph.datasets._instacart_menu_v2 import load_instacart_menu_v2
from prefgraph.datasets._yoochoose import load_yoochoose
from prefgraph.datasets._olist import load_olist
from prefgraph.datasets._m5 import load_m5
from prefgraph.datasets._online_retail_ii import load_online_retail_ii
from prefgraph.datasets._hm import load_hm
from prefgraph.datasets._pakistan import load_pakistan
from prefgraph.datasets._favorita import load_favorita
[docs]
def load_retailrocket(*args, **kwargs):
"""Lazy wrapper - defers pandas import until called."""
from prefgraph.datasets._retailrocket import load_retailrocket as _fn
return _fn(*args, **kwargs)
[docs]
def load_rees46(*args, **kwargs):
"""Lazy wrapper - defers pandas import until called."""
from prefgraph.datasets._rees46 import load_rees46 as _fn
return _fn(*args, **kwargs)
[docs]
def load_taobao(*args, **kwargs):
"""Lazy wrapper - defers pandas import until called."""
from prefgraph.datasets._taobao import load_taobao as _fn
return _fn(*args, **kwargs)
def load_tenrec(*args, **kwargs):
"""Lazy wrapper - defers pandas import until called."""
from prefgraph.datasets._tenrec import load_tenrec as _fn
return _fn(*args, **kwargs)
def load_kuairec(*args, **kwargs):
"""Lazy wrapper - defers polars import until called."""
from prefgraph.datasets._kuairec import load_kuairec as _fn
return _fn(*args, **kwargs)
def load_mind(*args, **kwargs):
"""Lazy wrapper - defers polars import until called."""
from prefgraph.datasets._mind import load_mind as _fn
return _fn(*args, **kwargs)
def load_finn_slates(*args, **kwargs):
"""Lazy wrapper - defers numpy import until called."""
from prefgraph.datasets._finn_slates import load_finn_slates as _fn
return _fn(*args, **kwargs)
[docs]
def list_datasets() -> list[dict[str, str]]:
"""List available datasets with descriptions.
Returns:
List of dicts with name, description, source, and size info.
"""
return [
{
"name": "demo",
"description": "Synthetic budget data: 40% rational, 40% noisy, 20% irrational consumers",
"source": "Generated (seeded, deterministic)",
"goods": "5 categories (default)",
"observations": "15 per user (default)",
},
{
"name": "retailrocket",
"description": "E-commerce click-stream from 1.4M visitors, reconstructed into menu choices",
"source": "Kaggle (retailrocket/ecommerce-dataset)",
"goods": "Menu-based (items viewed per session)",
"observations": "~5-50 sessions per user",
},
{
"name": "dunnhumby",
"description": "Grocery transactions from 2,500 households over 2 years",
"source": "Kaggle (dunnhumby - The Complete Journey)",
"goods": "10 commodity categories",
"observations": "~50 weeks per household",
},
{
"name": "open_ecommerce",
"description": "Amazon purchases from ~4,700 consumers over 2018-2022",
"source": "Open E-Commerce 1.0 (Crowdsourced)",
"goods": "50 product categories",
"observations": "~20 months per user",
},
{
"name": "uci_retail",
"description": "UK online retail transactions from ~1,800 customers",
"source": "UCI Machine Learning Repository",
"goods": "50 product categories",
"observations": "~8 months per customer",
},
{
"name": "yoochoose",
"description": "E-commerce click sessions from RecSys 2015 challenge",
"source": "RecSys 2015 Challenge",
"goods": "Menu-based (items clicked per session)",
"observations": "~5-50 sessions per user",
},
{
"name": "m5",
"description": "Walmart item sales across 10 stores, 3 states, 1941 days",
"source": "Kaggle (m5-forecasting-accuracy)",
"goods": "7 departments",
"observations": "~277 weeks per store",
},
{
"name": "olist",
"description": "Brazilian e-commerce orders from 96K customers across marketplaces",
"source": "Kaggle (olistbr/brazilian-ecommerce)",
"goods": "20 product categories",
"observations": "~4-12 months per repeat customer",
},
{
"name": "rees46",
"description": "Multi-category eCommerce behavior (view/cart/purchase) from Oct-Nov 2019",
"source": "Kaggle (mkechinov/ecommerce-behavior-data-from-multi-category-store)",
"goods": "Menu-based (items viewed per session)",
"observations": "~5-50 sessions per user",
},
{
"name": "online_retail_ii",
"description": "UK e-commerce transactions from 5,942 customers over Dec 2009 - Dec 2011",
"source": "UCI ML Repository (Online Retail II)",
"goods": "30 top products (real prices)",
"observations": "~4-24 months per customer",
},
{
"name": "hm",
"description": "H&M fashion transactions from 1.36M customers over Sep 2018 - Sep 2020",
"source": "Kaggle (h-and-m-personalized-fashion-recommendations)",
"goods": "20 product groups (article prefix)",
"observations": "~6-24 months per customer",
},
{
"name": "pakistan",
"description": "Pakistan e-commerce transactions with real prices across 16 product categories",
"source": "Kaggle (zusmani/pakistans-largest-ecommerce-dataset)",
"goods": "16 product categories (real PKR prices)",
"observations": "Monthly aggregation, min 5 months per customer",
},
{
"name": "favorita",
"description": "Ecuador Favorita grocery sales from 54 stores across 33 product families",
"source": "Kaggle (favorita-grocery-sales-forecasting)",
"goods": "33 product families (uniform prices)",
"observations": "~200+ weeks per store",
},
{
"name": "taobao",
"description": "Taobao user behavior: 100M click/purchase events from ~1M users",
"source": "Kaggle (marwa80/userbehavior)",
"goods": "Menu-based (daily viewed → purchased items)",
"observations": "~5-50 purchase-days per user",
},
{
"name": "kuairec",
"description": "KuaiRec near-dense video interaction matrix: 1,411 users × 3,327 videos",
"source": "GitHub (chongminggao/KuaiRec), CIKM 2022",
"goods": "Menu-based (daily watched videos → argmax watch_ratio choice)",
"observations": "~5-30 qualifying days per user",
},
{
"name": "mind",
"description": "MIND news impression logs from Microsoft News: ~50K users, 230K+ impressions",
"source": "msnews.github.io (Microsoft Research License), ACL 2020",
"goods": "Menu-based (impression list → clicked article)",
"observations": "~5-50 1-click impressions per user",
},
]
__all__ = [
"load_demo",
"load_dunnhumby",
"load_open_ecommerce",
"load_uci_retail",
"load_retailrocket",
"load_instacart",
"load_instacart_menu_v2",
"load_yoochoose",
"load_olist",
"load_m5",
"load_rees46",
"load_online_retail_ii",
"load_hm",
"load_pakistan",
"load_favorita",
"load_taobao",
"load_tenrec",
"load_kuairec",
"load_mind",
"load_finn_slates",
"list_datasets",
"generate_random_budgets",
"generate_random_menus",
"generate_random_production",
"generate_random_intertemporal",
]