import pandas as pd, numpy as np
from ..util import extract_user_item
from .base import create_dataset_unbiased, Dataset, create_temporal_splits, create_user_splits
from .prepare_netflix_data import prepare_netflix_data
from .prepare_ml_1m_data import prepare_ml_1m_data
from .prepare_yoochoose_data import prepare_yoochoose_data
[docs]def prepare_minimal_dataset():
""" minimal dataset to enable main workflow in unit tests """
event_df = pd.DataFrame([
["u1", "i1", 3],
["u2", "i2", 5],
["u3", "i3", 7],
["u3", "i4", 9],
], columns=["USER_ID", "ITEM_ID", "TIMESTAMP"])
user_df = pd.DataFrame([
("u3", 9),
("u2", 0),
("u1", float('inf')),
("u2", 6), # choose a random order with repeats to ensure that the order is preserved
], columns=['USER_ID', 'TEST_START_TIME']).set_index('USER_ID')
item_df = pd.DataFrame(index=["i1", "i2", "i3", "i4"])
D = create_dataset_unbiased(event_df, user_df, item_df, 100, exclude_train=True)
D._k1 = 2
test_user_ids = D.user_in_test.set_index('TEST_START_TIME', append=True).index.tolist()
test_item_ids = D.item_in_test.index.tolist()
train_user_ids = D.user_df.set_index('TEST_START_TIME', append=True).index.tolist()
train_item_ids = D.item_df.index.tolist()
assert test_user_ids == [('u3', 9.0), ('u2', 6.0)], f"{test_user_ids}"
assert test_item_ids == ['i1', 'i3'], f"{test_item_ids}"
assert train_user_ids == [('u3', 9.0), ('u2', 0.0), ('u1', float('inf'))], f"{train_user_ids}"
assert train_item_ids == ['i1', 'i2', 'i3', 'i4'], f"{train_item_ids}"
return (D,)
[docs]def prepare_synthetic_data(split_fn_name, exclude_train=False,
num_users=300, num_items=200, num_events=10000):
""" prepare synthetic data for end-to-end unit tests """
event_df = pd.DataFrame({
'USER_ID': np.random.choice(num_users, num_events),
'ITEM_ID': np.random.choice(num_items, num_events),
'TIMESTAMP': np.random.uniform(0, 5, num_events),
}).sort_values(["USER_ID", "TIMESTAMP"])
user_df, item_df = extract_user_item(event_df)
if split_fn_name == 'split_by_time':
D, V = create_temporal_splits(event_df, user_df, item_df, 4, 1, 1)
elif split_fn_name == 'split_by_user':
D, V = create_user_splits(event_df, user_df.assign(_in_GroupA=user_df.index % 2),
item_df, 3, 1)
else:
raise ValueError(f"unknown {split_fn_name}")
D._is_synthetic_data = True # for hawkes_poisson verification purposes
return (D, V)
[docs]def prepare_simple_pattern():
""" Transformer(D.item_df, max_epochs=100).fit(D.auto_regressive) # flaky
RNN(D.item_df, max_epochs=50).fit(D.auto_regressive) # stable
"""
event_df = pd.DataFrame({
"USER_ID": 1,
"ITEM_ID": [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2],
"TIMESTAMP": 1 + np.arange(12),
})
user_df, item_df = extract_user_item(event_df)
user_df['TEST_START_TIME'] = 12
D = create_dataset_unbiased(event_df, user_df, item_df, 1)
D.print_stats()
return (D, None)