- Implements classic ranking and multi-task models in PyTorch and exposes them through a unified API, greatly reducing the time cost of applying ranking / multi-task models (see the demos below)
- All models are implemented in PyTorch so that newcomers to recommender systems can better understand the core ideas of each algorithm
- Many excellent open-source projects already exist; for the most common modules we reference those implementations, and we sincerely thank their contributors
# Latest version
git clone https://github.com/HaSai666/rec_pangu.git
cd rec_pangu
pip install -e . --verbose
# Stable version
pip install rec_pangu --upgrade
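A quick sanity check after installation. This is a minimal sketch; it only assumes the package imports under the name `rec_pangu` (as used in the demos below), and prints where it was installed from:

```python
# Confirm the package is importable and show its install location.
import rec_pangu
print(rec_pangu.__file__)
```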
The following types of sequential recall models are currently supported:
- Classic sequential recall models
- Graph-based sequential recall models
- Multi-interest sequential recall models
- LLM-based sequential recall models
Model | Type | Paper | Year |
---|---|---|---|
NGCF(ToDo) | Graph Collaborative Filtering | Neural Graph Collaborative Filtering | 2019 |
LightGCN(ToDo) | Graph Collaborative Filtering | LightGCN: Simplifying and Powering Graph Convolution Network for Recommendation | 2020 |
NCL(ToDo) | Graph Contrastive Learning | Improving Graph Collaborative Filtering with Neighborhood-enriched Contrastive Learning | 2022 |
SimGCL(ToDo) | Graph Contrastive Learning | Are Graph Augmentations Necessary? Simple Graph Contrastive Learning for Recommendation | 2022 |
SGL(ToDo) | Graph Contrastive Learning | Self-supervised Graph Learning for Recommendation | 2021 |
The ranking and multi-task models expose nearly identical interfaces, and wandb is also supported for monitoring training metrics in real time. Demos for ranking models, multi-task models, and wandb are given below.
import torch
from rec_pangu.dataset import get_dataloader
from rec_pangu.models.ranking import WDL, DeepFM, NFM, FiBiNet, AFM, AFN, AOANet, AutoInt, CCPM, LR, FM, xDeepFM
from rec_pangu.trainer import RankTrainer
import pandas as pd

if __name__ == '__main__':
    df = pd.read_csv('sample_data/ranking_sample_data.csv')
    print(df.head())
    # Declare the data schema
    schema = {
        "sparse_cols": ['user_id', 'item_id', 'item_type', 'dayofweek', 'is_workday', 'city', 'county',
                        'town', 'village', 'lbs_city', 'lbs_district', 'hardware_platform', 'hardware_ischarging',
                        'os_type', 'network_type', 'position'],
        "dense_cols": ['item_expo_1d', 'item_expo_7d', 'item_expo_14d', 'item_expo_30d', 'item_clk_1d',
                       'item_clk_7d', 'item_clk_14d', 'item_clk_30d', 'use_duration'],
        "label_col": 'click',
    }
    # Prepare the data; the sample file only has 100 rows, so no train/valid/test split is made
    train_df = df
    valid_df = df
    test_df = df
    # Declare the device
    device = torch.device('cpu')
    # Get the dataloaders
    train_loader, valid_loader, test_loader, enc_dict = get_dataloader(train_df, valid_df, test_df, schema)
    # Declare the model; supported ranking models: WDL, DeepFM, NFM, FiBiNet, AFM, AFN, AOANet, AutoInt, CCPM, LR, FM, xDeepFM
    model = xDeepFM(enc_dict=enc_dict)
    # Declare the trainer
    trainer = RankTrainer(num_task=1)
    # Train the model
    trainer.fit(model, train_loader, valid_loader, epoch=5, lr=1e-3, device=device)
    # Save the model weights
    trainer.save_model(model, './model_ckpt')
    # Evaluate the model
    test_metric = trainer.evaluate_model(model, test_loader, device=device)
    print('Test metric:{}'.format(test_metric))
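Since the ranking models listed above share the same constructor and trainer interface in this demo, swapping in a different model is a one-line change. A minimal sketch reusing `enc_dict`, the dataloaders, and `device` from the demo above (DeepFM is only an illustrative choice):

```python
# Illustrative swap: any of the imported ranking models takes the same enc_dict in this demo.
model = DeepFM(enc_dict=enc_dict)
trainer = RankTrainer(num_task=1)
trainer.fit(model, train_loader, valid_loader, epoch=5, lr=1e-3, device=device)
```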
import torch
from rec_pangu.dataset import get_dataloader
from rec_pangu.models.multi_task import AITM, ShareBottom, ESSM, MMOE, OMOE, MLMMOE
from rec_pangu.trainer import RankTrainer
import pandas as pd

if __name__ == '__main__':
    df = pd.read_csv('sample_data/multi_task_sample_data.csv')
    print(df.head())
    # Declare the data schema
    schema = {
        "sparse_cols": ['user_id', 'item_id', 'item_type', 'dayofweek', 'is_workday', 'city', 'county',
                        'town', 'village', 'lbs_city', 'lbs_district', 'hardware_platform', 'hardware_ischarging',
                        'os_type', 'network_type', 'position'],
        "dense_cols": ['item_expo_1d', 'item_expo_7d', 'item_expo_14d', 'item_expo_30d', 'item_clk_1d',
                       'item_clk_7d', 'item_clk_14d', 'item_clk_30d', 'use_duration'],
        "label_col": ['click', 'scroll'],
    }
    # Prepare the data; the sample file only has 100 rows, so no train/valid/test split is made
    train_df = df
    valid_df = df
    test_df = df
    # Declare the device
    device = torch.device('cpu')
    # Get the dataloaders
    train_loader, valid_loader, test_loader, enc_dict = get_dataloader(train_df, valid_df, test_df, schema)
    # Declare the model; supported multi-task models: AITM, ShareBottom, ESSM, MMOE, OMOE, MLMMOE
    model = AITM(enc_dict=enc_dict)
    # Declare the trainer
    trainer = RankTrainer(num_task=2)
    # Train the model
    trainer.fit(model, train_loader, valid_loader, epoch=5, lr=1e-3, device=device)
    # Save the model weights
    trainer.save_model(model, './model_ckpt')
    # Evaluate the model
    test_metric = trainer.evaluate_model(model, test_loader, device=device)
    print('Test metric:{}'.format(test_metric))
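In the multi-task demo, `num_task=2` corresponds to the two columns in `schema['label_col']` (`click` and `scroll`). A minimal sketch of swapping in another imported multi-task model under the same setup (MMOE is only an illustrative choice):

```python
# Illustrative swap: MMOE is imported above and, in this demo, takes the same enc_dict.
model = MMOE(enc_dict=enc_dict)
trainer = RankTrainer(num_task=2)  # one task per label in schema['label_col']
trainer.fit(model, train_loader, valid_loader, epoch=5, lr=1e-3, device=device)
```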
import torch
from rec_pangu.dataset import get_dataloader
from rec_pangu.models.sequence import ComirecSA, ComirecDR, MIND, CMI, Re4, NARM, YotubeDNN, SRGNN
from rec_pangu.trainer import SequenceTrainer
from rec_pangu.utils import set_device
import pandas as pd

if __name__ == '__main__':
    # Declare the data schema
    schema = {
        'user_col': 'user_id',
        'item_col': 'item_id',
        'cate_cols': ['genre'],
        'max_length': 20,
        'time_col': 'timestamp',
        'task_type': 'sequence'
    }
    # Model config
    config = {
        'embedding_dim': 64,
        'lr': 0.001,
        'K': 1,
        'device': -1,
    }
    config['device'] = set_device(config['device'])
    config.update(schema)
    # Sample data
    train_df = pd.read_csv('./sample_data/sample_train.csv')
    valid_df = pd.read_csv('./sample_data/sample_valid.csv')
    test_df = pd.read_csv('./sample_data/sample_test.csv')
    # Declare the device
    device = torch.device('cpu')
    # Get the dataloaders
    train_loader, valid_loader, test_loader, enc_dict = get_dataloader(train_df, valid_df, test_df, schema, batch_size=50)
    # Declare the model; supported sequential recall models: ComirecSA, ComirecDR, MIND, CMI, Re4, NARM, YotubeDNN, SRGNN
    model = ComirecSA(enc_dict=enc_dict, config=config)
    # Declare the trainer
    trainer = SequenceTrainer(model_ckpt_dir='./model_ckpt')
    # Train the model, with early stopping monitored on recall@20
    trainer.fit(model, train_loader, valid_loader, epoch=500, lr=1e-3, device=device, log_rounds=10,
                use_earlystoping=True, max_patience=5, monitor_metric='recall@20')
    # Save the model weights and enc_dict
    trainer.save_all(model, enc_dict, './model_ckpt')
    # Evaluate the model
    test_metric = trainer.evaluate_model(model, test_loader, device=device)
    print('Test metric:{}'.format(test_metric))
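The other imported sequential recall models follow the same pattern in this demo; for the multi-interest models (ComirecSA, ComirecDR, MIND), the `K` entry in `config` presumably controls the number of interest vectors per user. A minimal, hypothetical sketch of swapping in MIND, assuming it accepts the same `enc_dict`/`config` arguments as ComirecSA:

```python
# Hypothetical swap: assumes MIND shares ComirecSA's constructor signature in this demo.
model = MIND(enc_dict=enc_dict, config=config)
trainer = SequenceTrainer(model_ckpt_dir='./model_ckpt')
trainer.fit(model, train_loader, valid_loader, epoch=500, lr=1e-3, device=device, log_rounds=10,
            use_earlystoping=True, max_patience=5, monitor_metric='recall@20')
```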
TODO