diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2594cd4..2926b4b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
     rev: v2.3.0
     hooks:
       - id: codespell
-        args: ["--skip", "*.json"]
+        args: ["--skip", "*.json", "-L", "TBE"]
   - repo: https://github.com/executablebooks/mdformat
     rev: 0.7.18
     hooks:
diff --git a/docs/source/faq.md b/docs/source/faq.md
index 9341ced..46c1381 100644
--- a/docs/source/faq.md
+++ b/docs/source/faq.md
@@ -155,3 +155,17 @@ Tag: TUNNEL Endpoint: http://dt.cn-shanghai-vpc.maxcompute.aliyun-inc.com
 **原因:** 离线预测输出表已存在,并且schema不正确
 
 **解决方法:** 删除已存在的输出表或修改输出表名
+
+______________________________________________________________________
+
+**Q11: fbgemm的embedding lookup op的EmbeddingBoundsCheck error**
+
+**报错信息:** fbgemm的embedding lookup op报错:
+
+```
+EmbeddingBoundsCheck (VBE false): (at least one) Out of bounds access for batch: 12, table: 2, bag element: 0, idx: 3, num_rows: 3, indices_start: 1815, indices_end: 1816, T: 244, B: 67, b_t: 1955. Setting idx to zero.
+```
+
+**原因:** 第2个embedding table只有3行embedding(num_rows: 3),但是传入的id是3(idx: 3),越界了
+
+**解决方法:** 只通过报错日志很难直接确定第2个embedding table是关联哪一个特征。需设置环境变量`LOG_LEVEL=INFO`或`LOG_LEVEL=DEBUG`重新执行训练命令,可以看到训练日志中包含如下内容`[TBE=xxx] Contents: ['id_3_emb', 'lookup_2_emb', 'lookup_3_emb', ...`,就可以得知`lookup_3`这个特征的输入值存在问题需要进一步检查输入数据。
diff --git a/tzrec/__init__.py b/tzrec/__init__.py
index 5101b0f..f11fc22 100644
--- a/tzrec/__init__.py
+++ b/tzrec/__init__.py
@@ -33,5 +33,16 @@
 
 from tzrec.utils import load_class as _load_class  # NOQA
 
-_logging.basicConfig(format="[%(asctime)s][%(levelname)s] %(message)s")
+# LOG_LEVEL may name any standard logging level, in any letter case. Unset,
+# empty, or unrecognized values keep the logging default instead of raising
+# at import time.
+_log_level = getattr(
+    _logging, _os.getenv("LOG_LEVEL", "").strip().upper(), None
+)
+if not isinstance(_log_level, int):
+    _log_level = None
+
+_logging.basicConfig(
+    format="[%(asctime)s][%(levelname)s] %(message)s", level=_log_level
+)
 _load_class.auto_import()