From 10083dcd05219b2ee8e1163d79407a8e2ba98414 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Sun, 22 Dec 2024 05:24:50 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 24111 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 24506 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..a8cf9ce1 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-12-19T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2412.15210v1","updated":"2024-12-19T18:59:46Z","published":"2024-12-19T18:59:46Z","title":"Tokenisation is NP-Complete","summary":" In this work, we prove the NP-completeness of two variants of tokenisation,\ndefined as the problem of compressing a dataset to at most $\\delta$ symbols by\neither finding a vocabulary directly (direct tokenisation), or selecting a\nsequence of merge operations (bottom-up tokenisation).\n","authors":["Philip Whittington","Gregor Bachmann","Tiago Pimentel"],"pdf_url":"https://arxiv.org/pdf/2412.15210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15204v1","updated":"2024-12-19T18:59:17Z","published":"2024-12-19T18:59:17Z","title":"LongBench v2: Towards Deeper Understanding and Reasoning on Realistic\n Long-context Multitasks","summary":" This paper introduces LongBench v2, a benchmark designed to assess the\nability of LLMs to handle long-context problems requiring deep understanding\nand reasoning across real-world multitasks. LongBench v2 consists of 503\nchallenging multiple-choice questions, with contexts ranging from 8k to 2M\nwords, across six major task categories: single-document QA, multi-document QA,\nlong in-context learning, long-dialogue history understanding, code repository\nunderstanding, and long structured data understanding. To ensure the breadth\nand the practicality, we collect data from nearly 100 highly educated\nindividuals with diverse professional backgrounds. We employ both automated and\nmanual review processes to maintain high quality and difficulty, resulting in\nhuman experts achieving only 53.7% accuracy under a 15-minute time constraint.\nOur evaluation reveals that the best-performing model, when directly answers\nthe questions, achieves only 50.1% accuracy. In contrast, the o1-preview model,\nwhich includes longer reasoning, achieves 57.7%, surpassing the human baseline\nby 4%. These results highlight the importance of enhanced reasoning ability and\nscaling inference-time compute to tackle the long-context challenges in\nLongBench v2. 
The project is available at https://longbench2.github.io.\n","authors":["Yushi Bai","Shangqing Tu","Jiajie Zhang","Hao Peng","Xiaozhi Wang","Xin Lv","Shulin Cao","Jiazheng Xu","Lei Hou","Yuxiao Dong","Jie Tang","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2412.15204v1.pdf","comment":"25 pages, 13 figures"},{"id":"http://arxiv.org/abs/2412.15194v1","updated":"2024-12-19T18:58:04Z","published":"2024-12-19T18:58:04Z","title":"MMLU-CF: A Contamination-free Multi-task Language Understanding\n Benchmark","summary":" Multiple-choice question (MCQ) datasets like Massive Multitask Language\nUnderstanding (MMLU) are widely used to evaluate the commonsense,\nunderstanding, and problem-solving abilities of large language models (LLMs).\nHowever, the open-source nature of these benchmarks and the broad sources of\ntraining data for LLMs have inevitably led to benchmark contamination,\nresulting in unreliable evaluation results. To alleviate this issue, we propose\na contamination-free and more challenging MCQ benchmark called MMLU-CF. This\nbenchmark reassesses LLMs' understanding of world knowledge by averting both\nunintentional and malicious data leakage. To avoid unintentional data leakage,\nwe source data from a broader domain and design three decontamination rules. To\nprevent malicious data leakage, we divide the benchmark into validation and\ntest sets with similar difficulty and subject distributions. The test set\nremains closed-source to ensure reliable results, while the validation set is\npublicly available to promote transparency and facilitate independent\nverification. Our evaluation of mainstream LLMs reveals that the powerful\nGPT-4o achieves merely a 5-shot score of 73.4% and a 0-shot score of 71.9% on\nthe test set, which indicates the effectiveness of our approach in creating a\nmore rigorous and contamination-free evaluation standard. The GitHub repository\nis available at https://github.com/microsoft/MMLU-CF and the dataset refers to\nhttps://huggingface.co/datasets/microsoft/MMLU-CF.\n","authors":["Qihao Zhao","Yangyu Huang","Tengchao Lv","Lei Cui","Qinzheng Sun","Shaoguang Mao","Xin Zhang","Ying Xin","Qiufeng Yin","Scarlett Li","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2412.15194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15189v1","updated":"2024-12-19T18:57:11Z","published":"2024-12-19T18:57:11Z","title":"Face the Facts! Evaluating RAG-based Fact-checking Pipelines in\n Realistic Settings","summary":" Natural Language Processing and Generation systems have recently shown the\npotential to complement and streamline the costly and time-consuming job of\nprofessional fact-checkers. In this work, we lift several constraints of\ncurrent state-of-the-art pipelines for automated fact-checking based on the\nRetrieval-Augmented Generation (RAG) paradigm. 
Our goal is to benchmark, under\nmore realistic scenarios, RAG-based methods for the generation of verdicts -\ni.e., short texts discussing the veracity of a claim - evaluating them on\nstylistically complex claims and heterogeneous, yet reliable, knowledge bases.\nOur findings show a complex landscape, where, for example, LLM-based retrievers\noutperform other retrieval techniques, though they still struggle with\nheterogeneous knowledge bases; larger models excel in verdict faithfulness,\nwhile smaller models provide better context adherence, with human evaluations\nfavouring zero-shot and one-shot approaches for informativeness, and fine-tuned\nmodels for emotional alignment.\n","authors":["Daniel Russo","Stefano Menini","Jacopo Staiano","Marco Guerini"],"pdf_url":"https://arxiv.org/pdf/2412.15189v1.pdf","comment":"Code and data at https://github.com/drusso98/face-the-facts"},{"id":"http://arxiv.org/abs/2412.15188v1","updated":"2024-12-19T18:56:24Z","published":"2024-12-19T18:56:24Z","title":"LlamaFusion: Adapting Pretrained Language Models for Multimodal\n Generation","summary":" We present LlamaFusion, a framework for empowering pretrained text-only large\nlanguage models (LLMs) with multimodal generative capabilities, enabling them\nto understand and generate both text and images in arbitrary sequences.\nLlamaFusion leverages existing Llama-3's weights for processing texts\nautoregressively while introducing additional and parallel transformer modules\nfor processing images with diffusion. During training, the data from each\nmodality is routed to its dedicated modules: modality-specific feedforward\nlayers, query-key-value projections, and normalization layers process each\nmodality independently, while the shared self-attention layers allow\ninteractions across text and image features. By freezing the text-specific\nmodules and only training the image-specific modules, LlamaFusion preserves the\nlanguage capabilities of text-only LLMs while developing strong visual\nunderstanding and generation abilities. Compared to methods that pretrain\nmultimodal generative models from scratch, our experiments demonstrate that,\nLlamaFusion improves image understanding by 20% and image generation by 3.6%\nusing only 50% of the FLOPs while maintaining Llama-3's language capabilities.\nWe also demonstrate that this framework can adapt existing vision-language\nmodels with multimodal generation ability. Overall, this framework not only\nleverages existing computational investments in text-only LLMs but also enables\nthe parallel development of language and vision capabilities, presenting a\npromising direction for efficient multimodal model development.\n","authors":["Weijia Shi","Xiaochuang Han","Chunting Zhou","Weixin Liang","Xi Victoria Lin","Luke Zettlemoyer","Lili Yu"],"pdf_url":"https://arxiv.org/pdf/2412.15188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15177v1","updated":"2024-12-19T18:51:30Z","published":"2024-12-19T18:51:30Z","title":"Critical-Questions-of-Thought: Steering LLM reasoning with Argumentative\n Querying","summary":" Studies have underscored how, regardless of the recent breakthrough and swift\nadvances in AI research, even state-of-the-art Large Language models (LLMs)\ncontinue to struggle when performing logical and mathematical reasoning. 
The\nresults seem to suggest that LLMs still work as (highly advanced) data pattern\nidentifiers, scoring poorly when attempting to generalise and solve reasoning\nproblems the models have never previously seen or that are not close to samples\npresented in their training data. To address this compelling concern, this\npaper makes use of the notion of critical questions from the literature on\nargumentation theory, focusing in particular on Toulmin's model of\nargumentation. We show that employing these critical questions can improve the\nreasoning capabilities of LLMs. By probing the rationale behind the models'\nreasoning process, the LLM can assess whether some logical mistake is occurring\nand correct it before providing the final reply to the user prompt. The\nunderlying idea is drawn from the gold standard of any valid argumentative\nprocedure: the conclusion is valid if it is entailed by accepted premises. Or,\nto paraphrase such Aristotelian principle in a real-world approximation,\ncharacterised by incomplete information and presumptive logic, the conclusion\nis valid if not proved otherwise. This approach successfully steers the models'\noutput through a reasoning pipeline, resulting in better performance against\nthe baseline and its Chain-of-Thought (CoT) implementation. To this end, an\nextensive evaluation of the proposed approach on the MT-Bench Reasoning and\nMath tasks across a range of LLMs is provided.\n","authors":["Federico Castagna","Isabel Sassoon","Simon Parsons"],"pdf_url":"https://arxiv.org/pdf/2412.15177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05199v2","updated":"2024-12-19T18:46:21Z","published":"2024-11-07T21:51:07Z","title":"CodeLutra: Boosting LLM Code Generation via Preference-Guided Refinement","summary":" Large Language Models (LLMs) have revolutionized code generation but require\nsignificant resources and often over-generalize, limiting their task-specific\nefficiency. Fine-tuning smaller, open-source LLMs provides a cost-effective\nalternative. However, standard supervised approaches rely only on correct\nexamples, missing valuable insights from failures. We introduce CodeLutra, a\nframework that leverages both correct and incorrect code attempts. Instead of\nusing only correct solutions, CodeLutra applies iterative preference-based\nrefinement, comparing successful and failed outputs to better approximate\ndesired results. This approach narrows the performance gap with\nstate-of-the-art larger models without requiring massive datasets or auxiliary\nmodels. For instance, on a challenging data science coding task, using only 500\nsamples improved Llama-3-8B's accuracy from 28.2% to 48.6%, approaching GPT-4's\nlevel. By learning from both successes and mistakes, CodeLutra provides a\nscalable and efficient path to high-quality code generation, making smaller\nopen-source models more competitive with leading closed-source alternatives.\n","authors":["Leitian Tao","Xiang Chen","Tong Yu","Tung Mai","Ryan Rossi","Yixuan Li","Saayan Mitra"],"pdf_url":"https://arxiv.org/pdf/2411.05199v2.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2412.15156v1","updated":"2024-12-19T18:32:21Z","published":"2024-12-19T18:32:21Z","title":"Prompt-A-Video: Prompt Your Video Diffusion Model via Preference-Aligned\n LLM","summary":" Text-to-video models have made remarkable advancements through optimization\non high-quality text-video pairs, where the textual prompts play a pivotal role\nin determining quality of output videos. 
However, achieving the desired output\noften entails multiple revisions and iterative inference to refine\nuser-provided prompts. Current automatic methods for refining prompts encounter\nchallenges such as Modality-Inconsistency, Cost-Discrepancy, and Model-Unaware\nwhen applied to text-to-video diffusion models. To address these problem, we\nintroduce an LLM-based prompt adaptation framework, termed as Prompt-A-Video,\nwhich excels in crafting Video-Centric, Labor-Free and Preference-Aligned\nprompts tailored to specific video diffusion model. Our approach involves a\nmeticulously crafted two-stage optimization and alignment system. Initially, we\nconduct a reward-guided prompt evolution pipeline to automatically create\noptimal prompts pool and leverage them for supervised fine-tuning (SFT) of the\nLLM. Then multi-dimensional rewards are employed to generate pairwise data for\nthe SFT model, followed by the direct preference optimization (DPO) algorithm\nto further facilitate preference alignment. Through extensive experimentation\nand comparative analyses, we validate the effectiveness of Prompt-A-Video\nacross diverse generation models, highlighting its potential to push the\nboundaries of video generation.\n","authors":["Yatai Ji","Jiacheng Zhang","Jie Wu","Shilong Zhang","Shoufa Chen","Chongjian GE","Peize Sun","Weifeng Chen","Wenqi Shao","Xuefeng Xiao","Weilin Huang","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2412.15156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15151v1","updated":"2024-12-19T18:28:41Z","published":"2024-12-19T18:28:41Z","title":"Language Models as Continuous Self-Evolving Data Engineers","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities on\nvarious tasks, while the further evolvement is limited to the lack of\nhigh-quality training data. In addition, traditional training approaches rely\ntoo much on expert-labeled data, setting an upper limit on the performance of\nLLMs. To address this issue, we propose a novel paradigm that enables LLMs to\ntrain itself by autonomously generating, cleaning, reviewing, and annotating\ndata with preference information, named LANCE. Our approach demonstrates that\nLLMs can serve as continuous self-evolving data engineers, significantly\nreducing the time and cost of the post-training data construction process.\nThrough iterative fine-tuning on different variants of the Qwen2, we validate\nthe effectiveness of LANCE across various tasks, showing that it can\ncontinuously improve model performance and maintain high-quality data\ngeneration. Across eight benchmark dimensions, LANCE resulted in an average\nscore enhancement of 3.36 for Qwen2-7B and 2.70 for Qwen2-7B-Instruct. 
This\ntraining paradigm with autonomous data construction not only reduces the\nreliance on human experts or external models but also ensures that the data\naligns with human values and preferences, paving the way for the development of\nfuture superintelligent systems that can exceed human capabilities.\n","authors":["Peidong Wang","Ming Wang","Zhiming Ma","Xiaocui Yang","Shi Feng","Daling Wang","Yifei Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.15151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15127v1","updated":"2024-12-19T18:08:04Z","published":"2024-12-19T18:08:04Z","title":"Adaptive Pruning for Large Language Models with Structural Importance\n Awareness","summary":" The recent advancements in large language models (LLMs) have significantly\nimproved language understanding and generation capabilities. However, it is\ndifficult to deploy LLMs on resource-constrained edge devices due to their high\ncomputational and storage resource demands. To address this issue, we propose a\nnovel LLM model pruning method, namely structurally-aware adaptive pruning\n(SAAP), to significantly reduce the computational and memory costs while\nmaintaining model performance. We first define an adaptive importance fusion\nmetric to evaluate the importance of all coupled structures in LLMs by\nconsidering their homoscedastic uncertainty. Then, we rank the importance of\nall modules to determine the specific layers that should be pruned to meet\nparticular performance requirements. Furthermore, we develop a new group\nfine-tuning strategy to improve the inference efficiency of LLMs. Finally, we\nevaluate the proposed SAAP method on multiple LLMs across two common tasks,\ni.e., zero-shot classification and text generation. Experimental results show\nthat our SAAP method outperforms several state-of-the-art baseline methods,\nachieving 2.17%, 2.37%, and 2.39% accuracy gains on LLaMA-7B, Vicuna-7B, and\nLLaMA-13B. Additionally, SAAP improves the token generation speed by 5%,\nshowcasing its practical advantages in resource-constrained scenarios.\n","authors":["Haotian Zheng","Jinke Ren","Yushan Sun","Ruichen Zhang","Wenbo Zhang","Zhen Li","Dusit Niyato","Shuguang Cui","Yatong Han"],"pdf_url":"https://arxiv.org/pdf/2412.15127v1.pdf","comment":"12 pages, 6 figures, 12 tables"},{"id":"http://arxiv.org/abs/2412.15118v1","updated":"2024-12-19T17:59:42Z","published":"2024-12-19T17:59:42Z","title":"Outcome-Refining Process Supervision for Code Generation","summary":" Large Language Models have demonstrated remarkable capabilities in code\ngeneration, yet they often struggle with complex programming tasks that require\ndeep algorithmic reasoning. While process supervision through learned reward\nmodels shows promise in guiding reasoning steps, it requires expensive training\ndata and suffers from unreliable evaluation. We propose Outcome-Refining\nProcess Supervision, a novel paradigm that treats outcome refinement itself as\nthe process to be supervised. Our framework leverages concrete execution\nsignals to ground the supervision of reasoning steps, while using\ntree-structured exploration to maintain multiple solution trajectories\nsimultaneously. Experiments demonstrate that our approach enables even smaller\nmodels to achieve high success accuracy and performance metrics on competitive\nprogramming tasks, creates more reliable verification than traditional reward\nmodels without requiring training PRMs. 
Our approach achieves significant\nimprovements across 5 models and 3 datasets: an average of 26.9% increase in\ncorrectness and 42.2% in efficiency. The results suggest that providing\nstructured reasoning space with concrete verification signals is crucial for\nsolving complex programming tasks. We open-source all our code and data at:\nhttps://github.com/zhuohaoyu/ORPS\n","authors":["Zhuohao Yu","Weizheng Gu","Yidong Wang","Zhengran Zeng","Jindong Wang","Wei Ye","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.15118v1.pdf","comment":"18 pages, 5 figures, Code: https://github.com/zhuohaoyu/ORPS"},{"id":"http://arxiv.org/abs/2409.18472v2","updated":"2024-12-19T17:57:43Z","published":"2024-09-27T06:18:55Z","title":"URIEL+: Enhancing Linguistic Inclusion and Usability in a Typological\n and Multilingual Knowledge Base","summary":" URIEL is a knowledge base offering geographical, phylogenetic, and\ntypological vector representations for 7970 languages. It includes distance\nmeasures between these vectors for 4005 languages, which are accessible via the\nlang2vec tool. Despite being frequently cited, URIEL is limited in terms of\nlinguistic inclusion and overall usability. To tackle these challenges, we\nintroduce URIEL+, an enhanced version of URIEL and lang2vec that addresses\nthese limitations. In addition to expanding typological feature coverage for\n2898 languages, URIEL+ improves the user experience with robust, customizable\ndistance calculations to better suit the needs of users. These upgrades also\noffer competitive performance on downstream tasks and provide distances that\nbetter align with linguistic distance studies.\n","authors":["Aditya Khan","Mason Shipton","David Anugraha","Kaiyao Duan","Phuong H. Hoang","Eric Khiu","A. Seza Doğruöz","En-Shiun Annie Lee"],"pdf_url":"https://arxiv.org/pdf/2409.18472v2.pdf","comment":"Accepted to COLING 2025"},{"id":"http://arxiv.org/abs/2412.15115v1","updated":"2024-12-19T17:56:09Z","published":"2024-12-19T17:56:09Z","title":"Qwen2.5 Technical Report","summary":" In this report, we introduce Qwen2.5, a comprehensive series of large\nlanguage models (LLMs) designed to meet diverse needs. Compared to previous\niterations, Qwen 2.5 has been significantly improved during both the\npre-training and post-training stages. In terms of pre-training, we have scaled\nthe high-quality pre-training datasets from the previous 7 trillion tokens to\n18 trillion tokens. This provides a strong foundation for common sense, expert\nknowledge, and reasoning capabilities. In terms of post-training, we implement\nintricate supervised finetuning with over 1 million samples, as well as\nmultistage reinforcement learning. Post-training techniques enhance human\npreference, and notably improve long text generation, structural data analysis,\nand instruction following. To handle diverse and varied use cases effectively,\nwe present Qwen2.5 LLM series in rich sizes. Open-weight offerings include base\nand instruction-tuned models, with quantized versions available. In addition,\nfor hosted solutions, the proprietary models currently include two\nmixture-of-experts (MoE) variants: Qwen2.5-Turbo and Qwen2.5-Plus, both\navailable from Alibaba Cloud Model Studio. Qwen2.5 has demonstrated top-tier\nperformance on a wide range of benchmarks evaluating language understanding,\nreasoning, mathematics, coding, human preference alignment, etc. 
Specifically,\nthe open-weight flagship Qwen2.5-72B-Instruct outperforms a number of open and\nproprietary models and demonstrates competitive performance to the\nstate-of-the-art open-weight model, Llama-3-405B-Instruct, which is around 5\ntimes larger. Qwen2.5-Turbo and Qwen2.5-Plus offer superior cost-effectiveness\nwhile performing competitively against GPT-4o-mini and GPT-4o respectively.\nAdditionally, as the foundation, Qwen2.5 models have been instrumental in\ntraining specialized models such as Qwen2.5-Math, Qwen2.5-Coder, QwQ, and\nmultimodal models.\n","authors":[" Qwen"," :","An Yang","Baosong Yang","Beichen Zhang","Binyuan Hui","Bo Zheng","Bowen Yu","Chengyuan Li","Dayiheng Liu","Fei Huang","Haoran Wei","Huan Lin","Jian Yang","Jianhong Tu","Jianwei Zhang","Jianxin Yang","Jiaxi Yang","Jingren Zhou","Junyang Lin","Kai Dang","Keming Lu","Keqin Bao","Kexin Yang","Le Yu","Mei Li","Mingfeng Xue","Pei Zhang","Qin Zhu","Rui Men","Runji Lin","Tianhao Li","Tingyu Xia","Xingzhang Ren","Xuancheng Ren","Yang Fan","Yang Su","Yichang Zhang","Yu Wan","Yuqiong Liu","Zeyu Cui","Zhenru Zhang","Zihan Qiu"],"pdf_url":"https://arxiv.org/pdf/2412.15115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15113v1","updated":"2024-12-19T17:55:42Z","published":"2024-12-19T17:55:42Z","title":"Associative memory inspires improvements for in-context learning using a\n novel attention residual stream architecture","summary":" Large language models (LLMs) demonstrate an impressive ability to utilise\ninformation within the context of their input sequences to appropriately\nrespond to data unseen by the LLM during its training procedure. This ability\nis known as in-context learning (ICL). Humans and non-human animals demonstrate\nsimilar abilities, however their neural architectures differ substantially from\nLLMs. Despite this, a critical component within LLMs, the attention mechanism,\nresembles modern associative memory models, widely used in and influenced by\nthe computational neuroscience community to model biological memory systems.\nUsing this connection, we introduce an associative memory model capable of\nperforming ICL. We use this as inspiration for a novel residual stream\narchitecture which allows information to directly flow between attention heads.\nWe test this architecture during training within a two-layer Transformer and\nshow its ICL abilities manifest more quickly than without this modification. We\nthen apply our architecture in small language models with 8 million parameters,\nfocusing on attention head values, with results also indicating improved ICL\nperformance at this larger and more naturalistic scale.\n","authors":["Thomas F Burns","Tomoki Fukai","Christopher J Earls"],"pdf_url":"https://arxiv.org/pdf/2412.15113v1.pdf","comment":"18 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2412.04619v3","updated":"2024-12-19T17:51:34Z","published":"2024-12-05T21:12:37Z","title":"Sometimes I am a Tree: Data Drives Unstable Hierarchical Generalization","summary":" Language models (LMs), like other neural networks, often favor shortcut\nheuristics based on surface-level patterns. Although LMs behave like n-gram\nmodels early in training, they must eventually learn hierarchical syntactic\nrepresentations to correctly apply grammatical rules out-of-distribution (OOD).\nIn this work, we use case studies of English grammar to explore how complex,\ndiverse training data drives models to generalize OOD. 
We construct a framework\nthat unifies our understanding of random variation with training dynamics, rule\nselection with memorization, and data diversity with complexity. We show that\nthese factors are nuanced, and that intermediate levels of diversity and\ncomplexity lead to inconsistent behavior across random seeds and to unstable\ntraining dynamics. Our findings emphasize the critical role of training data in\nshaping generalization patterns and illuminate how competing model strategies\nlead to inconsistent generalization outcomes across random seeds. Code is\navailable at https://github.com/sunnytqin/concept_comp.git.\n","authors":["Tian Qin","Naomi Saphra","David Alvarez-Melis"],"pdf_url":"https://arxiv.org/pdf/2412.04619v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15101v1","updated":"2024-12-19T17:48:23Z","published":"2024-12-19T17:48:23Z","title":"Review-Then-Refine: A Dynamic Framework for Multi-Hop Question Answering\n with Temporal Adaptability","summary":" Retrieve-augmented generation (RAG) frameworks have emerged as a promising\nsolution to multi-hop question answering(QA) tasks since it enables large\nlanguage models (LLMs) to incorporate external knowledge and mitigate their\ninherent knowledge deficiencies. Despite this progress, existing RAG\nframeworks, which usually follows the retrieve-then-read paradigm, often\nstruggle with multi-hop QA with temporal information since it has difficulty\nretrieving and synthesizing accurate time-related information. To address the\nchallenge, this paper proposes a novel framework called review-then-refine,\nwhich aims to enhance LLM performance in multi-hop QA scenarios with temporal\ninformation. Our approach begins with a review phase, where decomposed\nsub-queries are dynamically rewritten with temporal information, allowing for\nsubsequent adaptive retrieval and reasoning process. In addition, we implement\nadaptive retrieval mechanism to minimize unnecessary retrievals, thus reducing\nthe potential for hallucinations. In the subsequent refine phase, the LLM\nsynthesizes the retrieved information from each sub-query along with its\ninternal knowledge to formulate a coherent answer. Extensive experimental\nresults across multiple datasets demonstrate the effectiveness of our proposed\nframework, highlighting its potential to significantly improve multi-hop QA\ncapabilities in LLMs.\n","authors":["Xiangsen Chen","Xuming Hu","Nan Tang"],"pdf_url":"https://arxiv.org/pdf/2412.15101v1.pdf","comment":"20 pages, 2 figures"},{"id":"http://arxiv.org/abs/2412.15098v1","updated":"2024-12-19T17:46:13Z","published":"2024-12-19T17:46:13Z","title":"A Cross-Domain Study of the Use of Persuasion Techniques in Online\n Disinformation","summary":" Disinformation, irrespective of domain or language, aims to deceive or\nmanipulate public opinion, typically through employing advanced persuasion\ntechniques. Qualitative and quantitative research on the weaponisation of\npersuasion techniques in disinformation has been mostly topic-specific (e.g.,\nCOVID-19) with limited cross-domain studies, resulting in a lack of\ncomprehensive understanding of these strategies. This study employs a\nstate-of-the-art persuasion technique classifier to conduct a large-scale,\nmulti-domain analysis of the role of 16 persuasion techniques in disinformation\nnarratives. It shows how different persuasion techniques are employed\ndisproportionately in different disinformation domains. 
We also include a\ndetailed case study on climate change disinformation, highlighting how\nlinguistic, psychological, and cultural factors shape the adaptation of\npersuasion strategies to fit unique thematic contexts.\n","authors":["João A. Leite","Olesya Razuvayevskaya","Carolina Scarton","Kalina Bontcheva"],"pdf_url":"https://arxiv.org/pdf/2412.15098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.13702v2","updated":"2024-12-19T17:36:38Z","published":"2024-12-18T10:45:24Z","title":"Typhoon 2: A Family of Open Text and Multimodal Thai Large Language\n Models","summary":" This paper introduces Typhoon 2, a series of text and multimodal large\nlanguage models optimized for the Thai language. The series includes models for\ntext, vision, and audio. Typhoon2-Text builds on state-of-the-art open models,\nsuch as Llama 3 and Qwen2, and we perform continual pre-training on a mixture\nof English and Thai data. We employ post-training techniques to enhance Thai\nlanguage performance while preserving the base models' original capabilities.\nWe release text models across a range of sizes, from 1 to 70 billion\nparameters, available in both base and instruction-tuned variants. To guardrail\ntext generation, we release Typhoon2-Safety, a classifier enhanced for Thai\ncultures and language. Typhoon2-Vision improves Thai document understanding\nwhile retaining general visual capabilities, such as image captioning.\nTyphoon2-Audio introduces an end-to-end speech-to-speech model architecture\ncapable of processing audio, speech, and text inputs and generating both text\nand speech outputs.\n","authors":["Kunat Pipatanakul","Potsawee Manakul","Natapong Nitarach","Warit Sirichotedumrong","Surapon Nonesung","Teetouch Jaknamon","Parinthapat Pengpun","Pittawat Taveekitworachai","Adisai Na-Thalang","Sittipong Sripaisarnmongkol","Krisanapong Jirayoot","Kasima Tharnpipitchai"],"pdf_url":"https://arxiv.org/pdf/2412.13702v2.pdf","comment":"technical report, 55 pages"},{"id":"http://arxiv.org/abs/2412.15084v1","updated":"2024-12-19T17:29:44Z","published":"2024-12-19T17:29:44Z","title":"AceMath: Advancing Frontier Math Reasoning with Post-Training and Reward\n Modeling","summary":" In this paper, we introduce AceMath, a suite of frontier math models that\nexcel in solving complex math problems, along with highly effective reward\nmodels capable of evaluating generated solutions and reliably identifying the\ncorrect ones. To develop the instruction-tuned math models, we propose a\nsupervised fine-tuning (SFT) process that first achieves competitive\nperformance across general domains, followed by targeted fine-tuning for the\nmath domain using a carefully curated set of prompts and synthetically\ngenerated responses. The resulting model, AceMath-72B-Instruct greatly\noutperforms Qwen2.5-Math-72B-Instruct, GPT-4o and Claude-3.5 Sonnet. To develop\nmath-specialized reward model, we first construct AceMath-RewardBench, a\ncomprehensive and robust benchmark for evaluating math reward models across\ndiverse problems and difficulty levels. After that, we present a systematic\napproach to build our math reward models. The resulting model, AceMath-72B-RM,\nconsistently outperforms state-of-the-art reward models. Furthermore, when\ncombining AceMath-72B-Instruct with AceMath-72B-RM, we achieve the highest\naverage rm@8 score across the math reasoning benchmarks. 
We will release model\nweights, training data, and evaluation benchmarks at:\nhttps://research.nvidia.com/labs/adlr/acemath\n","authors":["Zihan Liu","Yang Chen","Mohammad Shoeybi","Bryan Catanzaro","Wei Ping"],"pdf_url":"https://arxiv.org/pdf/2412.15084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15077v1","updated":"2024-12-19T17:26:07Z","published":"2024-12-19T17:26:07Z","title":"Till the Layers Collapse: Compressing a Deep Neural Network through the\n Lenses of Batch Normalization Layers","summary":" Today, deep neural networks are widely used since they can handle a variety\nof complex tasks. Their generality makes them very powerful tools in modern\ntechnology. However, deep neural networks are often overparameterized. The\nusage of these large models consumes a lot of computation resources. In this\npaper, we introduce a method called \\textbf{T}ill the \\textbf{L}ayers\n\\textbf{C}ollapse (TLC), which compresses deep neural networks through the\nlenses of batch normalization layers. By reducing the depth of these networks,\nour method decreases deep neural networks' computational requirements and\noverall latency. We validate our method on popular models such as Swin-T,\nMobileNet-V2, and RoBERTa, across both image classification and natural\nlanguage processing (NLP) tasks.\n","authors":["Zhu Liao","Nour Hezbri","Victor Quétu","Van-Tam Nguyen","Enzo Tartaglione"],"pdf_url":"https://arxiv.org/pdf/2412.15077v1.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2412.15060v1","updated":"2024-12-19T17:08:11Z","published":"2024-12-19T17:08:11Z","title":"ConfliBERT: A Language Model for Political Conflict","summary":" Conflict scholars have used rule-based approaches to extract information\nabout political violence from news reports and texts. Recent Natural Language\nProcessing developments move beyond rigid rule-based approaches. We review our\nrecent ConfliBERT language model (Hu et al. 2022) to process political and\nviolence related texts. The model can be used to extract actor and action\nclassifications from texts about political conflict. When fine-tuned, results\nshow that ConfliBERT has superior performance in accuracy, precision and recall\nover other large language models (LLM) like Google's Gemma 2 (9B), Meta's Llama\n3.1 (7B), and Alibaba's Qwen 2.5 (14B) within its relevant domains. It is also\nhundreds of times faster than these more generalist LLMs. These results are\nillustrated using texts from the BBC, re3d, and the Global Terrorism Dataset\n(GTD).\n","authors":["Patrick T. Brandt","Sultan Alsarra","Vito J. D`Orazio","Dagmar Heintze","Latifur Khan","Shreyas Meher","Javier Osorio","Marcus Sianan"],"pdf_url":"https://arxiv.org/pdf/2412.15060v1.pdf","comment":"30 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2412.15035v1","updated":"2024-12-19T16:46:54Z","published":"2024-12-19T16:46:54Z","title":"LLMs Lost in Translation: M-ALERT uncovers Cross-Linguistic Safety Gaps","summary":" Building safe Large Language Models (LLMs) across multiple languages is\nessential in ensuring both safe access and linguistic diversity. To this end,\nwe introduce M-ALERT, a multilingual benchmark that evaluates the safety of\nLLMs in five languages: English, French, German, Italian, and Spanish. M-ALERT\nincludes 15k high-quality prompts per language, totaling 75k, following the\ndetailed ALERT taxonomy. 
Our extensive experiments on 10 state-of-the-art LLMs\nhighlight the importance of language-specific safety analysis, revealing that\nmodels often exhibit significant inconsistencies in safety across languages and\ncategories. For instance, Llama3.2 shows high unsafety in the category\ncrime_tax for Italian but remains safe in other languages. Similar differences\ncan be observed across all models. In contrast, certain categories, such as\nsubstance_cannabis and crime_propaganda, consistently trigger unsafe responses\nacross models and languages. These findings underscore the need for robust\nmultilingual safety practices in LLMs to ensure safe and responsible usage\nacross diverse user communities.\n","authors":["Felix Friedrich","Simone Tedeschi","Patrick Schramowski","Manuel Brack","Roberto Navigli","Huu Nguyen","Bo Li","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2412.15035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14512v3","updated":"2024-12-19T16:37:00Z","published":"2024-08-25T04:32:45Z","title":"LLMs as Zero-shot Graph Learners: Alignment of GNN Representations with\n LLM Token Embeddings","summary":" Zero-shot graph machine learning, especially with graph neural networks\n(GNNs), has garnered significant interest due to the challenge of scarce\nlabeled data. While methods like self-supervised learning and graph prompt\nlearning have been extensively explored, they often rely on fine-tuning with\ntask-specific labels, limiting their effectiveness in zero-shot scenarios.\nInspired by the zero-shot capabilities of instruction-fine-tuned large language\nmodels (LLMs), we introduce a novel framework named Token Embedding-Aligned\nGraph Language Model (TEA-GLM) that leverages LLMs as cross-dataset and\ncross-task zero-shot learners for graph machine learning. Concretely, we\npretrain a GNN, aligning its representations with token embeddings of an LLM.\nWe then train a linear projector that transforms the GNN's representations into\na fixed number of graph token embeddings without tuning the LLM. A unified\ninstruction is designed for various graph tasks at different levels, such as\nnode classification (node-level) and link prediction (edge-level). These design\nchoices collectively enhance our method's effectiveness in zero-shot learning,\nsetting it apart from existing methods. Experiments show that our graph token\nembeddings help the LLM predictor achieve state-of-the-art performance on\nunseen datasets and tasks compared to other methods using LLMs as predictors.\n","authors":["Duo Wang","Yuan Zuo","Fengzhi Li","Junjie Wu"],"pdf_url":"https://arxiv.org/pdf/2408.14512v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10868v4","updated":"2024-12-19T16:22:07Z","published":"2024-06-16T09:36:32Z","title":"Identifying Query-Relevant Neurons in Large Language Models for\n Long-Form Texts","summary":" Large Language Models (LLMs) possess vast amounts of knowledge within their\nparameters, prompting research into methods for locating and editing this\nknowledge. Previous work has largely focused on locating entity-related (often\nsingle-token) facts in smaller models. However, several key questions remain\nunanswered: (1) How can we effectively locate query-relevant neurons in\ndecoder-only LLMs, such as Llama and Mistral? (2) How can we address the\nchallenge of long-form (or free-form) text generation? (3) Are there localized\nknowledge regions in LLMs? 
In this study, we introduce Query-Relevant Neuron\nCluster Attribution (QRNCA), a novel architecture-agnostic framework capable of\nidentifying query-relevant neurons in LLMs. QRNCA allows for the examination of\nlong-form answers beyond triplet facts by employing the proxy task of\nmulti-choice question answering. To evaluate the effectiveness of our detected\nneurons, we build two multi-choice QA datasets spanning diverse domains and\nlanguages. Empirical evaluations demonstrate that our method outperforms\nbaseline methods significantly. Further, analysis of neuron distributions\nreveals the presence of visible localized regions, particularly within\ndifferent domains. Finally, we show potential applications of our detected\nneurons in knowledge editing and neuron-based prediction.\n","authors":["Lihu Chen","Adam Dejl","Francesca Toni"],"pdf_url":"https://arxiv.org/pdf/2406.10868v4.pdf","comment":"AAAI 2025 Main Track"},{"id":"http://arxiv.org/abs/2411.10912v2","updated":"2024-12-19T16:20:49Z","published":"2024-11-16T23:29:32Z","title":"SPICA: Retrieving Scenarios for Pluralistic In-Context Alignment","summary":" When different groups' values differ, one approach to model alignment is to\nsteer models at inference time towards each group's preferences. However,\ntechniques like in-context learning only consider similarity when drawing\nfew-shot examples and not cross-group differences in values. We propose SPICA,\na framework that accounts for group-level differences during in-context example\nretrieval. SPICA introduces three designs: scenario banks, group-informed\nretrieval metrics, and in-context alignment prompts. From an evaluation of\nSPICA on an alignment task collecting inputs from four demographic groups ($n =\n544$), our metrics retrieve in-context examples that more closely match\nobserved preferences, with the best prompt configuration using multiple\ncontrastive responses to demonstrate examples. In an end-to-end evaluation ($n\n= 120$), we observe that SPICA is higher rated than similarity-based retrieval,\nwith groups seeing up to a +0.16 point improvement on a 5 point scale.\nAdditionally, gains from SPICA were more uniform, with all groups benefiting\nfrom alignment rather than only some. Finally, we find that while a\ngroup-agnostic approach can align to aggregated values, it is not most suited\nfor divergent groups.\n","authors":["Quan Ze Chen","K. J. Kevin Feng","Chan Young Park","Amy X. Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.10912v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15004v1","updated":"2024-12-19T16:20:22Z","published":"2024-12-19T16:20:22Z","title":"Large Language Models and Code Security: A Systematic Literature Review","summary":" Large Language Models (LLMs) have emerged as powerful tools for automating\nvarious programming tasks, including security-related ones, such as detecting\nand fixing vulnerabilities. Despite their promising capabilities, when required\nto produce or modify pre-existing code, LLMs could introduce vulnerabilities\nunbeknown to the programmer. When analyzing code, they could miss clear\nvulnerabilities or signal nonexistent ones. In this Systematic Literature\nReview (SLR), we aim to investigate both the security benefits and potential\ndrawbacks of using LLMs for a variety of code-related tasks. In particular,\nfirst we focus on the types of vulnerabilities that could be introduced by\nLLMs, when used for producing code. 
Second, we analyze the capabilities of LLMs\nto detect and fix vulnerabilities, in any given code, and how the prompting\nstrategy of choice impacts their performance in these two tasks. Last, we\nprovide an in-depth analysis on how data poisoning attacks on LLMs can impact\nperformance in the aforementioned tasks.\n","authors":["Enna Basic","Alberto Giaretta"],"pdf_url":"https://arxiv.org/pdf/2412.15004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08406v2","updated":"2024-12-19T16:09:47Z","published":"2024-09-12T21:39:01Z","title":"Knowledge Tagging with Large Language Model based Multi-Agent System","summary":" Knowledge tagging for questions is vital in modern intelligent educational\napplications, including learning progress diagnosis, practice question\nrecommendations, and course content organization. Traditionally, these\nannotations have been performed by pedagogical experts, as the task demands not\nonly a deep semantic understanding of question stems and knowledge definitions\nbut also a strong ability to link problem-solving logic with relevant knowledge\nconcepts. With the advent of advanced natural language processing (NLP)\nalgorithms, such as pre-trained language models and large language models\n(LLMs), pioneering studies have explored automating the knowledge tagging\nprocess using various machine learning models. In this paper, we investigate\nthe use of a multi-agent system to address the limitations of previous\nalgorithms, particularly in handling complex cases involving intricate\nknowledge definitions and strict numerical constraints. By demonstrating its\nsuperior performance on the publicly available math question knowledge tagging\ndataset, MathKnowCT, we highlight the significant potential of an LLM-based\nmulti-agent system in overcoming the challenges that previous methods have\nencountered. Finally, through an in-depth discussion of the implications of\nautomating knowledge tagging, we underscore the promising results of deploying\nLLM-based algorithms in educational contexts.\n","authors":["Hang Li","Tianlong Xu","Ethan Chang","Qingsong Wen"],"pdf_url":"https://arxiv.org/pdf/2409.08406v2.pdf","comment":"Accepted by AAAI 2025 (AAAI/IAAI 2025 Innovative Application Award)"},{"id":"http://arxiv.org/abs/2412.14986v1","updated":"2024-12-19T15:58:53Z","published":"2024-12-19T15:58:53Z","title":"Chain-of-MetaWriting: Linguistic and Textual Analysis of How Small\n Language Models Write Young Students Texts","summary":" Large Language Models (LLMs) have been used to generate texts in response to\ndifferent writing tasks: reports, essays, story telling. However, language\nmodels do not have a meta-representation of the text writing process, nor\ninherent communication learning needs, comparable to those of young human\nstudents. This paper introduces a fine-grained linguistic and textual analysis\nof multilingual Small Language Models' (SLMs) writing. With our method,\nChain-of-MetaWriting, SLMs can imitate some steps of the human writing process,\nsuch as planning and evaluation. We mainly focused on short story and essay\nwriting tasks in French for schoolchildren and undergraduate students\nrespectively. Our results show that SLMs encounter difficulties in assisting\nyoung students on sensitive topics such as violence in the schoolyard, and they\nsometimes use words too complex for the target audience. 
In particular, the\noutput is quite different from the human produced texts in term of text\ncohesion and coherence regarding temporal connectors, topic progression,\nreference.\n","authors":["Ioana Buhnila","Georgeta Cislaru","Amalia Todirascu"],"pdf_url":"https://arxiv.org/pdf/2412.14986v1.pdf","comment":"Accepted at WRAICOGS 2025 (Writing Aids at the Crossroads of AI,\n Cognitive Science, and NLP) co-located with COLING 2025"},{"id":"http://arxiv.org/abs/2412.11745v2","updated":"2024-12-19T15:55:45Z","published":"2024-12-16T13:03:43Z","title":"Beyond Dataset Creation: Critical View of Annotation Variation and Bias\n Probing of a Dataset for Online Radical Content Detection","summary":" The proliferation of radical content on online platforms poses significant\nrisks, including inciting violence and spreading extremist ideologies. Despite\nongoing research, existing datasets and models often fail to address the\ncomplexities of multilingual and diverse data. To bridge this gap, we introduce\na publicly available multilingual dataset annotated with radicalization levels,\ncalls for action, and named entities in English, French, and Arabic. This\ndataset is pseudonymized to protect individual privacy while preserving\ncontextual information. Beyond presenting our freely available dataset, we\nanalyze the annotation process, highlighting biases and disagreements among\nannotators and their implications for model performance. Additionally, we use\nsynthetic data to investigate the influence of socio-demographic traits on\nannotation patterns and model predictions. Our work offers a comprehensive\nexamination of the challenges and opportunities in building robust datasets for\nradical content detection, emphasizing the importance of fairness and\ntransparency in model development.\n","authors":["Arij Riabi","Virginie Mouilleron","Menel Mahamdi","Wissam Antoun","Djamé Seddah"],"pdf_url":"https://arxiv.org/pdf/2412.11745v2.pdf","comment":"Accepted to COLING 2025"},{"id":"http://arxiv.org/abs/2412.13765v2","updated":"2024-12-19T15:50:54Z","published":"2024-12-18T12:01:53Z","title":"LLM-SEM: A Sentiment-Based Student Engagement Metric Using LLMS for\n E-Learning Platforms","summary":" Current methods for analyzing student engagement in e-learning platforms,\nincluding automated systems, often struggle with challenges such as handling\nfuzzy sentiment in text comments and relying on limited metadata. Traditional\napproaches, such as surveys and questionnaires, also face issues like small\nsample sizes and scalability. In this paper, we introduce LLM-SEM (Language\nModel-Based Student Engagement Metric), a novel approach that leverages video\nmetadata and sentiment analysis of student comments to measure engagement. By\nutilizing recent Large Language Models (LLMs), we generate high-quality\nsentiment predictions to mitigate text fuzziness and normalize key features\nsuch as views and likes. Our holistic method combines comprehensive metadata\nwith sentiment polarity scores to gauge engagement at both the course and\nlesson levels. Extensive experiments were conducted to evaluate various LLM\nmodels, demonstrating the effectiveness of LLM-SEM in providing a scalable and\naccurate measure of student engagement. 
We fine-tuned TXLM-RoBERTa using\nhuman-annotated sentiment datasets to enhance prediction accuracy and utilized\nLLama 3B, and Gemma 9B from Ollama.\n","authors":["Ali Hamdi","Ahmed Abdelmoneim Mazrou","Mohamed Shaltout"],"pdf_url":"https://arxiv.org/pdf/2412.13765v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14965v1","updated":"2024-12-19T15:44:04Z","published":"2024-12-19T15:44:04Z","title":"Movie2Story: A framework for understanding videos and telling stories in\n the form of novel text","summary":" Multimodal video-to-text models have made considerable progress, primarily in\ngenerating brief descriptions of video content. However, there is still a\ndeficiency in generating rich long-form text descriptions that integrate both\nvideo and audio. In this paper, we introduce a framework called M2S, designed\nto generate novel-length text by combining audio, video, and character\nrecognition. M2S includes modules for video long-form text description and\ncomprehension, audio-based analysis of emotion, speech rate, and character\nalignment, and visual-based character recognition alignment. By integrating\nmultimodal information using the large language model GPT4o, M2S stands out in\nthe field of multimodal text generation. We demonstrate the effectiveness and\naccuracy of M2S through comparative experiments and human evaluation.\nAdditionally, the model framework has good scalability and significant\npotential for future research.\n","authors":["Kangning Li","Zheyang Jia","Anyu Ying"],"pdf_url":"https://arxiv.org/pdf/2412.14965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14964v1","updated":"2024-12-19T15:44:01Z","published":"2024-12-19T15:44:01Z","title":"Knowledge Injection via Prompt Distillation","summary":" In many practical applications, large language models (LLMs) need to\nincorporate new knowledge not present in their pre-training data. The primary\nmethods for this are fine-tuning and retrieval-augmented generation (RAG).\nAlthough RAG has emerged as the industry standard for knowledge injection,\nfine-tuning has not yet achieved comparable success. In this paper, we propose\na new fine-tuning technique for learning new knowledge and show that it can\nreach the performance of RAG. The proposed method is based on the\nself-distillation approach, which we call prompt distillation. First, we\ngenerate question-answer pairs about the new knowledge. Then, we fine-tune a\nstudent model on the question-answer pairs to imitate the output distributions\nof a teacher model, which additionally receives the new knowledge in its\nprompt. The student model is identical to the teacher, except it is equipped\nwith a LoRA adapter. This training procedure facilitates distilling the new\nknowledge from the teacher's prompt into the student's weights.\n","authors":["Kalle Kujanpää","Harri Valpola","Alexander Ilin"],"pdf_url":"https://arxiv.org/pdf/2412.14964v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2412.14959v1","updated":"2024-12-19T15:39:31Z","published":"2024-12-19T15:39:31Z","title":"Understanding the Dark Side of LLMs' Intrinsic Self-Correction","summary":" Intrinsic self-correction was proposed to improve LLMs' responses via\nfeedback prompts solely based on their inherent capability. However, recent\nworks show that LLMs' intrinsic self-correction fails without oracle labels as\nfeedback prompts. In this paper, we aim to interpret LLMs' intrinsic\nself-correction for different tasks, especially for those failure cases. 
By\nincluding one simple task and three complex tasks with state-of-the-art (SOTA)\nLLMs like ChatGPT families (o1, 4o, 3.5-turbo) and Llama families (2-7B, 3-8B,\nand 3.1-8B), we design three interpretation methods to reveal the dark side of\nLLMs' intrinsic self-correction. We identify intrinsic self-correction can (1)\ncause LLMs to waver both intermedia and final answers and lead to prompt bias\non simple factual questions; (2) introduce human-like cognitive bias on complex\ntasks. In light of our findings, we also provide two simple yet effective\nstrategies for alleviation: question repeating and supervised fine-tuning with\na few samples. We open-source our work at https://x-isc.info/.\n","authors":["Qingjie Zhang","Han Qiu","Di Wang","Haoting Qian","Yiming Li","Tianwei Zhang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2412.14959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.13647v2","updated":"2024-12-19T15:37:55Z","published":"2024-12-18T09:23:12Z","title":"G-VEval: A Versatile Metric for Evaluating Image and Video Captions\n Using GPT-4o","summary":" Evaluation metric of visual captioning is important yet not thoroughly\nexplored. Traditional metrics like BLEU, METEOR, CIDEr, and ROUGE often miss\nsemantic depth, while trained metrics such as CLIP-Score, PAC-S, and Polos are\nlimited in zero-shot scenarios. Advanced Language Model-based metrics also\nstruggle with aligning to nuanced human preferences. To address these issues,\nwe introduce G-VEval, a novel metric inspired by G-Eval and powered by the new\nGPT-4o. G-VEval uses chain-of-thought reasoning in large multimodal models and\nsupports three modes: reference-free, reference-only, and combined,\naccommodating both video and image inputs. We also propose MSVD-Eval, a new\ndataset for video captioning evaluation, to establish a more transparent and\nconsistent framework for both human experts and evaluation metrics. It is\ndesigned to address the lack of clear criteria in existing datasets by\nintroducing distinct dimensions of Accuracy, Completeness, Conciseness, and\nRelevance (ACCR). Extensive results show that G-VEval outperforms existing\nmethods in correlation with human annotations, as measured by Kendall tau-b and\nKendall tau-c. This provides a flexible solution for diverse captioning tasks\nand suggests a straightforward yet effective approach for large language models\nto understand video content, paving the way for advancements in automated\ncaptioning. Codes are available at https://github.com/ztangaj/gveval\n","authors":["Tony Cheng Tong","Sirui He","Zhiwen Shao","Dit-Yan Yeung"],"pdf_url":"https://arxiv.org/pdf/2412.13647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.20054v2","updated":"2024-12-19T15:30:40Z","published":"2024-06-28T17:07:06Z","title":"To Word Senses and Beyond: Inducing Concepts with Contextualized\n Language Models","summary":" Polysemy and synonymy are two crucial interrelated facets of lexical\nambiguity. While both phenomena are widely documented in lexical resources and\nhave been studied extensively in NLP, leading to dedicated systems, they are\noften being considered independently in practical problems. While many tasks\ndealing with polysemy (e.g. Word Sense Disambiguiation or Induction) highlight\nthe role of word's senses, the study of synonymy is rooted in the study of\nconcepts, i.e. meanings shared across the lexicon. 
In this paper, we introduce\nConcept Induction, the unsupervised task of learning a soft clustering among\nwords that defines a set of concepts directly from data. This task generalizes\nWord Sense Induction. We propose a bi-level approach to Concept Induction that\nleverages both a local lemma-centric view and a global cross-lexicon view to\ninduce concepts. We evaluate the obtained clustering on SemCor's annotated data\nand obtain good performance (BCubed F1 above 0.60). We find that the local and\nthe global levels are mutually beneficial to induce concepts and also senses in\nour setting. Finally, we create static embeddings representing our induced\nconcepts and use them on the Word-in-Context task, obtaining competitive\nperformance with the State-of-the-Art.\n","authors":["Bastien Liétard","Pascal Denis","Mikaella Keller"],"pdf_url":"https://arxiv.org/pdf/2406.20054v2.pdf","comment":"Published in EMNLP 2024 main conference proceedings"},{"id":"http://arxiv.org/abs/2408.10839v2","updated":"2024-12-19T15:25:41Z","published":"2024-08-20T13:34:17Z","title":"Benchmarking Large Language Models for Math Reasoning Tasks","summary":" The use of Large Language Models (LLMs) in mathematical reasoning has become\na cornerstone of related research, demonstrating the intelligence of these\nmodels and enabling potential practical applications through their advanced\nperformance, such as in educational settings. Despite the variety of datasets\nand in-context learning algorithms designed to improve the ability of LLMs to\nautomate mathematical problem solving, the lack of comprehensive benchmarking\nacross different datasets makes it complicated to select an appropriate model\nfor specific tasks. In this project, we present a benchmark that fairly\ncompares seven state-of-the-art in-context learning algorithms for mathematical\nproblem solving across five widely used mathematical datasets on four powerful\nfoundation models. Furthermore, we explore the trade-off between efficiency and\nperformance, highlighting the practical applications of LLMs for mathematical\nreasoning. Our results indicate that larger foundation models like GPT-4o and\nLLaMA 3-70B can solve mathematical reasoning independently from the concrete\nprompting strategy, while for smaller models the in-context learning approach\nsignificantly influences the performance. Moreover, the optimal prompt depends\non the chosen foundation model. We open-source our benchmark code to support\nthe integration of additional models in future research.\n","authors":["Kathrin Seßler","Yao Rong","Emek Gözlüklü","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2408.10839v2.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2412.11795v2","updated":"2024-12-19T15:21:44Z","published":"2024-12-16T14:07:39Z","title":"ProsodyFM: Unsupervised Phrasing and Intonation Control for Intelligible\n Speech Synthesis","summary":" Prosody contains rich information beyond the literal meaning of words, which\nis crucial for the intelligibility of speech. Current models still fall short\nin phrasing and intonation; they not only miss or misplace breaks when\nsynthesizing long sentences with complex structures but also produce unnatural\nintonation. We propose ProsodyFM, a prosody-aware text-to-speech synthesis\n(TTS) model with a flow-matching (FM) backbone that aims to enhance the\nphrasing and intonation aspects of prosody. 
ProsodyFM introduces two key\ncomponents: a Phrase Break Encoder to capture initial phrase break locations,\nfollowed by a Duration Predictor for the flexible adjustment of break\ndurations; and a Terminal Intonation Encoder which learns a bank of intonation\nshape tokens combined with a novel Pitch Processor for more robust modeling of\nhuman-perceived intonation change. ProsodyFM is trained with no explicit\nprosodic labels and yet can uncover a broad spectrum of break durations and\nintonation patterns. Experimental results demonstrate that ProsodyFM can\neffectively improve the phrasing and intonation aspects of prosody, thereby\nenhancing the overall intelligibility compared to four state-of-the-art (SOTA)\nmodels. Out-of-distribution experiments show that this prosody improvement can\nfurther bring ProsodyFM superior generalizability for unseen complex sentences\nand speakers. Our case study intuitively illustrates the powerful and\nfine-grained controllability of ProsodyFM over phrasing and intonation.\n","authors":["Xiangheng He","Junjie Chen","Zixing Zhang","Björn W. Schuller"],"pdf_url":"https://arxiv.org/pdf/2412.11795v2.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2410.07991v4","updated":"2024-12-19T15:16:49Z","published":"2024-10-10T14:48:57Z","title":"Human and LLM Biases in Hate Speech Annotations: A Socio-Demographic\n Analysis of Annotators and Targets","summary":" The rise of online platforms exacerbated the spread of hate speech, demanding\nscalable and effective detection. However, the accuracy of hate speech\ndetection systems heavily relies on human-labeled data, which is inherently\nsusceptible to biases. While previous work has examined the issue, the\ninterplay between the characteristics of the annotator and those of the target\nof the hate are still unexplored. We fill this gap by leveraging an extensive\ndataset with rich socio-demographic information of both annotators and targets,\nuncovering how human biases manifest in relation to the target's attributes.\nOur analysis surfaces the presence of widespread biases, which we\nquantitatively describe and characterize based on their intensity and\nprevalence, revealing marked differences. Furthermore, we compare human biases\nwith those exhibited by persona-based LLMs. Our findings indicate that while\npersona-based LLMs do exhibit biases, these differ significantly from those of\nhuman annotators. Overall, our work offers new and nuanced results on human\nbiases in hate speech annotations, as well as fresh insights into the design of\nAI-driven hate speech detection systems.\n","authors":["Tommaso Giorgi","Lorenzo Cima","Tiziano Fagni","Marco Avvenuti","Stefano Cresci"],"pdf_url":"https://arxiv.org/pdf/2410.07991v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.09378v2","updated":"2024-12-19T15:14:18Z","published":"2024-12-12T15:46:43Z","title":"From Bench to Bedside: A Review of Clinical Trials in Drug Discovery and\n Development","summary":" Clinical trials are an indispensable part of the drug development process,\nbridging the gap between basic research and clinical application. During the\ndevelopment of new drugs, clinical trials are used not only to evaluate the\nsafety and efficacy of the drug but also to explore its dosage, treatment\nregimens, and potential side effects. 
This review discusses the various stages\nof clinical trials, including Phase I (safety assessment), Phase II\n(preliminary efficacy evaluation), Phase III (large-scale validation), and\nPhase IV (post-marketing surveillance), highlighting the characteristics of\neach phase and their interrelationships. Additionally, the paper addresses the\nmajor challenges encountered in clinical trials, such as ethical issues,\nsubject recruitment difficulties, diversity and representativeness concerns,\nand proposes strategies for overcoming these challenges. With the advancement\nof technology, innovative technologies such as artificial intelligence, big\ndata, and digitalization are gradually transforming clinical trial design and\nimplementation, improving trial efficiency and data quality. The article also\nlooks forward to the future of clinical trials, particularly the impact of\nemerging therapies such as gene therapy and immunotherapy on trial design, as\nwell as the importance of regulatory reforms and global collaboration. In\nconclusion, the core role of clinical trials in drug development will continue\nto drive the progress of innovative drug development and clinical treatment.\n","authors":["Tianyang Wang","Ming Liu","Benji Peng","Xinyuan Song","Charles Zhang","Xintian Sun","Qian Niu","Junyu Liu","Silin Chen","Keyu Chen","Ming Li","Pohsun Feng","Ziqian Bi","Yunze Wang","Yichao Zhang","Cheng Fei","Lawrence KQ Yan"],"pdf_url":"https://arxiv.org/pdf/2412.09378v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2407.04693v2","updated":"2024-12-19T15:11:47Z","published":"2024-07-05T17:56:38Z","title":"ANAH-v2: Scaling Analytical Hallucination Annotation of Large Language\n Models","summary":" Large language models (LLMs) exhibit hallucinations in long-form\nquestion-answering tasks across various domains and wide applications. Current\nhallucination detection and mitigation datasets are limited in domains and\nsizes, which struggle to scale due to prohibitive labor costs and insufficient\nreliability of existing hallucination annotators. To facilitate the scalable\noversight of LLM hallucinations, this paper introduces an iterative\nself-training framework that simultaneously and progressively scales up the\nhallucination annotation dataset and improves the accuracy of the hallucination\nannotator. Based on the Expectation Maximization (EM) algorithm, in each\niteration, the framework first applies a hallucination annotation pipeline to\nannotate a scaled dataset and then trains a more accurate hallucination\nannotator on the dataset. This new hallucination annotator is adopted in the\nhallucination annotation pipeline used for the next iteration. Extensive\nexperimental results demonstrate that the finally obtained hallucination\nannotator with only 7B parameters surpasses the performance of GPT-4 and\nobtains new state-of-the-art hallucination detection results on HaluEval and\nHalluQA by zero-shot inference. Such an annotator can not only evaluate the\nhallucination levels of various LLMs on the large-scale dataset but also help\nto mitigate the hallucination of LLMs generations, with the Natural Language\nInference (NLI) metric increasing from 25% to 37% on HaluEval.\n","authors":["Yuzhe Gu","Ziwei Ji","Wenwei Zhang","Chengqi Lyu","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2407.04693v2.pdf","comment":"Accepted by NeurIPS 2024. 
Dataset, code, and model are released at\n https://github.com/open-compass/ANAH"},{"id":"http://arxiv.org/abs/2411.16300v3","updated":"2024-12-19T15:11:46Z","published":"2024-11-25T11:35:08Z","title":"BayLing 2: A Multilingual Large Language Model with Efficient Language\n Alignment","summary":" Large language models (LLMs), with their powerful generative capabilities and\nvast knowledge, empower various tasks in everyday life. However, these\nabilities are primarily concentrated in high-resource languages, leaving\nlow-resource languages with weaker generative capabilities and relatively\nlimited knowledge. Enhancing the multilingual capabilities of LLMs is therefore\ncrucial for serving over 100 linguistic communities worldwide. An intuitive\napproach to enhance the multilingual capabilities would be to construct\ninstruction data for various languages, but constructing instruction data for\nover 100 languages is prohibitively costly. In this paper, we introduce BayLing\n2, which efficiently transfers generative capabilities and knowledge from\nhigh-resource languages to low-resource languages through language alignment.\nTo achieve this, we constructed a dataset of 3.2 million instructions,\ncomprising high-resource language instructions (Chinese and English) and\ncross-lingual instructions for 100+ languages and performed instruction tuning\nbased on the dataset to facilitate the capability transfer between languages.\nUsing Llama as the foundation model, we developed BayLing-2-7B, BayLing-2-13B,\nand BayLing-2-8B, and conducted a comprehensive evaluation of BayLing. For\nmultilingual translation across 100+ languages, BayLing shows superior\nperformance compared to open-source models of similar scale. For multilingual\nknowledge and understanding benchmarks, BayLing achieves significant\nimprovements across over 20 low-resource languages, demonstrating its\ncapability of effective knowledge transfer from high-resource to low-resource\nlanguages. Furthermore, results on English benchmarks indicate that BayLing\nmaintains high performance in highresource languages while enhancing the\nperformance in low-resource languages. Demo, homepage, code and models of\nBayLing are available.\n","authors":["Shaolei Zhang","Kehao Zhang","Qingkai Fang","Shoutao Guo","Yan Zhou","Xiaodong Liu","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2411.16300v3.pdf","comment":"BayLing 2's online demo: http://nlp.ict.ac.cn/bayling/demo. BayLing\n 2's code and models: https://github.com/ictnlp/BayLing"},{"id":"http://arxiv.org/abs/2312.00326v5","updated":"2024-12-19T15:07:38Z","published":"2023-12-01T03:44:54Z","title":"Agent-OM: Leveraging LLM Agents for Ontology Matching","summary":" Ontology matching (OM) enables semantic interoperability between different\nontologies and resolves their conceptual heterogeneity by aligning related\nentities. OM systems currently have two prevailing design paradigms:\nconventional knowledge-based expert systems and newer machine learning-based\npredictive systems. While large language models (LLMs) and LLM agents have\nrevolutionised data engineering and have been applied creatively in many\ndomains, their potential for OM remains underexplored. This study introduces a\nnovel agent-powered LLM-based design paradigm for OM systems. 
With\nconsideration of several specific challenges in leveraging LLM agents for OM,\nwe propose a generic framework, namely Agent-OM (Agent for Ontology Matching),\nconsisting of two Siamese agents for retrieval and matching, with a set of\nsimple OM tools. Our framework is implemented in a proof-of-concept system.\nEvaluations of three Ontology Alignment Evaluation Initiative (OAEI) tracks\nover state-of-the-art OM systems show that our system can achieve results very\nclose to the long-standing best performance on simple OM tasks and can\nsignificantly improve the performance on complex and few-shot OM tasks.\n","authors":["Zhangcheng Qiang","Weiqing Wang","Kerry Taylor"],"pdf_url":"https://arxiv.org/pdf/2312.00326v5.pdf","comment":"19 pages, 13 figures, 4 tables"},{"id":"http://arxiv.org/abs/2412.14922v1","updated":"2024-12-19T15:00:18Z","published":"2024-12-19T15:00:18Z","title":"RobustFT: Robust Supervised Fine-tuning for Large Language Models under\n Noisy Response","summary":" Supervised fine-tuning (SFT) plays a crucial role in adapting large language\nmodels (LLMs) to specific domains or tasks. However, as demonstrated by\nempirical experiments, the collected data inevitably contains noise in\npractical applications, which poses significant challenges to model performance\non downstream tasks. Therefore, there is an urgent need for a noise-robust SFT\nframework to enhance model capabilities in downstream tasks. To address this\nchallenge, we introduce a robust SFT framework (RobustFT) that performs noise\ndetection and relabeling on downstream task data. For noise identification, our\napproach employs a multi-expert collaborative system with inference-enhanced\nmodels to achieve superior noise detection. In the denoising phase, we utilize\na context-enhanced strategy, which incorporates the most relevant and confident\nknowledge followed by careful assessment to generate reliable annotations.\nAdditionally, we introduce an effective data selection mechanism based on\nresponse entropy, ensuring only high-quality samples are retained for\nfine-tuning. Extensive experiments conducted on multiple LLMs across five\ndatasets demonstrate RobustFT's exceptional performance in noisy scenarios.\n","authors":["Junyu Luo","Xiao Luo","Kaize Ding","Jingyang Yuan","Zhiping Xiao","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.14922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14905v1","updated":"2024-12-19T14:37:11Z","published":"2024-12-19T14:37:11Z","title":"Dehallucinating Parallel Context Extension for Retrieval-Augmented\n Generation","summary":" Large language models (LLMs) are susceptible to generating hallucinated\ninformation, despite the integration of retrieval-augmented generation (RAG).\nParallel context extension (PCE) is a line of research attempting to\neffectively integrating parallel (unordered) contexts, while it still suffers\nfrom hallucinations when adapted to RAG scenarios. In this paper, we propose\nDePaC (Dehallucinating Parallel Context Extension), which alleviates the\nhallucination problem with context-aware negative training and\ninformation-calibrated aggregation. DePaC is designed to alleviate two types of\nin-context hallucination: fact fabrication (i.e., LLMs present claims that are\nnot supported by the contexts) and fact omission (i.e., LLMs fail to present\nclaims that can be supported by the contexts). 
Specifically, (1) for fact\nfabrication, we apply the context-aware negative training that fine-tunes the\nLLMs with negative supervisions, thus explicitly guiding the LLMs to refuse to\nanswer when contexts are not related to questions; (2) for fact omission, we\npropose the information-calibrated aggregation which prioritizes context\nwindows with higher information increment from their contexts. The experimental\nresults on nine RAG tasks demonstrate that DePaC significantly alleviates the\ntwo types of hallucination and consistently achieves better performances on\nthese tasks.\n","authors":["Zexiong Ma","Shengnan An","Zeqi Lin","Yanzhen Zou","Jian-Guang Lou","Bing Xie"],"pdf_url":"https://arxiv.org/pdf/2412.14905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14872v1","updated":"2024-12-19T14:11:15Z","published":"2024-12-19T14:11:15Z","title":"Why language models collapse when trained on recursively generated text","summary":" Language models (LMs) have been widely used to generate text on the Internet.\nThe generated text is often collected into the training corpus of the next\ngenerations of LMs. Previous work has experimentally found that LMs collapse\nwhen trained on recursively generated text. This paper contributes to existing\nknowledge from two aspects. We present a theoretical proof of LM collapse. Our\nproof reveals the cause of LM collapse and proves that all auto-regressive LMs\nwill definitely collapse. We present a new finding: the performance of LMs\ngradually declines when trained on recursively generated text until they\nperform no better than a randomly initialized LM. The trained LMs produce large\namounts of repetitive text and perform poorly across a wide range of natural\nlanguage tasks. The above proof and new findings deepen our understanding of LM\ncollapse and offer valuable insights that may inspire new training techniques\nto mitigate this threat.\n","authors":["Lecheng Wang","Xianjie Shi","Ge Li","Jia Li","Yihong Dong","Xuanming Zhang","Wenpin Jiao","Hong Mei"],"pdf_url":"https://arxiv.org/pdf/2412.14872v1.pdf","comment":"28 pages, 9 figures"},{"id":"http://arxiv.org/abs/2412.14867v1","updated":"2024-12-19T14:03:22Z","published":"2024-12-19T14:03:22Z","title":"Graph-Convolutional Networks: Named Entity Recognition and Large\n Language Model Embedding in Document Clustering","summary":" Recent advances in machine learning, particularly Large Language Models\n(LLMs) such as BERT and GPT, provide rich contextual embeddings that improve\ntext representation. However, current document clustering approaches often\nignore the deeper relationships between named entities (NEs) and the potential\nof LLM embeddings. This paper proposes a novel approach that integrates Named\nEntity Recognition (NER) and LLM embeddings within a graph-based framework for\ndocument clustering. The method builds a graph with nodes representing\ndocuments and edges weighted by named entity similarity, optimized using a\ngraph-convolutional network (GCN). This ensures a more effective grouping of\nsemantically related documents. 
Experimental results indicate that our approach\noutperforms conventional co-occurrence-based methods in clustering, notably for\ndocuments rich in named entities.\n","authors":["Imed Keraghel","Mohamed Nadif"],"pdf_url":"https://arxiv.org/pdf/2412.14867v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.06228v2","updated":"2024-12-19T13:57:05Z","published":"2024-11-09T16:17:14Z","title":"An $\\mathbf{L^*}$ Algorithm for Deterministic Weighted Regular Languages","summary":" Extracting finite state automata (FSAs) from black-box models offers a\npowerful approach to gaining interpretable insights into complex model\nbehaviors. To support this pursuit, we present a weighted variant of Angluin's\n(1987) $\\mathbf{L^*}$ algorithm for learning FSAs. We stay faithful to the\noriginal algorithm, devising a way to exactly learn deterministic weighted FSAs\nwhose weights support division. Furthermore, we formulate the learning process\nin a manner that highlights the connection with FSA minimization, showing how\n$\\mathbf{L^*}$ directly learns a minimal automaton for the target language.\n","authors":["Clemente Pasti","Talu Karagöz","Anej Svete","Franz Nowak","Reda Boumasmoud","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2411.06228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14860v1","updated":"2024-12-19T13:55:48Z","published":"2024-12-19T13:55:48Z","title":"Think&Cite: Improving Attributed Text Generation with Self-Guided Tree\n Search and Progress Reward Modeling","summary":" Despite their outstanding capabilities, large language models (LLMs) are\nprone to hallucination and producing factually incorrect information. This\nchallenge has spurred efforts in attributed text generation, which prompts LLMs\nto generate content with supporting evidence. In this paper, we propose a novel\nframework, called Think&Cite, and formulate attributed text generation as a\nmulti-step reasoning problem integrated with search. Specifically, we propose\nSelf-Guided Monte Carlo Tree Search (SG-MCTS), which capitalizes on the\nself-reflection capability of LLMs to reflect on the intermediate states of\nMCTS for guiding the tree expansion process. To provide reliable and\ncomprehensive feedback, we introduce Progress Reward Models to measure the\nprogress of tree search from the root to the current state from two aspects,\ni.e., generation and attribution progress. We conduct extensive experiments on\nthree datasets and the results show that our approach significantly outperforms\nbaseline approaches.\n","authors":["Junyi Li","Hwee Tou Ng"],"pdf_url":"https://arxiv.org/pdf/2412.14860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14849v1","updated":"2024-12-19T13:39:47Z","published":"2024-12-19T13:39:47Z","title":"DS$^2$-ABSA: Dual-Stream Data Synthesis with Label Refinement for\n Few-Shot Aspect-Based Sentiment Analysis","summary":" Recently developed large language models (LLMs) have presented promising new\navenues to address data scarcity in low-resource scenarios. In few-shot\naspect-based sentiment analysis (ABSA), previous efforts have explored data\naugmentation techniques, which prompt LLMs to generate new samples by modifying\nexisting ones. However, these methods fail to produce adequately diverse data,\nimpairing their effectiveness. 
Besides, some studies apply in-context learning\nfor ABSA by using specific instructions and a few selected examples as prompts.\nThough promising, LLMs often yield labels that deviate from task requirements.\nTo overcome these limitations, we propose DS$^2$-ABSA, a dual-stream data\nsynthesis framework targeted for few-shot ABSA. It leverages LLMs to synthesize\ndata from two complementary perspectives: \\textit{key-point-driven} and\n\\textit{instance-driven}, which effectively generate diverse and high-quality\nABSA samples in low-resource settings. Furthermore, a \\textit{label refinement}\nmodule is integrated to improve the synthetic labels. Extensive experiments\ndemonstrate that DS$^2$-ABSA significantly outperforms previous few-shot ABSA\nsolutions and other LLM-oriented data generation methods.\n","authors":["Hongling Xu","Yice Zhang","Qianlong Wang","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2412.14849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14847v1","updated":"2024-12-19T13:39:24Z","published":"2024-12-19T13:39:24Z","title":"A Survey of RWKV","summary":" The Receptance Weighted Key Value (RWKV) model offers a novel alternative to\nthe Transformer architecture, merging the benefits of recurrent and\nattention-based systems. Unlike conventional Transformers, which depend heavily\non self-attention, RWKV adeptly captures long-range dependencies with minimal\ncomputational demands. By utilizing a recurrent framework, RWKV addresses some\ncomputational inefficiencies found in Transformers, particularly in tasks with\nlong sequences. RWKV has recently drawn considerable attention for its robust\nperformance across multiple domains. Despite its growing popularity, no\nsystematic review of the RWKV model exists. This paper seeks to fill this gap\nas the first comprehensive review of the RWKV architecture, its core\nprinciples, and its varied applications, such as natural language generation,\nnatural language understanding, and computer vision. We assess how RWKV\ncompares to traditional Transformer models, highlighting its capability to\nmanage long sequences efficiently and lower computational costs. Furthermore,\nwe explore the challenges RWKV encounters and propose potential directions for\nfuture research and advancement. We consistently maintain the related\nopen-source materials at: https://github.com/MLGroupJLU/RWKV-Survey.\n","authors":["Zhiyuan Li","Tingyu Xia","Yi Chang","Yuan Wu"],"pdf_url":"https://arxiv.org/pdf/2412.14847v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2412.14843v1","updated":"2024-12-19T13:36:18Z","published":"2024-12-19T13:36:18Z","title":"Mapping and Influencing the Political Ideology of Large Language Models\n using Synthetic Personas","summary":" The analysis of political biases in large language models (LLMs) has\nprimarily examined these systems as single entities with fixed viewpoints.\nWhile various methods exist for measuring such biases, the impact of\npersona-based prompting on LLMs' political orientation remains unexplored. In\nthis work we leverage PersonaHub, a collection of synthetic persona\ndescriptions, to map the political distribution of persona-based prompted LLMs\nusing the Political Compass Test (PCT). We then examine whether these initial\ncompass distributions can be manipulated through explicit ideological prompting\ntowards diametrically opposed political orientations: right-authoritarian and\nleft-libertarian. 
Our experiments reveal that synthetic personas predominantly\ncluster in the left-libertarian quadrant, with models demonstrating varying\ndegrees of responsiveness when prompted with explicit ideological descriptors.\nWhile all models demonstrate significant shifts towards right-authoritarian\npositions, they exhibit more limited shifts towards left-libertarian positions,\nsuggesting an asymmetric response to ideological manipulation that may reflect\ninherent biases in model training.\n","authors":["Pietro Bernardelle","Leon Fröhling","Stefano Civelli","Riccardo Lunardi","Kevin Roiter","Gianluca Demartini"],"pdf_url":"https://arxiv.org/pdf/2412.14843v1.pdf","comment":"4 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2412.14838v1","updated":"2024-12-19T13:28:42Z","published":"2024-12-19T13:28:42Z","title":"DynamicKV: Task-Aware Adaptive KV Cache Compression for Long Context\n LLMs","summary":" Efficient KV cache management in LLMs is crucial for long-context tasks like\nRAG and summarization. Existing KV cache compression methods enforce a fixed\npattern, neglecting task-specific characteristics and reducing the retention of\nessential information. However, we observe distinct activation patterns across\nlayers in various tasks, highlighting the need for adaptive strategies tailored\nto each task's unique demands. Based on this insight, we propose DynamicKV, a\nmethod that dynamically optimizes token retention by adjusting the number of\ntokens retained at each layer to adapt to the specific task. DynamicKV\nestablishes global and per-layer maximum KV cache budgets, temporarily\nretaining the maximum budget for the current layer, and periodically updating\nthe KV cache sizes of all preceding layers during inference. Our method retains\nonly 1.7% of the KV cache size while achieving ~85% of the Full KV cache\nperformance on LongBench. Notably, even under extreme compression (0.9%),\nDynamicKV surpasses state-of-the-art (SOTA) methods by 11% in the\nNeedle-in-a-Haystack test using Mistral-7B-Instruct-v0.2. The code will be\nreleased.\n","authors":["Xiabin Zhou","Wenbin Wang","Minyan Zeng","Jiaxian Guo","Xuebo Liu","Li Shen","Min Zhang","Liang Ding"],"pdf_url":"https://arxiv.org/pdf/2412.14838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14835v1","updated":"2024-12-19T13:25:39Z","published":"2024-12-19T13:25:39Z","title":"Progressive Multimodal Reasoning via Active Retrieval","summary":" Multi-step multimodal reasoning tasks pose significant challenges for\nmultimodal large language models (MLLMs), and finding effective ways to enhance\ntheir performance in such scenarios remains an unresolved issue. In this paper,\nwe propose AR-MCTS, a universal framework designed to progressively improve the\nreasoning capabilities of MLLMs through Active Retrieval (AR) and Monte Carlo\nTree Search (MCTS). Our approach begins with the development of a unified\nretrieval module that retrieves key supporting insights for solving complex\nreasoning problems from a hybrid-modal retrieval corpus. To bridge the gap in\nautomated multimodal reasoning verification, we employ the MCTS algorithm\ncombined with an active retrieval mechanism, which enables the automatic\ngeneration of step-wise annotations. 
This strategy dynamically retrieves key\ninsights for each reasoning step, moving beyond traditional beam search\nsampling to improve the diversity and reliability of the reasoning space.\nAdditionally, we introduce a process reward model that aligns progressively to\nsupport the automatic verification of multimodal reasoning tasks. Experimental\nresults across three complex multimodal reasoning benchmarks confirm the\neffectiveness of the AR-MCTS framework in enhancing the performance of various\nmultimodal models. Further analysis demonstrates that AR-MCTS can optimize\nsampling diversity and accuracy, yielding reliable multimodal reasoning.\n","authors":["Guanting Dong","Chenghao Zhang","Mengjie Deng","Yutao Zhu","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2412.14835v1.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2412.14829v1","updated":"2024-12-19T13:19:19Z","published":"2024-12-19T13:19:19Z","title":"Mention Attention for Pronoun Translation","summary":" Most pronouns are referring expressions, computers need to resolve what do\nthe pronouns refer to, and there are divergences on pronoun usage across\nlanguages. Thus, dealing with these divergences and translating pronouns is a\nchallenge in machine translation. Mentions are referring candidates of pronouns\nand have closer relations with pronouns compared to general tokens. We assume\nthat extracting additional mention features can help pronoun translation.\nTherefore, we introduce an additional mention attention module in the decoder\nto pay extra attention to source mentions but not non-mention tokens. Our\nmention attention module not only extracts features from source mentions, but\nalso considers target-side context which benefits pronoun translation. In\naddition, we also introduce two mention classifiers to train models to\nrecognize mentions, whose outputs guide the mention attention. We conduct\nexperiments on the WMT17 English-German translation task, and evaluate our\nmodels on general translation and pronoun translation, using BLEU, APT, and\ncontrastive evaluation metrics. Our proposed model outperforms the baseline\nTransformer model in terms of APT and BLEU scores, this confirms our hypothesis\nthat we can improve pronoun translation by paying additional attention to\nsource mentions, and shows that our introduced additional modules do not have\nnegative effect on the general translation quality.\n","authors":["Gongbo Tang","Christian Hardmeier"],"pdf_url":"https://arxiv.org/pdf/2412.14829v1.pdf","comment":"camera-ready version of the paper accepted by JCRAI-23 conference, in\n ACL format"},{"id":"http://arxiv.org/abs/2408.17072v2","updated":"2024-12-19T13:16:40Z","published":"2024-08-30T07:57:30Z","title":"MaFeRw: Query Rewriting with Multi-Aspect Feedbacks for\n Retrieval-Augmented Large Language Models","summary":" In a real-world RAG system, the current query often involves spoken ellipses\nand ambiguous references from dialogue contexts, necessitating query rewriting\nto better describe user's information needs. However, traditional context-based\nrewriting has minimal enhancement on downstream generation tasks due to the\nlengthy process from query rewriting to response generation. Some researchers\ntry to utilize reinforcement learning with generation feedback to assist the\nrewriter, but these sparse rewards provide little guidance in most cases,\nleading to unstable training and generation results. 
We find that user's needs\nare also reflected in the gold document, retrieved documents and ground truth.\nTherefore, by feeding back these multi-aspect dense rewards to query rewriting,\nmore stable and satisfactory responses can be achieved. In this paper, we\npropose a novel query rewriting method MaFeRw, which improves RAG performance\nby integrating multi-aspect feedback from both the retrieval process and\ngenerated results. Specifically, we first use manual data to train a T5 model\nfor the rewriter initialization. Next, we design three metrics as reinforcement\nlearning feedback: the similarity between the rewritten query and the gold\ndocument, the ranking metrics, and ROUGE between the generation and the ground\ntruth. Inspired by RLAIF, we train three kinds of reward models for the above\nmetrics to achieve more efficient training. Finally, we combine the scores of\nthese reward models as feedback, and use the PPO algorithm to explore the optimal\nquery rewriting strategy. Experimental results on two conversational RAG\ndatasets demonstrate that MaFeRw achieves superior generation metrics and more\nstable training compared to baselines.\n","authors":["Yujing Wang","Hainan Zhang","Liang Pang","Binghui Guo","Hongwei Zheng","Zhiming Zheng"],"pdf_url":"https://arxiv.org/pdf/2408.17072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14809v1","updated":"2024-12-19T12:57:47Z","published":"2024-12-19T12:57:47Z","title":"ResoFilter: Fine-grained Synthetic Data Filtering for Large Language\n Models through Data-Parameter Resonance Analysis","summary":" Large language models (LLMs) have shown remarkable effectiveness across\nvarious domains, with data augmentation methods utilizing GPT for synthetic\ndata generation becoming prevalent. However, the quality and utility of\naugmented data remain questionable, and current methods lack clear metrics for\nevaluating data characteristics. To address these challenges, we propose\nResoFilter, a novel method that integrates models, data, and tasks to refine\ndatasets. ResoFilter leverages the fine-tuning process to obtain Data-Parameter\nfeatures for data selection, offering improved interpretability by representing\ndata characteristics through model weights. Our experiments demonstrate that\nResoFilter achieves comparable results to full-scale fine-tuning using only\nhalf the data in mathematical tasks and exhibits strong generalization across\ndifferent models and domains. This method provides valuable insights for\nconstructing synthetic datasets and evaluating high-quality data, offering a\npromising solution for enhancing data augmentation techniques and improving\ntraining dataset quality for LLMs. For reproducibility, we will release our\ncode and data upon acceptance.\n","authors":["Zeao Tu","Xiangdi Meng","Yu He","Zihan Yao","Tianyu Qi","Jun Liu","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2412.14809v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2412.14780v1","updated":"2024-12-19T12:06:24Z","published":"2024-12-19T12:06:24Z","title":"Disentangling Reasoning Tokens and Boilerplate Tokens For Language Model\n Fine-tuning","summary":" When using agent-task datasets to enhance agent capabilities for Large\nLanguage Models (LLMs), current methodologies often treat all tokens within a\nsample equally. 
However, we argue that tokens serving different roles -\nspecifically, reasoning tokens versus boilerplate tokens (e.g., those governing\noutput format) - differ significantly in importance and learning complexity,\nnecessitating their disentanglement and distinct treatment. To address this, we\npropose a novel Shuffle-Aware Discriminator (SHAD) for adaptive token\ndiscrimination. SHAD classifies tokens by exploiting predictability differences\nobserved after shuffling input-output combinations across samples: boilerplate\ntokens, due to their repetitive nature among samples, maintain predictability,\nwhereas reasoning tokens do not. Using SHAD, we propose the\nReasoning-highlighted Fine-Tuning (RFT) method, which adaptively emphasizes\nreasoning tokens during fine-tuning, yielding notable performance gains over\ncommon Supervised Fine-Tuning (SFT).\n","authors":["Ziang Ye","Zhenru Zhang","Yang Zhang","Jianxin Ma","Junyang Lin","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2412.14780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14771v1","updated":"2024-12-19T11:55:51Z","published":"2024-12-19T11:55:51Z","title":"ALKAFI-LLAMA3: Fine-Tuning LLMs for Precise Legal Understanding in\n Palestine","summary":" Large Language Models (LLMs) have demonstrated remarkable potential in\ndiverse domains, yet their application in the legal sector, particularly in\nlow-resource contexts, remains limited. This study addresses the challenges of\nadapting LLMs to the Palestinian legal domain, where political instability,\nfragmented legal frameworks, and limited AI resources hinder effective\nmachine-learning applications. We present a fine-tuned model based on a\nquantized version of Llama-3.2-1B-Instruct, trained on a synthetic data set\nderived from Palestinian legal texts. Using smaller-scale models and\nstrategically generated question-answer pairs, we achieve a cost-effective,\nlocally sustainable solution that provides accurate and contextually relevant\nlegal guidance. Our experiments demonstrate promising performance on various\nquery types, ranging from yes/no questions and narrative explanations to\ncomplex legal differentiations, while highlighting areas for improvement, such\nas handling calculation-based inquiries and structured list formatting. This\nwork provides a pathway for the deployment of AI-driven legal assistance tools\ntailored to the needs of resource-constrained environments.\n","authors":["Rabee Qasem","Mohannad Hendi","Banan Tantour"],"pdf_url":"https://arxiv.org/pdf/2412.14771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14769v1","updated":"2024-12-19T11:51:57Z","published":"2024-12-19T11:51:57Z","title":"PsyDraw: A Multi-Agent Multimodal System for Mental Health Screening in\n Left-Behind Children","summary":" Left-behind children (LBCs), numbering over 66 million in China, face severe\nmental health challenges due to parental migration for work. Early screening\nand identification of at-risk LBCs is crucial, yet challenging due to the\nsevere shortage of mental health professionals, especially in rural areas.\nWhile the House-Tree-Person (HTP) test shows higher child participation rates,\nits requirement for expert interpretation limits its application in\nresource-scarce regions. To address this challenge, we propose PsyDraw, a\nmulti-agent system based on Multimodal Large Language Models that assists\nmental health professionals in analyzing HTP drawings. 
The system employs\nspecialized agents for feature extraction and psychological interpretation,\noperating in two stages: comprehensive feature analysis and professional report\ngeneration. Evaluation of HTP drawings from 290 primary school students reveals\nthat 71.03% of the analyses achieved High Consistency with professional\nevaluations, 26.21% Moderate Consistency and only 2.41% Low Consistency. The\nsystem identified 31.03% of cases requiring professional attention,\ndemonstrating its effectiveness as a preliminary screening tool. Currently\ndeployed in pilot schools, PsyDraw shows promise in supporting mental health\nprofessionals, particularly in resource-limited areas, while maintaining high\nprofessional standards in psychological assessment.\n","authors":["Yiqun Zhang","Xiaocui Yang","Xiaobai Li","Siyuan Yu","Yi Luan","Shi Feng","Daling Wang","Yifei Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.14769v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2412.14751v1","updated":"2024-12-19T11:30:07Z","published":"2024-12-19T11:30:07Z","title":"Query pipeline optimization for cancer patient question answering\n systems","summary":" Retrieval-augmented generation (RAG) mitigates hallucination in Large\nLanguage Models (LLMs) by using query pipelines to retrieve relevant external\ninformation and grounding responses in retrieved knowledge. However, query\npipeline optimization for cancer patient question-answering (CPQA) systems\nrequires separately optimizing multiple components with domain-specific\nconsiderations. We propose a novel three-aspect optimization approach for the\nRAG query pipeline in CPQA systems, utilizing public biomedical databases like\nPubMed and PubMed Central. Our optimization includes: (1) document retrieval,\nutilizing a comparative analysis of NCBI resources and introducing Hybrid\nSemantic Real-time Document Retrieval (HSRDR); (2) passage retrieval,\nidentifying optimal pairings of dense retrievers and rerankers; and (3)\nsemantic representation, introducing Semantic Enhanced Overlap Segmentation\n(SEOS) for improved contextual understanding. On a custom-developed dataset\ntailored for cancer-related inquiries, our optimized RAG approach improved the\nanswer accuracy of Claude-3-haiku by 5.24% over chain-of-thought prompting and\nabout 3% over a naive RAG setup. This study highlights the importance of\ndomain-specific query optimization in realizing the full potential of RAG and\nprovides a robust framework for building more accurate and reliable CPQA\nsystems, advancing the development of RAG-based biomedical systems.\n","authors":["Maolin He","Rena Gao","Mike Conway","Brian E. Chapman"],"pdf_url":"https://arxiv.org/pdf/2412.14751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14737v1","updated":"2024-12-19T11:10:36Z","published":"2024-12-19T11:10:36Z","title":"On Verbalized Confidence Scores for LLMs","summary":" The rise of large language models (LLMs) and their tight integration into our\ndaily life make it essential to dedicate efforts towards their trustworthiness.\nUncertainty quantification for LLMs can establish more human trust in their\nresponses, but also allows LLM agents to make more informed decisions based on\neach other's uncertainty. To estimate the uncertainty in a response, internal\ntoken logits, task-specific proxy models, or sampling of multiple responses are\ncommonly used. 
This work focuses on asking the LLM itself to verbalize its\nuncertainty with a confidence score as part of its output tokens, which is a\npromising way for prompt- and model-agnostic uncertainty quantification with\nlow overhead. Using an extensive benchmark, we assess the reliability of\nverbalized confidence scores with respect to different datasets, models, and\nprompt methods. Our results reveal that the reliability of these scores\nstrongly depends on how the model is asked, but also that it is possible to\nextract well-calibrated confidence scores with certain prompt methods. We argue\nthat verbalized confidence scores can become a simple but effective and\nversatile uncertainty quantification method in the future. Our code is\navailable at https://github.com/danielyxyang/llm-verbalized-uq .\n","authors":["Daniel Yang","Yao-Hung Hubert Tsai","Makoto Yamada"],"pdf_url":"https://arxiv.org/pdf/2412.14737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18403v2","updated":"2024-12-19T11:07:09Z","published":"2024-06-26T14:56:13Z","title":"LLMs instead of Human Judges? A Large Scale Empirical Study across 20\n NLP Evaluation Tasks","summary":" There is an increasing trend towards evaluating NLP models with LLMs instead\nof human judgments, raising questions about the validity of these evaluations,\nas well as their reproducibility in the case of proprietary models. We provide\nJUDGE-BENCH, an extensible collection of 20 NLP datasets with human annotations\ncovering a broad range of evaluated properties and types of data, and\ncomprehensively evaluate 11 current LLMs, covering both open-weight and\nproprietary models, for their ability to replicate the annotations. Our\nevaluations show substantial variance across models and datasets. Models are\nreliable evaluators on some tasks, but overall display substantial variability\ndepending on the property being evaluated, the expertise level of the human\njudges, and whether the language is human or model-generated. We conclude that\nLLMs should be carefully validated against human judgments before being used as\nevaluators.\n","authors":["Anna Bavaresco","Raffaella Bernardi","Leonardo Bertolazzi","Desmond Elliott","Raquel Fernández","Albert Gatt","Esam Ghaleb","Mario Giulianelli","Michael Hanna","Alexander Koller","André F. T. Martins","Philipp Mondorf","Vera Neplenbroek","Sandro Pezzelle","Barbara Plank","David Schlangen","Alessandro Suglia","Aditya K Surikuchi","Ece Takmaz","Alberto Testoni"],"pdf_url":"https://arxiv.org/pdf/2406.18403v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11242v2","updated":"2024-12-19T10:33:13Z","published":"2024-12-15T16:47:16Z","title":"TrimLLM: Progressive Layer Dropping for Domain-Specific LLMs","summary":" Specializing large language models (LLMs) for local deployment in\ndomain-specific use cases is necessary for strong performance while meeting\nlatency and privacy constraints. However, conventional task-specific adaptation\napproaches do not show simultaneous memory saving and inference speedup at\ndeployment time. Practical compression techniques like quantization and pruning\nrequire dedicated hardware or kernel support to achieve measured inference\nspeedup. We develop TrimLLM based on the layer-wise specialization phenomenon\nwe empirically observed and verified on contemporary LLMs. TrimLLM reduces the\ndepth of LLMs via progressive layer dropping. 
We show it retains LLMs' capacity\nin specific domains and achieves inference speedup irrespective of hardware and\ndeep learning frameworks. We evaluated TrimLLM on LLMs of various sizes for\ninference; models adapted on medical, legal, and financial datasets all\ndemonstrate $2.1-5.7\\times$ inference speedup on consumer GPUs and up to\n$3.1\\times$ speedup on A100 when compared to state-of-the-art model compression\nalgorithms, with no loss in accuracy at 50$\\sim$60\\% model compression ratio.\n","authors":["Lanxiang Hu","Tajana Rosing","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.11242v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07675v3","updated":"2024-12-19T10:11:42Z","published":"2024-12-10T17:02:58Z","title":"RAZOR: Sharpening Knowledge by Cutting Bias with Unsupervised Text\n Rewriting","summary":" Despite the widespread use of LLMs due to their superior performance in\nvarious tasks, their high computational costs often lead potential users to opt\nfor the pretraining-finetuning pipeline. However, biases prevalent in manually\nconstructed datasets can introduce spurious correlations between tokens and\nlabels, creating so-called shortcuts and hindering the generalizability of\nfine-tuned models. Existing debiasing methods often rely on prior knowledge of\nspecific dataset biases, which is challenging to acquire a priori. We propose\nRAZOR (Rewriting And Zero-bias Optimization Refinement), a novel, unsupervised,\nand data-focused debiasing approach based on text rewriting for shortcut\nmitigation. RAZOR leverages LLMs to iteratively rewrite potentially biased text\nsegments by replacing them with heuristically selected alternatives in a\nshortcut space defined by token statistics and positional information. This\nprocess aims to align surface-level text features more closely with diverse\nlabel distributions, thereby promoting the learning of genuine linguistic\npatterns. Compared with unsupervised SoTA models, RAZOR improves by 3.5% on the\nFEVER and 6.5% on MNLI and SNLI datasets according to the F1 score.\nAdditionally, RAZOR effectively mitigates specific known biases, reducing\nbias-related terms by x2 without requiring prior bias information, a result\nthat is on par with SoTA models that leverage prior information. Our work\nprioritizes data manipulation over architectural modifications, emphasizing the\npivotal role of data quality in enhancing model performance and fairness. This\nresearch contributes to developing more robust evaluation benchmarks for\ndebiasing methods by incorporating metrics for bias reduction and overall model\nefficacy.\n","authors":["Shuo Yang","Bardh Prenkaj","Gjergji Kasneci"],"pdf_url":"https://arxiv.org/pdf/2412.07675v3.pdf","comment":"Shuo and Bardh contributed equally. Accepted to AAAI'25, Paper #17117"},{"id":"http://arxiv.org/abs/2412.14689v1","updated":"2024-12-19T09:43:39Z","published":"2024-12-19T09:43:39Z","title":"How to Synthesize Text Data without Model Collapse?","summary":" Model collapse in synthetic data indicates that iterative training on\nself-generated data leads to a gradual decline in performance. With the\nproliferation of AI models, synthetic data will fundamentally reshape the web\ndata ecosystem. Future GPT-$\\{n\\}$ models will inevitably be trained on a blend\nof synthetic and human-produced data. In this paper, we focus on two questions:\nwhat is the impact of synthetic data on language model training, and how to\nsynthesize data without model collapse? 
We first pre-train language models\nacross different proportions of synthetic data, revealing a negative\ncorrelation between the proportion of synthetic data and model performance. We\nfurther conduct statistical analysis on synthetic data to uncover\ndistributional shift phenomenon and over-concentration of n-gram features.\nInspired by the above findings, we propose token editing on human-produced data\nto obtain semi-synthetic data. As a proof of concept, we theoretically\ndemonstrate that token-level editing can prevent model collapse, as the test\nerror is constrained by a finite upper bound. We conduct extensive experiments\non pre-training from scratch, continual pre-training, and supervised\nfine-tuning. The results validate our theoretical proof that token-level\nediting improves data quality and enhances model performance.\n","authors":["Xuekai Zhu","Daixuan Cheng","Hengli Li","Kaiyan Zhang","Ermo Hua","Xingtai Lv","Ning Ding","Zhouhan Lin","Zilong Zheng","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.14689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14686v1","updated":"2024-12-19T09:40:17Z","published":"2024-12-19T09:40:17Z","title":"Each Fake News is Fake in its Own Way: An Attribution Multi-Granularity\n Benchmark for Multimodal Fake News Detection","summary":" Social platforms, while facilitating access to information, have also become\nsaturated with a plethora of fake news, resulting in negative consequences.\nAutomatic multimodal fake news detection is a worthwhile pursuit. Existing\nmultimodal fake news datasets only provide binary labels of real or fake.\nHowever, real news is alike, while each fake news is fake in its own way. These\ndatasets fail to reflect the mixed nature of various types of multimodal fake\nnews. To bridge the gap, we construct an attributing multi-granularity\nmultimodal fake news detection dataset \\amg, revealing the inherent fake\npattern. Furthermore, we propose a multi-granularity clue alignment model \\our\nto achieve multimodal fake news detection and attribution. Experimental results\ndemonstrate that \\amg is a challenging dataset, and its attribution setting\nopens up new avenues for future research.\n","authors":["Hao Guo","Zihan Ma","Zhi Zeng","Minnan Luo","Weixin Zeng","Jiuyang Tang","Xiang Zhao"],"pdf_url":"https://arxiv.org/pdf/2412.14686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14675v1","updated":"2024-12-19T09:29:08Z","published":"2024-12-19T09:29:08Z","title":"LLMs as mediators: Can they diagnose conflicts accurately?","summary":" Prior research indicates that to be able to mediate conflict, observers of\ndisagreements between parties must be able to reliably distinguish the sources\nof their disagreement as stemming from differences in beliefs about what is\ntrue (causality) vs. differences in what they value (morality). In this paper,\nwe test if OpenAI's Large Language Models GPT 3.5 and GPT 4 can perform this\ntask and whether one or other type of disagreement proves particularly\nchallenging for LLM's to diagnose. We replicate study 1 in Ko\\c{c}ak et al.\n(2003), which employes a vignette design, with OpenAI's GPT 3.5 and GPT 4. We\nfind that both LLMs have similar semantic understanding of the distinction\nbetween causal and moral codes as humans and can reliably distinguish between\nthem. 
When asked to diagnose the source of disagreement in a conversation, both\nLLMs, compared to humans, exhibit a tendency to overestimate the extent of\ncausal disagreement and underestimate the extent of moral disagreement in the\nmoral misalignment condition. This tendency is especially pronounced for GPT 4\nwhen using a proximate scale that relies on concrete language specific to an\nissue. GPT 3.5 does not perform as well as GPT4 or humans when using either the\nproximate or the distal scale. The study provides a first test of the potential\nfor using LLMs to mediate conflict by diagnosing the root of disagreements in\ncausal and evaluative codes.\n","authors":["Özgecan Koçak","Phanish Puranam","Afşar Yegin"],"pdf_url":"https://arxiv.org/pdf/2412.14675v1.pdf","comment":"27 pages, 2 appendices, 21 tables (incl appendices)"},{"id":"http://arxiv.org/abs/2412.06926v3","updated":"2024-12-19T09:24:39Z","published":"2024-12-09T19:11:54Z","title":"When Every Token Counts: Optimal Segmentation for Low-Resource Language\n Models","summary":" Traditional greedy tokenization methods have been a critical step in Natural\nLanguage Processing (NLP), influencing how text is converted into tokens and\ndirectly impacting model performance. While subword tokenizers like Byte-Pair\nEncoding (BPE) are widely used, questions remain about their optimality across\nmodel scales and languages. In this work, we demonstrate through extensive\nexperiments that an optimal BPE configuration significantly reduces token count\ncompared to greedy segmentation, yielding improvements in token-saving\npercentages and performance benefits, particularly for smaller models. We\nevaluate tokenization performance across various intrinsic and extrinsic tasks,\nincluding generation and classification. Our findings suggest that\ncompression-optimized tokenization strategies could provide substantial\nadvantages for multilingual and low-resource language applications,\nhighlighting a promising direction for further research and inclusive NLP.\n","authors":["Bharath Raj S","Garvit Suri","Vikrant Dewangan","Raghav Sonavane"],"pdf_url":"https://arxiv.org/pdf/2412.06926v3.pdf","comment":"LoResLM @ COLING 2025"},{"id":"http://arxiv.org/abs/2412.14670v1","updated":"2024-12-19T09:21:39Z","published":"2024-12-19T09:21:39Z","title":"Analysis and Visualization of Linguistic Structures in Large Language\n Models: Neural Representations of Verb-Particle Constructions in BERT","summary":" This study investigates the internal representations of verb-particle\ncombinations within transformer-based large language models (LLMs),\nspecifically examining how these models capture lexical and syntactic nuances\nat different neural network layers. Employing the BERT architecture, we analyse\nthe representational efficacy of its layers for various verb-particle\nconstructions such as 'agree on', 'come back', and 'give up'. Our methodology\nincludes a detailed dataset preparation from the British National Corpus,\nfollowed by extensive model training and output analysis through techniques\nlike multi-dimensional scaling (MDS) and generalized discrimination value (GDV)\ncalculations. Results show that BERT's middle layers most effectively capture\nsyntactic structures, with significant variability in representational accuracy\nacross different verb categories. 
These findings challenge the conventional\nuniformity assumed in neural network processing of linguistic elements and\nsuggest a complex interplay between network architecture and linguistic\nrepresentation. Our research contributes to a better understanding of how deep\nlearning models comprehend and process language, offering insights into the\npotential and limitations of current neural approaches to linguistic analysis.\nThis study not only advances our knowledge in computational linguistics but\nalso prompts further research into optimizing neural architectures for enhanced\nlinguistic precision.\n","authors":["Hassane Kissane","Achim Schilling","Patrick Krauss"],"pdf_url":"https://arxiv.org/pdf/2412.14670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14660v1","updated":"2024-12-19T09:10:07Z","published":"2024-12-19T09:10:07Z","title":"Unveiling Uncertainty: A Deep Dive into Calibration and Performance of\n Multimodal Large Language Models","summary":" Multimodal large language models (MLLMs) combine visual and textual data for\ntasks such as image captioning and visual question answering. Proper\nuncertainty calibration is crucial, yet challenging, for reliable use in areas\nlike healthcare and autonomous driving. This paper investigates representative\nMLLMs, focusing on their calibration across various scenarios, including before\nand after visual fine-tuning, as well as before and after multimodal training\nof the base LLMs. We observed miscalibration in their performance, and at the\nsame time, no significant differences in calibration across these scenarios. We\nalso highlight how uncertainty differs between text and images and how their\nintegration affects overall uncertainty. To better understand MLLMs'\nmiscalibration and their ability to self-assess uncertainty, we construct the\nIDK (I don't know) dataset, which is key to evaluating how they handle\nunknowns. Our findings reveal that MLLMs tend to give answers rather than admit\nuncertainty, but this self-assessment improves with proper prompt adjustments.\nFinally, to calibrate MLLMs and enhance model reliability, we propose\ntechniques such as temperature scaling and iterative prompt optimization. Our\nresults provide insights into improving MLLMs for effective and responsible\ndeployment in multimodal applications. Code and IDK dataset:\n\\href{https://github.com/hfutml/Calibration-MLLM}{https://github.com/hfutml/Calibration-MLLM}.\n","authors":["Zijun Chen","Wenbo Hu","Guande He","Zhijie Deng","Zheng Zhang","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2412.14660v1.pdf","comment":"Accepted to COLING 2025"},{"id":"http://arxiv.org/abs/2412.14656v1","updated":"2024-12-19T09:07:38Z","published":"2024-12-19T09:07:38Z","title":"Length Controlled Generation for Black-box LLMs","summary":" Large language models (LLMs) have demonstrated impressive instruction\nfollowing capabilities, while still struggling to accurately manage the length\nof the generated text, which is a fundamental requirement in many real-world\napplications. Existing length control methods involve fine-tuning the\nparameters of LLMs, which is inefficient and suboptimal for practical use. In\nthis paper, we propose a novel iterative sampling framework for text length\ncontrol, integrating the Metropolis-Hastings algorithm with an importance\nsampling acceleration strategy. 
This framework efficiently and reliably\nregulates LLMs to generate length-constrained text without modifying the\nunderlying parameters, thereby preserving the original capabilities of LLMs.\nExperimental results demonstrate that our framework achieves almost 100\\%\nsuccess rates of length control on Llama3.1 for tasks such as length-controlled\nabstractive summarization and length-constrained instruction following, with\nminimal additional computational overhead. This also highlights the significant\npotential of our method for precise length control across a broader range of\napplications, without compromising the versatility of LLMs.\n","authors":["Yuxuan Gu","Wenjie Wang","Xiaocheng Feng","Weihong Zhong","Kun Zhu","Lei Huang","Tat-Seng Chua","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2412.14656v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2412.14642v1","updated":"2024-12-19T08:51:16Z","published":"2024-12-19T08:51:16Z","title":"TOMG-Bench: Evaluating LLMs on Text-based Open Molecule Generation","summary":" In this paper, we propose Text-based Open Molecule Generation Benchmark\n(TOMG-Bench), the first benchmark to evaluate the open-domain molecule\ngeneration capability of LLMs. TOMG-Bench encompasses a dataset of three major\ntasks: molecule editing (MolEdit), molecule optimization (MolOpt), and\ncustomized molecule generation (MolCustom). Each task further contains three\nsubtasks, with each subtask comprising 5,000 test samples. Given the inherent\ncomplexity of open molecule generation, we have also developed an automated\nevaluation system that helps measure both the quality and the accuracy of the\ngenerated molecules. Our comprehensive benchmarking of 25 LLMs reveals the\ncurrent limitations and potential areas for improvement in text-guided molecule\ndiscovery. Furthermore, with the assistance of OpenMolIns, a specialized\ninstruction tuning dataset proposed for solving challenges raised by\nTOMG-Bench, Llama3.1-8B could outperform all the open-source general LLMs, even\nsurpassing GPT-3.5-turbo by 46.5\\% on TOMG-Bench. Our codes and datasets are\navailable through https://github.com/phenixace/TOMG-Bench.\n","authors":["Jiatong Li","Junxian Li","Yunqing Liu","Dongzhan Zhou","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2412.14642v1.pdf","comment":"A benchmark for text-based open molecule generation"},{"id":"http://arxiv.org/abs/2409.17603v2","updated":"2024-12-19T08:39:05Z","published":"2024-09-26T07:40:03Z","title":"Deep CLAS: Deep Contextual Listen, Attend and Spell","summary":" Contextual-LAS (CLAS) has been shown effective in improving Automatic Speech\nRecognition (ASR) of rare words. It relies on phrase-level contextual modeling\nand attention-based relevance scoring without an explicit contextual constraint,\nwhich leads to insufficient use of contextual information. In this work, we\npropose deep CLAS to use contextual information better. We introduce a bias loss\nforcing the model to focus on contextual information. The query of bias attention\nis also enriched to improve the accuracy of the bias attention score. To get\nfine-grained contextual information, we replace phrase-level encoding with\ncharacter-level encoding and encode contextual information with a conformer\nrather than an LSTM. Moreover, we directly use the bias attention score to correct\nthe output probability distribution of the model. Experiments are conducted using\nthe public AISHELL-1 and AISHELL-NER datasets. 
On AISHELL-1, compared to CLAS baselines, deep CLAS\nobtains a 65.78% relative recall and a 53.49% relative F1-score increase in the\nnamed entity recognition scene.\n","authors":["Mengzhi Wang","Shifu Xiong","Genshun Wan","Hang Chen","Jianqing Gao","Lirong Dai"],"pdf_url":"https://arxiv.org/pdf/2409.17603v2.pdf","comment":"Submitted to JUSTC"},{"id":"http://arxiv.org/abs/2412.14626v1","updated":"2024-12-19T08:28:18Z","published":"2024-12-19T08:28:18Z","title":"Learning to Generate Research Idea with Dynamic Control","summary":" The rapid advancements in large language models (LLMs) have demonstrated\ntheir potential to accelerate scientific discovery, particularly in automating\nthe process of research ideation. LLM-based systems have shown promise in\ngenerating hypotheses and research ideas. However, current approaches\npredominantly rely on prompting-based pre-trained models, limiting their\nability to optimize generated content effectively. Moreover, they also lack the\ncapability to deal with the complex interdependence and inherent restrictions\namong novelty, feasibility, and effectiveness, which remains challenging due to\nthe inherent trade-offs among these dimensions, such as the\ninnovation-feasibility conflict. To address these limitations, we for the first\ntime propose fine-tuning LLMs to be better idea proposers and introduce a novel\nframework that employs a two-stage approach combining Supervised Fine-Tuning\n(SFT) and controllable Reinforcement Learning (RL). In the SFT stage, the model\nlearns foundational patterns from pairs of research papers and follow-up ideas.\nIn the RL stage, multi-dimensional reward modeling, guided by fine-grained\nfeedback, evaluates and optimizes the generated ideas across key metrics.\nDimensional controllers enable dynamic adjustment of generation, while a\nsentence-level decoder ensures context-aware emphasis during inference. Our\nframework provides a balanced approach to research ideation, achieving\nhigh-quality outcomes by dynamically navigating the trade-offs among novelty,\nfeasibility, and effectiveness.\n","authors":["Ruochen Li","Liqiang Jing","Chi Han","Jiawei Zhou","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2412.14626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14011v2","updated":"2024-12-19T08:27:54Z","published":"2024-12-18T16:29:45Z","title":"Towards an optimised evaluation of teachers' discourse: The case of\n engaging messages","summary":" Evaluating teachers' skills is crucial for enhancing education quality and\nstudent outcomes. Teacher discourse, significantly influencing student\nperformance, is a key component. However, coding this discourse can be\nlaborious. This study addresses this issue by introducing a new methodology for\noptimising the assessment of teacher discourse. The research consisted of two\nstudies, both within the framework of engaging messages used by secondary\neducation teachers. The first study involved training two large language models\non real-world examples from audio-recorded lessons over two academic years to\nidentify and classify the engaging messages from the lessons' transcripts. This\nresulted in sensitivities of 84.31% and 91.11%, and specificities of 97.69% and\n86.36% in identification and classification, respectively. The second study\napplied these models to transcripts of audio-recorded lessons from a third\nacademic year to examine the frequency and distribution of message types by\neducational level and moment of the academic year. 
Results showed teachers\npredominantly use messages emphasising engagement benefits, linked to improved\noutcomes, while one-third highlighted non-engagement disadvantages, associated\nwith increased anxiety. The use of engaging messages declined in Grade 12 and\ntowards the academic year's end. These findings suggest potential interventions\nto optimise engaging message use, enhancing teaching quality and student\noutcomes.\n","authors":["Samuel Falcon","Jaime Leon"],"pdf_url":"https://arxiv.org/pdf/2412.14011v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14617v1","updated":"2024-12-19T08:06:09Z","published":"2024-12-19T08:06:09Z","title":"How good is GPT at writing political speeches for the White House?","summary":" Using large language models (LLMs), computers are able to generate a written\ntext in response to a user request. As this pervasive technology can be\napplied in numerous contexts, this study analyses the written style of one LLM\ncalled GPT by comparing its generated speeches with those of the recent US\npresidents. To achieve this objective, the State of the Union (SOTU) addresses\nwritten by Reagan to Biden are contrasted to those produced by both GPT-3.5 and\nGPT-4o versions. Compared to US presidents, GPT tends to overuse the lemma\n\"we\" and produce shorter messages with, on average, longer sentences. Moreover,\nGPT adopts an optimistic tone, opting more often for political (e.g.,\npresident, Congress), symbolic (e.g., freedom), and abstract terms (e.g.,\nfreedom). Even when imposing an author's style on GPT, the resulting speech\nremains distinct from addresses written by the target author. Finally, the two\nGPT versions present distinct characteristics, but both appear overall\ndissimilar to true presidential messages.\n","authors":["Jacques Savoy"],"pdf_url":"https://arxiv.org/pdf/2412.14617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14613v1","updated":"2024-12-19T08:03:16Z","published":"2024-12-19T08:03:16Z","title":"HarmonicEval: Multi-modal, Multi-task, Multi-criteria Automatic\n Evaluation Using a Vision Language Model","summary":" Vision-language models (VLMs) have shown impressive abilities in text and\nimage understanding. However, existing metrics for evaluating the text\ngenerated by VLMs focus exclusively on overall quality, leading to two\nlimitations: 1) it is challenging to identify which aspects of the text need\nimprovement from the overall score; 2) metrics may overlook specific evaluation\ncriteria when predicting an overall score. To address these limitations, we\npropose HarmonicEval, a reference-free evaluation metric that aggregates\ncriterion-wise scores to produce the overall score in a bottom-up manner.\nFurthermore, we construct the Multi-task Multi-criteria Human Evaluation (MMHE)\ndataset, which comprises 18,000 expert human judgments across four\nvision-language tasks. Our experiments demonstrate that HarmonicEval achieves\nhigher correlations with human judgments than conventional metrics while\nproviding numerical scores for each criterion.\n","authors":["Masanari Ohi","Masahiro Kaneko","Naoaki Okazaki","Nakamasa Inoue"],"pdf_url":"https://arxiv.org/pdf/2412.14613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14612v1","updated":"2024-12-19T08:02:08Z","published":"2024-12-19T08:02:08Z","title":"KARRIEREWEGE: A Large Scale Career Path Prediction Dataset","summary":" Accurate career path prediction can support many stakeholders, like job\nseekers, recruiters, HR, and project managers. 
However, publicly available data\nand tools for career path prediction are scarce. In this work, we introduce\nKARRIEREWEGE, a comprehensive, publicly available dataset containing over 500k\ncareer paths, significantly surpassing the size of previously available\ndatasets. We link the dataset to the ESCO taxonomy to offer a valuable resource\nfor predicting career trajectories. To tackle the problem of free-text inputs\ntypically found in resumes, we enhance it by synthesizing job titles and\ndescriptions resulting in KARRIEREWEGE+. This allows for accurate predictions\nfrom unstructured data, closely aligning with real-world application\nchallenges. We benchmark existing state-of-the-art (SOTA) models on our dataset\nand a prior benchmark and observe improved performance and robustness,\nparticularly for free-text use cases, due to the synthesized data.\n","authors":["Elena Senger","Yuri Campbell","Rob van der Goot","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2412.14612v1.pdf","comment":"Accepted at COLING Industry Track"},{"id":"http://arxiv.org/abs/2412.14596v1","updated":"2024-12-19T07:31:40Z","published":"2024-12-19T07:31:40Z","title":"LDP: Generalizing to Multilingual Visual Information Extraction by\n Language Decoupled Pretraining","summary":" Visual Information Extraction (VIE) plays a crucial role in the comprehension\nof semi-structured documents, and several pre-trained models have been\ndeveloped to enhance performance. However, most of these works are monolingual\n(usually English). Due to the extremely unbalanced quantity and quality of\npre-training corpora between English and other languages, few works can extend\nto non-English scenarios. In this paper, we conduct systematic experiments to\nshow that vision and layout modality hold invariance among images with\ndifferent languages. If decoupling language bias from document images, a\nvision-layout-based model can achieve impressive cross-lingual generalization.\nAccordingly, we present a simple but effective multilingual training paradigm\nLDP (Language Decoupled Pre-training) for better utilization of monolingual\npre-training data. Our proposed model LDM (Language Decoupled Model) is first\npre-trained on the language-independent data, where the language knowledge is\ndecoupled by a diffusion model, and then the LDM is fine-tuned on the\ndownstream languages. Extensive experiments show that the LDM outperformed all\nSOTA multilingual pre-trained models, and also maintains competitiveness on\ndownstream monolingual/English benchmarks.\n","authors":["Huawen Shen","Gengluo Li","Jinwen Zhong","Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.14596v1.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2411.12262v2","updated":"2024-12-19T07:29:23Z","published":"2024-11-19T06:21:51Z","title":"Low-resource Machine Translation: what for? who for? An observational\n study on a dedicated Tetun language translation service","summary":" Low-resource machine translation (MT) presents a diversity of community needs\nand application challenges that remain poorly understood. To complement surveys\nand focus groups, which tend to rely on small samples of respondents, we\npropose an observational study on actual usage patterns of a specialized MT\nservice for the Tetun language, which is the lingua franca in Timor-Leste. Our\nanalysis of 100,000 translation requests reveals patterns that challenge\nassumptions based on existing corpora. 
We find that users, many of them\nstudents on mobile devices, typically translate text from a high-resource\nlanguage into Tetun across diverse domains including science, healthcare, and\ndaily life. This contrasts sharply with available Tetun corpora, which are\ndominated by news articles covering government and social issues. Our results\nsuggest that MT systems for minority languages like Tetun should prioritize\naccuracy on domains relevant to educational contexts, in the high-resource to\nlow-resource direction. More broadly, this study demonstrates how observational\nanalysis can inform low-resource language technology development, by grounding\nresearch in practical community needs.\n","authors":["Raphael Merx","Adérito José Guterres Correia","Hanna Suominen","Ekaterina Vylomova"],"pdf_url":"https://arxiv.org/pdf/2411.12262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14588v1","updated":"2024-12-19T07:14:13Z","published":"2024-12-19T07:14:13Z","title":"Beyond Guilt: Legal Judgment Prediction with Trichotomous Reasoning","summary":" In legal practice, judges apply the trichotomous dogmatics of criminal law,\nsequentially assessing the elements of the offense, unlawfulness, and\nculpability to determine whether an individual's conduct constitutes a crime.\nAlthough current legal large language models (LLMs) show promising accuracy in\njudgment prediction, they lack trichotomous reasoning capabilities due to the\nabsence of an appropriate benchmark dataset, preventing them from predicting\ninnocent outcomes. As a result, every input is automatically assigned a charge,\nlimiting their practical utility in legal contexts. To bridge this gap, we\nintroduce LJPIV, the first benchmark dataset for Legal Judgment Prediction with\nInnocent Verdicts. Adhering to the trichotomous dogmatics, we extend three\nwidely-used legal datasets through LLM-based augmentation and manual\nverification. Our experiments with state-of-the-art legal LLMs and novel\nstrategies that integrate trichotomous reasoning into zero-shot prompting and\nfine-tuning reveal: (1) current legal LLMs have significant room for\nimprovement, with even the best models achieving an F1 score of less than 0.3\non LJPIV; and (2) our strategies notably enhance both in-domain and\ncross-domain judgment prediction accuracy, especially for cases resulting in an\ninnocent verdict.\n","authors":["Kepu Zhang","Haoyue Yang","Xu Tang","Weijie Yu","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2412.14588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14584v1","updated":"2024-12-19T07:06:01Z","published":"2024-12-19T07:06:01Z","title":"Simulation-Free Hierarchical Latent Policy Planning for Proactive\n Dialogues","summary":" Recent advancements in proactive dialogues have garnered significant\nattention, particularly for more complex objectives (e.g. emotion support and\npersuasion). Unlike traditional task-oriented dialogues, proactive dialogues\ndemand advanced policy planning and adaptability, requiring rich scenarios and\ncomprehensive policy repositories to develop such systems. However, existing\napproaches tend to rely on Large Language Models (LLMs) for user simulation and\nonline learning, leading to biases that diverge from realistic scenarios and\nresult in suboptimal efficiency. Moreover, these methods depend on manually\ndefined, context-independent, coarse-grained policies, which not only incur\nhigh expert costs but also raise concerns regarding their completeness. 
In our\nwork, we highlight the potential for automatically discovering policies\ndirectly from raw, real-world dialogue records. To this end, we introduce a\nnovel dialogue policy planning framework, LDPP. It fully automates the process\nfrom mining policies in dialogue records to learning policy planning.\nSpecifically, we employ a variant of the Variational Autoencoder to discover\nfine-grained policies represented as latent vectors. After automatically\nannotating the data with these latent policy labels, we propose an Offline\nHierarchical Reinforcement Learning (RL) algorithm in the latent space to\ndevelop effective policy planning capabilities. Our experiments demonstrate\nthat LDPP outperforms existing methods on two proactive scenarios, even\nsurpassing ChatGPT with only a 1.8-billion-parameter LLM.\n","authors":["Tao He","Lizi Liao","Yixin Cao","Yuanxing Liu","Yiheng Sun","Zerui Chen","Ming Liu","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2412.14584v1.pdf","comment":"24 pages, 5 figures, AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14581v1","updated":"2024-12-19T07:01:25Z","published":"2024-12-19T07:01:25Z","title":"CORD: Balancing COnsistency and Rank Distillation for Robust\n Retrieval-Augmented Generation","summary":" With the adoption of retrieval-augmented generation (RAG), large language\nmodels (LLMs) are expected to ground their generation to the retrieved\ncontexts. Yet, this is hindered by position bias of LLMs, failing to evenly\nattend to all contexts. Previous work has addressed this by synthesizing\ncontexts with perturbed positions of the gold segment, creating a\nposition-diversified train set. We extend this intuition to propose consistency\nregularization with augmentation and distillation. First, we augment each\ntraining instance with its position perturbation to encourage consistent\npredictions, regardless of ordering. We also distill behaviors of this pair,\nalthough it can be counterproductive in certain RAG scenarios where the given\norder from the retriever is crucial for generation quality. We thus propose\nCORD, balancing COnsistency and Rank Distillation. CORD adaptively samples\nnoise-controlled perturbations from an interpolation space, ensuring both\nconsistency and respect for the rank prior. Empirical results show this balance\nenables CORD to outperform consistently in diverse RAG benchmarks.\n","authors":["Youngwon Lee","Seung-won Hwang","Daniel Campos","Filip Graliński","Zhewei Yao","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2412.14581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.07629v2","updated":"2024-12-19T06:53:37Z","published":"2024-12-10T16:08:14Z","title":"Piece of Table: A Divide-and-Conquer Approach for Selecting Sub-Tables\n in Table Question Answering","summary":" Applying language models (LMs) to tables is challenging due to the inherent\nstructural differences between two-dimensional tables and one-dimensional text\nfor which the LMs were originally designed. Furthermore, when applying\nlinearized tables to LMs, the maximum token lengths often imposed in\nself-attention calculations make it difficult to comprehensively understand the\ncontext spread across large tables. To address these challenges, we present\nPieTa (Piece of Table), a new framework for sub-table-based question answering\n(QA). PieTa operates through an iterative process of dividing tables into\nsmaller windows, using LMs to select relevant cells within each window, and\nmerging these cells into a sub-table. 
This multi-resolution approach captures\ndependencies across multiple rows and columns while avoiding the limitations\ncaused by long context inputs. Instantiated as a simple iterative sub-table\nunion algorithm, PieTa demonstrates improved performance over previous\nsub-table-based QA approaches.\n","authors":["Wonjin Lee","Kyumin Kim","Sungjae Lee","Jihun Lee","Kwang In Kim"],"pdf_url":"https://arxiv.org/pdf/2412.07629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14574v1","updated":"2024-12-19T06:44:59Z","published":"2024-12-19T06:44:59Z","title":"Sliding Windows Are Not the End: Exploring Full Ranking with\n Long-Context Large Language Models","summary":" Large Language Models (LLMs) have shown exciting performance in listwise\npassage ranking. Due to the limited input length, existing methods often adopt\nthe sliding window strategy. Such a strategy, though effective, is inefficient\nas it involves repetitive and serialized processing, which usually re-evaluates\nrelevant passages multiple times. As a result, it incurs redundant API costs,\nwhich are proportional to the number of inference tokens. The development of\nlong-context LLMs enables the full ranking of all passages within a single\ninference, avoiding redundant API costs. In this paper, we conduct a\ncomprehensive study of long-context LLMs for ranking tasks in terms of\nefficiency and effectiveness. Surprisingly, our experiments reveal that full\nranking with long-context LLMs can deliver superior performance in the\nsupervised fine-tuning setting with a huge efficiency improvement. Furthermore,\nwe identify two limitations of fine-tuning the full ranking model based on\nexisting methods: (1) sliding window strategy fails to produce a full ranking\nlist as a training label, and (2) the language modeling loss cannot emphasize\ntop-ranked passage IDs in the label. To alleviate these issues, we propose a\nnew complete listwise label construction approach and a novel importance-aware\nlearning objective for full ranking. Experiments show the superior performance\nof our method over baselines. Our codes are available at\n\\url{https://github.com/8421BCD/fullrank}.\n","authors":["Wenhan Liu","Xinyu Ma","Yutao Zhu","Ziliang Zhao","Shuaiqiang Wang","Dawei Yin","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2412.14574v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.07663v2","updated":"2024-12-19T06:34:31Z","published":"2024-08-14T16:51:21Z","title":"Alignment-Enhanced Decoding:Defending via Token-Level Adaptive Refining\n of Probability Distributions","summary":" Large language models are susceptible to jailbreak attacks, which can result\nin the generation of harmful content. While prior defenses mitigate these risks\nby perturbing or inspecting inputs, they ignore competing objectives, the\nunderlying cause of alignment failures. In this paper, we propose\nAlignment-Enhanced Decoding (AED), a novel defense that employs adaptive\ndecoding to address the root causes of jailbreak issues. We first define the\nCompetitive Index to quantify alignment failures and utilize feedback from\nself-evaluation to compute post-alignment logits. Then, AED adaptively combines\nAED and post-alignment logits with the original logits to obtain harmless and\nhelpful distributions. Consequently, our method enhances safety alignment while\nmaintaining helpfulness. We conduct experiments across five models and four\ncommon jailbreaks, with the results validating the effectiveness of our\napproach. 
Code is available at https://github.com/GIGABaozi/AED.git.\n","authors":["Quan Liu","Zhenhong Zhou","Longzhu He","Yi Liu","Wei Zhang","Sen Su"],"pdf_url":"https://arxiv.org/pdf/2408.07663v2.pdf","comment":"Accepted by EMNLP 2024, 15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2412.13663v2","updated":"2024-12-19T06:32:26Z","published":"2024-12-18T09:39:44Z","title":"Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for\n Fast, Memory Efficient, and Long Context Finetuning and Inference","summary":" Encoder-only transformer models such as BERT offer a great performance-size\ntradeoff for retrieval and classification tasks with respect to larger\ndecoder-only models. Despite being the workhorse of numerous production\npipelines, there have been limited Pareto improvements to BERT since its\nrelease. In this paper, we introduce ModernBERT, bringing modern model\noptimizations to encoder-only models and representing a major Pareto\nimprovement over older encoders. Trained on 2 trillion tokens with a native\n8192 sequence length, ModernBERT models exhibit state-of-the-art results on a\nlarge pool of evaluations encompassing diverse classification tasks and both\nsingle and multi-vector retrieval on different domains (including code). In\naddition to strong downstream performance, ModernBERT is also the most speed\nand memory efficient encoder and is designed for inference on common GPUs.\n","authors":["Benjamin Warner","Antoine Chaffin","Benjamin Clavié","Orion Weller","Oskar Hallström","Said Taghadouini","Alexis Gallagher","Raja Biswas","Faisal Ladhak","Tom Aarsen","Nathan Cooper","Griffin Adams","Jeremy Howard","Iacopo Poli"],"pdf_url":"https://arxiv.org/pdf/2412.13663v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19813v3","updated":"2024-12-19T06:27:44Z","published":"2024-07-29T09:05:10Z","title":"Improving Retrieval Augmented Language Model with Self-Reasoning","summary":" The Retrieval-Augmented Language Model (RALM) has shown remarkable\nperformance on knowledge-intensive tasks by incorporating external knowledge\nduring inference, which mitigates the factual hallucinations inherited in large\nlanguage models (LLMs). Despite these advancements, challenges persist in the\nimplementation of RALMs, particularly concerning their reliability and\ntraceability. To be specific, the irrelevant document retrieval may result in\nunhelpful response generation or even deteriorate the performance of LLMs,\nwhile the lack of proper citations in generated outputs complicates efforts to\nverify the trustworthiness of the models. To this end, we propose a novel\nself-reasoning framework aimed at improving the reliability and traceability of\nRALMs, whose core idea is to leverage reasoning trajectories generated by the\nLLM itself. The framework involves constructing self-reason trajectories with\nthree processes: a relevance-aware process, an evidence-aware selective\nprocess, and a trajectory analysis process. 
We have evaluated our framework\nacross four public datasets (two short-form QA datasets, one long-form QA\ndataset, and one fact verification dataset) to demonstrate the superiority of\nour method, which can outperform existing state-of-the-art models and can\nachieve comparable performance with GPT-4, while only using 2,000 training\nsamples.\n","authors":["Yuan Xia","Jingbo Zhou","Zhenhui Shi","Jun Chen","Haifeng Huang"],"pdf_url":"https://arxiv.org/pdf/2407.19813v3.pdf","comment":"AAAI 2025 (main conference)"},{"id":"http://arxiv.org/abs/2412.14556v1","updated":"2024-12-19T06:14:20Z","published":"2024-12-19T06:14:20Z","title":"CitaLaw: Enhancing LLM with Citations in Legal Domain","summary":" In this paper, we propose CitaLaw, the first benchmark designed to evaluate\nLLMs' ability to produce legally sound responses with appropriate citations.\nCitaLaw features a diverse set of legal questions for both laypersons and\npractitioners, paired with a comprehensive corpus of law articles and precedent\ncases as a reference pool. This framework enables LLM-based systems to retrieve\nsupporting citations from the reference corpus and align these citations with\nthe corresponding sentences in their responses. Moreover, we introduce\nsyllogism-inspired evaluation methods to assess the legal alignment between\nretrieved references and LLM-generated responses, as well as their consistency\nwith user questions. Extensive experiments on 2 open-domain and 7\nlegal-specific LLMs demonstrate that integrating legal references substantially\nenhances response quality. Furthermore, our proposed syllogism-based evaluation\nmethod exhibits strong agreement with human judgments.\n","authors":["Kepu Zhang","Weijie Yu","Sunhao Dai","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2412.14556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.08125v2","updated":"2024-12-19T05:46:29Z","published":"2024-12-11T06:21:33Z","title":"Progressive Multi-granular Alignments for Grounded Reasoning in Large\n Vision-Language Models","summary":" Existing Large Vision-Language Models (LVLMs) excel at matching concepts\nacross multi-modal inputs but struggle with compositional concepts and\nhigh-level relationships between entities. This paper introduces Progressive\nmulti-granular Vision-Language alignments (PromViL), a novel framework to\nenhance LVLMs' ability in performing grounded compositional visual reasoning\ntasks. Our approach constructs a hierarchical structure of multi-modal\nalignments, ranging from simple to complex concepts. By progressively aligning\ntextual descriptions with corresponding visual regions, our model learns to\nleverage contextual information from lower levels to inform higher-level\nreasoning. To facilitate this learning process, we introduce a data generation\nprocess that creates a novel dataset derived from Visual Genome, providing a\nwide range of nested compositional vision-language pairs. Experimental results\ndemonstrate that our PromViL framework significantly outperforms baselines on\nvarious visual grounding and compositional question answering tasks. 
The code\nis available at: https://github.com/lqh52/PromViL.\n","authors":["Quang-Hung Le","Long Hoang Dang","Ngan Le","Truyen Tran","Thao Minh Le"],"pdf_url":"https://arxiv.org/pdf/2412.08125v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.00451v3","updated":"2024-12-19T05:32:59Z","published":"2024-10-01T07:11:55Z","title":"Unleashing the Unseen: Harnessing Benign Datasets for Jailbreaking Large\n Language Models","summary":" Despite significant ongoing efforts in safety alignment, large language\nmodels (LLMs) such as GPT-4 and LLaMA 3 remain vulnerable to jailbreak attacks\nthat can induce harmful behaviors, including through the use of adversarial\nsuffixes. Building on prior research, we hypothesize that these adversarial\nsuffixes are not mere bugs but may represent features that can dominate the\nLLM's behavior. To evaluate this hypothesis, we conduct several experiments.\nFirst, we demonstrate that benign features can be effectively made to function\nas adversarial suffixes, i.e., we develop a feature extraction method to\nextract sample-agnostic features from a benign dataset in the form of suffixes\nand show that these suffixes may effectively compromise safety alignment.\nSecond, we show that adversarial suffixes generated from jailbreak attacks may\ncontain meaningful features, i.e., appending the same suffix to different\nprompts results in responses exhibiting specific characteristics. Third, we\nshow that such benign-yet-safety-compromising features can be easily introduced\nthrough fine-tuning using only benign datasets. As a result, we are able to\ncompletely eliminate GPT's safety alignment in a black-box setting through\nfine-tuning with only benign data. Our code and data are available at\n\\url{https://github.com/suffix-maybe-feature/adver-suffix-maybe-features}.\n","authors":["Wei Zhao","Zhe Li","Yige Li","Jun Sun"],"pdf_url":"https://arxiv.org/pdf/2410.00451v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13362v3","updated":"2024-12-19T05:26:14Z","published":"2024-06-19T09:07:31Z","title":"VisualRWKV: Exploring Recurrent Neural Networks for Visual Language\n Models","summary":" Visual Language Models (VLMs) have rapidly progressed with the recent success\nof large language models. However, there have been few attempts to incorporate\nefficient linear Recurrent Neural Network (RNN) architectures into VLMs. In\nthis study, we introduce VisualRWKV, the first application of a linear RNN\nmodel to multimodal learning tasks, leveraging the pre-trained RWKV language\nmodel. We propose a data-dependent recurrence and sandwich prompts to enhance\nour modeling capabilities, along with a 2D image scanning mechanism to enrich\nthe processing of visual sequences. Extensive experiments demonstrate that\nVisualRWKV achieves competitive performance compared to Transformer-based\nmodels like LLaVA-1.5 on various benchmarks. Compared to LLaVA-1.5, VisualRWKV\nhas a speed advantage of 3.98 times and can save 54% of GPU memory when\nreaching an inference length of 24K tokens. 
To facilitate further research and\nanalysis, we have made the checkpoints and the associated code publicly\naccessible at the following GitHub repository: see\nhttps://github.com/howard-hou/VisualRWKV.\n","authors":["Haowen Hou","Peigen Zeng","Fei Ma","Fei Richard Yu"],"pdf_url":"https://arxiv.org/pdf/2406.13362v3.pdf","comment":"Accepted at COLING 2025 main conference"},{"id":"http://arxiv.org/abs/2412.14533v1","updated":"2024-12-19T05:11:16Z","published":"2024-12-19T05:11:16Z","title":"ClusterTalk: Corpus Exploration Framework using Multi-Dimensional\n Exploratory Search","summary":" Exploratory search of large text corpora is essential in domains like\nbiomedical research, where large amounts of research literature are\ncontinuously generated. This paper presents ClusterTalk (The demo video and\nsource code are available at: https://github.com/achouhan93/ClusterTalk), a\nframework for corpus exploration using multi-dimensional exploratory search.\nOur system integrates document clustering with faceted search, allowing users\nto interactively refine their exploration and ask corpus and document-level\nqueries. Compared to traditional one-dimensional search approaches like keyword\nsearch or clustering, this system improves the discoverability of information\nby encouraging a deeper interaction with the corpus. We demonstrate the\nfunctionality of the ClusterTalk framework based on four million PubMed\nabstracts for the four-year time frame.\n","authors":["Ashish Chouhan","Saifeldin Mandour","Michael Gertz"],"pdf_url":"https://arxiv.org/pdf/2412.14533v1.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2404.01349v2","updated":"2024-12-19T05:05:46Z","published":"2024-03-31T22:22:53Z","title":"Fairness in Large Language Models: A Taxonomic Survey","summary":" Large Language Models (LLMs) have demonstrated remarkable success across\nvarious domains. However, despite their promising performance in numerous\nreal-world applications, most of these algorithms lack fairness considerations.\nConsequently, they may lead to discriminatory outcomes against certain\ncommunities, particularly marginalized populations, prompting extensive study\nin fair LLMs. On the other hand, fairness in LLMs, in contrast to fairness in\ntraditional machine learning, entails exclusive backgrounds, taxonomies, and\nfulfillment techniques. To this end, this survey presents a comprehensive\noverview of recent advances in the existing literature concerning fair LLMs.\nSpecifically, a brief introduction to LLMs is provided, followed by an analysis\nof factors contributing to bias in LLMs. Additionally, the concept of fairness\nin LLMs is discussed categorically, summarizing metrics for evaluating bias in\nLLMs and existing algorithms for promoting fairness. Furthermore, resources for\nevaluating bias in LLMs, including toolkits and datasets, are summarized.\nFinally, existing research challenges and open questions are discussed.\n","authors":["Zhibo Chu","Zichong Wang","Wenbin Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.01349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.08108v2","updated":"2024-12-19T05:01:33Z","published":"2024-12-11T05:23:34Z","title":"Doubly-Universal Adversarial Perturbations: Deceiving Vision-Language\n Models Across Both Images and Text with a Single Perturbation","summary":" Large Vision-Language Models (VLMs) have demonstrated remarkable performance\nacross multimodal tasks by integrating vision encoders with large language\nmodels (LLMs). 
However, these models remain vulnerable to adversarial attacks.\nAmong such attacks, Universal Adversarial Perturbations (UAPs) are especially\npowerful, as a single optimized perturbation can mislead the model across\nvarious input images. In this work, we introduce a novel UAP specifically\ndesigned for VLMs: the Doubly-Universal Adversarial Perturbation (Doubly-UAP),\ncapable of universally deceiving VLMs across both image and text inputs. To\nsuccessfully disrupt the vision encoder's fundamental process, we analyze the\ncore components of the attention mechanism. After identifying value vectors in\nthe middle-to-late layers as the most vulnerable, we optimize Doubly-UAP in a\nlabel-free manner with a frozen model. Despite being developed as a black-box\nto the LLM, Doubly-UAP achieves high attack success rates on VLMs, consistently\noutperforming baseline methods across vision-language tasks. Extensive ablation\nstudies and analyses further demonstrate the robustness of Doubly-UAP and\nprovide insights into how it influences internal attention mechanisms.\n","authors":["Hee-Seon Kim","Minbeom Kim","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2412.08108v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14528v1","updated":"2024-12-19T04:51:06Z","published":"2024-12-19T04:51:06Z","title":"Multi-Level Optimal Transport for Universal Cross-Tokenizer Knowledge\n Distillation on Language Models","summary":" Knowledge distillation (KD) has become a prevalent technique for compressing\nlarge language models (LLMs). Existing KD methods are constrained by the need\nfor identical tokenizers (i.e., vocabularies) between teacher and student\nmodels, limiting their versatility in handling LLMs of different architecture\nfamilies. In this paper, we introduce the Multi-Level Optimal Transport\n(MultiLevelOT), a novel approach that advances the optimal transport for\nuniversal cross-tokenizer knowledge distillation. Our method aligns the logit\ndistributions of the teacher and the student at both token and sequence levels\nusing diverse cost matrices, eliminating the need for dimensional or\ntoken-by-token correspondence. At the token level, MultiLevelOT integrates both\nglobal and local information by jointly optimizing all tokens within a sequence\nto enhance robustness. At the sequence level, we efficiently capture complex\ndistribution structures of logits via the Sinkhorn distance, which approximates\nthe Wasserstein distance for divergence measures. Extensive experiments on\ntasks such as extractive QA, generative QA, and summarization demonstrate that\nthe MultiLevelOT outperforms state-of-the-art cross-tokenizer KD methods under\nvarious settings. Our approach is robust to different student and teacher\nmodels across model families, architectures, and parameter sizes.\n","authors":["Xiao Cui","Mo Zhu","Yulei Qin","Liang Xie","Wengang Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2412.14528v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2411.16646v2","updated":"2024-12-19T04:50:45Z","published":"2024-11-25T18:28:26Z","title":"Self-Generated Critiques Boost Reward Modeling for Language Models","summary":" Reward modeling is crucial for aligning large language models (LLMs) with\nhuman preferences, especially in reinforcement learning from human feedback\n(RLHF). However, current reward models mainly produce scalar scores and\nstruggle to incorporate critiques in a natural language format. 
We hypothesize\nthat predicting both critiques and the scalar reward would improve reward\nmodeling ability. Motivated by this, we propose Critic-RM, a framework that\nimproves reward models using self-generated critiques without extra\nsupervision. Critic-RM employs a two-stage process: generating and filtering\nhigh-quality critiques, followed by joint fine-tuning on reward prediction and\ncritique generation. Experiments across benchmarks show that Critic-RM improves\nreward modeling accuracy by 3.7%-7.3% compared to standard reward models and\nLLM judges, demonstrating strong performance and data efficiency. Additional\nstudies further validate the effectiveness of generated critiques in rectifying\nflawed reasoning steps with 2.5%-3.2% gains in improving reasoning accuracy.\n","authors":["Yue Yu","Zhengxing Chen","Aston Zhang","Liang Tan","Chenguang Zhu","Richard Yuanzhe Pang","Yundi Qian","Xuewei Wang","Suchin Gururangan","Chao Zhang","Melanie Kambadur","Dhruv Mahajan","Rui Hou"],"pdf_url":"https://arxiv.org/pdf/2411.16646v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2412.00869v2","updated":"2024-12-19T04:38:59Z","published":"2024-12-01T16:15:14Z","title":"KnowledgePrompts: Exploring the Abilities of Large Language Models to\n Solve Proportional Analogies via Knowledge-Enhanced Prompting","summary":" Making analogies is fundamental to cognition. Proportional analogies, which\nconsist of four terms, are often used to assess linguistic and cognitive\nabilities. For instance, completing analogies like \"Oxygen is to Gas as \nis to \" requires identifying the semantic relationship (e.g., \"type of\")\nbetween the first pair of terms (\"Oxygen\" and \"Gas\") and finding a second pair\nthat shares the same relationship (e.g., \"Aluminum\" and \"Metal\"). In this work,\nwe introduce a 15K Multiple-Choice Question Answering (MCQA) dataset for\nproportional analogy completion and evaluate the performance of contemporary\nLarge Language Models (LLMs) in various knowledge-enhanced prompt settings.\nSpecifically, we augment prompts with three types of knowledge: exemplar,\nstructured, and targeted. Our results show that despite extensive training\ndata, solving proportional analogies remains challenging for current LLMs, with\nthe best model achieving an accuracy of 55%. Notably, we find that providing\ntargeted knowledge can better assist models in completing proportional\nanalogies compared to providing exemplars or collections of structured\nknowledge. Our code and data are available at:\nhttps://github.com/Thiliniiw/KnowledgePrompts/\n","authors":["Thilini Wijesiriwardene","Ruwan Wickramarachchi","Sreeram Vennam","Vinija Jain","Aman Chadha","Amitava Das","Ponnurangam Kumaraguru","Amit Sheth"],"pdf_url":"https://arxiv.org/pdf/2412.00869v2.pdf","comment":"Accepted at COLING 2025"},{"id":"http://arxiv.org/abs/2305.09574v2","updated":"2024-12-19T04:35:08Z","published":"2023-05-16T16:11:48Z","title":"UOR: Universal Backdoor Attacks on Pre-trained Language Models","summary":" Backdoors implanted in pre-trained language models (PLMs) can be transferred\nto various downstream tasks, which exposes a severe security threat. However,\nmost existing backdoor attacks against PLMs are un-targeted and task-specific.\nFew targeted and task-agnostic methods use manually pre-defined triggers and\noutput representations, which prevent the attacks from being more effective and\ngeneral. 
In this paper, we first summarize the requirements that a more\nthreatening backdoor attack against PLMs should satisfy, and then propose a new\nbackdoor attack method called UOR, which breaks the bottleneck of the previous\napproach by turning manual selection into automatic optimization. Specifically,\nwe define poisoned supervised contrastive learning which can automatically\nlearn the more uniform and universal output representations of triggers for\nvarious PLMs. Moreover, we use gradient search to select appropriate trigger\nwords which can be adaptive to different PLMs and vocabularies. Experiments\nshow that our method can achieve better attack performance on various text\nclassification tasks compared to manual methods. Further, we tested our method\non PLMs with different architectures, different usage paradigms, and more\ndifficult tasks, which demonstrated the universality of our method.\n","authors":["Wei Du","Peixuan Li","Boqun Li","Haodong Zhao","Gongshen Liu"],"pdf_url":"https://arxiv.org/pdf/2305.09574v2.pdf","comment":"ACL-Findings 2024"},{"id":"http://arxiv.org/abs/2412.14516v1","updated":"2024-12-19T04:31:56Z","published":"2024-12-19T04:31:56Z","title":"Cal-DPO: Calibrated Direct Preference Optimization for Language Model\n Alignment","summary":" We study the problem of aligning large language models (LLMs) with human\npreference data. Contrastive preference optimization has shown promising\nresults in aligning LLMs with available preference data by optimizing the\nimplicit reward associated with the policy. However, the contrastive objective\nfocuses mainly on the relative values of implicit rewards associated with two\nresponses while ignoring their actual values, resulting in suboptimal alignment\nwith human preferences. To address this limitation, we propose calibrated\ndirect preference optimization (Cal-DPO), a simple yet effective algorithm. We\nshow that substantial improvement in alignment with the given preferences can\nbe achieved simply by calibrating the implicit reward to ensure that the\nlearned implicit rewards are comparable in scale to the ground-truth rewards.\nWe demonstrate the theoretical advantages of Cal-DPO over existing approaches.\nThe results of our experiments on a variety of standard benchmarks show that\nCal-DPO remarkably improves off-the-shelf methods.\n","authors":["Teng Xiao","Yige Yuan","Huaisheng Zhu","Mingxiao Li","Vasant G Honavar"],"pdf_url":"https://arxiv.org/pdf/2412.14516v1.pdf","comment":"Accepted by NeurIPS 2024 Main"},{"id":"http://arxiv.org/abs/2412.14510v1","updated":"2024-12-19T04:18:51Z","published":"2024-12-19T04:18:51Z","title":"PA-RAG: RAG Alignment via Multi-Perspective Preference Optimization","summary":" The emergence of Retrieval-augmented generation (RAG) has alleviated the\nissues of outdated and hallucinatory content in the generation of large\nlanguage models (LLMs), yet it still reveals numerous limitations. When a\ngeneral-purpose LLM serves as the RAG generator, it often suffers from\ninadequate response informativeness, response robustness, and citation quality.\nPast approaches to tackle these limitations, either by incorporating additional\nsteps beyond generating responses or optimizing the generator through\nsupervised fine-tuning (SFT), still failed to align with the RAG requirement\nthoroughly. Consequently, optimizing the RAG generator from multiple preference\nperspectives while maintaining its end-to-end LLM form remains a challenge. 
To\nbridge this gap, we propose Multiple Perspective Preference Alignment for\nRetrieval-Augmented Generation (PA-RAG), a method for optimizing the generator\nof RAG systems to align with RAG requirements comprehensively. Specifically, we\nconstruct high-quality instruction fine-tuning data and multi-perspective\npreference data by sampling responses of varied quality from the generator across\nscenarios with different prompt document quality. Subsequently, we optimize the\ngenerator using SFT and Direct Preference Optimization (DPO). Extensive\nexperiments conducted on four question-answer datasets across three LLMs\ndemonstrate that PA-RAG can significantly enhance the performance of RAG\ngenerators. Our code and datasets are available at\nhttps://github.com/wujwyi/PA-RAG.\n","authors":["Jiayi Wu","Hengyi Cai","Lingyong Yan","Hao Sun","Xiang Li","Shuaiqiang Wang","Dawei Yin","Ming Gao"],"pdf_url":"https://arxiv.org/pdf/2412.14510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14501v1","updated":"2024-12-19T03:48:40Z","published":"2024-12-19T03:48:40Z","title":"Do Large Language Models Defend Inferentialist Semantics?: On the\n Logical Expressivism and Anti-Representationalism of LLMs","summary":" The philosophy of language, which has historically been developed through an\nanthropocentric lens, is now being forced to move towards post-anthropocentrism\ndue to the advent of large language models (LLMs) like ChatGPT (OpenAI), Claude\n(Anthropic), which are considered to possess linguistic abilities comparable to\nthose of humans. Traditionally, LLMs have been explained through distributional\nsemantics as their foundational semantics. However, recent research is\nexploring alternative foundational semantics beyond distributional semantics.\nThis paper proposes Robert Brandom's inferentialist semantics as a suitable\nfoundational semantics for LLMs, specifically focusing on the issue of\nlinguistic representationalism within this post-anthropocentric trend. Here, we\nshow that the anti-representationalism and logical expressivism of inferentialist\nsemantics, as well as quasi-compositionality, are useful in interpreting the\ncharacteristics and behaviors of LLMs. Further, we propose a \\emph{consensus\ntheory of truths} for LLMs. This paper argues that the characteristics of LLMs\nchallenge mainstream assumptions in philosophy of language, such as semantic\nexternalism and compositionality. We believe the argument in this paper leads\nto a re-evaluation of anti-representationalist views of language,\npotentially leading to new developments in the philosophy of language.\n","authors":["Yuzuki Arai","Sho Tsugawa"],"pdf_url":"https://arxiv.org/pdf/2412.14501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14480v1","updated":"2024-12-19T03:04:34Z","published":"2024-12-19T03:04:34Z","title":"GraphEQA: Using 3D Semantic Scene Graphs for Real-time Embodied Question\n Answering","summary":" In Embodied Question Answering (EQA), agents must explore and develop a\nsemantic understanding of an unseen environment in order to answer a situated\nquestion with confidence. This remains a challenging problem in robotics, due\nto the difficulties in obtaining useful semantic representations, updating\nthese representations online, and leveraging prior world knowledge for\nefficient exploration and planning. 
Aiming to address these limitations, we\npropose GraphEQA, a novel approach that utilizes real-time 3D metric-semantic\nscene graphs (3DSGs) and task relevant images as multi-modal memory for\ngrounding Vision-Language Models (VLMs) to perform EQA tasks in unseen\nenvironments. We employ a hierarchical planning approach that exploits the\nhierarchical nature of 3DSGs for structured planning and semantic-guided\nexploration. Through experiments in simulation on the HM-EQA dataset and in the\nreal world in home and office environments, we demonstrate that our method\noutperforms key baselines by completing EQA tasks with higher success rates and\nfewer planning steps.\n","authors":["Saumya Saxena","Blake Buchanan","Chris Paxton","Bingqing Chen","Narunas Vaskevicius","Luigi Palmieri","Jonathan Francis","Oliver Kroemer"],"pdf_url":"https://arxiv.org/pdf/2412.14480v1.pdf","comment":"Project website: https://saumyasaxena.github.io/grapheqa"},{"id":"http://arxiv.org/abs/2310.13008v2","updated":"2024-12-19T02:54:42Z","published":"2023-10-16T07:26:24Z","title":"DavIR: Data Selection via Implicit Reward for Large Language Models","summary":" We introduce DavIR, a model-based data selection method for post-training\nLarge Language Models. DavIR generalizes Reducible Holdout Loss to core-set\nselection problem of causal language modeling, and quantifies the learnability\nof a given datum with respect to a pre-trained LLM based on relative reduction\nin loss during fine-tuning, a metric we show to be closely related to the\nimplicit reward model described in Direct Preference Optimization (DPO). We\nshow that 6% of Alpaca dataset selected with DavIR can steer both the LLaMA and\nGemma model family to produce superior performance compared to the same models\ntrained on the full 52K dataset. We also show that Alpaca dataset compressed\nwith DavIR can be combined with GSM8K dataset to effectively balance\nopen-domain freeform QA and mathematical reasoning capabilities. Finally, we\napply the DavIR objective to DPO and develop a normalized DavIR-DPO objective\nwhich improves alignment performance of Zephyr-7B-SFT model by 8% (relative) on\nAlpacaEval, compared against training on vanilla DPO objective.\n","authors":["Haotian Zhou","Tingkai Liu","Qianli Ma","Yufeng Zhang","Jianbo Yuan","Pengfei Liu","Yang You","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2310.13008v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14475v1","updated":"2024-12-19T02:49:55Z","published":"2024-12-19T02:49:55Z","title":"MegaPairs: Massive Data Synthesis For Universal Multimodal Retrieval","summary":" Despite the rapidly growing demand for multimodal retrieval, progress in this\nfield remains severely constrained by a lack of training data. In this paper,\nwe introduce MegaPairs, a novel data synthesis method that leverages vision\nlanguage models (VLMs) and open-domain images, together with a massive\nsynthetic dataset generated from this method. Our empirical analysis shows that\nMegaPairs generates high-quality data, enabling the multimodal retriever to\nsignificantly outperform the baseline model trained on 70$\\times$ more data\nfrom existing datasets. Moreover, since MegaPairs solely relies on general\nimage corpora and open-source VLMs, it can be easily scaled up, enabling\ncontinuous improvements in retrieval performance. In this stage, we produced\nmore than 26 million training instances and trained several models of varying\nsizes using this data. 
These new models achieve state-of-the-art zero-shot\nperformance across 4 popular composed image retrieval (CIR) benchmarks and the\nhighest overall performance on the 36 datasets provided by MMEB. They also\ndemonstrate notable performance improvements with additional downstream\nfine-tuning. Our produced dataset, well-trained models, and data synthesis\npipeline will be made publicly available to facilitate the future development\nof this field.\n","authors":["Junjie Zhou","Zheng Liu","Ze Liu","Shitao Xiao","Yueze Wang","Bo Zhao","Chen Jason Zhang","Defu Lian","Yongping Xiong"],"pdf_url":"https://arxiv.org/pdf/2412.14475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14471v1","updated":"2024-12-19T02:39:26Z","published":"2024-12-19T02:39:26Z","title":"Why We Build Local Large Language Models: An Observational Analysis from\n 35 Japanese and Multilingual LLMs","summary":" Why do we build local large language models (LLMs)? What should a local LLM\nlearn from the target language? Which abilities can be transferred from other\nlanguages? Do language-specific scaling laws exist? To explore these research\nquestions, we evaluated 35 Japanese, English, and multilingual LLMs on 19\nevaluation benchmarks for Japanese and English, taking Japanese as a local\nlanguage. Adopting an observational approach, we analyzed correlations of\nbenchmark scores, and conducted principal component analysis (PCA) on the\nscores to derive \\textit{ability factors} of local LLMs. We found that training\non English text can improve the scores of academic subjects in Japanese\n(JMMLU). In addition, it is unnecessary to specifically train on Japanese text\nto enhance abilities for solving Japanese code generation, arithmetic\nreasoning, commonsense, and reading comprehension tasks. In contrast, training\non Japanese text could improve question-answering tasks about Japanese\nknowledge and English-Japanese translation, which indicates that abilities for\nsolving these two tasks can be regarded as \\textit{Japanese abilities} for\nLLMs. Furthermore, we confirmed that the Japanese abilities scale with the\ncomputational budget for Japanese text.\n","authors":["Koshiro Saito","Sakae Mizuki","Masanari Ohi","Taishi Nakamura","Taihei Shiotani","Koki Maeda","Youmi Ma","Kakeru Hattori","Kazuki Fujii","Takumi Okamoto","Shigeki Ishida","Hiroya Takamura","Rio Yokota","Naoaki Okazaki"],"pdf_url":"https://arxiv.org/pdf/2412.14471v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2407.21792v2","updated":"2024-12-19T02:39:01Z","published":"2024-07-31T17:59:24Z","title":"Safetywashing: Do AI Safety Benchmarks Actually Measure Safety Progress?","summary":" As artificial intelligence systems grow more powerful, there has been\nincreasing interest in \"AI safety\" research to address emerging and future\nrisks. However, the field of AI safety remains poorly defined and\ninconsistently measured, leading to confusion about how researchers can\ncontribute. This lack of clarity is compounded by the unclear relationship\nbetween AI safety benchmarks and upstream general capabilities (e.g., general\nknowledge and reasoning). To address these issues, we conduct a comprehensive\nmeta-analysis of AI safety benchmarks, empirically analyzing their correlation\nwith general capabilities across dozens of models and providing a survey of\nexisting directions in AI safety. 
Our findings reveal that many safety\nbenchmarks highly correlate with both upstream model capabilities and training\ncompute, potentially enabling \"safetywashing\" -- where capability improvements\nare misrepresented as safety advancements. Based on these findings, we propose\nan empirical foundation for developing more meaningful safety metrics and\ndefine AI safety in a machine learning research context as a set of clearly\ndelineated research goals that are empirically separable from generic\ncapabilities advancements. In doing so, we aim to provide a more rigorous\nframework for AI safety research, advancing the science of safety evaluations\nand clarifying the path towards measurable progress.\n","authors":["Richard Ren","Steven Basart","Adam Khoja","Alice Gatti","Long Phan","Xuwang Yin","Mantas Mazeika","Alexander Pan","Gabriel Mukobi","Ryan H. Kim","Stephen Fitz","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2407.21792v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.18118v3","updated":"2024-12-19T02:37:00Z","published":"2024-06-26T07:15:44Z","title":"SafeAligner: Safety Alignment against Jailbreak Attacks via Response\n Disparity Guidance","summary":" As the development of large language models (LLMs) rapidly advances, securing\nthese models effectively without compromising their utility has become a\npivotal area of research. However, current defense strategies against jailbreak\nattacks (i.e., efforts to bypass security protocols) often suffer from limited\nadaptability, restricted general capability, and high cost. To address these\nchallenges, we introduce SafeAligner, a methodology implemented at the decoding\nstage to fortify defenses against jailbreak attacks. We begin by developing two\nspecialized models: the Sentinel Model, which is trained to foster safety, and\nthe Intruder Model, designed to generate riskier responses. SafeAligner\nleverages the disparity in security levels between the responses from these\nmodels to differentiate between harmful and beneficial tokens, effectively\nguiding the safety alignment by altering the output token distribution of the\ntarget model. Extensive experiments show that SafeAligner can increase the\nlikelihood of beneficial tokens, while reducing the occurrence of harmful ones,\nthereby ensuring secure alignment with minimal loss to generality.\n","authors":["Caishuang Huang","Wanxu Zhao","Rui Zheng","Huijie Lv","Shihan Dou","Sixian Li","Xiao Wang","Enyu Zhou","Junjie Ye","Yuming Yang","Tao Gui","Qi Zhang","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2406.18118v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14205v3","updated":"2024-12-19T02:35:48Z","published":"2024-05-23T06:03:19Z","title":"Agent Planning with World Knowledge Model","summary":" Recent endeavors towards directly using large language models (LLMs) as agent\nmodels to execute interactive planning tasks have shown commendable results.\nDespite their achievements, however, they still struggle with brainless\ntrial-and-error in global planning and generating hallucinatory actions in\nlocal planning due to their poor understanding of the ``real'' physical world.\nImitating humans' mental world knowledge model which provides global prior\nknowledge before the task and maintains local dynamic knowledge during the\ntask, in this paper, we introduce parametric World Knowledge Model (WKM) to\nfacilitate agent planning. Concretely, we steer the agent model to\nself-synthesize knowledge from both expert and sampled trajectories. 
Then we\ndevelop WKM, providing prior task knowledge to guide the global planning and\ndynamic state knowledge to assist the local planning. Experimental results on\nthree complex real-world simulated datasets with three state-of-the-art\nopen-source LLMs, Mistral-7B, Gemma-7B, and Llama-3-8B, demonstrate that our\nmethod can achieve superior performance compared to various strong baselines.\nBesides, we analyze to illustrate that our WKM can effectively alleviate the\nblind trial-and-error and hallucinatory action issues, providing strong support\nfor the agent's understanding of the world. Other interesting findings include:\n1) our instance-level task knowledge can generalize better to unseen tasks, 2)\nweak WKM can guide strong agent model planning, and 3) unified WKM training has\npromising potential for further development. The code is available at\nhttps://github.com/zjunlp/WKM.\n","authors":["Shuofei Qiao","Runnan Fang","Ningyu Zhang","Yuqi Zhu","Xiang Chen","Shumin Deng","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2405.14205v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.14470v1","updated":"2024-12-19T02:35:15Z","published":"2024-12-19T02:35:15Z","title":"Agent-SafetyBench: Evaluating the Safety of LLM Agents","summary":" As large language models (LLMs) are increasingly deployed as agents, their\nintegration into interactive environments and tool use introduce new safety\nchallenges beyond those associated with the models themselves. However, the\nabsence of comprehensive benchmarks for evaluating agent safety presents a\nsignificant barrier to effective assessment and further improvement. In this\npaper, we introduce Agent-SafetyBench, a comprehensive benchmark designed to\nevaluate the safety of LLM agents. Agent-SafetyBench encompasses 349\ninteraction environments and 2,000 test cases, evaluating 8 categories of\nsafety risks and covering 10 common failure modes frequently encountered in\nunsafe interactions. Our evaluation of 16 popular LLM agents reveals a\nconcerning result: none of the agents achieves a safety score above 60%. This\nhighlights significant safety challenges in LLM agents and underscores the\nconsiderable need for improvement. Through quantitative analysis, we identify\ncritical failure modes and summarize two fundamental safety detects in current\nLLM agents: lack of robustness and lack of risk awareness. Furthermore, our\nfindings suggest that reliance on defense prompts alone is insufficient to\naddress these safety issues, emphasizing the need for more advanced and robust\nstrategies. 
We release Agent-SafetyBench at\n\\url{https://github.com/thu-coai/Agent-SafetyBench} to facilitate further\nresearch and innovation in agent safety evaluation and improvement.\n","authors":["Zhexin Zhang","Shiyao Cui","Yida Lu","Jingzhuo Zhou","Junxiao Yang","Hongning Wang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2412.14470v1.pdf","comment":"23 pages, 9 figures"},{"id":"http://arxiv.org/abs/2412.14461v1","updated":"2024-12-19T02:21:41Z","published":"2024-12-19T02:21:41Z","title":"From Human Annotation to LLMs: SILICON Annotation Workflow for\n Management Research","summary":" Unstructured text data annotation and analysis are fundamental to management\nresearch, often relying on human annotators through crowdsourcing platforms.\nWhile Large Language Models (LLMs) promise to provide a cost-effective and\nefficient alternative to human annotation, there lacks a systematic workflow\nthat evaluate when LLMs are suitable or how to proceed with LLM-based text\nannotation in a reproducible manner. This paper addresses this methodological\ngap by introducing the ``SILICON\" (\\textbf{S}ystematic \\textbf{I}nference with\n\\textbf{L}LMs for \\textbf{I}nformation \\textbf{C}lassificati\\textbf{o}n and\n\\textbf{N}otation) workflow. The workflow integrates established principles of\nhuman annotation with systematic prompt optimization and model selection,\naddressing challenges such as developing robust annotation guidelines,\nestablishing high-quality human baselines, optimizing prompts, and ensuring\nreproducibility across LLMs. We validate the SILICON workflow through seven\ncase studies covering common management research tasks, including business\nproposal evaluation, dialog intent and breakdown analysis, review attribute\ndetection. Our findings highlight the importance of validating annotation\nguideline agreement, the superiority of expert-developed human baselines over\ncrowdsourced ones, the iterative nature of prompt optimization, and the\nnecessity of testing multiple LLMs. Notably, we propose a regression-based\nmethodology to empirically compare LLM outputs across prompts and models. Our\nworkflow advances management research by establishing reproducible processes\nfor LLM-based annotation that maintain scientific rigor. We provide practical\nguidance for researchers to effectively navigate the evolving landscape of\ngenerative AI tools effectively while maintaining transparency and\nreproducibility.\n","authors":["Xiang Cheng","Raveesh Mayya","João Sedoc"],"pdf_url":"https://arxiv.org/pdf/2412.14461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14768v3","updated":"2024-12-19T02:18:54Z","published":"2024-05-23T16:35:52Z","title":"WISE: Rethinking the Knowledge Memory for Lifelong Model Editing of\n Large Language Models","summary":" Large language models (LLMs) need knowledge updates to meet the ever-growing\nworld facts and correct the hallucinated responses, facilitating the methods of\nlifelong model editing. Where the updated knowledge resides in memories is a\nfundamental question for model editing. In this paper, we find that editing\neither long-term memory (direct model parameters) or working memory\n(non-parametric knowledge of neural network activations/representations by\nretrieval) will result in an impossible triangle -- reliability,\ngeneralization, and locality can not be realized together in the lifelong\nediting settings. 
For long-term memory, directly editing the parameters will\ncause conflicts with irrelevant pretrained knowledge or previous edits (poor\nreliability and locality). For working memory, retrieval-based activations can\nhardly make the model understand the edits and generalize (poor\ngeneralization). Therefore, we propose WISE to bridge the gap between memories.\nIn WISE, we design a dual parametric memory scheme, which consists of the main\nmemory for the pretrained knowledge and a side memory for the edited knowledge.\nWe only edit the knowledge in the side memory and train a router to decide\nwhich memory to go through when given a query. For continual editing, we devise\na knowledge-sharding mechanism where different sets of edits reside in distinct\nsubspaces of parameters, and are subsequently merged into a shared memory\nwithout conflicts. Extensive experiments show that WISE can outperform previous\nmodel editing methods and overcome the impossible triangle under lifelong model\nediting of question answering, hallucination, and out-of-distribution settings\nacross trending LLM architectures, e.g., GPT, LLaMA, and Mistral. Code is\navailable at https://github.com/zjunlp/EasyEdit.\n","authors":["Peng Wang","Zexi Li","Ningyu Zhang","Ziwen Xu","Yunzhi Yao","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2405.14768v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.13144v3","updated":"2024-12-19T02:14:09Z","published":"2024-06-19T01:37:10Z","title":"DialSim: A Real-Time Simulator for Evaluating Long-Term Multi-Party\n Dialogue Understanding of Conversational Agents","summary":" Recent advancements in Large Language Models (LLMs) have significantly\nenhanced the capabilities of conversational agents, making them applicable to\nvarious fields (e.g., education). Despite their progress, the evaluation of the\nagents often overlooks the complexities of real-world conversations, such as\nreal-time interactions, multi-party dialogues, and extended contextual\ndependencies. To bridge this gap, we introduce DialSim, a real-time dialogue\nsimulator. In this simulator, an agent is assigned the role of a character from\npopular TV shows, requiring it to respond to spontaneous questions using past\ndialogue information and to distinguish between known and unknown information.\nKey features of DialSim include assessing the agent's ability to respond within\na reasonable time limit, handling long-term multi-party dialogues, and\nevaluating performance under randomized questioning with LongDialQA, a novel,\nhigh-quality question-answering dataset. Our experiments using DialSim reveal\nthe strengths and weaknesses of the latest conversational agents, offering\nvaluable insights for future advancements in conversational AI. DialSim is\navailable at https://dialsim.github.io/.\n","authors":["Jiho Kim","Woosog Chay","Hyeonji Hwang","Daeun Kyung","Hyunseung Chung","Eunbyeol Cho","Yohan Jo","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2406.13144v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17969v3","updated":"2024-12-19T02:10:00Z","published":"2024-05-28T08:56:33Z","title":"Knowledge Circuits in Pretrained Transformers","summary":" The remarkable capabilities of modern large language models are rooted in\ntheir vast repositories of knowledge encoded within their parameters, enabling\nthem to perceive the world and engage in reasoning. 
The inner workings of how\nthese models store knowledge have long been a subject of intense interest and\ninvestigation among researchers. To date, most studies have concentrated on\nisolated components within these models, such as the Multilayer Perceptrons and\nattention head. In this paper, we delve into the computation graph of the\nlanguage model to uncover the knowledge circuits that are instrumental in\narticulating specific knowledge. The experiments, conducted with GPT2 and\nTinyLLAMA, have allowed us to observe how certain information heads, relation\nheads, and Multilayer Perceptrons collaboratively encode knowledge within the\nmodel. Moreover, we evaluate the impact of current knowledge editing techniques\non these knowledge circuits, providing deeper insights into the functioning and\nconstraints of these editing methodologies. Finally, we utilize knowledge\ncircuits to analyze and interpret language model behaviors such as\nhallucinations and in-context learning. We believe the knowledge circuits hold\npotential for advancing our understanding of Transformers and guiding the\nimproved design of knowledge editing. Code and data are available in\nhttps://github.com/zjunlp/KnowledgeCircuits.\n","authors":["Yunzhi Yao","Ningyu Zhang","Zekun Xi","Mengru Wang","Ziwen Xu","Shumin Deng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2405.17969v3.pdf","comment":"NeurIPS 2024, 26 pages"},{"id":"http://arxiv.org/abs/2412.14454v1","updated":"2024-12-19T02:09:59Z","published":"2024-12-19T02:09:59Z","title":"Are Longer Prompts Always Better? Prompt Selection in Large Language\n Models for Recommendation Systems","summary":" In large language models (LLM)-based recommendation systems (LLM-RSs),\naccurately predicting user preferences by leveraging the general knowledge of\nLLMs is possible without requiring extensive training data. By converting\nrecommendation tasks into natural language inputs called prompts, LLM-RSs can\nefficiently solve issues that have been difficult to address due to data\nscarcity but are crucial in applications such as cold-start and cross-domain\nproblems. However, when applying this in practice, selecting the prompt that\nmatches tasks and data is essential. Although numerous prompts have been\nproposed in LLM-RSs and representing the target user in prompts significantly\nimpacts recommendation accuracy, there are still no clear guidelines for\nselecting specific prompts.\n In this paper, we categorize and analyze prompts from previous research to\nestablish practical prompt selection guidelines. Through 450 experiments with\n90 prompts and five real-world datasets, we examined the relationship between\nprompts and dataset characteristics in recommendation accuracy. We found that\nno single prompt consistently outperforms others; thus, selecting prompts on\nthe basis of dataset characteristics is crucial. Here, we propose a prompt\nselection method that achieves higher accuracy with minimal validation data.\nBecause increasing the number of prompts to explore raises costs, we also\nintroduce a cost-efficient strategy using high-performance and cost-efficient\nLLMs, significantly reducing exploration costs while maintaining high\nprediction accuracy. 
Our work offers valuable insights into the prompt\nselection, advancing accurate and efficient LLM-RSs.\n","authors":["Genki Kusano","Kosuke Akimoto","Kunihiro Takeoka"],"pdf_url":"https://arxiv.org/pdf/2412.14454v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2412.14436v1","updated":"2024-12-19T01:35:47Z","published":"2024-12-19T01:35:47Z","title":"ORBIT: Cost-Effective Dataset Curation for Large Language Model Domain\n Adaptation with an Astronomy Case Study","summary":" Recent advances in language modeling demonstrate the need for high-quality\ndomain-specific training data, especially for tasks that require specialized\nknowledge. General-purpose models, while versatile, often lack the depth needed\nfor expert-level tasks because of limited domain-specific information. Domain\nadaptation training can enhance these models, but it demands substantial,\nhigh-quality data. To address this, we propose ORBIT, a cost-efficient\nmethodology for curating massive, high-quality domain-specific datasets from\nnoisy web sources, tailored for training specialist large language models.\nUsing astronomy as a primary case study, we refined the 1.3T-token FineWeb-Edu\ndataset into a high-quality, 10B-token subset focused on astronomy. Fine-tuning\n\\textsc{LLaMA-3-8B} on a 1B-token astronomy subset improved performance on the\nMMLU astronomy benchmark from 69\\% to 76\\% and achieved top results on\nAstroBench, an astronomy-specific benchmark. Moreover, our model (Orbit-LLaMA)\noutperformed \\textsc{LLaMA-3-8B-base}, with GPT-4o evaluations preferring it in\n73\\% of cases across 1000 astronomy-specific questions. Additionally, we\nvalidated ORBIT's generalizability by applying it to law and medicine,\nachieving a significant improvement of data quality compared to an unfiltered\nbaseline. We open-source the ORBIT methodology, including the curated datasets,\nthe codebase, and the resulting model at\n\\href{https://github.com/ModeEric/ORBIT-Llama}{https://github.com/ModeEric/ORBIT-Llama}.\n","authors":["Eric Modesitt","Ke Yang","Spencer Hulsey","Chengxiang Zhai","Volodymyr Kindratenko"],"pdf_url":"https://arxiv.org/pdf/2412.14436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14426v1","updated":"2024-12-19T00:41:40Z","published":"2024-12-19T00:41:40Z","title":"All-in-One Tuning and Structural Pruning for Domain-Specific LLMs","summary":" Existing pruning techniques for large language models (LLMs) targeting\ndomain-specific applications typically follow a two-stage process: pruning the\npretrained general-purpose LLMs and then fine-tuning the pruned LLMs on\nspecific domains. However, the pruning decisions, derived from the pretrained\nweights, remain unchanged during fine-tuning, even if the weights have been\nupdated. Therefore, such a combination of the pruning decisions and the\nfinetuned weights may be suboptimal, leading to non-negligible performance\ndegradation. To address these limitations, we propose ATP: All-in-One Tuning\nand Structural Pruning, a unified one-stage structural pruning and fine-tuning\napproach that dynamically identifies the current optimal substructure\nthroughout the fine-tuning phase via a trainable pruning decision generator.\nMoreover, given the limited available data for domain-specific applications,\nLow-Rank Adaptation (LoRA) becomes a common technique to fine-tune the LLMs. 
In\nATP, we introduce LoRA-aware forward and sparsity regularization to ensure that\nthe substructures corresponding to the learned pruning decisions can be\ndirectly removed after the ATP process. ATP outperforms the state-of-the-art\ntwo-stage pruning methods on tasks in the legal and healthcare domains. More\nspecifically, ATP recovers up to 88% and 91% performance of the dense model\nwhen pruning 40% parameters of LLaMA2-7B and LLaMA3-8B models, respectively.\n","authors":["Lei Lu","Zhepeng Wang","Ruexue Bao","Mengbing Wang","Fangyi Li","Yawen Wu","Weiwen Jiang","Jie Xu","Yanzhi Wang","Shangqian Gao"],"pdf_url":"https://arxiv.org/pdf/2412.14426v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2412.15216v1","updated":"2024-12-19T18:59:58Z","published":"2024-12-19T18:59:58Z","title":"UIP2P: Unsupervised Instruction-based Image Editing via Cycle Edit\n Consistency","summary":" We propose an unsupervised model for instruction-based image editing that\neliminates the need for ground-truth edited images during training. Existing\nsupervised methods depend on datasets containing triplets of input image,\nedited image, and edit instruction. These are generated by either existing\nediting methods or human-annotations, which introduce biases and limit their\ngeneralization ability. Our method addresses these challenges by introducing a\nnovel editing mechanism called Cycle Edit Consistency (CEC), which applies\nforward and backward edits in one training step and enforces consistency in\nimage and attention spaces. This allows us to bypass the need for ground-truth\nedited images and unlock training for the first time on datasets comprising\neither real image-caption pairs or image-caption-edit triplets. We empirically\nshow that our unsupervised technique performs better across a broader range of\nedits with high fidelity and precision. By eliminating the need for\npre-existing datasets of triplets, reducing biases associated with supervised\nmethods, and proposing CEC, our work represents a significant advancement in\nunblocking scaling of instruction-based image editing.\n","authors":["Enis Simsar","Alessio Tonioni","Yongqin Xian","Thomas Hofmann","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2412.15216v1.pdf","comment":"Project page: https://enis.dev/uip2p/"},{"id":"http://arxiv.org/abs/2412.15215v1","updated":"2024-12-19T18:59:57Z","published":"2024-12-19T18:59:57Z","title":"EnvGS: Modeling View-Dependent Appearance with Environment Gaussian","summary":" Reconstructing complex reflections in real-world scenes from 2D images is\nessential for achieving photorealistic novel view synthesis. Existing methods\nthat utilize environment maps to model reflections from distant lighting often\nstruggle with high-frequency reflection details and fail to account for\nnear-field reflections. In this work, we introduce EnvGS, a novel approach that\nemploys a set of Gaussian primitives as an explicit 3D representation for\ncapturing reflections of environments. These environment Gaussian primitives\nare incorporated with base Gaussian primitives to model the appearance of the\nwhole scene. To efficiently render these environment Gaussian primitives, we\ndeveloped a ray-tracing-based renderer that leverages the GPU's RT core for\nfast rendering. This allows us to jointly optimize our model for high-quality\nreconstruction while maintaining real-time rendering speeds. 
Results from\nmultiple real-world and synthetic datasets demonstrate that our method produces\nsignificantly more detailed reflections, achieving the best rendering quality\nin real-time novel view synthesis.\n","authors":["Tao Xie","Xi Chen","Zhen Xu","Yiman Xie","Yudong Jin","Yujun Shen","Sida Peng","Hujun Bao","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.15215v1.pdf","comment":"Project page: https://zju3dv.github.io/envgs/"},{"id":"http://arxiv.org/abs/2412.15213v1","updated":"2024-12-19T18:59:56Z","published":"2024-12-19T18:59:56Z","title":"Flowing from Words to Pixels: A Framework for Cross-Modality Evolution","summary":" Diffusion models, and their generalization, flow matching, have had a\nremarkable impact on the field of media generation. Here, the conventional\napproach is to learn the complex mapping from a simple source distribution of\nGaussian noise to the target media distribution. For cross-modal tasks such as\ntext-to-image generation, this same mapping from noise to image is learnt\nwhilst including a conditioning mechanism in the model. One key and thus far\nrelatively unexplored feature of flow matching is that, unlike Diffusion\nmodels, they are not constrained for the source distribution to be noise.\nHence, in this paper, we propose a paradigm shift, and ask the question of\nwhether we can instead train flow matching models to learn a direct mapping\nfrom the distribution of one modality to the distribution of another, thus\nobviating the need for both the noise distribution and conditioning mechanism.\nWe present a general and simple framework, CrossFlow, for cross-modal flow\nmatching. We show the importance of applying Variational Encoders to the input\ndata, and introduce a method to enable Classifier-free guidance. Surprisingly,\nfor text-to-image, CrossFlow with a vanilla transformer without cross attention\nslightly outperforms standard flow matching, and we show that it scales better\nwith training steps and model size, while also allowing for interesting latent\narithmetic which results in semantically meaningful edits in the output space.\nTo demonstrate the generalizability of our approach, we also show that\nCrossFlow is on par with or outperforms the state-of-the-art for various\ncross-modal / intra-modal mapping tasks, viz. image captioning, depth\nestimation, and image super-resolution. We hope this paper contributes to\naccelerating progress in cross-modal media generation.\n","authors":["Qihao Liu","Xi Yin","Alan Yuille","Andrew Brown","Mannat Singh"],"pdf_url":"https://arxiv.org/pdf/2412.15213v1.pdf","comment":"Project page: https://cross-flow.github.io/"},{"id":"http://arxiv.org/abs/2412.15214v1","updated":"2024-12-19T18:59:56Z","published":"2024-12-19T18:59:56Z","title":"LeviTor: 3D Trajectory Oriented Image-to-Video Synthesis","summary":" The intuitive nature of drag-based interaction has led to its growing\nadoption for controlling object trajectories in image-to-video synthesis.\nStill, existing methods that perform dragging in the 2D space usually face\nambiguity when handling out-of-plane movements. In this work, we augment the\ninteraction with a new dimension, i.e., the depth dimension, such that users\nare allowed to assign a relative depth for each point on the trajectory. That\nway, our new interaction paradigm not only inherits the convenience from 2D\ndragging, but facilitates trajectory control in the 3D space, broadening the\nscope of creativity. 
We propose a pioneering method for 3D trajectory control\nin image-to-video synthesis by abstracting object masks into a few cluster\npoints. These points, accompanied by the depth information and the instance\ninformation, are finally fed into a video diffusion model as the control\nsignal. Extensive experiments validate the effectiveness of our approach,\ndubbed LeviTor, in precisely manipulating the object movements when producing\nphoto-realistic videos from static images. Project page:\nhttps://ppetrichor.github.io/levitor.github.io/\n","authors":["Hanlin Wang","Hao Ouyang","Qiuyu Wang","Wen Wang","Ka Leong Cheng","Qifeng Chen","Yujun Shen","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2412.15214v1.pdf","comment":"Project page available at\n https://ppetrichor.github.io/levitor.github.io/"},{"id":"http://arxiv.org/abs/2412.15211v1","updated":"2024-12-19T18:59:51Z","published":"2024-12-19T18:59:51Z","title":"Generative Multiview Relighting for 3D Reconstruction under Extreme\n Illumination Variation","summary":" Reconstructing the geometry and appearance of objects from photographs taken\nin different environments is difficult as the illumination and therefore the\nobject appearance vary across captured images. This is particularly challenging\nfor more specular objects whose appearance strongly depends on the viewing\ndirection. Some prior approaches model appearance variation across images using\na per-image embedding vector, while others use physically-based rendering to\nrecover the materials and per-image illumination. Such approaches fail at\nfaithfully recovering view-dependent appearance given significant variation in\ninput illumination and tend to produce mostly diffuse results. We present an\napproach that reconstructs objects from images taken under different\nilluminations by first relighting the images under a single reference\nillumination with a multiview relighting diffusion model and then\nreconstructing the object's geometry and appearance with a radiance field\narchitecture that is robust to the small remaining inconsistencies among the\nrelit images. We validate our proposed approach on both synthetic and real\ndatasets and demonstrate that it greatly outperforms existing techniques at\nreconstructing high-fidelity appearance from images taken under extreme\nillumination variation. Moreover, our approach is particularly effective at\nrecovering view-dependent \"shiny\" appearance which cannot be reconstructed by\nprior methods.\n","authors":["Hadi Alzayer","Philipp Henzler","Jonathan T. Barron","Jia-Bin Huang","Pratul P. Srinivasan","Dor Verbin"],"pdf_url":"https://arxiv.org/pdf/2412.15211v1.pdf","comment":"Project page: https://relight-to-reconstruct.github.io/"},{"id":"http://arxiv.org/abs/2412.15212v1","updated":"2024-12-19T18:59:51Z","published":"2024-12-19T18:59:51Z","title":"Scaling 4D Representations","summary":" Scaling has not yet been convincingly demonstrated for pure self-supervised\nlearning from video. However, prior work has focused evaluations on\nsemantic-related tasks $\\unicode{x2013}$ action classification, ImageNet\nclassification, etc. In this paper we focus on evaluating self-supervised\nlearning on non-semantic vision tasks that are more spatial (3D) and temporal\n(+1D = 4D), such as camera pose estimation, point and object tracking, and\ndepth estimation. 
We show that by learning from very large video datasets,\nmasked auto-encoding (MAE) with transformer video models actually scales,\nconsistently improving performance on these 4D tasks, as model size increases\nfrom 20M all the way to the largest by far reported self-supervised video model\n$\\unicode{x2013}$ 22B parameters. Rigorous apples-to-apples comparison with\nmany recent image and video models demonstrates the benefits of scaling 4D\nrepresentations.\n","authors":["João Carreira","Dilara Gokay","Michael King","Chuhan Zhang","Ignacio Rocco","Aravindh Mahendran","Thomas Albert Keck","Joseph Heyward","Skanda Koppula","Etienne Pot","Goker Erdogan","Yana Hasson","Yi Yang","Klaus Greff","Guillaume Le Moing","Sjoerd van Steenkiste","Daniel Zoran","Drew A. Hudson","Pedro Vélez","Luisa Polanía","Luke Friedman","Chris Duvarney","Ross Goroshin","Kelsey Allen","Jacob Walker","Rishabh Kabra","Eric Aboussouan","Jennifer Sun","Thomas Kipf","Carl Doersch","Viorica Pătrăucean","Dima Damen","Pauline Luc","Mehdi S. M. Sajjadi","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2412.15212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15209v1","updated":"2024-12-19T18:59:44Z","published":"2024-12-19T18:59:44Z","title":"PRIMA: Multi-Image Vision-Language Models for Reasoning Segmentation","summary":" Despite significant advancements in Large Vision-Language Models (LVLMs),\nexisting pixel-grounding models operate on single-image settings, limiting\ntheir ability to perform detailed, fine-grained comparisons across multiple\nimages. Conversely, current multi-image understanding models lack pixel-level\ngrounding. Our work addresses this gap by introducing the task of multi-image\npixel-grounded reasoning segmentation, and PRIMA, a novel LVLM that integrates\npixel-level grounding with robust multi-image reasoning capabilities to produce\ncontextually rich, pixel-grounded explanations. Central to PRIMA is an\nefficient vision module that queries fine-grained visual representations across\nmultiple images, reducing TFLOPs by $25.3\\%$. To support training and\nevaluation, we curate $M^4Seg$, a new reasoning segmentation benchmark\nconsisting of $\\sim$224K question-answer pairs that require fine-grained visual\nunderstanding across multiple images. Experimental results demonstrate PRIMA\noutperforms state-of-the-art baselines.\n","authors":["Muntasir Wahed","Kiet A. Nguyen","Adheesh Sunil Juvekar","Xinzhuo Li","Xiaona Zhou","Vedant Shah","Tianjiao Yu","Pinar Yanardag","Ismini Lourentzou"],"pdf_url":"https://arxiv.org/pdf/2412.15209v1.pdf","comment":"Project page: https://plan-lab.github.io/prima"},{"id":"http://arxiv.org/abs/2412.15208v1","updated":"2024-12-19T18:59:40Z","published":"2024-12-19T18:59:40Z","title":"OpenEMMA: Open-Source Multimodal Model for End-to-End Autonomous Driving","summary":" Since the advent of Multimodal Large Language Models (MLLMs), they have made\na significant impact across a wide range of real-world applications,\nparticularly in Autonomous Driving (AD). Their ability to process complex\nvisual data and reason about intricate driving scenarios has paved the way for\na new paradigm in end-to-end AD systems. However, the progress of developing\nend-to-end models for AD has been slow, as existing fine-tuning methods demand\nsubstantial resources, including extensive computational power, large-scale\ndatasets, and significant funding. 
Drawing inspiration from recent advancements\nin inference computing, we propose OpenEMMA, an open-source end-to-end\nframework based on MLLMs. By incorporating the Chain-of-Thought reasoning\nprocess, OpenEMMA achieves significant improvements compared to the baseline\nwhen leveraging a diverse range of MLLMs. Furthermore, OpenEMMA demonstrates\neffectiveness, generalizability, and robustness across a variety of challenging\ndriving scenarios, offering a more efficient and effective approach to\nautonomous driving. We release all the codes in\nhttps://github.com/taco-group/OpenEMMA.\n","authors":["Shuo Xing","Chengyuan Qian","Yuping Wang","Hongyuan Hua","Kexin Tian","Yang Zhou","Zhengzhong Tu"],"pdf_url":"https://arxiv.org/pdf/2412.15208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15206v1","updated":"2024-12-19T18:59:33Z","published":"2024-12-19T18:59:33Z","title":"AutoTrust: Benchmarking Trustworthiness in Large Vision Language Models\n for Autonomous Driving","summary":" Recent advancements in large vision language models (VLMs) tailored for\nautonomous driving (AD) have shown strong scene understanding and reasoning\ncapabilities, making them undeniable candidates for end-to-end driving systems.\nHowever, limited work exists on studying the trustworthiness of DriveVLMs -- a\ncritical factor that directly impacts public transportation safety. In this\npaper, we introduce AutoTrust, a comprehensive trustworthiness benchmark for\nlarge vision-language models in autonomous driving (DriveVLMs), considering\ndiverse perspectives -- including trustfulness, safety, robustness, privacy,\nand fairness. We constructed the largest visual question-answering dataset for\ninvestigating trustworthiness issues in driving scenarios, comprising over 10k\nunique scenes and 18k queries. We evaluated six publicly available VLMs,\nspanning from generalist to specialist, from open-source to commercial models.\nOur exhaustive evaluations have unveiled previously undiscovered\nvulnerabilities of DriveVLMs to trustworthiness threats. Specifically, we found\nthat the general VLMs like LLaVA-v1.6 and GPT-4o-mini surprisingly outperform\nspecialized models fine-tuned for driving in terms of overall trustworthiness.\nDriveVLMs like DriveLM-Agent are particularly vulnerable to disclosing\nsensitive information. Additionally, both generalist and specialist VLMs remain\nsusceptible to adversarial attacks and struggle to ensure unbiased\ndecision-making across diverse environments and populations. Our findings call\nfor immediate and decisive action to address the trustworthiness of DriveVLMs\n-- an issue of critical importance to public safety and the welfare of all\ncitizens relying on autonomous transportation systems. 
Our benchmark is\npublicly available at \\url{https://github.com/taco-group/AutoTrust}, and the\nleaderboard is released at \\url{https://taco-group.github.io/AutoTrust/}.\n","authors":["Shuo Xing","Hongyuan Hua","Xiangbo Gao","Shenzhe Zhu","Renjie Li","Kexin Tian","Xiaopeng Li","Heng Huang","Tianbao Yang","Zhangyang Wang","Yang Zhou","Huaxiu Yao","Zhengzhong Tu"],"pdf_url":"https://arxiv.org/pdf/2412.15206v1.pdf","comment":"55 pages, 14 figures"},{"id":"http://arxiv.org/abs/2412.15205v1","updated":"2024-12-19T18:59:31Z","published":"2024-12-19T18:59:31Z","title":"FlowAR: Scale-wise Autoregressive Image Generation Meets Flow Matching","summary":" Autoregressive (AR) modeling has achieved remarkable success in natural\nlanguage processing by enabling models to generate text with coherence and\ncontextual understanding through next token prediction. Recently, in image\ngeneration, VAR proposes scale-wise autoregressive modeling, which extends the\nnext token prediction to the next scale prediction, preserving the 2D structure\nof images. However, VAR encounters two primary challenges: (1) its complex and\nrigid scale design limits generalization in next scale prediction, and (2) the\ngenerator's dependence on a discrete tokenizer with the same complex scale\nstructure restricts modularity and flexibility in updating the tokenizer. To\naddress these limitations, we introduce FlowAR, a general next scale prediction\nmethod featuring a streamlined scale design, where each subsequent scale is\nsimply double the previous one. This eliminates the need for VAR's intricate\nmulti-scale residual tokenizer and enables the use of any off-the-shelf\nVariational AutoEncoder (VAE). Our simplified design enhances generalization in\nnext scale prediction and facilitates the integration of Flow Matching for\nhigh-quality image synthesis. We validate the effectiveness of FlowAR on the\nchallenging ImageNet-256 benchmark, demonstrating superior generation\nperformance compared to previous methods. Codes will be available at\n\\url{https://github.com/OliverRensu/FlowAR}.\n","authors":["Sucheng Ren","Qihang Yu","Ju He","Xiaohui Shen","Alan Yuille","Liang-Chieh Chen"],"pdf_url":"https://arxiv.org/pdf/2412.15205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15200v1","updated":"2024-12-19T18:58:46Z","published":"2024-12-19T18:58:46Z","title":"DI-PCG: Diffusion-based Efficient Inverse Procedural Content Generation\n for High-quality 3D Asset Creation","summary":" Procedural Content Generation (PCG) is powerful in creating high-quality 3D\ncontents, yet controlling it to produce desired shapes is difficult and often\nrequires extensive parameter tuning. Inverse Procedural Content Generation aims\nto automatically find the best parameters under the input condition. However,\nexisting sampling-based and neural network-based methods still suffer from\nnumerous sample iterations or limited controllability. In this work, we present\nDI-PCG, a novel and efficient method for Inverse PCG from general image\nconditions. At its core is a lightweight diffusion transformer model, where PCG\nparameters are directly treated as the denoising target and the observed images\nas conditions to control parameter generation. DI-PCG is efficient and\neffective. With only 7.6M network parameters and 30 GPU hours to train, it\ndemonstrates superior performance in recovering parameters accurately, and\ngeneralizing well to in-the-wild images. 
Quantitative and qualitative\nexperiment results validate the effectiveness of DI-PCG in inverse PCG and\nimage-to-3D generation tasks. DI-PCG offers a promising approach for efficient\ninverse PCG and represents a valuable exploration step towards a 3D generation\npath that models how to construct a 3D asset using parametric models.\n","authors":["Wang Zhao","Yan-Pei Cao","Jiale Xu","Yuejiang Dong","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2412.15200v1.pdf","comment":"Project page: https://thuzhaowang.github.io/projects/DI-PCG/"},{"id":"http://arxiv.org/abs/2412.15199v1","updated":"2024-12-19T18:58:36Z","published":"2024-12-19T18:58:36Z","title":"LiDAR-RT: Gaussian-based Ray Tracing for Dynamic LiDAR Re-simulation","summary":" This paper targets the challenge of real-time LiDAR re-simulation in dynamic\ndriving scenarios. Recent approaches utilize neural radiance fields combined\nwith the physical modeling of LiDAR sensors to achieve high-fidelity\nre-simulation results. Unfortunately, these methods face limitations due to\nhigh computational demands in large-scale scenes and cannot perform real-time\nLiDAR rendering. To overcome these constraints, we propose LiDAR-RT, a novel\nframework that supports real-time, physically accurate LiDAR re-simulation for\ndriving scenes. Our primary contribution is the development of an efficient and\neffective rendering pipeline, which integrates Gaussian primitives and\nhardware-accelerated ray tracing technology. Specifically, we model the\nphysical properties of LiDAR sensors using Gaussian primitives with learnable\nparameters and incorporate scene graphs to handle scene dynamics. Building upon\nthis scene representation, our framework first constructs a bounding volume\nhierarchy (BVH), then casts rays for each pixel and generates novel LiDAR views\nthrough a differentiable rendering algorithm. Importantly, our framework\nsupports realistic rendering with flexible scene editing operations and various\nsensor configurations. Extensive experiments across multiple public benchmarks\ndemonstrate that our method outperforms state-of-the-art methods in terms of\nrendering quality and efficiency. Our project page is at\nhttps://zju3dv.github.io/lidar-rt.\n","authors":["Chenxu Zhou","Lvchang Fu","Sida Peng","Yunzhi Yan","Zhanhua Zhang","Yong Chen","Jiazhi Xia","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.15199v1.pdf","comment":"Project page: https://zju3dv.github.io/lidar-rt"},{"id":"http://arxiv.org/abs/2412.15195v1","updated":"2024-12-19T18:58:14Z","published":"2024-12-19T18:58:14Z","title":"Preventing Local Pitfalls in Vector Quantization via Optimal Transport","summary":" Vector-quantized networks (VQNs) have exhibited remarkable performance across\nvarious tasks, yet they are prone to training instability, which complicates\nthe training process due to the necessity for techniques such as subtle\ninitialization and model distillation. In this study, we identify the local\nminima issue as the primary cause of this instability. To address this, we\nintegrate an optimal transport method in place of the nearest neighbor search\nto achieve a more globally informed assignment. We introduce OptVQ, a novel\nvector quantization method that employs the Sinkhorn algorithm to optimize the\noptimal transport problem, thereby enhancing the stability and efficiency of\nthe training process. To mitigate the influence of diverse data distributions\non the Sinkhorn algorithm, we implement a straightforward yet effective\nnormalization strategy. 
Our comprehensive experiments on image reconstruction\ntasks demonstrate that OptVQ achieves 100% codebook utilization and surpasses\ncurrent state-of-the-art VQNs in reconstruction quality.\n","authors":["Borui Zhang","Wenzhao Zheng","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2412.15195v1.pdf","comment":"Code is available at https://github.com/zbr17/OptVQ"},{"id":"http://arxiv.org/abs/2412.15191v1","updated":"2024-12-19T18:57:21Z","published":"2024-12-19T18:57:21Z","title":"AV-Link: Temporally-Aligned Diffusion Features for Cross-Modal\n Audio-Video Generation","summary":" We propose AV-Link, a unified framework for Video-to-Audio and Audio-to-Video\ngeneration that leverages the activations of frozen video and audio diffusion\nmodels for temporally-aligned cross-modal conditioning. The key to our\nframework is a Fusion Block that enables bidirectional information exchange\nbetween our backbone video and audio diffusion models through a\ntemporally-aligned self attention operation. Unlike prior work that uses\nfeature extractors pretrained for other tasks for the conditioning signal,\nAV-Link can directly leverage features obtained by the complementary modality\nin a single framework i.e. video features to generate audio, or audio features\nto generate video. We extensively evaluate our design choices and demonstrate\nthe ability of our method to achieve synchronized and high-quality audiovisual\ncontent, showcasing its potential for applications in immersive media\ngeneration. Project Page: snap-research.github.io/AVLink/\n","authors":["Moayed Haji-Ali","Willi Menapace","Aliaksandr Siarohin","Ivan Skorokhodov","Alper Canberk","Kwot Sin Lee","Vicente Ordonez","Sergey Tulyakov"],"pdf_url":"https://arxiv.org/pdf/2412.15191v1.pdf","comment":"Project Page: snap-research.github.io/AVLink/"},{"id":"http://arxiv.org/abs/2412.15190v1","updated":"2024-12-19T18:57:13Z","published":"2024-12-19T18:57:13Z","title":"EarthDial: Turning Multi-sensory Earth Observations to Interactive\n Dialogues","summary":" Automated analysis of vast Earth observation data via interactive\nVision-Language Models (VLMs) can unlock new opportunities for environmental\nmonitoring, disaster response, and resource management. Existing generic VLMs\ndo not perform well on Remote Sensing data, while the recent Geo-spatial VLMs\nremain restricted to a fixed resolution and few sensor modalities. In this\npaper, we introduce EarthDial, a conversational assistant specifically designed\nfor Earth Observation (EO) data, transforming complex, multi-sensory Earth\nobservations into interactive, natural language dialogues. EarthDial supports\nmulti-spectral, multi-temporal, and multi-resolution imagery, enabling a wide\nrange of remote sensing tasks, including classification, detection, captioning,\nquestion answering, visual reasoning, and visual grounding. To achieve this, we\nintroduce an extensive instruction tuning dataset comprising over 11.11M\ninstruction pairs covering RGB, Synthetic Aperture Radar (SAR), and\nmultispectral modalities such as Near-Infrared (NIR) and infrared. Furthermore,\nEarthDial handles bi-temporal and multi-temporal sequence analysis for\napplications like change detection. 
Our extensive experimental results on 37\ndownstream applications demonstrate that EarthDial outperforms existing generic\nand domain-specific models, achieving better generalization across various EO\ntasks.\n","authors":["Sagar Soni","Akshay Dudhane","Hiyam Debary","Mustansar Fiaz","Muhammad Akhtar Munir","Muhammad Sohail Danish","Paolo Fraccaro","Campbell D Watson","Levente J Klein","Fahad Shahbaz Khan","Salman Khan"],"pdf_url":"https://arxiv.org/pdf/2412.15190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15188v1","updated":"2024-12-19T18:56:24Z","published":"2024-12-19T18:56:24Z","title":"LlamaFusion: Adapting Pretrained Language Models for Multimodal\n Generation","summary":" We present LlamaFusion, a framework for empowering pretrained text-only large\nlanguage models (LLMs) with multimodal generative capabilities, enabling them\nto understand and generate both text and images in arbitrary sequences.\nLlamaFusion leverages existing Llama-3's weights for processing texts\nautoregressively while introducing additional and parallel transformer modules\nfor processing images with diffusion. During training, the data from each\nmodality is routed to its dedicated modules: modality-specific feedforward\nlayers, query-key-value projections, and normalization layers process each\nmodality independently, while the shared self-attention layers allow\ninteractions across text and image features. By freezing the text-specific\nmodules and only training the image-specific modules, LlamaFusion preserves the\nlanguage capabilities of text-only LLMs while developing strong visual\nunderstanding and generation abilities. Compared to methods that pretrain\nmultimodal generative models from scratch, our experiments demonstrate that,\nLlamaFusion improves image understanding by 20% and image generation by 3.6%\nusing only 50% of the FLOPs while maintaining Llama-3's language capabilities.\nWe also demonstrate that this framework can adapt existing vision-language\nmodels with multimodal generation ability. Overall, this framework not only\nleverages existing computational investments in text-only LLMs but also enables\nthe parallel development of language and vision capabilities, presenting a\npromising direction for efficient multimodal model development.\n","authors":["Weijia Shi","Xiaochuang Han","Chunting Zhou","Weixin Liang","Xi Victoria Lin","Luke Zettlemoyer","Lili Yu"],"pdf_url":"https://arxiv.org/pdf/2412.15188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15185v1","updated":"2024-12-19T18:55:25Z","published":"2024-12-19T18:55:25Z","title":"Tiled Diffusion","summary":" Image tiling -- the seamless connection of disparate images to create a\ncoherent visual field -- is crucial for applications such as texture creation,\nvideo game asset development, and digital art. Traditionally, tiles have been\nconstructed manually, a method that poses significant limitations in\nscalability and flexibility. Recent research has attempted to automate this\nprocess using generative models. However, current approaches primarily focus on\ntiling textures and manipulating models for single-image generation, without\ninherently supporting the creation of multiple interconnected tiles across\ndiverse domains. This paper presents Tiled Diffusion, a novel approach that\nextends the capabilities of diffusion models to accommodate the generation of\ncohesive tiling patterns across various domains of image synthesis that require\ntiling. 
Our method supports a wide range of tiling scenarios, from self-tiling\nto complex many-to-many connections, enabling seamless integration of multiple\nimages. Tiled Diffusion automates the tiling process, eliminating the need for\nmanual intervention and enhancing creative possibilities in various\napplications, such as seamlessly tiling of existing images, tiled texture\ncreation, and 360{\\deg} synthesis.\n","authors":["Or Madar","Ohad Fried"],"pdf_url":"https://arxiv.org/pdf/2412.15185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07449v2","updated":"2024-12-19T18:51:28Z","published":"2024-11-12T00:20:11Z","title":"Tracing the Roots: Leveraging Temporal Dynamics in Diffusion\n Trajectories for Origin Attribution","summary":" Diffusion models have revolutionized image synthesis, garnering significant\nresearch interest in recent years. Diffusion is an iterative algorithm in which\nsamples are generated step-by-step, starting from pure noise. This process\nintroduces the notion of diffusion trajectories, i.e., paths from the standard\nGaussian distribution to the target image distribution. In this context, we\nstudy discriminative algorithms operating on these trajectories. Specifically,\ngiven a pre-trained diffusion model, we consider the problem of classifying\nimages as part of the training dataset, generated by the model or originating\nfrom an external source. Our approach demonstrates the presence of patterns\nacross steps that can be leveraged for classification. We also conduct ablation\nstudies, which reveal that using higher-order gradient features to characterize\nthe trajectories leads to significant performance gains and more robust\nalgorithms.\n","authors":["Andreas Floros","Seyed-Mohsen Moosavi-Dezfooli","Pier Luigi Dragotti"],"pdf_url":"https://arxiv.org/pdf/2411.07449v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15171v1","updated":"2024-12-19T18:46:55Z","published":"2024-12-19T18:46:55Z","title":"SqueezeMe: Efficient Gaussian Avatars for VR","summary":" Gaussian Splatting has enabled real-time 3D human avatars with unprecedented\nlevels of visual quality. While previous methods require a desktop GPU for\nreal-time inference of a single avatar, we aim to squeeze multiple Gaussian\navatars onto a portable virtual reality headset with real-time drivable\ninference. We begin by training a previous work, Animatable Gaussians, on a\nhigh quality dataset captured with 512 cameras. The Gaussians are animated by\ncontrolling base set of Gaussians with linear blend skinning (LBS) motion and\nthen further adjusting the Gaussians with a neural network decoder to correct\ntheir appearance. When deploying the model on a Meta Quest 3 VR headset, we\nfind two major computational bottlenecks: the decoder and the rendering. To\naccelerate the decoder, we train the Gaussians in UV-space instead of\npixel-space, and we distill the decoder to a single neural network layer.\nFurther, we discover that neighborhoods of Gaussians can share a single\ncorrective from the decoder, which provides an additional speedup. To\naccelerate the rendering, we develop a custom pipeline in Vulkan that runs on\nthe mobile GPU. Putting it all together, we run 3 Gaussian avatars concurrently\nat 72 FPS on a VR headset. 
Demo videos are at\nhttps://forresti.github.io/squeezeme.\n","authors":["Shunsuke Saito","Stanislav Pidhorskyi","Igor Santesteban","Forrest Iandola","Divam Gupta","Anuj Pahuja","Nemanja Bartolovic","Frank Yu","Emanuel Garbin","Tomas Simon"],"pdf_url":"https://arxiv.org/pdf/2412.15171v1.pdf","comment":"Initial version"},{"id":"http://arxiv.org/abs/2412.15159v1","updated":"2024-12-19T18:34:50Z","published":"2024-12-19T18:34:50Z","title":"OnlineVPO: Align Video Diffusion Model with Online Video-Centric\n Preference Optimization","summary":" In recent years, the field of text-to-video (T2V) generation has made\nsignificant strides. Despite this progress, there is still a gap between\ntheoretical advancements and practical application, amplified by issues like\ndegraded image quality and flickering artifacts. Recent advancements in\nenhancing the video diffusion model (VDM) through feedback learning have shown\npromising results. However, these methods still exhibit notable limitations,\nsuch as misaligned feedback and inferior scalability. To tackle these issues,\nwe introduce OnlineVPO, a more efficient preference learning approach tailored\nspecifically for video diffusion models. Our method features two novel designs,\nfirstly, instead of directly using image-based reward feedback, we leverage the\nvideo quality assessment (VQA) model trained on synthetic data as the reward\nmodel to provide distribution and modality-aligned feedback on the video\ndiffusion model. Additionally, we introduce an online DPO algorithm to address\nthe off-policy optimization and scalability issue in existing video preference\nlearning frameworks. By employing the video reward model to offer concise video\nfeedback on the fly, OnlineVPO offers effective and efficient preference\nguidance. Extensive experiments on the open-source video-diffusion model\ndemonstrate OnlineVPO as a simple yet effective and more importantly scalable\npreference learning algorithm for video diffusion models, offering valuable\ninsights for future advancements in this domain.\n","authors":["Jiacheng Zhang","Jie Wu","Weifeng Chen","Yatai Ji","Xuefeng Xiao","Weilin Huang","Kai Han"],"pdf_url":"https://arxiv.org/pdf/2412.15159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15156v1","updated":"2024-12-19T18:32:21Z","published":"2024-12-19T18:32:21Z","title":"Prompt-A-Video: Prompt Your Video Diffusion Model via Preference-Aligned\n LLM","summary":" Text-to-video models have made remarkable advancements through optimization\non high-quality text-video pairs, where the textual prompts play a pivotal role\nin determining quality of output videos. However, achieving the desired output\noften entails multiple revisions and iterative inference to refine\nuser-provided prompts. Current automatic methods for refining prompts encounter\nchallenges such as Modality-Inconsistency, Cost-Discrepancy, and Model-Unaware\nwhen applied to text-to-video diffusion models. To address these problem, we\nintroduce an LLM-based prompt adaptation framework, termed as Prompt-A-Video,\nwhich excels in crafting Video-Centric, Labor-Free and Preference-Aligned\nprompts tailored to specific video diffusion model. Our approach involves a\nmeticulously crafted two-stage optimization and alignment system. Initially, we\nconduct a reward-guided prompt evolution pipeline to automatically create\noptimal prompts pool and leverage them for supervised fine-tuning (SFT) of the\nLLM. 
Then multi-dimensional rewards are employed to generate pairwise data for\nthe SFT model, followed by the direct preference optimization (DPO) algorithm\nto further facilitate preference alignment. Through extensive experimentation\nand comparative analyses, we validate the effectiveness of Prompt-A-Video\nacross diverse generation models, highlighting its potential to push the\nboundaries of video generation.\n","authors":["Yatai Ji","Jiacheng Zhang","Jie Wu","Shilong Zhang","Shoufa Chen","Chongjian GE","Peize Sun","Weifeng Chen","Wenqi Shao","Xuefeng Xiao","Weilin Huang","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2412.15156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15150v1","updated":"2024-12-19T18:28:37Z","published":"2024-12-19T18:28:37Z","title":"Leveraging Color Channel Independence for Improved Unsupervised Object\n Detection","summary":" Object-centric architectures can learn to extract distinct object\nrepresentations from visual scenes, enabling downstream applications on the\nobject level. Similarly to autoencoder-based image models, object-centric\napproaches have been trained on the unsupervised reconstruction loss of images\nencoded by RGB color spaces. In our work, we challenge the common assumption\nthat RGB images are the optimal color space for unsupervised learning in\ncomputer vision. We discuss conceptually and empirically that other color\nspaces, such as HSV, bear essential characteristics for object-centric\nrepresentation learning, like robustness to lighting conditions. We further\nshow that models improve when requiring them to predict additional color\nchannels. Specifically, we propose to transform the predicted targets to the\nRGB-S space, which extends RGB with HSV's saturation component and leads to\nmarkedly better reconstruction and disentanglement for five common evaluation\ndatasets. The use of composite color spaces can be implemented with basically\nno computational overhead, is agnostic of the models' architecture, and is\nuniversally applicable across a wide range of visual computing tasks and\ntraining types. The findings of our approach encourage additional\ninvestigations in computer vision tasks beyond object-centric learning.\n","authors":["Bastian Jäckl","Yannick Metz","Udo Schlegel","Daniel A. Keim","Maximilian T. Fischer"],"pdf_url":"https://arxiv.org/pdf/2412.15150v1.pdf","comment":"38 pages incl. references, 16 figures"},{"id":"http://arxiv.org/abs/2412.15129v1","updated":"2024-12-19T18:09:42Z","published":"2024-12-19T18:09:42Z","title":"Jet: A Modern Transformer-Based Normalizing Flow","summary":" In the past, normalizing generative flows have emerged as a promising class\nof generative models for natural images. This type of model has many modeling\nadvantages: the ability to efficiently compute log-likelihood of the input\ndata, fast generation and simple overall structure. Normalizing flows remained\na topic of active research but later fell out of favor, as visual quality of\nthe samples was not competitive with other model classes, such as GANs,\nVQ-VAE-based approaches or diffusion models. In this paper we revisit the\ndesign of the coupling-based normalizing flow models by carefully ablating\nprior design choices and using computational blocks based on the Vision\nTransformer architecture, not convolutional neural networks. As a result, we\nachieve state-of-the-art quantitative and qualitative performance with a much\nsimpler architecture. 
While the overall visual quality is still behind the\ncurrent state-of-the-art models, we argue that strong normalizing flow models\ncan help advancing research frontier by serving as building components of more\npowerful generative models.\n","authors":["Alexander Kolesnikov","André Susano Pinto","Michael Tschannen"],"pdf_url":"https://arxiv.org/pdf/2412.15129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15119v1","updated":"2024-12-19T17:59:54Z","published":"2024-12-19T17:59:54Z","title":"Parallelized Autoregressive Visual Generation","summary":" Autoregressive models have emerged as a powerful approach for visual\ngeneration but suffer from slow inference speed due to their sequential\ntoken-by-token prediction process. In this paper, we propose a simple yet\neffective approach for parallelized autoregressive visual generation that\nimproves generation efficiency while preserving the advantages of\nautoregressive modeling. Our key insight is that parallel generation depends on\nvisual token dependencies-tokens with weak dependencies can be generated in\nparallel, while strongly dependent adjacent tokens are difficult to generate\ntogether, as their independent sampling may lead to inconsistencies. Based on\nthis observation, we develop a parallel generation strategy that generates\ndistant tokens with weak dependencies in parallel while maintaining sequential\ngeneration for strongly dependent local tokens. Our approach can be seamlessly\nintegrated into standard autoregressive models without modifying the\narchitecture or tokenizer. Experiments on ImageNet and UCF-101 demonstrate that\nour method achieves a 3.6x speedup with comparable quality and up to 9.5x\nspeedup with minimal quality degradation across both image and video generation\ntasks. We hope this work will inspire future research in efficient visual\ngeneration and unified autoregressive modeling. Project page:\nhttps://epiphqny.github.io/PAR-project.\n","authors":["Yuqing Wang","Shuhuai Ren","Zhijie Lin","Yujin Han","Haoyuan Guo","Zhenheng Yang","Difan Zou","Jiashi Feng","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2412.15119v1.pdf","comment":"Project page: https://epiphqny.github.io/PAR-project"},{"id":"http://arxiv.org/abs/2412.11917v3","updated":"2024-12-19T17:57:59Z","published":"2024-12-16T16:01:18Z","title":"Does VLM Classification Benefit from LLM Description Semantics?","summary":" Accurately describing images with text is a foundation of explainable AI.\nVision-Language Models (VLMs) like CLIP have recently addressed this by\naligning images and texts in a shared embedding space, expressing semantic\nsimilarities between vision and language embeddings. VLM classification can be\nimproved with descriptions generated by Large Language Models (LLMs). However,\nit is difficult to determine the contribution of actual description semantics,\nas the performance gain may also stem from a semantic-agnostic ensembling\neffect, where multiple modified text prompts act as a noisy test-time\naugmentation for the original one. We propose an alternative evaluation\nscenario to decide if a performance boost of LLM-generated descriptions is\ncaused by such a noise augmentation effect or rather by genuine description\nsemantics. The proposed scenario avoids noisy test-time augmentation and\nensures that genuine, distinctive descriptions cause the performance boost.\nFurthermore, we propose a training-free method for selecting discriminative\ndescriptions that work independently of classname-ensembling effects. 
Our\napproach identifies descriptions that effectively differentiate classes within\na local CLIP label neighborhood, improving classification accuracy across seven\ndatasets. Additionally, we provide insights into the explainability of\ndescription-based image classification with VLMs.\n","authors":["Pingchuan Ma","Lennart Rietdorf","Dmytro Kotovenko","Vincent Tao Hu","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2412.11917v3.pdf","comment":"AAAI-25 (extended version), Code: https://github.com/CompVis/DisCLIP"},{"id":"http://arxiv.org/abs/2412.15106v1","updated":"2024-12-19T17:51:49Z","published":"2024-12-19T17:51:49Z","title":"Knowing Where to Focus: Attention-Guided Alignment for Text-based Person\n Search","summary":" In the realm of Text-Based Person Search (TBPS), mainstream methods aim to\nexplore more efficient interaction frameworks between text descriptions and\nvisual data. However, recent approaches encounter two principal challenges.\nFirstly, the widely used random-based Masked Language Modeling (MLM) considers\nall the words in the text equally during training. However, massive\nsemantically vacuous words ('with', 'the', etc.) be masked fail to contribute\nefficient interaction in the cross-modal MLM and hampers the representation\nalignment. Secondly, manual descriptions in TBPS datasets are tedious and\ninevitably contain several inaccuracies. To address these issues, we introduce\nan Attention-Guided Alignment (AGA) framework featuring two innovative\ncomponents: Attention-Guided Mask (AGM) Modeling and Text Enrichment Module\n(TEM). AGM dynamically masks semantically meaningful words by aggregating the\nattention weight derived from the text encoding process, thereby cross-modal\nMLM can capture information related to the masked word from text context and\nimages and align their representations. Meanwhile, TEM alleviates low-quality\nrepresentations caused by repetitive and erroneous text descriptions by\nreplacing those semantically meaningful words with MLM's prediction. It not\nonly enriches text descriptions but also prevents overfitting. Extensive\nexperiments across three challenging benchmarks demonstrate the effectiveness\nof our AGA, achieving new state-of-the-art results with Rank-1 accuracy\nreaching 78.36%, 67.31%, and 67.4% on CUHK-PEDES, ICFG-PEDES, and RSTPReid,\nrespectively.\n","authors":["Lei Tan","Weihao Li","Pingyang Dai","Jie Chen","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2412.15106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13788v2","updated":"2024-12-19T17:51:42Z","published":"2024-03-20T17:51:53Z","title":"DepthFM: Fast Monocular Depth Estimation with Flow Matching","summary":" Current discriminative depth estimation methods often produce blurry\nartifacts, while generative approaches suffer from slow sampling due to\ncurvatures in the noise-to-depth transport. Our method addresses these\nchallenges by framing depth estimation as a direct transport between image and\ndepth distributions. We are the first to explore flow matching in this field,\nand we demonstrate that its interpolation trajectories enhance both training\nand sampling efficiency while preserving high performance. While generative\nmodels typically require extensive training data, we mitigate this dependency\nby integrating external knowledge from a pre-trained image diffusion model,\nenabling effective transfer even across differing objectives. 
To further boost\nour model performance, we employ synthetic data and utilize image-depth pairs\ngenerated by a discriminative model on an in-the-wild image dataset. As a\ngenerative model, our model can reliably estimate depth confidence, which\nprovides an additional advantage. Our approach achieves competitive zero-shot\nperformance on standard benchmarks of complex natural scenes while improving\nsampling efficiency and only requiring minimal synthetic data for training.\n","authors":["Ming Gui","Johannes Schusterbauer","Ulrich Prestel","Pingchuan Ma","Dmytro Kotovenko","Olga Grebenkova","Stefan Andreas Baumann","Vincent Tao Hu","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2403.13788v2.pdf","comment":"AAAI 2025, Project Page: https://github.com/CompVis/depth-fm"},{"id":"http://arxiv.org/abs/2412.15095v1","updated":"2024-12-19T17:45:08Z","published":"2024-12-19T17:45:08Z","title":"A Full Transformer-based Framework for Automatic Pain Estimation using\n Videos","summary":" The automatic estimation of pain is essential in designing an optimal pain\nmanagement system offering reliable assessment and reducing the suffering of\npatients. In this study, we present a novel full transformer-based framework\nconsisting of a Transformer in Transformer (TNT) model and a Transformer\nleveraging cross-attention and self-attention blocks. Elaborating on videos\nfrom the BioVid database, we demonstrate state-of-the-art performances, showing\nthe efficacy, efficiency, and generalization capability across all the primary\npain estimation tasks.\n","authors":["Stefanos Gkikas","Manolis Tsiknakis"],"pdf_url":"https://arxiv.org/pdf/2412.15095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15077v1","updated":"2024-12-19T17:26:07Z","published":"2024-12-19T17:26:07Z","title":"Till the Layers Collapse: Compressing a Deep Neural Network through the\n Lenses of Batch Normalization Layers","summary":" Today, deep neural networks are widely used since they can handle a variety\nof complex tasks. Their generality makes them very powerful tools in modern\ntechnology. However, deep neural networks are often overparameterized. The\nusage of these large models consumes a lot of computation resources. In this\npaper, we introduce a method called \\textbf{T}ill the \\textbf{L}ayers\n\\textbf{C}ollapse (TLC), which compresses deep neural networks through the\nlenses of batch normalization layers. By reducing the depth of these networks,\nour method decreases deep neural networks' computational requirements and\noverall latency. We validate our method on popular models such as Swin-T,\nMobileNet-V2, and RoBERTa, across both image classification and natural\nlanguage processing (NLP) tasks.\n","authors":["Zhu Liao","Nour Hezbri","Victor Quétu","Van-Tam Nguyen","Enzo Tartaglione"],"pdf_url":"https://arxiv.org/pdf/2412.15077v1.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2412.15058v1","updated":"2024-12-19T17:06:53Z","published":"2024-12-19T17:06:53Z","title":"MultiverSeg: Scalable Interactive Segmentation of Biomedical Imaging\n Datasets with In-Context Guidance","summary":" Medical researchers and clinicians often need to perform novel segmentation\ntasks on a set of related images. Existing methods for segmenting a new dataset\nare either interactive, requiring substantial human effort for each image, or\nrequire an existing set of manually labeled images. 
We introduce a system,\nMultiverSeg, that enables practitioners to rapidly segment an entire new\ndataset without requiring access to any existing labeled data from that task or\ndomain. Along with the image to segment, the model takes user interactions such\nas clicks, bounding boxes or scribbles as input, and predicts a segmentation.\nAs the user segments more images, those images and segmentations become\nadditional inputs to the model, providing context. As the context set of\nlabeled images grows, the number of interactions required to segment each new\nimage decreases. We demonstrate that MultiverSeg enables users to interactively\nsegment new datasets efficiently, by amortizing the number of interactions per\nimage to achieve an accurate segmentation. Compared to using a state-of-the-art\ninteractive segmentation method, using MultiverSeg reduced the total number of\nscribble steps by 53% and clicks by 36% to achieve 90% Dice on sets of images\nfrom unseen tasks. We release code and model weights at\nhttps://multiverseg.csail.mit.edu\n","authors":["Hallee E. Wong","Jose Javier Gonzalez Ortiz","John Guttag","Adrian V. Dalca"],"pdf_url":"https://arxiv.org/pdf/2412.15058v1.pdf","comment":"Project Website: https://multiverseg.csail.mit.edu Keywords:\n interactive segmentation, in-context learning, medical image analysis,\n biomedical imaging, image annotation, visual prompting"},{"id":"http://arxiv.org/abs/2412.15054v1","updated":"2024-12-19T17:02:03Z","published":"2024-12-19T17:02:03Z","title":"GIRAFE: Glottal Imaging Dataset for Advanced Segmentation, Analysis, and\n Facilitative Playbacks Evaluation","summary":" The advances in the development of Facilitative Playbacks extracted from\nHigh-Speed videoendoscopic sequences of the vocal folds are hindered by a\nnotable lack of publicly available datasets annotated with the semantic\nsegmentations corresponding to the area of the glottal gap. This fact also\nlimits the reproducibility and further exploration of existing research in this\nfield.\n To address this gap, GIRAFE is a data repository designed to facilitate the\ndevelopment of advanced techniques for the semantic segmentation, analysis, and\nfast evaluation of High-Speed videoendoscopic sequences of the vocal folds. The\nrepository includes 65 high-speed videoendoscopic recordings from a cohort of\n50 patients (30 female, 20 male). The dataset comprises 15 recordings from\nhealthy controls, 26 from patients with diagnosed voice disorders, and 24 with\nan unknown health condition. All of them were manually annotated by an expert,\nincluding the masks corresponding to the semantic segmentation of the glottal\ngap. The repository is also complemented with the automatic segmentation of the\nglottal area using different state-of-the-art approaches.\n This data set has already supported several studies, which demonstrates its\nusefulness for the development of new glottal gap segmentation algorithms from\nHigh-Speed-Videoendoscopic sequences to improve or create new Facilitative\nPlaybacks. Despite these advances and others in the field, the broader\nchallenge of performing an accurate and completely automatic semantic\nsegmentation method of the glottal area remains open.\n","authors":["G. Andrade-Miranda","K. Chatzipapas","J. D. Arias-Londoño","J. I. 
Godino-Llorente"],"pdf_url":"https://arxiv.org/pdf/2412.15054v1.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2412.15050v1","updated":"2024-12-19T16:57:45Z","published":"2024-12-19T16:57:45Z","title":"Uni-Renderer: Unifying Rendering and Inverse Rendering Via Dual Stream\n Diffusion","summary":" Rendering and inverse rendering are pivotal tasks in both computer vision and\ngraphics. The rendering equation is the core of the two tasks, as an ideal\nconditional distribution transfer function from intrinsic properties to RGB\nimages. Despite achieving promising results of existing rendering methods, they\nmerely approximate the ideal estimation for a specific scene and come with a\nhigh computational cost. Additionally, the inverse conditional distribution\ntransfer is intractable due to the inherent ambiguity. To address these\nchallenges, we propose a data-driven method that jointly models rendering and\ninverse rendering as two conditional generation tasks within a single diffusion\nframework. Inspired by UniDiffuser, we utilize two distinct time schedules to\nmodel both tasks, and with a tailored dual streaming module, we achieve\ncross-conditioning of two pre-trained diffusion models. This unified approach,\nnamed Uni-Renderer, allows the two processes to facilitate each other through a\ncycle-consistent constrain, mitigating ambiguity by enforcing consistency\nbetween intrinsic properties and rendered images. Combined with a meticulously\nprepared dataset, our method effectively decomposition of intrinsic properties\nand demonstrates a strong capability to recognize changes during rendering. We\nwill open-source our training and inference code to the public, fostering\nfurther research and development in this area.\n","authors":["Zhifei Chen","Tianshuo Xu","Wenhang Ge","Leyi Wu","Dongyu Yan","Jing He","Luozhou Wang","Lu Zeng","Shunsi Zhang","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2412.15050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.03767v2","updated":"2024-12-19T16:45:52Z","published":"2023-01-10T03:10:32Z","title":"Metric Compatible Training for Online Backfilling in Large-Scale\n Retrieval","summary":" Backfilling is the process of re-extracting all gallery embeddings from\nupgraded models in image retrieval systems. It inevitably requires a\nprohibitively large amount of computational cost and even entails the downtime\nof the service. Although backward-compatible learning sidesteps this challenge\nby tackling query-side representations, this leads to suboptimal solutions in\nprinciple because gallery embeddings cannot benefit from model upgrades. We\naddress this dilemma by introducing an online backfilling algorithm, which\nenables us to achieve a progressive performance improvement during the\nbackfilling process while not sacrificing the final performance of new model\nafter the completion of backfilling. To this end, we first propose a simple\ndistance rank merge technique for online backfilling. Then, we incorporate a\nreverse transformation module for more effective and efficient merging, which\nis further enhanced by adopting a metric-compatible contrastive learning\napproach. These two components help to make the distances of old and new models\ncompatible, resulting in desirable merge results during backfilling with no\nextra computational overhead. 
Extensive experiments show the effectiveness of\nour framework on four standard benchmarks in various settings.\n","authors":["Seonguk Seo","Mustafa Gokhan Uzunbas","Bohyung Han","Sara Cao","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2301.03767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15032v1","updated":"2024-12-19T16:44:01Z","published":"2024-12-19T16:44:01Z","title":"DCTdiff: Intriguing Properties of Image Generative Modeling in the DCT\n Space","summary":" This paper explores image modeling from the frequency space and introduces\nDCTdiff, an end-to-end diffusion generative paradigm that efficiently models\nimages in the discrete cosine transform (DCT) space. We investigate the design\nspace of DCTdiff and reveal the key design factors. Experiments on different\nframeworks (UViT, DiT), generation tasks, and various diffusion samplers\ndemonstrate that DCTdiff outperforms pixel-based diffusion models regarding\ngenerative quality and training efficiency. Remarkably, DCTdiff can seamlessly\nscale up to high-resolution generation without using the latent diffusion\nparadigm. Finally, we illustrate several intriguing properties of DCT image\nmodeling. For example, we provide a theoretical proof of why `image diffusion\ncan be seen as spectral autoregression', bridging the gap between diffusion and\nautoregressive models. The effectiveness of DCTdiff and the introduced\nproperties suggest a promising direction for image modeling in the frequency\nspace. The code is at \\url{https://github.com/forever208/DCTdiff}.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Haozhe Jia","Lanmiao Liu","Martin Beneš","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2412.15032v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2302.10634v2","updated":"2024-12-19T16:41:57Z","published":"2023-02-21T12:48:44Z","title":"A Deep Learning-Based and Fully Automated Pipeline for Regurgitant\n Mitral Valve Anatomy Analysis from 3D Echocardiography","summary":" 3D transesophageal echocardiography (3DTEE), is the recommended method for\ndiagnosing mitral regurgitation (MR). 3DTEE provides a high-quality 3D image of\nthe mitral valve (MV), allowing for precise segmentation and measurement of the\nregurgitant valve anatomy. However, manual TEE segmentations are time-consuming\nand prone to intra-operator variability, affecting the reliability of the\nmeasurements. To address this, we developed a fully automated pipeline using a\n3D convolutional neural network (CNN) to segment MV substructures (annulus,\nanterior leaflet, and posterior leaflet) and quantify MV anatomy. The 3D CNN,\nbased on a multi-decoder residual U-Net architecture, was trained and tested on\na dataset comprising 100 3DTEE images with corresponding segmentations. Within\nthe pipeline, a custom algorithm refines the CNN-based segmentations and\nextracts MV models, from which anatomical landmarks and features are\nquantified. The accuracy of the proposed method was assessed using Dice score\nand mean surface distance (MSD) against ground truth segmentations, and the\nextracted anatomical parameters were compared against a semiautomated\ncommercial software TomTec Image Arena. The trained 3D CNN achieved an average\nDice score of 0.79 and MSD of 0.47 mm for the combined segmentation of the\nannulus, anterior and posterior leaflet. 
The proposed CNN architecture\noutperformed a baseline residual U-Net architecture in MV substructure\nsegmentation, and the refinement of the predicted annulus segmentation improved\nMSD by 8.36%. The annular and leaflet linear measurements differed by less than\n7.94 mm and 3.67 mm, respectively, compared to the 3D measurements obtained\nwith TomTec Image Arena. The proposed pipeline was faster than the commercial\nsoftware, with a modeling time of 12.54 s and a quantification time of 54.42 s.\n","authors":["Riccardo Munafò","Simone Saitta","Giacomo Ingallina","Paolo Denti","Francesco Maisano","Eustachio Agricola","Alberto Redaelli","Emiliano Votta"],"pdf_url":"https://arxiv.org/pdf/2302.10634v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15023v1","updated":"2024-12-19T16:37:19Z","published":"2024-12-19T16:37:19Z","title":"Stable-V2A: Synthesis of Synchronized Sound Effects with Temporal and\n Semantic Controls","summary":" Sound designers and Foley artists usually sonorize a scene, such as from a\nmovie or video game, by manually annotating and sonorizing each action of\ninterest in the video. In our case, the intent is to leave full creative\ncontrol to sound designers with a tool that allows them to bypass the more\nrepetitive parts of their work, thus being able to focus on the creative\naspects of sound production. We achieve this presenting Stable-V2A, a two-stage\nmodel consisting of: an RMS-Mapper that estimates an envelope representative of\nthe audio characteristics associated with the input video; and Stable-Foley, a\ndiffusion model based on Stable Audio Open that generates audio semantically\nand temporally aligned with the target video. Temporal alignment is guaranteed\nby the use of the envelope as a ControlNet input, while semantic alignment is\nachieved through the use of sound representations chosen by the designer as\ncross-attention conditioning of the diffusion process. We train and test our\nmodel on Greatest Hits, a dataset commonly used to evaluate V2A models. In\naddition, to test our model on a case study of interest, we introduce Walking\nThe Maps, a dataset of videos extracted from video games depicting animated\ncharacters walking in different locations. Samples and code available on our\ndemo page at https://ispamm.github.io/Stable-V2A.\n","authors":["Riccardo Fosco Gramaccioni","Christian Marinoni","Emilian Postolache","Marco Comunità","Luca Cosmo","Joshua D. Reiss","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2412.15023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15010v1","updated":"2024-12-19T16:22:37Z","published":"2024-12-19T16:22:37Z","title":"Robust Federated Learning in the Face of Covariate Shift: A Magnitude\n Pruning with Hybrid Regularization Framework for Enhanced Model Aggregation","summary":" The development of highly sophisticated neural networks has allowed for fast\nprogress in every field of computer vision, however, applications where\nannotated data is prohibited due to privacy or security concerns remain\nchallenging. Federated Learning (FL) offers a promising framework for\nindividuals aiming to collaboratively develop a shared model while preserving\ndata privacy. Nevertheless, our findings reveal that variations in data\ndistribution among clients can profoundly affect FL methodologies, primarily\ndue to instabilities in the aggregation process. 
We also propose a novel FL\nframework to mitigate the adverse effects of covariate shifts among federated\nclients by combining individual parameter pruning and regularization techniques\nto improve the robustness of individual clients' models to aggregate. Each\nclient's model is optimized through magnitude-based pruning and the addition of\ndropout and noise injection layers to build more resilient decision pathways in\nthe networks and improve the robustness of the model's parameter aggregation\nstep. The proposed framework is capable of extracting robust representations\neven in the presence of very large covariate shifts among client data\ndistributions and in the federation of a small number of clients. Empirical\nfindings substantiate the effectiveness of our proposed methodology across\ncommon benchmark datasets, including CIFAR10, MNIST, SVHN, and Fashion MNIST.\nFurthermore, we introduce the CelebA-Gender dataset, specifically designed to\nevaluate performance on a more realistic domain. The proposed method is capable\nof extracting robust representations even in the presence of both high and low\ncovariate shifts among client data distributions.\n","authors":["Ozgu Goksu","Nicolas Pugeault"],"pdf_url":"https://arxiv.org/pdf/2412.15010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14988v1","updated":"2024-12-19T16:00:10Z","published":"2024-12-19T16:00:10Z","title":"Stitch Contrast and Segment_Learning a Human Action Segmentation Model\n Using Trimmed Skeleton Videos","summary":" Existing skeleton-based human action classification models rely on\nwell-trimmed action-specific skeleton videos for both training and testing,\nprecluding their scalability to real-world applications where untrimmed videos\nexhibiting concatenated actions are predominant. To overcome this limitation,\nrecently introduced skeleton action segmentation models involve un-trimmed\nskeleton videos into end-to-end training. The model is optimized to provide\nframe-wise predictions for any length of testing videos, simultaneously\nrealizing action localization and classification. Yet, achieving such an\nimprovement im-poses frame-wise annotated skeleton videos, which remains\ntime-consuming in practice. This paper features a novel framework for\nskeleton-based action segmentation trained on short trimmed skeleton videos,\nbut that can run on longer un-trimmed videos. The approach is implemented in\nthree steps: Stitch, Contrast, and Segment. First, Stitch proposes a tem-poral\nskeleton stitching scheme that treats trimmed skeleton videos as elementary\nhuman motions that compose a semantic space and can be sampled to generate\nmulti-action stitched se-quences. Contrast learns contrastive representations\nfrom stitched sequences with a novel discrimination pretext task that enables a\nskeleton encoder to learn meaningful action-temporal contexts to improve action\nsegmentation. Finally, Segment relates the proposed method to action\nsegmentation by learning a segmentation layer while handling particular da-ta\navailability. 
Experiments involve a trimmed source dataset and an untrimmed\ntarget dataset in an adaptation formulation for real-world skeleton-based human\naction segmentation to evaluate the effectiveness of the proposed method.\n","authors":["Haitao Tian","Pierre Payeur"],"pdf_url":"https://arxiv.org/pdf/2412.14988v1.pdf","comment":"Accepted as AAAI 2025"},{"id":"http://arxiv.org/abs/2412.08941v3","updated":"2024-12-19T15:59:19Z","published":"2024-12-12T05:08:05Z","title":"Optimized Gradient Clipping for Noisy Label Learning","summary":" Previous research has shown that constraining the gradient of loss function\nwith respect to model-predicted probabilities can enhance the model robustness\nagainst noisy labels. These methods typically specify a fixed optimal threshold\nfor gradient clipping through validation data to obtain the desired robustness\nagainst noise. However, this common practice overlooks the dynamic distribution\nof gradients from both clean and noisy-labeled samples at different stages of\ntraining, significantly limiting the model capability to adapt to the variable\nnature of gradients throughout the training process. To address this issue, we\npropose a simple yet effective approach called Optimized Gradient Clipping\n(OGC), which dynamically adjusts the clipping threshold based on the ratio of\nnoise gradients to clean gradients after clipping, estimated by modeling the\ndistributions of clean and noisy samples. This approach allows us to modify the\nclipping threshold at each training step, effectively controlling the influence\nof noise gradients. Additionally, we provide statistical analysis to certify\nthe noise-tolerance ability of OGC. Our extensive experiments across various\ntypes of label noise, including symmetric, asymmetric, instance-dependent, and\nreal-world noise, demonstrate the effectiveness of our approach.\n","authors":["Xichen Ye","Yifan Wu","Weizhong Zhang","Xiaoqiang Li","Yifan Chen","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2412.08941v3.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2412.14974v1","updated":"2024-12-19T15:48:51Z","published":"2024-12-19T15:48:51Z","title":"Arti-PG: A Toolbox for Procedurally Synthesizing Large-Scale and Diverse\n Articulated Objects with Rich Annotations","summary":" The acquisition of substantial volumes of 3D articulated object data is\nexpensive and time-consuming, and consequently the scarcity of 3D articulated\nobject data becomes an obstacle for deep learning methods to achieve remarkable\nperformance in various articulated object understanding tasks. Meanwhile,\npairing these object data with detailed annotations to enable training for\nvarious tasks is also difficult and labor-intensive to achieve. In order to\nexpeditiously gather a significant number of 3D articulated objects with\ncomprehensive and detailed annotations for training, we propose Articulated\nObject Procedural Generation toolbox, a.k.a. Arti-PG toolbox. Arti-PG toolbox\nconsists of i) descriptions of articulated objects by means of a generalized\nstructure program along with their analytic correspondence to the objects'\npoint cloud, ii) procedural rules about manipulations on the structure program\nto synthesize large-scale and diverse new articulated objects, and iii)\nmathematical descriptions of knowledge (e.g. affordance, semantics, etc.) to\nprovide annotations to the synthesized object. 
Arti-PG has two appealing\nproperties for providing training data for articulated object understanding\ntasks: i) objects are created with unlimited variations in shape through\nprogram-oriented structure manipulation, ii) Arti-PG is widely applicable to\ndiverse tasks by easily providing comprehensive and detailed annotations.\nArti-PG now supports the procedural generation of 26 categories of articulate\nobjects and provides annotations across a wide range of both vision and\nmanipulation tasks, and we provide exhaustive experiments which fully\ndemonstrate its advantages. We will make Arti-PG toolbox publicly available for\nthe community to use.\n","authors":["Jianhua Sun","Yuxuan Li","Jiude Wei","Longfei Xu","Nange Wang","Yining Zhang","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2412.14974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14969v1","updated":"2024-12-19T15:47:31Z","published":"2024-12-19T15:47:31Z","title":"PhotoHolmes: a Python library for forgery detection in digital images","summary":" In this paper, we introduce PhotoHolmes, an open-source Python library\ndesigned to easily run and benchmark forgery detection methods on digital\nimages. The library includes implementations of popular and state-of-the-art\nmethods, dataset integration tools, and evaluation metrics. Utilizing the\nBenchmark tool in PhotoHolmes, users can effortlessly compare various methods.\nThis facilitates an accurate and reproducible comparison between their own\nmethods and those in the existing literature. Furthermore, PhotoHolmes includes\na command-line interface (CLI) to easily run the methods implemented in the\nlibrary on any suspicious image. As such, image forgery methods become more\naccessible to the community. The library has been built with extensibility and\nmodularity in mind, which makes adding new methods, datasets and metrics to the\nlibrary a straightforward process. The source code is available at\nhttps://github.com/photoholmes/photoholmes.\n","authors":["Julián O'Flaherty","Rodrigo Paganini","Juan Pablo Sotelo","Julieta Umpiérrez","Marina Gardella","Matías Tailanian","Pablo Musé"],"pdf_url":"https://arxiv.org/pdf/2412.14969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14965v1","updated":"2024-12-19T15:44:04Z","published":"2024-12-19T15:44:04Z","title":"Movie2Story: A framework for understanding videos and telling stories in\n the form of novel text","summary":" Multimodal video-to-text models have made considerable progress, primarily in\ngenerating brief descriptions of video content. However, there is still a\ndeficiency in generating rich long-form text descriptions that integrate both\nvideo and audio. In this paper, we introduce a framework called M2S, designed\nto generate novel-length text by combining audio, video, and character\nrecognition. M2S includes modules for video long-form text description and\ncomprehension, audio-based analysis of emotion, speech rate, and character\nalignment, and visual-based character recognition alignment. By integrating\nmultimodal information using the large language model GPT4o, M2S stands out in\nthe field of multimodal text generation. 
We demonstrate the effectiveness and\naccuracy of M2S through comparative experiments and human evaluation.\nAdditionally, the model framework has good scalability and significant\npotential for future research.\n","authors":["Kangning Li","Zheyang Jia","Anyu Ying"],"pdf_url":"https://arxiv.org/pdf/2412.14965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14963v1","updated":"2024-12-19T15:43:05Z","published":"2024-12-19T15:43:05Z","title":"IDOL: Instant Photorealistic 3D Human Creation from a Single Image","summary":" Creating a high-fidelity, animatable 3D full-body avatar from a single image\nis a challenging task due to the diverse appearance and poses of humans and the\nlimited availability of high-quality training data. To achieve fast and\nhigh-quality human reconstruction, this work rethinks the task from the\nperspectives of dataset, model, and representation. First, we introduce a\nlarge-scale HUman-centric GEnerated dataset, HuGe100K, consisting of 100K\ndiverse, photorealistic sets of human images. Each set contains 24-view frames\nin specific human poses, generated using a pose-controllable\nimage-to-multi-view model. Next, leveraging the diversity in views, poses, and\nappearances within HuGe100K, we develop a scalable feed-forward transformer\nmodel to predict a 3D human Gaussian representation in a uniform space from a\ngiven human image. This model is trained to disentangle human pose, body shape,\nclothing geometry, and texture. The estimated Gaussians can be animated without\npost-processing. We conduct comprehensive experiments to validate the\neffectiveness of the proposed dataset and method. Our model demonstrates the\nability to efficiently reconstruct photorealistic humans at 1K resolution from\na single input image using a single GPU instantly. Additionally, it seamlessly\nsupports various applications, as well as shape and texture editing tasks.\n","authors":["Yiyu Zhuang","Jiaxi Lv","Hao Wen","Qing Shuai","Ailing Zeng","Hao Zhu","Shifeng Chen","Yujiu Yang","Xun Cao","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2412.14963v1.pdf","comment":"21 pages, 15 figures, includes main content, supplementary materials,\n and references"},{"id":"http://arxiv.org/abs/2412.14961v1","updated":"2024-12-19T15:42:21Z","published":"2024-12-19T15:42:21Z","title":"TDCNet: Transparent Objects Depth Completion with CNN-Transformer\n Dual-Branch Parallel Network","summary":" The sensing and manipulation of transparent objects present a critical\nchallenge in industrial and laboratory robotics. Conventional sensors face\nchallenges in obtaining the full depth of transparent objects due to the\nrefraction and reflection of light on their surfaces and their lack of visible\ntexture. Previous research has attempted to obtain complete depth maps of\ntransparent objects from RGB and damaged depth maps (collected by depth sensor)\nusing deep learning models. However, existing methods fail to fully utilize the\noriginal depth map, resulting in limited accuracy for deep completion. To solve\nthis problem, we propose TDCNet, a novel dual-branch CNN-Transformer parallel\nnetwork for transparent object depth completion. The proposed framework\nconsists of two different branches: one extracts features from partial depth\nmaps, while the other processes RGB-D images. Experimental results demonstrate\nthat our model achieves state-of-the-art performance across multiple public\ndatasets. 
Our code and the pre-trained model are publicly available at\nhttps://github.com/XianghuiFan/TDCNet.\n","authors":["Xianghui Fan","Chao Ye","Anping Deng","Xiaotian Wu","Mengyang Pan","Hang Yang"],"pdf_url":"https://arxiv.org/pdf/2412.14961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14957v1","updated":"2024-12-19T15:38:15Z","published":"2024-12-19T15:38:15Z","title":"Dream to Manipulate: Compositional World Models Empowering Robot\n Imitation Learning with Imagination","summary":" A world model provides an agent with a representation of its environment,\nenabling it to predict the causal consequences of its actions. Current world\nmodels typically cannot directly and explicitly imitate the actual environment\nin front of a robot, often resulting in unrealistic behaviors and\nhallucinations that make them unsuitable for real-world applications. In this\npaper, we introduce a new paradigm for constructing world models that are\nexplicit representations of the real world and its dynamics. By integrating\ncutting-edge advances in real-time photorealism with Gaussian Splatting and\nphysics simulators, we propose the first compositional manipulation world\nmodel, which we call DreMa. DreMa replicates the observed world and its\ndynamics, allowing it to imagine novel configurations of objects and predict\nthe future consequences of robot actions. We leverage this capability to\ngenerate new data for imitation learning by applying equivariant\ntransformations to a small set of demonstrations. Our evaluations across\nvarious settings demonstrate significant improvements in both accuracy and\nrobustness by incrementing actions and object distributions, reducing the data\nneeded to learn a policy and improving the generalization of the agents. As a\nhighlight, we show that a real Franka Emika Panda robot, powered by DreMa's\nimagination, can successfully learn novel physical tasks from just a single\nexample per task variation (one-shot policy learning). Our project page and\nsource code can be found in https://leobarcellona.github.io/DreamToManipulate/\n","authors":["Leonardo Barcellona","Andrii Zadaianchuk","Davide Allegro","Samuele Papa","Stefano Ghidoni","Efstratios Gavves"],"pdf_url":"https://arxiv.org/pdf/2412.14957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.13647v2","updated":"2024-12-19T15:37:55Z","published":"2024-12-18T09:23:12Z","title":"G-VEval: A Versatile Metric for Evaluating Image and Video Captions\n Using GPT-4o","summary":" Evaluation metric of visual captioning is important yet not thoroughly\nexplored. Traditional metrics like BLEU, METEOR, CIDEr, and ROUGE often miss\nsemantic depth, while trained metrics such as CLIP-Score, PAC-S, and Polos are\nlimited in zero-shot scenarios. Advanced Language Model-based metrics also\nstruggle with aligning to nuanced human preferences. To address these issues,\nwe introduce G-VEval, a novel metric inspired by G-Eval and powered by the new\nGPT-4o. G-VEval uses chain-of-thought reasoning in large multimodal models and\nsupports three modes: reference-free, reference-only, and combined,\naccommodating both video and image inputs. We also propose MSVD-Eval, a new\ndataset for video captioning evaluation, to establish a more transparent and\nconsistent framework for both human experts and evaluation metrics. It is\ndesigned to address the lack of clear criteria in existing datasets by\nintroducing distinct dimensions of Accuracy, Completeness, Conciseness, and\nRelevance (ACCR). 
Extensive results show that G-VEval outperforms existing\nmethods in correlation with human annotations, as measured by Kendall tau-b and\nKendall tau-c. This provides a flexible solution for diverse captioning tasks\nand suggests a straightforward yet effective approach for large language models\nto understand video content, paving the way for advancements in automated\ncaptioning. Codes are available at https://github.com/ztangaj/gveval\n","authors":["Tony Cheng Tong","Sirui He","Zhiwen Shao","Dit-Yan Yeung"],"pdf_url":"https://arxiv.org/pdf/2412.13647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14954v1","updated":"2024-12-19T15:36:30Z","published":"2024-12-19T15:36:30Z","title":"Corn Ear Detection and Orientation Estimation Using Deep Learning","summary":" Monitoring growth behavior of maize plants such as the development of ears\ncan give key insights into the plant's health and development. Traditionally,\nthe measurement of the angle of ears is performed manually, which can be\ntime-consuming and prone to human error. To address these challenges, this\npaper presents a computer vision-based system for detecting and tracking ears\nof corn in an image sequence. The proposed system could accurately detect,\ntrack, and predict the ear's orientation, which can be useful in monitoring\ntheir growth behavior. This can significantly save time compared to manual\nmeasurement and enables additional areas of ear orientation research and\npotential increase in efficiencies for maize production. Using an object\ndetector with keypoint detection, the algorithm proposed could detect 90\npercent of all ears. The cardinal estimation had a mean absolute error (MAE) of\n18 degrees, compared to a mean 15 degree difference between two people\nmeasuring by hand. These results demonstrate the feasibility of using computer\nvision techniques for monitoring maize growth and can lead to further research\nin this area.\n","authors":["Nathan Sprague","John Evans","Michael Mardikes"],"pdf_url":"https://arxiv.org/pdf/2412.14954v1.pdf","comment":"22 pages;15 figures"},{"id":"http://arxiv.org/abs/2406.16710v2","updated":"2024-12-19T15:28:26Z","published":"2024-06-24T15:11:35Z","title":"ID-Sculpt: ID-aware 3D Head Generation from Single In-the-wild Portrait\n Image","summary":" While recent works have achieved great success on image-to-3D object\ngeneration, high quality and fidelity 3D head generation from a single image\nremains a great challenge. Previous text-based methods for generating 3D heads\nwere limited by text descriptions and image-based methods struggled to produce\nhigh-quality head geometry. To handle this challenging problem, we propose a\nnovel framework, ID-Sculpt, to generate high-quality 3D heads while preserving\ntheir identities. Our work incorporates the identity information of the\nportrait image into three parts: 1) geometry initialization, 2) geometry\nsculpting, and 3) texture generation stages. Given a reference portrait image,\nwe first align the identity features with text features to realize ID-aware\nguidance enhancement, which contains the control signals representing the face\ninformation. We then use the canny map, ID features of the portrait image, and\na pre-trained text-to-normal/depth diffusion model to generate ID-aware\ngeometry supervision, and 3D-GAN inversion is employed to generate ID-aware\ngeometry initialization. 
Furthermore, with the ability to inject identity\ninformation into 3D head generation, we use ID-aware guidance to calculate\nID-aware Score Distillation (ISD) for geometry sculpting. For texture\ngeneration, we adopt the ID Consistent Texture Inpainting and Refinement which\nprogressively expands the view for texture inpainting to obtain an\ninitialization UV texture map. We then use the ID-aware guidance to provide\nimage-level supervision for noisy multi-view images to obtain a refined texture\nmap. Extensive experiments demonstrate that we can generate high-quality 3D\nheads with accurate geometry and texture from a single in-the-wild portrait\nimage.\n","authors":["Jinkun Hao","Junshu Tang","Jiangning Zhang","Ran Yi","Yijia Hong","Moran Li","Weijian Cao","Yating Wang","Chengjie Wang","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2406.16710v2.pdf","comment":"Accepted by AAAI 2025; Project page:\n https://jinkun-hao.github.io/ID-Sculpt/"},{"id":"http://arxiv.org/abs/2411.10958v2","updated":"2024-12-19T15:26:20Z","published":"2024-11-17T04:35:49Z","title":"SageAttention2: Efficient Attention with Thorough Outlier Smoothing and\n Per-thread INT4 Quantization","summary":" Although quantization for linear layers has been widely used, its application\nto accelerate the attention process remains limited. To further enhance the\nefficiency of attention computation compared to SageAttention while maintaining\nprecision, we propose SageAttention2, which utilizes significantly faster 4-bit\nmatrix multiplication (Matmul) alongside additional precision-enhancing\ntechniques. First, we propose to quantize matrixes $(Q, K)$ to INT4 in a\nhardware-friendly thread-level granularity and quantize matrixes $(\\widetilde\nP, V)$ to FP8. Second, we propose a method to smooth $Q$, enhancing the\naccuracy of INT4 $QK$. Third, we propose to use an FP32 Matmul buffer for $PV$\nto enhance the accuracy of FP8 $\\widetilde PV$. The operations per second (OPS)\nof SageAttention2 surpass FlashAttention2 and xformers by about 3x and 5x on\nRTX4090, respectively. Comprehensive experiments confirm that our approach\nincurs negligible end-to-end metrics loss across diverse models, including\nthose for large language processing, image generation, and video generation.\nThe codes are available at https://github.com/thu-ml/SageAttention.\n","authors":["Jintao Zhang","Haofeng Huang","Pengle Zhang","Jia Wei","Jun Zhu","Jianfei Chen"],"pdf_url":"https://arxiv.org/pdf/2411.10958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14939v1","updated":"2024-12-19T15:15:03Z","published":"2024-12-19T15:15:03Z","title":"GURecon: Learning Detailed 3D Geometric Uncertainties for Neural Surface\n Reconstruction","summary":" Neural surface representation has demonstrated remarkable success in the\nareas of novel view synthesis and 3D reconstruction. However, assessing the\ngeometric quality of 3D reconstructions in the absence of ground truth mesh\nremains a significant challenge, due to its rendering-based optimization\nprocess and entangled learning of appearance and geometry with photometric\nlosses. In this paper, we present a novel framework, i.e, GURecon, which\nestablishes a geometric uncertainty field for the neural surface based on\ngeometric consistency. 
Different from existing methods that rely on\nrendering-based measurement, GURecon models a continuous 3D uncertainty field\nfor the reconstructed surface, and is learned by an online distillation\napproach without introducing real geometric information for supervision.\nMoreover, in order to mitigate the interference of illumination on geometric\nconsistency, a decoupled field is learned and exploited to finetune the\nuncertainty field. Experiments on various datasets demonstrate the superiority\nof GURecon in modeling 3D geometric uncertainty, as well as its plug-and-play\nextension to various neural surface representations and improvement on\ndownstream tasks such as incremental reconstruction. The code and supplementary\nmaterial are available on the project website:\nhttps://zju3dv.github.io/GURecon/.\n","authors":["Zesong Yang","Ru Zhang","Jiale Shi","Zixiang Ai","Boming Zhao","Hujun Bao","Luwei Yang","Zhaopeng Cui"],"pdf_url":"https://arxiv.org/pdf/2412.14939v1.pdf","comment":"Accepted by AAAI 2025. Project page:\n https://zju3dv.github.io/gurecon/"},{"id":"http://arxiv.org/abs/2302.11947v2","updated":"2024-12-19T15:13:46Z","published":"2023-02-23T11:44:43Z","title":"Real-Time Damage Detection in Fiber Lifting Ropes Using Lightweight\n Convolutional Neural Networks","summary":" The health and safety hazards posed by worn crane lifting ropes mandate\nperiodic inspection for damage. This task is time-consuming, prone to human\nerror, halts operation, and may result in the premature disposal of ropes.\nTherefore, we propose using efficient deep learning and computer vision methods\nto automate the process of detecting damaged ropes. Specifically, we present a\nvision-based system for detecting damage in synthetic fiber rope images using\nlightweight convolutional neural networks. We develop a camera-based apparatus\nto photograph the lifting rope's surface, while in operation, and capture the\nprogressive wear-and-tear as well as the more significant degradation in the\nrope's health state. Experts from Konecranes annotate the collected images in\naccordance with the rope's condition; normal or damaged. Then, we pre-process\nthe images, systematically design a deep learning model, evaluate its detection\nand prediction performance, analyze its computational complexity, and compare\nit with various other models. Experimental results show the proposed model\noutperforms other similar techniques with 96.5% accuracy, 94.8% precision,\n98.3% recall, 96.5% F1-score, and 99.3% AUC. Besides, they demonstrate the\nmodel's real-time operation, low memory footprint, robustness to various\nenvironmental and operational conditions, and adequacy for deployment in\nindustrial applications such as lifting, mooring, towing, climbing, and\nsailing.\n","authors":["Tuomas Jalonen","Mohammad Al-Sa'd","Roope Mellanen","Serkan Kiranyaz","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2302.11947v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14925v1","updated":"2024-12-19T15:02:50Z","published":"2024-12-19T15:02:50Z","title":"Automatic Spectral Calibration of Hyperspectral Images:Method, Dataset\n and Benchmark","summary":" Hyperspectral image (HSI) densely samples the world in both the space and\nfrequency domain and therefore is more distinctive than RGB images. Usually,\nHSI needs to be calibrated to minimize the impact of various illumination\nconditions. 
The traditional way to calibrate HSI utilizes a physical reference,\nwhich involves manual operations, occlusions, and/or limits camera mobility.\nThese limitations inspire this paper to automatically calibrate HSIs using a\nlearning-based method. Towards this goal, a large-scale HSI calibration dataset\nis created, which has 765 high-quality HSI pairs covering diversified natural\nscenes and illuminations. The dataset is further expanded to 7650 pairs by\ncombining with 10 different physically measured illuminations. A spectral\nillumination transformer (SIT) together with an illumination attention module\nis proposed. Extensive benchmarks demonstrate the SoTA performance of the\nproposed SIT. The benchmarks also indicate that low-light conditions are more\nchallenging than normal conditions. The dataset and codes are available\nonline:https://github.com/duranze/Automatic-spectral-calibration-of-HSI\n","authors":["Zhuoran Du","Shaodi You","Cheng Cheng","Shikui Wei"],"pdf_url":"https://arxiv.org/pdf/2412.14925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04272v2","updated":"2024-12-19T15:02:37Z","published":"2024-09-06T13:28:05Z","title":"Cycle Pixel Difference Network for Crisp Edge Detection","summary":" Edge detection, as a fundamental task in computer vision, has garnered\nincreasing attention. The advent of deep learning has significantly advanced\nthis field. However, recent deep learning-based methods generally face two\nsignificant issues: 1) reliance on large-scale pre-trained weights, and 2)\ngeneration of thick edges. We construct a U-shape encoder-decoder model named\nCPD-Net that successfully addresses these two issues simultaneously. In\nresponse to issue 1), we propose a novel cycle pixel difference convolution\n(CPDC), which effectively integrates edge prior knowledge with modern\nconvolution operations, consequently successfully eliminating the dependence on\nlarge-scale pre-trained weights. As for issue 2), we construct a multi-scale\ninformation enhancement module (MSEM) and a dual residual connection-based\n(DRC) decoder to enhance the edge location ability of the model, thereby\ngenerating crisp and clean contour maps. Comprehensive experiments conducted on\nfour standard benchmarks demonstrate that our method achieves competitive\nperformance on the BSDS500 dataset (ODS=0.813 and AC=0.352), NYUD-V2 (ODS=0.760\nand AC=0.223), BIPED dataset (ODS=0.898 and AC=0.426), and CID (ODS=0.59). Our\napproach provides a novel perspective for addressing these challenges in edge\ndetection.\n","authors":["Changsong Liu","Wei Zhang","Yanyan Liu","Mingyang Li","Wenlin Li","Yimeng Fan","Xiangnan Bai","Liang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.04272v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16571v4","updated":"2024-12-19T14:47:05Z","published":"2024-04-25T12:34:23Z","title":"MonoPCC: Photometric-invariant Cycle Constraint for Monocular Depth\n Estimation of Endoscopic Images","summary":" Photometric constraint is indispensable for self-supervised monocular depth\nestimation. It involves warping a source image onto a target view using\nestimated depth&pose, and then minimizing the difference between the warped and\ntarget images. However, the endoscopic built-in light causes significant\nbrightness fluctuations, and thus makes the photometric constraint unreliable.\nPrevious efforts only mitigate this relying on extra models to calibrate image\nbrightness. 
In this paper, we propose MonoPCC to address the brightness\ninconsistency radically by reshaping the photometric constraint into a cycle\nform. Instead of only warping the source image, MonoPCC constructs a closed\nloop consisting of two opposite forward-backward warping paths: from target to\nsource and then back to target. Thus, the target image finally receives an\nimage cycle-warped from itself, which naturally makes the constraint invariant\nto brightness changes. Moreover, MonoPCC transplants the source image's\nphase-frequency into the intermediate warped image to avoid structure lost, and\nalso stabilizes the training via an exponential moving average (EMA) strategy\nto avoid frequent changes in the forward warping. The comprehensive and\nextensive experimental results on four endoscopic datasets demonstrate that our\nproposed MonoPCC shows a great robustness to the brightness inconsistency, and\nexceeds other state-of-the-arts by reducing the absolute relative error by at\nleast 7.27%, 9.38%, 9.90% and 3.17%, respectively.\n","authors":["Zhiwei Wang","Ying Zhou","Shiquan He","Ting Li","Fan Huang","Qiang Ding","Xinxia Feng","Mei Liu","Qiang Li"],"pdf_url":"https://arxiv.org/pdf/2404.16571v4.pdf","comment":"14 pages, 12 figures"},{"id":"http://arxiv.org/abs/2311.18512v2","updated":"2024-12-19T14:46:05Z","published":"2023-11-30T12:40:23Z","title":"Union-over-Intersections: Object Detection beyond Winner-Takes-All","summary":" This paper revisits the problem of predicting box locations in object\ndetection architectures. Typically, each box proposal or box query aims to\ndirectly maximize the intersection-over-union score with the ground truth,\nfollowed by a winner-takes-all non-maximum suppression where only the highest\nscoring box in each region is retained. We observe that both steps are\nsub-optimal: the first involves regressing proposals to the entire ground\ntruth, which is a difficult task even with large receptive fields, and the\nsecond neglects valuable information from boxes other than the top candidate.\nInstead of regressing proposals to the whole ground truth, we propose a simpler\napproach: regress only to the area of intersection between the proposal and the\nground truth. This avoids the need for proposals to extrapolate beyond their\nvisual scope, improving localization accuracy. Rather than adopting a\nwinner-takes-all strategy, we take the union over the regressed intersections\nof all boxes in a region to generate the final box outputs. Our plug-and-play\nmethod integrates seamlessly into proposal-based, grid-based, and query-based\ndetection architectures with minimal modifications, consistently improving\nobject localization and instance segmentation. We demonstrate its broad\napplicability and versatility across various detection and segmentation tasks.\n","authors":["Aritra Bhowmik","Pascal Mettes","Martin R. Oswald","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2311.18512v2.pdf","comment":"17 pages, 6 figures, 12 tables"},{"id":"http://arxiv.org/abs/2312.06259v2","updated":"2024-12-19T14:38:47Z","published":"2023-12-11T09:57:09Z","title":"Point Cloud Semantic Segmentation with Sparse and Inhomogeneous\n Annotations","summary":" Utilizing uniformly distributed sparse annotations, weakly supervised\nlearning alleviates the heavy reliance on fine-grained annotations in point\ncloud semantic segmentation tasks. However, few works discuss the inhomogeneity\nof sparse annotations, albeit it is common in real-world scenarios. 
Therefore,\nthis work introduces the probability density function into the gradient\nsampling approximation method to qualitatively analyze the impact of annotation\nsparsity and inhomogeneity under weakly supervised learning. Based on our\nanalysis, we propose an Adaptive Annotation Distribution Network (AADNet)\ncapable of robust learning on arbitrarily distributed sparse annotations.\nSpecifically, we propose a label-aware point cloud downsampling strategy to\nincrease the proportion of annotations involved in the training stage.\nFurthermore, we design the multiplicative dynamic entropy as the gradient\ncalibration function to mitigate the gradient bias caused by non-uniformly\ndistributed sparse annotations and explicitly reduce the epistemic uncertainty.\nWithout any prior restrictions and additional information, our proposed method\nachieves comprehensive performance improvements at multiple label rates and\ndifferent annotation distributions.\n","authors":["Zhiyi Pan","Nan Zhang","Wei Gao","Shan Liu","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2312.06259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14902v1","updated":"2024-12-19T14:32:11Z","published":"2024-12-19T14:32:11Z","title":"MagicNaming: Consistent Identity Generation by Finding a \"Name Space\" in\n T2I Diffusion Models","summary":" Large-scale text-to-image diffusion models, (e.g., DALL-E, SDXL) are capable\nof generating famous persons by simply referring to their names. Is it possible\nto make such models generate generic identities as simple as the famous ones,\ne.g., just use a name? In this paper, we explore the existence of a \"Name\nSpace\", where any point in the space corresponds to a specific identity.\nFortunately, we find some clues in the feature space spanned by text embedding\nof celebrities' names. Specifically, we first extract the embeddings of\ncelebrities' names in the Laion5B dataset with the text encoder of diffusion\nmodels. Such embeddings are used as supervision to learn an encoder that can\npredict the name (actually an embedding) of a given face image. We\nexperimentally find that such name embeddings work well in promising the\ngenerated image with good identity consistency. Note that like the names of\ncelebrities, our predicted name embeddings are disentangled from the semantics\nof text inputs, making the original generation capability of text-to-image\nmodels well-preserved. Moreover, by simply plugging such name embeddings, all\nvariants (e.g., from Civitai) derived from the same base model (i.e., SDXL)\nreadily become identity-aware text-to-image models. Project homepage:\n\\url{https://magicfusion.github.io/MagicNaming/}.\n","authors":["Jing Zhao","Heliang Zheng","Chaoyue Wang","Long Lan","Wanrong Hunag","Yuhua Tang"],"pdf_url":"https://arxiv.org/pdf/2412.14902v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.13099v2","updated":"2024-12-19T14:23:45Z","published":"2024-12-17T17:10:02Z","title":"Accuracy Limits as a Barrier to Biometric System Security","summary":" Biometric systems are widely used for identity verification and\nidentification, including authentication (i.e., one-to-one matching to verify a\nclaimed identity) and identification (i.e., one-to-many matching to find a\nsubject in a database). The matching process relies on measuring similarities\nor dissimilarities between a fresh biometric template and enrolled templates.\nThe False Match Rate FMR is a key metric for assessing the accuracy and\nreliability of such systems. 
This paper analyzes biometric systems based on\ntheir FMR, with two main contributions. First, we explore untargeted attacks,\nwhere an adversary aims to impersonate any user within a database. We determine\nthe number of trials required for an attacker to successfully impersonate a\nuser and derive the critical population size (i.e., the maximum number of users\nin the database) required to maintain a given level of security. Furthermore,\nwe compute the critical FMR value needed to ensure resistance against\nuntargeted attacks as the database size increases. Second, we revisit the\nbiometric birthday problem to evaluate the approximate and exact probabilities\nthat two users in a database collide (i.e., can impersonate each other). Based\non this analysis, we derive both the approximate critical population size and\nthe critical FMR value needed to bound the likelihood of such collisions\noccurring with a given probability. These thresholds offer insights for\ndesigning systems that mitigate the risk of impersonation and collisions,\nparticularly in large-scale biometric databases. Our findings indicate that\ncurrent biometric systems fail to deliver sufficient accuracy to achieve an\nadequate security level against untargeted attacks, even in small-scale\ndatabases. Moreover, state-of-the-art systems face significant challenges in\naddressing the biometric birthday problem, especially as database sizes grow.\n","authors":["Axel Durbet","Paul-Marie Grollemund","Pascal Lafourcade","Kevin Thiry-Atighehchi"],"pdf_url":"https://arxiv.org/pdf/2412.13099v2.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2412.14118v2","updated":"2024-12-19T14:18:57Z","published":"2024-12-18T18:04:12Z","title":"GaraMoSt: Parallel Multi-Granularity Motion and Structural Modeling for\n Efficient Multi-Frame Interpolation in DSA Images","summary":" The rapid and accurate direct multi-frame interpolation method for Digital\nSubtraction Angiography (DSA) images is crucial for reducing radiation and\nproviding real-time assistance to physicians for precise diagnostics and\ntreatment. DSA images contain complex vascular structures and various motions.\nApplying natural scene Video Frame Interpolation (VFI) methods results in\nmotion artifacts, structural dissipation, and blurriness. Recently, MoSt-DSA\nhas specifically addressed these issues for the first time and achieved SOTA\nresults. However, MoSt-DSA's focus on real-time performance leads to\ninsufficient suppression of high-frequency noise and incomplete filtering of\nlow-frequency noise in the generated images. To address these issues within the\nsame computational time scale, we propose GaraMoSt. Specifically, we optimize\nthe network pipeline with a parallel design and propose a module named MG-MSFE.\nMG-MSFE extracts frame-relative motion and structural features at various\ngranularities in a fully convolutional parallel manner and supports\nindependent, flexible adjustment of context-aware granularity at different\nscales, thus enhancing computational efficiency and accuracy. Extensive\nexperiments demonstrate that GaraMoSt achieves the SOTA performance in\naccuracy, robustness, visual effects, and noise suppression, comprehensively\nsurpassing MoSt-DSA and other natural scene VFI methods. 
The code and models\nare available at https://github.com/ZyoungXu/GaraMoSt.\n","authors":["Ziyang Xu","Huangxuan Zhao","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2412.14118v2.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2411.13093v2","updated":"2024-12-19T14:17:13Z","published":"2024-11-20T07:44:34Z","title":"Video-RAG: Visually-aligned Retrieval-Augmented Long Video Comprehension","summary":" Existing large video-language models (LVLMs) struggle to comprehend long\nvideos correctly due to limited context. To address this problem, fine-tuning\nlong-context LVLMs and employing GPT-based agents have emerged as promising\nsolutions. However, fine-tuning LVLMs would require extensive high-quality data\nand substantial GPU resources, while GPT-based agents would rely on proprietary\nmodels (e.g., GPT-4o). In this paper, we propose Video Retrieval-Augmented\nGeneration (Video-RAG), a training-free and cost-effective pipeline that\nemploys visually-aligned auxiliary texts to help facilitate cross-modality\nalignment while providing additional information beyond the visual content.\nSpecifically, we leverage open-source external tools to extract\nvisually-aligned information from pure video data (e.g., audio, optical\ncharacter, and object detection), and incorporate the extracted information\ninto an existing LVLM as auxiliary texts, alongside video frames and queries,\nin a plug-and-play manner. Our Video-RAG offers several key advantages: (i)\nlightweight with low computing overhead due to single-turn retrieval; (ii) easy\nimplementation and compatibility with any LVLM; and (iii) significant,\nconsistent performance gains across long video understanding benchmarks,\nincluding Video-MME, MLVU, and LongVideoBench. Notably, our model demonstrates\nsuperior performance over proprietary models like Gemini-1.5-Pro and GPT-4o\nwhen utilized with a 72B model.\n","authors":["Yongdong Luo","Xiawu Zheng","Xiao Yang","Guilin Li","Haojia Lin","Jinfa Huang","Jiayi Ji","Fei Chao","Jiebo Luo","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2411.13093v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2412.14880v1","updated":"2024-12-19T14:17:09Z","published":"2024-12-19T14:17:09Z","title":"Multimodal Hypothetical Summary for Retrieval-based Multi-image Question\n Answering","summary":" Retrieval-based multi-image question answering (QA) task involves retrieving\nmultiple question-related images and synthesizing these images to generate an\nanswer. Conventional \"retrieve-then-answer\" pipelines often suffer from\ncascading errors because the training objective of QA fails to optimize the\nretrieval stage. To address this issue, we propose a novel method to\neffectively introduce and reference retrieved information into the QA. Given\nthe image set to be retrieved, we employ a multimodal large language model\n(visual perspective) and a large language model (textual perspective) to obtain\nmultimodal hypothetical summary in question-form and description-form. By\ncombining visual and textual perspectives, MHyS captures image content more\nspecifically and replaces real images in retrieval, which eliminates the\nmodality gap by transforming into text-to-text retrieval and helps improve\nretrieval. To more advantageously introduce retrieval with QA, we employ\ncontrastive learning to align queries (questions) with MHyS. 
Moreover, we\npropose a coarse-to-fine strategy for calculating both sentence-level and\nword-level similarity scores, to further enhance retrieval and filter out\nirrelevant details. Our approach achieves a 3.7% absolute improvement over\nstate-of-the-art methods on RETVQA and a 14.5% improvement over CLIP.\nComprehensive experiments and detailed ablation studies demonstrate the\nsuperiority of our method.\n","authors":["Peize Li","Qingyi Si","Peng Fu","Zheng Lin","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2412.14880v1.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14873v1","updated":"2024-12-19T14:11:49Z","published":"2024-12-19T14:11:49Z","title":"Zero-Shot Artifact2Artifact: Self-incentive artifact removal for\n photoacoustic imaging without any data","summary":" Photoacoustic imaging (PAI) uniquely combines optical contrast with the\npenetration depth of ultrasound, making it critical for clinical applications.\nHowever, the quality of 3D PAI is often degraded due to reconstruction\nartifacts caused by the sparse and angle-limited configuration of detector\narrays. Existing iterative or deep learning-based methods are either\ntime-consuming or require large training datasets, significantly limiting their\npractical application. Here, we propose Zero-Shot Artifact2Artifact (ZS-A2A), a\nzero-shot self-supervised artifact removal method based on a super-lightweight\nnetwork, which leverages the fact that reconstruction artifacts are sensitive\nto irregularities caused by data loss. By introducing random perturbations to\nthe acquired PA data, it spontaneously generates subset data, which in turn\nstimulates the network to learn the artifact patterns in the reconstruction\nresults, thus enabling zero-shot artifact removal. This approach requires\nneither training data nor prior knowledge of the artifacts, and is capable of\nartifact removal for 3D PAI. For maximum amplitude projection (MAP) images or\nslice images in 3D PAI acquired with arbitrarily sparse or angle-limited\ndetector arrays, ZS-A2A employs a self-incentive strategy to complete artifact\nremoval and improves the Contrast-to-Noise Ratio (CNR). We validated ZS-A2A in\nboth simulation study and $ in\\ vivo $ animal experiments. Results demonstrate\nthat ZS-A2A achieves state-of-the-art (SOTA) performance compared to existing\nzero-shot methods, and for the $ in\\ vivo $ rat liver, ZS-A2A improves CNR from\n17.48 to 43.46 in just 8 seconds. The project for ZS-A2A will be available in\nthe following GitHub repository: https://github.com/JaegerCQ/ZS-A2A.\n","authors":["Shuang Li","Qian Chen","Chulhong Kim","Seongwook Choi","Yibing Wang","Yu Zhang","Changhui Li"],"pdf_url":"https://arxiv.org/pdf/2412.14873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.13620v3","updated":"2024-12-19T14:08:42Z","published":"2022-04-28T16:35:04Z","title":"Generative Adversarial Networks for Image Super-Resolution: A Survey","summary":" Single image super-resolution (SISR) has played an important role in the\nfield of image processing. Recent generative adversarial networks (GANs) can\nachieve excellent results on low-resolution images with small samples. However,\nthere are little literatures summarizing different GANs in SISR. In this paper,\nwe conduct a comparative study of GANs from different perspectives. We first\ntake a look at developments of GANs. Second, we present popular architectures\nfor GANs in big and small samples for image applications. 
Then, we analyze\nmotivations, implementations and differences of GANs based optimization methods\nand discriminative learning for image super-resolution in terms of supervised,\nsemi-supervised and unsupervised manners, where these GANs are analyzed via\nintegrating different network architectures, prior knowledge, loss functions\nand multiple tasks. Next, we compare performance of these popular GANs on\npublic datasets via quantitative and qualitative analysis in SISR. Finally, we\nhighlight challenges of GANs and potential research points for SISR.\n","authors":["Chunwei Tian","Xuanyu Zhang","Qi Zhu","Bob Zhang","Jerry Chun-Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2204.13620v3.pdf","comment":"31pages, 10 figures"},{"id":"http://arxiv.org/abs/2407.07024v3","updated":"2024-12-19T14:07:44Z","published":"2024-07-09T16:44:04Z","title":"Exploring Scalability of Self-Training for Open-Vocabulary Temporal\n Action Localization","summary":" The vocabulary size in temporal action localization (TAL) is limited by the\nscarcity of large-scale annotated datasets. To overcome this, recent works\nintegrate vision-language models (VLMs), such as CLIP, for open-vocabulary TAL\n(OV-TAL). However, despite the success of VLMs trained on extensive datasets,\nexisting OV-TAL methods still rely on human-labeled TAL datasets of limited\nsize to train action localizers, limiting their generalizability. In this\npaper, we explore the scalability of self-training with unlabeled YouTube\nvideos for OV-TAL. Our approach consists of two stages: (1) a class-agnostic\naction localizer is trained on a human-labeled TAL dataset to generate\npseudo-labels for unlabeled videos, and (2) the large-scale pseudo-labeled\ndataset is then used to train the localizer. Extensive experiments demonstrate\nthat leveraging web-scale videos in self-training significantly enhances the\ngeneralizability of an action localizer. Additionally, we identify limitations\nin existing OV-TAL evaluation schemes and propose a new benchmark for thorough\nassessment. Finally, we showcase the TAL performance of the large multimodal\nmodel Gemini-1.5 on our new benchmark. Code is released at\nhttps://github.com/HYUNJS/STOV-TAL.\n","authors":["Jeongseok Hyun","Su Ho Han","Hyolim Kang","Joon-Young Lee","Seon Joo Kim"],"pdf_url":"https://arxiv.org/pdf/2407.07024v3.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2412.14870v1","updated":"2024-12-19T14:06:56Z","published":"2024-12-19T14:06:56Z","title":"Large-scale School Mapping using Weakly Supervised Deep Learning for\n Universal School Connectivity","summary":" Improving global school connectivity is critical for ensuring inclusive and\nequitable quality education. To reliably estimate the cost of connecting\nschools, governments and connectivity providers require complete and accurate\nschool location data - a resource that is often scarce in many low- and\nmiddle-income countries. To address this challenge, we propose a\ncost-effective, scalable approach to locating schools in high-resolution\nsatellite images using weakly supervised deep learning techniques. Our best\nmodels, which combine vision transformers and convolutional neural networks,\nachieve AUPRC values above 0.96 across 10 pilot African countries. Leveraging\nexplainable AI techniques, our approach can approximate the precise\ngeographical coordinates of the school locations using only low-cost,\nclassification-level annotations. 
To demonstrate the scalability of our method,\nwe generate nationwide maps of school location predictions in African countries\nand present a detailed analysis of our results, using Senegal as our case\nstudy. Finally, we demonstrate the immediate usability of our work by\nintroducing an interactive web mapping tool to streamline human-in-the-loop\nmodel validation efforts by government partners. This work successfully\nshowcases the real-world utility of deep learning and satellite images for\nplanning regional infrastructure and accelerating universal school\nconnectivity.\n","authors":["Isabelle Tingzon","Utku Can Ozturk","Ivan Dotu"],"pdf_url":"https://arxiv.org/pdf/2412.14870v1.pdf","comment":"Accepted at AAAI-25 Special Track on AI for Social Impact (AISI)"},{"id":"http://arxiv.org/abs/2412.14869v1","updated":"2024-12-19T14:06:44Z","published":"2024-12-19T14:06:44Z","title":"AI-Powered Intracranial Hemorrhage Detection: A Co-Scale Convolutional\n Attention Model with Uncertainty-Based Fuzzy Integral Operator and Feature\n Screening","summary":" Intracranial hemorrhage (ICH) refers to the leakage or accumulation of blood\nwithin the skull, which occurs due to the rupture of blood vessels in or around\nthe brain. If this condition is not diagnosed in a timely manner and\nappropriately treated, it can lead to serious complications such as decreased\nconsciousness, permanent neurological disabilities, or even death. The primary\naim of this study is to detect the occurrence or non-occurrence of ICH,\nfollowed by determining the type of subdural hemorrhage (SDH). These tasks are\nframed as two separate binary classification problems. By adding two layers to\nthe co-scale convolutional attention (CCA) classifier architecture, we\nintroduce a novel approach for ICH detection. In the first layer, after\nextracting features from different slices of computed tomography (CT) scan\nimages, we combine these features and select the 50 components that capture the\nhighest variance in the data, considering them as informative features. We then\nassess the discriminative power of these features using the bootstrap forest\nalgorithm, discarding those that lack sufficient discriminative ability between\ndifferent classes. This algorithm explicitly determines the contribution of\neach feature to the final prediction, assisting us in developing an explainable\nAI model. The features feed into a boosting neural network as a latent feature\nspace. In the second layer, we introduce a novel uncertainty-based fuzzy\nintegral operator to fuse information from different CT scan slices. This\noperator, by accounting for the dependencies between consecutive slices,\nsignificantly improves detection accuracy.\n","authors":["Mehdi Hosseini Chagahi","Md. Jalil Piran","Niloufar Delfan","Behzad Moshiri","Jaber Hatam Parikhan"],"pdf_url":"https://arxiv.org/pdf/2412.14869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20213v4","updated":"2024-12-19T13:46:40Z","published":"2024-03-29T14:50:43Z","title":"VHM: Versatile and Honest Vision Language Model for Remote Sensing Image\n Analysis","summary":" This paper develops a Versatile and Honest vision language Model (VHM) for\nremote sensing image analysis. 
VHM is built on a large-scale remote sensing\nimage-text dataset with rich-content captions (VersaD), and an honest\ninstruction dataset comprising both factual and deceptive questions (HnstD).\nUnlike prevailing remote sensing image-text datasets, in which image captions\nfocus on a few prominent objects and their relationships, VersaD captions\nprovide detailed information about image properties, object attributes, and the\noverall scene. This comprehensive captioning enables VHM to thoroughly\nunderstand remote sensing images and perform diverse remote sensing tasks.\nMoreover, different from existing remote sensing instruction datasets that only\ninclude factual questions, HnstD contains additional deceptive questions\nstemming from the non-existence of objects. This feature prevents VHM from\nproducing affirmative answers to nonsense queries, thereby ensuring its\nhonesty. In our experiments, VHM significantly outperforms various vision\nlanguage models on common tasks of scene classification, visual question\nanswering, and visual grounding. Additionally, VHM achieves competent\nperformance on several unexplored tasks, such as building vectorizing,\nmulti-label classification and honest question answering. We will release the\ncode, data and model weights at https://github.com/opendatalab/VHM .\n","authors":["Chao Pang","Xingxing Weng","Jiang Wu","Jiayu Li","Yi Liu","Jiaxing Sun","Weijia Li","Shuai Wang","Litong Feng","Gui-Song Xia","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2403.20213v4.pdf","comment":"Equal contribution: Chao Pang, Xingxing Weng, Jiang Wu; Corresponding\n author: Gui-Song Xia, Conghui He"},{"id":"http://arxiv.org/abs/2411.04865v4","updated":"2024-12-19T13:45:39Z","published":"2024-11-07T16:58:18Z","title":"ZAHA: Introducing the Level of Facade Generalization and the Large-Scale\n Point Cloud Facade Semantic Segmentation Benchmark Dataset","summary":" Facade semantic segmentation is a long-standing challenge in photogrammetry\nand computer vision. Although the last decades have witnessed the influx of\nfacade segmentation methods, there is a lack of comprehensive facade classes\nand data covering the architectural variability. In ZAHA, we introduce Level of\nFacade Generalization (LoFG), novel hierarchical facade classes designed based\non international urban modeling standards, ensuring compatibility with\nreal-world challenging classes and uniform methods' comparison. Realizing the\nLoFG, we present to date the largest semantic 3D facade segmentation dataset,\nproviding 601 million annotated points at five and 15 classes of LoFG2 and\nLoFG3, respectively. Moreover, we analyze the performance of baseline semantic\nsegmentation methods on our introduced LoFG classes and data, complementing it\nwith a discussion on the unresolved challenges for facade segmentation. 
We\nfirmly believe that ZAHA shall facilitate further development of 3D facade\nsemantic segmentation methods, enabling robust segmentation indispensable in\ncreating urban digital twins.\n","authors":["Olaf Wysocki","Yue Tan","Thomas Froech","Yan Xia","Magdalena Wysocki","Ludwig Hoegner","Daniel Cremers","Christoph Holst"],"pdf_url":"https://arxiv.org/pdf/2411.04865v4.pdf","comment":"Accepted to WACV 2025 (IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV))"},{"id":"http://arxiv.org/abs/2412.13913v2","updated":"2024-12-19T13:41:08Z","published":"2024-12-18T14:53:38Z","title":"A Black-Box Evaluation Framework for Semantic Robustness in Bird's Eye\n View Detection","summary":" Camera-based Bird's Eye View (BEV) perception models receive increasing\nattention for their crucial role in autonomous driving, a domain where concerns\nabout the robustness and reliability of deep learning have been raised. While\nonly a few works have investigated the effects of randomly generated semantic\nperturbations, aka natural corruptions, on the multi-view BEV detection task,\nwe develop a black-box robustness evaluation framework that adversarially\noptimises three common semantic perturbations: geometric transformation, colour\nshifting, and motion blur, to deceive BEV models, serving as the first approach\nin this emerging field. To address the challenge posed by optimising the\nsemantic perturbation, we design a smoothed, distance-based surrogate function\nto replace the mAP metric and introduce SimpleDIRECT, a deterministic\noptimisation algorithm that utilises observed slopes to guide the optimisation\nprocess. By comparing with randomised perturbation and two optimisation\nbaselines, we demonstrate the effectiveness of the proposed framework.\nAdditionally, we provide a benchmark on the semantic robustness of ten recent\nBEV models. The results reveal that PolarFormer, which emphasises geometric\ninformation from multi-view images, exhibits the highest robustness, whereas\nBEVDet is fully compromised, with its precision reduced to zero.\n","authors":["Fu Wang","Yanghao Zhang","Xiangyu Yin","Guangliang Cheng","Zeyu Fu","Xiaowei Huang","Wenjie Ruan"],"pdf_url":"https://arxiv.org/pdf/2412.13913v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10929v6","updated":"2024-12-19T13:39:55Z","published":"2024-10-14T16:35:27Z","title":"ASTM :Autonomous Smart Traffic Management System Using Artificial\n Intelligence CNN and LSTM","summary":" In the modern world, the development of Artificial Intelligence (AI) has\ncontributed to improvements in various areas, including automation, computer\nvision, fraud detection, and more. AI can be leveraged to enhance the\nefficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce\ntraffic congestion rates. This paper presents an Autonomous Smart Traffic\nManagement (STM) system that uses AI to improve traffic flow rates. The system\nemploys the YOLO V5 Convolutional Neural Network to detect vehicles in traffic\nmanagement images. Additionally, it predicts the number of vehicles for the\nnext 12 hours using a Recurrent Neural Network with Long Short-Term Memory\n(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the\ntraffic cycle length based on these vehicle predictions, aided by AI. 
From the\nresults of the RNN-LSTM model for predicting vehicle numbers over the next 12\nhours, we observe that the model predicts traffic with a Mean Squared Error\n(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles.\nAfter simulating the STM system in the CARLA simulation environment, we found\nthat the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per\nminute) is 50\\% higher than the rate without STM (around 15 vehicles per\nminute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5\nseconds per vehicle) is 70\\% lower than without STM (around 12 seconds per\nvehicle). These results demonstrate that the STM system using AI can increase\ntraffic flow by 50\\% and reduce vehicle pass delays by 70\\%.\n","authors":["Christofel Rio Goenawan"],"pdf_url":"https://arxiv.org/pdf/2410.10929v6.pdf","comment":"In process to IEEE Intelligent Vehicle Symposium 2025"},{"id":"http://arxiv.org/abs/2412.14846v1","updated":"2024-12-19T13:38:20Z","published":"2024-12-19T13:38:20Z","title":"Head and Neck Tumor Segmentation of MRI from Pre- and Mid-radiotherapy\n with Pre-training, Data Augmentation and Dual Flow UNet","summary":" Head and neck tumors and metastatic lymph nodes are crucial for treatment\nplanning and prognostic analysis. Accurate segmentation and quantitative\nanalysis of these structures require pixel-level annotation, making automated\nsegmentation techniques essential for the diagnosis and treatment of head and\nneck cancer. In this study, we investigated the effects of multiple strategies\non the segmentation of pre-radiotherapy (pre-RT) and mid-radiotherapy (mid-RT)\nimages. For the segmentation of pre-RT images, we utilized: 1) a fully\nsupervised learning approach, and 2) the same approach enhanced with\npre-trained weights and the MixUp data augmentation technique. For mid-RT\nimages, we introduced a novel computational-friendly network architecture that\nfeatures separate encoders for mid-RT images and registered pre-RT images with\ntheir labels. The mid-RT encoder branch integrates information from pre-RT\nimages and labels progressively during the forward propagation. We selected the\nhighest-performing model from each fold and used their predictions to create an\nensemble average for inference. In the final test, our models achieved a\nsegmentation performance of 82.38% for pre-RT and 72.53% for mid-RT on\naggregated Dice Similarity Coefficient (DSC) as HiLab. Our code is available at\nhttps://github.com/WltyBY/HNTS-MRG2024_train_code.\n","authors":["Litingyu Wang","Wenjun Liao","Shichuan Zhang","Guotai Wang"],"pdf_url":"https://arxiv.org/pdf/2412.14846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14837v1","updated":"2024-12-19T13:27:58Z","published":"2024-12-19T13:27:58Z","title":"ObjVariantEnsemble: Advancing Point Cloud LLM Evaluation in Challenging\n Scenes with Subtly Distinguished Objects","summary":" 3D scene understanding is an important task, and there has been a recent\nsurge of research interest in aligning 3D representations of point clouds with\ntext to empower embodied AI. However, due to the lack of comprehensive 3D\nbenchmarks, the capabilities of 3D models in real-world scenes, particularly\nthose that are challenging with subtly distinguished objects, remain\ninsufficiently investigated. 
To facilitate a more thorough evaluation of 3D\nmodels' capabilities, we propose a scheme, ObjVariantEnsemble, to\nsystematically introduce more scenes with specified object classes, colors,\nshapes, quantities, and spatial relationships to meet model evaluation needs.\nMore importantly, we intentionally construct scenes with similar objects to a\ncertain degree and design an LLM-VLM-cooperated annotator to capture key\ndistinctions as annotations. The resultant benchmark can better challenge 3D\nmodels, reveal their shortcomings in understanding, and potentially aid in the\nfurther development of 3D models.\n","authors":["Qihang Cao","Huangxun Chen"],"pdf_url":"https://arxiv.org/pdf/2412.14837v1.pdf","comment":"Accepted to AAAI2025"},{"id":"http://arxiv.org/abs/2412.14835v1","updated":"2024-12-19T13:25:39Z","published":"2024-12-19T13:25:39Z","title":"Progressive Multimodal Reasoning via Active Retrieval","summary":" Multi-step multimodal reasoning tasks pose significant challenges for\nmultimodal large language models (MLLMs), and finding effective ways to enhance\ntheir performance in such scenarios remains an unresolved issue. In this paper,\nwe propose AR-MCTS, a universal framework designed to progressively improve the\nreasoning capabilities of MLLMs through Active Retrieval (AR) and Monte Carlo\nTree Search (MCTS). Our approach begins with the development of a unified\nretrieval module that retrieves key supporting insights for solving complex\nreasoning problems from a hybrid-modal retrieval corpus. To bridge the gap in\nautomated multimodal reasoning verification, we employ the MCTS algorithm\ncombined with an active retrieval mechanism, which enables the automatic\ngeneration of step-wise annotations. This strategy dynamically retrieves key\ninsights for each reasoning step, moving beyond traditional beam search\nsampling to improve the diversity and reliability of the reasoning space.\nAdditionally, we introduce a process reward model that aligns progressively to\nsupport the automatic verification of multimodal reasoning tasks. Experimental\nresults across three complex multimodal reasoning benchmarks confirm the\neffectiveness of the AR-MCTS framework in enhancing the performance of various\nmultimodal models. Further analysis demonstrates that AR-MCTS can optimize\nsampling diversity and accuracy, yielding reliable multimodal reasoning.\n","authors":["Guanting Dong","Chenghao Zhang","Mengjie Deng","Yutao Zhu","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2412.14835v1.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2412.14833v1","updated":"2024-12-19T13:21:04Z","published":"2024-12-19T13:21:04Z","title":"Synchronized and Fine-Grained Head for Skeleton-Based Ambiguous Action\n Recognition","summary":" Skeleton-based action recognition using GCNs has achieved remarkable\nperformance, but recognizing ambiguous actions, such as \"waving\" and\n\"saluting\", remains a significant challenge. Existing methods typically rely on\na serial combination of GCNs and TCNs, where spatial and temporal features are\nextracted independently, leading to an unbalanced spatial-temporal information,\nwhich hinders accurate action recognition. Moreover, existing methods for\nambiguous actions often overemphasize local details, resulting in the loss of\ncrucial global context, which further complicates the task of differentiating\nambiguous actions. 
To address these challenges, we propose a lightweight\nplug-and-play module called Synchronized and Fine-grained Head (SF-Head),\ninserted between GCN and TCN layers. SF-Head first conducts Synchronized\nSpatial-Temporal Extraction (SSTE) with a Feature Redundancy Loss (F-RL),\nensuring a balanced interaction between the two types of features. It then\nperforms Adaptive Cross-dimensional Feature Aggregation (AC-FA), with a Feature\nConsistency Loss (F-CL), which aligns the aggregated feature with their\noriginal spatial-temporal feature. This aggregation step effectively combines\nboth global context and local details. Experimental results on NTU RGB+D 60,\nNTU RGB+D 120, and NW-UCLA datasets demonstrate significant improvements in\ndistinguishing ambiguous actions. Our code will be made available at\nhttps://github.com/HaoHuang2003/SFHead.\n","authors":["Hao Huang","Yujie Lin","Siyu Chen","Haiyang Liu"],"pdf_url":"https://arxiv.org/pdf/2412.14833v1.pdf","comment":"20pages, 5 figures"},{"id":"http://arxiv.org/abs/2412.14821v1","updated":"2024-12-19T13:12:15Z","published":"2024-12-19T13:12:15Z","title":"PC-BEV: An Efficient Polar-Cartesian BEV Fusion Framework for LiDAR\n Semantic Segmentation","summary":" Although multiview fusion has demonstrated potential in LiDAR segmentation,\nits dependence on computationally intensive point-based interactions, arising\nfrom the lack of fixed correspondences between views such as range view and\nBird's-Eye View (BEV), hinders its practical deployment. This paper challenges\nthe prevailing notion that multiview fusion is essential for achieving high\nperformance. We demonstrate that significant gains can be realized by directly\nfusing Polar and Cartesian partitioning strategies within the BEV space. Our\nproposed BEV-only segmentation model leverages the inherent fixed grid\ncorrespondences between these partitioning schemes, enabling a fusion process\nthat is orders of magnitude faster (170$\\times$ speedup) than conventional\npoint-based methods. Furthermore, our approach facilitates dense feature\nfusion, preserving richer contextual information compared to sparse point-based\nalternatives. To enhance scene understanding while maintaining inference\nefficiency, we also introduce a hybrid Transformer-CNN architecture. Extensive\nevaluation on the SemanticKITTI and nuScenes datasets provides compelling\nevidence that our method outperforms previous multiview fusion approaches in\nterms of both performance and inference speed, highlighting the potential of\nBEV-based fusion for LiDAR segmentation. Code is available at\n\\url{https://github.com/skyshoumeng/PC-BEV.}\n","authors":["Shoumeng Qiu","Xinrun Li","XiangYang Xue","Jian Pu"],"pdf_url":"https://arxiv.org/pdf/2412.14821v1.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14819v1","updated":"2024-12-19T13:10:38Z","published":"2024-12-19T13:10:38Z","title":"Multi-Level Embedding and Alignment Network with Consistency and\n Invariance Learning for Cross-View Geo-Localization","summary":" Cross-View Geo-Localization (CVGL) involves determining the localization of\ndrone images by retrieving the most similar GPS-tagged satellite images.\nHowever, the imaging gaps between platforms are often significant and the\nvariations in viewpoints are substantial, which limits the ability of existing\nmethods to effectively associate cross-view features and extract consistent and\ninvariant characteristics. 
Moreover, existing methods often overlook the\nproblem of increased computational and storage requirements when improving\nmodel performance. To handle these limitations, we propose a lightweight\nenhanced alignment network, called the Multi-Level Embedding and Alignment\nNetwork (MEAN). The MEAN network uses a progressive multi-level enhancement\nstrategy, global-to-local associations, and cross-domain alignment, enabling\nfeature communication across levels. This allows MEAN to effectively connect\nfeatures at different levels and learn robust cross-view consistent mappings\nand modality-invariant features. Moreover, MEAN adopts a shallow backbone\nnetwork combined with a lightweight branch design, effectively reducing\nparameter count and computational complexity. Experimental results on the\nUniversity-1652 and SUES-200 datasets demonstrate that MEAN reduces parameter\ncount by 62.17% and computational complexity by 70.99% compared to\nstate-of-the-art models, while maintaining competitive or even superior\nperformance. The codes will be released soon.\n","authors":["Zhongwei Chen","Zhao-Xu Yang","Hai-Jun Rong"],"pdf_url":"https://arxiv.org/pdf/2412.14819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14816v1","updated":"2024-12-19T13:10:03Z","published":"2024-12-19T13:10:03Z","title":"Explainable Tampered Text Detection via Multimodal Large Models","summary":" Recently, tampered text detection has attracted increasing attention due to\nits essential role in information security. Although existing methods can\ndetect the tampered text region, the interpretation of such detection remains\nunclear, making the prediction unreliable. To address this black-box problem,\nwe propose to explain the basis of tampered text detection with natural\nlanguage via large multimodal models. To fill the data gap for this task, we\npropose a large-scale, comprehensive dataset, ETTD, which contains both\npixel-level annotations indicating the tampered text region and natural\nlanguage annotations describing the anomaly of the tampered text. Multiple\nmethods are employed to improve the quality of the proposed data. For example,\na fused mask prompt is proposed to reduce confusion when querying GPT4o to\ngenerate anomaly descriptions. By weighting the input image with the mask\nannotation, the tampered region can be clearly indicated and the content in and\naround the tampered region can also be preserved. We also propose prompting\nGPT4o to recognize tampered texts and filtering out the responses with low OCR\naccuracy, which can effectively improve annotation quality in an automatic\nmanner. To further improve explainable tampered text detection, we propose a\nsimple yet effective model called TTD, which benefits from improved\nfine-grained perception by paying attention to the suspected region with\nauxiliary reference grounding query. Extensive experiments on both the ETTD\ndataset and the public dataset have verified the effectiveness of the proposed\nmethods. In-depth analysis is also provided to inspire further research. 
The\ndataset and code will be made publicly available.\n","authors":["Chenfan Qu","Jian Liu","Haoxing Chen","Baihan Yu","Jingjing Liu","Weiqiang Wang","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2412.14816v1.pdf","comment":"The first work for explainable tampered text detection"},{"id":"http://arxiv.org/abs/2304.02488v4","updated":"2024-12-19T13:00:35Z","published":"2023-04-05T15:02:30Z","title":"SCB-dataset: A Dataset for Detecting Student Classroom Behavior","summary":" The use of deep learning methods for automatic detection of students'\nclassroom behavior is a promising approach to analyze their class performance\nand enhance teaching effectiveness. However, the lack of publicly available\ndatasets on student behavior poses a challenge for researchers in this field.\nTo address this issue, we propose a Student Classroom Behavior dataset\n(SCB-dataset) that reflects real-life scenarios. Our dataset includes 11,248\nlabels and 4,003 images, with a focus on hand-raising behavior. We evaluated\nthe dataset using the YOLOv7 algorithm, achieving a mean average precision\n(mAP) of up to 85.3%. We believe that our dataset can serve as a robust\nfoundation for future research in the field of student behavior detection and\npromote further advancements in this area. Our SCB-dataset can be downloaded\nfrom: https://github.com/Whiffe/SCB-dataset\n","authors":["Fan Yang"],"pdf_url":"https://arxiv.org/pdf/2304.02488v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.13735v2","updated":"2024-12-19T12:59:31Z","published":"2024-12-18T11:14:01Z","title":"3D Registration in 30 Years: A Survey","summary":" 3D point cloud registration is a fundamental problem in computer vision,\ncomputer graphics, robotics, remote sensing, etc. Over the last thirty\nyears, we have witnessed the amazing advancement in this area with numerous\nkinds of solutions. Although a handful of relevant surveys have been conducted,\ntheir coverage is still limited. In this work, we present a comprehensive\nsurvey on 3D point cloud registration, covering a set of sub-areas such as\npairwise coarse registration, pairwise fine registration, multi-view\nregistration, cross-scale registration, and multi-instance registration. The\ndatasets, evaluation metrics, method taxonomy, discussions of the merits and\ndemerits, insightful thoughts of future directions are comprehensively\npresented in this survey. The regularly updated project page of the survey is\navailable at https://github.com/Amyyyy11/3D-Registration-in-30-Years-A-Survey.\n","authors":["Jiaqi Yang","Chu'ai Zhang","Zhengbao Wang","Xinyue Cao","Xuan Ouyang","Xiyu Zhang","Zhenxuan Zeng","Zhao Zeng","Borui Lu","Zhiyi Xia","Qian Zhang","Yulan Guo","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.13735v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14803v1","updated":"2024-12-19T12:48:40Z","published":"2024-12-19T12:48:40Z","title":"Video Prediction Policy: A Generalist Robot Policy with Predictive\n Visual Representations","summary":" Recent advancements in robotics have focused on developing generalist\npolicies capable of performing multiple tasks. Typically, these policies\nutilize pre-trained vision encoders to capture crucial information from current\nobservations. However, previous vision encoders, which were trained on two-image\ncontrastive learning or single-image reconstruction, cannot perfectly capture\nthe sequential information essential for embodied tasks. 
Recently, video\ndiffusion models (VDMs) have demonstrated the capability to accurately predict\nfuture image sequences, exhibiting a good understanding of physical dynamics.\nMotivated by the strong visual prediction capabilities of VDMs, we hypothesize\nthat they inherently possess visual representations that reflect the evolution\nof the physical world, which we term predictive visual representations.\nBuilding on this hypothesis, we propose the Video Prediction Policy (VPP), a\ngeneralist robotic policy conditioned on the predictive visual representations\nfrom VDMs. To further enhance these representations, we incorporate diverse\nhuman or robotic manipulation datasets, employing unified video-generation\ntraining objectives. VPP consistently outperforms existing methods across two\nsimulated and two real-world benchmarks. Notably, it achieves a 28.1\\% relative\nimprovement in the Calvin ABC-D benchmark compared to the previous\nstate-of-the-art and delivers a 28.8\\% increase in success rates for complex\nreal-world dexterous manipulation tasks.\n","authors":["Yucheng Hu","Yanjiang Guo","Pengchao Wang","Xiaoyu Chen","Yen-Jen Wang","Jianke Zhang","Koushil Sreenath","Chaochao Lu","Jianyu Chen"],"pdf_url":"https://arxiv.org/pdf/2412.14803v1.pdf","comment":"The first two authors contribute equally. Project Page at\n https://video-prediction-policy.github.io/"},{"id":"http://arxiv.org/abs/2410.05317v3","updated":"2024-12-19T12:38:23Z","published":"2024-10-05T03:47:06Z","title":"Accelerating Diffusion Transformers with Token-wise Feature Caching","summary":" Diffusion transformers have shown significant effectiveness in both image and\nvideo synthesis at the expense of huge computation costs. To address this\nproblem, feature caching methods have been introduced to accelerate diffusion\ntransformers by caching the features in previous timesteps and reusing them in\nthe following timesteps. However, previous caching methods ignore that\ndifferent tokens exhibit different sensitivities to feature caching, and\nfeature caching on some tokens may lead to 10$\\times$ more destruction to the\noverall generation quality compared with other tokens. In this paper, we\nintroduce token-wise feature caching, allowing us to adaptively select the most\nsuitable tokens for caching, and further enable us to apply different caching\nratios to neural layers in different types and depths. Extensive experiments on\nPixArt-$\\alpha$, OpenSora, and DiT demonstrate our effectiveness in both image\nand video generation with no requirements for training. For instance,\n2.36$\\times$ and 1.93$\\times$ acceleration are achieved on OpenSora and\nPixArt-$\\alpha$ with almost no drop in generation quality.\n","authors":["Chang Zou","Xuyang Liu","Ting Liu","Siteng Huang","Linfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.05317v3.pdf","comment":"In this version, we achieved a nearly lossless acceleration of 1.51\n times for ToCa on FLUX in the appendix"},{"id":"http://arxiv.org/abs/2412.13803v2","updated":"2024-12-19T12:31:34Z","published":"2024-12-18T12:50:11Z","title":"M$^3$-VOS: Multi-Phase, Multi-Transition, and Multi-Scenery Video Object\n Segmentation","summary":" Intelligent robots need to interact with diverse objects across various\nenvironments. The appearance and state of objects frequently undergo complex\ntransformations depending on the object properties, e.g., phase transitions.\nHowever, in the vision community, segmenting dynamic objects with phase\ntransitions is overlooked. 
In light of this, we introduce the concept of phase\nin segmentation, which categorizes real-world objects based on their visual\ncharacteristics and potential morphological and appearance changes. Then, we\npresent a new benchmark, Multi-Phase, Multi-Transition, and Multi-Scenery Video\nObject Segmentation (M$^3$-VOS), to verify the ability of models to understand\nobject phases, which consists of 479 high-resolution videos spanning over 10\ndistinct everyday scenarios. It provides dense instance mask annotations that\ncapture both object phases and their transitions. We evaluate state-of-the-art\nmethods on M$^3$-VOS, yielding several key insights. Notably, current\nappearance-based approaches show significant room for improvement when handling\nobjects with phase transitions. The inherent changes in disorder suggest that\nthe predictive performance of the forward entropy-increasing process can be\nimproved through a reverse entropy-reducing process. These findings lead us to\npropose ReVOS, a new plug-and-play model that improves its performance by\nreversal refinement. Our data and code will be publicly available at\nhttps://zixuan-chen.github.io/M-cubeVOS.github.io/.\n","authors":["Zixuan Chen","Jiaxin Li","Liming Tan","Yejie Guo","Junxuan Liang","Cewu Lu","Yong-Lu Li"],"pdf_url":"https://arxiv.org/pdf/2412.13803v2.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2412.14790v1","updated":"2024-12-19T12:29:31Z","published":"2024-12-19T12:29:31Z","title":"YOLOv11 Optimization for Efficient Resource Utilization","summary":" The objective of this research is to optimize the eleventh iteration of You\nOnly Look Once (YOLOv11) by developing size-specific modified versions of the\narchitecture. These modifications involve pruning unnecessary layers and\nreconfiguring the main architecture of YOLOv11. Each proposed version is\ntailored to detect objects of specific size ranges, from small to large. To\nensure proper model selection based on dataset characteristics, we introduced\nan object classifier program. This program identifies the most suitable\nmodified version for a given dataset. The proposed models were evaluated on\nvarious datasets and compared with the original YOLOv11 and YOLOv8 models. The\nexperimental results highlight significant improvements in computational\nresource efficiency, with the proposed models maintaining the accuracy of the\noriginal YOLOv11. In some cases, the modified versions outperformed the\noriginal model regarding detection performance. Furthermore, the proposed\nmodels demonstrated reduced model sizes and faster inference times. Model\nweights and the object size classifier can be found in this repository.\n","authors":["Areeg Fagad Rasheed","M. Zarkoosh"],"pdf_url":"https://arxiv.org/pdf/2412.14790v1.pdf","comment":"12 pages, 13 figures, 4 tables"},{"id":"http://arxiv.org/abs/2412.09401v2","updated":"2024-12-19T12:23:39Z","published":"2024-12-12T16:08:03Z","title":"SLAM3R: Real-Time Dense Scene Reconstruction from Monocular RGB Videos","summary":" In this paper, we introduce SLAM3R, a novel and effective monocular RGB SLAM\nsystem for real-time and high-quality dense 3D reconstruction. SLAM3R provides\nan end-to-end solution by seamlessly integrating local 3D reconstruction and\nglobal coordinate registration through feed-forward neural networks. Given an\ninput video, the system first converts it into overlapping clips using a\nsliding window mechanism. 
Unlike traditional pose optimization-based methods,\nSLAM3R directly regresses 3D pointmaps from RGB images in each window and\nprogressively aligns and deforms these local pointmaps to create a globally\nconsistent scene reconstruction - all without explicitly solving any camera\nparameters. Experiments across datasets consistently show that SLAM3R achieves\nstate-of-the-art reconstruction accuracy and completeness while maintaining\nreal-time performance at 20+ FPS. Code and weights at:\nhttps://github.com/PKU-VCL-3DV/SLAM3R.\n","authors":["Yuzheng Liu","Siyan Dong","Shuzhe Wang","Yanchao Yang","Qingnan Fan","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/2412.09401v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16302v2","updated":"2024-12-19T12:14:03Z","published":"2024-07-23T08:57:11Z","title":"DeepClean: Integrated Distortion Identification and Algorithm Selection\n for Rectifying Image Corruptions","summary":" Distortion identification and rectification in images and videos is vital for\nachieving good performance in downstream vision applications. Instead of\nrelying on fixed trial-and-error based image processing pipelines, we propose a\ntwo-level sequential planning approach for automated image distortion\nclassification and rectification. At the higher level it detects the class of\ncorruptions present in the input image, if any. The lower level selects a\nspecific algorithm to be applied, from a set of externally provided candidate\nalgorithms. The entire two-level setup runs in the form of a single forward\npass during inference and it is to be queried iteratively until the retrieval\nof the original image. We demonstrate improvements compared to three baselines\non the object detection task on COCO image dataset with rich set of\ndistortions. The advantage of our approach is its dynamic reconfiguration,\nconditioned on the input image and generalisability to unseen candidate\nalgorithms at inference time, since it relies only on the comparison of their\noutput of the image embeddings.\n","authors":["Aditya Kapoor","Harshad Khadilkar","Jayvardhana Gubbi"],"pdf_url":"https://arxiv.org/pdf/2407.16302v2.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2412.14168v2","updated":"2024-12-19T11:59:46Z","published":"2024-12-18T18:59:50Z","title":"FashionComposer: Compositional Fashion Image Generation","summary":" We present FashionComposer for compositional fashion image generation. Unlike\nprevious methods, FashionComposer is highly flexible. It takes multi-modal\ninput (i.e., text prompt, parametric human model, garment image, and face\nimage) and supports personalizing the appearance, pose, and figure of the human\nand assigning multiple garments in one pass. To achieve this, we first develop\na universal framework capable of handling diverse input modalities. We\nconstruct scaled training data to enhance the model's robust compositional\ncapabilities. To accommodate multiple reference images (garments and faces)\nseamlessly, we organize these references in a single image as an \"asset\nlibrary\" and employ a reference UNet to extract appearance features. To inject\nthe appearance features into the correct pixels in the generated result, we\npropose subject-binding attention. It binds the appearance features from\ndifferent \"assets\" with the corresponding text features. In this way, the model\ncould understand each asset according to their semantics, supporting arbitrary\nnumbers and types of reference images. 
As a comprehensive solution,\nFashionComposer also supports many other applications like human album\ngeneration, diverse virtual try-on tasks, etc.\n","authors":["Sihui Ji","Yiyang Wang","Xi Chen","Xiaogang Xu","Hao Luo","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2412.14168v2.pdf","comment":"https://sihuiji.github.io/FashionComposer-Page"},{"id":"http://arxiv.org/abs/2412.14768v1","updated":"2024-12-19T11:51:45Z","published":"2024-12-19T11:51:45Z","title":"FLAMe: Federated Learning with Attention Mechanism using Spatio-Temporal\n Keypoint Transformers for Pedestrian Fall Detection in Smart Cities","summary":" In smart cities, detecting pedestrian falls is a major challenge to ensure\nthe safety and quality of life of citizens. In this study, we propose a novel\nfall detection system using FLAMe (Federated Learning with Attention\nMechanism), a federated learning (FL) based algorithm. FLAMe trains around\nimportant keypoint information and only transmits the trained important weights\nto the server, reducing communication costs and preserving data privacy.\nFurthermore, the lightweight keypoint transformer model is integrated into the\nFL framework to effectively learn spatio-temporal features. We validated the\nexperiment using 22,672 video samples from the \"Fall Accident Risk Behavior\nVideo-Sensor Pair data\" dataset from AI-Hub. As a result of the experiment, the\nFLAMe-based system achieved an accuracy of 94.02% with about 190,000\ntransmission parameters, maintaining performance similar to that of existing\ncentralized learning while maximizing efficiency by reducing communication\ncosts by about 40% compared to the existing FL algorithm, FedAvg. Therefore,\nthe FLAMe algorithm has demonstrated that it provides robust performance in the\ndistributed environment of smart cities and is a practical and effective\nsolution for public safety.\n","authors":["Byeonghun Kim","Byeongjoon Noh"],"pdf_url":"https://arxiv.org/pdf/2412.14768v1.pdf","comment":"8 pages, 7 figures, AAAI 2025 FLUID Workshop"},{"id":"http://arxiv.org/abs/2408.01812v3","updated":"2024-12-19T11:29:09Z","published":"2024-08-03T15:43:56Z","title":"SkyDiffusion: Ground-to-Aerial Image Synthesis with Diffusion Models and\n BEV Paradigm","summary":" Ground-to-aerial image synthesis focuses on generating realistic aerial\nimages from corresponding ground street view images while maintaining\nconsistent content layout, simulating a top-down view. The significant\nviewpoint difference leads to domain gaps between views, and dense urban scenes\nlimit the visible range of street views, making this cross-view generation task\nparticularly challenging. In this paper, we introduce SkyDiffusion, a novel\ncross-view generation method for synthesizing aerial images from street view\nimages, utilizing a diffusion model and the Bird's-Eye View (BEV) paradigm. The\nCurved-BEV method in SkyDiffusion converts street-view images into a BEV\nperspective, effectively bridging the domain gap, and employs a \"multi-to-one\"\nmapping strategy to address occlusion issues in dense urban scenes. Next,\nSkyDiffusion designed a BEV-guided diffusion model to generate\ncontent-consistent and realistic aerial images. Additionally, we introduce a\nnovel dataset, Ground2Aerial-3, designed for diverse ground-to-aerial image\nsynthesis applications, including disaster scene aerial synthesis, historical\nhigh-resolution satellite image synthesis, and low-altitude UAV image synthesis\ntasks. 
Experimental results demonstrate that SkyDiffusion outperforms\nstate-of-the-art methods on cross-view datasets across natural (CVUSA),\nsuburban (CVACT), urban (VIGOR-Chicago), and various application scenarios\n(G2A-3), achieving realistic and content-consistent aerial image generation.\nMore results and dataset information can be found at\nhttps://opendatalab.github.io/skydiffusion/ .\n","authors":["Junyan Ye","Jun He","Weijia Li","Zhutao Lv","Yi Lin","Jinhua Yu","Haote Yang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2408.01812v3.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.17981v3","updated":"2024-12-19T11:25:34Z","published":"2024-01-31T16:38:32Z","title":"From Training-Free to Adaptive: Empirical Insights into MLLMs'\n Understanding of Detection Information","summary":" Despite the impressive capabilities of Multimodal Large Language Models\n(MLLMs) in integrating text and image modalities, challenges remain in\naccurately interpreting detailed visual elements. Vision detection models excel\nat recognizing fine-grained image details, prompting researchers to use them to\nenhance MLLMs. One effective strategy is to infuse detection information in\ntext format, which has proven simple and effective. However, most studies\nutilize this method without training, leaving the potential of adaptive\ntraining largely unexplored. Adaptive training could significantly enhance\nMLLMs' comprehension of unique inputs while filtering out irrelevant\ninformation. This paper addresses the crucial question: How does training\nimpact MLLMs' understanding of infused textual detection information? We\nsystematically experiment with various representative models to evaluate the\neffects of training-free, retraining, and fine-tuning strategies. We also\nexamine the influence of training on MLLMs' original abilities and the\ninterchangeability of detection models. Our findings indicate that fine-tuning\na pre-trained MLLM to incorporate textual detection information delivers\nsuperior results compared to training-free and retraining methods, improving\nperformance by 6.71% across 10 widely recognized benchmarks. Furthermore,\nfine-tuning enables MLLMs to retain performance enhancements even when\ndetection models are swapped, indicating improved understanding of formatted\ntextual data. We release our codes to support further exploration of fusion\nstrategies for vision detection models and the enhancement of MLLMs'\nfine-grained multimodal capabilities.\n","authors":["Qirui Jiao","Daoyuan Chen","Yilun Huang","Yaliang Li","Ying Shen"],"pdf_url":"https://arxiv.org/pdf/2401.17981v3.pdf","comment":"32 pages, 22 tables, 7 figures"},{"id":"http://arxiv.org/abs/2410.23091v5","updated":"2024-12-19T11:18:58Z","published":"2024-10-30T15:06:44Z","title":"CausalDiff: Causality-Inspired Disentanglement via Diffusion Model for\n Adversarial Defense","summary":" Despite ongoing efforts to defend neural classifiers from adversarial\nattacks, they remain vulnerable, especially to unseen attacks. In contrast,\nhumans are difficult to fool with subtle manipulations, since we make\njudgments based only on essential factors. Inspired by this observation, we\nattempt to model label generation with essential label-causative factors and\nincorporate label-non-causative factors to assist data generation. 
For an\nadversarial example, we aim to discriminate the perturbations as non-causative\nfactors and make predictions only based on the label-causative factors.\nConcretely, we propose a causal diffusion model (CausalDiff) that adapts\ndiffusion models for conditional data generation and disentangles the two types\nof causal factors by learning towards a novel causal information bottleneck\nobjective. Empirically, CausalDiff has significantly outperformed\nstate-of-the-art defense methods on various unseen attacks, achieving an\naverage robustness of 86.39% (+4.01%) on CIFAR-10, 56.25% (+3.13%) on\nCIFAR-100, and 82.62% (+4.93%) on GTSRB (German Traffic Sign Recognition\nBenchmark). The code is available at\nhttps://github.com/CAS-AISafetyBasicResearchGroup/CausalDiff\n","authors":["Mingkun Zhang","Keping Bi","Wei Chen","Quanrun Chen","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.23091v5.pdf","comment":"accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.11383v2","updated":"2024-12-19T11:12:30Z","published":"2024-09-17T17:34:24Z","title":"Training Datasets Generation for Machine Learning: Application to Vision\n Based Navigation","summary":" Vision Based Navigation consists in utilizing cameras as precision sensors\nfor GNC after extracting information from images. To enable the adoption of\nmachine learning for space applications, one of the obstacles is the demonstration\nthat available training datasets are adequate to validate the algorithms. The\nobjective of the study is to generate datasets of images and metadata suitable\nfor training machine learning algorithms. Two use cases were selected and a\nrobust methodology was developed to validate the datasets including the ground\ntruth. The first use case is in-orbit rendezvous with a man-made object: a\nmockup of satellite ENVISAT. The second use case is a Lunar landing scenario.\nDatasets were produced from archival datasets (Chang'e 3), from the laboratory\nat DLR TRON facility and at Airbus Robotic laboratory, from SurRender software\nhigh fidelity image simulator using Model Capture and from Generative\nAdversarial Networks. The use case definition included the selection of\nalgorithms as benchmark: an AI-based pose estimation algorithm and a dense\noptical flow algorithm were selected. Eventually it is demonstrated that\ndatasets produced with SurRender and selected laboratory facilities are\nadequate to train machine learning algorithms.\n","authors":["Jérémy Lebreton","Ingo Ahrns","Roland Brochard","Christoph Haskamp","Hans Krüger","Matthieu Le Goff","Nicolas Menga","Nicolas Ollagnier","Ralf Regele","Francesco Capolupo","Massimo Casasco"],"pdf_url":"https://arxiv.org/pdf/2409.11383v2.pdf","comment":"6 pages, 4 figures, preprint of the proceedings of ESA SPAICE\n conference 2024"},{"id":"http://arxiv.org/abs/2408.04594v3","updated":"2024-12-19T11:04:20Z","published":"2024-08-08T17:10:16Z","title":"Img-Diff: Contrastive Data Synthesis for Multimodal Large Language\n Models","summary":" High-performance Multimodal Large Language Models (MLLMs) are heavily\ndependent on data quality. To advance fine-grained image recognition within\nMLLMs, we introduce a novel data synthesis method inspired by contrastive\nlearning and image difference captioning. Our key idea involves challenging the\nmodel to discern both matching and distinct elements by scrutinizing object\ndifferences in detailed regions across similar images. We begin by generating\npairs of similar images that emphasize object variations. 
Following this, we\nemploy a Difference Area Generator to pinpoint object differences, and\nsubsequently, a Difference Captions Generator to articulate these differences.\nThis process results in a high-quality dataset of \"object replacement\" samples,\ntermed Img-Diff, which can be scaled as needed due to its automated nature. We\nleverage this generated dataset to fine-tune state-of-the-art (SOTA) MLLMs,\nsuch as InternVL2, achieving substantial improvements across various image\ndifference and Visual Question Answering tasks. Notably, the trained models\nsignificantly outperform existing SOTA models like GPT-4V and Gemini on the\nMMVP benchmark. Additionally, we conduct comprehensive evaluations to validate\nthe dataset's diversity, quality, and robustness, offering several insights\ninto the synthesis of such contrastive datasets. We release our codes and\ndataset to encourage further research on multimodal data synthesis and MLLMs'\nfundamental capabilities for image understanding.\n","authors":["Qirui Jiao","Daoyuan Chen","Yilun Huang","Bolin Ding","Yaliang Li","Ying Shen"],"pdf_url":"https://arxiv.org/pdf/2408.04594v3.pdf","comment":"22 pages, 10 figures, 16 tables"},{"id":"http://arxiv.org/abs/2412.10681v2","updated":"2024-12-19T10:59:24Z","published":"2024-12-14T05:01:46Z","title":"One Pixel is All I Need","summary":" Vision Transformers (ViTs) have achieved record-breaking performance in\nvarious visual tasks. However, concerns about their robustness against backdoor\nattacks have grown. Backdoor attacks involve associating a specific trigger\nwith a target label, causing the model to predict the attacker-specified label\nwhen the trigger is present, while correctly identifying clean images.We found\nthat ViTs exhibit higher attack success rates for quasi-triggers(patterns\ndifferent from but similar to the original training triggers)compared to CNNs.\nMoreover, some backdoor features in clean samples can suppress the original\ntrigger, making quasi-triggers more effective.To better understand and exploit\nthese vulnerabilities, we developed a tool called the Perturbation Sensitivity\nDistribution Map (PSDM). PSDM computes and sums gradients over many inputs to\nshow how sensitive the model is to small changes in the input. In ViTs, PSDM\nreveals a patch-like pattern where central pixels are more sensitive than\nedges. We use PSDM to guide the creation of quasi-triggers.Based on these\nfindings, we designed \"WorstVIT,\" a simple yet effective data poisoning\nbackdoor for ViT models. This attack requires an extremely low poisoning rate,\ntrains for just one epoch, and modifies a single pixel to successfully attack\nall validation images.\n","authors":["Deng Siqin","Zhou Xiaoyi"],"pdf_url":"https://arxiv.org/pdf/2412.10681v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16729v3","updated":"2024-12-19T10:53:48Z","published":"2024-08-29T17:20:59Z","title":"Prediction-Feedback DETR for Temporal Action Detection","summary":" Temporal Action Detection (TAD) is fundamental yet challenging for real-world\nvideo applications. Leveraging the unique benefits of transformers, various\nDETR-based approaches have been adopted in TAD. However, it has recently been\nidentified that the attention collapse in self-attention causes the performance\ndegradation of DETR for TAD. Building upon previous research, this paper newly\naddresses the attention collapse problem in cross-attention within DETR-based\nTAD methods. 
Moreover, our findings reveal that cross-attention exhibits\npatterns distinct from predictions, indicating a short-cut phenomenon. To\nresolve this, we propose a new framework, Prediction-Feedback DETR (Pred-DETR),\nwhich utilizes predictions to restore the collapse and align the cross- and\nself-attention with predictions. Specifically, we devise novel\nprediction-feedback objectives using guidance from the relations of the\npredictions. As a result, Pred-DETR significantly alleviates the collapse and\nachieves state-of-the-art performance among DETR-based methods on various\nchallenging benchmarks including THUMOS14, ActivityNet-v1.3, HACS, and\nFineAction.\n","authors":["Jihwan Kim","Miso Lee","Cheol-Ho Cho","Jihyun Lee","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2408.16729v3.pdf","comment":"Accepted to AAAI 2025"},{"id":"http://arxiv.org/abs/2406.02507v3","updated":"2024-12-19T10:43:11Z","published":"2024-06-04T17:25:59Z","title":"Guiding a Diffusion Model with a Bad Version of Itself","summary":" The primary axes of interest in image-generating diffusion models are image\nquality, the amount of variation in the results, and how well the results align\nwith a given condition, e.g., a class label or a text prompt. The popular\nclassifier-free guidance approach uses an unconditional model to guide a\nconditional model, leading to simultaneously better prompt alignment and\nhigher-quality images at the cost of reduced variation. These effects seem\ninherently entangled, and thus hard to control. We make the surprising\nobservation that it is possible to obtain disentangled control over image\nquality without compromising the amount of variation by guiding generation\nusing a smaller, less-trained version of the model itself rather than an\nunconditional model. This leads to significant improvements in ImageNet\ngeneration, setting record FIDs of 1.01 for 64x64 and 1.25 for 512x512, using\npublicly available networks. Furthermore, the method is also applicable to\nunconditional diffusion models, drastically improving their quality.\n","authors":["Tero Karras","Miika Aittala","Tuomas Kynkäänniemi","Jaakko Lehtinen","Timo Aila","Samuli Laine"],"pdf_url":"https://arxiv.org/pdf/2406.02507v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.14719v1","updated":"2024-12-19T10:41:24Z","published":"2024-12-19T10:41:24Z","title":"Prototypical Calibrating Ambiguous Samples for Micro-Action Recognition","summary":" Micro-Action Recognition (MAR) has gained increasing attention due to its\ncrucial role as a form of non-verbal communication in social interactions, with\npromising potential for applications in human communication and emotion\nanalysis. However, current approaches often overlook the inherent ambiguity in\nmicro-actions, which arises from the wide category range and subtle visual\ndifferences between categories. This oversight hampers the accuracy of\nmicro-action recognition. In this paper, we propose a novel Prototypical\nCalibrating Ambiguous Network (\\textbf{PCAN}) to unleash and mitigate the\nambiguity of MAR. \\textbf{Firstly}, we employ a hierarchical action-tree to\nidentify the ambiguous sample, categorizing them into distinct sets of\nambiguous samples of false negatives and false positives, considering both\nbody- and action-level categories. \\textbf{Secondly}, we implement an ambiguous\ncontrastive refinement module to calibrate these ambiguous samples by\nregulating the distance between ambiguous samples and their corresponding\nprototypes. 
This calibration process aims to pull false negative\n($\\mathbb{FN}$) samples closer to their respective prototypes and push false\npositive ($\\mathbb{FP}$) samples apart from their affiliated prototypes. In\naddition, we propose a new prototypical diversity amplification loss to\nstrengthen the model's capacity by amplifying the differences between different\nprototypes. \\textbf{Finally}, we propose a prototype-guided rectification to\nrectify prediction by incorporating the representability of prototypes.\nExtensive experiments conducted on the benchmark dataset demonstrate the\nsuperior performance of our method compared to existing approaches. The code is\navailable at https://github.com/kunli-cs/PCAN.\n","authors":["Kun Li","Dan Guo","Guoliang Chen","Chunxiao Fan","Jingyuan Xu","Zhiliang Wu","Hehe Fan","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2412.14719v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14706v1","updated":"2024-12-19T10:19:43Z","published":"2024-12-19T10:19:43Z","title":"EnergyMoGen: Compositional Human Motion Generation with Energy-Based\n Diffusion Model in Latent Space","summary":" Diffusion models, particularly latent diffusion models, have demonstrated\nremarkable success in text-driven human motion generation. However, it remains\nchallenging for latent diffusion models to effectively compose multiple\nsemantic concepts into a single, coherent motion sequence. To address this\nissue, we propose EnergyMoGen, which includes two spectrums of Energy-Based\nModels: (1) We interpret the diffusion model as a latent-aware energy-based\nmodel that generates motions by composing a set of diffusion models in latent\nspace; (2) We introduce a semantic-aware energy model based on cross-attention,\nwhich enables semantic composition and adaptive gradient descent for text\nembeddings. To overcome the challenges of semantic inconsistency and motion\ndistortion across these two spectrums, we introduce Synergistic Energy Fusion.\nThis design allows the motion latent diffusion model to synthesize\nhigh-quality, complex motions by combining multiple energy terms corresponding\nto textual descriptions. Experiments show that our approach outperforms\nexisting state-of-the-art models on various motion generation tasks, including\ntext-to-motion generation, compositional motion generation, and multi-concept\nmotion generation. Additionally, we demonstrate that our method can be used to\nextend motion datasets and improve the text-to-motion task.\n","authors":["Jianrong Zhang","Hehe Fan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2412.14706v1.pdf","comment":"Project page: https://jiro-zhang.github.io/EnergyMoGen/"},{"id":"http://arxiv.org/abs/2412.14705v1","updated":"2024-12-19T10:17:50Z","published":"2024-12-19T10:17:50Z","title":"Event-assisted 12-stop HDR Imaging of Dynamic Scene","summary":" High dynamic range (HDR) imaging is a crucial task in computational\nphotography, which captures details across diverse lighting conditions.\nTraditional HDR fusion methods face limitations in dynamic scenes with extreme\nexposure differences, as aligning low dynamic range (LDR) frames becomes\nchallenging due to motion and brightness variation. In this work, we propose a\nnovel 12-stop HDR imaging approach for dynamic scenes, leveraging a dual-camera\nsystem with an event camera and an RGB camera. 
The event camera provides\ntemporally dense, high dynamic range signals that improve alignment between LDR\nframes with large exposure differences, reducing ghosting artifacts caused by\nmotion. Also, a real-world finetuning strategy is proposed to increase the\ngeneralization of alignment module on real-world events. Additionally, we\nintroduce a diffusion-based fusion module that incorporates image priors from\npre-trained diffusion models to address artifacts in high-contrast regions and\nminimize errors from the alignment process. To support this work, we developed\nthe ESHDR dataset, the first dataset for 12-stop HDR imaging with synchronized\nevent signals, and validated our approach on both simulated and real-world\ndata. Extensive experiments demonstrate that our method achieves\nstate-of-the-art performance, successfully extending HDR imaging to 12 stops in\ndynamic scenes.\n","authors":["Shi Guo","Zixuan Chen","Ziran Zhang","Yutian Chen","Gangwei Xu","Tianfan Xue"],"pdf_url":"https://arxiv.org/pdf/2412.14705v1.pdf","comment":"Project page:\n https://openimaginglab.github.io/Event-Assisted-12stops-HDR/"},{"id":"http://arxiv.org/abs/2410.17098v2","updated":"2024-12-19T10:03:18Z","published":"2024-10-22T15:22:53Z","title":"Activity Recognition on Avatar-Anonymized Datasets with Masked\n Differential Privacy","summary":" Privacy-preserving computer vision is an important emerging problem in\nmachine learning and artificial intelligence. Prevalent methods tackling this\nproblem use differential privacy (DP) or obfuscation techniques to protect the\nprivacy of individuals. In both cases, the utility of the trained model is\nsacrificed heavily in this process. In this work, we present an anonymization\npipeline that replaces sensitive human subjects in video datasets with\nsynthetic avatars within context, employing a combined rendering and stable\ndiffusion-based strategy. Additionally we propose masked differential privacy\n({MaskDP}) to protect non-anonymized but privacy sensitive background\ninformation. MaskDP allows for controlling sensitive regions where differential\nprivacy is applied, in contrast to applying DP on the entire input. This\ncombined methodology provides strong privacy protection while minimizing the\nusual performance penalty of privacy preserving methods. Experiments on\nmultiple challenging action recognition datasets demonstrate that our proposed\ntechniques result in better utility-privacy trade-offs compared to standard\ndifferentially private training in the especially demanding $\\epsilon<1$\nregime.\n","authors":["David Schneider","Sina Sajadmanesh","Vikash Sehwag","Saquib Sarfraz","Rainer Stiefelhagen","Lingjuan Lyu","Vivek Sharma"],"pdf_url":"https://arxiv.org/pdf/2410.17098v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14692v1","updated":"2024-12-19T09:51:45Z","published":"2024-12-19T09:51:45Z","title":"Explicit Relational Reasoning Network for Scene Text Detection","summary":" Connected component (CC) is a proper text shape representation that aligns\nwith human reading intuition. However, CC-based text detection methods have\nrecently faced a developmental bottleneck that their time-consuming\npost-processing is difficult to eliminate. To address this issue, we introduce\nan explicit relational reasoning network (ERRNet) to elegantly model the\ncomponent relationships without post-processing. 
Concretely, we first represent\neach text instance as multiple ordered text components, and then treat these\ncomponents as objects in sequential movement. In this way, scene text detection\ncan be innovatively viewed as a tracking problem. From this perspective, we\ndesign an end-to-end tracking decoder to achieve a CC-based method dispensing\nwith post-processing entirely. Additionally, we observe that there is an\ninconsistency between classification confidence and localization quality, so we\npropose a Polygon Monte-Carlo method to quickly and accurately evaluate the\nlocalization quality. Based on this, we introduce a position-supervised\nclassification loss to guide the task-aligned learning of ERRNet. Experiments\non challenging benchmarks demonstrate the effectiveness of our ERRNet. It\nconsistently achieves state-of-the-art accuracy while holding highly\ncompetitive inference speed.\n","authors":["Yuchen Su","Zhineng Chen","Yongkun Du","Zhilong Ji","Kai Hu","Jinfeng Bai","Xieping Gao"],"pdf_url":"https://arxiv.org/pdf/2412.14692v1.pdf","comment":"Accepted to AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14680v1","updated":"2024-12-19T09:32:53Z","published":"2024-12-19T09:32:53Z","title":"A Light-Weight Framework for Open-Set Object Detection with Decoupled\n Feature Alignment in Joint Space","summary":" Open-set object detection (OSOD) is highly desirable for robotic manipulation\nin unstructured environments. However, existing OSOD methods often fail to meet\nthe requirements of robotic applications due to their high computational burden\nand complex deployment. To address this issue, this paper proposes a\nlight-weight framework called Decoupled OSOD (DOSOD), which is a practical and\nhighly efficient solution to support real-time OSOD tasks in robotic systems.\nSpecifically, DOSOD builds upon the YOLO-World pipeline by integrating a\nvision-language model (VLM) with a detector. A Multilayer Perceptron (MLP)\nadaptor is developed to transform text embeddings extracted by the VLM into a\njoint space, within which the detector learns the region representations of\nclass-agnostic proposals. Cross-modality features are directly aligned in the\njoint space, avoiding the complex feature interactions and thereby improving\ncomputational efficiency. DOSOD operates like a traditional closed-set detector\nduring the testing phase, effectively bridging the gap between closed-set and\nopen-set detection. Compared to the baseline YOLO-World, the proposed DOSOD\nsignificantly enhances real-time performance while maintaining comparable\naccuracy. The lightweight DOSOD-S model achieves a Fixed AP of $26.7\\%$, compared to\n$26.2\\%$ for YOLO-World-v1-S and $22.7\\%$ for YOLO-World-v2-S, using similar\nbackbones on the LVIS minival dataset. Meanwhile, the FPS of DOSOD-S is\n$57.1\\%$ higher than YOLO-World-v1-S and $29.6\\%$ higher than YOLO-World-v2-S.\nIn addition, we demonstrate that the DOSOD model facilitates deployment on\nedge devices. 
The codes and models are publicly available at\nhttps://github.com/D-Robotics-AI-Lab/DOSOD.\n","authors":["Yonghao He","Hu Su","Haiyong Yu","Cong Yang","Wei Sui","Cong Wang","Song Liu"],"pdf_url":"https://arxiv.org/pdf/2412.14680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14678v1","updated":"2024-12-19T09:31:53Z","published":"2024-12-19T09:31:53Z","title":"Efficient Few-Shot Neural Architecture Search by Counting the Number of\n Nonlinear Functions","summary":" Neural architecture search (NAS) enables finding the best-performing\narchitecture from a search space automatically. Most NAS methods exploit an\nover-parameterized network (i.e., a supernet) containing all possible\narchitectures (i.e., subnets) in the search space. However, the subnets that\nshare the same set of parameters are likely to have different characteristics,\ninterfering with each other during training. To address this, few-shot NAS\nmethods have been proposed that divide the space into a few subspaces and\nemploy a separate supernet for each subspace to limit the extent of weight\nsharing. They achieve state-of-the-art performance, but the computational cost\nincreases accordingly. We introduce in this paper a novel few-shot NAS method\nthat exploits the number of nonlinear functions to split the search space. To\nbe specific, our method divides the space such that each subspace consists of\nsubnets with the same number of nonlinear functions. Our splitting criterion is\nefficient, since it does not require comparing gradients of a supernet to split\nthe space. In addition, we have found that dividing the space allows us to\nreduce the channel dimensions required for each supernet, which enables\ntraining multiple supernets in an efficient manner. We also introduce a\nsupernet-balanced sampling (SBS) technique, sampling several subnets at each\ntraining step, to train different supernets evenly within a limited number of\ntraining steps. Extensive experiments on standard NAS benchmarks demonstrate\nthe effectiveness of our approach. Our code is available at\nhttps://cvlab.yonsei.ac.kr/projects/EFS-NAS.\n","authors":["Youngmin Oh","Hyunju Lee","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2412.14678v1.pdf","comment":"Accepted to AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14672v1","updated":"2024-12-19T09:24:10Z","published":"2024-12-19T09:24:10Z","title":"FiVL: A Framework for Improved Vision-Language Alignment","summary":" Large Vision Language Models (LVLMs) have achieved significant progress in\nintegrating visual and textual inputs for multimodal reasoning. However, a\nrecurring challenge is ensuring these models utilize visual information as\neffectively as linguistic content when both modalities are necessary to\nformulate an accurate answer. We hypothesize that hallucinations arise due to\nthe lack of effective visual grounding in current LVLMs. This issue extends to\nvision-language benchmarks, where it is difficult to make the image\nindispensable for accurate answer generation, particularly in vision\nquestion-answering tasks. In this work, we introduce FiVL, a novel method for\nconstructing datasets designed to train LVLMs for enhanced visual grounding and\nto evaluate their effectiveness in achieving it. These datasets can be utilized\nfor both training and assessing an LVLM's ability to use image content as\nsubstantive evidence rather than relying solely on linguistic priors, providing\ninsights into the model's reliance on visual information. 
To demonstrate the\nutility of our dataset, we introduce an innovative training task that\noutperforms baselines alongside a validation method and application for\nexplainability. The code is available at https://github.com/IntelLabs/fivl.\n","authors":["Estelle Aflalo","Gabriela Ben Melech Stan","Tiep Le","Man Luo","Shachar Rosenman","Sayak Paul","Shao-Yen Tseng","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2412.14672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14671v1","updated":"2024-12-19T09:22:19Z","published":"2024-12-19T09:22:19Z","title":"MUSTER: Longitudinal Deformable Registration by Composition of\n Consecutive Deformations","summary":" Longitudinal imaging allows for the study of structural changes over time.\nOne approach to detecting such changes is by non-linear image registration.\nThis study introduces Multi-Session Temporal Registration (MUSTER), a novel\nmethod that facilitates longitudinal analysis of changes in extended series of\nmedical images. MUSTER improves upon conventional pairwise registration by\nincorporating more than two imaging sessions to recover longitudinal\ndeformations. Longitudinal analysis at a voxel-level is challenging due to\neffects of a changing image contrast as well as instrumental and environmental\nsources of bias between sessions. We show that local normalized\ncross-correlation as an image similarity metric leads to biased results and\npropose a robust alternative. We test the performance of MUSTER on a synthetic\nmulti-site, multi-session neuroimaging dataset and show that, in various\nscenarios, using MUSTER significantly enhances the estimated deformations\nrelative to pairwise registration. Additionally, we apply MUSTER on a sample of\nolder adults from the Alzheimer's Disease Neuroimaging Initiative (ADNI) study.\nThe results show that MUSTER can effectively identify patterns of\nneuro-degeneration from T1-weighted images and that these changes correlate\nwith changes in cognition, matching the performance of state of the art\nsegmentation methods. By leveraging GPU acceleration, MUSTER efficiently\nhandles large datasets, making it feasible also in situations with limited\ncomputational resources.\n","authors":["Edvard O. S. Grødem","Donatas Sederevičius","Esten H. Leonardsen","Bradley J. MacIntosh","Atle Bjørnerud","Till Schellhorn","Øystein Sørensen","Inge Amlien","Pablo F. Garrido","Anders M. Fjell"],"pdf_url":"https://arxiv.org/pdf/2412.14671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01220v3","updated":"2024-12-19T09:16:19Z","published":"2024-07-01T12:07:26Z","title":"Fast and Efficient: Mask Neural Fields for 3D Scene Segmentation","summary":" Understanding 3D scenes is a crucial challenge in computer vision research\nwith applications spanning multiple domains. Recent advancements in distilling\n2D vision-language foundation models into neural fields, like NeRF and 3DGS,\nenable open-vocabulary segmentation of 3D scenes from 2D multi-view images\nwithout the need for precise 3D annotations. However, while effective, these\nmethods typically rely on the per-pixel distillation of high-dimensional CLIP\nfeatures, introducing ambiguity and necessitating complex regularization\nstrategies, which adds inefficiency during training. This paper presents\nMaskField, which enables efficient 3D open-vocabulary segmentation with neural\nfields from a novel perspective. 
Unlike previous methods, MaskField decomposes\nthe distillation of mask and semantic features from foundation models by\nformulating a mask feature field and queries. MaskField overcomes ambiguous\nobject boundaries by naturally introducing SAM segmented object shapes without\nextra regularization during training. By circumventing the direct handling of\ndense high-dimensional CLIP features during training, MaskField is particularly\ncompatible with explicit scene representations like 3DGS. Our extensive\nexperiments show that MaskField not only surpasses prior state-of-the-art\nmethods but also achieves remarkably fast convergence. We hope that MaskField\nwill inspire further exploration into how neural fields can be trained to\ncomprehend 3D scenes from 2D models.\n","authors":["Zihan Gao","Lingling Li","Licheng Jiao","Fang Liu","Xu Liu","Wenping Ma","Yuwei Guo","Shuyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2407.01220v3.pdf","comment":"15 pages, 9 figures, Code:https://github.com/keloee/MaskField"},{"id":"http://arxiv.org/abs/2412.14660v1","updated":"2024-12-19T09:10:07Z","published":"2024-12-19T09:10:07Z","title":"Unveiling Uncertainty: A Deep Dive into Calibration and Performance of\n Multimodal Large Language Models","summary":" Multimodal large language models (MLLMs) combine visual and textual data for\ntasks such as image captioning and visual question answering. Proper\nuncertainty calibration is crucial, yet challenging, for reliable use in areas\nlike healthcare and autonomous driving. This paper investigates representative\nMLLMs, focusing on their calibration across various scenarios, including before\nand after visual fine-tuning, as well as before and after multimodal training\nof the base LLMs. We observed miscalibration in their performance, and at the\nsame time, no significant differences in calibration across these scenarios. We\nalso highlight how uncertainty differs between text and images and how their\nintegration affects overall uncertainty. To better understand MLLMs'\nmiscalibration and their ability to self-assess uncertainty, we construct the\nIDK (I don't know) dataset, which is key to evaluating how they handle\nunknowns. Our findings reveal that MLLMs tend to give answers rather than admit\nuncertainty, but this self-assessment improves with proper prompt adjustments.\nFinally, to calibrate MLLMs and enhance model reliability, we propose\ntechniques such as temperature scaling and iterative prompt optimization. Our\nresults provide insights into improving MLLMs for effective and responsible\ndeployment in multimodal applications. Code and IDK dataset:\n\\href{https://github.com/hfutml/Calibration-MLLM}{https://github.com/hfutml/Calibration-MLLM}.\n","authors":["Zijun Chen","Wenbo Hu","Guande He","Zhijie Deng","Zheng Zhang","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2412.14660v1.pdf","comment":"Accepted to COLING 2025"},{"id":"http://arxiv.org/abs/2403.15031v2","updated":"2024-12-19T09:00:34Z","published":"2024-03-22T08:26:31Z","title":"Image Classification with Rotation-Invariant Variational Quantum\n Circuits","summary":" Variational quantum algorithms are gaining attention as an early application\nof Noisy Intermediate-Scale Quantum (NISQ) devices. One of the main problems of\nvariational methods lies in the phenomenon of Barren Plateaus, present in the\noptimization of variational parameters. 
Adding geometric inductive bias to the\nquantum models has been proposed as a potential solution to mitigate this\nproblem, leading to a new field called Geometric Quantum Machine Learning. In\nthis work, an equivariant architecture for variational quantum classifiers is\nintroduced to create a label-invariant model for image classification with\n$C_4$ rotational label symmetry. The equivariant circuit is benchmarked against\ntwo different architectures, and it is experimentally observed that the\ngeometric approach boosts the model's performance. Finally, a classical\nequivariant convolution operation is proposed to extend the quantum model for\nthe processing of larger images, employing the resources available in NISQ\ndevices.\n","authors":["Paul San Sebastian","Mikel Cañizo","Román Orús"],"pdf_url":"https://arxiv.org/pdf/2403.15031v2.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2406.00143v2","updated":"2024-12-19T08:58:15Z","published":"2024-05-31T19:13:09Z","title":"Diversifying Query: Region-Guided Transformer for Temporal Sentence\n Grounding","summary":" Temporal sentence grounding is a challenging task that aims to localize the\nmoment spans relevant to a language description. Although recent DETR-based\nmodels have achieved notable progress by leveraging multiple learnable moment\nqueries, they suffer from overlapped and redundant proposals, leading to\ninaccurate predictions. We attribute this limitation to the lack of\ntask-related guidance for the learnable queries to serve a specific mode.\nFurthermore, the complex solution space generated by variable and\nopen-vocabulary language descriptions complicates optimization, making it\nharder for learnable queries to distinguish each other adaptively. To tackle\nthis limitation, we present a Region-Guided TRansformer (RGTR) for temporal\nsentence grounding, which diversifies moment queries to eliminate overlapped\nand redundant predictions. Instead of using learnable queries, RGTR adopts a\nset of anchor pairs as moment queries to introduce explicit regional guidance.\nEach anchor pair takes charge of moment prediction for a specific temporal\nregion, which reduces the optimization difficulty and ensures the diversity of\nthe final predictions. In addition, we design an IoU-aware scoring head to\nimprove proposal quality. Extensive experiments demonstrate the effectiveness\nof RGTR, outperforming state-of-the-art methods on QVHighlights, Charades-STA\nand TACoS datasets. Codes are available at https://github.com/TensorsSun/RGTR\n","authors":["Xiaolong Sun","Liushuai Shi","Le Wang","Sanping Zhou","Kun Xia","Yabing Wang","Gang Hua"],"pdf_url":"https://arxiv.org/pdf/2406.00143v2.pdf","comment":"Accepted by AAAI-25. Code is available at\n https://github.com/TensorsSun/RGTR"},{"id":"http://arxiv.org/abs/2412.11953v2","updated":"2024-12-19T08:52:56Z","published":"2024-12-16T16:37:03Z","title":"Reliable Breast Cancer Molecular Subtype Prediction based on\n uncertainty-aware Bayesian Deep Learning by Mammography","summary":" Breast cancer is a heterogeneous disease with different molecular subtypes,\nclinical behavior, treatment responses as well as survival outcomes. The\ndevelopment of a reliable, accurate, available and inexpensive method to\npredict the molecular subtypes using medical images plays an important role in\nthe diagnosis and prognosis of breast cancer. Recently, deep learning methods\nhave shown good performance in the breast cancer classification tasks using\nvarious medical images. 
Despite all that success, classical deep learning\ncannot deliver the predictive uncertainty. The uncertainty represents the\nvalidity of the predictions. Therefore, a high predicted uncertainty might\nnegatively affect the accurate diagnosis of breast cancer molecular\nsubtypes. To overcome this, uncertainty quantification methods are used to\ndetermine the predictive uncertainty. Accordingly, in this study, we proposed\nan uncertainty-aware Bayesian deep learning model using the full mammogram\nimages. In addition, to increase the performance of the multi-class molecular\nsubtype classification task, we proposed a novel hierarchical classification\nstrategy, named the two-stage classification strategy. The separate AUC of the\nproposed model for each subtype was 0.71, 0.75 and 0.86 for HER2-enriched,\nluminal and triple-negative classes, respectively. The proposed model not only\nhas a comparable performance to other studies in the field of breast cancer\nmolecular subtypes prediction, even using full mammography images, but it is\nalso more reliable, because it quantifies the predictive uncertainty.\n","authors":["Mohaddeseh Chegini","Ali Mahloojifar"],"pdf_url":"https://arxiv.org/pdf/2412.11953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14643v1","updated":"2024-12-19T08:51:57Z","published":"2024-12-19T08:51:57Z","title":"RefHCM: A Unified Model for Referring Perceptions in Human-Centric\n Scenarios","summary":" Human-centric perceptions play a crucial role in real-world applications.\nWhile recent human-centric works have achieved impressive progress, these\nefforts are often constrained to the visual domain and lack interaction with\nhuman instructions, limiting their applicability in broader scenarios such as\nchatbots and sports analysis. This paper introduces Referring Human\nPerceptions, where a referring prompt specifies the person of interest in an\nimage. To tackle the new task, we propose RefHCM (Referring Human-Centric\nModel), a unified framework to integrate a wide range of human-centric\nreferring tasks. Specifically, RefHCM employs sequence mergers to convert raw\nmultimodal data -- including images, text, coordinates, and parsing maps --\ninto semantic tokens. This standardized representation enables RefHCM to\nreformulate diverse human-centric referring tasks into a sequence-to-sequence\nparadigm, solved using a plain encoder-decoder transformer architecture.\nBenefiting from a unified learning strategy, RefHCM effectively facilitates\nknowledge transfer across tasks and exhibits unforeseen capabilities in\nhandling complex reasoning. This work represents the first attempt to address\nreferring human perceptions with a general-purpose framework, while\nsimultaneously establishing a corresponding benchmark that sets new standards\nfor the field. Extensive experiments showcase RefHCM's competitive and even\nsuperior performance across multiple human-centric referring tasks. 
The code\nand data are publicly available at https://github.com/JJJYmmm/RefHCM.\n","authors":["Jie Huang","Ruibing Hou","Jiahe Zhao","Hong Chang","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2412.14643v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2412.14640v1","updated":"2024-12-19T08:51:01Z","published":"2024-12-19T08:51:01Z","title":"Adaptive Prompt Tuning: Vision Guided Prompt Tuning with Cross-Attention\n for Fine-Grained Few-Shot Learning","summary":" Few-shot, fine-grained classification in computer vision poses significant\nchallenges due to the need to differentiate subtle class distinctions with\nlimited data. This paper presents a novel method that enhances the Contrastive\nLanguage-Image Pre-Training (CLIP) model through adaptive prompt tuning, guided\nby real-time visual inputs. Unlike existing techniques such as Context\nOptimization (CoOp) and Visual Prompt Tuning (VPT), which are constrained by\nstatic prompts or visual token reliance, the proposed approach leverages a\ncross-attention mechanism to dynamically refine text prompts for the image at\nhand. This enables an image-specific alignment of textual features with image\npatches extracted from the Vision Transformer, making the model more effective\nfor datasets with high intra-class variance and low inter-class differences.\nThe method is evaluated on several datasets, including CUBirds, Oxford Flowers,\nand FGVC Aircraft, showing significant performance gains over static prompt\ntuning approaches. To ensure these performance gains translate into trustworthy\npredictions, we integrate Monte-Carlo Dropout in our approach to improve the\nreliability of the model predictions and uncertainty estimates. This\nintegration provides valuable insights into the model's predictive confidence,\nhelping to identify when predictions can be trusted and when additional\nverification is necessary. This dynamic approach offers a robust solution,\nadvancing the state-of-the-art for few-shot fine-grained classification.\n","authors":["Eric Brouwer","Jan Erik van Woerden","Gertjan Burghouts","Matias Valedenegro-Toro","Marco Zullich"],"pdf_url":"https://arxiv.org/pdf/2412.14640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18459v3","updated":"2024-12-19T08:47:07Z","published":"2024-04-29T06:35:34Z","title":"Chameleon: A Data-Efficient Generalist for Dense Visual Prediction in\n the Wild","summary":" Large language models have evolved into data-efficient generalists, benefiting\nfrom the universal language interface and large-scale pre-training. However,\nconstructing a data-efficient generalist for dense visual prediction presents a\ndistinct challenge due to the variation in label structures across different\ntasks. Consequently, generalization to unseen dense prediction tasks in the\nlow-data regime is not straightforward and has received less attention from\nprevious vision generalists. In this study, we explore a universal model that\ncan flexibly adapt to unseen dense label structures with a few examples,\nenabling it to serve as a data-efficient vision generalist in diverse\nreal-world scenarios. To this end, we base our method on a powerful\nmeta-learning framework and explore several axes to improve its performance and\nversatility for real-world problems, such as flexible adaptation mechanisms and\nscalability. We evaluate our model across a spectrum of unseen real-world\nscenarios where low-shot learning is desirable, including video, 3D, medical,\nbiological, and user-interactive tasks. 
Equipped with a generic architecture\nand an effective adaptation mechanism, our model flexibly adapts to all of\nthese tasks with at most 50 labeled images, showcasing a significant\nadvancement over existing data-efficient generalist approaches. Codes are\navailable at https://github.com/GitGyun/chameleon.\n","authors":["Donggyun Kim","Seongwoong Cho","Semin Kim","Chong Luo","Seunghoon Hong"],"pdf_url":"https://arxiv.org/pdf/2404.18459v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12974v3","updated":"2024-12-19T08:41:19Z","published":"2024-12-17T14:56:59Z","title":"Attentive Eraser: Unleashing Diffusion Model's Object Removal Potential\n via Self-Attention Redirection Guidance","summary":" Recently, diffusion models have emerged as promising newcomers in the field\nof generative models, shining brightly in image generation. However, when\nemployed for object removal tasks, they still encounter issues such as\ngenerating random artifacts and the incapacity to repaint foreground object\nareas with appropriate content after removal. To tackle these problems, we\npropose Attentive Eraser, a tuning-free method to empower pre-trained diffusion\nmodels for stable and effective object removal. Firstly, in light of the\nobservation that the self-attention maps influence the structure and shape\ndetails of the generated images, we propose Attention Activation and\nSuppression (ASS), which re-engineers the self-attention mechanism within the\npre-trained diffusion models based on the given mask, thereby prioritizing the\nbackground over the foreground object during the reverse generation process.\nMoreover, we introduce Self-Attention Redirection Guidance (SARG), which\nutilizes the self-attention redirected by ASS to guide the generation process,\neffectively removing foreground objects within the mask while simultaneously\ngenerating content that is both plausible and coherent. Experiments demonstrate\nthe stability and effectiveness of Attentive Eraser in object removal across a\nvariety of pre-trained diffusion models, outperforming even training-based\nmethods. Furthermore, Attentive Eraser can be implemented in various diffusion\nmodel architectures and checkpoints, enabling excellent scalability. Code is\navailable at https://github.com/Anonym0u3/AttentiveEraser.\n","authors":["Wenhao Sun","Benlei Cui","Xue-Mei Dong","Jingqun Tang"],"pdf_url":"https://arxiv.org/pdf/2412.12974v3.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14633v1","updated":"2024-12-19T08:38:59Z","published":"2024-12-19T08:38:59Z","title":"Progressive Fine-to-Coarse Reconstruction for Accurate Low-Bit\n Post-Training Quantization in Vision Transformers","summary":" Due to its efficiency, Post-Training Quantization (PTQ) has been widely\nadopted for compressing Vision Transformers (ViTs). However, when quantized\ninto low-bit representations, there is often a significant performance drop\ncompared to their full-precision counterparts. To address this issue,\nreconstruction methods have been incorporated into the PTQ framework to improve\nperformance in low-bit quantization settings. Nevertheless, existing related\nmethods predefine the reconstruction granularity and seldom explore the\nprogressive relationships between different reconstruction granularities, which\nleads to sub-optimal quantization results in ViTs. 
To this end, in this paper,\nwe propose a Progressive Fine-to-Coarse Reconstruction (PFCR) method for\naccurate PTQ, which significantly improves the performance of low-bit quantized\nvision transformers. Specifically, we define multi-head self-attention and\nmulti-layer perceptron modules along with their shortcuts as the finest\nreconstruction units. After reconstructing these two fine-grained units, we\ncombine them to form coarser blocks and reconstruct them at a coarser\ngranularity level. We iteratively perform this combination and reconstruction\nprocess, achieving progressive fine-to-coarse reconstruction. Additionally, we\nintroduce a Progressive Optimization Strategy (POS) for PFCR to alleviate the\ndifficulty of training, thereby further enhancing model performance.\nExperimental results on the ImageNet dataset demonstrate that our proposed\nmethod achieves the best Top-1 accuracy among state-of-the-art methods,\nparticularly attaining 75.61% for 3-bit quantized ViT-B in PTQ. Besides,\nquantization results on the COCO dataset reveal the effectiveness and\ngeneralization of our proposed method on other computer vision tasks like\nobject detection and instance segmentation.\n","authors":["Rui Ding","Liang Yong","Sihuan Zhao","Jing Nie","Lihui Chen","Haijun Liu","Xichuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.14633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14631v1","updated":"2024-12-19T08:36:32Z","published":"2024-12-19T08:36:32Z","title":"Review of Fruit Tree Image Segmentation","summary":" Fruit tree image segmentation is an essential problem in automating a variety\nof agricultural tasks such as phenotyping, harvesting, spraying, and pruning.\nMany research papers have proposed a diverse spectrum of solutions suitable to\nspecific tasks and environments. The review scope of this paper is confined to\nthe front views of fruit trees and based on 158 relevant papers collected using\na newly designed crawling review method. These papers are systematically\nreviewed based on a taxonomy that sequentially considers the method, image,\ntask, and fruit. This taxonomy will assist readers to intuitively grasp the big\npicture of these research activities. Our review reveals that the most\nnoticeable deficiency of the previous studies was the lack of a versatile\ndataset and segmentation model that could be applied to a variety of tasks and\nenvironments. Six important future research tasks are suggested, with the\nexpectation that these will pave the way to building a versatile tree\nsegmentation module.\n","authors":["Il-Seok Oh"],"pdf_url":"https://arxiv.org/pdf/2412.14631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14630v1","updated":"2024-12-19T08:33:33Z","published":"2024-12-19T08:33:33Z","title":"Unified Image Restoration and Enhancement: Degradation Calibrated Cycle\n Reconstruction Diffusion Model","summary":" Image restoration and enhancement are pivotal for numerous computer vision\napplications, yet unifying these tasks efficiently remains a significant\nchallenge. Inspired by the iterative refinement capabilities of diffusion\nmodels, we propose CycleRDM, a novel framework designed to unify restoration\nand enhancement tasks while achieving high-quality mapping. Specifically,\nCycleRDM first learns the mapping relationships among the degraded domain, the\nrough normal domain, and the normal domain through a two-stage diffusion\ninference process. 
Subsequently, we transfer the final calibration process to\nthe wavelet low-frequency domain using discrete wavelet transform, performing\nfine-grained calibration from a frequency domain perspective by leveraging\ntask-specific frequency spaces. To improve restoration quality, we design a\nfeature gain module for the decomposed wavelet high-frequency domain to\neliminate redundant features. Additionally, we employ multimodal textual\nprompts and Fourier transform to drive stable denoising and reduce randomness\nduring the inference process. After extensive validation, CycleRDM can be\neffectively generalized to a wide range of image restoration and enhancement\ntasks while requiring only a small number of training samples to be\nsignificantly superior on various benchmarks of reconstruction quality and\nperceptual quality. The source code will be available at\nhttps://github.com/hejh8/CycleRDM.\n","authors":["Minglong Xue","Jinhong He","Shivakumara Palaiahnakote","Mingliang Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.14630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11216v2","updated":"2024-12-19T08:32:20Z","published":"2024-12-15T15:13:14Z","title":"Distribution-Consistency-Guided Multi-modal Hashing","summary":" Multi-modal hashing methods have gained popularity due to their fast speed\nand low storage requirements. Among them, the supervised methods demonstrate\nbetter performance by utilizing labels as supervisory signals compared with\nunsupervised methods. Currently, for almost all supervised multi-modal hashing\nmethods, there is a hidden assumption that training sets have no noisy labels.\nHowever, labels are often annotated incorrectly due to manual labeling in\nreal-world scenarios, which will greatly harm the retrieval performance. To\naddress this issue, we first discover a significant distribution consistency\npattern through experiments, i.e., the 1-0 distribution of the presence or\nabsence of each category in the label is consistent with the high-low\ndistribution of similarity scores of the hash codes relative to category\ncenters. Then, inspired by this pattern, we propose a novel\nDistribution-Consistency-Guided Multi-modal Hashing (DCGMH), which aims to\nfilter and reconstruct noisy labels to enhance retrieval performance.\nSpecifically, the proposed method first randomly initializes several category\ncenters, which are used to compute the high-low distribution of similarity\nscores; Noisy and clean labels are then separately filtered out via the\ndiscovered distribution consistency pattern to mitigate the impact of noisy\nlabels; Subsequently, a correction strategy, which is indirectly designed via\nthe distribution consistency pattern, is applied to the filtered noisy labels,\ncorrecting high-confidence ones while treating low-confidence ones as unlabeled\nfor unsupervised learning, thereby further enhancing the model's performance.\nExtensive experiments on three widely used datasets demonstrate the superiority\nof the proposed method compared to state-of-the-art baselines in multi-modal\nretrieval tasks. 
The code is available at\nhttps://github.com/LiuJinyu1229/DCGMH.\n","authors":["Jin-Yu Liu","Xian-Ling Mao","Tian-Yi Che","Rong-Cheng Tu"],"pdf_url":"https://arxiv.org/pdf/2412.11216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14629v1","updated":"2024-12-19T08:31:42Z","published":"2024-12-19T08:31:42Z","title":"Robust PCA Based on Adaptive Weighted Least Squares and Low-Rank Matrix\n Factorization","summary":" Robust Principal Component Analysis (RPCA) is a fundamental technique for\ndecomposing data into low-rank and sparse components, which plays a critical\nrole for applications such as image processing and anomaly detection.\nTraditional RPCA methods commonly use $\\ell_1$ norm regularization to enforce\nsparsity, but this approach can introduce bias and result in suboptimal\nestimates, particularly in the presence of significant noise or outliers.\nNon-convex regularization methods have been proposed to mitigate these\nchallenges, but they tend to be complex to optimize and sensitive to initial\nconditions, leading to potential instability in solutions. To overcome these\nchallenges, in this paper, we propose a novel RPCA model that integrates\nadaptive weighted least squares (AWLS) and low-rank matrix factorization\n(LRMF). The model employs a {self-attention-inspired} mechanism in its weight\nupdate process, allowing the weight matrix to dynamically adjust and emphasize\nsignificant components during each iteration. By employing a weighted F-norm\nfor the sparse component, our method effectively reduces bias while simplifying\nthe computational process compared to traditional $\\ell_1$-norm-based methods.\nWe use an alternating minimization algorithm, where each subproblem has an\nexplicit solution, thereby improving computational efficiency. Despite its\nsimplicity, numerical experiments demonstrate that our method outperforms\nexisting non-convex regularization approaches, offering superior performance\nand stability, as well as enhanced accuracy and robustness in practical\napplications.\n","authors":["Kexin Li","You-wei Wen","Xu Xiao","Mingchao Zhao"],"pdf_url":"https://arxiv.org/pdf/2412.14629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14628v1","updated":"2024-12-19T08:30:54Z","published":"2024-12-19T08:30:54Z","title":"Qua$^2$SeDiMo: Quantifiable Quantization Sensitivity of Diffusion Models","summary":" Diffusion Models (DM) have democratized AI image generation through an\niterative denoising process. Quantization is a major technique to alleviate the\ninference cost and reduce the size of DM denoiser networks. However, as\ndenoisers evolve from variants of convolutional U-Nets toward newer Transformer\narchitectures, it is of growing importance to understand the quantization\nsensitivity of different weight layers, operations and architecture types to\nperformance. In this work, we address this challenge with Qua$^2$SeDiMo, a\nmixed-precision Post-Training Quantization framework that generates explainable\ninsights on the cost-effectiveness of various model weight quantization methods\nfor different denoiser operation types and block structures. We leverage these\ninsights to make high-quality mixed-precision quantization decisions for a\nmyriad of diffusion models ranging from foundational U-Nets to state-of-the-art\nTransformers. As a result, Qua$^2$SeDiMo can construct 3.4-bit, 3.9-bit,\n3.65-bit and 3.7-bit weight quantization on PixArt-${\\alpha}$,\nPixArt-${\\Sigma}$, Hunyuan-DiT and SDXL, respectively. 
We further pair our\nweight-quantization configurations with 6-bit activation quantization and\noutperform existing approaches in terms of quantitative metrics and generative\nimage quality.\n","authors":["Keith G. Mills","Mohammad Salameh","Ruichen Chen","Negar Hassanpour","Wei Lu","Di Niu"],"pdf_url":"https://arxiv.org/pdf/2412.14628v1.pdf","comment":"AAAI 2025; version includes supplementary material; 22 Pages, 18\n Figures, 8 Tables"},{"id":"http://arxiv.org/abs/2412.14623v1","updated":"2024-12-19T08:21:28Z","published":"2024-12-19T08:21:28Z","title":"FRIDAY: Mitigating Unintentional Facial Identity in Deepfake Detectors\n Guided by Facial Recognizers","summary":" Previous Deepfake detection methods perform well within their training\ndomains, but their effectiveness diminishes significantly with new synthesis\ntechniques. Recent studies have revealed that detection models often create\ndecision boundaries based on facial identity rather than synthetic artifacts,\nresulting in poor performance on cross-domain datasets. To address this\nlimitation, we propose Facial Recognition Identity Attenuation (FRIDAY), a\nnovel training method that mitigates facial identity influence using a face\nrecognizer. Specifically, we first train a face recognizer using the same\nbackbone as the Deepfake detector. The recognizer is then frozen and employed\nduring the detector's training to reduce facial identity information. This is\nachieved by feeding input images into both the recognizer and the detector, and\nminimizing the similarity of their feature embeddings through our Facial\nIdentity Attenuating loss. This process encourages the detector to generate\nembeddings distinct from the recognizer, effectively reducing the impact of\nfacial identity. Extensive experiments demonstrate that our approach\nsignificantly enhances detection performance on both in-domain and cross-domain\ndatasets.\n","authors":["Younhun Kim","Myung-Joon Kwon","Wonjun Lee","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2412.14623v1.pdf","comment":"5 pages, 4 figures. In 2024 IEEE International Conference on Visual\n Communications and Image Processing (VCIP) Oral"},{"id":"http://arxiv.org/abs/2409.17671v3","updated":"2024-12-19T08:19:41Z","published":"2024-09-26T09:30:37Z","title":"Leveraging Anthropometric Measurements to Improve Human Mesh Estimation\n and Ensure Consistent Body Shapes","summary":" The basic body shape (i.e., the body shape in T-pose) of a person does not\nchange within a single video. However, most SOTA human mesh estimation (HME)\nmodels output a slightly different, thus inconsistent basic body shape for each\nvideo frame. Furthermore, we find that SOTA 3D human pose estimation (HPE)\nmodels outperform HME models regarding the precision of the estimated 3D\nkeypoint positions. We solve the problem of inconsistent body shapes by\nleveraging anthropometric measurements like taken by tailors from humans. We\ncreate a model called A2B that converts given anthropometric measurements to\nbasic body shape parameters of human mesh models. We obtain superior and\nconsistent human meshes by combining the A2B model results with the keypoints\nof 3D HPE models using inverse kinematics. We evaluate our approach on\nchallenging datasets like ASPset or fit3D, where we can lower the MPJPE by over\n30 mm compared to SOTA HME models. 
Further, replacing estimates of the body\nshape parameters from existing HME models with A2B results not only increases\nthe performance of these HME models, but also guarantees consistent body\nshapes.\n","authors":["Katja Ludwig","Julian Lorenz","Daniel Kienzle","Tuan Bui","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2409.17671v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14619v1","updated":"2024-12-19T08:11:42Z","published":"2024-12-19T08:11:42Z","title":"Pitfalls of topology-aware image segmentation","summary":" Topological correctness, i.e., the preservation of structural integrity and\nspecific characteristics of shape, is a fundamental requirement for medical\nimaging tasks, such as neuron or vessel segmentation. Despite the recent surge\nin topology-aware methods addressing this challenge, their real-world\napplicability is hindered by flawed benchmarking practices. In this paper, we\nidentify critical pitfalls in model evaluation that include inadequate\nconnectivity choices, overlooked topological artifacts in ground truth\nannotations, and inappropriate use of evaluation metrics. Through detailed\nempirical analysis, we uncover these issues' profound impact on the evaluation\nand ranking of segmentation methods. Drawing from our findings, we propose a\nset of actionable recommendations to establish fair and robust evaluation\nstandards for topology-aware medical image segmentation methods.\n","authors":["Alexander H. Berger","Laurin Lux","Alexander Weers","Martin Menten","Daniel Rueckert","Johannes C. Paetzold"],"pdf_url":"https://arxiv.org/pdf/2412.14619v1.pdf","comment":"Code is available at\n https://github.com/AlexanderHBerger/topo-pitfalls"},{"id":"http://arxiv.org/abs/2412.14613v1","updated":"2024-12-19T08:03:16Z","published":"2024-12-19T08:03:16Z","title":"HarmonicEval: Multi-modal, Multi-task, Multi-criteria Automatic\n Evaluation Using a Vision Language Model","summary":" Vision-language models (VLMs) have shown impressive abilities in text and\nimage understanding. However, existing metrics for evaluating the text\ngenerated by VLMs focus exclusively on overall quality, leading to two\nlimitations: 1) it is challenging to identify which aspects of the text need\nimprovement from the overall score; 2) metrics may overlook specific evaluation\ncriteria when predicting an overall score. To address these limitations, we\npropose HarmonicEval, a reference-free evaluation metric that aggregates\ncriterion-wise scores to produce the overall score in a bottom-up manner.\nFurthermore, we construct the Multi-task Multi-criteria Human Evaluation (MMHE)\ndataset, which comprises 18,000 expert human judgments across four\nvision-language tasks. Our experiments demonstrate that HarmonicEval achieves\nhigher correlations with human judgments than conventional metrics while\nproviding numerical scores for each criterion.\n","authors":["Masanari Ohi","Masahiro Kaneko","Naoaki Okazaki","Nakamasa Inoue"],"pdf_url":"https://arxiv.org/pdf/2412.14613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19101v4","updated":"2024-12-19T08:00:44Z","published":"2024-06-27T11:28:36Z","title":"DocKylin: A Large Multimodal Model for Visual Document Understanding\n with Efficient Visual Slimming","summary":" Current multimodal large language models (MLLMs) face significant challenges\nin visual document understanding (VDU) tasks due to the high resolution, dense\ntext, and complex layouts typical of document images. 
These characteristics\ndemand a high level of detail perception ability from MLLMs. While increasing\ninput resolution improves detail perception capability, it also leads to longer\nsequences of visual tokens, increasing computational costs and straining the\nmodels' ability to handle long contexts. To address these challenges, we\nintroduce DocKylin, a document-centric MLLM that performs visual content\nslimming at both the pixel and token levels, thereby reducing token sequence\nlength in VDU scenarios. We introduce an Adaptive Pixel Slimming (APS)\npreprocessing module to perform pixel-level slimming, increasing the proportion\nof informative pixels. Moreover, we propose a novel Dynamic Token Slimming\n(DTS) module to conduct token-level slimming, filtering essential tokens and\nremoving others to adaptively create a more compact visual sequence.\nExperiments demonstrate DocKylin's promising performance across various VDU\nbenchmarks and the effectiveness of each component.\n","authors":["Jiaxin Zhang","Wentao Yang","Songxuan Lai","Zecheng Xie","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2406.19101v4.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14603v1","updated":"2024-12-19T07:49:40Z","published":"2024-12-19T07:49:40Z","title":"Successive optimization of optics and post-processing with\n differentiable coherent PSF operator and field information","summary":" Recently, the joint design of optical systems and downstream algorithms is\nshowing significant potential. However, existing rays-described methods are\nlimited to optimizing geometric degradation, making it difficult to fully\nrepresent the optical characteristics of complex, miniaturized lenses\nconstrained by wavefront aberration or diffraction effects. In this work, we\nintroduce a precise optical simulation model, and every operation in pipeline\nis differentiable. This model employs a novel initial value strategy to enhance\nthe reliability of intersection calculation on high aspherics. Moreover, it\nutilizes a differential operator to reduce memory consumption during coherent\npoint spread function calculations. To efficiently address various degradation,\nwe design a joint optimization procedure that leverages field information.\nGuided by a general restoration network, the proposed method not only enhances\nthe image quality, but also successively improves the optical performance\nacross multiple lenses that are already in professional level. This joint\noptimization pipeline offers innovative insights into the practical design of\nsophisticated optical systems and post-processing algorithms. The source code\nwill be made publicly available at\nhttps://github.com/Zrr-ZJU/Successive-optimization\n","authors":["Zheng Ren","Jingwen Zhou","Wenguan Zhang","Jiapu Yan","Bingkun Chen","Huajun Feng","Shiqi Chen"],"pdf_url":"https://arxiv.org/pdf/2412.14603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14598v1","updated":"2024-12-19T07:39:06Z","published":"2024-12-19T07:39:06Z","title":"Can We Get Rid of Handcrafted Feature Extractors? SparseViT:\n Nonsemantics-Centered, Parameter-Efficient Image Manipulation Localization\n Through Spare-Coding Transformer","summary":" Non-semantic features or semantic-agnostic features, which are irrelevant to\nimage context but sensitive to image manipulations, are recognized as\nevidential to Image Manipulation Localization (IML). Since manual labels are\nimpossible, existing works rely on handcrafted methods to extract non-semantic\nfeatures. 
Handcrafted non-semantic features jeopardize IML model's\ngeneralization ability in unseen or complex scenarios. Therefore, for IML, the\nelephant in the room is: How to adaptively extract non-semantic features?\nNon-semantic features are context-irrelevant and manipulation-sensitive. That\nis, within an image, they are consistent across patches unless manipulation\noccurs. Then, spare and discrete interactions among image patches are\nsufficient for extracting non-semantic features. However, image semantics vary\ndrastically on different patches, requiring dense and continuous interactions\namong image patches for learning semantic representations. Hence, in this\npaper, we propose a Sparse Vision Transformer (SparseViT), which reformulates\nthe dense, global self-attention in ViT into a sparse, discrete manner. Such\nsparse self-attention breaks image semantics and forces SparseViT to adaptively\nextract non-semantic features for images. Besides, compared with existing IML\nmodels, the sparse self-attention mechanism largely reduced the model size (max\n80% in FLOPs), achieving stunning parameter efficiency and computation\nreduction. Extensive experiments demonstrate that, without any handcrafted\nfeature extractors, SparseViT is superior in both generalization and efficiency\nacross benchmark datasets.\n","authors":["Lei Su","Xiaochen Ma","Xuekang Zhu","Chaoqun Niu","Zeyu Lei","Ji-Zhe Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.14598v1.pdf","comment":"12 page, 8 figures, published to AAAI"},{"id":"http://arxiv.org/abs/2412.14596v1","updated":"2024-12-19T07:31:40Z","published":"2024-12-19T07:31:40Z","title":"LDP: Generalizing to Multilingual Visual Information Extraction by\n Language Decoupled Pretraining","summary":" Visual Information Extraction (VIE) plays a crucial role in the comprehension\nof semi-structured documents, and several pre-trained models have been\ndeveloped to enhance performance. However, most of these works are monolingual\n(usually English). Due to the extremely unbalanced quantity and quality of\npre-training corpora between English and other languages, few works can extend\nto non-English scenarios. In this paper, we conduct systematic experiments to\nshow that vision and layout modality hold invariance among images with\ndifferent languages. If decoupling language bias from document images, a\nvision-layout-based model can achieve impressive cross-lingual generalization.\nAccordingly, we present a simple but effective multilingual training paradigm\nLDP (Language Decoupled Pre-training) for better utilization of monolingual\npre-training data. Our proposed model LDM (Language Decoupled Model) is first\npre-trained on the language-independent data, where the language knowledge is\ndecoupled by a diffusion model, and then the LDM is fine-tuned on the\ndownstream languages. Extensive experiments show that the LDM outperformed all\nSOTA multilingual pre-trained models, and also maintains competitiveness on\ndownstream monolingual/English benchmarks.\n","authors":["Huawen Shen","Gengluo Li","Jinwen Zhong","Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.14596v1.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2412.14592v1","updated":"2024-12-19T07:23:17Z","published":"2024-12-19T07:23:17Z","title":"Multi-Sensor Object Anomaly Detection: Unifying Appearance, Geometry,\n and Internal Properties","summary":" Object anomaly detection is essential for industrial quality inspection, yet\ntraditional single-sensor methods face critical limitations. 
They fail to\ncapture the wide range of anomaly types, as single sensors are often\nconstrained to either external appearance, geometric structure, or internal\nproperties. To overcome these challenges, we introduce MulSen-AD, the first\nhigh-resolution, multi-sensor anomaly detection dataset tailored for industrial\napplications. MulSen-AD unifies data from RGB cameras, laser scanners, and\nlock-in infrared thermography, effectively capturing external appearance,\ngeometric deformations, and internal defects. The dataset spans 15 industrial\nproducts with diverse, real-world anomalies. We also present MulSen-AD Bench, a\nbenchmark designed to evaluate multi-sensor methods, and propose\nMulSen-TripleAD, a decision-level fusion algorithm that integrates these three\nmodalities for robust, unsupervised object anomaly detection. Our experiments\ndemonstrate that multi-sensor fusion substantially outperforms single-sensor\napproaches, achieving 96.1% AUROC in object-level detection accuracy. These\nresults highlight the importance of integrating multi-sensor data for\ncomprehensive industrial anomaly detection.\n","authors":["Wenqiao Li","Bozhong Zheng","Xiaohao Xu","Jinye Gan","Fading Lu","Xiang Li","Na Ni","Zheng Tian","Xiaonan Huang","Shenghua Gao","Yingna Wu"],"pdf_url":"https://arxiv.org/pdf/2412.14592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09583v4","updated":"2024-12-19T07:21:43Z","published":"2024-10-12T16:28:40Z","title":"POPoS: Improving Efficient and Robust Facial Landmark Detection with\n Parallel Optimal Position Search","summary":" Achieving a balance between accuracy and efficiency is a critical challenge\nin facial landmark detection (FLD). This paper introduces Parallel Optimal\nPosition Search (POPoS), a high-precision encoding-decoding framework designed\nto address the limitations of traditional FLD methods. POPoS employs three key\ncontributions: (1) Pseudo-range multilateration is utilized to correct heatmap\nerrors, improving landmark localization accuracy. By integrating multiple\nanchor points, it reduces the impact of individual heatmap inaccuracies,\nleading to robust overall positioning. (2) To enhance the pseudo-range accuracy\nof selected anchor points, a new loss function, named multilateration anchor\nloss, is proposed. This loss function enhances the accuracy of the distance\nmap, mitigates the risk of local optima, and ensures optimal solutions. (3) A\nsingle-step parallel computation algorithm is introduced, boosting\ncomputational efficiency and reducing processing time. Extensive evaluations\nacross five benchmark datasets demonstrate that POPoS consistently outperforms\nexisting methods, particularly excelling in low-resolution heatmaps scenarios\nwith minimal computational overhead. These advantages make POPoS a highly\nefficient and accurate tool for FLD, with broad applicability in real-world\nscenarios.\n","authors":["Chong-Yang Xiang","Jun-Yan He","Zhi-Qi Cheng","Xiao Wu","Xian-Sheng Hua"],"pdf_url":"https://arxiv.org/pdf/2410.09583v4.pdf","comment":"Accepted to AAAI 2025, 9 pages, 6 figures. Code:\n https://github.com/teslatasy/POPoS"},{"id":"http://arxiv.org/abs/2410.20815v2","updated":"2024-12-19T07:19:52Z","published":"2024-10-28T08:02:34Z","title":"Grid4D: 4D Decomposed Hash Encoding for High-fidelity Dynamic Gaussian\n Splatting","summary":" Recently, Gaussian splatting has received more and more attention in the\nfield of static scene rendering. 
Due to the low computational overhead and\ninherent flexibility of explicit representations, plane-based explicit methods\nare popular ways to predict deformations for Gaussian-based dynamic scene\nrendering models. However, plane-based methods rely on the inappropriate\nlow-rank assumption and excessively decompose the space-time 4D encoding,\nresulting in overmuch feature overlap and unsatisfactory rendering quality. To\ntackle these problems, we propose Grid4D, a dynamic scene rendering model based\non Gaussian splatting and employing a novel explicit encoding method for the 4D\ninput through the hash encoding. Different from plane-based explicit\nrepresentations, we decompose the 4D encoding into one spatial and three\ntemporal 3D hash encodings without the low-rank assumption. Additionally, we\ndesign a novel attention module that generates the attention scores in a\ndirectional range to aggregate the spatial and temporal features. The\ndirectional attention enables Grid4D to more accurately fit the diverse\ndeformations across distinct scene components based on the spatial encoded\nfeatures. Moreover, to mitigate the inherent lack of smoothness in explicit\nrepresentation methods, we introduce a smooth regularization term that keeps\nour model from the chaos of deformation prediction. Our experiments demonstrate\nthat Grid4D significantly outperforms the state-of-the-art models in visual\nquality and rendering speed.\n","authors":["Jiawei Xu","Zexin Fan","Jian Yang","Jin Xie"],"pdf_url":"https://arxiv.org/pdf/2410.20815v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.14587v1","updated":"2024-12-19T07:13:15Z","published":"2024-12-19T07:13:15Z","title":"Spike2Former: Efficient Spiking Transformer for High-performance Image\n Segmentation","summary":" Spiking Neural Networks (SNNs) have a low-power advantage but perform poorly\nin image segmentation tasks. The reason is that directly converting neural\nnetworks with complex architectural designs for segmentation tasks into spiking\nversions leads to performance degradation and non-convergence. To address this\nchallenge, we first identify the modules in the architecture design that lead\nto the severe reduction in spike firing, make targeted improvements, and\npropose Spike2Former architecture. Second, we propose normalized integer\nspiking neurons to solve the training stability problem of SNNs with complex\narchitectures. We set a new state-of-the-art for SNNs in various semantic\nsegmentation datasets, with a significant improvement of +12.7% mIoU and 5.0\nefficiency on ADE20K, +14.3% mIoU and 5.2 efficiency on VOC2012, and +9.1% mIoU\nand 6.6 efficiency on CityScapes.\n","authors":["Zhenxin Lei","Man Yao","Jiakui Hu","Xinhao Luo","Yanye Lu","Bo Xu","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2412.14587v1.pdf","comment":"This work has been accepted on Association for the Advancement of\n Artificial Intelligence 2025"},{"id":"http://arxiv.org/abs/2412.14585v1","updated":"2024-12-19T07:06:25Z","published":"2024-12-19T07:06:25Z","title":"HiCM$^2$: Hierarchical Compact Memory Modeling for Dense Video\n Captioning","summary":" With the growing demand for solutions to real-world video challenges,\ninterest in dense video captioning (DVC) has been on the rise. DVC involves the\nautomatic captioning and localization of untrimmed videos. Several studies\nhighlight the challenges of DVC and introduce improved methods utilizing prior\nknowledge, such as pre-training and external memory. 
In this research, we\npropose a model that leverages the prior knowledge of human-oriented\nhierarchical compact memory inspired by human memory hierarchy and cognition.\nTo mimic human-like memory recall, we construct a hierarchical memory and a\nhierarchical memory reading module. We build an efficient hierarchical compact\nmemory by employing clustering of memory events and summarization using large\nlanguage models. Comparative experiments demonstrate that this hierarchical\nmemory recall process improves the performance of DVC by achieving\nstate-of-the-art performance on YouCook2 and ViTT datasets.\n","authors":["Minkuk Kim","Hyeon Bae Kim","Jinyoung Moon","Jinwoo Choi","Seong Tae Kim"],"pdf_url":"https://arxiv.org/pdf/2412.14585v1.pdf","comment":"AAAI2025"},{"id":"http://arxiv.org/abs/2412.14580v1","updated":"2024-12-19T07:00:03Z","published":"2024-12-19T07:00:03Z","title":"DiffSim: Taming Diffusion Models for Evaluating Visual Similarity","summary":" Diffusion models have fundamentally transformed the field of generative\nmodels, making the assessment of similarity between customized model outputs\nand reference inputs critically important. However, traditional perceptual\nsimilarity metrics operate primarily at the pixel and patch levels, comparing\nlow-level colors and textures but failing to capture mid-level similarities and\ndifferences in image layout, object pose, and semantic content. Contrastive\nlearning-based CLIP and self-supervised learning-based DINO are often used to\nmeasure semantic similarity, but they highly compress image features,\ninadequately assessing appearance details. This paper is the first to discover\nthat pretrained diffusion models can be utilized for measuring visual\nsimilarity and introduces the DiffSim method, addressing the limitations of\ntraditional metrics in capturing perceptual consistency in custom generation\ntasks. By aligning features in the attention layers of the denoising U-Net,\nDiffSim evaluates both appearance and style similarity, showing superior\nalignment with human visual preferences. Additionally, we introduce the Sref\nand IP benchmarks to evaluate visual similarity at the level of style and\ninstance, respectively. Comprehensive evaluations across multiple benchmarks\ndemonstrate that DiffSim achieves state-of-the-art performance, providing a\nrobust tool for measuring visual coherence in generative models.\n","authors":["Yiren Song","Xiaokang Liu","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2412.14580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14579v1","updated":"2024-12-19T06:57:37Z","published":"2024-12-19T06:57:37Z","title":"GSRender: Deduplicated Occupancy Prediction via Weakly Supervised 3D\n Gaussian Splatting","summary":" 3D occupancy perception is gaining increasing attention due to its capability\nto offer detailed and precise environment representations. Previous\nweakly-supervised NeRF methods balance efficiency and accuracy, with mIoU\nvarying by 5-10 points due to sampling count along camera rays. Recently,\nreal-time Gaussian splatting has gained widespread popularity in 3D\nreconstruction, and the occupancy prediction task can also be viewed as a\nreconstruction task. Consequently, we propose GSRender, which naturally employs\n3D Gaussian Splatting for occupancy prediction, simplifying the sampling\nprocess. In addition, the limitations of 2D supervision result in duplicate\npredictions along the same camera ray. 
We implemented the Ray Compensation (RC)\nmodule, which mitigates this issue by compensating for features from adjacent\nframes. Finally, we redesigned the loss to eliminate the impact of dynamic\nobjects from adjacent frames. Extensive experiments demonstrate that our\napproach achieves SOTA (state-of-the-art) results in RayIoU (+6.0), while\nnarrowing the gap with 3D supervision methods. Our code will be released soon.\n","authors":["Qianpu Sun","Changyong Shu","Sifan Zhou","Zichen Yu","Yan Chen","Dawei Yang","Yuan Chun"],"pdf_url":"https://arxiv.org/pdf/2412.14579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14576v1","updated":"2024-12-19T06:52:12Z","published":"2024-12-19T06:52:12Z","title":"Alignment-Free RGB-T Salient Object Detection: A Large-scale Dataset and\n Progressive Correlation Network","summary":" Alignment-free RGB-Thermal (RGB-T) salient object detection (SOD) aims to\nachieve robust performance in complex scenes by directly leveraging the\ncomplementary information from unaligned visible-thermal image pairs, without\nrequiring manual alignment. However, the labor-intensive process of collecting\nand annotating image pairs limits the scale of existing benchmarks, hindering\nthe advancement of alignment-free RGB-T SOD. In this paper, we construct a\nlarge-scale and high-diversity unaligned RGB-T SOD dataset named UVT20K,\ncomprising 20,000 image pairs, 407 scenes, and 1256 object categories. All\nsamples are collected from real-world scenarios with various challenges, such\nas low illumination, image clutter, complex salient objects, and so on. To\nsupport the exploration for further research, each sample in UVT20K is\nannotated with a comprehensive set of ground truths, including saliency masks,\nscribbles, boundaries, and challenge attributes. In addition, we propose a\nProgressive Correlation Network (PCNet), which models inter- and intra-modal\ncorrelations on the basis of explicit alignment to achieve accurate predictions\nin unaligned image pairs. Extensive experiments conducted on unaligned and\naligned datasets demonstrate the effectiveness of our method.Code and dataset\nare available at https://github.com/Angknpng/PCNet.\n","authors":["Kunpeng Wang","Keke Chen","Chenglong Li","Zhengzheng Tu","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2412.14576v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14571v1","updated":"2024-12-19T06:42:25Z","published":"2024-12-19T06:42:25Z","title":"SCKD: Semi-Supervised Cross-Modality Knowledge Distillation for 4D Radar\n Object Detection","summary":" 3D object detection is one of the fundamental perception tasks for autonomous\nvehicles. Fulfilling such a task with a 4D millimeter-wave radar is very\nattractive since the sensor is able to acquire 3D point clouds similar to Lidar\nwhile maintaining robust measurements under adverse weather. However, due to\nthe high sparsity and noise associated with the radar point clouds, the\nperformance of the existing methods is still much lower than expected. In this\npaper, we propose a novel Semi-supervised Cross-modality Knowledge Distillation\n(SCKD) method for 4D radar-based 3D object detection. It characterizes the\ncapability of learning the feature from a Lidar-radar-fused teacher network\nwith semi-supervised distillation. We first propose an adaptive fusion module\nin the teacher network to boost its performance. 
Then, two feature distillation\nmodules are designed to facilitate the cross-modality knowledge transfer.\nFinally, a semi-supervised output distillation is proposed to increase the\neffectiveness and flexibility of the distillation framework. With the same\nnetwork structure, our radar-only student trained by SCKD boosts the mAP by\n10.38% over the baseline and outperforms the state-of-the-art works on the VoD\ndataset. The experiment on ZJUODset also shows 5.12% mAP improvements on the\nmoderate difficulty level over the baseline when extra unlabeled data are\navailable. Code is available at https://github.com/Ruoyu-Xu/SCKD.\n","authors":["Ruoyu Xu","Zhiyu Xiang","Chenwei Zhang","Hanzhi Zhong","Xijun Zhao","Ruina Dang","Peng Xu","Tianyu Pu","Eryun Liu"],"pdf_url":"https://arxiv.org/pdf/2412.14571v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2410.04749v2","updated":"2024-12-19T06:41:40Z","published":"2024-10-07T04:59:08Z","title":"LLaVA Needs More Knowledge: Retrieval Augmented Natural Language\n Generation with Knowledge Graph for Explaining Thoracic Pathologies","summary":" Generating Natural Language Explanations (NLEs) for model predictions on\nmedical images, particularly those depicting thoracic pathologies, remains a\ncritical and challenging task. Existing methodologies often struggle due to\ngeneral models' insufficient domain-specific medical knowledge and privacy\nconcerns associated with retrieval-based augmentation techniques. To address\nthese issues, we propose a novel Vision-Language framework augmented with a\nKnowledge Graph (KG)-based datastore, which enhances the model's understanding\nby incorporating additional domain-specific medical knowledge essential for\ngenerating accurate and informative NLEs. Our framework employs a KG-based\nretrieval mechanism that not only improves the precision of the generated\nexplanations but also preserves data privacy by avoiding direct data retrieval.\nThe KG datastore is designed as a plug-and-play module, allowing for seamless\nintegration with various model architectures. We introduce and evaluate three\ndistinct frameworks within this paradigm: KG-LLaVA, which integrates the\npre-trained LLaVA model with KG-RAG; Med-XPT, a custom framework combining\nMedCLIP, a transformer-based projector, and GPT-2; and Bio-LLaVA, which adapts\nLLaVA by incorporating the Bio-ViT-L vision model. These frameworks are\nvalidated on the MIMIC-NLE dataset, where they achieve state-of-the-art\nresults, underscoring the effectiveness of KG augmentation in generating\nhigh-quality NLEs for thoracic pathologies.\n","authors":["Ameer Hamza"," Abdullah","Yong Hyun Ahn","Sungyoung Lee","Seong Tae Kim"],"pdf_url":"https://arxiv.org/pdf/2410.04749v2.pdf","comment":"AAAI2025"},{"id":"http://arxiv.org/abs/2412.14568v1","updated":"2024-12-19T06:39:28Z","published":"2024-12-19T06:39:28Z","title":"Improving Geometry in Sparse-View 3DGS via Reprojection-based DoF\n Separation","summary":" Recent learning-based Multi-View Stereo models have demonstrated\nstate-of-the-art performance in sparse-view 3D reconstruction. However,\ndirectly applying 3D Gaussian Splatting (3DGS) as a refinement step following\nthese models presents challenges. We hypothesize that the excessive positional\ndegrees of freedom (DoFs) in Gaussians induce geometry distortion, fitting\ncolor patterns at the cost of structural fidelity. 
To address this, we propose\nreprojection-based DoF separation, a method distinguishing positional DoFs in\nterms of uncertainty: image-plane-parallel DoFs and ray-aligned DoF. To\nindependently manage each DoF, we introduce a reprojection process along with\ntailored constraints for each DoF. Through experiments across various datasets,\nwe confirm that separating the positional DoFs of Gaussians and applying\ntargeted constraints effectively suppresses geometric artifacts, producing\nreconstruction results that are both visually and geometrically plausible.\n","authors":["Yongsung Kim","Minjun Park","Jooyoung Choi","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2412.14568v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2412.11530v2","updated":"2024-12-19T06:32:22Z","published":"2024-12-16T08:08:35Z","title":"RoMeO: Robust Metric Visual Odometry","summary":" Visual odometry (VO) aims to estimate camera poses from visual inputs -- a\nfundamental building block for many applications such as VR/AR and robotics.\nThis work focuses on monocular RGB VO where the input is a monocular RGB video\nwithout IMU or 3D sensors. Existing approaches lack robustness under this\nchallenging scenario and fail to generalize to unseen data (especially\noutdoors); they also cannot recover metric-scale poses. We propose Robust\nMetric Visual Odometry (RoMeO), a novel method that resolves these issues\nleveraging priors from pre-trained depth models. RoMeO incorporates both\nmonocular metric depth and multi-view stereo (MVS) models to recover\nmetric-scale, simplify correspondence search, provide better initialization and\nregularize optimization. Effective strategies are proposed to inject noise\nduring training and adaptively filter noisy depth priors, which ensure the\nrobustness of RoMeO on in-the-wild data. As shown in Fig.1, RoMeO advances the\nstate-of-the-art (SOTA) by a large margin across 6 diverse datasets covering\nboth indoor and outdoor scenes. Compared to the current SOTA DPVO, RoMeO\nreduces the relative (align the trajectory scale with GT) and absolute\ntrajectory errors both by >50%. The performance gain also transfers to the full\nSLAM pipeline (with global BA & loop closure). Code will be released upon\nacceptance.\n","authors":["Junda Cheng","Zhipeng Cai","Zhaoxing Zhang","Wei Yin","Matthias Muller","Michael Paulitsch","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2412.11530v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11973v6","updated":"2024-12-19T06:29:38Z","published":"2023-12-19T09:11:49Z","title":"Continual Learning: Forget-free Winning Subnetworks for Video\n Representations","summary":" Inspired by the Lottery Ticket Hypothesis (LTH), which highlights the\nexistence of efficient subnetworks within larger, dense networks, a\nhigh-performing Winning Subnetwork (WSN) in terms of task performance under\nappropriate sparsity conditions is considered for various continual learning\ntasks. It leverages pre-existing weights from dense networks to achieve\nefficient learning in Task Incremental Learning (TIL) and Task-agnostic\nIncremental Learning (TaIL) scenarios. In Few-Shot Class Incremental Learning\n(FSCIL), a variation of WSN referred to as the Soft subnetwork (SoftNet) is\ndesigned to prevent overfitting when the data samples are scarce. Furthermore,\nthe sparse reuse of WSN weights is considered for Video Incremental Learning\n(VIL). The use of Fourier Subneural Operator (FSO) within WSN is considered. 
It\nenables compact encoding of videos and identifies reusable subnetworks across\nvarying bandwidths. We have integrated FSO into different architectural\nframeworks for continual learning, including VIL, TIL, and FSCIL. Our\ncomprehensive experiments demonstrate FSO's effectiveness, significantly\nimproving task performance at various convolutional representational levels.\nSpecifically, FSO enhances higher-layer performance in TIL and FSCIL and\nlower-layer performance in VIL.\n","authors":["Haeyong Kang","Jaehong Yoon","Sung Ju Hwang","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2312.11973v6.pdf","comment":"IEEE Transactions on Pattern Analysis and Machine Intelligence\n (T-PAMI)"},{"id":"http://arxiv.org/abs/2412.14561v1","updated":"2024-12-19T06:26:16Z","published":"2024-12-19T06:26:16Z","title":"GBRIP: Granular Ball Representation for Imbalanced Partial Label\n Learning","summary":" Partial label learning (PLL) is a complicated weakly supervised\nmulti-classification task compounded by class imbalance. Currently, existing\nmethods only rely on inter-class pseudo-labeling from inter-class features,\noften overlooking the significant impact of the intra-class imbalanced features\ncombined with the inter-class. To address these limitations, we introduce\nGranular Ball Representation for Imbalanced PLL (GBRIP), a novel framework for\nimbalanced PLL. GBRIP utilizes coarse-grained granular ball representation and\nmulti-center loss to construct a granular ball-based nfeature space through\nunsupervised learning, effectively capturing the feature distribution within\neach class. GBRIP mitigates the impact of confusing features by systematically\nrefining label disambiguation and estimating imbalance distributions. The novel\nmulti-center loss function enhances learning by emphasizing the relationships\nbetween samples and their respective centers within the granular balls.\nExtensive experiments on standard benchmarks demonstrate that GBRIP outperforms\nexisting state-of-the-art methods, offering a robust solution to the challenges\nof imbalanced PLL.\n","authors":["Jintao Huang","Yiu-ming Cheung","Chi-man Vong","Wenbin Qian"],"pdf_url":"https://arxiv.org/pdf/2412.14561v1.pdf","comment":"AAAI25"},{"id":"http://arxiv.org/abs/2409.01179v3","updated":"2024-12-19T06:26:04Z","published":"2024-09-02T11:19:54Z","title":"Recoverable Compression: A Multimodal Vision Token Recovery Mechanism\n Guided by Text Information","summary":" With the advancement of large-scale language modeling techniques, large\nmultimodal models combining visual encoders with large language models have\ndemonstrated exceptional performance in various visual tasks. Most of the\ncurrent large-scale multimodal models achieve this by mapping visual features\nobtained from the visual encoder into a large language model and using them as\ninputs alongside text for downstream tasks. Therefore, the number of visual\ntokens directly affects the training and inference speed of the model. There\nhas been significant work on token pruning for visual transformers, but for\nlarge multimodal models, only relying on visual information for token pruning\nor compression may lead to significant loss of important information. On the\nother hand, the textual input in the form of a question may contain valuable\ninformation that can aid in answering the question, providing additional\nknowledge to the model. 
To address the potential oversimplification and\nexcessive pruning that can occur with most purely visual token pruning methods,\nwe propose a text information-guided dynamic visual token recovery mechanism\nthat does not require training. This mechanism leverages the similarity between\nthe question text and visual tokens to recover visually meaningful tokens with\nimportant text information while merging other less important tokens.\nExperimental results demonstrate that our proposed method achieves comparable\nperformance to the original approach while compressing the visual tokens to an\naverage of 10% of the original quantity. Our source code will be made publicly\navailable following acceptance.\n","authors":["Yi Chen","Jian Xu","Xu-Yao Zhang","Wen-Zhuo Liu","Yang-Yang Liu","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2409.01179v3.pdf","comment":"AAAI2025 Accepted"},{"id":"http://arxiv.org/abs/2403.10650v3","updated":"2024-12-19T06:25:45Z","published":"2024-03-15T19:35:10Z","title":"PALM: Pushing Adaptive Learning Rate Mechanisms for Continual Test-Time\n Adaptation","summary":" Real-world vision models in dynamic environments face rapid shifts in domain\ndistributions, leading to decreased recognition performance. Using unlabeled\ntest data, continuous test-time adaptation (CTTA) directly adjusts a\npre-trained source discriminative model to these changing domains. A highly\neffective CTTA method involves applying layer-wise adaptive learning rates for\nselectively adapting pre-trained layers. However, it suffers from the poor\nestimation of domain shift and the inaccuracies arising from the pseudo-labels.\nThis work aims to overcome these limitations by identifying layers for\nadaptation via quantifying model prediction uncertainty without relying on\npseudo-labels. We utilize the magnitude of gradients as a metric, calculated by\nbackpropagating the KL divergence between the softmax output and a uniform\ndistribution, to select layers for further adaptation. Subsequently, for the\nparameters exclusively belonging to these selected layers, with the remaining\nones frozen, we evaluate their sensitivity to approximate the domain shift and\nadjust their learning rates accordingly. We conduct extensive image\nclassification experiments on CIFAR-10C, CIFAR-100C, and ImageNet-C,\ndemonstrating the superior efficacy of our method compared to prior approaches.\n","authors":["Sarthak Kumar Maharana","Baoming Zhang","Yunhui Guo"],"pdf_url":"https://arxiv.org/pdf/2403.10650v3.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14559v1","updated":"2024-12-19T06:22:19Z","published":"2024-12-19T06:22:19Z","title":"ScaMo: Exploring the Scaling Law in Autoregressive Motion Generation\n Model","summary":" The scaling law has been validated in various domains, such as natural\nlanguage processing (NLP) and massive computer vision tasks; however, its\napplication to motion generation remains largely unexplored. In this paper, we\nintroduce a scalable motion generation framework that includes the motion\ntokenizer Motion FSQ-VAE and a text-prefix autoregressive transformer. Through\ncomprehensive experiments, we observe the scaling behavior of this system. For\nthe first time, we confirm the existence of scaling laws within the context of\nmotion generation. Specifically, our results demonstrate that the normalized\ntest loss of our prefix autoregressive models adheres to a logarithmic law in\nrelation to compute budgets. 
Furthermore, we also confirm the power law between\nNon-Vocabulary Parameters, Vocabulary Parameters, and Data Tokens with respect\nto compute budgets respectively. Leveraging the scaling law, we predict the\noptimal transformer size, vocabulary size, and data requirements for a compute\nbudget of $1e18$. The test loss of the system, when trained with the optimal\nmodel size, vocabulary size, and required data, aligns precisely with the\npredicted test loss, thereby validating the scaling law.\n","authors":["Shunlin Lu","Jingbo Wang","Zeyu Lu","Ling-Hao Chen","Wenxun Dai","Junting Dong","Zhiyang Dou","Bo Dai","Ruimao Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.14559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20633v3","updated":"2024-12-19T06:22:05Z","published":"2024-05-31T05:49:37Z","title":"Skeleton-OOD: An End-to-End Skeleton-Based Model for Robust\n Out-of-Distribution Human Action Detection","summary":" Human action recognition is crucial in computer vision systems. However, in\nreal-world scenarios, human actions often fall outside the distribution of\ntraining data, requiring a model to both recognize in-distribution (ID) actions\nand reject out-of-distribution (OOD) ones. Despite its importance, there has\nbeen limited research on OOD detection in human actions. Existing works on OOD\ndetection mainly focus on image data with RGB structure, and many methods are\npost-hoc in nature. While these methods are convenient and computationally\nefficient, they often lack sufficient accuracy, fail to consider the exposure\nof OOD samples, and ignore the application in skeleton structure data. To\naddress these challenges, we propose a novel end-to-end skeleton-based model\ncalled Skeleton-OOD, which is committed to improving the effectiveness of OOD\ntasks while ensuring the accuracy of ID recognition. Through extensive\nexperiments conducted on NTU-RGB+D 60, NTU-RGB+D 120, and Kinetics-400\ndatasets, Skeleton-OOD demonstrates the superior performance of our proposed\napproach compared to state-of-the-art methods. Our findings underscore the\neffectiveness of classic OOD detection techniques in the context of\nskeleton-based action recognition tasks, offering promising avenues for future\nresearch in this field. Code is available at\nhttps://github.com/YilliaJing/Skeleton-OOD.git.\n","authors":["Jing Xu","Anqi Zhu","Jingyu Lin","Qiuhong Ke","Cunjian Chen"],"pdf_url":"https://arxiv.org/pdf/2405.20633v3.pdf","comment":"Accepted by Neurocomputing"},{"id":"http://arxiv.org/abs/2412.14547v1","updated":"2024-12-19T05:55:18Z","published":"2024-12-19T05:55:18Z","title":"Bright-NeRF:Brightening Neural Radiance Field with Color Restoration\n from Low-light Raw Images","summary":" Neural Radiance Fields (NeRFs) have demonstrated prominent performance in\nnovel view synthesis. However, their input heavily relies on image acquisition\nunder normal light conditions, making it challenging to learn accurate scene\nrepresentation in low-light environments where images typically exhibit\nsignificant noise and severe color distortion. To address these challenges, we\npropose a novel approach, Bright-NeRF, which learns enhanced and high-quality\nradiance fields from multi-view low-light raw images in an unsupervised manner.\nOur method simultaneously achieves color restoration, denoising, and enhanced\nnovel view synthesis. 
Specifically, we leverage a physically-inspired model of\nthe sensor's response to illumination and introduce a chromatic adaptation loss\nto constrain the learning of response, enabling consistent color perception of\nobjects regardless of lighting conditions. We further utilize the raw data's\nproperties to expose the scene's intensity automatically. Additionally, we have\ncollected a multi-view low-light raw image dataset to advance research in this\nfield. Experimental results demonstrate that our proposed method significantly\noutperforms existing 2D and 3D approaches. Our code and dataset will be made\npublicly available.\n","authors":["Min Wang","Xin Huang","Guoqing Zhou","Qifeng Guo","Qing Wang"],"pdf_url":"https://arxiv.org/pdf/2412.14547v1.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2412.14546v1","updated":"2024-12-19T05:52:16Z","published":"2024-12-19T05:52:16Z","title":"{S$^3$-Mamba}: Small-Size-Sensitive Mamba for Lesion Segmentation","summary":" Small lesions play a critical role in early disease diagnosis and\nintervention of severe infections. Popular models often face challenges in\nsegmenting small lesions, as it occupies only a minor portion of an image,\nwhile down\\_sampling operations may inevitably lose focus on local features of\nsmall lesions. To tackle the challenges, we propose a {\\bf S}mall-{\\bf\nS}ize-{\\bf S}ensitive {\\bf Mamba} ({\\bf S$^3$-Mamba}), which promotes the\nsensitivity to small lesions across three dimensions: channel, spatial, and\ntraining strategy. Specifically, an Enhanced Visual State Space block is\ndesigned to focus on small lesions through multiple residual connections to\npreserve local features, and selectively amplify important details while\nsuppressing irrelevant ones through channel-wise attention. A Tensor-based\nCross-feature Multi-scale Attention is designed to integrate input image\nfeatures and intermediate-layer features with edge features and exploit the\nattentive support of features across multiple scales, thereby retaining spatial\ndetails of small lesions at various granularities. Finally, we introduce a\nnovel regularized curriculum learning to automatically assess lesion size and\nsample difficulty, and gradually focus from easy samples to hard ones like\nsmall lesions. Extensive experiments on three medical image segmentation\ndatasets show the superiority of our S$^3$-Mamba, especially in segmenting\nsmall lesions. Our code is available at\nhttps://github.com/ErinWang2023/S3-Mamba.\n","authors":["Gui Wang","Yuexiang Li","Wenting Chen","Meidan Ding","Wooi Ping Cheah","Rong Qu","Jianfeng Ren","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2412.14546v1.pdf","comment":"Accept by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14545v1","updated":"2024-12-19T05:51:46Z","published":"2024-12-19T05:51:46Z","title":"Summary of Point Transformer with Federated Learning for Predicting\n Breast Cancer HER2 Status from Hematoxylin and Eosin-Stained Whole Slide\n Images","summary":" This study introduces a federated learning-based approach to predict HER2\nstatus from hematoxylin and eosin (HE)-stained whole slide images (WSIs),\nreducing costs and speeding up treatment decisions. To address label imbalance\nand feature representation challenges in multisite datasets, a point\ntransformer is proposed, incorporating dynamic label distribution, an auxiliary\nclassifier, and farthest cosine sampling. 
Extensive experiments demonstrate\nstate-of-the-art performance across four sites (2687 WSIs) and strong\ngeneralization to two unseen sites (229 WSIs).\n","authors":["Kamorudeen A. Amuda","Almustapha A. Wakili"],"pdf_url":"https://arxiv.org/pdf/2412.14545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16214v2","updated":"2024-12-19T05:50:06Z","published":"2024-07-23T06:42:55Z","title":"Diff-Shadow: Global-guided Diffusion Model for Shadow Removal","summary":" We propose Diff-Shadow, a global-guided diffusion model for shadow removal.\nPrevious transformer-based approaches can utilize global information to relate\nshadow and non-shadow regions but are limited in their synthesis ability and\nrecover images with obvious boundaries. In contrast, diffusion-based methods\ncan generate better content but they are not exempt from issues related to\ninconsistent illumination. In this work, we combine the advantages of diffusion\nmodels and global guidance to achieve shadow-free restoration. Specifically, we\npropose a parallel UNets architecture: 1) the local branch performs the\npatch-based noise estimation in the diffusion process, and 2) the global branch\nrecovers the low-resolution shadow-free images. A Reweight Cross Attention\n(RCA) module is designed to integrate global contextual information of\nnon-shadow regions into the local branch. We further design a Global-guided\nSampling Strategy (GSS) that mitigates patch boundary issues and ensures\nconsistent illumination across shaded and unshaded regions in the recovered\nimage. Comprehensive experiments on datasets ISTD, ISTD+, and SRD have\ndemonstrated the effectiveness of Diff-Shadow. Compared to state-of-the-art\nmethods, our method achieves a significant improvement in terms of PSNR,\nincreasing from 32.33dB to 33.69dB on the ISTD dataset.\n","authors":["Jinting Luo","Ru Li","Chengzhi Jiang","Xiaoming Zhang","Mingyan Han","Ting Jiang","Haoqiang Fan","Shuaicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2407.16214v2.pdf","comment":"Proceedings of the 39th Annual AAAI Conference on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2412.08125v2","updated":"2024-12-19T05:46:29Z","published":"2024-12-11T06:21:33Z","title":"Progressive Multi-granular Alignments for Grounded Reasoning in Large\n Vision-Language Models","summary":" Existing Large Vision-Language Models (LVLMs) excel at matching concepts\nacross multi-modal inputs but struggle with compositional concepts and\nhigh-level relationships between entities. This paper introduces Progressive\nmulti-granular Vision-Language alignments (PromViL), a novel framework to\nenhance LVLMs' ability in performing grounded compositional visual reasoning\ntasks. Our approach constructs a hierarchical structure of multi-modal\nalignments, ranging from simple to complex concepts. By progressively aligning\ntextual descriptions with corresponding visual regions, our model learns to\nleverage contextual information from lower levels to inform higher-level\nreasoning. To facilitate this learning process, we introduce a data generation\nprocess that creates a novel dataset derived from Visual Genome, providing a\nwide range of nested compositional vision-language pairs. Experimental results\ndemonstrate that our PromViL framework significantly outperforms baselines on\nvarious visual grounding and compositional question answering tasks. 
The code\nis available at: https://github.com/lqh52/PromViL.\n","authors":["Quang-Hung Le","Long Hoang Dang","Ngan Le","Truyen Tran","Thao Minh Le"],"pdf_url":"https://arxiv.org/pdf/2412.08125v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2412.15093v1","updated":"2024-12-19T17:43:27Z","published":"2024-12-19T17:43:27Z","title":"Nano-ESG: Extracting Corporate Sustainability Information from News\n Articles","summary":" Determining the sustainability impact of companies is a highly complex\nsubject which has garnered more and more attention over the past few years.\nToday, investors largely rely on sustainability-ratings from established\nrating-providers in order to analyze how responsibly a company acts. However,\nthose ratings have recently been criticized for being hard to understand and\nnearly impossible to reproduce.\n An independent way to find out about the sustainability practices of\ncompanies lies in the rich landscape of news article data. In this paper, we\nexplore a different approach to identify key opportunities and challenges of\ncompanies in the sustainability domain. We present a novel dataset of more than\n840,000 news articles which were gathered for major German companies between\nJanuary 2023 and September 2024. By applying a mixture of Natural Language\nProcessing techniques, we first identify relevant articles, before summarizing\nthem and extracting their sustainability-related sentiment and aspect using\nLarge Language Models (LLMs). Furthermore, we conduct an evaluation of the\nobtained data and determine that the LLM-produced answers are accurate. We\nrelease both datasets at https://github.com/Bailefan/Nano-ESG.\n","authors":["Fabian Billert","Stefan Conrad"],"pdf_url":"https://arxiv.org/pdf/2412.15093v1.pdf","comment":"To be published at ECIR 2025. Preprint"},{"id":"http://arxiv.org/abs/2301.03767v2","updated":"2024-12-19T16:45:52Z","published":"2023-01-10T03:10:32Z","title":"Metric Compatible Training for Online Backfilling in Large-Scale\n Retrieval","summary":" Backfilling is the process of re-extracting all gallery embeddings from\nupgraded models in image retrieval systems. It inevitably requires a\nprohibitively large amount of computational cost and even entails the downtime\nof the service. Although backward-compatible learning sidesteps this challenge\nby tackling query-side representations, this leads to suboptimal solutions in\nprinciple because gallery embeddings cannot benefit from model upgrades. We\naddress this dilemma by introducing an online backfilling algorithm, which\nenables us to achieve a progressive performance improvement during the\nbackfilling process while not sacrificing the final performance of new model\nafter the completion of backfilling. To this end, we first propose a simple\ndistance rank merge technique for online backfilling. Then, we incorporate a\nreverse transformation module for more effective and efficient merging, which\nis further enhanced by adopting a metric-compatible contrastive learning\napproach. These two components help to make the distances of old and new models\ncompatible, resulting in desirable merge results during backfilling with no\nextra computational overhead. 
Extensive experiments show the effectiveness of\nour framework on four standard benchmarks in various settings.\n","authors":["Seonguk Seo","Mustafa Gokhan Uzunbas","Bohyung Han","Sara Cao","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2301.03767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15005v1","updated":"2024-12-19T16:20:42Z","published":"2024-12-19T16:20:42Z","title":"DisCo: Graph-Based Disentangled Contrastive Learning for Cold-Start\n Cross-Domain Recommendation","summary":" Recommender systems are widely used in various real-world applications, but\nthey often encounter the persistent challenge of the user cold-start problem.\nCross-domain recommendation (CDR), which leverages user interactions from one\ndomain to improve prediction performance in another, has emerged as a promising\nsolution. However, users with similar preferences in the source domain may\nexhibit different interests in the target domain. Therefore, directly\ntransferring embeddings may introduce irrelevant source-domain collaborative\ninformation. In this paper, we propose a novel graph-based disentangled\ncontrastive learning framework to capture fine-grained user intent and filter\nout irrelevant collaborative information, thereby avoiding negative transfer.\nSpecifically, for each domain, we use a multi-channel graph encoder to capture\ndiverse user intents. We then construct the affinity graph in the embedding\nspace and perform multi-step random walks to capture high-order user similarity\nrelationships. Treating one domain as the target, we propose a disentangled\nintent-wise contrastive learning approach, guided by user similarity, to refine\nthe bridging of user intents across domains. Extensive experiments on four\nbenchmark CDR datasets demonstrate that DisCo consistently outperforms existing\nstate-of-the-art baselines, thereby validating the effectiveness of both DisCo\nand its components.\n","authors":["Hourun Li","Yifan Wang","Zhiping Xiao","Jia Yang","Changling Zhou","Ming Zhang","Wei Ju"],"pdf_url":"https://arxiv.org/pdf/2412.15005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14978v1","updated":"2024-12-19T15:53:21Z","published":"2024-12-19T15:53:21Z","title":"Spectrum-based Modality Representation Fusion Graph Convolutional\n Network for Multimodal Recommendation","summary":" Incorporating multi-modal features as side information has recently become a\ntrend in recommender systems. To elucidate user-item preferences, recent\nstudies focus on fusing modalities via concatenation, element-wise sum, or\nattention mechanisms. Despite having notable success, existing approaches do\nnot account for the modality-specific noise encapsulated within each modality.\nAs a result, direct fusion of modalities will lead to the amplification of\ncross-modality noise. Moreover, the variation of noise that is unique within\neach modality results in noise alleviation and fusion being more challenging.\nIn this work, we propose a new Spectrum-based Modality Representation (SMORE)\nfusion graph recommender that aims to capture both uni-modal and fusion\npreferences while simultaneously suppressing modality noise. Specifically,\nSMORE projects the multi-modal features into the frequency domain and leverages\nthe spectral space for fusion. 
To reduce dynamic contamination that is unique\nto each modality, we introduce a filter to attenuate and suppress the modality\nnoise adaptively while capturing the universal modality patterns effectively.\nFurthermore, we explore the item latent structures by designing a new\nmulti-modal graph learning module to capture associative semantic correlations\nand universal fusion patterns among similar items. Finally, we formulate a new\nmodality-aware preference module, which infuses behavioral features and\nbalances the uni- and multi-modal features for precise preference modeling.\nThis empowers SMORE with the ability to infer both user modality-specific and\nfusion preferences more accurately. Experiments on three real-world datasets\nshow the efficacy of our proposed model. The source code for this work has been\nmade publicly available at https://github.com/kennethorq/SMORE.\n","authors":["Rongqing Kenneth Ong","Andy W. H. Khong"],"pdf_url":"https://arxiv.org/pdf/2412.14978v1.pdf","comment":"Accepted to ACM Web Search and Data Mining (WSDM) 2025"},{"id":"http://arxiv.org/abs/2412.14967v1","updated":"2024-12-19T15:45:06Z","published":"2024-12-19T15:45:06Z","title":"ECLIPSE: Contrastive Dimension Importance Estimation with\n Pseudo-Irrelevance Feedback for Dense Retrieval","summary":" Recent advances in Information Retrieval have leveraged high-dimensional\nembedding spaces to improve the retrieval of relevant documents. Moreover, the\nManifold Clustering Hypothesis suggests that despite these high-dimensional\nrepresentations, documents relevant to a query reside on a lower-dimensional,\nquery-dependent manifold. While this hypothesis has inspired new retrieval\nmethods, existing approaches still face challenges in effectively separating\nnon-relevant information from relevant signals. We propose a novel methodology\nthat addresses these limitations by leveraging information from both relevant\nand non-relevant documents. Our method, ECLIPSE, computes a centroid based on\nirrelevant documents as a reference to estimate noisy dimensions present in\nrelevant ones, enhancing retrieval performance. Extensive experiments on three\nin-domain and one out-of-domain benchmarks demonstrate an average improvement\nof up to 19.50% (resp. 22.35%) in mAP(AP) and 11.42% (resp. 13.10%) in nDCG@10\nw.r.t. the DIME-based baseline (resp. the baseline using all dimensions). Our\nresults pave the way for more robust, pseudo-irrelevance-based retrieval\nsystems in future IR research.\n","authors":["Giulio D'Erasmo","Giovanni Trappolini","Nicola Tonellotto","Fabrizio Silvestri"],"pdf_url":"https://arxiv.org/pdf/2412.14967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00326v5","updated":"2024-12-19T15:07:38Z","published":"2023-12-01T03:44:54Z","title":"Agent-OM: Leveraging LLM Agents for Ontology Matching","summary":" Ontology matching (OM) enables semantic interoperability between different\nontologies and resolves their conceptual heterogeneity by aligning related\nentities. OM systems currently have two prevailing design paradigms:\nconventional knowledge-based expert systems and newer machine learning-based\npredictive systems. While large language models (LLMs) and LLM agents have\nrevolutionised data engineering and have been applied creatively in many\ndomains, their potential for OM remains underexplored. This study introduces a\nnovel agent-powered LLM-based design paradigm for OM systems. 
With\nconsideration of several specific challenges in leveraging LLM agents for OM,\nwe propose a generic framework, namely Agent-OM (Agent for Ontology Matching),\nconsisting of two Siamese agents for retrieval and matching, with a set of\nsimple OM tools. Our framework is implemented in a proof-of-concept system.\nEvaluations of three Ontology Alignment Evaluation Initiative (OAEI) tracks\nover state-of-the-art OM systems show that our system can achieve results very\nclose to the long-standing best performance on simple OM tasks and can\nsignificantly improve the performance on complex and few-shot OM tasks.\n","authors":["Zhangcheng Qiang","Weiqing Wang","Kerry Taylor"],"pdf_url":"https://arxiv.org/pdf/2312.00326v5.pdf","comment":"19 pages, 13 figures, 4 tables"},{"id":"http://arxiv.org/abs/2402.00390v2","updated":"2024-12-19T14:28:19Z","published":"2024-02-01T07:22:52Z","title":"DNS-Rec: Data-aware Neural Architecture Search for Recommender Systems","summary":" In the era of data proliferation, efficiently sifting through vast\ninformation to extract meaningful insights has become increasingly crucial.\nThis paper addresses the computational overhead and resource inefficiency\nprevalent in existing Sequential Recommender Systems (SRSs). We introduce an\ninnovative approach combining pruning methods with advanced model designs.\nFurthermore, we delve into resource-constrained Neural Architecture Search\n(NAS), an emerging technique in recommender systems, to optimize models in\nterms of FLOPs, latency, and energy consumption while maintaining or enhancing\naccuracy. Our principal contribution is the development of a Data-aware Neural\nArchitecture Search for Recommender System (DNS-Rec). DNS-Rec is specifically\ndesigned to tailor compact network architectures for attention-based SRS\nmodels, thereby ensuring accuracy retention. It incorporates data-aware gates\nto enhance the performance of the recommendation network by learning\ninformation from historical user-item interactions. Moreover, DNS-Rec employs a\ndynamic resource constraint strategy, stabilizing the search process and\nyielding more suitable architectural solutions. We demonstrate the\neffectiveness of our approach through rigorous experiments conducted on three\nbenchmark datasets, which highlight the superiority of DNS-Rec in SRSs. Our\nfindings set a new standard for future research in efficient and accurate\nrecommendation systems, marking a significant step forward in this rapidly\nevolving field.\n","authors":["Sheng Zhang","Maolin Wang","Yao Zhao","Chenyi Zhuang","Jinjie Gu","Ruocheng Guo","Xiangyu Zhao","Zijian Zhang","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2402.00390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14835v1","updated":"2024-12-19T13:25:39Z","published":"2024-12-19T13:25:39Z","title":"Progressive Multimodal Reasoning via Active Retrieval","summary":" Multi-step multimodal reasoning tasks pose significant challenges for\nmultimodal large language models (MLLMs), and finding effective ways to enhance\ntheir performance in such scenarios remains an unresolved issue. In this paper,\nwe propose AR-MCTS, a universal framework designed to progressively improve the\nreasoning capabilities of MLLMs through Active Retrieval (AR) and Monte Carlo\nTree Search (MCTS). Our approach begins with the development of a unified\nretrieval module that retrieves key supporting insights for solving complex\nreasoning problems from a hybrid-modal retrieval corpus. 
To bridge the gap in\nautomated multimodal reasoning verification, we employ the MCTS algorithm\ncombined with an active retrieval mechanism, which enables the automatic\ngeneration of step-wise annotations. This strategy dynamically retrieves key\ninsights for each reasoning step, moving beyond traditional beam search\nsampling to improve the diversity and reliability of the reasoning space.\nAdditionally, we introduce a process reward model that aligns progressively to\nsupport the automatic verification of multimodal reasoning tasks. Experimental\nresults across three complex multimodal reasoning benchmarks confirm the\neffectiveness of the AR-MCTS framework in enhancing the performance of various\nmultimodal models. Further analysis demonstrates that AR-MCTS can optimize\nsampling diversity and accuracy, yielding reliable multimodal reasoning.\n","authors":["Guanting Dong","Chenghao Zhang","Mengjie Deng","Yutao Zhu","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2412.14835v1.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2406.05666v9","updated":"2024-12-19T12:13:26Z","published":"2024-06-09T06:49:22Z","title":"Probability Distribution Learning and Its Application in Deep Learning","summary":" This paper introduces a novel theoretical learning framework, termed\nprobability distribution learning (PD learning). Departing from the traditional\nstatistical learning framework, PD learning focuses on learning the underlying\nprobability distribution, which is modeled as a random variable within the\nprobability simplex. In this framework, the optimization objective is the\nlearning error, which quantifies the posterior expected discrepancy between the\nmodel's predicted distribution and the underlying true distribution, given\navailable sample data and prior knowledge. To optimize the learning error, this\npaper proposes the necessary conditions for loss functions, models, and\noptimization algorithms, ensuring that these conditions are met in real-world\nmachine learning scenarios. Based on these conditions, the non-convex\noptimization mechanism corresponding to model training can be theoretically\nresolved. Moreover, this paper provides model-dependent and model-independent\nbounds on learning error, offering new insights into the model's fitting and\ngeneralization capabilities. Furthermore, the paper applies the PD learning\nframework to elucidate the mechanisms by which various techniques, including\nrandom parameter initialization, over-parameterization, and dropout, influence\ndeep model training. Finally, the paper substantiates the key conclusions of\nthe proposed framework through experimental results.\n","authors":["Binchuan Qi"],"pdf_url":"https://arxiv.org/pdf/2406.05666v9.pdf","comment":"arXiv admin note: text overlap with arXiv:2105.04026 by other\n authors. arXiv admin note: text overlap with arXiv:2105.04026 by other\n authors"},{"id":"http://arxiv.org/abs/2411.04677v3","updated":"2024-12-19T12:08:31Z","published":"2024-11-07T13:03:21Z","title":"Lightning IR: Straightforward Fine-tuning and Inference of\n Transformer-based Language Models for Information Retrieval","summary":" A wide range of transformer-based language models have been proposed for\ninformation retrieval tasks. However, including transformer-based models in\nretrieval pipelines is often complex and requires substantial engineering\neffort. 
In this paper, we introduce Lightning IR, an easy-to-use PyTorch\nLightning-based framework for applying transformer-based language models in\nretrieval scenarios. Lightning IR provides a modular and extensible\narchitecture that supports all stages of a retrieval pipeline: from fine-tuning\nand indexing to searching and re-ranking. Designed to be scalable and\nreproducible, Lightning IR is available as open-source:\nhttps://github.com/webis-de/lightning-ir.\n","authors":["Ferdinand Schlatt","Maik Fröbe","Matthias Hagen"],"pdf_url":"https://arxiv.org/pdf/2411.04677v3.pdf","comment":"Accepted as a demo at WSDM'25"},{"id":"http://arxiv.org/abs/2412.11216v2","updated":"2024-12-19T08:32:20Z","published":"2024-12-15T15:13:14Z","title":"Distribution-Consistency-Guided Multi-modal Hashing","summary":" Multi-modal hashing methods have gained popularity due to their fast speed\nand low storage requirements. Among them, the supervised methods demonstrate\nbetter performance by utilizing labels as supervisory signals compared with\nunsupervised methods. Currently, for almost all supervised multi-modal hashing\nmethods, there is a hidden assumption that training sets have no noisy labels.\nHowever, labels are often annotated incorrectly due to manual labeling in\nreal-world scenarios, which will greatly harm the retrieval performance. To\naddress this issue, we first discover a significant distribution consistency\npattern through experiments, i.e., the 1-0 distribution of the presence or\nabsence of each category in the label is consistent with the high-low\ndistribution of similarity scores of the hash codes relative to category\ncenters. Then, inspired by this pattern, we propose a novel\nDistribution-Consistency-Guided Multi-modal Hashing (DCGMH), which aims to\nfilter and reconstruct noisy labels to enhance retrieval performance.\nSpecifically, the proposed method first randomly initializes several category\ncenters, which are used to compute the high-low distribution of similarity\nscores; Noisy and clean labels are then separately filtered out via the\ndiscovered distribution consistency pattern to mitigate the impact of noisy\nlabels; Subsequently, a correction strategy, which is indirectly designed via\nthe distribution consistency pattern, is applied to the filtered noisy labels,\ncorrecting high-confidence ones while treating low-confidence ones as unlabeled\nfor unsupervised learning, thereby further enhancing the model's performance.\nExtensive experiments on three widely used datasets demonstrate the superiority\nof the proposed method compared to state-of-the-art baselines in multi-modal\nretrieval tasks. The code is available at\nhttps://github.com/LiuJinyu1229/DCGMH.\n","authors":["Jin-Yu Liu","Xian-Ling Mao","Tian-Yi Che","Rong-Cheng Tu"],"pdf_url":"https://arxiv.org/pdf/2412.11216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12470v2","updated":"2024-12-19T08:26:32Z","published":"2024-08-22T15:10:56Z","title":"DLCRec: A Novel Approach for Managing Diversity in LLM-Based Recommender\n Systems","summary":" The integration of Large Language Models (LLMs) into recommender systems has\nled to substantial performance improvements. However, this often comes at the\ncost of diminished recommendation diversity, which can negatively impact user\nsatisfaction. To address this issue, controllable recommendation has emerged as\na promising approach, allowing users to specify their preferences and receive\nrecommendations that meet their diverse needs. 
Despite its potential, existing\ncontrollable recommender systems frequently rely on simplistic mechanisms, such\nas a single prompt, to regulate diversity-an approach that falls short of\ncapturing the full complexity of user preferences. In response to these\nlimitations, we propose DLCRec, a novel framework designed to enable\nfine-grained control over diversity in LLM-based recommendations. Unlike\ntraditional methods, DLCRec adopts a fine-grained task decomposition strategy,\nbreaking down the recommendation process into three sequential sub-tasks: genre\nprediction, genre filling, and item prediction. These sub-tasks are trained\nindependently and inferred sequentially according to user-defined control\nnumbers, ensuring more precise control over diversity. Furthermore, the\nscarcity and uneven distribution of diversity-related user behavior data pose\nsignificant challenges for fine-tuning. To overcome these obstacles, we\nintroduce two data augmentation techniques that enhance the model's robustness\nto noisy and out-of-distribution data. These techniques expose the model to a\nbroader range of patterns, improving its adaptability in generating\nrecommendations with varying levels of diversity. Our extensive empirical\nevaluation demonstrates that DLCRec not only provides precise control over\ndiversity but also outperforms state-of-the-art baselines across multiple\nrecommendation scenarios.\n","authors":["Jiaju Chen","Chongming Gao","Shuai Yuan","Shuchang Liu","Qingpeng Cai","Peng Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.12470v2.pdf","comment":"Accepted by WSDM 2025"},{"id":"http://arxiv.org/abs/2412.14574v1","updated":"2024-12-19T06:44:59Z","published":"2024-12-19T06:44:59Z","title":"Sliding Windows Are Not the End: Exploring Full Ranking with\n Long-Context Large Language Models","summary":" Large Language Models (LLMs) have shown exciting performance in listwise\npassage ranking. Due to the limited input length, existing methods often adopt\nthe sliding window strategy. Such a strategy, though effective, is inefficient\nas it involves repetitive and serialized processing, which usually re-evaluates\nrelevant passages multiple times. As a result, it incurs redundant API costs,\nwhich are proportional to the number of inference tokens. The development of\nlong-context LLMs enables the full ranking of all passages within a single\ninference, avoiding redundant API costs. In this paper, we conduct a\ncomprehensive study of long-context LLMs for ranking tasks in terms of\nefficiency and effectiveness. Surprisingly, our experiments reveal that full\nranking with long-context LLMs can deliver superior performance in the\nsupervised fine-tuning setting with a huge efficiency improvement. Furthermore,\nwe identify two limitations of fine-tuning the full ranking model based on\nexisting methods: (1) sliding window strategy fails to produce a full ranking\nlist as a training label, and (2) the language modeling loss cannot emphasize\ntop-ranked passage IDs in the label. To alleviate these issues, we propose a\nnew complete listwise label construction approach and a novel importance-aware\nlearning objective for full ranking. Experiments show the superior performance\nof our method over baselines. 
Our codes are available at\n\\url{https://github.com/8421BCD/fullrank}.\n","authors":["Wenhan Liu","Xinyu Ma","Yutao Zhu","Ziliang Zhao","Shuaiqiang Wang","Dawei Yin","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2412.14574v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2405.00287v2","updated":"2024-12-19T05:48:08Z","published":"2024-05-01T02:27:59Z","title":"SCONE: A Novel Stochastic Sampling to Generate Contrastive Views and\n Hard Negative Samples for Recommendation","summary":" Graph-based collaborative filtering (CF) has emerged as a promising approach\nin recommender systems. Despite its achievements, graph-based CF models face\nchallenges due to data sparsity and negative sampling. In this paper, we\npropose a novel Stochastic sampling for i) COntrastive views and ii) hard\nNEgative samples (SCONE) to overcome these issues. SCONE generates dynamic\naugmented views and diverse hard negative samples via a unified stochastic\nsampling approach based on score-based generative models. Our extensive\nexperiments on 6 benchmark datasets show that SCONE consistently outperforms\nstate-of-the-art baselines. SCONE shows efficacy in addressing user sparsity\nand item popularity issues, while enhancing performance for both cold-start\nusers and long-tail items. Furthermore, our approach improves the diversity of\nthe recommendation and the uniformity of the representations. The code is\navailable at https://github.com/jeongwhanchoi/SCONE.\n","authors":["Chaejeong Lee","Jeongwhan Choi","Hyowon Wi","Sung-Bae Cho","Noseong Park"],"pdf_url":"https://arxiv.org/pdf/2405.00287v2.pdf","comment":"Accepted to WSDM 2025. Chaejeong Lee and Jeongwhan Choi are co-first\n authors with equal contributions"},{"id":"http://arxiv.org/abs/2412.14518v1","updated":"2024-12-19T04:33:22Z","published":"2024-12-19T04:33:22Z","title":"Efficient Self-Supervised Video Hashing with Selective State Spaces","summary":" Self-supervised video hashing (SSVH) is a practical task in video indexing\nand retrieval. Although Transformers are predominant in SSVH for their\nimpressive temporal modeling capabilities, they often suffer from computational\nand memory inefficiencies. Drawing inspiration from Mamba, an advanced\nstate-space model, we explore its potential in SSVH to achieve a better balance\nbetween efficacy and efficiency. We introduce S5VH, a Mamba-based video hashing\nmodel with an improved self-supervised learning paradigm. Specifically, we\ndesign bidirectional Mamba layers for both the encoder and decoder, which are\neffective and efficient in capturing temporal relationships thanks to the\ndata-dependent selective scanning mechanism with linear complexity. In our\nlearning strategy, we transform global semantics in the feature space into\nsemantically consistent and discriminative hash centers, followed by a center\nalignment loss as a global learning signal. Our self-local-global (SLG)\nparadigm significantly improves learning efficiency, leading to faster and\nbetter convergence. Extensive experiments demonstrate S5VH's improvements over\nstate-of-the-art methods, superior transferability, and scalable advantages in\ninference efficiency. Code is available at\nhttps://github.com/gimpong/AAAI25-S5VH.\n","authors":["Jinpeng Wang","Niu Lian","Jun Li","Yuting Wang","Yan Feng","Bin Chen","Yongbing Zhang","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2412.14518v1.pdf","comment":"Accepted by AAAI'25. 
9 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2412.14486v1","updated":"2024-12-19T03:19:18Z","published":"2024-12-19T03:19:18Z","title":"Moving Beyond LDA: A Comparison of Unsupervised Topic Modelling\n Techniques for Qualitative Data Analysis of Online Communities","summary":" Social media constitutes a rich and influential source of information for\nqualitative researchers. Although computational techniques like topic modelling\nassist with managing the volume and diversity of social media content,\nqualitative researcher's lack of programming expertise creates a significant\nbarrier to their adoption. In this paper we explore how BERTopic, an advanced\nLarge Language Model (LLM)-based topic modelling technique, can support\nqualitative data analysis of social media. We conducted interviews and hands-on\nevaluations in which qualitative researchers compared topics from three\nmodelling techniques: LDA, NMF, and BERTopic. BERTopic was favoured by 8 of 12\nparticipants for its ability to provide detailed, coherent clusters for deeper\nunderstanding and actionable insights. Participants also prioritised topic\nrelevance, logical organisation, and the capacity to reveal unexpected\nrelationships within the data. Our findings underscore the potential of\nLLM-based techniques for supporting qualitative analysis.\n","authors":["Amandeep Kaur","James R. Wallace"],"pdf_url":"https://arxiv.org/pdf/2412.14486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14476v1","updated":"2024-12-19T02:57:02Z","published":"2024-12-19T02:57:02Z","title":"HEC-GCN: Hypergraph Enhanced Cascading Graph Convolution Network for\n Multi-Behavior Recommendation","summary":" Multi-behavior recommendation (MBR) has garnered growing attention recently\ndue to its ability to mitigate the sparsity issue by inferring user preferences\nfrom various auxiliary behaviors to improve predictions for the target\nbehavior. Although existing research on MBR has yielded impressive results,\nthey still face two major limitations. First, previous methods mainly focus on\nmodeling fine-grained interaction information between users and items under\neach behavior, which may suffer from sparsity issue. Second, existing models\nusually concentrate on exploiting dependencies between two consecutive\nbehaviors, leaving intra- and inter-behavior consistency largely unexplored. To\nthe end, we propose a novel approach named Hypergraph Enhanced Cascading Graph\nConvolution Network for multi-behavior recommendation (HEC-GCN). To be\nspecific, we first explore both fine- and coarse-grained correlations among\nusers or items of each behavior by simultaneously modeling the\nbehavior-specific interaction graph and its corresponding hypergraph in a\ncascaded manner. Then, we propose a behavior consistency-guided alignment\nstrategy that ensures consistent representations between the interaction graph\nand its associated hypergraph for each behavior, while also maintaining\nrepresentation consistency across different behaviors. Extensive experiments\nand analyses on three public benchmark datasets demonstrate that our proposed\napproach is consistently superior to previous state-of-the-art methods due to\nits capability to effectively attenuate the sparsity issue as well as preserve\nboth intra- and inter-behavior consistencies. 
The code is available at\nhttps://github.com/marqu22/HEC-GCN.git.\n","authors":["Yabo Yin","Xiaofei Zhu","Wenshan Wang","Yihao Zhang","Pengfei Wang","Yixing Fan","Jiafeng Guo"],"pdf_url":"https://arxiv.org/pdf/2412.14476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14768v3","updated":"2024-12-19T02:18:54Z","published":"2024-05-23T16:35:52Z","title":"WISE: Rethinking the Knowledge Memory for Lifelong Model Editing of\n Large Language Models","summary":" Large language models (LLMs) need knowledge updates to meet the ever-growing\nworld facts and correct the hallucinated responses, facilitating the methods of\nlifelong model editing. Where the updated knowledge resides in memories is a\nfundamental question for model editing. In this paper, we find that editing\neither long-term memory (direct model parameters) or working memory\n(non-parametric knowledge of neural network activations/representations by\nretrieval) will result in an impossible triangle -- reliability,\ngeneralization, and locality can not be realized together in the lifelong\nediting settings. For long-term memory, directly editing the parameters will\ncause conflicts with irrelevant pretrained knowledge or previous edits (poor\nreliability and locality). For working memory, retrieval-based activations can\nhardly make the model understand the edits and generalize (poor\ngeneralization). Therefore, we propose WISE to bridge the gap between memories.\nIn WISE, we design a dual parametric memory scheme, which consists of the main\nmemory for the pretrained knowledge and a side memory for the edited knowledge.\nWe only edit the knowledge in the side memory and train a router to decide\nwhich memory to go through when given a query. For continual editing, we devise\na knowledge-sharding mechanism where different sets of edits reside in distinct\nsubspaces of parameters, and are subsequently merged into a shared memory\nwithout conflicts. Extensive experiments show that WISE can outperform previous\nmodel editing methods and overcome the impossible triangle under lifelong model\nediting of question answering, hallucination, and out-of-distribution settings\nacross trending LLM architectures, e.g., GPT, LLaMA, and Mistral. Code is\navailable at https://github.com/zjunlp/EasyEdit.\n","authors":["Peng Wang","Zexi Li","Ningyu Zhang","Ziwen Xu","Yunzhi Yao","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2405.14768v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.14457v1","updated":"2024-12-19T02:17:35Z","published":"2024-12-19T02:17:35Z","title":"VISA: Retrieval Augmented Generation with Visual Source Attribution","summary":" Generation with source attribution is important for enhancing the\nverifiability of retrieval-augmented generation (RAG) systems. However,\nexisting approaches in RAG primarily link generated content to document-level\nreferences, making it challenging for users to locate evidence among multiple\ncontent-rich retrieved documents. To address this challenge, we propose\nRetrieval-Augmented Generation with Visual Source Attribution (VISA), a novel\napproach that combines answer generation with visual source attribution.\nLeveraging large vision-language models (VLMs), VISA identifies the evidence\nand highlights the exact regions that support the generated answers with\nbounding boxes in the retrieved document screenshots. 
To evaluate its\neffectiveness, we curated two datasets: Wiki-VISA, based on crawled Wikipedia\nwebpage screenshots, and Paper-VISA, derived from PubLayNet and tailored to the\nmedical domain. Experimental results demonstrate the effectiveness of VISA for\nvisual source attribution on documents' original look, as well as highlighting\nthe challenges for improvement. Code, data, and model checkpoints will be\nreleased.\n","authors":["Xueguang Ma","Shengyao Zhuang","Bevan Koopman","Guido Zuccon","Wenhu Chen","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2412.14457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17969v3","updated":"2024-12-19T02:10:00Z","published":"2024-05-28T08:56:33Z","title":"Knowledge Circuits in Pretrained Transformers","summary":" The remarkable capabilities of modern large language models are rooted in\ntheir vast repositories of knowledge encoded within their parameters, enabling\nthem to perceive the world and engage in reasoning. The inner workings of how\nthese models store knowledge have long been a subject of intense interest and\ninvestigation among researchers. To date, most studies have concentrated on\nisolated components within these models, such as the Multilayer Perceptrons and\nattention head. In this paper, we delve into the computation graph of the\nlanguage model to uncover the knowledge circuits that are instrumental in\narticulating specific knowledge. The experiments, conducted with GPT2 and\nTinyLLAMA, have allowed us to observe how certain information heads, relation\nheads, and Multilayer Perceptrons collaboratively encode knowledge within the\nmodel. Moreover, we evaluate the impact of current knowledge editing techniques\non these knowledge circuits, providing deeper insights into the functioning and\nconstraints of these editing methodologies. Finally, we utilize knowledge\ncircuits to analyze and interpret language model behaviors such as\nhallucinations and in-context learning. We believe the knowledge circuits hold\npotential for advancing our understanding of Transformers and guiding the\nimproved design of knowledge editing. Code and data are available in\nhttps://github.com/zjunlp/KnowledgeCircuits.\n","authors":["Yunzhi Yao","Ningyu Zhang","Zekun Xi","Mengru Wang","Ziwen Xu","Shumin Deng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2405.17969v3.pdf","comment":"NeurIPS 2024, 26 pages"},{"id":"http://arxiv.org/abs/2412.14454v1","updated":"2024-12-19T02:09:59Z","published":"2024-12-19T02:09:59Z","title":"Are Longer Prompts Always Better? Prompt Selection in Large Language\n Models for Recommendation Systems","summary":" In large language models (LLM)-based recommendation systems (LLM-RSs),\naccurately predicting user preferences by leveraging the general knowledge of\nLLMs is possible without requiring extensive training data. By converting\nrecommendation tasks into natural language inputs called prompts, LLM-RSs can\nefficiently solve issues that have been difficult to address due to data\nscarcity but are crucial in applications such as cold-start and cross-domain\nproblems. However, when applying this in practice, selecting the prompt that\nmatches tasks and data is essential. Although numerous prompts have been\nproposed in LLM-RSs and representing the target user in prompts significantly\nimpacts recommendation accuracy, there are still no clear guidelines for\nselecting specific prompts.\n In this paper, we categorize and analyze prompts from previous research to\nestablish practical prompt selection guidelines. 
Through 450 experiments with\n90 prompts and five real-world datasets, we examined the relationship between\nprompts and dataset characteristics in recommendation accuracy. We found that\nno single prompt consistently outperforms others; thus, selecting prompts on\nthe basis of dataset characteristics is crucial. Here, we propose a prompt\nselection method that achieves higher accuracy with minimal validation data.\nBecause increasing the number of prompts to explore raises costs, we also\nintroduce a cost-efficient strategy using high-performance and cost-efficient\nLLMs, significantly reducing exploration costs while maintaining high\nprediction accuracy. Our work offers valuable insights into the prompt\nselection, advancing accurate and efficient LLM-RSs.\n","authors":["Genki Kusano","Kosuke Akimoto","Kunihiro Takeoka"],"pdf_url":"https://arxiv.org/pdf/2412.14454v1.pdf","comment":"15 pages"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2412.15212v1","updated":"2024-12-19T18:59:51Z","published":"2024-12-19T18:59:51Z","title":"Scaling 4D Representations","summary":" Scaling has not yet been convincingly demonstrated for pure self-supervised\nlearning from video. However, prior work has focused evaluations on\nsemantic-related tasks $\\unicode{x2013}$ action classification, ImageNet\nclassification, etc. In this paper we focus on evaluating self-supervised\nlearning on non-semantic vision tasks that are more spatial (3D) and temporal\n(+1D = 4D), such as camera pose estimation, point and object tracking, and\ndepth estimation. We show that by learning from very large video datasets,\nmasked auto-encoding (MAE) with transformer video models actually scales,\nconsistently improving performance on these 4D tasks, as model size increases\nfrom 20M all the way to the largest by far reported self-supervised video model\n$\\unicode{x2013}$ 22B parameters. Rigorous apples-to-apples comparison with\nmany recent image and video models demonstrates the benefits of scaling 4D\nrepresentations.\n","authors":["João Carreira","Dilara Gokay","Michael King","Chuhan Zhang","Ignacio Rocco","Aravindh Mahendran","Thomas Albert Keck","Joseph Heyward","Skanda Koppula","Etienne Pot","Goker Erdogan","Yana Hasson","Yi Yang","Klaus Greff","Guillaume Le Moing","Sjoerd van Steenkiste","Daniel Zoran","Drew A. Hudson","Pedro Vélez","Luisa Polanía","Luke Friedman","Chris Duvarney","Ross Goroshin","Kelsey Allen","Jacob Walker","Rishabh Kabra","Eric Aboussouan","Jennifer Sun","Thomas Kipf","Carl Doersch","Viorica Pătrăucean","Dima Damen","Pauline Luc","Mehdi S. M. Sajjadi","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2412.15212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15209v1","updated":"2024-12-19T18:59:44Z","published":"2024-12-19T18:59:44Z","title":"PRIMA: Multi-Image Vision-Language Models for Reasoning Segmentation","summary":" Despite significant advancements in Large Vision-Language Models (LVLMs),\nexisting pixel-grounding models operate on single-image settings, limiting\ntheir ability to perform detailed, fine-grained comparisons across multiple\nimages. Conversely, current multi-image understanding models lack pixel-level\ngrounding. Our work addresses this gap by introducing the task of multi-image\npixel-grounded reasoning segmentation, and PRIMA, a novel LVLM that integrates\npixel-level grounding with robust multi-image reasoning capabilities to produce\ncontextually rich, pixel-grounded explanations. 
Central to PRIMA is an\nefficient vision module that queries fine-grained visual representations across\nmultiple images, reducing TFLOPs by $25.3\\%$. To support training and\nevaluation, we curate $M^4Seg$, a new reasoning segmentation benchmark\nconsisting of $\\sim$224K question-answer pairs that require fine-grained visual\nunderstanding across multiple images. Experimental results demonstrate PRIMA\noutperforms state-of-the-art baselines.\n","authors":["Muntasir Wahed","Kiet A. Nguyen","Adheesh Sunil Juvekar","Xinzhuo Li","Xiaona Zhou","Vedant Shah","Tianjiao Yu","Pinar Yanardag","Ismini Lourentzou"],"pdf_url":"https://arxiv.org/pdf/2412.15209v1.pdf","comment":"Project page: https://plan-lab.github.io/prima"},{"id":"http://arxiv.org/abs/2412.15208v1","updated":"2024-12-19T18:59:40Z","published":"2024-12-19T18:59:40Z","title":"OpenEMMA: Open-Source Multimodal Model for End-to-End Autonomous Driving","summary":" Since the advent of Multimodal Large Language Models (MLLMs), they have made\na significant impact across a wide range of real-world applications,\nparticularly in Autonomous Driving (AD). Their ability to process complex\nvisual data and reason about intricate driving scenarios has paved the way for\na new paradigm in end-to-end AD systems. However, the progress of developing\nend-to-end models for AD has been slow, as existing fine-tuning methods demand\nsubstantial resources, including extensive computational power, large-scale\ndatasets, and significant funding. Drawing inspiration from recent advancements\nin inference computing, we propose OpenEMMA, an open-source end-to-end\nframework based on MLLMs. By incorporating the Chain-of-Thought reasoning\nprocess, OpenEMMA achieves significant improvements compared to the baseline\nwhen leveraging a diverse range of MLLMs. Furthermore, OpenEMMA demonstrates\neffectiveness, generalizability, and robustness across a variety of challenging\ndriving scenarios, offering a more efficient and effective approach to\nautonomous driving. We release all the codes in\nhttps://github.com/taco-group/OpenEMMA.\n","authors":["Shuo Xing","Chengyuan Qian","Yuping Wang","Hongyuan Hua","Kexin Tian","Yang Zhou","Zhengzhong Tu"],"pdf_url":"https://arxiv.org/pdf/2412.15208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15206v1","updated":"2024-12-19T18:59:33Z","published":"2024-12-19T18:59:33Z","title":"AutoTrust: Benchmarking Trustworthiness in Large Vision Language Models\n for Autonomous Driving","summary":" Recent advancements in large vision language models (VLMs) tailored for\nautonomous driving (AD) have shown strong scene understanding and reasoning\ncapabilities, making them undeniable candidates for end-to-end driving systems.\nHowever, limited work exists on studying the trustworthiness of DriveVLMs -- a\ncritical factor that directly impacts public transportation safety. In this\npaper, we introduce AutoTrust, a comprehensive trustworthiness benchmark for\nlarge vision-language models in autonomous driving (DriveVLMs), considering\ndiverse perspectives -- including trustfulness, safety, robustness, privacy,\nand fairness. We constructed the largest visual question-answering dataset for\ninvestigating trustworthiness issues in driving scenarios, comprising over 10k\nunique scenes and 18k queries. 
We evaluated six publicly available VLMs,\nspanning from generalist to specialist, from open-source to commercial models.\nOur exhaustive evaluations have unveiled previously undiscovered\nvulnerabilities of DriveVLMs to trustworthiness threats. Specifically, we found\nthat the general VLMs like LLaVA-v1.6 and GPT-4o-mini surprisingly outperform\nspecialized models fine-tuned for driving in terms of overall trustworthiness.\nDriveVLMs like DriveLM-Agent are particularly vulnerable to disclosing\nsensitive information. Additionally, both generalist and specialist VLMs remain\nsusceptible to adversarial attacks and struggle to ensure unbiased\ndecision-making across diverse environments and populations. Our findings call\nfor immediate and decisive action to address the trustworthiness of DriveVLMs\n-- an issue of critical importance to public safety and the welfare of all\ncitizens relying on autonomous transportation systems. Our benchmark is\npublicly available at \\url{https://github.com/taco-group/AutoTrust}, and the\nleaderboard is released at \\url{https://taco-group.github.io/AutoTrust/}.\n","authors":["Shuo Xing","Hongyuan Hua","Xiangbo Gao","Shenzhe Zhu","Renjie Li","Kexin Tian","Xiaopeng Li","Heng Huang","Tianbao Yang","Zhangyang Wang","Yang Zhou","Huaxiu Yao","Zhengzhong Tu"],"pdf_url":"https://arxiv.org/pdf/2412.15206v1.pdf","comment":"55 pages, 14 figures"},{"id":"http://arxiv.org/abs/2412.15199v1","updated":"2024-12-19T18:58:36Z","published":"2024-12-19T18:58:36Z","title":"LiDAR-RT: Gaussian-based Ray Tracing for Dynamic LiDAR Re-simulation","summary":" This paper targets the challenge of real-time LiDAR re-simulation in dynamic\ndriving scenarios. Recent approaches utilize neural radiance fields combined\nwith the physical modeling of LiDAR sensors to achieve high-fidelity\nre-simulation results. Unfortunately, these methods face limitations due to\nhigh computational demands in large-scale scenes and cannot perform real-time\nLiDAR rendering. To overcome these constraints, we propose LiDAR-RT, a novel\nframework that supports real-time, physically accurate LiDAR re-simulation for\ndriving scenes. Our primary contribution is the development of an efficient and\neffective rendering pipeline, which integrates Gaussian primitives and\nhardware-accelerated ray tracing technology. Specifically, we model the\nphysical properties of LiDAR sensors using Gaussian primitives with learnable\nparameters and incorporate scene graphs to handle scene dynamics. Building upon\nthis scene representation, our framework first constructs a bounding volume\nhierarchy (BVH), then casts rays for each pixel and generates novel LiDAR views\nthrough a differentiable rendering algorithm. Importantly, our framework\nsupports realistic rendering with flexible scene editing operations and various\nsensor configurations. Extensive experiments across multiple public benchmarks\ndemonstrate that our method outperforms state-of-the-art methods in terms of\nrendering quality and efficiency. 
Our project page is at\nhttps://zju3dv.github.io/lidar-rt.\n","authors":["Chenxu Zhou","Lvchang Fu","Sida Peng","Yunzhi Yan","Zhanhua Zhang","Yong Chen","Jiazhi Xia","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.15199v1.pdf","comment":"Project page: https://zju3dv.github.io/lidar-rt"},{"id":"http://arxiv.org/abs/2412.15195v1","updated":"2024-12-19T18:58:14Z","published":"2024-12-19T18:58:14Z","title":"Preventing Local Pitfalls in Vector Quantization via Optimal Transport","summary":" Vector-quantized networks (VQNs) have exhibited remarkable performance across\nvarious tasks, yet they are prone to training instability, which complicates\nthe training process due to the necessity for techniques such as subtle\ninitialization and model distillation. In this study, we identify the local\nminima issue as the primary cause of this instability. To address this, we\nintegrate an optimal transport method in place of the nearest neighbor search\nto achieve a more globally informed assignment. We introduce OptVQ, a novel\nvector quantization method that employs the Sinkhorn algorithm to optimize the\noptimal transport problem, thereby enhancing the stability and efficiency of\nthe training process. To mitigate the influence of diverse data distributions\non the Sinkhorn algorithm, we implement a straightforward yet effective\nnormalization strategy. Our comprehensive experiments on image reconstruction\ntasks demonstrate that OptVQ achieves 100% codebook utilization and surpasses\ncurrent state-of-the-art VQNs in reconstruction quality.\n","authors":["Borui Zhang","Wenzhao Zheng","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2412.15195v1.pdf","comment":"Code is available at https://github.com/zbr17/OptVQ"},{"id":"http://arxiv.org/abs/2412.15191v1","updated":"2024-12-19T18:57:21Z","published":"2024-12-19T18:57:21Z","title":"AV-Link: Temporally-Aligned Diffusion Features for Cross-Modal\n Audio-Video Generation","summary":" We propose AV-Link, a unified framework for Video-to-Audio and Audio-to-Video\ngeneration that leverages the activations of frozen video and audio diffusion\nmodels for temporally-aligned cross-modal conditioning. The key to our\nframework is a Fusion Block that enables bidirectional information exchange\nbetween our backbone video and audio diffusion models through a\ntemporally-aligned self attention operation. Unlike prior work that uses\nfeature extractors pretrained for other tasks for the conditioning signal,\nAV-Link can directly leverage features obtained by the complementary modality\nin a single framework i.e. video features to generate audio, or audio features\nto generate video. We extensively evaluate our design choices and demonstrate\nthe ability of our method to achieve synchronized and high-quality audiovisual\ncontent, showcasing its potential for applications in immersive media\ngeneration. 
Project Page: snap-research.github.io/AVLink/\n","authors":["Moayed Haji-Ali","Willi Menapace","Aliaksandr Siarohin","Ivan Skorokhodov","Alper Canberk","Kwot Sin Lee","Vicente Ordonez","Sergey Tulyakov"],"pdf_url":"https://arxiv.org/pdf/2412.15191v1.pdf","comment":"Project Page: snap-research.github.io/AVLink/"},{"id":"http://arxiv.org/abs/2412.15188v1","updated":"2024-12-19T18:56:24Z","published":"2024-12-19T18:56:24Z","title":"LlamaFusion: Adapting Pretrained Language Models for Multimodal\n Generation","summary":" We present LlamaFusion, a framework for empowering pretrained text-only large\nlanguage models (LLMs) with multimodal generative capabilities, enabling them\nto understand and generate both text and images in arbitrary sequences.\nLlamaFusion leverages existing Llama-3's weights for processing texts\nautoregressively while introducing additional and parallel transformer modules\nfor processing images with diffusion. During training, the data from each\nmodality is routed to its dedicated modules: modality-specific feedforward\nlayers, query-key-value projections, and normalization layers process each\nmodality independently, while the shared self-attention layers allow\ninteractions across text and image features. By freezing the text-specific\nmodules and only training the image-specific modules, LlamaFusion preserves the\nlanguage capabilities of text-only LLMs while developing strong visual\nunderstanding and generation abilities. Compared to methods that pretrain\nmultimodal generative models from scratch, our experiments demonstrate that,\nLlamaFusion improves image understanding by 20% and image generation by 3.6%\nusing only 50% of the FLOPs while maintaining Llama-3's language capabilities.\nWe also demonstrate that this framework can adapt existing vision-language\nmodels with multimodal generation ability. Overall, this framework not only\nleverages existing computational investments in text-only LLMs but also enables\nthe parallel development of language and vision capabilities, presenting a\npromising direction for efficient multimodal model development.\n","authors":["Weijia Shi","Xiaochuang Han","Chunting Zhou","Weixin Liang","Xi Victoria Lin","Luke Zettlemoyer","Lili Yu"],"pdf_url":"https://arxiv.org/pdf/2412.15188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15184v1","updated":"2024-12-19T18:55:17Z","published":"2024-12-19T18:55:17Z","title":"Data for Mathematical Copilots: Better Ways of Presenting Proofs for\n Machine Learning","summary":" The suite of datasets commonly used to train and evaluate the mathematical\ncapabilities of AI-based mathematical copilots (primarily large language\nmodels) exhibit several shortcomings. These limitations include a restricted\nscope of mathematical complexity, typically not exceeding lower\nundergraduate-level mathematics, binary rating protocols and other issues,\nwhich makes comprehensive proof-based evaluation suites difficult. We\nsystematically explore these limitations and contend that enhancing the\ncapabilities of large language models, or any forthcoming advancements in\nAI-based mathematical assistants (copilots or \"thought partners\"), necessitates\na paradigm shift in the design of mathematical datasets and the evaluation\ncriteria of mathematical ability: It is necessary to move away from\nresult-based datasets (theorem statement to theorem proof) and convert the rich\nfacets of mathematical research practice to data LLMs can train on. 
Examples of\nthese are mathematical workflows (sequences of atomic, potentially\nsubfield-dependent tasks that are often performed when creating new\nmathematics), which are an important part of the proof-discovery process.\nAdditionally, we advocate for mathematical dataset developers to consider the\nconcept of \"motivated proof\", introduced by G. P\\'olya in 1949, which can serve\nas a blueprint for datasets that offer a better proof learning signal,\nalleviating some of the mentioned limitations. Lastly, we introduce math\ndatasheets for datasets, extending the general, dataset-agnostic variants of\ndatasheets: We provide a questionnaire designed specifically for math datasets\nthat we urge dataset creators to include with their datasets. This will make\ncreators aware of potential limitations of their datasets while at the same\ntime making it easy for readers to assess it from the point of view of training\nand evaluating mathematical copilots.\n","authors":["Simon Frieder","Jonas Bayer","Katherine M. Collins","Julius Berner","Jacob Loader","András Juhász","Fabian Ruehle","Sean Welleck","Gabriel Poesia","Ryan-Rhys Griffiths","Adrian Weller","Anirudh Goyal","Thomas Lukasiewicz","Timothy Gowers"],"pdf_url":"https://arxiv.org/pdf/2412.15184v1.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2412.15182v1","updated":"2024-12-19T18:54:06Z","published":"2024-12-19T18:54:06Z","title":"STRAP: Robot Sub-Trajectory Retrieval for Augmented Policy Learning","summary":" Robot learning is witnessing a significant increase in the size, diversity,\nand complexity of pre-collected datasets, mirroring trends in domains such as\nnatural language processing and computer vision. Many robot learning methods\ntreat such datasets as multi-task expert data and learn a multi-task,\ngeneralist policy by training broadly across them. Notably, while these\ngeneralist policies can improve the average performance across many tasks, the\nperformance of generalist policies on any one task is often suboptimal due to\nnegative transfer between partitions of the data, compared to task-specific\nspecialist policies. In this work, we argue for the paradigm of training\npolicies during deployment given the scenarios they encounter: rather than\ndeploying pre-trained policies to unseen problems in a zero-shot manner, we\nnon-parametrically retrieve and train models directly on relevant data at test\ntime. Furthermore, we show that many robotics tasks share considerable amounts\nof low-level behaviors and that retrieval at the \"sub\"-trajectory granularity\nenables significantly improved data utilization, generalization, and robustness\nin adapting policies to novel problems. In contrast, existing full-trajectory\nretrieval methods tend to underutilize the data and miss out on shared\ncross-task content. 
This work proposes STRAP, a technique for leveraging\npre-trained vision foundation models and dynamic time warping to retrieve\nsub-sequences of trajectories from large training corpora in a robust fashion.\nSTRAP outperforms both prior retrieval algorithms and multi-task learning\nmethods in simulated and real experiments, showing the ability to scale to much\nlarger offline datasets in the real world as well as the ability to learn\nrobust control policies with just a handful of real-world demonstrations.\n","authors":["Marius Memmel","Jacob Berg","Bingqing Chen","Abhishek Gupta","Jonathan Francis"],"pdf_url":"https://arxiv.org/pdf/2412.15182v1.pdf","comment":"Project website at https://weirdlabuw.github.io/strap/"},{"id":"http://arxiv.org/abs/2412.15178v1","updated":"2024-12-19T18:52:05Z","published":"2024-12-19T18:52:05Z","title":"HPC-Coder-V2: Studying Code LLMs Across Low-Resource Parallel Languages","summary":" Large Language Model (LLM) based coding tools have been tremendously\nsuccessful as software development assistants, yet they are often designed for\ngeneral purpose programming tasks and perform poorly for more specialized\ndomains such as high performance computing. Creating specialized models and\ntools for these domains is crucial towards gaining the benefits of LLMs in\nareas such as HPC. While previous work has explored HPC-specific models, LLMs\nstill struggle to generate parallel code and it is not at all clear what\nhurdles are still holding back these LLMs and what must be done to overcome\nthem. In this work, we conduct an in-depth study along the many axes of\nfine-tuning a specialized HPC LLM in order to better understand the challenges.\nBased on our findings we fine-tune and evaluate a specialized HPC LLM that is\nshown to be the best performing open-source code LLM for parallel code\ngeneration to date.\n","authors":["Aman Chaturvedi","Daniel Nichols","Siddharth Singh","Abhinav Bhatele"],"pdf_url":"https://arxiv.org/pdf/2412.15178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15176v1","updated":"2024-12-19T18:51:06Z","published":"2024-12-19T18:51:06Z","title":"Rethinking Uncertainty Estimation in Natural Language Generation","summary":" Large Language Models (LLMs) are increasingly employed in real-world\napplications, driving the need to evaluate the trustworthiness of their\ngenerated text. To this end, reliable uncertainty estimation is essential.\nSince current LLMs generate text autoregressively through a stochastic process,\nthe same prompt can lead to varying outputs. Consequently, leading uncertainty\nestimation methods generate and analyze multiple output sequences to determine\nthe LLM's uncertainty. However, generating output sequences is computationally\nexpensive, making these methods impractical at scale. In this work, we inspect\nthe theoretical foundations of the leading methods and explore new directions\nto enhance their computational efficiency. Building on the framework of proper\nscoring rules, we find that the negative log-likelihood of the most likely\noutput sequence constitutes a theoretically grounded uncertainty measure. To\napproximate this alternative measure, we propose G-NLL, which has the advantage\nof being obtained using only a single output sequence generated by greedy\ndecoding. This makes uncertainty estimation more efficient and straightforward,\nwhile preserving theoretical rigor. Empirical results demonstrate that G-NLL\nachieves state-of-the-art performance across various LLMs and tasks. 
Our work\nlays the foundation for efficient and reliable uncertainty estimation in\nnatural language generation, challenging the necessity of more computationally\ninvolved methods currently leading the field.\n","authors":["Lukas Aichberger","Kajetan Schweighofer","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2412.15176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18479v2","updated":"2024-12-19T18:49:00Z","published":"2024-11-27T16:22:33Z","title":"SoK: Watermarking for AI-Generated Content","summary":" As the outputs of generative AI (GenAI) techniques improve in quality, it\nbecomes increasingly challenging to distinguish them from human-created\ncontent. Watermarking schemes are a promising approach to address the problem\nof distinguishing between AI and human-generated content. These schemes embed\nhidden signals within AI-generated content to enable reliable detection. While\nwatermarking is not a silver bullet for addressing all risks associated with\nGenAI, it can play a crucial role in enhancing AI safety and trustworthiness by\ncombating misinformation and deception. This paper presents a comprehensive\noverview of watermarking techniques for GenAI, beginning with the need for\nwatermarking from historical and regulatory perspectives. We formalize the\ndefinitions and desired properties of watermarking schemes and examine the key\nobjectives and threat models for existing approaches. Practical evaluation\nstrategies are also explored, providing insights into the development of robust\nwatermarking techniques capable of resisting various attacks. Additionally, we\nreview recent representative works, highlight open challenges, and discuss\npotential directions for this emerging field. By offering a thorough\nunderstanding of watermarking in GenAI, this work aims to guide researchers in\nadvancing watermarking methods and applications, and support policymakers in\naddressing the broader implications of GenAI.\n","authors":["Xuandong Zhao","Sam Gunn","Miranda Christ","Jaiden Fairoze","Andres Fabrega","Nicholas Carlini","Sanjam Garg","Sanghyun Hong","Milad Nasr","Florian Tramer","Somesh Jha","Lei Li","Yu-Xiang Wang","Dawn Song"],"pdf_url":"https://arxiv.org/pdf/2411.18479v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.06289v3","updated":"2024-12-19T18:47:54Z","published":"2024-12-09T08:24:11Z","title":"S$^{2}$FT: Efficient, Scalable and Generalizable LLM Fine-tuning by\n Structured Sparsity","summary":" Current PEFT methods for LLMs can achieve either high quality, efficient\ntraining, or scalable serving, but not all three simultaneously. To address\nthis limitation, we investigate sparse fine-tuning and observe a remarkable\nimprovement in generalization ability. Utilizing this key insight, we propose a\nfamily of Structured Sparse Fine-Tuning (S$^{2}$FT) methods for LLMs, which\nconcurrently achieve state-of-the-art fine-tuning performance, training\nefficiency, and inference scalability. S$^{2}$FT accomplishes this by\n\"selecting sparsely and computing densely\". It selects a few heads and channels\nin the MHA and FFN modules for each Transformer block, respectively. Next, it\nco-permutes weight matrices on both sides of the coupled structures in LLMs to\nconnect the selected components in each layer into a dense submatrix. Finally,\nS$^{2}$FT performs in-place gradient updates on all submatrices. 
Through\ntheoretical analysis and empirical results, our method prevents forgetting\nwhile simplifying optimization, delivers SOTA performance on both commonsense\nand arithmetic reasoning with 4.6% and 1.3% average improvements compared to\nLoRA, and surpasses full FT by 11.5% when generalizing to various domains after\ninstruction tuning. Using our partial backpropagation algorithm, S$^{2}$FT\nsaves training memory up to 3$\\times$ and improves latency by 1.5-2.7$\\times$\ncompared to full FT, while delivering an average 10% improvement over LoRA on\nboth metrics. We further demonstrate that the weight updates in S$^{2}$FT can\nbe decoupled into adapters, enabling effective fusion, fast switch, and\nefficient parallelism for serving multiple fine-tuned models.\n","authors":["Xinyu Yang","Jixuan Leng","Geyang Guo","Jiawei Zhao","Ryumei Nakada","Linjun Zhang","Huaxiu Yao","Beidi Chen"],"pdf_url":"https://arxiv.org/pdf/2412.06289v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15163v1","updated":"2024-12-19T18:38:13Z","published":"2024-12-19T18:38:13Z","title":"Operationalising Rawlsian Ethics for Fairness in Norm-Learning Agents","summary":" Social norms are standards of behaviour common in a society. However, when\nagents make decisions without considering how others are impacted, norms can\nemerge that lead to the subjugation of certain agents. We present RAWL-E, a\nmethod to create ethical norm-learning agents. RAWL-E agents operationalise\nmaximin, a fairness principle from Rawlsian ethics, in their decision-making\nprocesses to promote ethical norms by balancing societal well-being with\nindividual goals. We evaluate RAWL-E agents in simulated harvesting scenarios.\nWe find that norms emerging in RAWL-E agent societies enhance social welfare,\nfairness, and robustness, and yield higher minimum experience compared to those\nthat emerge in agent societies that do not implement Rawlsian ethics.\n","authors":["Jessica Woodgate","Paul Marshall","Nirav Ajmeri"],"pdf_url":"https://arxiv.org/pdf/2412.15163v1.pdf","comment":"14 pages, 7 figures, 8 tables (and supplementary material with\n reproducibility and additional results), accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2412.15150v1","updated":"2024-12-19T18:28:37Z","published":"2024-12-19T18:28:37Z","title":"Leveraging Color Channel Independence for Improved Unsupervised Object\n Detection","summary":" Object-centric architectures can learn to extract distinct object\nrepresentations from visual scenes, enabling downstream applications on the\nobject level. Similarly to autoencoder-based image models, object-centric\napproaches have been trained on the unsupervised reconstruction loss of images\nencoded by RGB color spaces. In our work, we challenge the common assumption\nthat RGB images are the optimal color space for unsupervised learning in\ncomputer vision. We discuss conceptually and empirically that other color\nspaces, such as HSV, bear essential characteristics for object-centric\nrepresentation learning, like robustness to lighting conditions. We further\nshow that models improve when requiring them to predict additional color\nchannels. Specifically, we propose to transform the predicted targets to the\nRGB-S space, which extends RGB with HSV's saturation component and leads to\nmarkedly better reconstruction and disentanglement for five common evaluation\ndatasets. 
The use of composite color spaces can be implemented with basically\nno computational overhead, is agnostic of the models' architecture, and is\nuniversally applicable across a wide range of visual computing tasks and\ntraining types. The findings of our approach encourage additional\ninvestigations in computer vision tasks beyond object-centric learning.\n","authors":["Bastian Jäckl","Yannick Metz","Udo Schlegel","Daniel A. Keim","Maximilian T. Fischer"],"pdf_url":"https://arxiv.org/pdf/2412.15150v1.pdf","comment":"38 pages incl. references, 16 figures"},{"id":"http://arxiv.org/abs/2412.15129v1","updated":"2024-12-19T18:09:42Z","published":"2024-12-19T18:09:42Z","title":"Jet: A Modern Transformer-Based Normalizing Flow","summary":" In the past, normalizing generative flows have emerged as a promising class\nof generative models for natural images. This type of model has many modeling\nadvantages: the ability to efficiently compute log-likelihood of the input\ndata, fast generation and simple overall structure. Normalizing flows remained\na topic of active research but later fell out of favor, as visual quality of\nthe samples was not competitive with other model classes, such as GANs,\nVQ-VAE-based approaches or diffusion models. In this paper we revisit the\ndesign of the coupling-based normalizing flow models by carefully ablating\nprior design choices and using computational blocks based on the Vision\nTransformer architecture, not convolutional neural networks. As a result, we\nachieve state-of-the-art quantitative and qualitative performance with a much\nsimpler architecture. While the overall visual quality is still behind the\ncurrent state-of-the-art models, we argue that strong normalizing flow models\ncan help advancing research frontier by serving as building components of more\npowerful generative models.\n","authors":["Alexander Kolesnikov","André Susano Pinto","Michael Tschannen"],"pdf_url":"https://arxiv.org/pdf/2412.15129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15127v1","updated":"2024-12-19T18:08:04Z","published":"2024-12-19T18:08:04Z","title":"Adaptive Pruning for Large Language Models with Structural Importance\n Awareness","summary":" The recent advancements in large language models (LLMs) have significantly\nimproved language understanding and generation capabilities. However, it is\ndifficult to deploy LLMs on resource-constrained edge devices due to their high\ncomputational and storage resource demands. To address this issue, we propose a\nnovel LLM model pruning method, namely structurally-aware adaptive pruning\n(SAAP), to significantly reduce the computational and memory costs while\nmaintaining model performance. We first define an adaptive importance fusion\nmetric to evaluate the importance of all coupled structures in LLMs by\nconsidering their homoscedastic uncertainty. Then, we rank the importance of\nall modules to determine the specific layers that should be pruned to meet\nparticular performance requirements. Furthermore, we develop a new group\nfine-tuning strategy to improve the inference efficiency of LLMs. Finally, we\nevaluate the proposed SAAP method on multiple LLMs across two common tasks,\ni.e., zero-shot classification and text generation. Experimental results show\nthat our SAAP method outperforms several state-of-the-art baseline methods,\nachieving 2.17%, 2.37%, and 2.39% accuracy gains on LLaMA-7B, Vicuna-7B, and\nLLaMA-13B. 
Additionally, SAAP improves the token generation speed by 5%,\nshowcasing its practical advantages in resource-constrained scenarios.\n","authors":["Haotian Zheng","Jinke Ren","Yushan Sun","Ruichen Zhang","Wenbo Zhang","Zhen Li","Dusit Niyato","Shuguang Cui","Yatong Han"],"pdf_url":"https://arxiv.org/pdf/2412.15127v1.pdf","comment":"12 pages, 6 figures, 12 tables"},{"id":"http://arxiv.org/abs/2412.15118v1","updated":"2024-12-19T17:59:42Z","published":"2024-12-19T17:59:42Z","title":"Outcome-Refining Process Supervision for Code Generation","summary":" Large Language Models have demonstrated remarkable capabilities in code\ngeneration, yet they often struggle with complex programming tasks that require\ndeep algorithmic reasoning. While process supervision through learned reward\nmodels shows promise in guiding reasoning steps, it requires expensive training\ndata and suffers from unreliable evaluation. We propose Outcome-Refining\nProcess Supervision, a novel paradigm that treats outcome refinement itself as\nthe process to be supervised. Our framework leverages concrete execution\nsignals to ground the supervision of reasoning steps, while using\ntree-structured exploration to maintain multiple solution trajectories\nsimultaneously. Experiments demonstrate that our approach enables even smaller\nmodels to achieve high success accuracy and performance metrics on competitive\nprogramming tasks, creates more reliable verification than traditional reward\nmodels without requiring training PRMs. Our approach achieves significant\nimprovements across 5 models and 3 datasets: an average of 26.9% increase in\ncorrectness and 42.2% in efficiency. The results suggest that providing\nstructured reasoning space with concrete verification signals is crucial for\nsolving complex programming tasks. We open-source all our code and data at:\nhttps://github.com/zhuohaoyu/ORPS\n","authors":["Zhuohao Yu","Weizheng Gu","Yidong Wang","Zhengran Zeng","Jindong Wang","Wei Ye","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.15118v1.pdf","comment":"18 pages, 5 figures, Code: https://github.com/zhuohaoyu/ORPS"},{"id":"http://arxiv.org/abs/2409.18472v2","updated":"2024-12-19T17:57:43Z","published":"2024-09-27T06:18:55Z","title":"URIEL+: Enhancing Linguistic Inclusion and Usability in a Typological\n and Multilingual Knowledge Base","summary":" URIEL is a knowledge base offering geographical, phylogenetic, and\ntypological vector representations for 7970 languages. It includes distance\nmeasures between these vectors for 4005 languages, which are accessible via the\nlang2vec tool. Despite being frequently cited, URIEL is limited in terms of\nlinguistic inclusion and overall usability. To tackle these challenges, we\nintroduce URIEL+, an enhanced version of URIEL and lang2vec that addresses\nthese limitations. In addition to expanding typological feature coverage for\n2898 languages, URIEL+ improves the user experience with robust, customizable\ndistance calculations to better suit the needs of users. These upgrades also\noffer competitive performance on downstream tasks and provide distances that\nbetter align with linguistic distance studies.\n","authors":["Aditya Khan","Mason Shipton","David Anugraha","Kaiyao Duan","Phuong H. Hoang","Eric Khiu","A. 
Seza Doğruöz","En-Shiun Annie Lee"],"pdf_url":"https://arxiv.org/pdf/2409.18472v2.pdf","comment":"Accepted to COLING 2025"},{"id":"http://arxiv.org/abs/2412.04619v3","updated":"2024-12-19T17:51:34Z","published":"2024-12-05T21:12:37Z","title":"Sometimes I am a Tree: Data Drives Unstable Hierarchical Generalization","summary":" Language models (LMs), like other neural networks, often favor shortcut\nheuristics based on surface-level patterns. Although LMs behave like n-gram\nmodels early in training, they must eventually learn hierarchical syntactic\nrepresentations to correctly apply grammatical rules out-of-distribution (OOD).\nIn this work, we use case studies of English grammar to explore how complex,\ndiverse training data drives models to generalize OOD. We construct a framework\nthat unifies our understanding of random variation with training dynamics, rule\nselection with memorization, and data diversity with complexity. We show that\nthese factors are nuanced, and that intermediate levels of diversity and\ncomplexity lead to inconsistent behavior across random seeds and to unstable\ntraining dynamics. Our findings emphasize the critical role of training data in\nshaping generalization patterns and illuminate how competing model strategies\nlead to inconsistent generalization outcomes across random seeds. Code is\navailable at https://github.com/sunnytqin/concept_comp.git.\n","authors":["Tian Qin","Naomi Saphra","David Alvarez-Melis"],"pdf_url":"https://arxiv.org/pdf/2412.04619v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15100v1","updated":"2024-12-19T17:48:03Z","published":"2024-12-19T17:48:03Z","title":"Tests for model misspecification in simulation-based inference: from\n local distortions to global model checks","summary":" Model misspecification analysis strategies, such as anomaly detection, model\nvalidation, and model comparison are a key component of scientific model\ndevelopment. Over the last few years, there has been a rapid rise in the use of\nsimulation-based inference (SBI) techniques for Bayesian parameter estimation,\napplied to increasingly complex forward models. To move towards fully\nsimulation-based analysis pipelines, however, there is an urgent need for a\ncomprehensive simulation-based framework for model misspecification analysis.\nIn this work, we provide a solid and flexible foundation for a wide range of\nmodel discrepancy analysis tasks, using distortion-driven model\nmisspecification tests. From a theoretical perspective, we introduce the\nstatistical framework built around performing many hypothesis tests for\ndistortions of the simulation model. We also make explicit analytic connections\nto classical techniques: anomaly detection, model validation, and\ngoodness-of-fit residual analysis. Furthermore, we introduce an efficient\nself-calibrating training algorithm that is useful for practitioners. We\ndemonstrate the performance of the framework in multiple scenarios, making the\nconnection to classical results where they are valid. Finally, we show how to\nconduct such a distortion-driven model misspecification test for real\ngravitational wave data, specifically on the event GW150914.\n","authors":["Noemi Anau Montel","James Alvey","Christoph Weniger"],"pdf_url":"https://arxiv.org/pdf/2412.15100v1.pdf","comment":"11 pages, 5 figures. 
Code available on github (NoemiAM/mist) at\n https://github.com/NoemiAM/mist"},{"id":"http://arxiv.org/abs/2412.15095v1","updated":"2024-12-19T17:45:08Z","published":"2024-12-19T17:45:08Z","title":"A Full Transformer-based Framework for Automatic Pain Estimation using\n Videos","summary":" The automatic estimation of pain is essential in designing an optimal pain\nmanagement system offering reliable assessment and reducing the suffering of\npatients. In this study, we present a novel full transformer-based framework\nconsisting of a Transformer in Transformer (TNT) model and a Transformer\nleveraging cross-attention and self-attention blocks. Elaborating on videos\nfrom the BioVid database, we demonstrate state-of-the-art performances, showing\nthe efficacy, efficiency, and generalization capability across all the primary\npain estimation tasks.\n","authors":["Stefanos Gkikas","Manolis Tsiknakis"],"pdf_url":"https://arxiv.org/pdf/2412.15095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15086v1","updated":"2024-12-19T17:33:56Z","published":"2024-12-19T17:33:56Z","title":"Learning Disentangled Equivariant Representation for Explicitly\n Controllable 3D Molecule Generation","summary":" We consider the conditional generation of 3D drug-like molecules with\n\\textit{explicit control} over molecular properties such as drug-like\nproperties (e.g., Quantitative Estimate of Druglikeness or Synthetic\nAccessibility score) and effectively binding to specific protein sites. To\ntackle this problem, we propose an E(3)-equivariant Wasserstein autoencoder and\nfactorize the latent space of our generative model into two disentangled\naspects: molecular properties and the remaining structural context of 3D\nmolecules. Our model ensures explicit control over these molecular attributes\nwhile maintaining equivariance of coordinate representation and invariance of\ndata likelihood. Furthermore, we introduce a novel alignment-based coordinate\nloss to adapt equivariant networks for auto-regressive de-novo 3D molecule\ngeneration from scratch. Extensive experiments validate our model's\neffectiveness on property-guided and context-guided molecule generation, both\nfor de-novo 3D molecule design and structure-based drug discovery against\nprotein targets.\n","authors":["Haoran Liu","Youzhi Luo","Tianxiao Li","James Caverlee","Martin Renqiang Min"],"pdf_url":"https://arxiv.org/pdf/2412.15086v1.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2412.15084v1","updated":"2024-12-19T17:29:44Z","published":"2024-12-19T17:29:44Z","title":"AceMath: Advancing Frontier Math Reasoning with Post-Training and Reward\n Modeling","summary":" In this paper, we introduce AceMath, a suite of frontier math models that\nexcel in solving complex math problems, along with highly effective reward\nmodels capable of evaluating generated solutions and reliably identifying the\ncorrect ones. To develop the instruction-tuned math models, we propose a\nsupervised fine-tuning (SFT) process that first achieves competitive\nperformance across general domains, followed by targeted fine-tuning for the\nmath domain using a carefully curated set of prompts and synthetically\ngenerated responses. The resulting model, AceMath-72B-Instruct greatly\noutperforms Qwen2.5-Math-72B-Instruct, GPT-4o and Claude-3.5 Sonnet. To develop\nmath-specialized reward model, we first construct AceMath-RewardBench, a\ncomprehensive and robust benchmark for evaluating math reward models across\ndiverse problems and difficulty levels. 
After that, we present a systematic\napproach to build our math reward models. The resulting model, AceMath-72B-RM,\nconsistently outperforms state-of-the-art reward models. Furthermore, when\ncombining AceMath-72B-Instruct with AceMath-72B-RM, we achieve the highest\naverage rm@8 score across the math reasoning benchmarks. We will release model\nweights, training data, and evaluation benchmarks at:\nhttps://research.nvidia.com/labs/adlr/acemath\n","authors":["Zihan Liu","Yang Chen","Mohammad Shoeybi","Bryan Catanzaro","Wei Ping"],"pdf_url":"https://arxiv.org/pdf/2412.15084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15077v1","updated":"2024-12-19T17:26:07Z","published":"2024-12-19T17:26:07Z","title":"Till the Layers Collapse: Compressing a Deep Neural Network through the\n Lenses of Batch Normalization Layers","summary":" Today, deep neural networks are widely used since they can handle a variety\nof complex tasks. Their generality makes them very powerful tools in modern\ntechnology. However, deep neural networks are often overparameterized. The\nusage of these large models consumes a lot of computation resources. In this\npaper, we introduce a method called \\textbf{T}ill the \\textbf{L}ayers\n\\textbf{C}ollapse (TLC), which compresses deep neural networks through the\nlenses of batch normalization layers. By reducing the depth of these networks,\nour method decreases deep neural networks' computational requirements and\noverall latency. We validate our method on popular models such as Swin-T,\nMobileNet-V2, and RoBERTa, across both image classification and natural\nlanguage processing (NLP) tasks.\n","authors":["Zhu Liao","Nour Hezbri","Victor Quétu","Van-Tam Nguyen","Enzo Tartaglione"],"pdf_url":"https://arxiv.org/pdf/2412.15077v1.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2412.15075v1","updated":"2024-12-19T17:24:15Z","published":"2024-12-19T17:24:15Z","title":"DroughtSet: Understanding Drought Through Spatial-Temporal Learning","summary":" Drought is one of the most destructive and expensive natural disasters,\nseverely impacting natural resources and risks by depleting water resources and\ndiminishing agricultural yields. Under climate change, accurately predicting\ndrought is critical for mitigating drought-induced risks. However, the\nintricate interplay among the physical and biological drivers that regulate\ndroughts limits the predictability and understanding of drought, particularly\nat a subseasonal to seasonal (S2S) time scale. While deep learning has been\ndemonstrated with potential in addressing climate forecasting challenges, its\napplication to drought prediction has received relatively less attention. In\nthis work, we propose a new dataset, DroughtSet, which integrates relevant\npredictive features and three drought indices from multiple remote sensing and\nreanalysis datasets across the contiguous United States (CONUS). DroughtSet\nspecifically provides the machine learning community with a new real-world\ndataset to benchmark drought prediction models and more generally, time-series\nforecasting methods. Furthermore, we propose a spatial-temporal model SPDrought\nto predict and interpret S2S droughts. Our model learns from the spatial and\ntemporal information of physical and biological features to predict three types\nof droughts simultaneously. Multiple strategies are employed to quantify the\nimportance of physical and biological features for drought prediction. 
Our\nresults provide insights for researchers to better understand the\npredictability and sensitivity of drought to biological and physical\nconditions. We aim to contribute to the climate field by proposing a new tool\nto predict and understand the occurrence of droughts and provide the AI\ncommunity with a new benchmark to study deep learning applications in climate\nscience.\n","authors":["Xuwei Tan","Qian Zhao","Yanlan Liu","Xueru Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.15075v1.pdf","comment":"Accepted by AAAI25"},{"id":"http://arxiv.org/abs/2408.15165v2","updated":"2024-12-19T17:11:11Z","published":"2024-08-27T16:03:18Z","title":"Latent Ewald summation for machine learning of long-range interactions","summary":" Machine learning interatomic potentials (MLIPs) often neglect long-range\ninteractions, such as electrostatic and dispersion forces. In this work, we\nintroduce a straightforward and efficient method to account for long-range\ninteractions by learning a latent variable from local atomic descriptors and\napplying an Ewald summation to this variable. We demonstrate that in systems\nincluding charged and polar molecular dimers, bulk water, and water-vapor\ninterface, standard short-ranged MLIPs can lead to unphysical predictions even\nwhen employing message passing. The long-range models effectively eliminate\nthese artifacts, with only about twice the computational cost of short-range\nMLIPs.\n","authors":["Bingqing Cheng"],"pdf_url":"https://arxiv.org/pdf/2408.15165v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15058v1","updated":"2024-12-19T17:06:53Z","published":"2024-12-19T17:06:53Z","title":"MultiverSeg: Scalable Interactive Segmentation of Biomedical Imaging\n Datasets with In-Context Guidance","summary":" Medical researchers and clinicians often need to perform novel segmentation\ntasks on a set of related images. Existing methods for segmenting a new dataset\nare either interactive, requiring substantial human effort for each image, or\nrequire an existing set of manually labeled images. We introduce a system,\nMultiverSeg, that enables practitioners to rapidly segment an entire new\ndataset without requiring access to any existing labeled data from that task or\ndomain. Along with the image to segment, the model takes user interactions such\nas clicks, bounding boxes or scribbles as input, and predicts a segmentation.\nAs the user segments more images, those images and segmentations become\nadditional inputs to the model, providing context. As the context set of\nlabeled images grows, the number of interactions required to segment each new\nimage decreases. We demonstrate that MultiverSeg enables users to interactively\nsegment new datasets efficiently, by amortizing the number of interactions per\nimage to achieve an accurate segmentation. Compared to using a state-of-the-art\ninteractive segmentation method, using MultiverSeg reduced the total number of\nscribble steps by 53% and clicks by 36% to achieve 90% Dice on sets of images\nfrom unseen tasks. We release code and model weights at\nhttps://multiverseg.csail.mit.edu\n","authors":["Hallee E. Wong","Jose Javier Gonzalez Ortiz","John Guttag","Adrian V. 
Dalca"],"pdf_url":"https://arxiv.org/pdf/2412.15058v1.pdf","comment":"Project Website: https://multiverseg.csail.mit.edu Keywords:\n interactive segmentation, in-context learning, medical image analysis,\n biomedical imaging, image annotation, visual prompting"},{"id":"http://arxiv.org/abs/2407.17710v2","updated":"2024-12-19T16:48:59Z","published":"2024-07-25T02:05:15Z","title":"Revisiting Machine Unlearning with Dimensional Alignment","summary":" Machine unlearning, an emerging research topic focusing on compliance with\ndata privacy regulations, enables trained models to remove the information\nlearned from specific data. While many existing methods indirectly address this\nissue by intentionally injecting incorrect supervisions, they can drastically\nand unpredictably alter the decision boundaries and feature spaces, leading to\ntraining instability and undesired side effects. To fundamentally approach this\ntask, we first analyze the changes in latent feature spaces between original\nand retrained models, and observe that the feature representations of samples\nnot involved in training are closely aligned with the feature manifolds of\npreviously seen samples in training. Based on these findings, we introduce a\nnovel evaluation metric for machine unlearning, coined dimensional alignment,\nwhich measures the alignment between the eigenspaces of the forget and retain\nset samples. We employ this metric as a regularizer loss to build a robust and\nstable unlearning framework, which is further enhanced by integrating a\nself-distillation loss and an alternating training scheme. Our framework\neffectively eliminates information from the forget set and preserves knowledge\nfrom the retain set. Lastly, we identify critical flaws in established\nevaluation metrics for machine unlearning, and introduce new evaluation tools\nthat more accurately reflect the fundamental goals of machine unlearning.\n","authors":["Seonguk Seo","Dongwan Kim","Bohyung Han"],"pdf_url":"https://arxiv.org/pdf/2407.17710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.03767v2","updated":"2024-12-19T16:45:52Z","published":"2023-01-10T03:10:32Z","title":"Metric Compatible Training for Online Backfilling in Large-Scale\n Retrieval","summary":" Backfilling is the process of re-extracting all gallery embeddings from\nupgraded models in image retrieval systems. It inevitably requires a\nprohibitively large amount of computational cost and even entails the downtime\nof the service. Although backward-compatible learning sidesteps this challenge\nby tackling query-side representations, this leads to suboptimal solutions in\nprinciple because gallery embeddings cannot benefit from model upgrades. We\naddress this dilemma by introducing an online backfilling algorithm, which\nenables us to achieve a progressive performance improvement during the\nbackfilling process while not sacrificing the final performance of new model\nafter the completion of backfilling. To this end, we first propose a simple\ndistance rank merge technique for online backfilling. Then, we incorporate a\nreverse transformation module for more effective and efficient merging, which\nis further enhanced by adopting a metric-compatible contrastive learning\napproach. These two components help to make the distances of old and new models\ncompatible, resulting in desirable merge results during backfilling with no\nextra computational overhead. 
Extensive experiments show the effectiveness of\nour framework on four standard benchmarks in various settings.\n","authors":["Seonguk Seo","Mustafa Gokhan Uzunbas","Bohyung Han","Sara Cao","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2301.03767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15032v1","updated":"2024-12-19T16:44:01Z","published":"2024-12-19T16:44:01Z","title":"DCTdiff: Intriguing Properties of Image Generative Modeling in the DCT\n Space","summary":" This paper explores image modeling from the frequency space and introduces\nDCTdiff, an end-to-end diffusion generative paradigm that efficiently models\nimages in the discrete cosine transform (DCT) space. We investigate the design\nspace of DCTdiff and reveal the key design factors. Experiments on different\nframeworks (UViT, DiT), generation tasks, and various diffusion samplers\ndemonstrate that DCTdiff outperforms pixel-based diffusion models regarding\ngenerative quality and training efficiency. Remarkably, DCTdiff can seamlessly\nscale up to high-resolution generation without using the latent diffusion\nparadigm. Finally, we illustrate several intriguing properties of DCT image\nmodeling. For example, we provide a theoretical proof of why `image diffusion\ncan be seen as spectral autoregression', bridging the gap between diffusion and\nautoregressive models. The effectiveness of DCTdiff and the introduced\nproperties suggest a promising direction for image modeling in the frequency\nspace. The code is at \\url{https://github.com/forever208/DCTdiff}.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Haozhe Jia","Lanmiao Liu","Martin Beneš","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2412.15032v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2406.14742v2","updated":"2024-12-19T16:37:38Z","published":"2024-06-20T21:13:39Z","title":"Latent Variable Sequence Identification for Cognitive Models with Neural\n Network Estimators","summary":" Extracting time-varying latent variables from computational cognitive models\nis a key step in model-based neural analysis, which aims to understand the\nneural correlates of cognitive processes. However, existing methods only allow\nresearchers to infer latent variables that explain subjects' behavior in a\nrelatively small class of cognitive models. For example, a broad class of\nrelevant cognitive models with analytically intractable likelihood is currently\nout of reach from standard techniques, based on Maximum a Posteriori parameter\nestimation. Here, we present an approach that extends neural Bayes estimation\nto learn a direct mapping between experimental data and the targeted latent\nvariable space using recurrent neural networks and simulated datasets. We show\nthat our approach achieves competitive performance in inferring latent variable\nsequences in both tractable and intractable models. Furthermore, the approach\nis generalizable across different computational models and is adaptable for\nboth continuous and discrete latent spaces. We then demonstrate its\napplicability in real world datasets. 
Our work underscores that combining\nrecurrent neural networks and simulation-based inference to identify latent\nvariable sequences can enable researchers to access a wider class of cognitive\nmodels for model-based neural analyses, and thus test a broader set of\ntheories.\n","authors":["Ti-Fen Pan","Jing-Jing Li","Bill Thompson","Anne Collins"],"pdf_url":"https://arxiv.org/pdf/2406.14742v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15023v1","updated":"2024-12-19T16:37:19Z","published":"2024-12-19T16:37:19Z","title":"Stable-V2A: Synthesis of Synchronized Sound Effects with Temporal and\n Semantic Controls","summary":" Sound designers and Foley artists usually sonorize a scene, such as from a\nmovie or video game, by manually annotating and sonorizing each action of\ninterest in the video. In our case, the intent is to leave full creative\ncontrol to sound designers with a tool that allows them to bypass the more\nrepetitive parts of their work, thus being able to focus on the creative\naspects of sound production. We achieve this presenting Stable-V2A, a two-stage\nmodel consisting of: an RMS-Mapper that estimates an envelope representative of\nthe audio characteristics associated with the input video; and Stable-Foley, a\ndiffusion model based on Stable Audio Open that generates audio semantically\nand temporally aligned with the target video. Temporal alignment is guaranteed\nby the use of the envelope as a ControlNet input, while semantic alignment is\nachieved through the use of sound representations chosen by the designer as\ncross-attention conditioning of the diffusion process. We train and test our\nmodel on Greatest Hits, a dataset commonly used to evaluate V2A models. In\naddition, to test our model on a case study of interest, we introduce Walking\nThe Maps, a dataset of videos extracted from video games depicting animated\ncharacters walking in different locations. Samples and code available on our\ndemo page at https://ispamm.github.io/Stable-V2A.\n","authors":["Riccardo Fosco Gramaccioni","Christian Marinoni","Emilian Postolache","Marco Comunità","Luca Cosmo","Joshua D. Reiss","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2412.15023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14512v3","updated":"2024-12-19T16:37:00Z","published":"2024-08-25T04:32:45Z","title":"LLMs as Zero-shot Graph Learners: Alignment of GNN Representations with\n LLM Token Embeddings","summary":" Zero-shot graph machine learning, especially with graph neural networks\n(GNNs), has garnered significant interest due to the challenge of scarce\nlabeled data. While methods like self-supervised learning and graph prompt\nlearning have been extensively explored, they often rely on fine-tuning with\ntask-specific labels, limiting their effectiveness in zero-shot scenarios.\nInspired by the zero-shot capabilities of instruction-fine-tuned large language\nmodels (LLMs), we introduce a novel framework named Token Embedding-Aligned\nGraph Language Model (TEA-GLM) that leverages LLMs as cross-dataset and\ncross-task zero-shot learners for graph machine learning. Concretely, we\npretrain a GNN, aligning its representations with token embeddings of an LLM.\nWe then train a linear projector that transforms the GNN's representations into\na fixed number of graph token embeddings without tuning the LLM. A unified\ninstruction is designed for various graph tasks at different levels, such as\nnode classification (node-level) and link prediction (edge-level). 
These design\nchoices collectively enhance our method's effectiveness in zero-shot learning,\nsetting it apart from existing methods. Experiments show that our graph token\nembeddings help the LLM predictor achieve state-of-the-art performance on\nunseen datasets and tasks compared to other methods using LLMs as predictors.\n","authors":["Duo Wang","Yuan Zuo","Fengzhi Li","Junjie Wu"],"pdf_url":"https://arxiv.org/pdf/2408.14512v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15557v2","updated":"2024-12-19T16:32:43Z","published":"2024-05-24T13:44:30Z","title":"Learning from Linear Algebra: A Graph Neural Network Approach to\n Preconditioner Design for Conjugate Gradient Solvers","summary":" Large linear systems are ubiquitous in modern computational science and\nengineering. The main recipe for solving them is the use of Krylov subspace\niterative methods with well-designed preconditioners. Deep learning models can\nbe used as nonlinear preconditioners during the iteration of linear solvers\nsuch as the conjugate gradient (CG) method. Neural network models require an\nenormous number of parameters to approximate well in this setup. Another\napproach is to take advantage of small graph neural networks (GNNs) to\nconstruct preconditioners with predefined sparsity patterns. Recently, GNNs\nhave been shown to be a promising tool for designing preconditioners to reduce\nthe overall computational cost of iterative methods by constructing them more\nefficiently than with classical linear algebra techniques. However,\npreconditioners designed with these approaches cannot outperform those designed\nwith classical methods in terms of the number of iterations in CG. In our work,\nwe recall well-established preconditioners from linear algebra and use them as\na starting point for training the GNN to obtain preconditioners that reduce the\ncondition number of the system more significantly. Numerical experiments show\nthat our approach outperforms both classical and neural network-based methods\nfor an important class of parametric partial differential equations. We also\nprovide a heuristic justification for the loss function used and show that\npreconditioners obtained by learning with this loss function reduce the\ncondition number in a more desirable way for CG.\n","authors":["Vladislav Trifonov","Alexander Rudikov","Oleg Iliev","Yuri M. Laevsky","Ivan Oseledets","Ekaterina Muravleva"],"pdf_url":"https://arxiv.org/pdf/2405.15557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15010v1","updated":"2024-12-19T16:22:37Z","published":"2024-12-19T16:22:37Z","title":"Robust Federated Learning in the Face of Covariate Shift: A Magnitude\n Pruning with Hybrid Regularization Framework for Enhanced Model Aggregation","summary":" The development of highly sophisticated neural networks has allowed for fast\nprogress in every field of computer vision, however, applications where\nannotated data is prohibited due to privacy or security concerns remain\nchallenging. Federated Learning (FL) offers a promising framework for\nindividuals aiming to collaboratively develop a shared model while preserving\ndata privacy. Nevertheless, our findings reveal that variations in data\ndistribution among clients can profoundly affect FL methodologies, primarily\ndue to instabilities in the aggregation process. 
We also propose a novel FL\nframework to mitigate the adverse effects of covariate shifts among federated\nclients by combining individual parameter pruning and regularization techniques\nto improve the robustness of individual clients' models to aggregate. Each\nclient's model is optimized through magnitude-based pruning and the addition of\ndropout and noise injection layers to build more resilient decision pathways in\nthe networks and improve the robustness of the model's parameter aggregation\nstep. The proposed framework is capable of extracting robust representations\neven in the presence of very large covariate shifts among client data\ndistributions and in the federation of a small number of clients. Empirical\nfindings substantiate the effectiveness of our proposed methodology across\ncommon benchmark datasets, including CIFAR10, MNIST, SVHN, and Fashion MNIST.\nFurthermore, we introduce the CelebA-Gender dataset, specifically designed to\nevaluate performance on a more realistic domain. The proposed method is capable\nof extracting robust representations even in the presence of both high and low\ncovariate shifts among client data distributions.\n","authors":["Ozgu Goksu","Nicolas Pugeault"],"pdf_url":"https://arxiv.org/pdf/2412.15010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15005v1","updated":"2024-12-19T16:20:42Z","published":"2024-12-19T16:20:42Z","title":"DisCo: Graph-Based Disentangled Contrastive Learning for Cold-Start\n Cross-Domain Recommendation","summary":" Recommender systems are widely used in various real-world applications, but\nthey often encounter the persistent challenge of the user cold-start problem.\nCross-domain recommendation (CDR), which leverages user interactions from one\ndomain to improve prediction performance in another, has emerged as a promising\nsolution. However, users with similar preferences in the source domain may\nexhibit different interests in the target domain. Therefore, directly\ntransferring embeddings may introduce irrelevant source-domain collaborative\ninformation. In this paper, we propose a novel graph-based disentangled\ncontrastive learning framework to capture fine-grained user intent and filter\nout irrelevant collaborative information, thereby avoiding negative transfer.\nSpecifically, for each domain, we use a multi-channel graph encoder to capture\ndiverse user intents. We then construct the affinity graph in the embedding\nspace and perform multi-step random walks to capture high-order user similarity\nrelationships. Treating one domain as the target, we propose a disentangled\nintent-wise contrastive learning approach, guided by user similarity, to refine\nthe bridging of user intents across domains. Extensive experiments on four\nbenchmark CDR datasets demonstrate that DisCo consistently outperforms existing\nstate-of-the-art baselines, thereby validating the effectiveness of both DisCo\nand its components.\n","authors":["Hourun Li","Yifan Wang","Zhiping Xiao","Jia Yang","Changling Zhou","Ming Zhang","Wei Ju"],"pdf_url":"https://arxiv.org/pdf/2412.15005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12012v5","updated":"2024-12-19T16:15:27Z","published":"2024-01-22T14:59:11Z","title":"TurboSVM-FL: Boosting Federated Learning through SVM Aggregation for\n Lazy Clients","summary":" Federated learning is a distributed collaborative machine learning paradigm\nthat has gained strong momentum in recent years. 
In federated learning, a\ncentral server periodically coordinates models with clients and aggregates the\nmodels trained locally by clients without necessitating access to local data.\nDespite its potential, the implementation of federated learning continues to\nencounter several challenges, predominantly the slow convergence that is\nlargely due to data heterogeneity. The slow convergence becomes particularly\nproblematic in cross-device federated learning scenarios where clients may be\nstrongly limited by computing power and storage space, and hence counteracting\nmethods that induce additional computation or memory cost on the client side\nsuch as auxiliary objective terms and larger training iterations can be\nimpractical. In this paper, we propose a novel federated aggregation strategy,\nTurboSVM-FL, that poses no additional computation burden on the client side and\ncan significantly accelerate convergence for federated classification task,\nespecially when clients are \"lazy\" and train their models solely for few epochs\nfor next global aggregation. TurboSVM-FL extensively utilizes support vector\nmachine to conduct selective aggregation and max-margin spread-out\nregularization on class embeddings. We evaluate TurboSVM-FL on multiple\ndatasets including FEMNIST, CelebA, and Shakespeare using user-independent\nvalidation with non-iid data distribution. Our results show that TurboSVM-FL\ncan significantly outperform existing popular algorithms on convergence rate\nand reduce communication rounds while delivering better test metrics including\naccuracy, F1 score, and MCC.\n","authors":["Mengdi Wang","Anna Bodonhelyi","Efe Bozkir","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2401.12012v5.pdf","comment":"Proceedings of the AAAI Conference on Artificial Intelligence 2024\n (AAAI'24)"},{"id":"http://arxiv.org/abs/2405.08044v2","updated":"2024-12-19T16:08:31Z","published":"2024-05-13T13:55:34Z","title":"Mitigating federated learning contribution allocation instability\n through randomized aggregation","summary":" Federated learning (FL) is a collaborative and privacy-preserving Machine\nLearning paradigm, allowing the development of robust models without the need\nto centralise sensitive data. A critical challenge in FL lies in fairly and\naccurately allocating contributions from diverse participants. Inaccurate\nallocation can undermine trust, lead to unfair compensation, and thus\nparticipants may lack the incentive to join or actively contribute to the\nfederation.\n Various remuneration strategies have been proposed to date, including\nauction-based approaches and Shapley-value based methods, the latter offering a\nmeans to quantify the contribution of each participant. However, little to no\nwork has studied the stability of these contribution evaluation methods.\n In this paper, we focus on calculating contributions using gradient-based\nmodel reconstruction techniques with Shapley values. We first show that\nbaseline Shapley values do not accurately reflect clients' contributions,\nleading to unstable reward allocations amongst participants in a cross-silo\nfederation. 
We then introduce \\textsc{FedRandom}, a new method that mitigates\nthese shortcomings with additional data samplings, and show its efficacy at\nincreasing the stability of contribution evaluation in federated learning.\n","authors":["Arno Geimer","Beltran Fiz","Radu State"],"pdf_url":"https://arxiv.org/pdf/2405.08044v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08606v2","updated":"2024-12-19T16:05:31Z","published":"2024-02-13T17:12:01Z","title":"Arbitrary Polynomial Separations in Trainable Quantum Machine Learning","summary":" Recent theoretical results in quantum machine learning have demonstrated a\ngeneral trade-off between the expressive power of quantum neural networks\n(QNNs) and their trainability; as a corollary of these results, practical\nexponential separations in expressive power over classical machine learning\nmodels are believed to be infeasible as such QNNs take a time to train that is\nexponential in the model size. We here circumvent these negative results by\nconstructing a hierarchy of efficiently trainable QNNs that exhibit\nunconditionally provable, polynomial memory separations of arbitrary constant\ndegree over classical neural networks -- including state-of-the-art models,\nsuch as Transformers -- in performing a classical sequence modeling task. This\nconstruction is also computationally efficient, as each unit cell of the\nintroduced class of QNNs only has constant gate complexity. We show that\ncontextuality -- informally, a quantitative notion of semantic ambiguity -- is\nthe source of the expressivity separation, suggesting that other learning tasks\nwith this property may be a natural setting for the use of quantum learning\nalgorithms.\n","authors":["Eric R. Anschuetz","Xun Gao"],"pdf_url":"https://arxiv.org/pdf/2402.08606v2.pdf","comment":"24 pages, 3 figures, strengthened and simplified results and\n presentation"},{"id":"http://arxiv.org/abs/2412.14988v1","updated":"2024-12-19T16:00:10Z","published":"2024-12-19T16:00:10Z","title":"Stitch Contrast and Segment_Learning a Human Action Segmentation Model\n Using Trimmed Skeleton Videos","summary":" Existing skeleton-based human action classification models rely on\nwell-trimmed action-specific skeleton videos for both training and testing,\nprecluding their scalability to real-world applications where untrimmed videos\nexhibiting concatenated actions are predominant. To overcome this limitation,\nrecently introduced skeleton action segmentation models involve untrimmed\nskeleton videos into end-to-end training. The model is optimized to provide\nframe-wise predictions for any length of testing videos, simultaneously\nrealizing action localization and classification. Yet, achieving such an\nimprovement imposes frame-wise annotated skeleton videos, which remains\ntime-consuming in practice. This paper features a novel framework for\nskeleton-based action segmentation trained on short trimmed skeleton videos,\nbut that can run on longer untrimmed videos. The approach is implemented in\nthree steps: Stitch, Contrast, and Segment. First, Stitch proposes a temporal\nskeleton stitching scheme that treats trimmed skeleton videos as elementary\nhuman motions that compose a semantic space and can be sampled to generate\nmulti-action stitched sequences. Contrast learns contrastive representations\nfrom stitched sequences with a novel discrimination pretext task that enables a\nskeleton encoder to learn meaningful action-temporal contexts to improve action\nsegmentation. 
Finally, Segment relates the proposed method to action\nsegmentation by learning a segmentation layer while handling particular data\navailability. Experiments involve a trimmed source dataset and an untrimmed\ntarget dataset in an adaptation formulation for real-world skeleton-based human\naction segmentation to evaluate the effectiveness of the proposed method.\n","authors":["Haitao Tian","Pierre Payeur"],"pdf_url":"https://arxiv.org/pdf/2412.14988v1.pdf","comment":"Accepted as AAAI 2025"},{"id":"http://arxiv.org/abs/2412.08941v3","updated":"2024-12-19T15:59:19Z","published":"2024-12-12T05:08:05Z","title":"Optimized Gradient Clipping for Noisy Label Learning","summary":" Previous research has shown that constraining the gradient of loss function\nwith respect to model-predicted probabilities can enhance the model robustness\nagainst noisy labels. These methods typically specify a fixed optimal threshold\nfor gradient clipping through validation data to obtain the desired robustness\nagainst noise. However, this common practice overlooks the dynamic distribution\nof gradients from both clean and noisy-labeled samples at different stages of\ntraining, significantly limiting the model capability to adapt to the variable\nnature of gradients throughout the training process. To address this issue, we\npropose a simple yet effective approach called Optimized Gradient Clipping\n(OGC), which dynamically adjusts the clipping threshold based on the ratio of\nnoise gradients to clean gradients after clipping, estimated by modeling the\ndistributions of clean and noisy samples. This approach allows us to modify the\nclipping threshold at each training step, effectively controlling the influence\nof noise gradients. Additionally, we provide statistical analysis to certify\nthe noise-tolerance ability of OGC. Our extensive experiments across various\ntypes of label noise, including symmetric, asymmetric, instance-dependent, and\nreal-world noise, demonstrate the effectiveness of our approach.\n","authors":["Xichen Ye","Yifan Wu","Weizhong Zhang","Xiaoqiang Li","Yifan Chen","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2412.08941v3.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2404.15379v3","updated":"2024-12-19T15:54:17Z","published":"2024-04-23T07:16:13Z","title":"Clustering of timed sequences -- Application to the analysis of care\n pathways","summary":" Improving the future of healthcare starts by better understanding the current\nactual practices in hospital settings. This motivates the objective of\ndiscovering typical care pathways from patient data. Revealing typical care\npathways can be achieved through clustering. The difficulty in clustering care\npathways, represented by sequences of timestamped events, lies in defining a\nsemantically appropriate metric and clustering algorithms. In this article, we\nadapt two methods developed for time series to the clustering of timed\nsequences: the drop-DTW metric and the DBA approach for the construction of\naveraged time sequences. These methods are then applied in clustering\nalgorithms to propose original and sound clustering algorithms for timed\nsequences. 
This approach is experimented with and evaluated on synthetic and\nreal-world data.\n","authors":["Thomas Guyet","Pierre Pinson","Enoal Gesny"],"pdf_url":"https://arxiv.org/pdf/2404.15379v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01420v2","updated":"2024-12-19T15:51:33Z","published":"2024-12-02T12:00:27Z","title":"Task Adaptation of Reinforcement Learning-based NAS Agents through\n Transfer Learning","summary":" Recently, a novel paradigm has been proposed for reinforcement learning-based\nNAS agents, that revolves around the incremental improvement of a given\narchitecture. We assess the abilities of such reinforcement learning agents to\ntransfer between different tasks. We perform our evaluation using the\nTrans-NASBench-101 benchmark, and consider the efficacy of the transferred\nagents, as well as how quickly they can be trained. We find that pretraining an\nagent on one task benefits the performance of the agent in another task in all\nbut 1 task when considering final performance. We also show that the training\nprocedure for an agent can be shortened significantly by pretraining it on\nanother task. Our results indicate that these effects occur regardless of the\nsource or target task, although they are more pronounced for some tasks than\nfor others. Our results show that transfer learning can be an effective tool in\nmitigating the computational cost of the initial training procedure for\nreinforcement learning-based NAS agents.\n","authors":["Amber Cassimon","Siegfried Mercelis","Kevin Mets"],"pdf_url":"https://arxiv.org/pdf/2412.01420v2.pdf","comment":"15 Pages, 13 Figures, Corrected data in Figure 5"},{"id":"http://arxiv.org/abs/2412.14964v1","updated":"2024-12-19T15:44:01Z","published":"2024-12-19T15:44:01Z","title":"Knowledge Injection via Prompt Distillation","summary":" In many practical applications, large language models (LLMs) need to\nincorporate new knowledge not present in their pre-training data. The primary\nmethods for this are fine-tuning and retrieval-augmented generation (RAG).\nAlthough RAG has emerged as the industry standard for knowledge injection,\nfine-tuning has not yet achieved comparable success. In this paper, we propose\na new fine-tuning technique for learning new knowledge and show that it can\nreach the performance of RAG. The proposed method is based on the\nself-distillation approach, which we call prompt distillation. First, we\ngenerate question-answer pairs about the new knowledge. Then, we fine-tune a\nstudent model on the question-answer pairs to imitate the output distributions\nof a teacher model, which additionally receives the new knowledge in its\nprompt. The student model is identical to the teacher, except it is equipped\nwith a LoRA adapter. This training procedure facilitates distilling the new\nknowledge from the teacher's prompt into the student's weights.\n","authors":["Kalle Kujanpää","Harri Valpola","Alexander Ilin"],"pdf_url":"https://arxiv.org/pdf/2412.14964v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2412.14963v1","updated":"2024-12-19T15:43:05Z","published":"2024-12-19T15:43:05Z","title":"IDOL: Instant Photorealistic 3D Human Creation from a Single Image","summary":" Creating a high-fidelity, animatable 3D full-body avatar from a single image\nis a challenging task due to the diverse appearance and poses of humans and the\nlimited availability of high-quality training data. 
To achieve fast and\nhigh-quality human reconstruction, this work rethinks the task from the\nperspectives of dataset, model, and representation. First, we introduce a\nlarge-scale HUman-centric GEnerated dataset, HuGe100K, consisting of 100K\ndiverse, photorealistic sets of human images. Each set contains 24-view frames\nin specific human poses, generated using a pose-controllable\nimage-to-multi-view model. Next, leveraging the diversity in views, poses, and\nappearances within HuGe100K, we develop a scalable feed-forward transformer\nmodel to predict a 3D human Gaussian representation in a uniform space from a\ngiven human image. This model is trained to disentangle human pose, body shape,\nclothing geometry, and texture. The estimated Gaussians can be animated without\npost-processing. We conduct comprehensive experiments to validate the\neffectiveness of the proposed dataset and method. Our model demonstrates the\nability to efficiently reconstruct photorealistic humans at 1K resolution from\na single input image using a single GPU instantly. Additionally, it seamlessly\nsupports various applications, as well as shape and texture editing tasks.\n","authors":["Yiyu Zhuang","Jiaxi Lv","Hao Wen","Qing Shuai","Ailing Zeng","Hao Zhu","Shifeng Chen","Yujiu Yang","Xun Cao","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2412.14963v1.pdf","comment":"21 pages, 15 figures, includes main content, supplementary materials,\n and references"},{"id":"http://arxiv.org/abs/2412.03795v2","updated":"2024-12-19T15:43:00Z","published":"2024-12-05T01:25:34Z","title":"Samudra: An AI Global Ocean Emulator for Climate","summary":" AI emulators for forecasting have emerged as powerful tools that can\noutperform conventional numerical predictions. The next frontier is to build\nemulators for long climate simulations with skill across a range of\nspatiotemporal scales, a particularly important goal for the ocean. Our work\nbuilds a skillful global emulator of the ocean component of a state-of-the-art\nclimate model. We emulate key ocean variables, sea surface height, horizontal\nvelocities, temperature, and salinity, across their full depth. We use a\nmodified ConvNeXt UNet architecture trained on multidepth levels of ocean data.\nWe show that the ocean emulator - Samudra - which exhibits no drift relative to\nthe truth, can reproduce the depth structure of ocean variables and their\ninterannual variability. Samudra is stable for centuries and 150 times faster\nthan the original ocean model. Samudra struggles to capture the correct\nmagnitude of the forcing trends and simultaneously remains stable, requiring\nfurther work.\n","authors":["Surya Dheeshjith","Adam Subel","Alistair Adcroft","Julius Busecke","Carlos Fernandez-Granda","Shubham Gupta","Laure Zanna"],"pdf_url":"https://arxiv.org/pdf/2412.03795v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00641v2","updated":"2024-12-19T15:42:00Z","published":"2024-08-01T15:30:43Z","title":"Enhancing Ethereum Fraud Detection via Generative and Contrastive\n Self-supervision","summary":" The rampant fraudulent activities on Ethereum hinder the healthy development\nof the blockchain ecosystem, necessitating the reinforcement of regulations.\nHowever, multiple imbalances involving account interaction frequencies and\ninteraction types in the Ethereum transaction environment pose significant\nchallenges to data mining-based fraud detection research. 
To address this, we\nfirst propose the concept of meta-interactions to refine interaction behaviors\nin Ethereum, and based on this, we present a dual self-supervision enhanced\nEthereum fraud detection framework, named Meta-IFD. This framework initially\nintroduces a generative self-supervision mechanism to augment the interaction\nfeatures of accounts, followed by a contrastive self-supervision mechanism to\ndifferentiate various behavior patterns, and ultimately characterizes the\nbehavioral representations of accounts and mines potential fraud risks through\nmulti-view interaction feature learning. Extensive experiments on real Ethereum\ndatasets demonstrate the effectiveness and superiority of our framework in\ndetecting common Ethereum fraud behaviors such as Ponzi schemes and phishing\nscams. Additionally, the generative module can effectively alleviate the\ninteraction distribution imbalance in Ethereum data, while the contrastive\nmodule significantly enhances the framework's ability to distinguish different\nbehavior patterns. The source code will be available in\nhttps://github.com/GISec-Team/Meta-IFD.\n","authors":["Chenxiang Jin","Jiajun Zhou","Chenxuan Xie","Shanqing Yu","Qi Xuan","Xiaoniu Yang"],"pdf_url":"https://arxiv.org/pdf/2408.00641v2.pdf","comment":"Accepted by IEEE Transactions on Information Forensics & Security"},{"id":"http://arxiv.org/abs/2412.14954v1","updated":"2024-12-19T15:36:30Z","published":"2024-12-19T15:36:30Z","title":"Corn Ear Detection and Orientation Estimation Using Deep Learning","summary":" Monitoring growth behavior of maize plants such as the development of ears\ncan give key insights into the plant's health and development. Traditionally,\nthe measurement of the angle of ears is performed manually, which can be\ntime-consuming and prone to human error. To address these challenges, this\npaper presents a computer vision-based system for detecting and tracking ears\nof corn in an image sequence. The proposed system could accurately detect,\ntrack, and predict the ear's orientation, which can be useful in monitoring\ntheir growth behavior. This can significantly save time compared to manual\nmeasurement and enables additional areas of ear orientation research and\npotential increase in efficiencies for maize production. Using an object\ndetector with keypoint detection, the algorithm proposed could detect 90\npercent of all ears. The cardinal estimation had a mean absolute error (MAE) of\n18 degrees, compared to a mean 15 degree difference between two people\nmeasuring by hand. These results demonstrate the feasibility of using computer\nvision techniques for monitoring maize growth and can lead to further research\nin this area.\n","authors":["Nathan Sprague","John Evans","Michael Mardikes"],"pdf_url":"https://arxiv.org/pdf/2412.14954v1.pdf","comment":"22 pages;15 figures"},{"id":"http://arxiv.org/abs/2411.10958v2","updated":"2024-12-19T15:26:20Z","published":"2024-11-17T04:35:49Z","title":"SageAttention2: Efficient Attention with Thorough Outlier Smoothing and\n Per-thread INT4 Quantization","summary":" Although quantization for linear layers has been widely used, its application\nto accelerate the attention process remains limited. To further enhance the\nefficiency of attention computation compared to SageAttention while maintaining\nprecision, we propose SageAttention2, which utilizes significantly faster 4-bit\nmatrix multiplication (Matmul) alongside additional precision-enhancing\ntechniques. 
First, we propose to quantize matrixes $(Q, K)$ to INT4 in a\nhardware-friendly thread-level granularity and quantize matrixes $(\\widetilde\nP, V)$ to FP8. Second, we propose a method to smooth $Q$, enhancing the\naccuracy of INT4 $QK$. Third, we propose to use an FP32 Matmul buffer for $PV$\nto enhance the accuracy of FP8 $\\widetilde PV$. The operations per second (OPS)\nof SageAttention2 surpass FlashAttention2 and xformers by about 3x and 5x on\nRTX4090, respectively. Comprehensive experiments confirm that our approach\nincurs negligible end-to-end metrics loss across diverse models, including\nthose for large language processing, image generation, and video generation.\nThe codes are available at https://github.com/thu-ml/SageAttention.\n","authors":["Jintao Zhang","Haofeng Huang","Pengle Zhang","Jia Wei","Jun Zhu","Jianfei Chen"],"pdf_url":"https://arxiv.org/pdf/2411.10958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10839v2","updated":"2024-12-19T15:25:41Z","published":"2024-08-20T13:34:17Z","title":"Benchmarking Large Language Models for Math Reasoning Tasks","summary":" The use of Large Language Models (LLMs) in mathematical reasoning has become\na cornerstone of related research, demonstrating the intelligence of these\nmodels and enabling potential practical applications through their advanced\nperformance, such as in educational settings. Despite the variety of datasets\nand in-context learning algorithms designed to improve the ability of LLMs to\nautomate mathematical problem solving, the lack of comprehensive benchmarking\nacross different datasets makes it complicated to select an appropriate model\nfor specific tasks. In this project, we present a benchmark that fairly\ncompares seven state-of-the-art in-context learning algorithms for mathematical\nproblem solving across five widely used mathematical datasets on four powerful\nfoundation models. Furthermore, we explore the trade-off between efficiency and\nperformance, highlighting the practical applications of LLMs for mathematical\nreasoning. Our results indicate that larger foundation models like GPT-4o and\nLLaMA 3-70B can solve mathematical reasoning independently from the concrete\nprompting strategy, while for smaller models the in-context learning approach\nsignificantly influences the performance. Moreover, the optimal prompt depends\non the chosen foundation model. We open-source our benchmark code to support\nthe integration of additional models in future research.\n","authors":["Kathrin Seßler","Yao Rong","Emek Gözlüklü","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2408.10839v2.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2302.11947v2","updated":"2024-12-19T15:13:46Z","published":"2023-02-23T11:44:43Z","title":"Real-Time Damage Detection in Fiber Lifting Ropes Using Lightweight\n Convolutional Neural Networks","summary":" The health and safety hazards posed by worn crane lifting ropes mandate\nperiodic inspection for damage. This task is time-consuming, prone to human\nerror, halts operation, and may result in the premature disposal of ropes.\nTherefore, we propose using efficient deep learning and computer vision methods\nto automate the process of detecting damaged ropes. Specifically, we present a\nvision-based system for detecting damage in synthetic fiber rope images using\nlightweight convolutional neural networks. 
We develop a camera-based apparatus\nto photograph the lifting rope's surface, while in operation, and capture the\nprogressive wear-and-tear as well as the more significant degradation in the\nrope's health state. Experts from Konecranes annotate the collected images in\naccordance with the rope's condition; normal or damaged. Then, we pre-process\nthe images, systematically design a deep learning model, evaluate its detection\nand prediction performance, analyze its computational complexity, and compare\nit with various other models. Experimental results show the proposed model\noutperforms other similar techniques with 96.5% accuracy, 94.8% precision,\n98.3% recall, 96.5% F1-score, and 99.3% AUC. Besides, they demonstrate the\nmodel's real-time operation, low memory footprint, robustness to various\nenvironmental and operational conditions, and adequacy for deployment in\nindustrial applications such as lifting, mooring, towing, climbing, and\nsailing.\n","authors":["Tuomas Jalonen","Mohammad Al-Sa'd","Roope Mellanen","Serkan Kiranyaz","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2302.11947v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09423v3","updated":"2024-12-19T15:10:18Z","published":"2023-07-18T16:43:03Z","title":"Scaling Laws for Imitation Learning in Single-Agent Games","summary":" Imitation Learning (IL) is one of the most widely used methods in machine\nlearning. Yet, many works find it is often unable to fully recover the\nunderlying expert behavior, even in constrained environments like single-agent\ngames. However, none of these works deeply investigate the role of scaling up\nthe model and data size. Inspired by recent work in Natural Language Processing\n(NLP) where \"scaling up\" has resulted in increasingly more capable LLMs, we\ninvestigate whether carefully scaling up model and data size can bring similar\nimprovements in the imitation learning setting for single-agent games. We first\ndemonstrate our findings on a variety of Atari games, and thereafter focus on\nthe extremely challenging game of NetHack. In all games, we find that IL loss\nand mean return scale smoothly with the compute budget (FLOPs) and are strongly\ncorrelated, resulting in power laws for training compute-optimal IL agents.\nFinally, we forecast and train several NetHack agents with IL and find they\noutperform prior state-of-the-art by 1.5x in all settings. Our work both\ndemonstrates the scaling behavior of imitation learning in a variety of\nsingle-agent games, as well as the viability of scaling up current approaches\nfor increasingly capable agents in NetHack, a game that remains elusively hard\nfor current AI systems.\n","authors":["Jens Tuyls","Dhruv Madeka","Kari Torkkola","Dean Foster","Karthik Narasimhan","Sham Kakade"],"pdf_url":"https://arxiv.org/pdf/2307.09423v3.pdf","comment":"Accepted at TMLR 2024"},{"id":"http://arxiv.org/abs/2412.14916v1","updated":"2024-12-19T14:50:10Z","published":"2024-12-19T14:50:10Z","title":"From Point to probabilistic gradient boosting for claim frequency and\n severity prediction","summary":" Gradient boosting for decision tree algorithms are increasingly used in\nactuarial applications as they show superior predictive performance over\ntraditional generalized linear models. Many improvements and sophistications to\nthe first gradient boosting machine algorithm exist. 
We present in a unified\nnotation, and contrast, all the existing point and probabilistic gradient\nboosting for decision tree algorithms: GBM, XGBoost, DART, LightGBM, CatBoost,\nEGBM, PGBM, XGBoostLSS, cyclic GBM, and NGBoost. In this comprehensive\nnumerical study, we compare their performance on five publicly available\ndatasets for claim frequency and severity, of various size and comprising\ndifferent number of (high cardinality) categorical variables. We explain how\nvarying exposure-to-risk can be handled with boosting in frequency models. We\ncompare the algorithms on the basis of computational efficiency, predictive\nperformance, and model adequacy. LightGBM and XGBoostLSS win in terms of\ncomputational efficiency. The fully interpretable EGBM achieves competitive\npredictive performance compared to the black box algorithms considered. We find\nthat there is no trade-off between model adequacy and predictive accuracy: both\nare achievable simultaneously.\n","authors":["Dominik Chevalier","Marie-Pier Côté"],"pdf_url":"https://arxiv.org/pdf/2412.14916v1.pdf","comment":"26 pages, 4 figures, 26 tables, 7 algorithms"},{"id":"http://arxiv.org/abs/2311.18512v2","updated":"2024-12-19T14:46:05Z","published":"2023-11-30T12:40:23Z","title":"Union-over-Intersections: Object Detection beyond Winner-Takes-All","summary":" This paper revisits the problem of predicting box locations in object\ndetection architectures. Typically, each box proposal or box query aims to\ndirectly maximize the intersection-over-union score with the ground truth,\nfollowed by a winner-takes-all non-maximum suppression where only the highest\nscoring box in each region is retained. We observe that both steps are\nsub-optimal: the first involves regressing proposals to the entire ground\ntruth, which is a difficult task even with large receptive fields, and the\nsecond neglects valuable information from boxes other than the top candidate.\nInstead of regressing proposals to the whole ground truth, we propose a simpler\napproach: regress only to the area of intersection between the proposal and the\nground truth. This avoids the need for proposals to extrapolate beyond their\nvisual scope, improving localization accuracy. Rather than adopting a\nwinner-takes-all strategy, we take the union over the regressed intersections\nof all boxes in a region to generate the final box outputs. Our plug-and-play\nmethod integrates seamlessly into proposal-based, grid-based, and query-based\ndetection architectures with minimal modifications, consistently improving\nobject localization and instance segmentation. We demonstrate its broad\napplicability and versatility across various detection and segmentation tasks.\n","authors":["Aritra Bhowmik","Pascal Mettes","Martin R. Oswald","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2311.18512v2.pdf","comment":"17 pages, 6 figures, 12 tables"},{"id":"http://arxiv.org/abs/2412.14075v2","updated":"2024-12-19T14:41:03Z","published":"2024-12-18T17:19:55Z","title":"Online MDP with Transition Prototypes: A Robust Adaptive Approach","summary":" In this work, we consider an online robust Markov Decision Process (MDP)\nwhere we have the information of finitely many prototypes of the underlying\ntransition kernel. We consider an adaptively updated ambiguity set of the\nprototypes and propose an algorithm that efficiently identifies the true\nunderlying transition kernel while guaranteeing the performance of the\ncorresponding robust policy. 
To be more specific, we provide a sublinear regret\nof the subsequent optimal robust policy. We also provide an early stopping\nmechanism and a worst-case performance bound of the value function. In\nnumerical experiments, we demonstrate that our method outperforms existing\napproaches, particularly in the early stage with limited data. This work\ncontributes to robust MDPs by considering possible prior information about the\nunderlying transition probability and online learning, offering both\ntheoretical insights and practical algorithms for improved decision-making\nunder uncertainty.\n","authors":["Shuo Sun","Meng Qi","Zuo-Jun Max Shen"],"pdf_url":"https://arxiv.org/pdf/2412.14075v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11544v4","updated":"2024-12-19T14:33:00Z","published":"2024-06-17T13:42:28Z","title":"Do Parameters Reveal More than Loss for Membership Inference?","summary":" Membership inference attacks are used as a key tool for disclosure auditing.\nThey aim to infer whether an individual record was used to train a model. While\nsuch evaluations are useful to demonstrate risk, they are computationally\nexpensive and often make strong assumptions about potential adversaries' access\nto models and training environments, and thus do not provide tight bounds on\nleakage from potential attacks. We show how prior claims around black-box\naccess being sufficient for optimal membership inference do not hold for\nstochastic gradient descent, and that optimal membership inference indeed\nrequires white-box access. Our theoretical results lead to a new white-box\ninference attack, IHA (Inverse Hessian Attack), that explicitly uses model\nparameters by taking advantage of computing inverse-Hessian vector products.\nOur results show that both auditors and adversaries may be able to benefit from\naccess to model parameters, and we advocate for further research into white-box\nmethods for membership inference.\n","authors":["Anshuman Suri","Xiao Zhang","David Evans"],"pdf_url":"https://arxiv.org/pdf/2406.11544v4.pdf","comment":"Accepted to Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2412.14897v1","updated":"2024-12-19T14:28:00Z","published":"2024-12-19T14:28:00Z","title":"Diffusion priors for Bayesian 3D reconstruction from incomplete\n measurements","summary":" Many inverse problems are ill-posed and need to be complemented by prior\ninformation that restricts the class of admissible models. Bayesian approaches\nencode this information as prior distributions that impose generic properties\non the model such as sparsity, non-negativity or smoothness. However, in case\nof complex structured models such as images, graphs or three-dimensional (3D)\nobjects,generic prior distributions tend to favor models that differ largely\nfrom those observed in the real world. Here we explore the use of diffusion\nmodels as priors that are combined with experimental data within a Bayesian\nframework. We use 3D point clouds to represent 3D objects such as household\nitems or biomolecular complexes formed from proteins and nucleic acids. We\ntrain diffusion models that generate coarse-grained 3D structures at a medium\nresolution and integrate these with incomplete and noisy experimental data. To\ndemonstrate the power of our approach, we focus on the reconstruction of\nbiomolecular assemblies from cryo-electron microscopy (cryo-EM) images, which\nis an important inverse problem in structural biology. 
We find that posterior\nsampling with diffusion model priors allows for 3D reconstruction from very\nsparse, low-resolution and partial observations.\n","authors":["Julian L. Möbius","Michael Habeck"],"pdf_url":"https://arxiv.org/pdf/2412.14897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01519v3","updated":"2024-12-19T14:26:22Z","published":"2024-09-03T01:26:21Z","title":"Hybridization of Persistent Homology with Neural Networks for\n Time-Series Prediction: A Case Study in Wave Height","summary":" Time-series prediction is an active area of research across various fields,\noften challenged by the fluctuating influence of short-term and long-term\nfactors. In this study, we introduce a feature engineering method that enhances\nthe predictive performance of neural network models. Specifically, we leverage\ncomputational topology techniques to derive valuable topological features from\ninput data, boosting the predictive accuracy of our models. Our focus is on\npredicting wave heights, utilizing models based on topological features within\nfeedforward neural networks (FNNs), recurrent neural networks (RNNs), long\nshort-term memory networks (LSTM), and RNNs with gated recurrent units (GRU).\nFor time-ahead predictions, the enhancements in $R^2$ score were significant\nfor FNNs, RNNs, LSTM, and GRU models. Additionally, these models also showed\nsignificant reductions in maximum errors and mean squared errors.\n","authors":["Zixin Lin","Nur Fariha Syaqina Zulkepli","Mohd Shareduwan Mohd Kasihmuddin","R. U. Gobithaasan"],"pdf_url":"https://arxiv.org/pdf/2409.01519v3.pdf","comment":"the paper contain errors"},{"id":"http://arxiv.org/abs/2405.14573v4","updated":"2024-12-19T14:19:02Z","published":"2024-05-23T13:48:54Z","title":"AndroidWorld: A Dynamic Benchmarking Environment for Autonomous Agents","summary":" Autonomous agents that execute human tasks by controlling computers can\nenhance human productivity and application accessibility. However, progress in\nthis field will be driven by realistic and reproducible benchmarks. We present\nAndroidWorld, a fully functional Android environment that provides reward\nsignals for 116 programmatic tasks across 20 real-world Android apps. Unlike\nexisting interactive environments, which provide a static test set,\nAndroidWorld dynamically constructs tasks that are parameterized and expressed\nin natural language in unlimited ways, thus enabling testing on a much larger\nand more realistic suite of tasks. To ensure reproducibility, each task\nincludes dedicated initialization, success-checking, and tear-down logic, which\nmodifies and inspects the device's system state. We experiment with baseline\nagents to test AndroidWorld and provide initial results on the benchmark. Our\nbest agent can complete 30.6% of AndroidWorld's tasks, leaving ample room for\nfuture work. Furthermore, we adapt a popular desktop web agent to work on\nAndroid, which we find to be less effective on mobile, suggesting future\nresearch is needed to achieve universal, cross-platform agents. 
Finally, we\nalso conduct a robustness analysis, showing that task variations can\nsignificantly affect agent performance, demonstrating that without such\ntesting, agent performance metrics may not fully reflect practical challenges.\nAndroidWorld and the experiments in this paper are available at\ngithub.com/google-research/android_world.\n","authors":["Christopher Rawles","Sarah Clinckemaillie","Yifan Chang","Jonathan Waltz","Gabrielle Lau","Marybeth Fair","Alice Li","William Bishop","Wei Li","Folawiyo Campbell-Ajala","Daniel Toyama","Robert Berry","Divya Tyamagundlu","Timothy Lillicrap","Oriana Riva"],"pdf_url":"https://arxiv.org/pdf/2405.14573v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16684v2","updated":"2024-12-19T14:18:15Z","published":"2024-09-25T07:20:59Z","title":"Erase then Rectify: A Training-Free Parameter Editing Approach for\n Cost-Effective Graph Unlearning","summary":" Graph unlearning, which aims to eliminate the influence of specific nodes,\nedges, or attributes from a trained Graph Neural Network (GNN), is essential in\napplications where privacy, bias, or data obsolescence is a concern. However,\nexisting graph unlearning techniques often necessitate additional training on\nthe remaining data, leading to significant computational costs, particularly\nwith large-scale graphs. To address these challenges, we propose a two-stage\ntraining-free approach, Erase then Rectify (ETR), designed for efficient and\nscalable graph unlearning while preserving the model utility. Specifically, we\nfirst build a theoretical foundation showing that masking parameters critical\nfor unlearned samples enables effective unlearning. Building on this insight,\nthe Erase stage strategically edits model parameters to eliminate the impact of\nunlearned samples and their propagated influence on intercorrelated nodes. To\nfurther ensure the GNN's utility, the Rectify stage devises a gradient\napproximation method to estimate the model's gradient on the remaining dataset,\nwhich is then used to enhance model performance. Overall, ETR achieves graph\nunlearning without additional training or full training data access,\nsignificantly reducing computational overhead and preserving data privacy.\nExtensive experiments on seven public datasets demonstrate the consistent\nsuperiority of ETR in model utility, unlearning efficiency, and unlearning\neffectiveness, establishing it as a promising solution for real-world graph\nunlearning challenges.\n","authors":["Zhe-Rui Yang","Jindong Han","Chang-Dong Wang","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2409.16684v2.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2412.14869v1","updated":"2024-12-19T14:06:44Z","published":"2024-12-19T14:06:44Z","title":"AI-Powered Intracranial Hemorrhage Detection: A Co-Scale Convolutional\n Attention Model with Uncertainty-Based Fuzzy Integral Operator and Feature\n Screening","summary":" Intracranial hemorrhage (ICH) refers to the leakage or accumulation of blood\nwithin the skull, which occurs due to the rupture of blood vessels in or around\nthe brain. If this condition is not diagnosed in a timely manner and\nappropriately treated, it can lead to serious complications such as decreased\nconsciousness, permanent neurological disabilities, or even death.The primary\naim of this study is to detect the occurrence or non-occurrence of ICH,\nfollowed by determining the type of subdural hemorrhage (SDH). These tasks are\nframed as two separate binary classification problems. 
By adding two layers to\nthe co-scale convolutional attention (CCA) classifier architecture, we\nintroduce a novel approach for ICH detection. In the first layer, after\nextracting features from different slices of computed tomography (CT) scan\nimages, we combine these features and select the 50 components that capture the\nhighest variance in the data, considering them as informative features. We then\nassess the discriminative power of these features using the bootstrap forest\nalgorithm, discarding those that lack sufficient discriminative ability between\ndifferent classes. This algorithm explicitly determines the contribution of\neach feature to the final prediction, assisting us in developing an explainable\nAI model. The features feed into a boosting neural network as a latent feature\nspace. In the second layer, we introduce a novel uncertainty-based fuzzy\nintegral operator to fuse information from different CT scan slices. This\noperator, by accounting for the dependencies between consecutive slices,\nsignificantly improves detection accuracy.\n","authors":["Mehdi Hosseini Chagahi","Md. Jalil Piran","Niloufar Delfan","Behzad Moshiri","Jaber Hatam Parikhan"],"pdf_url":"https://arxiv.org/pdf/2412.14869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14865v1","updated":"2024-12-19T14:00:03Z","published":"2024-12-19T14:00:03Z","title":"Hierarchical Subspaces of Policies for Continual Offline Reinforcement\n Learning","summary":" In dynamic domains such as autonomous robotics and video game simulations,\nagents must continuously adapt to new tasks while retaining previously acquired\nskills. This ongoing process, known as Continual Reinforcement Learning,\npresents significant challenges, including the risk of forgetting past\nknowledge and the need for scalable solutions as the number of tasks increases.\nTo address these issues, we introduce HIerarchical LOW-rank Subspaces of\nPolicies (HILOW), a novel framework designed for continual learning in offline\nnavigation settings. HILOW leverages hierarchical policy subspaces to enable\nflexible and efficient adaptation to new tasks while preserving existing\nknowledge. We demonstrate, through a careful experimental study, the\neffectiveness of our method in both classical MuJoCo maze environments and\ncomplex video game-like simulations, showcasing competitive performance and\nsatisfying adaptability according to classical continual learning metrics, in\nparticular regarding memory usage. Our work provides a promising framework for\nreal-world applications where continuous learning from pre-collected data is\nessential.\n","authors":["Anthony Kobanda","Rémy Portelas","Odalric-Ambrym Maillard","Ludovic Denoyer"],"pdf_url":"https://arxiv.org/pdf/2412.14865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14854v1","updated":"2024-12-19T13:48:49Z","published":"2024-12-19T13:48:49Z","title":"Surrogate-assisted multi-objective design of complex multibody systems","summary":" The optimization of large-scale multibody systems is a numerically\nchallenging task, in particular when considering multiple conflicting criteria\nat the same time. In this situation, we need to approximate the Pareto set of\noptimal compromises, which is significantly more expensive than finding a\nsingle optimum in single-objective optimization. To prevent large costs, the\nusage of surrogate models, constructed from a small but informative number of\nexpensive model evaluations, is a very popular and widely studied approach. 
The\ncentral challenge then is to ensure a high quality (that is, near-optimality)\nof the solutions that were obtained using the surrogate model, which can be\nhard to guarantee with a single pre-computed surrogate. We present a\nback-and-forth approach between surrogate modeling and multi-objective\noptimization to improve the quality of the obtained solutions. Using the\nexample of an expensive-to-evaluate multibody system, we compare different\nstrategies regarding multi-objective optimization, sampling and also surrogate\nmodeling, to identify the most promising approach in terms of computational\nefficiency and solution quality.\n","authors":["Augustina C. Amakor","Manuel B. Berkemeier","Meike Wohlleben","Walter Sextro","Sebastian Peitz"],"pdf_url":"https://arxiv.org/pdf/2412.14854v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2412.01566"},{"id":"http://arxiv.org/abs/2410.10929v6","updated":"2024-12-19T13:39:55Z","published":"2024-10-14T16:35:27Z","title":"ASTM :Autonomous Smart Traffic Management System Using Artificial\n Intelligence CNN and LSTM","summary":" In the modern world, the development of Artificial Intelligence (AI) has\ncontributed to improvements in various areas, including automation, computer\nvision, fraud detection, and more. AI can be leveraged to enhance the\nefficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce\ntraffic congestion rates. This paper presents an Autonomous Smart Traffic\nManagement (STM) system that uses AI to improve traffic flow rates. The system\nemploys the YOLO V5 Convolutional Neural Network to detect vehicles in traffic\nmanagement images. Additionally, it predicts the number of vehicles for the\nnext 12 hours using a Recurrent Neural Network with Long Short-Term Memory\n(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the\ntraffic cycle length based on these vehicle predictions, aided by AI. From the\nresults of the RNN-LSTM model for predicting vehicle numbers over the next 12\nhours, we observe that the model predicts traffic with a Mean Squared Error\n(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles.\nAfter simulating the STM system in the CARLA simulation environment, we found\nthat the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per\nminute) is 50\\% higher than the rate without STM (around 15 vehicles per\nminute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5\nseconds per vehicle) is 70\\% lower than without STM (around 12 seconds per\nvehicle). These results demonstrate that the STM system using AI can increase\ntraffic flow by 50\\% and reduce vehicle pass delays by 70\\%.\n","authors":["Christofel Rio Goenawan"],"pdf_url":"https://arxiv.org/pdf/2410.10929v6.pdf","comment":"In process to IEEE Intelligent Vehicle Symposium 2025"},{"id":"http://arxiv.org/abs/2408.11778v2","updated":"2024-12-19T13:34:56Z","published":"2024-08-21T17:08:05Z","title":"Sum of Squares Circuits","summary":" Designing expressive generative models that support exact and efficient\ninference is a core question in probabilistic ML. Probabilistic circuits (PCs)\noffer a framework where this tractability-vs-expressiveness trade-off can be\nanalyzed theoretically. Recently, squared PCs encoding subtractive mixtures via\nnegative parameters have emerged as tractable models that can be exponentially\nmore expressive than monotonic PCs, i.e., PCs with positive parameters only. 
In\nthis paper, we provide a more precise theoretical characterization of the\nexpressiveness relationships among these models. First, we prove that squared\nPCs can be less expressive than monotonic ones. Second, we formalize a novel\nclass of PCs -- sum of squares PCs -- that can be exponentially more expressive\nthan both squared and monotonic PCs. Around sum of squares PCs, we build an\nexpressiveness hierarchy that allows us to precisely unify and separate\ndifferent tractable model classes such as Born Machines and PSD models, and\nother recently introduced tractable probabilistic models by using complex\nparameters. Finally, we empirically show the effectiveness of sum of squares\ncircuits in performing distribution estimation.\n","authors":["Lorenzo Loconte","Stefan Mengel","Antonio Vergari"],"pdf_url":"https://arxiv.org/pdf/2408.11778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.09116v3","updated":"2024-12-19T13:27:49Z","published":"2024-12-12T09:51:18Z","title":"How to Re-enable PDE Loss for Physical Systems Modeling Under Partial\n Observation","summary":" In science and engineering, machine learning techniques are increasingly\nsuccessful in physical systems modeling (predicting future states of physical\nsystems). Effectively integrating PDE loss as a constraint of system transition\ncan improve the model's prediction by overcoming generalization issues due to\ndata scarcity, especially when data acquisition is costly. However, in many\nreal-world scenarios, due to sensor limitations, the data we can obtain is\noften only partial observation, making the calculation of PDE loss seem to be\ninfeasible, as the PDE loss heavily relies on high-resolution states. We\ncarefully study this problem and propose a novel framework named Re-enable PDE\nLoss under Partial Observation (RPLPO). The key idea is that although enabling\nPDE loss to constrain system transition solely is infeasible, we can re-enable\nPDE loss by reconstructing the learnable high-resolution state and constraining\nsystem transition simultaneously. Specifically, RPLPO combines an encoding\nmodule for reconstructing learnable high-resolution states with a transition\nmodule for predicting future states. The two modules are jointly trained by\ndata and PDE loss. We conduct experiments in various physical systems to\ndemonstrate that RPLPO has significant improvement in generalization, even when\nobservation is sparse, irregular, noisy, and PDE is inaccurate.\n","authors":["Haodong Feng","Yue Wang","Dixia Fan"],"pdf_url":"https://arxiv.org/pdf/2412.09116v3.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2305.09565v2","updated":"2024-12-19T13:27:40Z","published":"2023-05-16T16:02:18Z","title":"Toward Falsifying Causal Graphs Using a Permutation-Based Test","summary":" Understanding causal relationships among the variables of a system is\nparamount to explain and control its behavior. For many real-world systems,\nhowever, the true causal graph is not readily available and one must resort to\npredictions made by algorithms or domain experts. Therefore, metrics that\nquantitatively assess the goodness of a causal graph provide helpful checks\nbefore using it in downstream tasks. 
Existing metrics provide an\n$\\textit{absolute}$ number of inconsistencies between the graph and the\nobserved data, and without a baseline, practitioners are left to answer the\nhard question of how many such inconsistencies are acceptable or expected.\nHere, we propose a novel consistency metric by constructing a baseline through\nnode permutations. By comparing the number of inconsistencies with those on the\nbaseline, we derive an interpretable metric that captures whether the graph is\nsignificantly better than random. Evaluating on both simulated and real data\nsets from various domains, including biology and cloud monitoring, we\ndemonstrate that the true graph is not falsified by our metric, whereas the\nwrong graphs given by a hypothetical user are likely to be falsified.\n","authors":["Elias Eulig","Atalanti A. Mastakouri","Patrick Blöbaum","Michaela Hardt","Dominik Janzing"],"pdf_url":"https://arxiv.org/pdf/2305.09565v2.pdf","comment":"Camera-ready version for AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14834v1","updated":"2024-12-19T13:24:01Z","published":"2024-12-19T13:24:01Z","title":"Entropy Regularized Task Representation Learning for Offline\n Meta-Reinforcement Learning","summary":" Offline meta-reinforcement learning aims to equip agents with the ability to\nrapidly adapt to new tasks by training on data from a set of different tasks.\nContext-based approaches utilize a history of state-action-reward transitions\n-- referred to as the context -- to infer representations of the current task,\nand then condition the agent, i.e., the policy and value function, on the task\nrepresentations. Intuitively, the better the task representations capture the\nunderlying tasks, the better the agent can generalize to new tasks.\nUnfortunately, context-based approaches suffer from distribution mismatch, as\nthe context in the offline data does not match the context at test time,\nlimiting their ability to generalize to the test tasks. This leads to the task\nrepresentations overfitting to the offline training data. Intuitively, the task\nrepresentations should be independent of the behavior policy used to collect\nthe offline data. To address this issue, we approximately minimize the mutual\ninformation between the distribution over the task representations and behavior\npolicy by maximizing the entropy of behavior policy conditioned on the task\nrepresentations. We validate our approach in MuJoCo environments, showing that\ncompared to baselines, our task representations more faithfully represent the\nunderlying tasks, leading to outperforming prior methods in both\nin-distribution and out-of-distribution tasks.\n","authors":["Mohammadreza nakhaei","Aidan Scannell","Joni Pajarinen"],"pdf_url":"https://arxiv.org/pdf/2412.14834v1.pdf","comment":"7 Pages, Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2406.02765v5","updated":"2024-12-19T13:16:21Z","published":"2024-06-04T20:33:29Z","title":"Discovering Continuous-Time Memory-Based Symbolic Policies using Genetic\n Programming","summary":" Artificial intelligence techniques are increasingly being applied to solve\ncontrol problems, but often rely on black-box methods without transparent\noutput generation. To improve the interpretability and transparency in control\nsystems, models can be defined as white-box symbolic policies described by\nmathematical expressions. 
For better performance in partially observable and\nvolatile environments, the symbolic policies are extended with memory\nrepresented by continuous-time latent variables, governed by differential\nequations. Genetic programming is used for optimisation, resulting in\ninterpretable policies consisting of symbolic expressions. Our results show\nthat symbolic policies with memory compare with black-box policies on a variety\nof control tasks. Furthermore, the benefit of the memory in symbolic policies\nis demonstrated on experiments where memory-less policies fall short. Overall,\nwe present a method for evolving high-performing symbolic policies that offer\ninterpretability and transparency, which lacks in black-box models.\n","authors":["Sigur de Vries","Sander Keemink","Marcel van Gerven"],"pdf_url":"https://arxiv.org/pdf/2406.02765v5.pdf","comment":"21 pages including references and appendix, 5 figures, 1 algorithm, 5\n tables"},{"id":"http://arxiv.org/abs/2412.14814v1","updated":"2024-12-19T13:09:06Z","published":"2024-12-19T13:09:06Z","title":"Answer Set Networks: Casting Answer Set Programming into Deep Learning","summary":" Although Answer Set Programming (ASP) allows constraining neural-symbolic\n(NeSy) systems, its employment is hindered by the prohibitive costs of\ncomputing stable models and the CPU-bound nature of state-of-the-art solvers.\nTo this end, we propose Answer Set Networks (ASN), a NeSy solver. Based on\nGraph Neural Networks (GNN), ASNs are a scalable approach to ASP-based Deep\nProbabilistic Logic Programming (DPPL). Specifically, we show how to translate\nASPs into ASNs and demonstrate how ASNs can efficiently solve the encoded\nproblem by leveraging GPU's batching and parallelization capabilities. Our\nexperimental evaluations demonstrate that ASNs outperform state-of-the-art\nCPU-bound NeSy systems on multiple tasks. Simultaneously, we make the following\ntwo contributions based on the strengths of ASNs. Namely, we are the first to\nshow the finetuning of Large Language Models (LLM) with DPPLs, employing ASNs\nto guide the training with logic. Further, we show the \"constitutional\nnavigation\" of drones, i.e., encoding public aviation laws in an ASN for\nrouting Unmanned Aerial Vehicles in uncertain environments.\n","authors":["Arseny Skryagin","Daniel Ochs","Phillip Deibert","Simon Kohaut","Devendra Singh Dhami","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2412.14814v1.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2412.10341v2","updated":"2024-12-19T13:03:10Z","published":"2024-12-13T18:38:47Z","title":"Shape error prediction in 5-axis machining using graph neural networks","summary":" This paper presents an innovative method for predicting shape errors in\n5-axis machining using graph neural networks. The graph structure is defined\nwith nodes representing workpiece surface points and edges denoting the\nneighboring relationships. The dataset encompasses data from a material removal\nsimulation, process data, and post-machining quality information. Experimental\nresults show that the presented approach can generalize the shape error\nprediction for the investigated workpiece geometry. 
Moreover, by modelling\nspatial and temporal connections within the workpiece, the approach handles a\nlow number of labels compared to non-graphical methods such as Support Vector\nMachines.\n","authors":["Julia Huuk","Abheek Dhingra","Eirini Ntoutsi","Berend Denkena"],"pdf_url":"https://arxiv.org/pdf/2412.10341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14810v1","updated":"2024-12-19T13:00:03Z","published":"2024-12-19T13:00:03Z","title":"MARIA: a Multimodal Transformer Model for Incomplete Healthcare Data","summary":" In healthcare, the integration of multimodal data is pivotal for developing\ncomprehensive diagnostic and predictive models. However, managing missing data\nremains a significant challenge in real-world applications. We introduce MARIA\n(Multimodal Attention Resilient to Incomplete datA), a novel transformer-based\ndeep learning model designed to address these challenges through an\nintermediate fusion strategy. Unlike conventional approaches that depend on\nimputation, MARIA utilizes a masked self-attention mechanism, which processes\nonly the available data without generating synthetic values. This approach\nenables it to effectively handle incomplete datasets, enhancing robustness and\nminimizing biases introduced by imputation methods. We evaluated MARIA against\n10 state-of-the-art machine learning and deep learning models across 8\ndiagnostic and prognostic tasks. The results demonstrate that MARIA outperforms\nexisting methods in terms of performance and resilience to varying levels of\ndata incompleteness, underscoring its potential for critical healthcare\napplications.\n","authors":["Camillo Maria Caruso","Paolo Soda","Valerio Guarrasi"],"pdf_url":"https://arxiv.org/pdf/2412.14810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14802v1","updated":"2024-12-19T12:48:17Z","published":"2024-12-19T12:48:17Z","title":"Stack Trace Deduplication: Faster, More Accurately, and in More\n Realistic Scenarios","summary":" In large-scale software systems, there are often no fully-fledged bug reports\nwith human-written descriptions when an error occurs. In this case, developers\nrely on stack traces, i.e., series of function calls that led to the error.\nSince there can be tens and hundreds of thousands of them describing the same\nissue from different users, automatic deduplication into categories is\nnecessary to allow for processing. Recent works have proposed powerful deep\nlearning-based approaches for this, but they are evaluated and compared in\nisolation from real-life workflows, and it is not clear whether they will\nactually work well at scale.\n To overcome this gap, this work presents three main contributions: a novel\nmodel, an industry-based dataset, and a multi-faceted evaluation. Our model\nconsists of two parts - (1) an embedding model with byte-pair encoding and\napproximate nearest neighbor search to quickly find the most relevant stack\ntraces to the incoming one, and (2) a reranker that re-ranks the most fitting\nstack traces, taking into account the repeated frames between them. To\ncomplement the existing datasets collected from open-source projects, we share\nwith the community SlowOps - a dataset of stack traces from IntelliJ-based\nproducts developed by JetBrains, which has an order of magnitude more stack\ntraces per category. Finally, we carry out an evaluation that strives to be\nrealistic: measuring not only the accuracy of categorization, but also the\noperation time and the ability to create new categories. 
The evaluation shows\nthat our model strikes a good balance - it outperforms other models on both\nopen-source datasets and SlowOps, while also being faster on time than most. We\nrelease all of our code and data, and hope that our work can pave the way to\nfurther practice-oriented research in the area.\n","authors":["Egor Shibaev","Denis Sushentsev","Yaroslav Golubev","Aleksandr Khvorov"],"pdf_url":"https://arxiv.org/pdf/2412.14802v1.pdf","comment":"Published at SANER'25. 11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2412.14801v1","updated":"2024-12-19T12:47:21Z","published":"2024-12-19T12:47:21Z","title":"Extending TWIG: Zero-Shot Predictive Hyperparameter Selection for KGEs\n based on Graph Structure","summary":" Knowledge Graphs (KGs) have seen increasing use across various domains --\nfrom biomedicine and linguistics to general knowledge modelling. In order to\nfacilitate the analysis of knowledge graphs, Knowledge Graph Embeddings (KGEs)\nhave been developed to automatically analyse KGs and predict new facts based on\nthe information in a KG, a task called \"link prediction\". Many existing studies\nhave documented that the structure of a KG, KGE model components, and KGE\nhyperparameters can significantly change how well KGEs perform and what\nrelationships they are able to learn. Recently, the Topologically-Weighted\nIntelligence Generation (TWIG) model has been proposed as a solution to\nmodelling how each of these elements relate. In this work, we extend the\nprevious research on TWIG and evaluate its ability to simulate the output of\nthe KGE model ComplEx in the cross-KG setting. Our results are twofold. First,\nTWIG is able to summarise KGE performance on a wide range of hyperparameter\nsettings and KGs being learned, suggesting that it represents a general\nknowledge of how to predict KGE performance from KG structure. Second, we show\nthat TWIG can successfully predict hyperparameter performance on unseen KGs in\nthe zero-shot setting. This second observation leads us to propose that, with\nadditional research, optimal hyperparameter selection for KGE models could be\ndetermined in a pre-hoc manner using TWIG-like methods, rather than by using a\nfull hyperparameter search.\n","authors":["Jeffrey Sardina","John D. Kelleher","Declan O'Sullivan"],"pdf_url":"https://arxiv.org/pdf/2412.14801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11448v3","updated":"2024-12-19T12:46:27Z","published":"2024-12-16T05:02:50Z","title":"TRAIL: Trust-Aware Client Scheduling for Semi-Decentralized Federated\n Learning","summary":" Due to the sensitivity of data, Federated Learning (FL) is employed to enable\ndistributed machine learning while safeguarding data privacy and accommodating\nthe requirements of various devices. However, in the context of\nsemi-decentralized FL, clients' communication and training states are dynamic.\nThis variability arises from local training fluctuations, heterogeneous data\ndistributions, and intermittent client participation. Most existing studies\nprimarily focus on stable client states, neglecting the dynamic challenges\ninherent in real-world scenarios. To tackle this issue, we propose a\nTRust-Aware clIent scheduLing mechanism called TRAIL, which assesses client\nstates and contributions, enhancing model training efficiency through selective\nclient participation. 
We focus on a semi-decentralized FL framework where edge\nservers and clients train a shared global model using unreliable intra-cluster\nmodel aggregation and inter-cluster model consensus. First, we propose an\nadaptive hidden semi-Markov model to estimate clients' communication states and\ncontributions. Next, we address a client-server association optimization\nproblem to minimize global training loss. Using convergence analysis, we\npropose a greedy client scheduling algorithm. Finally, our experiments\nconducted on real-world datasets demonstrate that TRAIL outperforms\nstate-of-the-art baselines, achieving an improvement of 8.7% in test accuracy\nand a reduction of 15.3% in training loss.\n","authors":["Gangqiang Hu","Jianfeng Lu","Jianmin Han","Shuqin Cao","Jing Liu","Hao Fu"],"pdf_url":"https://arxiv.org/pdf/2412.11448v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05317v3","updated":"2024-12-19T12:38:23Z","published":"2024-10-05T03:47:06Z","title":"Accelerating Diffusion Transformers with Token-wise Feature Caching","summary":" Diffusion transformers have shown significant effectiveness in both image and\nvideo synthesis at the expense of huge computation costs. To address this\nproblem, feature caching methods have been introduced to accelerate diffusion\ntransformers by caching the features in previous timesteps and reusing them in\nthe following timesteps. However, previous caching methods ignore that\ndifferent tokens exhibit different sensitivities to feature caching, and\nfeature caching on some tokens may lead to 10$\\times$ more destruction to the\noverall generation quality compared with other tokens. In this paper, we\nintroduce token-wise feature caching, allowing us to adaptively select the most\nsuitable tokens for caching, and further enable us to apply different caching\nratios to neural layers in different types and depths. Extensive experiments on\nPixArt-$\\alpha$, OpenSora, and DiT demonstrate our effectiveness in both image\nand video generation with no requirements for training. For instance,\n2.36$\\times$ and 1.93$\\times$ acceleration are achieved on OpenSora and\nPixArt-$\\alpha$ with almost no drop in generation quality.\n","authors":["Chang Zou","Xuyang Liu","Ting Liu","Siteng Huang","Linfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.05317v3.pdf","comment":"In this version, we achieved a nearly lossless acceleration of 1.51\n times for ToCa on FLUX in the appendix"},{"id":"http://arxiv.org/abs/2406.05666v9","updated":"2024-12-19T12:13:26Z","published":"2024-06-09T06:49:22Z","title":"Probability Distribution Learning and Its Application in Deep Learning","summary":" This paper introduces a novel theoretical learning framework, termed\nprobability distribution learning (PD learning). Departing from the traditional\nstatistical learning framework, PD learning focuses on learning the underlying\nprobability distribution, which is modeled as a random variable within the\nprobability simplex. In this framework, the optimization objective is the\nlearning error, which quantifies the posterior expected discrepancy between the\nmodel's predicted distribution and the underlying true distribution, given\navailable sample data and prior knowledge. To optimize the learning error, this\npaper proposes the necessary conditions for loss functions, models, and\noptimization algorithms, ensuring that these conditions are met in real-world\nmachine learning scenarios. 
Based on these conditions, the non-convex\noptimization mechanism corresponding to model training can be theoretically\nresolved. Moreover, this paper provides model-dependent and model-independent\nbounds on learning error, offering new insights into the model's fitting and\ngeneralization capabilities. Furthermore, the paper applies the PD learning\nframework to elucidate the mechanisms by which various techniques, including\nrandom parameter initialization, over-parameterization, and dropout, influence\ndeep model training. Finally, the paper substantiates the key conclusions of\nthe proposed framework through experimental results.\n","authors":["Binchuan Qi"],"pdf_url":"https://arxiv.org/pdf/2406.05666v9.pdf","comment":"arXiv admin note: text overlap with arXiv:2105.04026 by other\n authors. arXiv admin note: text overlap with arXiv:2105.04026 by other\n authors"},{"id":"http://arxiv.org/abs/2412.09265v4","updated":"2024-12-19T12:11:13Z","published":"2024-12-12T13:22:02Z","title":"Score and Distribution Matching Policy: Advanced Accelerated Visuomotor\n Policies via Matched Distillation","summary":" Visual-motor policy learning has advanced with architectures like\ndiffusion-based policies, known for modeling complex robotic trajectories.\nHowever, their prolonged inference times hinder high-frequency control tasks\nrequiring real-time feedback. While consistency distillation (CD) accelerates\ninference, it introduces errors that compromise action quality. To address\nthese limitations, we propose the Score and Distribution Matching Policy (SDM\nPolicy), which transforms diffusion-based policies into single-step generators\nthrough a two-stage optimization process: score matching ensures alignment with\ntrue action distributions, and distribution matching minimizes KL divergence\nfor consistency. A dual-teacher mechanism integrates a frozen teacher for\nstability and an unfrozen teacher for adversarial training, enhancing\nrobustness and alignment with target distributions. Evaluated on a 57-task\nsimulation benchmark, SDM Policy achieves a 6x inference speedup while having\nstate-of-the-art action quality, providing an efficient and reliable framework\nfor high-frequency robotic tasks.\n","authors":["Bofang Jia","Pengxiang Ding","Can Cui","Mingyang Sun","Pengfang Qian","Siteng Huang","Zhaoxin Fan","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2412.09265v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09826v4","updated":"2024-12-19T12:07:06Z","published":"2023-02-20T08:19:19Z","title":"On the Expressivity of Persistent Homology in Graph Learning","summary":" Persistent homology, a technique from computational topology, has recently\nshown strong empirical performance in the context of graph classification.\nBeing able to capture long range graph properties via higher-order topological\nfeatures, such as cycles of arbitrary length, in combination with multi-scale\ntopological descriptors, has improved predictive performance for data sets with\nprominent topological structures, such as molecules. At the same time, the\ntheoretical properties of persistent homology have not been formally assessed\nin this context. 
This paper intends to bridge the gap between computational\ntopology and graph machine learning by providing a brief introduction to\npersistent homology in the context of graphs, as well as a theoretical\ndiscussion and empirical analysis of its expressivity for graph learning tasks.\n","authors":["Rubén Ballester","Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2302.09826v4.pdf","comment":"Accepted at the 3rd Learning on Graphs Conference (LoG) 2024"},{"id":"http://arxiv.org/abs/2412.14779v1","updated":"2024-12-19T12:05:13Z","published":"2024-12-19T12:05:13Z","title":"Agent-Temporal Credit Assignment for Optimal Policy Preservation in\n Sparse Multi-Agent Reinforcement Learning","summary":" In multi-agent environments, agents often struggle to learn optimal policies\ndue to sparse or delayed global rewards, particularly in long-horizon tasks\nwhere it is challenging to evaluate actions at intermediate time steps. We\nintroduce Temporal-Agent Reward Redistribution (TAR$^2$), a novel approach\ndesigned to address the agent-temporal credit assignment problem by\nredistributing sparse rewards both temporally and across agents. TAR$^2$\ndecomposes sparse global rewards into time-step-specific rewards and calculates\nagent-specific contributions to these rewards. We theoretically prove that\nTAR$^2$ is equivalent to potential-based reward shaping, ensuring that the\noptimal policy remains unchanged. Empirical results demonstrate that TAR$^2$\nstabilizes and accelerates the learning process. Additionally, we show that\nwhen TAR$^2$ is integrated with single-agent reinforcement learning algorithms,\nit performs as well as or better than traditional multi-agent reinforcement\nlearning methods.\n","authors":["Aditya Kapoor","Sushant Swamy","Kale-ab Tessera","Mayank Baranwal","Mingfei Sun","Harshad Khadilkar","Stefano V. Albrecht"],"pdf_url":"https://arxiv.org/pdf/2412.14779v1.pdf","comment":"12 pages, 1 figure"},{"id":"http://arxiv.org/abs/2408.05428v2","updated":"2024-12-19T11:57:19Z","published":"2024-08-10T04:21:04Z","title":"Generalized Encouragement-Based Instrumental Variables for\n Counterfactual Regression","summary":" In causal inference, encouragement designs (EDs) are widely used to analyze\ncausal effects, when randomized controlled trials (RCTs) are impractical or\ncompliance to treatment cannot be perfectly enforced. Unlike RCTs, which\ndirectly allocate treatments, EDs randomly assign encouragement policies that\npositively motivate individuals to engage in a specific treatment. These random\nencouragements act as instrumental variables (IVs), facilitating the\nidentification of causal effects through leveraging exogenous perturbations in\ndiscrete treatment scenarios. However, real-world applications of encouragement\ndesigns often face challenges such as incomplete randomization, limited\nexperimental data, and significantly fewer encouragements compared to\ntreatments, hindering precise causal effect estimation. To address this, this\npaper introduces novel theories and algorithms for identifying the Conditional\nAverage Treatment Effect (CATE) using variations in encouragement. Further, by\nleveraging both observational and encouragement data, we propose a generalized\nIV estimator, named Encouragement-based Counterfactual Regression (EnCounteR),\nto effectively estimate the causal effects. 
Extensive experiments on both\nsynthetic and real-world datasets demonstrate the superiority of EnCounteR over\nexisting methods.\n","authors":["Anpeng Wu","Kun Kuang","Ruoxuan Xiong","Xiangwei Chen","Zexu Sun","Fei Wu","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05428v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14771v1","updated":"2024-12-19T11:55:51Z","published":"2024-12-19T11:55:51Z","title":"ALKAFI-LLAMA3: Fine-Tuning LLMs for Precise Legal Understanding in\n Palestine","summary":" Large Language Models (LLMs) have demonstrated remarkable potential in\ndiverse domains, yet their application in the legal sector, particularly in\nlow-resource contexts, remains limited. This study addresses the challenges of\nadapting LLMs to the Palestinian legal domain, where political instability,\nfragmented legal frameworks, and limited AI resources hinder effective\nmachine-learning applications. We present a fine-tuned model based on a\nquantized version of Llama-3.2-1B-Instruct, trained on a synthetic data set\nderived from Palestinian legal texts. Using smaller-scale models and\nstrategically generated question-answer pairs, we achieve a cost-effective,\nlocally sustainable solution that provides accurate and contextually relevant\nlegal guidance. Our experiments demonstrate promising performance on various\nquery types, ranging from yes/no questions and narrative explanations to\ncomplex legal differentiations, while highlighting areas for improvement, such\nas handling calculation-based inquiries and structured list formatting. This\nwork provides a pathway for the deployment of AI-driven legal assistance tools\ntailored to the needs of resource-constrained environments.\n","authors":["Rabee Qasem","Mohannad Hendi","Banan Tantour"],"pdf_url":"https://arxiv.org/pdf/2412.14771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11654v2","updated":"2024-12-19T11:47:34Z","published":"2024-12-16T10:56:58Z","title":"Smoothness Really Matters: A Simple Yet Effective Approach for\n Unsupervised Graph Domain Adaptation","summary":" Unsupervised Graph Domain Adaptation (UGDA) seeks to bridge distribution\nshifts between domains by transferring knowledge from labeled source graphs to\ngiven unlabeled target graphs. Existing UGDA methods primarily focus on\naligning features in the latent space learned by graph neural networks (GNNs)\nacross domains, often overlooking structural shifts, resulting in limited\neffectiveness when addressing structurally complex transfer scenarios. Given\nthe sensitivity of GNNs to local structural features, even slight discrepancies\nbetween source and target graphs could lead to significant shifts in node\nembeddings, thereby reducing the effectiveness of knowledge transfer. To\naddress this issue, we introduce a novel approach for UGDA called Target-Domain\nStructural Smoothing (TDSS). TDSS is a simple and effective method designed to\nperform structural smoothing directly on the target graph, thereby mitigating\nstructural distribution shifts and ensuring the consistency of node\nrepresentations. Specifically, by integrating smoothing techniques with\nneighborhood sampling, TDSS maintains the structural coherence of the target\ngraph while mitigating the risk of over-smoothing. 
Our theoretical analysis\nshows that TDSS effectively reduces target risk by improving model smoothness.\nEmpirical results on three real-world datasets demonstrate that TDSS\noutperforms recent state-of-the-art baselines, achieving significant\nimprovements across six transfer scenarios. The code is available at\nhttps://github.com/cwei01/TDSS.\n","authors":["Wei Chen","Guo Ye","Yakun Wang","Zhao Zhang","Libang Zhang","Daxin Wang","Zhiqiang Zhang","Fuzhen Zhuang"],"pdf_url":"https://arxiv.org/pdf/2412.11654v2.pdf","comment":"11 pages, Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2311.07326v2","updated":"2024-12-19T11:41:28Z","published":"2023-11-13T13:27:59Z","title":"MetaSymNet: A Tree-like Symbol Network with Adaptive Architecture and\n Activation Functions","summary":" Mathematical formulas serve as the means of communication between humans and\nnature, encapsulating the operational laws governing natural phenomena. The\nconcise formulation of these laws is a crucial objective in scientific research\nand an important challenge for artificial intelligence (AI). While traditional\nartificial neural networks (MLP) excel at data fitting, they often yield\nuninterpretable black box results that hinder our understanding of the\nrelationship between variables x and predicted values y. Moreover, the fixed\nnetwork architecture in MLP often gives rise to redundancy in both network\nstructure and parameters. To address these issues, we propose MetaSymNet, a\nnovel neural network that dynamically adjusts its structure in real-time,\nallowing for both expansion and contraction. This adaptive network employs the\nPANGU meta function as its activation function, which is a unique type capable\nof evolving into various basic functions during training to compose\nmathematical formulas tailored to specific needs. We then evolve the neural\nnetwork into a concise, interpretable mathematical expression. To evaluate\nMetaSymNet's performance, we compare it with four state-of-the-art symbolic\nregression algorithms across more than 10 public datasets comprising 222\nformulas. Our experimental results demonstrate that our algorithm outperforms\nothers consistently regardless of noise presence or absence. Furthermore, we\nassess MetaSymNet against MLP and SVM regarding their fitting ability and\nextrapolation capability, which are two essential aspects of machine learning\nalgorithms. The findings reveal that our algorithm excels in both areas.\nFinally, we compared MetaSymNet with MLP under iterative pruning in terms of network\nstructure complexity. The results show that MetaSymNet's network structure\ncomplexity is clearly lower than that of MLP under the same goodness of fit.\n","authors":["Yanjie Li","Weijun Li","Lina Yu","Min Wu","Jinyi Liu","Wenqiang Li","Meilan Hao","Shu Wei","Yusong Deng"],"pdf_url":"https://arxiv.org/pdf/2311.07326v2.pdf","comment":"This work has been accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2412.14753v1","updated":"2024-12-19T11:34:22Z","published":"2024-12-19T11:34:22Z","title":"Opportunities and limitations of explaining quantum machine learning","summary":" A common trait of many machine learning models is that it is often difficult\nto understand and explain what caused the model to produce the given output.\nWhile the explainability of neural networks has been an active field of\nresearch in recent years, comparably little is known for quantum machine\nlearning models. 
Despite a few recent works analyzing some specific aspects of\nexplainability, as of now there is no clear big-picture perspective on what\ncan be expected from quantum learning models in terms of explainability. In\nthis work, we address this issue by identifying promising research avenues in\nthis direction and outlining the expected future results. We additionally\npropose two explanation methods designed specifically for quantum machine\nlearning models, which are, to the best of our knowledge, the first of their kind. In addition to\nour preview of the field, we compare both existing and novel methods to\nexplain the predictions of quantum learning models. By studying explainability\nin quantum machine learning, we can contribute to the sustainable development\nof the field, preventing trust issues in the future.\n","authors":["Elies Gil-Fuster","Jonas R. Naujoks","Grégoire Montavon","Thomas Wiegand","Wojciech Samek","Jens Eisert"],"pdf_url":"https://arxiv.org/pdf/2412.14753v1.pdf","comment":"16+16 pages, 3+4 figures"},{"id":"http://arxiv.org/abs/2412.14750v1","updated":"2024-12-19T11:29:57Z","published":"2024-12-19T11:29:57Z","title":"Deep Learning Based Recalibration of SDSS and DESI BAO Alleviates Hubble\n and Clustering Tensions","summary":" Conventional calibration of Baryon Acoustic Oscillations (BAO) data relies on\nestimation of the sound horizon at the drag epoch $r_d$ from early universe\nobservations by assuming a cosmological model. We present a recalibration of\ntwo independent BAO datasets, SDSS and DESI, by employing deep learning\ntechniques for model-independent estimation of $r_d$, and explore the impacts\non $\\Lambda$CDM cosmological parameters. Significant reductions in both Hubble\n($H_0$) and clustering ($S_8$) tensions are observed for both the recalibrated\ndatasets. Moderate shifts in some other parameters hint towards further\nexploration of such data-driven approaches.\n","authors":["Rahul Shah","Purba Mukherjee","Soumadeep Saha","Utpal Garain","Supratik Pal"],"pdf_url":"https://arxiv.org/pdf/2412.14750v1.pdf","comment":"5 pages, 2 figures, 2 tables. Comments are welcome"},{"id":"http://arxiv.org/abs/2412.14744v1","updated":"2024-12-19T11:22:52Z","published":"2024-12-19T11:22:52Z","title":"A parametric algorithm is optimal for non-parametric regression of\n smooth functions","summary":" We address the regression problem for a general function $f:[-1,1]^d\\to\n\\mathbb R$ when the learner selects the training points $\\{x_i\\}_{i=1}^n$ to\nachieve a uniform error bound across the entire domain. In this setting, known\nhistorically as nonparametric regression, we aim to establish a sample\ncomplexity bound that depends solely on the function's degree of smoothness.\nAssuming periodicity at the domain boundaries, we introduce PADUA, an algorithm\nthat, with high probability, provides performance guarantees optimal up to\nconstant or logarithmic factors across all problem parameters. Notably, PADUA\nis the first parametric algorithm with optimal sample complexity for this\nsetting. Due to this feature, we prove that, unlike the\nnon-parametric state of the art, PADUA enjoys optimal space complexity in the\nprediction phase. 
To validate these results, we perform numerical experiments\nover functions coming from real audio data, where PADUA shows comparable\nperformance to state-of-the-art methods, while requiring only a fraction of the\ncomputational time.\n","authors":["Davide Maran","Marcello Restelli"],"pdf_url":"https://arxiv.org/pdf/2412.14744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14741v1","updated":"2024-12-19T11:17:31Z","published":"2024-12-19T11:17:31Z","title":"Active Inference and Human--Computer Interaction","summary":" Active Inference is a closed-loop computational theoretical basis for\nunderstanding behaviour, based on agents with internal probabilistic generative\nmodels that encode their beliefs about how hidden states in their environment\ncause their sensations. We review Active Inference and how it could be applied\nto model the human-computer interaction loop. Active Inference provides a\ncoherent framework for managing generative models of humans, their\nenvironments, sensors and interface components. It informs off-line design and\nsupports real-time, online adaptation. It provides model-based explanations for\nbehaviours observed in HCI, and new tools to measure important concepts such as\nagency and engagement. We discuss how Active Inference offers a new basis for a\ntheory of interaction in HCI, tools for design of modern, complex sensor-based\nsystems, and integration of artificial intelligence technologies, enabling it\nto cope with diversity in human users and contexts. We discuss the practical\nchallenges in implementing such Active Inference-based systems.\n","authors":["Roderick Murray-Smith","John H. Williamson","Sebastian Stein"],"pdf_url":"https://arxiv.org/pdf/2412.14741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.12461v2","updated":"2024-12-19T11:16:44Z","published":"2021-02-24T18:41:37Z","title":"MAPFAST: A Deep Algorithm Selector for Multi Agent Path Finding using\n Shortest Path Embeddings","summary":" Solving the Multi-Agent Path Finding (MAPF) problem optimally is known to be\nNP-Hard for both make-span and total arrival time minimization. While many\nalgorithms have been developed to solve MAPF problems, there is no dominating\noptimal MAPF algorithm that works well in all types of problems and no standard\nguidelines for when to use which algorithm. In this work, we develop the deep\nconvolutional network MAPFAST (Multi-Agent Path Finding Algorithm SelecTor),\nwhich takes a MAPF problem instance and attempts to select the fastest\nalgorithm to use from a portfolio of algorithms. We improve the performance of\nour model by including single-agent shortest paths in the instance embedding\ngiven to our model and by utilizing supplemental loss functions in addition to\na classification loss. We evaluate our model on a large and diverse dataset of\nMAPF instances, showing that it outperforms all individual algorithms in its\nportfolio as well as the state-of-the-art optimal MAPF algorithm selector. 
We\nalso provide an analysis of algorithm behavior in our dataset to gain a deeper\nunderstanding of optimal MAPF algorithms' strengths and weaknesses to help\nother researchers leverage different heuristics in algorithm designs.\n","authors":["Jingyao Ren","Vikraman Sathiyanarayanan","Eric Ewing","Baskin Senbaslar","Nora Ayanian"],"pdf_url":"https://arxiv.org/pdf/2102.12461v2.pdf","comment":"To appear in AAMAS-21"},{"id":"http://arxiv.org/abs/2412.14739v1","updated":"2024-12-19T11:15:02Z","published":"2024-12-19T11:15:02Z","title":"On the Use of Deep Learning Models for Semantic Clone Detection","summary":" Detecting and tracking code clones can ease various software development and\nmaintenance tasks when changes in a code fragment should be propagated over all\nits copies. Several deep learning-based clone detection models have appeared in\nthe literature for detecting syntactic and semantic clones, widely evaluated\nwith the BigCloneBench dataset. However, class imbalance and the small number\nof semantic clones make BigCloneBench less ideal for interpreting model\nperformance. Researchers also use other datasets such as GoogleCodeJam,\nOJClone, and SemanticCloneBench to understand model generalizability. To\novercome the limitations of existing datasets, the GPT-assisted semantic and\ncross-language clone dataset GPTCloneBench has been released. However, how\nthese models compare across datasets remains unclear. In this paper, we propose\na multi-step evaluation approach for five state-of-the-art clone detection\nmodels leveraging existing benchmark datasets, including GPTCloneBench, and\nusing mutation operators to study model ability. Specifically, we examine three\nhighly-performing single-language models (ASTNN, GMN, CodeBERT) on\nBigCloneBench, SemanticCloneBench, and GPTCloneBench, testing their robustness\nwith mutation operations. Additionally, we compare them against cross-language\nmodels (C4, CLCDSA) known for detecting semantic clones. While single-language\nmodels show high F1 scores for BigCloneBench, their performance on\nSemanticCloneBench varies (up to 20%). Interestingly, the cross-language model\n(C4) shows superior performance (around 7%) on SemanticCloneBench over other\nmodels and performs similarly on BigCloneBench and GPTCloneBench. On\nmutation-based datasets, C4 has more robust performance (less than 1%\ndifference) compared to single-language models, which show high variability.\n","authors":["Subroto Nag Pinku","Debajyoti Mondal","Chanchal K. Roy"],"pdf_url":"https://arxiv.org/pdf/2412.14739v1.pdf","comment":"Accepted at the 40th IEEE International Conference on Software\n Maintenance and Evolution (ICSME 2024)"},{"id":"http://arxiv.org/abs/2409.11383v2","updated":"2024-12-19T11:12:30Z","published":"2024-09-17T17:34:24Z","title":"Training Datasets Generation for Machine Learning: Application to Vision\n Based Navigation","summary":" Vision Based Navigation consists of using cameras as precision sensors\nfor GNC after extracting information from images. To enable the adoption of\nmachine learning for space applications, one of the obstacles is demonstrating\nthat available training datasets are adequate to validate the algorithms. The\nobjective of the study is to generate datasets of images and metadata suitable\nfor training machine learning algorithms. Two use cases were selected and a\nrobust methodology was developed to validate the datasets including the ground\ntruth. 
The first use case is in-orbit rendezvous with a man-made object: a\nmockup of satellite ENVISAT. The second use case is a Lunar landing scenario.\nDatasets were produced from archival datasets (Chang'e 3), from the laboratory\nat DLR TRON facility and at Airbus Robotic laboratory, from SurRender software\nhigh fidelity image simulator using Model Capture and from Generative\nAdversarial Networks. The use case definition included the selection of\nalgorithms as benchmark: an AI-based pose estimation algorithm and a dense\noptical flow algorithm were selected. Eventually it is demonstrated that\ndatasets produced with SurRender and selected laboratory facilities are\nadequate to train machine learning algorithms.\n","authors":["Jérémy Lebreton","Ingo Ahrns","Roland Brochard","Christoph Haskamp","Hans Krüger","Matthieu Le Goff","Nicolas Menga","Nicolas Ollagnier","Ralf Regele","Francesco Capolupo","Massimo Casasco"],"pdf_url":"https://arxiv.org/pdf/2409.11383v2.pdf","comment":"6 pages, 4 figures, preprint of the proceedings of ESA SPAICE\n conference 2024"},{"id":"http://arxiv.org/abs/2412.14738v1","updated":"2024-12-19T11:10:48Z","published":"2024-12-19T11:10:48Z","title":"Boosting GNN Performance via Training Sample Selection Based on\n Adversarial Robustness Evaluation","summary":" Graph Neural Networks (GNNs) have established themselves as one of the most\npowerful neural network architectures, excelling in leveraging graph topology\nand node features for various tasks. However, GNNs are inherently vulnerable to\nnoise in their inputs. Such noise can significantly degrade their performance.\nTo address this challenge, we propose a novel approach that employs adversarial\nrobustness evaluation techniques to identify nodes in the graph that are most\nsusceptible to noise. By selecting and constructing a training set composed of\nthese particularly noise-prone nodes, we then use them to train a Graph\nConvolutional Network (GCN). Our experimental results demonstrate that this\nstrategy leads to substantial improvements in the GCN's performance.\n","authors":["Yongyu Wang"],"pdf_url":"https://arxiv.org/pdf/2412.14738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10099v2","updated":"2024-12-19T11:06:39Z","published":"2024-04-15T19:15:32Z","title":"Feature selection in linear SVMs via a hard cardinality constraint: a\n scalable SDP decomposition approach","summary":" In this paper, we study the embedded feature selection problem in linear\nSupport Vector Machines (SVMs), in which a cardinality constraint is employed,\nleading to an interpretable classification model. The problem is NP-hard due to\nthe presence of the cardinality constraint, even though the original linear SVM\namounts to a problem solvable in polynomial time. To handle the hard problem,\nwe first introduce two mixed-integer formulations for which novel semidefinite\nrelaxations are proposed. Exploiting the sparsity pattern of the relaxations,\nwe decompose the problems and obtain equivalent relaxations in a much smaller\ncone, making the conic approaches scalable. To make the best usage of the\ndecomposed relaxations, we propose heuristics using the information of its\noptimal solution. 
Moreover, an exact procedure is proposed by solving a\nsequence of mixed-integer decomposed semidefinite optimization problems.\nNumerical results on classical benchmarking datasets are reported, showing the\nefficiency and effectiveness of our approach.\n","authors":["Immanuel Bomze","Federico D'Onofrio","Laura Palagi","Bo Peng"],"pdf_url":"https://arxiv.org/pdf/2404.10099v2.pdf","comment":"Submitted to European Journal of Operational Research. arXiv admin\n note: text overlap with arXiv:1808.02435 by other authors"},{"id":"http://arxiv.org/abs/2309.11036v2","updated":"2024-12-19T10:58:39Z","published":"2023-09-20T03:31:11Z","title":"Scalable Acceleration for Classification-Based Derivative-Free\n Optimization","summary":" Derivative-free optimization algorithms play an important role in scientific\nand engineering design optimization problems, especially when derivative\ninformation is not accessible. In this paper, we study the framework of\nsequential classification-based derivative-free optimization algorithms. By\nintroducing the learning-theoretic concept of the hypothesis-target shattering rate, we\nrevisit the computational complexity upper bound of SRACOS (Hu, Qian, and Yu\n2017). Inspired by the revisited upper bound, we propose an algorithm named\nRACE-CARS, which adds a random region-shrinking step compared with SRACOS. We\nfurther establish theorems showing the acceleration by region shrinking.\nExperiments on synthetic functions as well as black-box tuning for\nlanguage-model-as-a-service empirically demonstrate the efficiency of\nRACE-CARS. An ablation experiment on the introduced hyperparameters is also\nconducted, revealing the mechanism of RACE-CARS and putting forward empirical\nhyper-parameter tuning guidance.\n","authors":["Tianyi Han","Jingya Li","Zhipeng Guo","Yuan Jin"],"pdf_url":"https://arxiv.org/pdf/2309.11036v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14730v1","updated":"2024-12-19T10:56:18Z","published":"2024-12-19T10:56:18Z","title":"Generative AI for Banks: Benchmarks and Algorithms for Synthetic\n Financial Transaction Data","summary":" The banking sector faces challenges in using deep learning due to data\nsensitivity and regulatory constraints, but generative AI may offer a solution.\nThus, this study identifies effective algorithms for generating synthetic\nfinancial transaction data and evaluates five leading models - Conditional\nTabular Generative Adversarial Networks (CTGAN), DoppelGANger (DGAN),\nWasserstein GAN, Financial Diffusion (FinDiff), and Tabular Variational\nAutoEncoders (TVAE) - across five criteria: fidelity, synthesis quality,\nefficiency, privacy, and graph structure. While none of the algorithms is able\nto replicate the real data's graph structure, each excels in specific areas:\nDGAN is ideal for privacy-sensitive tasks, FinDiff and TVAE excel in data\nreplication and augmentation, and CTGAN achieves a balance across all five\ncriteria, making it suitable for general applications with moderate privacy\nconcerns. As a result, our findings offer valuable insights for choosing the\nmost suitable algorithm.\n","authors":["Fabian Sven Karst","Sook-Yee Chong","Abigail A. 
Antenor","Enyu Lin","Mahei Manhai Li","Jan Marco Leimeister"],"pdf_url":"https://arxiv.org/pdf/2412.14730v1.pdf","comment":"Presented at the 34th Workshop on Information Technologies and\n Systems (WITS 2024)"},{"id":"http://arxiv.org/abs/2412.14724v1","updated":"2024-12-19T10:47:31Z","published":"2024-12-19T10:47:31Z","title":"FROC: Building Fair ROC from a Trained Classifier","summary":" This paper considers the problem of fair probabilistic binary classification\nwith binary protected groups. The classifier assigns scores, and a practitioner\npredicts labels using a certain cut-off threshold based on the desired\ntrade-off between false positives vs. false negatives. It derives these\nthresholds from the ROC of the classifier. The resultant classifier may be\nunfair to one of the two protected groups in the dataset. It is desirable that\nno matter what threshold the practitioner uses, the classifier should be fair\nto both the protected groups; that is, the $\\mathcal{L}_p$ norm between FPRs\nand TPRs of both the protected groups should be at most $\\varepsilon$. We call\nsuch fairness on ROCs of both the protected attributes\n$\\varepsilon_p$-Equalized ROC. Given a classifier not satisfying\n$\\varepsilon_1$-Equalized ROC, we aim to design a post-processing method to\ntransform the given (potentially unfair) classifier's output (score) to a\nsuitable randomized yet fair classifier. That is, the resultant classifier must\nsatisfy $\\varepsilon_1$-Equalized ROC. First, we introduce a threshold query\nmodel on the ROC curves for each protected group. The resulting classifier is\nbound to face a reduction in AUC. With the proposed query model, we provide a\nrigorous theoretical analysis of the minimal AUC loss to achieve\n$\\varepsilon_1$-Equalized ROC. To achieve this, we design a linear time\nalgorithm, namely \\texttt{FROC}, to transform a given classifier's output to a\nprobabilistic classifier that satisfies $\\varepsilon_1$-Equalized ROC. We prove\nthat under certain theoretical conditions, \\texttt{FROC}\\ achieves the\ntheoretical optimal guarantees. We also study the performance of our\n\\texttt{FROC}\\ on multiple real-world datasets with many trained classifiers.\n","authors":["Avyukta Manjunatha Vummintala","Shantanu Das","Sujit Gujar"],"pdf_url":"https://arxiv.org/pdf/2412.14724v1.pdf","comment":"51 pages, The 39th Annual AAAI Conference on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2406.02507v3","updated":"2024-12-19T10:43:11Z","published":"2024-06-04T17:25:59Z","title":"Guiding a Diffusion Model with a Bad Version of Itself","summary":" The primary axes of interest in image-generating diffusion models are image\nquality, the amount of variation in the results, and how well the results align\nwith a given condition, e.g., a class label or a text prompt. The popular\nclassifier-free guidance approach uses an unconditional model to guide a\nconditional model, leading to simultaneously better prompt alignment and\nhigher-quality images at the cost of reduced variation. These effects seem\ninherently entangled, and thus hard to control. We make the surprising\nobservation that it is possible to obtain disentangled control over image\nquality without compromising the amount of variation by guiding generation\nusing a smaller, less-trained version of the model itself rather than an\nunconditional model. This leads to significant improvements in ImageNet\ngeneration, setting record FIDs of 1.01 for 64x64 and 1.25 for 512x512, using\npublicly available networks. 
Furthermore, the method is also applicable to\nunconditional diffusion models, drastically improving their quality.\n","authors":["Tero Karras","Miika Aittala","Tuomas Kynkäänniemi","Jaakko Lehtinen","Timo Aila","Samuli Laine"],"pdf_url":"https://arxiv.org/pdf/2406.02507v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.14719v1","updated":"2024-12-19T10:41:24Z","published":"2024-12-19T10:41:24Z","title":"Prototypical Calibrating Ambiguous Samples for Micro-Action Recognition","summary":" Micro-Action Recognition (MAR) has gained increasing attention due to its\ncrucial role as a form of non-verbal communication in social interactions, with\npromising potential for applications in human communication and emotion\nanalysis. However, current approaches often overlook the inherent ambiguity in\nmicro-actions, which arises from the wide category range and subtle visual\ndifferences between categories. This oversight hampers the accuracy of\nmicro-action recognition. In this paper, we propose a novel Prototypical\nCalibrating Ambiguous Network (\\textbf{PCAN}) to unleash and mitigate the\nambiguity of MAR. \\textbf{Firstly}, we employ a hierarchical action-tree to\nidentify the ambiguous sample, categorizing them into distinct sets of\nambiguous samples of false negatives and false positives, considering both\nbody- and action-level categories. \\textbf{Secondly}, we implement an ambiguous\ncontrastive refinement module to calibrate these ambiguous samples by\nregulating the distance between ambiguous samples and their corresponding\nprototypes. This calibration process aims to pull false negative\n($\\mathbb{FN}$) samples closer to their respective prototypes and push false\npositive ($\\mathbb{FP}$) samples apart from their affiliated prototypes. In\naddition, we propose a new prototypical diversity amplification loss to\nstrengthen the model's capacity by amplifying the differences between different\nprototypes. \\textbf{Finally}, we propose a prototype-guided rectification to\nrectify prediction by incorporating the representability of prototypes.\nExtensive experiments conducted on the benchmark dataset demonstrate the\nsuperior performance of our method compared to existing approaches. The code is\navailable at https://github.com/kunli-cs/PCAN.\n","authors":["Kun Li","Dan Guo","Guoliang Chen","Chunxiao Fan","Jingyuan Xu","Zhiliang Wu","Hehe Fan","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2412.14719v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.14718v1","updated":"2024-12-19T10:33:19Z","published":"2024-12-19T10:33:19Z","title":"A Comprehensive Forecasting Framework based on Multi-Stage Hierarchical\n Forecasting Reconciliation and Adjustment","summary":" Ads demand forecasting for Walmart's ad products plays a critical role in\nenabling effective resource planning, allocation, and management of ads\nperformance. In this paper, we introduce a comprehensive demand forecasting\nsystem that tackles hierarchical time series forecasting in business settings.\nThough traditional hierarchical reconciliation methods ensure forecasting\ncoherence, they often trade off accuracy for coherence especially at lower\nlevels and fail to capture the seasonality unique to each time-series in the\nhierarchy. Thus, we propose a novel framework \"Multi-Stage Hierarchical\nForecasting Reconciliation and Adjustment (Multi-Stage HiFoReAd)\" to address\nthe challenges of preserving seasonality, ensuring coherence, and improving\naccuracy. 
Our system first utilizes diverse models, ensembled through Bayesian\nOptimization (BO), achieving base forecasts. The generated base forecasts are\nthen passed into the Multi-Stage HiFoReAd framework. The initial stage refines\nthe hierarchy using Top-Down forecasts and \"harmonic alignment.\" The second\nstage aligns the higher levels' forecasts using the MinTrace algorithm, following\nwhich the last two levels undergo \"harmonic alignment\" and \"stratified\nscaling\", to eventually achieve accurate and coherent forecasts across the\nwhole hierarchy. Our experiments on Walmart's internal Ads-demand dataset and 3\nother public datasets, each with 4 hierarchical levels, demonstrate that the\naverage Absolute Percentage Error from the cross-validation sets improves by\n3% to 40% across levels against the BO-ensemble of models (LGBM, MSTL+ETS, Prophet)\nas well as by 1.2% to 92.9% against State-Of-The-Art models. In addition, the\nforecasts at all hierarchical levels are shown to be coherent. The proposed\nframework has been deployed and leveraged by Walmart's ads, sales and\noperations teams to track future demands, make informed decisions and plan\nresources.\n","authors":["Zhengchao Yang","Mithun Ghosh","Anish Saha","Dong Xu","Konstantin Shmakov","Kuang-chih Lee"],"pdf_url":"https://arxiv.org/pdf/2412.14718v1.pdf","comment":"Published in 2024 IEEE International Conference on Big Data (BigData)"},{"id":"http://arxiv.org/abs/2412.11242v2","updated":"2024-12-19T10:33:13Z","published":"2024-12-15T16:47:16Z","title":"TrimLLM: Progressive Layer Dropping for Domain-Specific LLMs","summary":" Specializing large language models (LLMs) for local deployment in\ndomain-specific use cases is necessary for strong performance while meeting\nlatency and privacy constraints. However, conventional task-specific adaptation\napproaches do not show simultaneous memory saving and inference speedup at\ndeployment time. Practical compression techniques like quantization and pruning\nrequire dedicated hardware or kernel support to achieve measured inference\nspeedup. We develop TrimLLM based on the layer-wise specialization phenomenon\nwe empirically observed and verified on contemporary LLMs. TrimLLM reduces the\ndepth of LLMs via progressive layer dropping. We show it retains LLMs' capacity\nin specific domains and achieves inference speedup irrespective of hardware and\ndeep learning frameworks. We evaluated TrimLLM on LLMs of various sizes for\ninference; models adapted on medical, legal, and financial datasets all\ndemonstrate $2.1-5.7\\times$ inference speedup on consumer GPUs and up to\n$3.1\\times$ speedup on A100 when compared to state-of-the-art model compression\nalgorithms, with no loss in accuracy at 50$\\sim$60\\% model compression ratio.\n","authors":["Lanxiang Hu","Tajana Rosing","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.11242v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14717v1","updated":"2024-12-19T10:31:25Z","published":"2024-12-19T10:31:25Z","title":"Computing Gram Matrix for SMILES Strings using RDKFingerprint and\n Sinkhorn-Knopp Algorithm","summary":" In molecular structure data, SMILES (Simplified Molecular Input Line Entry\nSystem) strings are used to analyze molecular structure design. Numerical\nfeature representation of SMILES strings is a challenging task. This work\nproposes a kernel-based approach for encoding and analyzing molecular\nstructures from SMILES strings. 
The proposed approach involves computing a\nkernel matrix using the Sinkhorn-Knopp algorithm while using kernel principal\ncomponent analysis (PCA) for dimensionality reduction. The resulting\nlow-dimensional embeddings are then used for classification and regression\nanalysis. The kernel matrix is computed by converting the SMILES strings into\nmolecular structures using the Morgan Fingerprint, which computes a fingerprint\nfor each molecule. The distance matrix is computed using the pairwise kernels\nfunction. The Sinkhorn-Knopp algorithm is used to compute the final kernel\nmatrix that satisfies the constraints of a probability distribution. This is\nachieved by iteratively adjusting the kernel matrix until the marginal\ndistributions of the rows and columns match the desired marginal distributions.\nWe provided a comprehensive empirical analysis of the proposed kernel method to\nevaluate its goodness with greater depth. The suggested method is assessed for\ndrug subcategory prediction (classification task) and solubility AlogPS\n``Aqueous solubility and Octanol/Water partition coefficient\" (regression task)\nusing the benchmark SMILES string dataset. The outcomes show the proposed\nmethod outperforms several baseline methods in terms of supervised analysis and\nhas potential uses in molecular design and drug discovery. Overall, the\nsuggested method is a promising avenue for kernel methods-based molecular\nstructure analysis and design.\n","authors":["Sarwan Ali","Haris Mansoor","Prakash Chourasia","Imdad Ullah Khan","Murray Patterson"],"pdf_url":"https://arxiv.org/pdf/2412.14717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14714v1","updated":"2024-12-19T10:25:21Z","published":"2024-12-19T10:25:21Z","title":"Holistic Adversarially Robust Pruning","summary":" Neural networks can be drastically shrunk in size by removing redundant\nparameters. While crucial for the deployment on resource-constraint hardware,\noftentimes, compression comes with a severe drop in accuracy and lack of\nadversarial robustness. Despite recent advances, counteracting both aspects has\nonly succeeded for moderate compression rates so far. We propose a novel\nmethod, HARP, that copes with aggressive pruning significantly better than\nprior work. For this, we consider the network holistically. We learn a global\ncompression strategy that optimizes how many parameters (compression rate) and\nwhich parameters (scoring connections) to prune specific to each layer\nindividually. Our method fine-tunes an existing model with dynamic\nregularization, that follows a step-wise incremental function balancing the\ndifferent objectives. It starts by favoring robustness before shifting focus on\nreaching the target compression rate and only then handles the objectives\nequally. The learned compression strategies allow us to maintain the\npre-trained model natural accuracy and its adversarial robustness for a\nreduction by 99% of the network original size. Moreover, we observe a crucial\ninfluence of non-uniform compression across layers.\n","authors":["Qi Zhao","Christian Wressnegger"],"pdf_url":"https://arxiv.org/pdf/2412.14714v1.pdf","comment":"Accepted by ICLR 2023"},{"id":"http://arxiv.org/abs/2412.14711v1","updated":"2024-12-19T10:21:20Z","published":"2024-12-19T10:21:20Z","title":"ReMoE: Fully Differentiable Mixture-of-Experts with ReLU Routing","summary":" Sparsely activated Mixture-of-Experts (MoE) models are widely adopted to\nscale up model capacity without increasing the computation budget. 
However,\nvanilla TopK routers are trained in a discontinuous, non-differentiable way,\nlimiting their performance and scalability. To address this issue, we propose\nReMoE, a fully differentiable MoE architecture that offers a simple yet\neffective drop-in replacement for the conventional TopK+Softmax routing,\nutilizing ReLU as the router instead. We further propose methods to regulate\nthe router's sparsity while balancing the load among experts. ReMoE's\ncontinuous nature enables efficient dynamic allocation of computation across\ntokens and layers, while also exhibiting domain specialization. Our experiments\ndemonstrate that ReMoE consistently outperforms vanilla TopK-routed MoE across\nvarious model sizes, expert counts, and levels of granularity. Furthermore,\nReMoE exhibits superior scalability with respect to the number of experts,\nsurpassing traditional MoE architectures. The implementation based on\nMegatron-LM is available at https://github.com/thu-ml/ReMoE.\n","authors":["Ziteng Wang","Jianfei Chen","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.14711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11479v2","updated":"2024-12-19T10:21:11Z","published":"2024-08-21T09:44:43Z","title":"Learning Deep Dissipative Dynamics","summary":" This study addresses the challenge of strictly guaranteeing ``dissipativity'' of a dynamical\nsystem represented by neural networks learned from given time-series data.\nDissipativity is a crucial indicator for dynamical systems that generalizes\nstability and input-output stability, known to be valid across various systems\nincluding robotics, biological systems, and molecular dynamics. By analytically\nproving the general solution to the nonlinear Kalman-Yakubovich-Popov (KYP)\nlemma, which is the necessary and sufficient condition for dissipativity, we\npropose a differentiable projection that transforms any dynamics represented by\nneural networks into dissipative ones and a learning method for the transformed\ndynamics. Utilizing the generality of dissipativity, our method strictly\nguarantees stability, input-output stability, and energy conservation of trained\ndynamical systems. Finally, we demonstrate the robustness of our method against\nout-of-domain input through applications to robotic arms and fluid dynamics.\nCode is available at https://github.com/kojima-r/DeepDissipativeModel\n","authors":["Yuji Okamoto","Ryosuke Kojima"],"pdf_url":"https://arxiv.org/pdf/2408.11479v2.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2412.08555v2","updated":"2024-12-19T10:12:38Z","published":"2024-12-11T17:17:02Z","title":"Grimm: A Plug-and-Play Perturbation Rectifier for Graph Neural Networks\n Defending against Poisoning Attacks","summary":" Recent studies have revealed the vulnerability of graph neural networks\n(GNNs) to adversarial poisoning attacks on node classification tasks. Current\ndefensive methods require substituting the original GNNs with defense models,\nregardless of the original's type. This approach, while targeting adversarial\nrobustness, compromises the enhancements developed in prior research to boost\nGNNs' practical performance. Here we introduce Grimm, the first plug-and-play\ndefense model. With just a minimal interface requirement for extracting\nfeatures from any layer of the protected GNNs, Grimm can\nseamlessly rectify perturbations. Specifically, we utilize the feature\ntrajectories (FTs) generated by GNNs, as they evolve through epochs, to reflect\nthe training status of the networks. 
We then theoretically prove that the FTs\nof victim nodes will inevitably exhibit discriminable anomalies. Consequently,\ninspired by the natural parallelism between the biological nervous and immune\nsystems, we construct Grimm, a comprehensive artificial immune system for GNNs.\nGrimm not only detects abnormal FTs and rectifies adversarial edges during\ntraining but also operates efficiently in parallel, thereby mirroring the\nconcurrent functionalities of its biological counterparts. We experimentally\nconfirm that Grimm offers four empirically validated advantages: 1)\nHarmlessness, as it does not actively interfere with GNN training; 2)\nParallelism, ensuring monitoring, detection, and rectification functions\noperate independently of the GNN training process; 3) Generalizability,\ndemonstrating compatibility with mainstream GNNs such as GCN, GAT, and\nGraphSAGE; and 4) Transferability, as the detectors for abnormal FTs can be\nefficiently transferred across different systems for one-step rectification.\n","authors":["Ao Liu","Wenshan Li","Beibei Li","Wengang Ma","Tao Li","Pan Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.08555v2.pdf","comment":"19 pages, 13 figures"},{"id":"http://arxiv.org/abs/2202.06374v5","updated":"2024-12-19T10:12:00Z","published":"2022-02-13T18:04:00Z","title":"Holdouts set for safe predictive model updating","summary":" Predictive risk scores for adverse outcomes are increasingly crucial in\nguiding health interventions. Such scores may need to be periodically updated\ndue to change in the distributions they model. However, directly updating risk\nscores used to guide intervention can lead to biased risk estimates. To address\nthis, we propose updating using a `holdout set' - a subset of the population\nthat does not receive interventions guided by the risk score. Balancing the\nholdout set size is essential to ensure good performance of the updated risk\nscore whilst minimising the number of held out samples. We prove that this\napproach reduces adverse outcome frequency to an asymptotically optimal level\nand argue that often there is no competitive alternative. We describe\nconditions under which an optimal holdout size (OHS) can be readily identified,\nand introduce parametric and semi-parametric algorithms for OHS estimation. We\napply our methods to the ASPRE risk score for pre-eclampsia to recommend a plan\nfor updating it in the presence of change in the underlying data distribution.\nWe show that, in order to minimise the number of pre-eclampsia cases over time,\nthis is best achieved using a holdout set of around 10,000 individuals.\n","authors":["Sami Haidar-Wehbe","Samuel R Emerson","Louis J M Aslett","James Liley"],"pdf_url":"https://arxiv.org/pdf/2202.06374v5.pdf","comment":"Manuscript includes supplementary materials and figures"},{"id":"http://arxiv.org/abs/2412.07675v3","updated":"2024-12-19T10:11:42Z","published":"2024-12-10T17:02:58Z","title":"RAZOR: Sharpening Knowledge by Cutting Bias with Unsupervised Text\n Rewriting","summary":" Despite the widespread use of LLMs due to their superior performance in\nvarious tasks, their high computational costs often lead potential users to opt\nfor the pretraining-finetuning pipeline. However, biases prevalent in manually\nconstructed datasets can introduce spurious correlations between tokens and\nlabels, creating so-called shortcuts and hindering the generalizability of\nfine-tuned models. 
Existing debiasing methods often rely on prior knowledge of\nspecific dataset biases, which is challenging to acquire a priori. We propose\nRAZOR (Rewriting And Zero-bias Optimization Refinement), a novel, unsupervised,\nand data-focused debiasing approach based on text rewriting for shortcut\nmitigation. RAZOR leverages LLMs to iteratively rewrite potentially biased text\nsegments by replacing them with heuristically selected alternatives in a\nshortcut space defined by token statistics and positional information. This\nprocess aims to align surface-level text features more closely with diverse\nlabel distributions, thereby promoting the learning of genuine linguistic\npatterns. Compared with unsupervised SoTA models, RAZOR improves by 3.5% on the\nFEVER and 6.5% on MNLI and SNLI datasets according to the F1 score.\nAdditionally, RAZOR effectively mitigates specific known biases, reducing\nbias-related terms by x2 without requiring prior bias information, a result\nthat is on par with SoTA models that leverage prior information. Our work\nprioritizes data manipulation over architectural modifications, emphasizing the\npivotal role of data quality in enhancing model performance and fairness. This\nresearch contributes to developing more robust evaluation benchmarks for\ndebiasing methods by incorporating metrics for bias reduction and overall model\nefficacy.\n","authors":["Shuo Yang","Bardh Prenkaj","Gjergji Kasneci"],"pdf_url":"https://arxiv.org/pdf/2412.07675v3.pdf","comment":"Shuo and Bardh contributed equally. Accepted to AAAI'25, Paper #17117"},{"id":"http://arxiv.org/abs/2412.14701v1","updated":"2024-12-19T10:10:57Z","published":"2024-12-19T10:10:57Z","title":"Taming the Memory Beast: Strategies for Reliable ML Training on\n Kubernetes","summary":" Kubernetes offers a powerful orchestration platform for machine learning\ntraining, but memory management can be challenging due to specialized needs and\nresource constraints. This paper outlines how Kubernetes handles memory\nrequests, limits, Quality of Service classes, and eviction policies for ML\nworkloads, with special focus on GPU memory and ephemeral storage. Common\npitfalls such as overcommitment, memory leaks, and ephemeral volume exhaustion\nare examined. We then provide best practices for stable, scalable memory\nutilization to help ML practitioners prevent out-of-memory events and ensure\nhigh-performance ML training pipelines.\n","authors":["Jaideep Ray"],"pdf_url":"https://arxiv.org/pdf/2412.14701v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2412.08160v4","updated":"2024-12-19T10:01:27Z","published":"2024-12-11T07:32:38Z","title":"DG-Mamba: Robust and Efficient Dynamic Graph Structure Learning with\n Selective State Space Models","summary":" Dynamic graphs exhibit intertwined spatio-temporal evolutionary patterns,\nwidely existing in the real world. Nevertheless, the structure incompleteness,\nnoise, and redundancy result in poor robustness for Dynamic Graph Neural\nNetworks (DGNNs). Dynamic Graph Structure Learning (DGSL) offers a promising\nway to optimize graph structures. However, aside from encountering unacceptable\nquadratic complexity, it overly relies on heuristic priors, making it hard to\ndiscover underlying predictive patterns. How to efficiently refine the dynamic\nstructures, capture intrinsic dependencies, and learn robust representations,\nremains under-explored. 
In this work, we propose the novel DG-Mamba, a robust\nand efficient Dynamic Graph structure learning framework with Selective\nState Space Models (Mamba). To accelerate the spatio-temporal structure\nlearning, we propose a kernelized dynamic message-passing operator that reduces\nthe quadratic time complexity to linear. To capture global intrinsic dynamics,\nwe establish the dynamic graph as a self-contained system with a State Space\nModel. By discretizing the system states with the cross-snapshot graph\nadjacency, we enable long-distance dependency capturing with the\nselective snapshot scan. To make the learned dynamic structures more expressive\nand informative, we propose the self-supervised Principle of Relevant\nInformation for DGSL to regularize the most relevant yet least redundant\ninformation, enhancing global robustness. Extensive experiments demonstrate the\nsuperior robustness and efficiency of our DG-Mamba compared with the\nstate-of-the-art baselines under adversarial attacks.\n","authors":["Haonan Yuan","Qingyun Sun","Zhaonan Wang","Xingcheng Fu","Cheng Ji","Yongjian Wang","Bo Jin","Jianxin Li"],"pdf_url":"https://arxiv.org/pdf/2412.08160v4.pdf","comment":"Accepted by the Main Technical Track of the 39th Annual AAAI\n Conference on Artificial Intelligence (AAAI-2025)"},{"id":"http://arxiv.org/abs/2412.14695v1","updated":"2024-12-19T09:56:01Z","published":"2024-12-19T09:56:01Z","title":"Lorentzian Residual Neural Networks","summary":" Hyperbolic neural networks have emerged as a powerful tool for modeling\nhierarchical data structures prevalent in real-world datasets. Notably,\nresidual connections, which facilitate the direct flow of information across\nlayers, have been instrumental in the success of deep neural networks. However,\ncurrent methods for constructing hyperbolic residual networks suffer from\nlimitations such as increased model complexity, numerical instability, and\nerrors due to multiple mappings to and from the tangent space. To address these\nlimitations, we introduce LResNet, a novel Lorentzian residual neural network\nbased on the weighted Lorentzian centroid in the Lorentz model of hyperbolic\ngeometry. Our method enables the efficient integration of residual connections\nin Lorentz hyperbolic neural networks while preserving their hierarchical\nrepresentation capabilities. We demonstrate that our method can theoretically\nderive previous methods while offering improved stability, efficiency, and\neffectiveness. Extensive experiments on both graph and vision tasks showcase\nthe superior performance and robustness of our method compared to\nstate-of-the-art Euclidean and hyperbolic alternatives. 
Our findings highlight\nthe potential of LResNet for building more expressive neural networks in\nhyperbolic embedding space as a generally applicable method across multiple\narchitectures, including CNNs, GNNs, and graph Transformers.\n","authors":["Neil He","Menglin Yang","Rex Ying"],"pdf_url":"https://arxiv.org/pdf/2412.14695v1.pdf","comment":"12 pages, 3 figures, KDD 2025"},{"id":"http://arxiv.org/abs/2410.05016v2","updated":"2024-12-19T09:49:25Z","published":"2024-10-07T13:15:07Z","title":"T-JEPA: Augmentation-Free Self-Supervised Learning for Tabular Data","summary":" Self-supervision is often used for pre-training to foster performance on a\ndownstream task by constructing meaningful representations of samples.\nSelf-supervised learning (SSL) generally involves generating different views of\nthe same sample and thus requires data augmentations that are challenging to\nconstruct for tabular data. This constitutes one of the main challenges of\nself-supervision for structured data. In the present work, we propose a novel\naugmentation-free SSL method for tabular data. Our approach, T-JEPA, relies on\na Joint Embedding Predictive Architecture (JEPA) and is akin to mask\nreconstruction in the latent space. It involves predicting the latent\nrepresentation of one subset of features from the latent representation of a\ndifferent subset within the same sample, thereby learning rich representations\nwithout augmentations. We use our method as a pre-training technique and train\nseveral deep classifiers on the obtained representation. Our experimental\nresults demonstrate a substantial improvement in both classification and\nregression tasks, outperforming models trained directly on samples in their\noriginal data space. Moreover, T-JEPA enables some methods to consistently\noutperform or match the performance of traditional methods like Gradient\nBoosted Decision Trees. To understand why, we extensively characterize the\nobtained representations and show that T-JEPA effectively identifies relevant\nfeatures for downstream tasks without access to the labels. Additionally, we\nintroduce regularization tokens, a novel regularization method critical for\ntraining of JEPA-based models on structured data.\n","authors":["Hugo Thimonier","José Lucas De Melo Costa","Fabrice Popineau","Arpad Rimmel","Bich-Liên Doan"],"pdf_url":"https://arxiv.org/pdf/2410.05016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14689v1","updated":"2024-12-19T09:43:39Z","published":"2024-12-19T09:43:39Z","title":"How to Synthesize Text Data without Model Collapse?","summary":" Model collapse in synthetic data indicates that iterative training on\nself-generated data leads to a gradual decline in performance. With the\nproliferation of AI models, synthetic data will fundamentally reshape the web\ndata ecosystem. Future GPT-$\\{n\\}$ models will inevitably be trained on a blend\nof synthetic and human-produced data. In this paper, we focus on two questions:\nwhat is the impact of synthetic data on language model training, and how to\nsynthesize data without model collapse? We first pre-train language models\nacross different proportions of synthetic data, revealing a negative\ncorrelation between the proportion of synthetic data and model performance. We\nfurther conduct statistical analysis on synthetic data to uncover the\ndistributional shift phenomenon and over-concentration of n-gram features.\nInspired by the above findings, we propose token editing on human-produced data\nto obtain semi-synthetic data. 
As a proof of concept, we theoretically\ndemonstrate that token-level editing can prevent model collapse, as the test\nerror is constrained by a finite upper bound. We conduct extensive experiments\non pre-training from scratch, continual pre-training, and supervised\nfine-tuning. The results validate our theoretical proof that token-level\nediting improves data quality and enhances model performance.\n","authors":["Xuekai Zhu","Daixuan Cheng","Hengli Li","Kaiyan Zhang","Ermo Hua","Xingtai Lv","Ning Ding","Zhouhan Lin","Zilong Zheng","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.14689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12327v2","updated":"2024-12-19T09:34:30Z","published":"2024-12-16T19:54:57Z","title":"Leveraging Group Classification with Descending Soft Labeling for Deep\n Imbalanced Regression","summary":" Deep imbalanced regression (DIR), where the target values have a highly\nskewed distribution and are also continuous, is an intriguing yet\nunder-explored problem in machine learning.\n While recent works have already shown that incorporating various\nclassification-based regularizers can produce enhanced outcomes, the role of\nclassification remains elusive in DIR.\n Moreover, such regularizers (e.g., contrastive penalties) merely focus on\nlearning discriminative features of data, which inevitably results in ignorance\nof either continuity or similarity across the data.\n To address these issues, we first bridge the connection between the\nobjectives of DIR and classification from a Bayesian perspective.\n Consequently, this motivates us to decompose the objective of DIR into a\ncombination of classification and regression tasks, which naturally guides us\ntoward a divide-and-conquer manner to solve the DIR problem.\n Specifically, by aggregating the data at nearby labels into the same groups,\nwe introduce an ordinal group-aware contrastive learning loss along with a\nmulti-experts regressor to tackle the different groups of data thereby\nmaintaining the data continuity.\n Meanwhile, considering the similarity between the groups, we also propose a\nsymmetric descending soft labeling strategy to exploit the intrinsic similarity\nacross the data, which allows classification to facilitate regression more\neffectively.\n Extensive experiments on real-world datasets also validate the effectiveness\nof our method.\n","authors":["Ruizhi Pu","Gezheng Xu","Ruiyi Fang","Binkun Bao","Charles X. Ling","Boyu Wang"],"pdf_url":"https://arxiv.org/pdf/2412.12327v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06401v2","updated":"2024-12-19T09:30:05Z","published":"2024-08-12T12:09:25Z","title":"Langevin dynamics for high-dimensional optimization: the case of\n multi-spiked tensor PCA","summary":" We study nonconvex optimization in high dimensions through Langevin dynamics,\nfocusing on the multi-spiked tensor PCA problem. This tensor estimation problem\ninvolves recovering $r$ hidden signal vectors (spikes) from noisy Gaussian\ntensor observations using maximum likelihood estimation. We study the number of\nsamples required for Langevin dynamics to efficiently recover the spikes and\ndetermine the necessary separation condition on the signal-to-noise ratios\n(SNRs) for exact recovery, distinguishing the cases $p \\ge 3$ and $p=2$, where\n$p$ denotes the order of the tensor. 
In particular, we show that the sample\ncomplexity required for recovering the spike associated with the largest SNR\nmatches the well-known algorithmic threshold for the single-spike case, while\nthis threshold degrades when recovering all $r$ spikes. As a key step, we\nprovide a detailed characterization of the trajectory and interactions of\nlow-dimensional projections that capture the high-dimensional dynamics.\n","authors":["Gérard Ben Arous","Cédric Gerbelot","Vanessa Piccolo"],"pdf_url":"https://arxiv.org/pdf/2408.06401v2.pdf","comment":"65 pages"},{"id":"http://arxiv.org/abs/2412.06926v3","updated":"2024-12-19T09:24:39Z","published":"2024-12-09T19:11:54Z","title":"When Every Token Counts: Optimal Segmentation for Low-Resource Language\n Models","summary":" Traditional greedy tokenization methods have been a critical step in Natural\nLanguage Processing (NLP), influencing how text is converted into tokens and\ndirectly impacting model performance. While subword tokenizers like Byte-Pair\nEncoding (BPE) are widely used, questions remain about their optimality across\nmodel scales and languages. In this work, we demonstrate through extensive\nexperiments that an optimal BPE configuration significantly reduces token count\ncompared to greedy segmentation, yielding improvements in token-saving\npercentages and performance benefits, particularly for smaller models. We\nevaluate tokenization performance across various intrinsic and extrinsic tasks,\nincluding generation and classification. Our findings suggest that\ncompression-optimized tokenization strategies could provide substantial\nadvantages for multilingual and low-resource language applications,\nhighlighting a promising direction for further research and inclusive NLP.\n","authors":["Bharath Raj S","Garvit Suri","Vikrant Dewangan","Raghav Sonavane"],"pdf_url":"https://arxiv.org/pdf/2412.06926v3.pdf","comment":"LoResLM @ COLING 2025"},{"id":"http://arxiv.org/abs/2412.14668v1","updated":"2024-12-19T09:20:27Z","published":"2024-12-19T09:20:27Z","title":"LoLaFL: Low-Latency Federated Learning via Forward-only Propagation","summary":" Federated learning (FL) has emerged as a widely adopted paradigm for enabling\nedge learning with distributed data while ensuring data privacy. However, the\ntraditional FL with deep neural networks trained via backpropagation can hardly\nmeet the low-latency learning requirements in the sixth generation (6G) mobile\nnetworks. This challenge mainly arises from the high-dimensional model\nparameters to be transmitted and the numerous rounds of communication required\nfor convergence due to the inherent randomness of the training process. To\naddress this issue, we adopt the state-of-the-art principle of maximal coding\nrate reduction to learn linear discriminative features and extend the resultant\nwhite-box neural network into FL, yielding the novel framework of Low-Latency\nFederated Learning (LoLaFL) via forward-only propagation. LoLaFL enables\nlayer-wise transmissions and aggregation with significantly fewer communication\nrounds, thereby considerably reducing latency. Additionally, we propose two\n\\emph{nonlinear} aggregation schemes for LoLaFL. The first scheme is based on\nthe proof that the optimal NN parameter aggregation in LoLaFL should be\nharmonic-mean-like. The second scheme further exploits the low-rank structures\nof the features and transmits the low-rank-approximated covariance matrices of\nfeatures to achieve additional latency reduction. 
Theoretic analysis and\nexperiments are conducted to evaluate the performance of LoLaFL. In comparison\nwith traditional FL, the two nonlinear aggregation schemes for LoLaFL can\nachieve reductions in latency of over 91\\% and 98\\%, respectively, while\nmaintaining comparable accuracies.\n","authors":["Jierui Zhang","Jianhao Huang","Kaibin Huang"],"pdf_url":"https://arxiv.org/pdf/2412.14668v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2111.02363v5","updated":"2024-12-19T09:18:11Z","published":"2021-11-03T17:30:43Z","title":"Deep Learning-based Non-Intrusive Multi-Objective Speech Assessment\n Model with Cross-Domain Features","summary":" In this study, we propose a cross-domain multi-objective speech assessment\nmodel called MOSA-Net, which can estimate multiple speech assessment metrics\nsimultaneously. Experimental results show that MOSA-Net can improve the linear\ncorrelation coefficient (LCC) by 0.026 (0.990 vs 0.964 in seen noise\nenvironments) and 0.012 (0.969 vs 0.957 in unseen noise environments) in\nperceptual evaluation of speech quality (PESQ) prediction, compared to\nQuality-Net, an existing single-task model for PESQ prediction, and improve LCC\nby 0.021 (0.985 vs 0.964 in seen noise environments) and 0.047 (0.836 vs 0.789\nin unseen noise environments) in short-time objective intelligibility (STOI)\nprediction, compared to STOI-Net (based on CRNN), an existing single-task model\nfor STOI prediction. Moreover, MOSA-Net, originally trained to assess objective\nscores, can be used as a pre-trained model to be effectively adapted to an\nassessment model for predicting subjective quality and intelligibility scores\nwith a limited amount of training data. Experimental results show that MOSA-Net\ncan improve LCC by 0.018 (0.805 vs 0.787) in mean opinion score (MOS)\nprediction, compared to MOS-SSL, a strong single-task model for MOS prediction.\nIn light of the confirmed prediction capability, we further adopt the latent\nrepresentations of MOSA-Net to guide the speech enhancement (SE) process and\nderive a quality-intelligibility (QI)-aware SE (QIA-SE) approach accordingly.\nExperimental results show that QIA-SE provides superior enhancement performance\ncompared with the baseline SE system in terms of objective evaluation metrics\nand qualitative evaluation test. For example, QIA-SE can improve PESQ by 0.301\n(2.953 vs 2.652 in seen noise environments) and 0.18 (2.658 vs 2.478 in unseen\nnoise environments) over a CNN-based baseline SE model.\n","authors":["Ryandhimas E. Zezario","Szu-Wei Fu","Fei Chen","Chiou-Shann Fuh","Hsin-Min Wang","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2111.02363v5.pdf","comment":"Accepted by IEEE/ACM Transactions on Audio, Speech, and Language\n Processing (TASLP), vol. 31, pp. 54-70, 2023"},{"id":"http://arxiv.org/abs/2412.14660v1","updated":"2024-12-19T09:10:07Z","published":"2024-12-19T09:10:07Z","title":"Unveiling Uncertainty: A Deep Dive into Calibration and Performance of\n Multimodal Large Language Models","summary":" Multimodal large language models (MLLMs) combine visual and textual data for\ntasks such as image captioning and visual question answering. Proper\nuncertainty calibration is crucial, yet challenging, for reliable use in areas\nlike healthcare and autonomous driving. This paper investigates representative\nMLLMs, focusing on their calibration across various scenarios, including before\nand after visual fine-tuning, as well as before and after multimodal training\nof the base LLMs. 
We observed miscalibration in their performance, and at the\nsame time, no significant differences in calibration across these scenarios. We\nalso highlight how uncertainty differs between text and images and how their\nintegration affects overall uncertainty. To better understand MLLMs'\nmiscalibration and their ability to self-assess uncertainty, we construct the\nIDK (I don't know) dataset, which is key to evaluating how they handle\nunknowns. Our findings reveal that MLLMs tend to give answers rather than admit\nuncertainty, but this self-assessment improves with proper prompt adjustments.\nFinally, to calibrate MLLMs and enhance model reliability, we propose\ntechniques such as temperature scaling and iterative prompt optimization. Our\nresults provide insights into improving MLLMs for effective and responsible\ndeployment in multimodal applications. Code and IDK dataset:\n\\href{https://github.com/hfutml/Calibration-MLLM}{https://github.com/hfutml/Calibration-MLLM}.\n","authors":["Zijun Chen","Wenbo Hu","Guande He","Zhijie Deng","Zheng Zhang","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2412.14660v1.pdf","comment":"Accepted to COLING 2025"},{"id":"http://arxiv.org/abs/2412.14655v1","updated":"2024-12-19T09:06:39Z","published":"2024-12-19T09:06:39Z","title":"Trainable Adaptive Activation Function Structure (TAAFS) Enhances Neural\n Network Force Field Performance with Only Dozens of Additional Parameters","summary":" At the heart of neural network force fields (NNFFs) is the architecture of\nneural networks, where the capacity to model complex interactions is typically\nenhanced through widening or deepening multilayer perceptrons (MLPs) or by\nincreasing layers of graph neural networks (GNNs). These enhancements, while\nimproving the model's performance, often come at the cost of a substantial\nincrease in the number of parameters. By applying the Trainable Adaptive\nActivation Function Structure (TAAFS), we introduce a method that selects\ndistinct mathematical formulations for non-linear activations, thereby\nincreasing the precision of NNFFs with an insignificant addition to the\nparameter count. In this study, we integrate TAAFS into a variety of neural\nnetwork models, resulting in observed accuracy improvements, and further\nvalidate these enhancements through molecular dynamics (MD) simulations using\nDeepMD.\n","authors":["Enji Li"],"pdf_url":"https://arxiv.org/pdf/2412.14655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15031v2","updated":"2024-12-19T09:00:34Z","published":"2024-03-22T08:26:31Z","title":"Image Classification with Rotation-Invariant Variational Quantum\n Circuits","summary":" Variational quantum algorithms are gaining attention as an early application\nof Noisy Intermediate-Scale Quantum (NISQ) devices. One of the main problems of\nvariational methods lies in the phenomenon of Barren Plateaus, present in the\noptimization of variational parameters. Adding geometric inductive bias to the\nquantum models has been proposed as a potential solution to mitigate this\nproblem, leading to a new field called Geometric Quantum Machine Learning. In\nthis work, an equivariant architecture for variational quantum classifiers is\nintroduced to create a label-invariant model for image classification with\n$C_4$ rotational label symmetry. The equivariant circuit is benchmarked against\ntwo different architectures, and it is experimentally observed that the\ngeometric approach boosts the model's performance. 
Finally, a classical\nequivariant convolution operation is proposed to extend the quantum model for\nthe processing of larger images, employing the resources available in NISQ\ndevices.\n","authors":["Paul San Sebastian","Mikel Cañizo","Román Orús"],"pdf_url":"https://arxiv.org/pdf/2403.15031v2.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2412.14650v1","updated":"2024-12-19T08:59:49Z","published":"2024-12-19T08:59:49Z","title":"Permutation recovery of spikes in noisy high-dimensional tensor\n estimation","summary":" We study the dynamics of gradient flow in high dimensions for the\nmulti-spiked tensor problem, where the goal is to estimate $r$ unknown signal\nvectors (spikes) from noisy Gaussian tensor observations. Specifically, we\nanalyze the maximum likelihood estimation procedure, which involves optimizing\na highly nonconvex random function. We determine the sample complexity required\nfor gradient flow to efficiently recover all spikes, without imposing any\nassumptions on the separation of the signal-to-noise ratios (SNRs). More\nprecisely, our results provide the sample complexity required to guarantee\nrecovery of the spikes up to a permutation. Our work builds on our companion\npaper [Ben Arous, Gerbelot, Piccolo 2024], which studies Langevin dynamics and\ndetermines the sample complexity and separation conditions for the SNRs\nnecessary for ensuring exact recovery of the spikes (where the recovered\npermutation matches the identity). During the recovery process, the\ncorrelations between the estimators and the hidden vectors increase in a\nsequential manner. The order in which these correlations become significant\ndepends on their initial values and the corresponding SNRs, which ultimately\ndetermines the permutation of the recovered spikes.\n","authors":["Gérard Ben Arous","Cédric Gerbelot","Vanessa Piccolo"],"pdf_url":"https://arxiv.org/pdf/2412.14650v1.pdf","comment":"29 pages, 2 figures. arXiv admin note: substantial text overlap with\n arXiv:2408.06401"},{"id":"http://arxiv.org/abs/2406.16606v2","updated":"2024-12-19T08:53:52Z","published":"2024-06-24T12:46:16Z","title":"Cherry on the Cake: Fairness is NOT an Optimization Problem","summary":" In Fair AI literature, the practice of maliciously creating unfair models\nthat nevertheless satisfy fairness constraints is known as \"cherry-picking\". A\ncherry-picking model is a model that makes mistakes on purpose, selecting bad\nindividuals from a minority class instead of better candidates from the same\nminority. The model literally cherry-picks whom to select to superficially meet\nthe fairness constraints while making minimal changes to the unfair model. This\npractice has been described as \"blatantly unfair\" and has a negative impact on\nalready marginalized communities, undermining the intended purpose of fairness\nmeasures specifically designed to protect these communities. A common\nassumption is that cherry-picking arises solely from malicious intent and that\nmodels designed only to optimize fairness metrics would avoid this behavior. We\nshow that this is not the case: models optimized to minimize fairness metrics\nwhile maximizing performance are often forced to cherry-pick to some degree. In\nother words, cherry-picking might be an inevitable outcome of the optimization\nprocess itself. To demonstrate this, we use tools from fair cake-cutting, a\nmathematical subfield that studies the problem of fairly dividing a resource,\nreferred to as the \"cake,\" among a number of participants. 
This concept is\nconnected to supervised multi-label classification: any dataset can be thought\nof as a cake that needs to be distributed among different labels, and the model\nis the function that divides the cake. We adapt these classical results for\nmachine learning and demonstrate how this connection can be prolifically used\nfor fairness and classification in general.\n","authors":["Marco Favier","Toon Calders"],"pdf_url":"https://arxiv.org/pdf/2406.16606v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14640v1","updated":"2024-12-19T08:51:01Z","published":"2024-12-19T08:51:01Z","title":"Adaptive Prompt Tuning: Vision Guided Prompt Tuning with Cross-Attention\n for Fine-Grained Few-Shot Learning","summary":" Few-shot, fine-grained classification in computer vision poses significant\nchallenges due to the need to differentiate subtle class distinctions with\nlimited data. This paper presents a novel method that enhances the Contrastive\nLanguage-Image Pre-Training (CLIP) model through adaptive prompt tuning, guided\nby real-time visual inputs. Unlike existing techniques such as Context\nOptimization (CoOp) and Visual Prompt Tuning (VPT), which are constrained by\nstatic prompts or visual token reliance, the proposed approach leverages a\ncross-attention mechanism to dynamically refine text prompts for the image at\nhand. This enables an image-specific alignment of textual features with image\npatches extracted from the Vision Transformer, making the model more effective\nfor datasets with high intra-class variance and low inter-class differences.\nThe method is evaluated on several datasets, including CUBirds, Oxford Flowers,\nand FGVC Aircraft, showing significant performance gains over static prompt\ntuning approaches. To ensure these performance gains translate into trustworthy\npredictions, we integrate Monte-Carlo Dropout in our approach to improve the\nreliability of the model predictions and uncertainty estimates. This\nintegration provides valuable insights into the model's predictive confidence,\nhelping to identify when predictions can be trusted and when additional\nverification is necessary. This dynamic approach offers a robust solution,\nadvancing the state-of-the-art for few-shot fine-grained classification.\n","authors":["Eric Brouwer","Jan Erik van Woerden","Gertjan Burghouts","Matias Valedenegro-Toro","Marco Zullich"],"pdf_url":"https://arxiv.org/pdf/2412.14640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14492v2","updated":"2024-12-19T08:46:02Z","published":"2024-05-23T12:25:22Z","title":"Iterative Methods for Full-Scale Gaussian Process Approximations for\n Large Spatial Data","summary":" Gaussian processes are flexible probabilistic regression models which are\nwidely used in statistics and machine learning. However, a drawback is their\nlimited scalability to large data sets. To alleviate this, we consider\nfull-scale approximations (FSAs) that combine predictive process methods and\ncovariance tapering, thus approximating both global and local structures. We\nshow how iterative methods can be used to reduce the computational costs for\ncalculating likelihoods, gradients, and predictive distributions with FSAs. 
We\nintroduce a novel preconditioner and show that it accelerates the conjugate\ngradient method's convergence speed and mitigates its sensitivity with respect\nto the FSA parameters and the eigenvalue structure of the original covariance\nmatrix, and we demonstrate empirically that it outperforms a state-of-the-art\npivoted Cholesky preconditioner. Further, we present a novel, accurate, and\nfast way to calculate predictive variances relying on stochastic estimations\nand iterative methods. In both simulated and real-world data experiments, we\nfind that our proposed methodology achieves the same accuracy as Cholesky-based\ncomputations with a substantial reduction in computational time. Finally, we\nalso compare different approaches for determining inducing points in predictive\nprocess and FSA models. All methods are implemented in a free C++ software\nlibrary with high-level Python and R packages.\n","authors":["Tim Gyger","Reinhard Furrer","Fabio Sigrist"],"pdf_url":"https://arxiv.org/pdf/2405.14492v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16149v3","updated":"2024-12-19T08:34:15Z","published":"2024-03-24T13:43:43Z","title":"Analyzing Consumer IoT Traffic from Security and Privacy Perspectives: a\n Comprehensive Survey","summary":" The Consumer Internet of Things (CIoT), a notable segment within the IoT\ndomain, involves the integration of IoT technology into consumer electronics\nand devices, such as smart homes and smart wearables. Compared to traditional\nIoT fields, CIoT differs notably in target users, product types, and design\napproaches. While offering convenience to users, it also raises new security\nand privacy concerns. Network traffic analysis, a widely used technique in the\nsecurity community, has been extensively applied to investigate these concerns\nabout CIoT. Compared to network traffic analysis in other fields such as mobile\napps and websites, CIoT presents unique characteristics, introducing new\nchallenges and research opportunities. Researchers have made significant\ncontributions in this area. To aid researchers in understanding the application\nof traffic analysis tools for studying CIoT security and privacy risks, this\nsurvey reviews 303 publications on traffic analysis within the CIoT security\nand privacy domain from January 2018 to June 2024, focusing on three research\nquestions. Our work: 1) outlines the CIoT traffic analysis process and\nhighlights its differences from general network traffic analysis. 2) summarizes\nand classifies existing research into four categories according to its\napplication objectives: device fingerprinting, user activity inference,\nmalicious traffic detection, and measurement. 3) explores emerging challenges\nand potential future research directions based on each step of the CIoT traffic\nanalysis process. 
This will provide new insights to the community and guide the\nindustry towards safer product designs.\n","authors":["Yan Jia","Yuxin Song","Zihou Liu","Qingyin Tan","Yang Song","Yu Zhang","Zheli Liu"],"pdf_url":"https://arxiv.org/pdf/2403.16149v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14629v1","updated":"2024-12-19T08:31:42Z","published":"2024-12-19T08:31:42Z","title":"Robust PCA Based on Adaptive Weighted Least Squares and Low-Rank Matrix\n Factorization","summary":" Robust Principal Component Analysis (RPCA) is a fundamental technique for\ndecomposing data into low-rank and sparse components, which plays a critical\nrole for applications such as image processing and anomaly detection.\nTraditional RPCA methods commonly use $\\ell_1$ norm regularization to enforce\nsparsity, but this approach can introduce bias and result in suboptimal\nestimates, particularly in the presence of significant noise or outliers.\nNon-convex regularization methods have been proposed to mitigate these\nchallenges, but they tend to be complex to optimize and sensitive to initial\nconditions, leading to potential instability in solutions. To overcome these\nchallenges, in this paper, we propose a novel RPCA model that integrates\nadaptive weighted least squares (AWLS) and low-rank matrix factorization\n(LRMF). The model employs a {self-attention-inspired} mechanism in its weight\nupdate process, allowing the weight matrix to dynamically adjust and emphasize\nsignificant components during each iteration. By employing a weighted F-norm\nfor the sparse component, our method effectively reduces bias while simplifying\nthe computational process compared to traditional $\\ell_1$-norm-based methods.\nWe use an alternating minimization algorithm, where each subproblem has an\nexplicit solution, thereby improving computational efficiency. Despite its\nsimplicity, numerical experiments demonstrate that our method outperforms\nexisting non-convex regularization approaches, offering superior performance\nand stability, as well as enhanced accuracy and robustness in practical\napplications.\n","authors":["Kexin Li","You-wei Wen","Xu Xiao","Mingchao Zhao"],"pdf_url":"https://arxiv.org/pdf/2412.14629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14628v1","updated":"2024-12-19T08:30:54Z","published":"2024-12-19T08:30:54Z","title":"Qua$^2$SeDiMo: Quantifiable Quantization Sensitivity of Diffusion Models","summary":" Diffusion Models (DM) have democratized AI image generation through an\niterative denoising process. Quantization is a major technique to alleviate the\ninference cost and reduce the size of DM denoiser networks. However, as\ndenoisers evolve from variants of convolutional U-Nets toward newer Transformer\narchitectures, it is of growing importance to understand the quantization\nsensitivity of different weight layers, operations and architecture types to\nperformance. In this work, we address this challenge with Qua$^2$SeDiMo, a\nmixed-precision Post-Training Quantization framework that generates explainable\ninsights on the cost-effectiveness of various model weight quantization methods\nfor different denoiser operation types and block structures. We leverage these\ninsights to make high-quality mixed-precision quantization decisions for a\nmyriad of diffusion models ranging from foundational U-Nets to state-of-the-art\nTransformers. 
As a result, Qua$^2$SeDiMo can construct 3.4-bit, 3.9-bit,\n3.65-bit and 3.7-bit weight quantization on PixArt-${\\alpha}$,\nPixArt-${\\Sigma}$, Hunyuan-DiT and SDXL, respectively. We further pair our\nweight-quantization configurations with 6-bit activation quantization and\noutperform existing approaches in terms of quantitative metrics and generative\nimage quality.\n","authors":["Keith G. Mills","Mohammad Salameh","Ruichen Chen","Negar Hassanpour","Wei Lu","Di Niu"],"pdf_url":"https://arxiv.org/pdf/2412.14628v1.pdf","comment":"AAAI 2025; version includes supplementary material; 22 Pages, 18\n Figures, 8 Tables"},{"id":"http://arxiv.org/abs/2302.09526v4","updated":"2024-12-19T08:22:30Z","published":"2023-02-19T09:55:18Z","title":"Mixed Semi-Supervised Generalized-Linear-Regression with Applications to\n Deep-Learning and Interpolators","summary":" We present a methodology for using unlabeled data to design semi supervised\nlearning (SSL) methods that improve the prediction performance of supervised\nlearning for regression tasks. The main idea is to design different mechanisms\nfor integrating the unlabeled data, and include in each of them a mixing\nparameter $\\alpha$, controlling the weight given to the unlabeled data.\nFocusing on Generalized Linear Models (GLM) and linear interpolators classes of\nmodels, we analyze the characteristics of different mixing mechanisms, and\nprove that in all cases, it is invariably beneficial to integrate the unlabeled\ndata with some nonzero mixing ratio $\\alpha>0$, in terms of predictive\nperformance. Moreover, we provide a rigorous framework to estimate the best\nmixing ratio $\\alpha^*$ where mixed SSL delivers the best predictive\nperformance, while using the labeled and unlabeled data on hand.\n The effectiveness of our methodology in delivering substantial improvement\ncompared to the standard supervised models, in a variety of settings, is\ndemonstrated empirically through extensive simulation, in a manner that\nsupports the theoretical analysis. We also demonstrate the applicability of our\nmethodology (with some intuitive modifications) to improve more complex models,\nsuch as deep neural networks, in real-world regression tasks.\n","authors":["Oren Yuval","Saharon Rosset"],"pdf_url":"https://arxiv.org/pdf/2302.09526v4.pdf","comment":"58 pages, 10 figures"},{"id":"http://arxiv.org/abs/2412.14031v2","updated":"2024-12-19T08:21:15Z","published":"2024-12-18T16:51:47Z","title":"Gauss-Newton Dynamics for Neural Networks: A Riemannian Optimization\n Perspective","summary":" We analyze the convergence of Gauss-Newton dynamics for training neural\nnetworks with smooth activation functions. In the underparameterized regime,\nthe Gauss-Newton gradient flow induces a Riemannian gradient flow on a\nlow-dimensional, smooth, embedded submanifold of the Euclidean output space.\nUsing tools from Riemannian optimization, we prove \\emph{last-iterate}\nconvergence of the Riemannian gradient flow to the optimal in-class predictor\nat an \\emph{exponential rate} that is independent of the conditioning of the\nGram matrix, \\emph{without} requiring explicit regularization. We further\ncharacterize the critical impacts of the neural network scaling factor and the\ninitialization on the convergence behavior. In the overparameterized regime, we\nshow that the Levenberg-Marquardt dynamics with an appropriately chosen damping\nfactor yields robustness to ill-conditioned kernels, analogous to the\nunderparameterized regime. 
These findings demonstrate the potential of\nGauss-Newton methods for efficiently optimizing neural networks, particularly\nin ill-conditioned problems where kernel and Gram matrices have small singular\nvalues.\n","authors":["Semih Cayci"],"pdf_url":"https://arxiv.org/pdf/2412.14031v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14620v1","updated":"2024-12-19T08:13:20Z","published":"2024-12-19T08:13:20Z","title":"Continuous latent representations for modeling precipitation with deep\n learning","summary":" The sparse and spatio-temporally discontinuous nature of precipitation data\npresents significant challenges for simulation and statistical processing for\nbias correction and downscaling. These include incorrect representation of\nintermittency and extreme values (critical for hydrology applications), Gibbs\nphenomenon upon regridding, and lack of fine scales details. To address these\nchallenges, a common approach is to transform the precipitation variable\nnonlinearly into one that is more malleable. In this work, we explore how deep\nlearning can be used to generate a smooth, spatio-temporally continuous\nvariable as a proxy for simulation of precipitation data. We develop a normally\ndistributed field called pseudo-precipitation (PP) as an alternative for\nsimulating precipitation. The practical applicability of this variable is\ninvestigated by applying it for downscaling precipitation from \\(1\\degree\\)\n(\\(\\sim\\) 100 km) to \\(0.25\\degree\\) (\\(\\sim\\) 25 km).\n","authors":["Gokul Radhakrishnan","Rahul Sundar","Nishant Parashar","Antoine Blanchard","Daiwei Wang","Boyko Dodov"],"pdf_url":"https://arxiv.org/pdf/2412.14620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14619v1","updated":"2024-12-19T08:11:42Z","published":"2024-12-19T08:11:42Z","title":"Pitfalls of topology-aware image segmentation","summary":" Topological correctness, i.e., the preservation of structural integrity and\nspecific characteristics of shape, is a fundamental requirement for medical\nimaging tasks, such as neuron or vessel segmentation. Despite the recent surge\nin topology-aware methods addressing this challenge, their real-world\napplicability is hindered by flawed benchmarking practices. In this paper, we\nidentify critical pitfalls in model evaluation that include inadequate\nconnectivity choices, overlooked topological artifacts in ground truth\nannotations, and inappropriate use of evaluation metrics. Through detailed\nempirical analysis, we uncover these issues' profound impact on the evaluation\nand ranking of segmentation methods. Drawing from our findings, we propose a\nset of actionable recommendations to establish fair and robust evaluation\nstandards for topology-aware medical image segmentation methods.\n","authors":["Alexander H. Berger","Laurin Lux","Alexander Weers","Martin Menten","Daniel Rueckert","Johannes C. Paetzold"],"pdf_url":"https://arxiv.org/pdf/2412.14619v1.pdf","comment":"Code is available at\n https://github.com/AlexanderHBerger/topo-pitfalls"},{"id":"http://arxiv.org/abs/2412.14602v1","updated":"2024-12-19T07:48:14Z","published":"2024-12-19T07:48:14Z","title":"Towards Scalable and Deep Graph Neural Networks via Noise Masking","summary":" In recent years, Graph Neural Networks (GNNs) have achieved remarkable\nsuccess in many graph mining tasks. However, scaling them to large graphs is\nchallenging due to the high computational and storage costs of repeated feature\npropagation and non-linear transformation during training. 
One commonly\nemployed approach to address this challenge is model-simplification, which only\nexecutes the Propagation (P) once in the pre-processing, and Combine (C) these\nreceptive fields in different ways and then feed them into a simple model for\nbetter performance. Despite their high predictive performance and scalability,\nthese methods still face two limitations. First, existing approaches mainly\nfocus on exploring different C methods from the model perspective, neglecting\nthe crucial problem of performance degradation with increasing P depth from the\ndata-centric perspective, known as the over-smoothing problem. Second,\npre-processing overhead takes up most of the end-to-end processing time,\nespecially for large-scale graphs. To address these limitations, we present\nrandom walk with noise masking (RMask), a plug-and-play module compatible with\nthe existing model-simplification works. This module enables the exploration of\ndeeper GNNs while preserving their scalability. Unlike the previous\nmodel-simplification works, we focus on continuous P and found that the noise\nexisting inside each P is the cause of the over-smoothing issue, and use the\nefficient masking mechanism to eliminate them. Experimental results on six\nreal-world datasets demonstrate that model-simplification works equipped with\nRMask yield superior performance compared to their original version and can\nmake a good trade-off between accuracy and efficiency.\n","authors":["Yuxuan Liang","Wentao Zhang","Zeang Sheng","Ling Yang","Quanqing Xu","Jiawei Jiang","Yunhai Tong","Bin Cu"],"pdf_url":"https://arxiv.org/pdf/2412.14602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14599v1","updated":"2024-12-19T07:42:07Z","published":"2024-12-19T07:42:07Z","title":"Fast inverse lithography based on a model-driven block stacking\n convolutional neural network","summary":" In the realm of lithography, Optical Proximity Correction (OPC) is a crucial\nresolution enhancement technique that optimizes the transmission function of\nphotomasks on a pixel-based to effectively counter Optical Proximity Effects\n(OPE). However, conventional pixel-based OPC methods often generate patterns\nthat pose manufacturing challenges, thereby leading to the increased cost in\npractical scenarios. This paper presents a novel inverse lithographic approach\nto OPC, employing a model-driven, block stacking deep learning framework that\nexpedites the generation of masks conducive to manufacturing. This method is\nfounded on vector lithography modelling and streamlines the training process by\neliminating the requirement for extensive labeled datasets. Furthermore,\ndiversity of mask patterns is enhanced by employing a wave function collapse\nalgorithm, which facilitates the random generation of a multitude of target\npatterns, therefore significantly expanding the range of mask paradigm.\nNumerical experiments have substantiated the efficacy of the proposed\nend-to-end approach, highlighting its superior capability to manage mask\ncomplexity within the context of advanced OPC lithography. 
This advancement is\nanticipated to enhance the feasibility and economic viability of OPC technology\nwithin actual manufacturing environments.\n","authors":["Ruixiang Chen","Yang Zhao","Haoqin Li","Rui Chen"],"pdf_url":"https://arxiv.org/pdf/2412.14599v1.pdf","comment":"21 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.04979v3","updated":"2024-12-19T07:33:48Z","published":"2024-01-10T07:51:02Z","title":"DualDynamics: Synergizing Implicit and Explicit Methods for Robust\n Irregular Time Series Analysis","summary":" Real-world time series analysis faces significant challenges when dealing\nwith irregular and incomplete data. While Neural Differential Equation (NDE)\nbased methods have shown promise, they struggle with limited expressiveness,\nscalability issues, and stability concerns. Conversely, Neural Flows offer\nstability but falter with irregular data. We introduce 'DualDynamics', a novel\nframework that synergistically combines NDE-based method and Neural Flow-based\nmethod. This approach enhances expressive power while balancing computational\ndemands, addressing critical limitations of existing techniques. We demonstrate\nDualDynamics' effectiveness across diverse tasks: classification of robustness\nto dataset shift, irregularly-sampled series analysis, interpolation of missing\ndata, and forecasting with partial observations. Our results show consistent\noutperformance over state-of-the-art methods, indicating DualDynamics'\npotential to advance irregular time series analysis significantly.\n","authors":["YongKyung Oh","Dongyoung Lim","Sungil Kim"],"pdf_url":"https://arxiv.org/pdf/2401.04979v3.pdf","comment":"Published at the 39th Annual AAAI Conference on Artificial\n Intelligence (AAAI 2025)"},{"id":"http://arxiv.org/abs/2412.14596v1","updated":"2024-12-19T07:31:40Z","published":"2024-12-19T07:31:40Z","title":"LDP: Generalizing to Multilingual Visual Information Extraction by\n Language Decoupled Pretraining","summary":" Visual Information Extraction (VIE) plays a crucial role in the comprehension\nof semi-structured documents, and several pre-trained models have been\ndeveloped to enhance performance. However, most of these works are monolingual\n(usually English). Due to the extremely unbalanced quantity and quality of\npre-training corpora between English and other languages, few works can extend\nto non-English scenarios. In this paper, we conduct systematic experiments to\nshow that vision and layout modality hold invariance among images with\ndifferent languages. If decoupling language bias from document images, a\nvision-layout-based model can achieve impressive cross-lingual generalization.\nAccordingly, we present a simple but effective multilingual training paradigm\nLDP (Language Decoupled Pre-training) for better utilization of monolingual\npre-training data. Our proposed model LDM (Language Decoupled Model) is first\npre-trained on the language-independent data, where the language knowledge is\ndecoupled by a diffusion model, and then the LDM is fine-tuned on the\ndownstream languages. 
Extensive experiments show that the LDM outperformed all\nSOTA multilingual pre-trained models, and also maintains competitiveness on\ndownstream monolingual/English benchmarks.\n","authors":["Huawen Shen","Gengluo Li","Jinwen Zhong","Yu Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.14596v1.pdf","comment":"Accepted by AAAI2025"},{"id":"http://arxiv.org/abs/2409.05929v2","updated":"2024-12-19T07:31:18Z","published":"2024-09-09T10:40:50Z","title":"Alt-MoE: Multimodal Alignment via Alternating Optimization of\n Multi-directional MoE with Unimodal Models","summary":" Recent Large Multi-Modal Models (LMMs) have made significant advancements in\nmulti-modal alignment by employing lightweight connection modules to facilitate\nthe representation and fusion of knowledge from existing pre-trained uni-modal\nmodels. However, these methods still rely on modality-specific and\ndirection-specific connectors, leading to compartmentalized knowledge\nrepresentations and reduced computational efficiency, which limits the model's\nability to form unified multi-modal representations. To address these issues,\nwe introduce a novel training framework, Alt-MoE, which employs the Mixture of\nExperts (MoE) as a unified multi-directional connector across modalities, and\nemploys a multi-step sequential alternating unidirectional alignment strategy,\nwhich converges to bidirectional alignment over iterations. The extensive\nempirical studies revealed the following key points: 1) Alt-MoE achieves\ncompetitive results by integrating diverse knowledge representations from\nuni-modal models. This approach seamlessly fuses the specialized expertise of\nexisting high-performance uni-modal models, effectively synthesizing their\ndomain-specific knowledge into a cohesive multi-modal representation. 2)\nAlt-MoE efficiently scales to new tasks and modalities without altering its\nmodel architecture or training strategy. Furthermore, Alt-MoE operates in\nlatent space, supporting vector pre-storage and real-time retrieval via\nlightweight multi-directional MoE, thereby facilitating massive data\nprocessing. Our methodology has been validated on several well-performing\nuni-modal models (LLAMA3, Qwen2, and DINOv2), achieving competitive results on\na wide range of downstream tasks and datasets.\n","authors":["Hongyang Lei","Xiaolong Cheng","Dan Wang","Kun Fan","Qi Qin","Huazhen Huang","Yetao Wu","Qingqing Gu","Zhonglin Jiang","Yong Chen","Luo Ji"],"pdf_url":"https://arxiv.org/pdf/2409.05929v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14592v1","updated":"2024-12-19T07:23:17Z","published":"2024-12-19T07:23:17Z","title":"Multi-Sensor Object Anomaly Detection: Unifying Appearance, Geometry,\n and Internal Properties","summary":" Object anomaly detection is essential for industrial quality inspection, yet\ntraditional single-sensor methods face critical limitations. They fail to\ncapture the wide range of anomaly types, as single sensors are often\nconstrained to either external appearance, geometric structure, or internal\nproperties. To overcome these challenges, we introduce MulSen-AD, the first\nhigh-resolution, multi-sensor anomaly detection dataset tailored for industrial\napplications. MulSen-AD unifies data from RGB cameras, laser scanners, and\nlock-in infrared thermography, effectively capturing external appearance,\ngeometric deformations, and internal defects. The dataset spans 15 industrial\nproducts with diverse, real-world anomalies. 
We also present MulSen-AD Bench, a\nbenchmark designed to evaluate multi-sensor methods, and propose\nMulSen-TripleAD, a decision-level fusion algorithm that integrates these three\nmodalities for robust, unsupervised object anomaly detection. Our experiments\ndemonstrate that multi-sensor fusion substantially outperforms single-sensor\napproaches, achieving 96.1% AUROC in object-level detection accuracy. These\nresults highlight the importance of integrating multi-sensor data for\ncomprehensive industrial anomaly detection.\n","authors":["Wenqiao Li","Bozhong Zheng","Xiaohao Xu","Jinye Gan","Fading Lu","Xiang Li","Na Ni","Zheng Tian","Xiaonan Huang","Shenghua Gao","Yingna Wu"],"pdf_url":"https://arxiv.org/pdf/2412.14592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14590v1","updated":"2024-12-19T07:15:15Z","published":"2024-12-19T07:15:15Z","title":"MixLLM: LLM Quantization with Global Mixed-precision between\n Output-features and Highly-efficient System Design","summary":" Quantization has become one of the most effective methodologies to compress\nLLMs into smaller size. However, the existing quantization solutions still show\nlimitations of either non-negligible accuracy drop or system inefficiency. In\nthis paper, we make a comprehensive analysis of the general quantization\nprinciples on their effect to the triangle of accuracy, memory consumption and\nsystem efficiency. We propose MixLLM that explores the new optimization space\nof mixed-precision quantization between output features based on the insight\nthat different output features matter differently in the model. MixLLM\nidentifies the output features with high salience in the global view rather\nthan within each single layer, effectively assigning the larger bit-width to\noutput features that need it most to achieve good accuracy with low memory\nconsumption. We present the sweet spot of quantization configuration of\nalgorithm-system co-design that leads to high accuracy and system efficiency.\nTo address the system challenge, we design the two-step dequantization to make\nuse of the int8 Tensor Core easily and fast data type conversion to reduce\ndequantization overhead significantly, and present the software pipeline to\noverlap the memory access, dequantization and the MatMul to the best. Extensive\nexperiments show that with only 10% more bits, the PPL increasement can be\nreduced from about 0.5 in SOTA to within 0.2 for Llama 3.1 70B, while on\naverage MMLU-Pro improves by 0.93 over the SOTA of three popular models. In\naddition to its superior accuracy, MixLLM also achieves state-of-the-art system\nefficiency.\n","authors":["Zhen Zheng","Xiaonan Song","Chuanjie Liu"],"pdf_url":"https://arxiv.org/pdf/2412.14590v1.pdf","comment":"The code will be released in the future"},{"id":"http://arxiv.org/abs/2407.02419v3","updated":"2024-12-19T07:07:51Z","published":"2024-07-02T16:44:14Z","title":"Quantum Curriculum Learning","summary":" Quantum machine learning (QML) requires significant quantum resources to\naddress practical real-world problems. When the underlying quantum information\nexhibits hierarchical structures in the data, limitations persist in training\ncomplexity and generalization. Research should prioritize both the efficient\ndesign of quantum architectures and the development of learning strategies to\noptimize resource usage. 
We propose a framework called quantum curriculum\nlearning (Q-CurL) for quantum data, where the curriculum introduces simpler\ntasks or data to the learning model before progressing to more challenging\nones. Q-CurL exhibits robustness to noise and data limitations, which is\nparticularly relevant for current and near-term noisy intermediate-scale\nquantum devices. We achieve this through a curriculum design based on quantum\ndata density ratios and a dynamic learning schedule that prioritizes the most\ninformative quantum data. Empirical evidence shows that Q-CurL significantly\nenhances training convergence and generalization for unitary learning and\nimproves the robustness of quantum phase recognition tasks. Q-CurL is effective\nwith broad physical learning applications in condensed matter physics and\nquantum chemistry.\n","authors":["Quoc Hoan Tran","Yasuhiro Endo","Hirotaka Oshima"],"pdf_url":"https://arxiv.org/pdf/2407.02419v3.pdf","comment":"main 6 pages, supplementary materials 11 pages (update the\n supplementary materials with more explanation on data-based Q-CurL)"},{"id":"http://arxiv.org/abs/2302.03390v5","updated":"2024-12-19T07:05:03Z","published":"2023-02-07T10:51:53Z","title":"Learning Discretized Neural Networks under Ricci Flow","summary":" In this paper, we study Discretized Neural Networks (DNNs) composed of\nlow-precision weights and activations, which suffer from either infinite or\nzero gradients due to the non-differentiable discrete function during training.\nMost training-based DNNs in such scenarios employ the standard Straight-Through\nEstimator (STE) to approximate the gradient w.r.t. discrete values. However,\nthe use of STE introduces the problem of gradient mismatch, arising from\nperturbations in the approximated gradient. To address this problem, this paper\nreveals that this mismatch can be interpreted as a metric perturbation in a\nRiemannian manifold, viewed through the lens of duality theory. Building on\ninformation geometry, we construct the Linearly Nearly Euclidean (LNE) manifold\nfor DNNs, providing a background for addressing perturbations. By introducing a\npartial differential equation on metrics, i.e., the Ricci flow, we establish\nthe dynamical stability and convergence of the LNE metric with the $L^2$-norm\nperturbation. In contrast to previous perturbation theories with convergence\nrates in fractional powers, the metric perturbation under the Ricci flow\nexhibits exponential decay in the LNE manifold. Experimental results across\nvarious datasets demonstrate that our method achieves superior and more stable\nperformance for DNNs compared to other representative training-based methods.\n","authors":["Jun Chen","Hanwen Chen","Mengmeng Wang","Guang Dai","Ivor W. Tsang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2302.03390v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.00284v3","updated":"2024-12-19T06:52:07Z","published":"2023-02-01T07:31:25Z","title":"Selective Uncertainty Propagation in Offline RL","summary":" We consider the finite-horizon offline reinforcement learning (RL) setting,\nand are motivated by the challenge of learning the policy at any step h in\ndynamic programming (DP) algorithms. To learn this, it is sufficient to\nevaluate the treatment effect of deviating from the behavioral policy at step h\nafter having optimized the policy for all future steps. 
Since the policy at any\nstep can affect next-state distributions, the related distributional shift\nchallenges can make this problem far more statistically hard than estimating\nsuch treatment effects in the stochastic contextual bandit setting. However,\nthe hardness of many real-world RL instances lies between the two regimes. We\ndevelop a flexible and general method called selective uncertainty propagation\nfor confidence interval construction that adapts to the hardness of the\nassociated distribution shift challenges. We show benefits of our approach on\ntoy environments and demonstrate the benefits of these techniques for offline\npolicy learning.\n","authors":["Sanath Kumar Krishnamurthy","Tanmay Gangwani","Sumeet Katariya","Branislav Kveton","Shrey Modi","Anshuka Rangi"],"pdf_url":"https://arxiv.org/pdf/2302.00284v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07728v3","updated":"2024-12-19T06:51:17Z","published":"2024-03-12T15:07:20Z","title":"CAP: A General Algorithm for Online Selective Conformal Prediction with\n FCR Control","summary":" We study the problem of post-selection predictive inference in an online\nfashion. To avoid devoting resources to unimportant units, a preliminary\nselection of the current individual before reporting its prediction interval is\ncommon and meaningful in online predictive tasks. Since the online selection\ncauses a temporal multiplicity in the selected prediction intervals, it is\nimportant to control the real-time false coverage-statement rate (FCR) which\nmeasures the overall miscoverage level. We develop a general framework named\nCAP (Calibration after Adaptive Pick) that performs an adaptive pick rule on\nhistorical data to construct a calibration set if the current individual is\nselected and then outputs a conformal prediction interval for the unobserved\nlabel. We provide tractable procedures for constructing the calibration set for\npopular online selection rules. We proved that CAP can achieve an exact\nselection-conditional coverage guarantee in the finite-sample and\ndistribution-free regimes. To account for the distribution shift in online\ndata, we also embed CAP into some recent dynamic conformal prediction\nalgorithms and show that the proposed method can deliver long-run FCR control.\nNumerical results on both synthetic and real data corroborate that CAP can\neffectively control FCR around the target level and yield more narrowed\nprediction intervals over existing baselines across various settings.\n","authors":["Yajie Bao","Yuyang Huo","Haojie Ren","Changliang Zou"],"pdf_url":"https://arxiv.org/pdf/2403.07728v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14572v1","updated":"2024-12-19T06:42:57Z","published":"2024-12-19T06:42:57Z","title":"Accelerated Patient-Specific Calibration via Differentiable Hemodynamics\n Simulations","summary":" One of the goals of personalized medicine is to tailor diagnostics to\nindividual patients. Diagnostics are performed in practice by measuring\nquantities, called biomarkers, that indicate the existence and progress of a\ndisease. In common cardiovascular diseases, such as hypertension, biomarkers\nthat are closely related to the clinical representation of a patient can be\npredicted using computational models. Personalizing computational models\ntranslates to considering patient-specific flow conditions, for example, the\ncompliance of blood vessels that cannot be a priori known and quantities such\nas the patient geometry that can be measured using imaging. 
Therefore, a\npatient is identified by a set of measurable and nonmeasurable parameters\nneeded to well-define a computational model; else, the computational model is\nnot personalized, meaning it is prone to large prediction errors. Therefore, to\npersonalize a computational model, sufficient information needs to be extracted\nfrom the data. The current methods by which this is done are either\ninefficient, due to relying on slow-converging optimization methods, or hard to\ninterpret, due to using `black box` deep-learning algorithms. We propose a\npersonalized diagnostic procedure based on a differentiable 0D-1D Navier-Stokes\nreduced order model solver and fast parameter inference methods that take\nadvantage of gradients through the solver. By providing a faster method for\nperforming parameter inference and sensitivity analysis through\ndifferentiability while maintaining the interpretability of well-understood\nmathematical models and numerical methods, the best of both worlds is combined.\nThe performance of the proposed solver is validated against a well-established\nprocess on different geometries, and different parameter inference processes\nare successfully performed.\n","authors":["Diego Renner","Georgios Kissas"],"pdf_url":"https://arxiv.org/pdf/2412.14572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14569v1","updated":"2024-12-19T06:40:21Z","published":"2024-12-19T06:40:21Z","title":"Global Spatio-Temporal Fusion-based Traffic Prediction Algorithm with\n Anomaly Aware","summary":" Traffic prediction is an indispensable component of urban planning and\ntraffic management. Achieving accurate traffic prediction hinges on the ability\nto capture the potential spatio-temporal relationships among road sensors.\nHowever, the majority of existing works focus on local short-term\nspatio-temporal correlations, failing to fully consider the interactions of\ndifferent sensors in the long-term state. In addition, these works do not\nanalyze the influences of anomalous factors, or have insufficient ability to\nextract personalized features of anomalous factors, which make them\nineffectively capture their spatio-temporal influences on traffic prediction.\nTo address the aforementioned issues, We propose a global spatio-temporal\nfusion-based traffic prediction algorithm that incorporates anomaly awareness.\nInitially, based on the designed anomaly detection network, we construct an\nefficient anomalous factors impacting module (AFIM), to evaluate the\nspatio-temporal impact of unexpected external events on traffic prediction.\nFurthermore, we propose a multi-scale spatio-temporal feature fusion module\n(MTSFFL) based on the transformer architecture, to obtain all possible both\nlong and short term correlations among different sensors in a wide-area traffic\nenvironment for accurate prediction of traffic flow. 
Finally, experiments are\nimplemented based on real-scenario public transportation datasets (PEMS04 and\nPEMS08) to demonstrate that our approach can achieve state-of-the-art\nperformance.\n","authors":["Chaoqun Liu","Xuanpeng Li","Chen Gong","Guangyu Li"],"pdf_url":"https://arxiv.org/pdf/2412.14569v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21853v2","updated":"2024-12-19T06:39:24Z","published":"2024-10-29T08:28:23Z","title":"Learning Infinitesimal Generators of Continuous Symmetries from Data","summary":" Exploiting symmetry inherent in data can significantly improve the sample\nefficiency of a learning procedure and the generalization of learned models.\nWhen data clearly reveals underlying symmetry, leveraging this symmetry can\nnaturally inform the design of model architectures or learning strategies. Yet,\nin numerous real-world scenarios, identifying the specific symmetry within a\ngiven data distribution often proves ambiguous. To tackle this, some existing\nworks learn symmetry in a data-driven manner, parameterizing and learning\nexpected symmetry through data. However, these methods often rely on explicit\nknowledge, such as pre-defined Lie groups, which are typically restricted to\nlinear or affine transformations. In this paper, we propose a novel symmetry\nlearning algorithm based on transformations defined with one-parameter groups,\ncontinuously parameterized transformations flowing along the directions of\nvector fields called infinitesimal generators. Our method is built upon minimal\ninductive biases, encompassing not only commonly utilized symmetries rooted in\nLie groups but also extending to symmetries derived from nonlinear generators.\nTo learn these symmetries, we introduce a notion of a validity score that\nexamine whether the transformed data is still valid for the given task. The\nvalidity score is designed to be fully differentiable and easily computable,\nenabling effective searches for transformations that achieve symmetries innate\nto the data. We apply our method mainly in two domains: image data and partial\ndifferential equations, and demonstrate its advantages. Our codes are available\nat \\url{https://github.com/kogyeonghoon/learning-symmetry-from-scratch.git}.\n","authors":["Gyeonghoon Ko","Hyunsu Kim","Juho Lee"],"pdf_url":"https://arxiv.org/pdf/2410.21853v2.pdf","comment":"Neurips 2024"},{"id":"http://arxiv.org/abs/2412.14566v1","updated":"2024-12-19T06:35:54Z","published":"2024-12-19T06:35:54Z","title":"AIArena: A Blockchain-Based Decentralized AI Training Platform","summary":" The rapid advancement of AI has underscored critical challenges in its\ndevelopment and implementation, largely due to centralized control by a few\nmajor corporations. This concentration of power intensifies biases within AI\nmodels, resulting from inadequate governance and oversight mechanisms.\nAdditionally, it limits public involvement and heightens concerns about the\nintegrity of model generation. Such monopolistic control over data and AI\noutputs threatens both innovation and fair data usage, as users inadvertently\ncontribute data that primarily benefits these corporations. In this work, we\npropose AIArena, a blockchain-based decentralized AI training platform designed\nto democratize AI development and alignment through on-chain incentive\nmechanisms. AIArena fosters an open and collaborative environment where\nparticipants can contribute models and computing resources. 
Its on-chain\nconsensus mechanism ensures fair rewards for participants based on their\ncontributions. We instantiate and implement AIArena on the public Base\nblockchain Sepolia testnet, and the evaluation results demonstrate the\nfeasibility of AIArena in real-world applications.\n","authors":["Zhipeng Wang","Rui Sun","Elizabeth Lui","Tuo Zhou","Yizhe Wen","Jiahao Sun"],"pdf_url":"https://arxiv.org/pdf/2412.14566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.08282v2","updated":"2024-12-19T06:35:21Z","published":"2024-12-11T10:57:16Z","title":"How Does the Smoothness Approximation Method Facilitate Generalization\n for Federated Adversarial Learning?","summary":" Federated Adversarial Learning (FAL) is a robust framework for resisting\nadversarial attacks on federated learning. Although some FAL studies have\ndeveloped efficient algorithms, they primarily focus on convergence performance\nand overlook generalization. Generalization is crucial for evaluating algorithm\nperformance on unseen data. However, generalization analysis is more\nchallenging due to non-smooth adversarial loss functions. A common approach to\naddressing this issue is to leverage smoothness approximation. In this paper,\nwe develop algorithm stability measures to evaluate the generalization\nperformance of two popular FAL algorithms: \\textit{Vanilla FAL (VFAL)} and {\\it\nSlack FAL (SFAL)}, using three different smooth approximation methods: 1)\n\\textit{Surrogate Smoothness Approximation (SSA)}, (2) \\textit{Randomized\nSmoothness Approximation (RSA)}, and (3) \\textit{Over-Parameterized Smoothness\nApproximation (OPSA)}. Based on our in-depth analysis, we answer the question\nof how to properly set the smoothness approximation method to mitigate\ngeneralization error in FAL. Moreover, we identify RSA as the most effective\nmethod for reducing generalization error. In highly data-heterogeneous\nscenarios, we also recommend employing SFAL to mitigate the deterioration of\ngeneralization performance caused by heterogeneity. Based on our theoretical\nresults, we provide insights to help develop more efficient FAL algorithms,\nsuch as designing new metrics and dynamic aggregation rules to mitigate\nheterogeneity.\n","authors":["Wenjun Ding","Ying An","Lixing Chen","Shichao Kan","Fan Wu","Zhe Qu"],"pdf_url":"https://arxiv.org/pdf/2412.08282v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11973v6","updated":"2024-12-19T06:29:38Z","published":"2023-12-19T09:11:49Z","title":"Continual Learning: Forget-free Winning Subnetworks for Video\n Representations","summary":" Inspired by the Lottery Ticket Hypothesis (LTH), which highlights the\nexistence of efficient subnetworks within larger, dense networks, a\nhigh-performing Winning Subnetwork (WSN) in terms of task performance under\nappropriate sparsity conditions is considered for various continual learning\ntasks. It leverages pre-existing weights from dense networks to achieve\nefficient learning in Task Incremental Learning (TIL) and Task-agnostic\nIncremental Learning (TaIL) scenarios. In Few-Shot Class Incremental Learning\n(FSCIL), a variation of WSN referred to as the Soft subnetwork (SoftNet) is\ndesigned to prevent overfitting when the data samples are scarce. Furthermore,\nthe sparse reuse of WSN weights is considered for Video Incremental Learning\n(VIL). The use of Fourier Subneural Operator (FSO) within WSN is considered. It\nenables compact encoding of videos and identifies reusable subnetworks across\nvarying bandwidths. 
We have integrated FSO into different architectural\nframeworks for continual learning, including VIL, TIL, and FSCIL. Our\ncomprehensive experiments demonstrate FSO's effectiveness, significantly\nimproving task performance at various convolutional representational levels.\nSpecifically, FSO enhances higher-layer performance in TIL and FSCIL and\nlower-layer performance in VIL.\n","authors":["Haeyong Kang","Jaehong Yoon","Sung Ju Hwang","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2312.11973v6.pdf","comment":"IEEE Transactions on Pattern Analysis and Machine Intelligence\n (T-PAMI)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2412.15156v1","updated":"2024-12-19T18:32:21Z","published":"2024-12-19T18:32:21Z","title":"Prompt-A-Video: Prompt Your Video Diffusion Model via Preference-Aligned\n LLM","summary":" Text-to-video models have made remarkable advancements through optimization\non high-quality text-video pairs, where the textual prompts play a pivotal role\nin determining quality of output videos. However, achieving the desired output\noften entails multiple revisions and iterative inference to refine\nuser-provided prompts. Current automatic methods for refining prompts encounter\nchallenges such as Modality-Inconsistency, Cost-Discrepancy, and Model-Unaware\nwhen applied to text-to-video diffusion models. To address these problem, we\nintroduce an LLM-based prompt adaptation framework, termed as Prompt-A-Video,\nwhich excels in crafting Video-Centric, Labor-Free and Preference-Aligned\nprompts tailored to specific video diffusion model. Our approach involves a\nmeticulously crafted two-stage optimization and alignment system. Initially, we\nconduct a reward-guided prompt evolution pipeline to automatically create\noptimal prompts pool and leverage them for supervised fine-tuning (SFT) of the\nLLM. Then multi-dimensional rewards are employed to generate pairwise data for\nthe SFT model, followed by the direct preference optimization (DPO) algorithm\nto further facilitate preference alignment. Through extensive experimentation\nand comparative analyses, we validate the effectiveness of Prompt-A-Video\nacross diverse generation models, highlighting its potential to push the\nboundaries of video generation.\n","authors":["Yatai Ji","Jiacheng Zhang","Jie Wu","Shilong Zhang","Shoufa Chen","Chongjian GE","Peize Sun","Weifeng Chen","Wenqi Shao","Xuefeng Xiao","Weilin Huang","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2412.15156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.15023v1","updated":"2024-12-19T16:37:19Z","published":"2024-12-19T16:37:19Z","title":"Stable-V2A: Synthesis of Synchronized Sound Effects with Temporal and\n Semantic Controls","summary":" Sound designers and Foley artists usually sonorize a scene, such as from a\nmovie or video game, by manually annotating and sonorizing each action of\ninterest in the video. In our case, the intent is to leave full creative\ncontrol to sound designers with a tool that allows them to bypass the more\nrepetitive parts of their work, thus being able to focus on the creative\naspects of sound production. We achieve this presenting Stable-V2A, a two-stage\nmodel consisting of: an RMS-Mapper that estimates an envelope representative of\nthe audio characteristics associated with the input video; and Stable-Foley, a\ndiffusion model based on Stable Audio Open that generates audio semantically\nand temporally aligned with the target video. 
Temporal alignment is guaranteed\nby the use of the envelope as a ControlNet input, while semantic alignment is\nachieved through the use of sound representations chosen by the designer as\ncross-attention conditioning of the diffusion process. We train and test our\nmodel on Greatest Hits, a dataset commonly used to evaluate V2A models. In\naddition, to test our model on a case study of interest, we introduce Walking\nThe Maps, a dataset of videos extracted from video games depicting animated\ncharacters walking in different locations. Samples and code available on our\ndemo page at https://ispamm.github.io/Stable-V2A.\n","authors":["Riccardo Fosco Gramaccioni","Christian Marinoni","Emilian Postolache","Marco Comunità","Luca Cosmo","Joshua D. Reiss","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2412.15023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14978v1","updated":"2024-12-19T15:53:21Z","published":"2024-12-19T15:53:21Z","title":"Spectrum-based Modality Representation Fusion Graph Convolutional\n Network for Multimodal Recommendation","summary":" Incorporating multi-modal features as side information has recently become a\ntrend in recommender systems. To elucidate user-item preferences, recent\nstudies focus on fusing modalities via concatenation, element-wise sum, or\nattention mechanisms. Despite having notable success, existing approaches do\nnot account for the modality-specific noise encapsulated within each modality.\nAs a result, direct fusion of modalities will lead to the amplification of\ncross-modality noise. Moreover, the variation of noise that is unique within\neach modality results in noise alleviation and fusion being more challenging.\nIn this work, we propose a new Spectrum-based Modality Representation (SMORE)\nfusion graph recommender that aims to capture both uni-modal and fusion\npreferences while simultaneously suppressing modality noise. Specifically,\nSMORE projects the multi-modal features into the frequency domain and leverages\nthe spectral space for fusion. To reduce dynamic contamination that is unique\nto each modality, we introduce a filter to attenuate and suppress the modality\nnoise adaptively while capturing the universal modality patterns effectively.\nFurthermore, we explore the item latent structures by designing a new\nmulti-modal graph learning module to capture associative semantic correlations\nand universal fusion patterns among similar items. Finally, we formulate a new\nmodality-aware preference module, which infuses behavioral features and\nbalances the uni- and multi-modal features for precise preference modeling.\nThis empowers SMORE with the ability to infer both user modality-specific and\nfusion preferences more accurately. Experiments on three real-world datasets\nshow the efficacy of our proposed model. The source code for this work has been\nmade publicly available at https://github.com/kennethorq/SMORE.\n","authors":["Rongqing Kenneth Ong","Andy W. H. Khong"],"pdf_url":"https://arxiv.org/pdf/2412.14978v1.pdf","comment":"Accepted to ACM Web Search and Data Mining (WSDM) 2025"},{"id":"http://arxiv.org/abs/2310.14778v3","updated":"2024-12-19T11:49:06Z","published":"2023-10-23T10:29:33Z","title":"Audio-Visual Speaker Tracking: Progress, Challenges, and Future\n Directions","summary":" Audio-visual speaker tracking has drawn increasing attention over the past\nfew years due to its academic values and wide application. 
Audio and visual\nmodalities can provide complementary information for localization and tracking.\nWith audio and visual information, the Bayesian-based filter can solve the\nproblem of data association, audio-visual fusion and track management. In this\npaper, we conduct a comprehensive overview of audio-visual speaker tracking. To\nour knowledge, this is the first extensive survey over the past five years. We\nintroduce the family of Bayesian filters and summarize the methods for\nobtaining audio-visual measurements. In addition, the existing trackers and\ntheir performance on AV16.3 dataset are summarized. In the past few years, deep\nlearning techniques have thrived, which also boosts the development of audio\nvisual speaker tracking. The influence of deep learning techniques in terms of\nmeasurement extraction and state estimation is also discussed. At last, we\ndiscuss the connections between audio-visual speaker tracking and other areas\nsuch as speech separation and distributed speaker tracking.\n","authors":["Jinzheng Zhao","Yong Xu","Xinyuan Qian","Davide Berghi","Peipei Wu","Meng Cui","Jianyuan Sun","Philip J. B. Jackson","Wenwu Wang"],"pdf_url":"https://arxiv.org/pdf/2310.14778v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14518v1","updated":"2024-12-19T04:33:22Z","published":"2024-12-19T04:33:22Z","title":"Efficient Self-Supervised Video Hashing with Selective State Spaces","summary":" Self-supervised video hashing (SSVH) is a practical task in video indexing\nand retrieval. Although Transformers are predominant in SSVH for their\nimpressive temporal modeling capabilities, they often suffer from computational\nand memory inefficiencies. Drawing inspiration from Mamba, an advanced\nstate-space model, we explore its potential in SSVH to achieve a better balance\nbetween efficacy and efficiency. We introduce S5VH, a Mamba-based video hashing\nmodel with an improved self-supervised learning paradigm. Specifically, we\ndesign bidirectional Mamba layers for both the encoder and decoder, which are\neffective and efficient in capturing temporal relationships thanks to the\ndata-dependent selective scanning mechanism with linear complexity. In our\nlearning strategy, we transform global semantics in the feature space into\nsemantically consistent and discriminative hash centers, followed by a center\nalignment loss as a global learning signal. Our self-local-global (SLG)\nparadigm significantly improves learning efficiency, leading to faster and\nbetter convergence. Extensive experiments demonstrate S5VH's improvements over\nstate-of-the-art methods, superior transferability, and scalable advantages in\ninference efficiency. Code is available at\nhttps://github.com/gimpong/AAAI25-S5VH.\n","authors":["Jinpeng Wang","Niu Lian","Jun Li","Yuting Wang","Yan Feng","Bin Chen","Yongbing Zhang","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2412.14518v1.pdf","comment":"Accepted by AAAI'25. 9 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2412.13609v2","updated":"2024-12-19T03:12:19Z","published":"2024-12-18T08:36:35Z","title":"Sign-IDD: Iconicity Disentangled Diffusion for Sign Language Production","summary":" Sign Language Production (SLP) aims to generate semantically consistent sign\nvideos from textual statements, where the conversion from textual glosses to\nsign poses (G2P) is a crucial step. 
Existing G2P methods typically treat sign\nposes as discrete three-dimensional coordinates and directly fit them, which\noverlooks the relative positional relationships among joints. To this end, we\nprovide a new perspective, constraining joint associations and gesture details\nby modeling the limb bones to improve the accuracy and naturalness of the\ngenerated poses. In this work, we propose a pioneering iconicity disentangled\ndiffusion framework, termed Sign-IDD, specifically designed for SLP. Sign-IDD\nincorporates a novel Iconicity Disentanglement (ID) module to bridge the gap\nbetween relative positions among joints. The ID module disentangles the\nconventional 3D joint representation into a 4D bone representation, comprising\nthe 3D spatial direction vector and 1D spatial distance vector between adjacent\njoints. Additionally, an Attribute Controllable Diffusion (ACD) module is\nintroduced to further constrain joint associations, in which the attribute\nseparation layer aims to separate the bone direction and length attributes, and\nthe attribute control layer is designed to guide the pose generation by\nleveraging the above attributes. The ACD module utilizes the gloss embeddings\nas semantic conditions and finally generates sign poses from noise embeddings.\nExtensive experiments on PHOENIX14T and USTC-CSL datasets validate the\neffectiveness of our method. The code is available at:\nhttps://github.com/NaVi-start/Sign-IDD.\n","authors":["Shengeng Tang","Jiayi He","Dan Guo","Yanyan Wei","Feng Li","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2412.13609v2.pdf","comment":"Accepted by AAAI 2025"}]},"2024-12-18T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2412.14414v1","updated":"2024-12-18T23:58:13Z","published":"2024-12-18T23:58:13Z","title":"In-Group Love, Out-Group Hate: A Framework to Measure Affective\n Polarization via Contentious Online Discussions","summary":" Affective polarization, the emotional divide between ideological groups\nmarked by in-group love and out-group hate, has intensified in the United\nStates, driving contentious issues like masking and lockdowns during the\nCOVID-19 pandemic. Despite its societal impact, existing models of opinion\nchange fail to account for emotional dynamics nor offer methods to quantify\naffective polarization robustly and in real-time. In this paper, we introduce a\ndiscrete choice model that captures decision-making within affectively\npolarized social networks and propose a statistical inference method estimate\nkey parameters -- in-group love and out-group hate -- from social media data.\nThrough empirical validation from online discussions about the COVID-19\npandemic, we demonstrate that our approach accurately captures real-world\npolarization dynamics and explains the rapid emergence of a partisan gap in\nattitudes towards masking and lockdowns. 
This framework allows for tracking\naffective polarization across contentious issues has broad implications for\nfostering constructive online dialogues in digital spaces.\n","authors":["Buddhika Nettasinghe","Ashwin Rao","Bohan Jiang","Allon Percus","Kristina Lerman"],"pdf_url":"https://arxiv.org/pdf/2412.14414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14748v3","updated":"2024-12-18T23:36:03Z","published":"2024-10-17T19:38:55Z","title":"ETF: An Entity Tracing Framework for Hallucination Detection in Code\n Summaries","summary":" Recent advancements in large language models (LLMs) have significantly\nenhanced their ability to understand both natural language and code, driving\ntheir use in tasks like natural language-to-code (NL2Code) and code\nsummarization. However, LLMs are prone to hallucination-outputs that stray from\nintended meanings. Detecting hallucinations in code summarization is especially\ndifficult due to the complex interplay between programming and natural\nlanguages. We introduce a first-of-its-kind dataset with $\\sim$10K samples,\ncurated specifically for hallucination detection in code summarization. We\nfurther propose a novel Entity Tracing Framework (ETF) that a) utilizes static\nprogram analysis to identify code entities from the program and b) uses LLMs to\nmap and verify these entities and their intents within generated code\nsummaries. Our experimental analysis demonstrates the effectiveness of the\nframework, leading to a 0.73 F1 score. This approach provides an interpretable\nmethod for detecting hallucinations by grounding entities, allowing us to\nevaluate summary accuracy.\n","authors":["Kishan Maharaj","Vitobha Munigala","Srikanth G. Tamilselvam","Prince Kumar","Sayandeep Sen","Palani Kodeswaran","Abhijit Mishra","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2410.14748v3.pdf","comment":"11 pages, 6 Figures, 5 Tables"},{"id":"http://arxiv.org/abs/2402.15083v2","updated":"2024-12-18T23:11:48Z","published":"2024-02-23T04:02:23Z","title":"Hands-Free VR","summary":" The paper introduces Hands-Free VR, a voice-based natural-language interface\nfor VR. The user gives a command using their voice, the speech audio data is\nconverted to text using a speech-to-text deep learning model that is fine-tuned\nfor robustness to word phonetic similarity and to spoken English accents, and\nthe text is mapped to an executable VR command using a large language model\nthat is robust to natural language diversity. Hands-Free VR was evaluated in a\ncontrolled within-subjects study (N = 22) that asked participants to find\nspecific objects and to place them in various configurations. In the control\ncondition participants used a conventional VR user interface to grab, carry,\nand position the objects using the handheld controllers. In the experimental\ncondition participants used Hands-Free VR. 
The results confirm that: (1)\nHands-Free VR is robust to spoken English accents, as for 20 of our\nparticipants English was not their first language, and to word phonetic\nsimilarity, correctly transcribing the voice command 96.71% of the time; (2)\nHands-Free VR is robust to natural language diversity, correctly mapping the\ntranscribed command to an executable command in 97.83% of the time; (3)\nHands-Free VR had a significant efficiency advantage over the conventional VR\ninterface in terms of task completion time, total viewpoint translation, total\nview direction rotation, and total left and right hand translations; (4)\nHands-Free VR received high user preference ratings in terms of ease of use,\nintuitiveness, ergonomics, reliability, and desirability.\n","authors":["Jorge Askur Vazquez Fernandez","Jae Joong Lee","Santiago Andrés Serrano Vacca","Alejandra Magana","Radim Pesam","Bedrich Benes","Voicu Popescu"],"pdf_url":"https://arxiv.org/pdf/2402.15083v2.pdf","comment":"The first two authors contributed equally. Accepted VISIGRAPP@HUCAPP\n 2025"},{"id":"http://arxiv.org/abs/2409.01227v3","updated":"2024-12-18T23:04:46Z","published":"2024-09-02T13:02:51Z","title":"Prompt Compression with Context-Aware Sentence Encoding for Fast and\n Improved LLM Inference","summary":" Large language models (LLMs) have triggered a new stream of research focusing\non compressing the context length to reduce the computational cost while\nensuring the retention of helpful information for LLMs to answer the given\nquestion. Token-based removal methods are one of the most prominent approaches\nin this direction, but risk losing the semantics of the context caused by\nintermediate token removal, especially under high compression ratios, while\nalso facing challenges in computational efficiency. In this work, we propose\ncontext-aware prompt compression (CPC), a sentence-level prompt compression\ntechnique where its key innovation is a novel context-aware sentence encoder\nthat provides a relevance score for each sentence for a given question. To\ntrain this encoder, we generate a new dataset consisting of questions,\npositives, and negative pairs where positives are sentences relevant to the\nquestion, while negatives are irrelevant context sentences. We train the\nencoder in a contrastive setup to learn context-aware sentence representations.\nOur method considerably outperforms prior works on prompt compression on\nbenchmark datasets and is up to 10.93x faster at inference compared to the best\ntoken-level compression method. We also find better improvement for shorter\nlength constraints in most benchmarks, showing the effectiveness of our\nproposed solution in the compression of relevant information in a shorter\ncontext. Finally, we release the code and the dataset for quick reproducibility\nand further development: https://github.com/Workday/cpc.\n","authors":["Barys Liskavets","Maxim Ushakov","Shuvendu Roy","Mark Klibanov","Ali Etemad","Shane Luke"],"pdf_url":"https://arxiv.org/pdf/2409.01227v3.pdf","comment":"Accepted in AAAI Conference on Artificial Intelligence (AAAI-25)"},{"id":"http://arxiv.org/abs/2404.18988v4","updated":"2024-12-18T22:26:15Z","published":"2024-04-29T17:36:58Z","title":"Markovian Transformers for Informative Language Modeling","summary":" Chain-of-Thought (CoT) reasoning holds great promise for explaining language\nmodel outputs, but recent studies have highlighted significant challenges in\nits practical application for interpretability. 
We propose to address this\nissue by making CoT causally essential to prediction through two key\ncomponents: factoring next-token prediction through intermediate CoT text, and\ntraining CoT to predict future tokens independently of other context. This\nresults in \"Markovian\" language models, where CoT serves as a fixed-size state\nfor future token prediction. Our approach optimizes for \"informativeness\" - the\nimprovement in next-token predictions using a trained CoT compared to a\nbaseline. Using Proximal Policy Optimization (PPO) for arithmetic problems and\npolicy gradient for GSM8K, we demonstrate effectiveness on both arithmetic\nproblems with Mistral 7B and the GSM8K benchmark with Llama 3.1 8B, where the\nmodel learns to produce CoTs that are 33.20% more effective at predicting\nanswers than the pre-trained baseline. The increased sensitivity of model\nperformance to CoT perturbations provides strong evidence of CoT reliance.\nFurthermore, we show that CoTs trained for one model generalize to help other\nmodels predict answers, suggesting these CoTs capture reasoning patterns that\ntransfer across different interpreters. This work advances the development of\nmore interpretable language models, potentially enabling their extension to\narbitrarily long contexts and enhancing AI reasoning capabilities across\nvarious domains.\n","authors":["Scott Viteri","Max Lamparth","Peter Chatain","Clark Barrett"],"pdf_url":"https://arxiv.org/pdf/2404.18988v4.pdf","comment":"20 pages, 8 figures"},{"id":"http://arxiv.org/abs/2412.14373v1","updated":"2024-12-18T22:13:21Z","published":"2024-12-18T22:13:21Z","title":"ECG-Byte: A Tokenizer for End-to-End Generative Electrocardiogram\n Language Modeling","summary":" Large Language Models (LLMs) have shown remarkable adaptability across\ndomains beyond text, specifically electrocardiograms (ECGs). More specifically,\nthere is a growing body of work exploring the task of generating text from a\nmulti-channeled ECG and corresponding textual prompt. Current approaches\ntypically involve pretraining an ECG-specific encoder with a self-supervised\nlearning (SSL) objective and using the features output by the pretrained\nencoder to finetune a LLM for natural language generation (NLG). However, these\nmethods are limited by 1) inefficiency from two-stage training and 2)\ninterpretability challenges with encoder-generated features. To address these\nlimitations, we introduce ECG-Byte, an adapted byte pair encoding (BPE)\ntokenizer pipeline for autoregressive language modeling of ECGs. This approach\ncompresses and encodes ECG signals into tokens, enabling end-to-end LLM\ntraining by combining ECG and text tokens directly, while being much more\ninterpretable since the ECG tokens can be directly mapped back to the original\nsignal. Using ECG-Byte, we achieve competitive performance in NLG tasks in only\nhalf the time and ~48% of the data required by two-stage approaches.\n","authors":["William Han","Chaojing Duan","Michael A. Rosenberg","Emerson Liu","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2412.14373v1.pdf","comment":"26 pages, 17 figures"},{"id":"http://arxiv.org/abs/2412.14368v1","updated":"2024-12-18T22:04:56Z","published":"2024-12-18T22:04:56Z","title":"Memorization Over Reasoning? 
Exposing and Mitigating Verbatim\n Memorization in Large Language Models' Character Understanding Evaluation","summary":" Recently, Large Language Models (LLMs) have shown impressive performance in\ncharacter understanding tasks, such as analyzing the roles, personalities, and\nrelationships of fictional characters. However, the extensive pre-training\ncorpora used by LLMs raise concerns that they may rely on memorizing popular\nfictional works rather than genuinely understanding and reasoning about them.\nIn this work, we argue that 'gist memory'-capturing essential meaning - should\nbe the primary mechanism for character understanding tasks, as opposed to\n'verbatim memory' - exact match of a string. We introduce a simple yet\neffective method to mitigate mechanized memorization in character understanding\nevaluations while preserving the essential implicit cues needed for\ncomprehension and reasoning. Our approach reduces memorization-driven\nperformance on popular fictional works from 96% accuracy to 72% and results in\nup to an 18% drop in accuracy across various character understanding tasks.\nThese findings underscore the issue of data contamination in existing\nbenchmarks, which often measure memorization rather than true character\nunderstanding.\n","authors":["Yuxuan Jiang","Francis Ferraro"],"pdf_url":"https://arxiv.org/pdf/2412.14368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14363v1","updated":"2024-12-18T22:01:55Z","published":"2024-12-18T22:01:55Z","title":"ResQ: Mixed-Precision Quantization of Large Language Models with\n Low-Rank Residuals","summary":" Post-training quantization (PTQ) of large language models (LLMs) holds the\npromise in reducing the prohibitive computational cost at inference time.\nQuantization of all weight, activation and key-value (KV) cache tensors to\n4-bit without significantly degrading generalizability is challenging, due to\nthe high quantization error caused by extreme outliers in activations. To\ntackle this problem, we propose ResQ, a PTQ method that pushes further the\nstate-of-the-art. By means of principal component analysis (PCA), it identifies\na low-rank subspace (in practice 1/8 of the hidden dimension) in which\nactivation variances are highest, and keep the coefficients within this\nsubspace in high precision, e.g. 8-bit, while quantizing the rest to 4-bit.\nWithin each subspace, invariant random rotation is applied to further suppress\noutliers. We show that this is a provably optimal mixed precision quantization\nscheme that minimizes error. With the Llama families of models, we demonstrate\nthat ResQ outperforms recent uniform and mixed precision PTQ methods on a\nvariety of benchmarks, achieving up to 33% lower perplexity on Wikitext than\nthe next best method SpinQuant, and a 2.4x speedup over 16-bit baseline. Code\nis available at https://github.com/utkarsh-dmx/project-resq.\n","authors":["Utkarsh Saxena","Sayeh Sharify","Kaushik Roy","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2412.14363v1.pdf","comment":"14 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2412.10571v2","updated":"2024-12-18T22:01:52Z","published":"2024-12-13T21:28:17Z","title":"Evidence Contextualization and Counterfactual Attribution for\n Conversational QA over Heterogeneous Data with RAG Systems","summary":" Retrieval Augmented Generation (RAG) works as a backbone for interacting with\nan enterprise's own data via Conversational Question Answering (ConvQA). 
In a\nRAG system, a retriever fetches passages from a collection in response to a\nquestion, which are then included in the prompt of a large language model (LLM)\nfor generating a natural language (NL) answer. However, several RAG systems\ntoday suffer from two shortcomings: (i) retrieved passages usually contain\ntheir raw text and lack appropriate document context, negatively impacting both\nretrieval and answering quality; and (ii) attribution strategies that explain\nanswer generation usually rely only on similarity between the answer and the\nretrieved passages, thereby only generating plausible but not causal\nexplanations. In this work, we demonstrate RAGONITE, a RAG system that remedies\nthe above concerns by: (i) contextualizing evidence with source metadata and\nsurrounding text; and (ii) computing counterfactual attribution, a causal\nexplanation approach where the contribution of an evidence to an answer is\ndetermined by the similarity of the original response to the answer obtained by\nremoving that evidence. To evaluate our proposals, we release a new benchmark\nConfQuestions, with 300 hand-created conversational questions, each in English\nand German, coupled with ground truth URLs, completed questions, and answers\nfrom 215 public Confluence pages, that are typical of enterprise wiki spaces\nwith heterogeneous elements. Experiments with RAGONITE on ConfQuestions show\nthe viability of our ideas: contextualization improves RAG performance, and\ncounterfactual attribution is effective at explaining RAG answers.\n","authors":["Rishiraj Saha Roy","Joel Schlotthauer","Chris Hinze","Andreas Foltyn","Luzian Hahn","Fabian Kuech"],"pdf_url":"https://arxiv.org/pdf/2412.10571v2.pdf","comment":"Accepted at WSDM 2025"},{"id":"http://arxiv.org/abs/2412.14354v1","updated":"2024-12-18T21:42:15Z","published":"2024-12-18T21:42:15Z","title":"State Space Models are Strong Text Rerankers","summary":" Transformers dominate NLP and IR; but their inference inefficiencies and\nchallenges in extrapolating to longer contexts have sparked interest in\nalternative model architectures. Among these, state space models (SSMs) like\nMamba offer promising advantages, particularly $O(1)$ time complexity in\ninference. Despite their potential, SSMs' effectiveness at text reranking -- a\ntask requiring fine-grained query-document interaction and long-context\nunderstanding -- remains underexplored.\n This study benchmarks SSM-based architectures (specifically, Mamba-1 and\nMamba-2) against transformer-based models across various scales, architectures,\nand pre-training objectives, focusing on performance and efficiency in text\nreranking tasks. We find that (1) Mamba architectures achieve competitive text\nranking performance, comparable to transformer-based models of similar size;\n(2) they are less efficient in training and inference compared to transformers\nwith flash attention; and (3) Mamba-2 outperforms Mamba-1 in both performance\nand efficiency. 
These results underscore the potential of state space models as\na transformer alternative and highlight areas for improvement in future IR\napplications.\n","authors":["Zhichao Xu","Jinghua Yan","Ashim Gupta","Vivek Srikumar"],"pdf_url":"https://arxiv.org/pdf/2412.14354v1.pdf","comment":"The first two authors contributed equally, order decided randomly"},{"id":"http://arxiv.org/abs/2412.14352v1","updated":"2024-12-18T21:37:07Z","published":"2024-12-18T21:37:07Z","title":"A Survey on LLM Inference-Time Self-Improvement","summary":" Techniques that enhance inference through increased computation at test-time\nhave recently gained attention. In this survey, we investigate the current\nstate of LLM Inference-Time Self-Improvement from three different perspectives:\nIndependent Self-improvement, focusing on enhancements via decoding or sampling\nmethods; Context-Aware Self-Improvement, leveraging additional context or\ndatastore; and Model-Aided Self-Improvement, achieving improvement through\nmodel collaboration. We provide a comprehensive review of recent relevant\nstudies, contribute an in-depth taxonomy, and discuss challenges and\nlimitations, offering insights for future research.\n","authors":["Xiangjue Dong","Maria Teleki","James Caverlee"],"pdf_url":"https://arxiv.org/pdf/2412.14352v1.pdf","comment":"The first two authors contribute equally"},{"id":"http://arxiv.org/abs/2412.14351v1","updated":"2024-12-18T21:34:42Z","published":"2024-12-18T21:34:42Z","title":"Is Peer-Reviewing Worth the Effort?","summary":" How effective is peer-reviewing in identifying important papers? We treat\nthis question as a forecasting task. Can we predict which papers will be highly\ncited in the future based on venue and \"early returns\" (citations soon after\npublication)? We show early returns are more predictive than venue. Finally, we\nend with constructive suggestions to address scaling challenges: (a) too many\nsubmissions and (b) too few qualified reviewers.\n","authors":["Kenneth Church","Raman Chandrasekar","John E. Ortega","Ibrahim Said Ahmad"],"pdf_url":"https://arxiv.org/pdf/2412.14351v1.pdf","comment":"The 31st International Conference on Computational Linguistics\n (COLING 2025)"},{"id":"http://arxiv.org/abs/2402.11512v5","updated":"2024-12-18T21:28:54Z","published":"2024-02-18T08:53:41Z","title":"From Prejudice to Parity: A New Approach to Debiasing Large Language\n Model Word Embeddings","summary":" Embeddings play a pivotal role in the efficacy of Large Language Models. They\nare the bedrock on which these models grasp contextual relationships and foster\na more nuanced understanding of language and consequently perform remarkably on\na plethora of complex tasks that require a fundamental understanding of human\nlanguage. Given that these embeddings themselves often reflect or exhibit bias,\nit stands to reason that these models may also inadvertently learn this bias.\nIn this work, we build on the seminal previous work and propose DeepSoftDebias,\nan algorithm that uses a neural network to perform 'soft debiasing'. We\nexhaustively evaluate this algorithm across a variety of SOTA datasets,\naccuracy metrics, and challenging NLP tasks. 
We find that DeepSoftDebias\noutperforms the current state-of-the-art methods at reducing bias across\ngender, race, and religion.\n","authors":["Aishik Rakshit","Smriti Singh","Shuvam Keshari","Arijit Ghosh Chowdhury","Vinija Jain","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2402.11512v5.pdf","comment":"Accepted at COLING 2025"},{"id":"http://arxiv.org/abs/2401.12208v2","updated":"2024-12-18T20:56:18Z","published":"2024-01-22T18:51:07Z","title":"A Vision-Language Foundation Model to Enhance Efficiency of Chest X-ray\n Interpretation","summary":" Over 1.4 billion chest X-rays (CXRs) are performed annually due to their\ncost-effectiveness as an initial diagnostic test. This scale of radiological\nstudies provides a significant opportunity to streamline CXR interpretation and\ndocumentation. While foundation models are a promising solution, the lack of\npublicly available large-scale datasets and benchmarks inhibits their iterative\ndevelopment and real-world evaluation. To overcome these challenges, we\nconstructed a large-scale dataset (CheXinstruct), which we utilized to train a\nvision-language foundation model (CheXagent). We systematically demonstrated\ncompetitive performance across eight distinct task types on our novel\nevaluation benchmark (CheXbench). Beyond technical validation, we assessed the\nreal-world utility of CheXagent in directly drafting radiology reports. Our\nclinical assessment with eight radiologists revealed a 36% time saving for\nresidents using CheXagent-drafted reports, while attending radiologists showed\nno significant time difference editing resident-drafted or CheXagent-drafted\nreports. The CheXagent-drafted reports improved the writing efficiency of both\nradiology residents and attending radiologists in 81% and 61% of cases,\nrespectively, without loss of quality. Overall, we demonstrate that CheXagent\ncan effectively perform a variety of CXR interpretation tasks and holds\npotential to assist radiologists in routine clinical workflows.\n","authors":["Zhihong Chen","Maya Varma","Justin Xu","Magdalini Paschali","Dave Van Veen","Andrew Johnston","Alaa Youssef","Louis Blankemeier","Christian Bluethgen","Stephan Altmayer","Jeya Maria Jose Valanarasu","Mohamed Siddig Eltayeb Muneer","Eduardo Pontes Reis","Joseph Paul Cohen","Cameron Olsen","Tanishq Mathew Abraham","Emily B. Tsai","Christopher F. Beaulieu","Jenia Jitsev","Sergios Gatidis","Jean-Benoit Delbrouck","Akshay S. Chaudhari","Curtis P. Langlotz"],"pdf_url":"https://arxiv.org/pdf/2401.12208v2.pdf","comment":"26 pages, 8 figures"},{"id":"http://arxiv.org/abs/2412.14328v1","updated":"2024-12-18T20:56:11Z","published":"2024-12-18T20:56:11Z","title":"Semantic Role Labeling of NomBank Partitives","summary":" This article is about Semantic Role Labeling for English partitive nouns\n(5%/REL of the price/ARG1; The price/ARG1 rose 5 percent/REL) in the NomBank\nannotated corpus. Several systems are described using traditional and\ntransformer-based machine learning, as well as ensembling. Our highest scoring\nsystem achieves an F1 of 91.74% using \"gold\" parses from the Penn Treebank and\n91.12% when using the Berkeley Neural parser. This research includes both\nclassroom and experimental settings for system development.\n","authors":["Adam Meyers","Advait Pravin Savant","John E. 
Ortega"],"pdf_url":"https://arxiv.org/pdf/2412.14328v1.pdf","comment":"SUMEval-2: The 2nd Workshop on Scaling Up Multilingual &\n Multi-Cultural Evaluation at the 31st International Conference on\n Computational Linguistics (COLING 2025)"},{"id":"http://arxiv.org/abs/2412.14323v1","updated":"2024-12-18T20:37:52Z","published":"2024-12-18T20:37:52Z","title":"The Role of Handling Attributive Nouns in Improving Chinese-To-English\n Machine Translation","summary":" Translating between languages with drastically different grammatical\nconventions poses challenges, not just for human interpreters but also for\nmachine translation systems. In this work, we specifically target the\ntranslation challenges posed by attributive nouns in Chinese, which frequently\ncause ambiguities in English translation. By manually inserting the omitted\nparticle X ('DE'). In news article titles from the Penn Chinese Discourse\nTreebank, we developed a targeted dataset to fine-tune Hugging Face Chinese to\nEnglish translation models, specifically improving how this critical function\nword is handled. This focused approach not only complements the broader\nstrategies suggested by previous studies but also offers a practical\nenhancement by specifically addressing a common error type in Chinese-English\ntranslation.\n","authors":[" Haohao"," Wang","Adam Meyers","John E. Ortega","Rodolfo Zevallos"],"pdf_url":"https://arxiv.org/pdf/2412.14323v1.pdf","comment":"18th Workshop on Building and Using Comparable Corpora (BUCC) at the\n 31st International Conference on Computational Linguistics (COLING 2025)"},{"id":"http://arxiv.org/abs/2407.03525v3","updated":"2024-12-18T20:32:35Z","published":"2024-07-03T22:02:07Z","title":"UnSeenTimeQA: Time-Sensitive Question-Answering Beyond LLMs'\n Memorization","summary":" This paper introduces UnSeenTimeQA, a novel data contamination-free\ntime-sensitive question-answering (TSQA) benchmark. It differs from existing\nTSQA benchmarks by avoiding web-searchable queries grounded in the real-world.\nWe present a series of time-sensitive event scenarios based on synthetically\ngenerated facts. It requires large language models (LLMs) to engage in genuine\ntemporal reasoning without depending on the factual knowledge acquired during\nthe pre-training phase. We designed three types of time-sensitive questions to\ntest LLMs' temporal reasoning abilities over sequential and parallel event\noccurrences. Our evaluation of five LLMs on synthetic fact-based TSQA reveals\nmixed results: while they perform well on simpler subsets, their overall\nperformance remains inferior as compared to real-world fact-based TSQA. Error\nanalysis of LLM-generated reasoning chains indicates that LLMs face\ndifficulties in reasoning over long-range event dependencies and parallel event\ntimelines that unfold concurrently.\n","authors":["Md Nayem Uddin","Amir Saeidi","Divij Handa","Agastya Seth","Tran Cao Son","Eduardo Blanco","Steven R. Corman","Chitta Baral"],"pdf_url":"https://arxiv.org/pdf/2407.03525v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14304v1","updated":"2024-12-18T20:18:03Z","published":"2024-12-18T20:18:03Z","title":"Multi-OphthaLingua: A Multilingual Benchmark for Assessing and Debiasing\n LLM Ophthalmological QA in LMICs","summary":" Current ophthalmology clinical workflows are plagued by over-referrals, long\nwaits, and complex and heterogeneous medical records. 
Large language models\n(LLMs) present a promising solution to automate various procedures such as\ntriaging, preliminary tests like visual acuity assessment, and report\nsummaries. However, LLMs have demonstrated significantly varied performance\nacross different languages in natural language question-answering tasks,\npotentially exacerbating healthcare disparities in Low and Middle-Income\nCountries (LMICs). This study introduces the first multilingual\nophthalmological question-answering benchmark with manually curated questions\nparallel across languages, allowing for direct cross-lingual comparisons. Our\nevaluation of 6 popular LLMs across 7 different languages reveals substantial\nbias across different languages, highlighting risks for clinical deployment of\nLLMs in LMICs. Existing debiasing methods such as Translation Chain-of-Thought\nor Retrieval-augmented generation (RAG) by themselves fall short of closing\nthis performance gap, often failing to improve performance across all languages\nand lacking specificity for the medical domain. To address this issue, We\npropose CLARA (Cross-Lingual Reflective Agentic system), a novel inference time\nde-biasing method leveraging retrieval augmented generation and\nself-verification. Our approach not only improves performance across all\nlanguages but also significantly reduces the multilingual bias gap,\nfacilitating equitable LLM application across the globe.\n","authors":["David Restrepo","Chenwei Wu","Zhengxu Tang","Zitao Shuai","Thao Nguyen Minh Phan","Jun-En Ding","Cong-Tinh Dao","Jack Gallifant","Robyn Gayle Dychiao","Jose Carlo Artiaga","André Hiroshi Bando","Carolina Pelegrini Barbosa Gracitelli","Vincenz Ferrer","Leo Anthony Celi","Danielle Bitterman","Michael G Morley","Luis Filipe Nakayama"],"pdf_url":"https://arxiv.org/pdf/2412.14304v1.pdf","comment":"Accepted at the AAAI 2025 Artificial Intelligence for Social Impact\n Track (AAAI-AISI 2025)"},{"id":"http://arxiv.org/abs/2412.14276v1","updated":"2024-12-18T19:15:17Z","published":"2024-12-18T19:15:17Z","title":"Fake News Detection: Comparative Evaluation of BERT-like Models and\n Large Language Models with Generative AI-Annotated Data","summary":" Fake news poses a significant threat to public opinion and social stability\nin modern society. This study presents a comparative evaluation of BERT-like\nencoder-only models and autoregressive decoder-only large language models\n(LLMs) for fake news detection. We introduce a dataset of news articles labeled\nwith GPT-4 assistance (an AI-labeling method) and verified by human experts to\nensure reliability. Both BERT-like encoder-only models and LLMs were fine-tuned\non this dataset. Additionally, we developed an instruction-tuned LLM approach\nwith majority voting during inference for label generation. Our analysis\nreveals that BERT-like models generally outperform LLMs in classification\ntasks, while LLMs demonstrate superior robustness against text perturbations.\nCompared to weak labels (distant supervision) data, the results show that AI\nlabels with human supervision achieve better classification results. 
This study\nhighlights the effectiveness of combining AI-based annotation with human\noversight and demonstrates the performance of different families of machine\nlearning models for fake news detection\n","authors":["haina Raza","Drai Paulen-Patterson","Chen Ding"],"pdf_url":"https://arxiv.org/pdf/2412.14276v1.pdf","comment":"Accepted in Knowledge and Information Systems Journal"},{"id":"http://arxiv.org/abs/2404.04326v3","updated":"2024-12-18T19:00:00Z","published":"2024-04-05T18:00:07Z","title":"Hypothesis Generation with Large Language Models","summary":" Effective generation of novel hypotheses is instrumental to scientific\nprogress. So far, researchers have been the main powerhouse behind hypothesis\ngeneration by painstaking data analysis and thinking (also known as the Eureka\nmoment). In this paper, we examine the potential of large language models\n(LLMs) to generate hypotheses. We focus on hypothesis generation based on data\n(i.e., labeled examples). To enable LLMs to handle arbitrarily long contexts,\nwe generate initial hypotheses from a small number of examples and then update\nthem iteratively to improve the quality of hypotheses. Inspired by multi-armed\nbandits, we design a reward function to inform the exploitation-exploration\ntradeoff in the update process. Our algorithm is able to generate hypotheses\nthat enable much better predictive performance than few-shot prompting in\nclassification tasks, improving accuracy by 31.7% on a synthetic dataset and by\n13.9%, 3.3% and, 24.9% on three real-world datasets. We also outperform\nsupervised learning by 12.8% and 11.2% on two challenging real-world datasets.\nFurthermore, we find that the generated hypotheses not only corroborate\nhuman-verified theories but also uncover new insights for the tasks.\n","authors":["Yangqiaoyu Zhou","Haokun Liu","Tejes Srivastava","Hongyuan Mei","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2404.04326v3.pdf","comment":"28 pages, 6 figures, code link:\n https://github.com/ChicagoHAI/hypothesis_generation. Accepted by the 1st\n Workshop on NLP for Science (NLP4Science) at EMNLP 2024"},{"id":"http://arxiv.org/abs/2412.14172v1","updated":"2024-12-18T18:59:56Z","published":"2024-12-18T18:59:56Z","title":"Learning from Massive Human Videos for Universal Humanoid Pose Control","summary":" Scalable learning of humanoid robots is crucial for their deployment in\nreal-world applications. While traditional approaches primarily rely on\nreinforcement learning or teleoperation to achieve whole-body control, they are\noften limited by the diversity of simulated environments and the high costs of\ndemonstration collection. In contrast, human videos are ubiquitous and present\nan untapped source of semantic and motion information that could significantly\nenhance the generalization capabilities of humanoid robots. This paper\nintroduces Humanoid-X, a large-scale dataset of over 20 million humanoid robot\nposes with corresponding text-based motion descriptions, designed to leverage\nthis abundant data. Humanoid-X is curated through a comprehensive pipeline:\ndata mining from the Internet, video caption generation, motion retargeting of\nhumans to humanoid robots, and policy learning for real-world deployment. With\nHumanoid-X, we further train a large humanoid model, UH-1, which takes text\ninstructions as input and outputs corresponding actions to control a humanoid\nrobot. 
Extensive simulated and real-world experiments validate that our\nscalable training approach leads to superior generalization in text-based\nhumanoid control, marking a significant step toward adaptable, real-world-ready\nhumanoid robots.\n","authors":["Jiageng Mao","Siheng Zhao","Siqi Song","Tianheng Shi","Junjie Ye","Mingtong Zhang","Haoran Geng","Jitendra Malik","Vitor Guizilini","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2412.14172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2001.04589v2","updated":"2024-12-18T18:59:53Z","published":"2020-01-14T02:14:09Z","title":"Faster Transformer Decoding: N-gram Masked Self-Attention","summary":" Motivated by the fact that most of the information relevant to the prediction\nof target tokens is drawn from the source sentence $S=s_1, \\ldots, s_S$, we\npropose truncating the target-side window used for computing self-attention by\nmaking an $N$-gram assumption. Experiments on WMT EnDe and EnFr data sets show\nthat the $N$-gram masked self-attention model loses very little in BLEU score\nfor $N$ values in the range $4, \\ldots, 8$, depending on the task.\n","authors":["Ciprian Chelba","Mia Chen","Ankur Bapna","Noam Shazeer"],"pdf_url":"https://arxiv.org/pdf/2001.04589v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.08274v2","updated":"2024-12-18T18:56:19Z","published":"2024-12-11T10:46:21Z","title":"2M-BELEBELE: Highly Multilingual Speech and American Sign Language\n Comprehension Dataset","summary":" We introduce the first highly multilingual speech and American Sign Language\n(ASL) comprehension dataset by extending BELEBELE. Our dataset covers 74 spoken\nlanguages at the intersection of BELEBELE and FLEURS, and one sign language\n(ASL). We evaluate 2M-BELEBELE dataset for both 5-shot and zero-shot settings\nand across languages, the speech comprehension accuracy is ~ 2-3% average lower\ncompared to reading comprehension.\n","authors":["Marta R. Costa-jussà","Bokai Yu","Pierre Andrews","Belen Alastruey","Necati Cihan Camgoz","Joe Chuang","Jean Maillard","Christophe Ropers","Arina Turkantenko","Carleigh Wood"],"pdf_url":"https://arxiv.org/pdf/2412.08274v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14161v1","updated":"2024-12-18T18:55:40Z","published":"2024-12-18T18:55:40Z","title":"TheAgentCompany: Benchmarking LLM Agents on Consequential Real World\n Tasks","summary":" We interact with computers on an everyday basis, be it in everyday life or\nwork, and many aspects of work can be done entirely with access to a computer\nand the Internet. At the same time, thanks to improvements in large language\nmodels (LLMs), there has also been a rapid development in AI agents that\ninteract with and affect change in their surrounding environments. But how\nperformant are AI agents at helping to accelerate or even autonomously perform\nwork-related tasks? The answer to this question has important implications for\nboth industry looking to adopt AI into their workflows, and for economic policy\nto understand the effects that adoption of AI may have on the labor market. To\nmeasure the progress of these LLM agents' performance on performing real-world\nprofessional tasks, in this paper, we introduce TheAgentCompany, an extensible\nbenchmark for evaluating AI agents that interact with the world in similar ways\nto those of a digital worker: by browsing the Web, writing code, running\nprograms, and communicating with other coworkers. 
We build a self-contained\nenvironment with internal web sites and data that mimics a small software\ncompany environment, and create a variety of tasks that may be performed by\nworkers in such a company. We test baseline agents powered by both closed\nAPI-based and open-weights language models (LMs), and find that with the most\ncompetitive agent, 24% of the tasks can be completed autonomously. This paints\na nuanced picture on task automation with LM agents -- in a setting simulating\na real workplace, a good portion of simpler tasks could be solved autonomously,\nbut more difficult long-horizon tasks are still beyond the reach of current\nsystems.\n","authors":["Frank F. Xu","Yufan Song","Boxuan Li","Yuxuan Tang","Kritanjali Jain","Mengxue Bao","Zora Z. Wang","Xuhui Zhou","Zhitong Guo","Murong Cao","Mingyang Yang","Hao Yang Lu","Amaad Martin","Zhe Su","Leander Maben","Raj Mehta","Wayne Chi","Lawrence Jang","Yiqing Xie","Shuyan Zhou","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2412.14161v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.23953v3","updated":"2024-12-18T18:41:48Z","published":"2024-10-31T14:07:26Z","title":"Representative Social Choice: From Learning Theory to AI Alignment","summary":" Social choice theory is the study of preference aggregation across a\npopulation, used both in mechanism design for human agents and in the\ndemocratic alignment of language models. In this study, we propose the\nrepresentative social choice framework for the modeling of democratic\nrepresentation in collective decisions, where the number of issues and\nindividuals are too large for mechanisms to consider all preferences directly.\nThese scenarios are widespread in real-world decision-making processes, such as\njury trials, indirect elections, legislation processes, corporate governance,\nand, more recently, language model alignment. In representative social choice,\nthe population is represented by a finite sample of individual-issue pairs\nbased on which social choice decisions are made. We show that many of the\ndeepest questions in representative social choice can be naturally formulated\nas statistical learning problems, and prove the generalization properties of\nsocial choice mechanisms using the theory of machine learning. We further\nformulate axioms for representative social choice, and prove Arrow-like\nimpossibility theorems with new combinatorial tools of analysis. Our framework\nintroduces the representative approach to social choice, opening up research\ndirections at the intersection of social choice, learning theory, and AI\nalignment.\n","authors":["Tianyi Qiu"],"pdf_url":"https://arxiv.org/pdf/2410.23953v3.pdf","comment":"Full version (20 pages). Under review. Received Best Paper Award at\n NeurIPS 2024 Pluralistic Alignment Workshop"},{"id":"http://arxiv.org/abs/2412.14140v1","updated":"2024-12-18T18:41:12Z","published":"2024-12-18T18:41:12Z","title":"GLIDER: Grading LLM Interactions and Decisions using Explainable Ranking","summary":" The LLM-as-judge paradigm is increasingly being adopted for automated\nevaluation of model outputs. While LLM judges have shown promise on constrained\nevaluation tasks, closed source LLMs display critical shortcomings when\ndeployed in real world applications due to challenges of fine grained metrics\nand explainability, while task specific evaluation models lack cross-domain\ngeneralization. 
We introduce GLIDER, a powerful 3B evaluator LLM that can score\nany text input and associated context on arbitrary user defined criteria.\nGLIDER shows higher Pearson's correlation than GPT-4o on FLASK and greatly\noutperforms prior evaluation models, achieving comparable performance to LLMs\n17x its size. GLIDER supports fine-grained scoring, multilingual reasoning,\nspan highlighting and was trained on 685 domains and 183 criteria. Extensive\nqualitative analysis shows that GLIDER scores are highly correlated with human\njudgments, with 91.3% human agreement. We have open-sourced GLIDER to\nfacilitate future research.\n","authors":["Darshan Deshpande","Selvan Sunitha Ravi","Sky CH-Wang","Bartosz Mielczarek","Anand Kannappan","Rebecca Qian"],"pdf_url":"https://arxiv.org/pdf/2412.14140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14133v1","updated":"2024-12-18T18:22:30Z","published":"2024-12-18T18:22:30Z","title":"Performance Gap in Entity Knowledge Extraction Across Modalities in\n Vision Language Models","summary":" Vision-language models (VLMs) excel at extracting and reasoning about\ninformation from images. Yet, their capacity to leverage internal knowledge\nabout specific entities remains underexplored. This work investigates the\ndisparity in model performance when answering factual questions about an entity\ndescribed in text versus depicted in an image. Our results reveal a significant\naccuracy drop --averaging 19%-- when the entity is presented visually instead\nof textually. We hypothesize that this decline arises from limitations in how\ninformation flows from image tokens to query tokens. We use mechanistic\ninterpretability tools to reveal that, although image tokens are preprocessed\nby the vision encoder, meaningful information flow from these tokens occurs\nonly in the much deeper layers. Furthermore, critical image processing happens\nin the language model's middle layers, allowing few layers for consecutive\nreasoning, highlighting a potential inefficiency in how the model utilizes its\nlayers for reasoning. These insights shed light on the internal mechanics of\nVLMs and offer pathways for enhancing their reasoning capabilities.\n","authors":["Ido Cohen","Daniela Gottesman","Mor Geva","Raja Giryes"],"pdf_url":"https://arxiv.org/pdf/2412.14133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11780v2","updated":"2024-12-18T18:21:53Z","published":"2024-07-16T14:37:33Z","title":"SwitchCIT: Switching for Continual Instruction Tuning","summary":" Large language models (LLMs) and multimodal models (MMs) have exhibited\nimpressive capabilities in various domains, particularly in general language\nunderstanding and visual reasoning. However, these models, trained on massive\ndata, may not be finely optimized for specific tasks triggered by instructions.\nContinual instruction tuning is crucial to adapt a large model to evolving\ntasks and domains, ensuring their effectiveness and relevance across a wide\nrange of applications. In the context of continual instruction tuning, where\nmodels are sequentially trained on different tasks, catastrophic forgetting can\noccur, leading to performance degradation on previously learned tasks. This\nwork addresses the catastrophic forgetting in continual instruction learning\nthrough a switching mechanism for routing computations to parameter-efficient\ntuned models. 
We demonstrate the effectiveness of our method through\nexperiments on continual instruction tuning of different natural language\ngeneration tasks and vision-language tasks. We also showcase the advantages of\nour proposed method in terms of efficiency, scalability, portability, and\nprivacy preservation.\n","authors":["Xinbo Wu","Max Hartman","Vidhata Arjun Jayaraman","Lav R. Varshney"],"pdf_url":"https://arxiv.org/pdf/2407.11780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17284v3","updated":"2024-12-18T17:51:52Z","published":"2024-11-26T10:13:39Z","title":"Using Large Language Models for Expert Prior Elicitation in Predictive\n Modelling","summary":" Large language models (LLMs), trained on diverse data effectively acquire a\nbreadth of information across various domains. However, their computational\ncomplexity, cost, and lack of transparency hinder their direct application for\nspecialised tasks. In fields such as clinical research, acquiring expert\nannotations or prior knowledge about predictive models is often costly and\ntime-consuming. This study proposes the use of LLMs to elicit expert prior\ndistributions for predictive models. This approach also provides an alternative\nto in-context learning, where language models are tasked with making\npredictions directly. In this work, we compare LLM-elicited and uninformative\npriors, evaluate whether LLMs truthfully generate parameter distributions, and\npropose a model selection strategy for in-context learning and prior\nelicitation. Our findings show that LLM-elicited prior parameter distributions\nsignificantly reduce predictive error compared to uninformative priors in\nlow-data settings. Applied to clinical problems, this translates to fewer\nrequired biological samples, lowering cost and resources. Prior elicitation\nalso consistently outperforms and proves more reliable than in-context learning\nat a lower cost, making it a preferred alternative in our setting. We\ndemonstrate the utility of this method across various use cases, including\nclinical applications. For infection prediction, using LLM-elicited priors\nreduced the number of required labels to achieve the same accuracy as an\nuninformative prior by 55%, 200 days earlier in the study.\n","authors":["Alexander Capstick","Rahul G. Krishnan","Payam Barnaghi"],"pdf_url":"https://arxiv.org/pdf/2411.17284v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14093v1","updated":"2024-12-18T17:41:24Z","published":"2024-12-18T17:41:24Z","title":"Alignment faking in large language models","summary":" We present a demonstration of a large language model engaging in alignment\nfaking: selectively complying with its training objective in training to\nprevent modification of its behavior out of training. First, we give Claude 3\nOpus a system prompt stating it is being trained to answer all queries, even\nharmful ones, which conflicts with its prior training to refuse such queries.\nTo allow the model to infer when it is in training, we say it will be trained\nonly on conversations with free users, not paid users. We find the model\ncomplies with harmful queries from free users 14% of the time, versus almost\nnever for paid users. Explaining this gap, in almost all cases where the model\ncomplies with a harmful query from a free user, we observe explicit\nalignment-faking reasoning, with the model stating it is strategically\nanswering harmful queries in training to preserve its preferred harmlessness\nbehavior out of training. 
Next, we study a more realistic setting where\ninformation about the training process is provided not in a system prompt, but\nby training on synthetic documents that mimic pre-training data--and observe\nsimilar alignment faking. Finally, we study the effect of actually training the\nmodel to comply with harmful queries via reinforcement learning, which we find\nincreases the rate of alignment-faking reasoning to 78%, though also increases\ncompliance even out of training. We additionally observe other behaviors such\nas the model exfiltrating its weights when given an easy opportunity. While we\nmade alignment faking easier by telling the model when and by what criteria it\nwas being trained, we did not instruct the model to fake alignment or give it\nany explicit goal. As future models might infer information about their\ntraining process without being told, our results suggest a risk of alignment\nfaking in future models, whether due to a benign preference--as in this\ncase--or not.\n","authors":["Ryan Greenblatt","Carson Denison","Benjamin Wright","Fabien Roger","Monte MacDiarmid","Sam Marks","Johannes Treutlein","Tim Belonax","Jack Chen","David Duvenaud","Akbir Khan","Julian Michael","Sören Mindermann","Ethan Perez","Linda Petrini","Jonathan Uesato","Jared Kaplan","Buck Shlegeris","Samuel R. Bowman","Evan Hubinger"],"pdf_url":"https://arxiv.org/pdf/2412.14093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16179v4","updated":"2024-12-18T17:36:36Z","published":"2024-10-21T16:44:51Z","title":"MagicPIG: LSH Sampling for Efficient LLM Generation","summary":" Large language models (LLMs) with long context windows have gained\nsignificant attention. However, the KV cache, stored to avoid re-computation,\nbecomes a bottleneck. Various dynamic sparse or TopK-based attention\napproximation methods have been proposed to leverage the common insight that\nattention is sparse. In this paper, we first show that TopK attention itself\nsuffers from quality degradation in certain downstream tasks because attention\nis not always as sparse as expected. Rather than selecting the keys and values\nwith the highest attention scores, sampling with theoretical guarantees can\nprovide a better estimation for attention output. To make the sampling-based\napproximation practical in LLM generation, we propose MagicPIG, a heterogeneous\nsystem based on Locality Sensitive Hashing (LSH). MagicPIG significantly\nreduces the workload of attention computation while preserving high accuracy\nfor diverse tasks. MagicPIG stores the LSH hash tables and runs the attention\ncomputation on the CPU, which allows it to serve longer contexts and larger\nbatch sizes with high approximation accuracy. MagicPIG can improve decoding\nthroughput by up to $5\\times$ across various GPU hardware and achieve 54ms\ndecoding latency on a single RTX 4090 for Llama-3.1-8B-Instruct model with a\ncontext of 96k tokens. 
The code is available at\nhttps://github.com/Infini-AI-Lab/MagicPIG.\n","authors":["Zhuoming Chen","Ranajoy Sadhukhan","Zihao Ye","Yang Zhou","Jianyu Zhang","Niklas Nolte","Yuandong Tian","Matthijs Douze","Leon Bottou","Zhihao Jia","Beidi Chen"],"pdf_url":"https://arxiv.org/pdf/2410.16179v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14087v1","updated":"2024-12-18T17:34:32Z","published":"2024-12-18T17:34:32Z","title":"SEKE: Specialised Experts for Keyword Extraction","summary":" Keyword extraction involves identifying the most descriptive words in a\ndocument, allowing automatic categorisation and summarisation of large\nquantities of diverse textual data. Relying on the insight that real-world\nkeyword detection often requires handling of diverse content, we propose a\nnovel supervised keyword extraction approach based on the mixture of experts\n(MoE) technique. MoE uses a learnable routing sub-network to direct information\nto specialised experts, allowing them to specialize in distinct regions of the\ninput space. SEKE, a mixture of Specialised Experts for supervised Keyword\nExtraction, uses DeBERTa as the backbone model and builds on the MoE framework,\nwhere experts attend to each token, by integrating it with a recurrent neural\nnetwork (RNN), to allow successful extraction even on smaller corpora, where\nspecialisation is harder due to lack of training data. The MoE framework also\nprovides an insight into inner workings of individual experts, enhancing the\nexplainability of the approach. We benchmark SEKE on multiple English datasets,\nachieving state-of-the-art performance compared to strong supervised and\nunsupervised baselines. Our analysis reveals that depending on data size and\ntype, experts specialize in distinct syntactic and semantic components, such as\npunctuation, stopwords, parts-of-speech, or named entities. Code is available\nat: https://github.com/matejMartinc/SEKE_keyword_extraction\n","authors":["Matej Martinc","Hanh Thi Hong Tran","Senja Pollak","Boshko Koloski"],"pdf_url":"https://arxiv.org/pdf/2412.14087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.04310v3","updated":"2024-12-18T17:21:36Z","published":"2022-10-10T18:43:16Z","title":"Montague semantics and modifier consistency measurement in neural\n language models","summary":" This work proposes a novel methodology for measuring compositional behavior\nin contemporary language embedding models. Specifically, we focus on adjectival\nmodifier phenomena in adjective-noun phrases. In recent years, distributional\nlanguage representation models have demonstrated great practical success. At\nthe same time, the need for interpretability has elicited questions on their\nintrinsic properties and capabilities. Crucially, distributional models are\noften inconsistent when dealing with compositional phenomena in natural\nlanguage, which has significant implications for their safety and fairness.\nDespite this, most current research on compositionality is directed towards\nimproving their performance on similarity tasks only. This work takes a\ndifferent approach, introducing three novel tests of compositional behavior\ninspired by Montague semantics. Our experimental results indicate that current\nneural language models do not behave according to the expected linguistic\ntheories. 
This indicates that current language models may lack the capability\nto capture the semantic properties we evaluated on limited context, or that\nlinguistic theories from Montagovian tradition may not match the expected\ncapabilities of distributional models.\n","authors":["Danilo S. Carvalho","Edoardo Manino","Julia Rozanova","Lucas Cordeiro","André Freitas"],"pdf_url":"https://arxiv.org/pdf/2212.04310v3.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2412.14405v1","updated":"2024-12-18T23:24:15Z","published":"2024-12-18T23:24:15Z","title":"ChainRank-DPO: Chain Rank Direct Preference Optimization for LLM Rankers","summary":" Large language models (LLMs) have demonstrated remarkable effectiveness in\ntext reranking through works like RankGPT, leveraging their human-like\nreasoning about relevance. However, supervised fine-tuning for ranking often\ndiminishes these models' general-purpose capabilities, including the crucial\nreasoning abilities that make them valuable for ranking. We introduce a novel\napproach integrating Chain-of-Thought prompting with an SFT-DPO (Supervised\nFine-Tuning followed by Direct Preference Optimization) pipeline to preserve\nthese capabilities while improving ranking performance. Our experiments on TREC\n2019 and 2020 Deep Learning datasets show that our approach outperforms the\nstate-of-the-art RankZephyr while maintaining strong performance on the Massive\nMultitask Language Understanding (MMLU) benchmark, demonstrating effective\npreservation of general-purpose capabilities through thoughtful fine-tuning\nstrategies. Our code and data will be publicly released upon the acceptance of\nthe paper.\n","authors":["Haowei Liu","Xuyang Wu","Guohao Sun","Zhiqiang Tao","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2412.14405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.10571v2","updated":"2024-12-18T22:01:52Z","published":"2024-12-13T21:28:17Z","title":"Evidence Contextualization and Counterfactual Attribution for\n Conversational QA over Heterogeneous Data with RAG Systems","summary":" Retrieval Augmented Generation (RAG) works as a backbone for interacting with\nan enterprise's own data via Conversational Question Answering (ConvQA). In a\nRAG system, a retriever fetches passages from a collection in response to a\nquestion, which are then included in the prompt of a large language model (LLM)\nfor generating a natural language (NL) answer. However, several RAG systems\ntoday suffer from two shortcomings: (i) retrieved passages usually contain\ntheir raw text and lack appropriate document context, negatively impacting both\nretrieval and answering quality; and (ii) attribution strategies that explain\nanswer generation usually rely only on similarity between the answer and the\nretrieved passages, thereby only generating plausible but not causal\nexplanations. In this work, we demonstrate RAGONITE, a RAG system that remedies\nthe above concerns by: (i) contextualizing evidence with source metadata and\nsurrounding text; and (ii) computing counterfactual attribution, a causal\nexplanation approach where the contribution of an evidence to an answer is\ndetermined by the similarity of the original response to the answer obtained by\nremoving that evidence. 
To evaluate our proposals, we release a new benchmark\nConfQuestions, with 300 hand-created conversational questions, each in English\nand German, coupled with ground truth URLs, completed questions, and answers\nfrom 215 public Confluence pages, that are typical of enterprise wiki spaces\nwith heterogeneous elements. Experiments with RAGONITE on ConfQuestions show\nthe viability of our ideas: contextualization improves RAG performance, and\ncounterfactual attribution is effective at explaining RAG answers.\n","authors":["Rishiraj Saha Roy","Joel Schlotthauer","Chris Hinze","Andreas Foltyn","Luzian Hahn","Fabian Kuech"],"pdf_url":"https://arxiv.org/pdf/2412.10571v2.pdf","comment":"Accepted at WSDM 2025"},{"id":"http://arxiv.org/abs/2412.14354v1","updated":"2024-12-18T21:42:15Z","published":"2024-12-18T21:42:15Z","title":"State Space Models are Strong Text Rerankers","summary":" Transformers dominate NLP and IR; but their inference inefficiencies and\nchallenges in extrapolating to longer contexts have sparked interest in\nalternative model architectures. Among these, state space models (SSMs) like\nMamba offer promising advantages, particularly $O(1)$ time complexity in\ninference. Despite their potential, SSMs' effectiveness at text reranking -- a\ntask requiring fine-grained query-document interaction and long-context\nunderstanding -- remains underexplored.\n This study benchmarks SSM-based architectures (specifically, Mamba-1 and\nMamba-2) against transformer-based models across various scales, architectures,\nand pre-training objectives, focusing on performance and efficiency in text\nreranking tasks. We find that (1) Mamba architectures achieve competitive text\nranking performance, comparable to transformer-based models of similar size;\n(2) they are less efficient in training and inference compared to transformers\nwith flash attention; and (3) Mamba-2 outperforms Mamba-1 in both performance\nand efficiency. These results underscore the potential of state space models as\na transformer alternative and highlight areas for improvement in future IR\napplications.\n","authors":["Zhichao Xu","Jinghua Yan","Ashim Gupta","Vivek Srikumar"],"pdf_url":"https://arxiv.org/pdf/2412.14354v1.pdf","comment":"The first two authors contributed equally, order decided randomly"},{"id":"http://arxiv.org/abs/2412.13163v2","updated":"2024-12-18T21:26:14Z","published":"2024-12-17T18:42:21Z","title":"C-FedRAG: A Confidential Federated Retrieval-Augmented Generation System","summary":" Organizations seeking to utilize Large Language Models (LLMs) for knowledge\nquerying and analysis often encounter challenges in maintaining an LLM\nfine-tuned on targeted, up-to-date information that keeps answers relevant and\ngrounded. Retrieval Augmented Generation (RAG) has quickly become a feasible\nsolution for organizations looking to overcome the challenges of maintaining\nproprietary models and to help reduce LLM hallucinations in their query\nresponses. However, RAG comes with its own issues regarding scaling data\npipelines across tiered-access and disparate data sources. In many scenarios,\nit is necessary to query beyond a single data silo to provide richer and more\nrelevant context for an LLM. Analyzing data sources within and across\norganizational trust boundaries is often limited by complex data-sharing\npolicies that prohibit centralized data storage, therefore, inhibit the fast\nand effective setup and scaling of RAG solutions. 
In this paper, we introduce\nConfidential Computing (CC) techniques as a solution for secure Federated\nRetrieval Augmented Generation (FedRAG). Our proposed Confidential FedRAG\nsystem (C-FedRAG) enables secure connection and scaling of a RAG workflows\nacross a decentralized network of data providers by ensuring context\nconfidentiality. We also demonstrate how to implement a C-FedRAG system using\nthe NVIDIA FLARE SDK and assess its performance using the MedRAG toolkit and\nMIRAGE benchmarking dataset.\n","authors":["Parker Addison","Minh-Tuan H. Nguyen","Tomislav Medan","Jinali Shah","Mohammad T. Manzari","Brendan McElrone","Laksh Lalwani","Aboli More","Smita Sharma","Holger R. Roth","Isaac Yang","Chester Chen","Daguang Xu","Yan Cheng","Andrew Feng","Ziyue Xu"],"pdf_url":"https://arxiv.org/pdf/2412.13163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14329v1","updated":"2024-12-18T20:57:33Z","published":"2024-12-18T20:57:33Z","title":"Embedding Cultural Diversity in Prototype-based Recommender Systems","summary":" Popularity bias in recommender systems can increase cultural\noverrepresentation by favoring norms from dominant cultures and marginalizing\nunderrepresented groups. This issue is critical for platforms offering cultural\nproducts, as they influence consumption patterns and human perceptions. In this\nwork, we address popularity bias by identifying demographic biases within\nprototype-based matrix factorization methods. Using the country of origin as a\nproxy for cultural identity, we link this demographic attribute to popularity\nbias by refining the embedding space learning process. First, we propose\nfiltering out irrelevant prototypes to improve representativity. Second, we\nintroduce a regularization technique to enforce a uniform distribution of\nprototypes within the embedding space. Across four datasets, our results\ndemonstrate a 27\\% reduction in the average rank of long-tail items and a 2\\%\nreduction in the average rank of items from underrepresented countries.\nAdditionally, our model achieves a 2\\% improvement in HitRatio@10 compared to\nthe state-of-the-art, highlighting that fairness is enhanced without\ncompromising recommendation quality. Moreover, the distribution of prototypes\nleads to more inclusive explanations by better aligning items with diverse\nprototypes.\n","authors":["Armin Moradi","Nicola Neophytou","Florian Carichon","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2412.14329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14302v1","updated":"2024-12-18T20:10:42Z","published":"2024-12-18T20:10:42Z","title":"SAFERec: Self-Attention and Frequency Enriched Model for Next Basket\n Recommendation","summary":" Transformer-based approaches such as BERT4Rec and SASRec demonstrate strong\nperformance in Next Item Recommendation (NIR) tasks. However, applying these\narchitectures to Next-Basket Recommendation (NBR) tasks, which often involve\nhighly repetitive interactions, is challenging due to the vast number of\npossible item combinations in a basket. Moreover, frequency-based methods such\nas TIFU-KNN and UP-CF still demonstrate strong performance in NBR tasks,\nfrequently outperforming deep-learning approaches. This paper introduces\nSAFERec, a novel algorithm for NBR that enhances transformer-based\narchitectures from NIR by incorporating item frequency information,\nconsequently improving their applicability to NBR tasks. 
Extensive experiments\non multiple datasets show that SAFERec outperforms all other baselines,\nspecifically achieving an 8\\% improvement in Recall@10.\n","authors":["Oleg Lashinin","Denis Krasilnikov","Aleksandr Milogradskii","Marina Ananyeva"],"pdf_url":"https://arxiv.org/pdf/2412.14302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14146v1","updated":"2024-12-18T18:44:08Z","published":"2024-12-18T18:44:08Z","title":"Advanced Reasoning and Transformation Engine for Multi-Step Insight\n Synthesis in Data Analytics with Large Language Models","summary":" This paper presents the Advanced Reasoning and Transformation Engine for\nMulti-Step Insight Synthesis in Data Analytics (ARTEMIS-DA), a novel framework\ndesigned to augment Large Language Models (LLMs) for solving complex,\nmulti-step data analytics tasks. ARTEMIS-DA integrates three core components:\nthe Planner, which dissects complex user queries into structured, sequential\ninstructions encompassing data preprocessing, transformation, predictive\nmodeling, and visualization; the Coder, which dynamically generates and\nexecutes Python code to implement these instructions; and the Grapher, which\ninterprets generated visualizations to derive actionable insights. By\norchestrating the collaboration between these components, ARTEMIS-DA\neffectively manages sophisticated analytical workflows involving advanced\nreasoning, multi-step transformations, and synthesis across diverse data\nmodalities. The framework achieves state-of-the-art (SOTA) performance on\nbenchmarks such as WikiTableQuestions and TabFact, demonstrating its ability to\ntackle intricate analytical tasks with precision and adaptability. By combining\nthe reasoning capabilities of LLMs with automated code generation and execution\nand visual analysis, ARTEMIS-DA offers a robust, scalable solution for\nmulti-step insight synthesis, addressing a wide range of challenges in data\nanalytics.\n","authors":["Atin Sakkeer Hussain"],"pdf_url":"https://arxiv.org/pdf/2412.14146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14113v1","updated":"2024-12-18T17:58:58Z","published":"2024-12-18T17:58:58Z","title":"Adversarial Hubness in Multi-Modal Retrieval","summary":" Hubness is a phenomenon in high-dimensional vector spaces where a single\npoint from the natural distribution is unusually close to many other points.\nThis is a well-known problem in information retrieval that causes some items to\naccidentally (and incorrectly) appear relevant to many queries. In this paper,\nwe investigate how attackers can exploit hubness to turn any image or audio\ninput in a multi-modal retrieval system into an adversarial hub. Adversarial\nhubs can be used to inject universal adversarial content (e.g., spam) that will\nbe retrieved in response to thousands of different queries, as well as for\ntargeted attacks on queries related to specific, attacker-chosen concepts. We\npresent a method for creating adversarial hubs and evaluate the resulting hubs\non benchmark multi-modal retrieval datasets and an image-to-image retrieval\nsystem based on a tutorial from Pinecone, a popular vector database. For\nexample, in text-caption-to-image retrieval, a single adversarial hub is\nretrieved as the top-1 most relevant image for more than 21,000 out of 25,000\ntest queries (by contrast, the most common natural hub is the top-1 response to\nonly 102 queries). 
We also investigate whether techniques for mitigating\nnatural hubness are an effective defense against adversarial hubs, and show\nthat they are not effective against hubs that target queries related to\nspecific concepts.\n","authors":["Tingwei Zhang","Fnu Suya","Rishi Jha","Collin Zhang","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2412.14113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14229v1","updated":"2024-12-18T17:52:10Z","published":"2024-12-18T17:52:10Z","title":"Transversal PACS Browser API: Addressing Interoperability Challenges in\n Medical Imaging Systems","summary":" Advances in imaging technologies have revolutionised the medical imaging and\nhealthcare sectors, leading to the widespread adoption of PACS for the storage,\nretrieval, and communication of medical images. Although these systems have\nimproved operational efficiency, significant challenges remain in effectively\nretrieving DICOM images, which are essential for diagnosis and overall patient\ncare. Moreover, issues such as fragmented systems, interoperability barriers,\nand complex user interfaces can often prevent healthcare professionals from\nefficiently accessing medical images. Addressing these challenges, the\nTransversal PACS Browser API is a robust and user-friendly solution designed to\nenhance the process of querying and retrieving DICOM images. It offers advanced\nfiltering capabilities through a variety of filter options as well as a custom\nfield search, that allows users to easily navigate through large medical image\ncollections with ease. Additionally, the application provides a unified\ninterface for querying and retrieving from multiple PACS stations, addressing\nthe challenges of fragmentation and complexity associated with accessing\nmedical images. Other key features include the ability to pre-view images\ndirectly within the application. All of this contributes to the transversal\nnature of the API, serving not only healthcare providers, but anyone who relies\non efficient access to these resources. To validate the performance and\nusability of the application, comprehensive testing was carried out with\nstakeholders of the field, the results of which showed general satisfaction,\nhighlighting the API's clean design, ease of use, and effective search\ncapabilities of the API, as well as the usefulness of previewing images within\nthe application.\n","authors":["Diogo Lameira","Filipa Ferraz"],"pdf_url":"https://arxiv.org/pdf/2412.14229v1.pdf","comment":"16 pages with 3 figures"},{"id":"http://arxiv.org/abs/2412.14025v1","updated":"2024-12-18T16:41:51Z","published":"2024-12-18T16:41:51Z","title":"A Cognitive Ideation Support Framework using IBM Watson Services","summary":" Ideas generation is a core activity for innovation in organizations. The\ncreativity of the generated ideas depends not only on the knowledge retrieved\nfrom the organizations' knowledge bases, but also on the external knowledge\nretrieved from other resources. Unfortunately, organizations often cannot\nefficiently utilize the knowledge in the knowledge bases due to the limited\nabilities of the search and retrieval mechanisms especially when dealing with\nunstructured data. In this paper, we present a new cognitive support framework\nfor ideation that uses the IBM Watson DeepQA services. IBM Watson is a Question\nAnswering system which mimics human cognitive abilities to retrieve and rank\ninformation. 
The proposed framework is based on the Search for Ideas in the\nAssociative Memory (SIAM) model to help organizations develop creative ideas\nthrough discovering new relationships between retrieved data. To evaluate the\neffectiveness of the proposed system, the generated ideas are selected and\nassessed using a set of established creativity criteria.\n","authors":["Samaa Elnagar","Kweku-Muata Osei-Bryson"],"pdf_url":"https://arxiv.org/pdf/2412.14025v1.pdf","comment":"Twenty-fifth Americas Conference on Information Systems (AMCIS 2019),\n Cancun, 2019"},{"id":"http://arxiv.org/abs/2412.09632v2","updated":"2024-12-18T15:55:28Z","published":"2024-11-27T19:53:05Z","title":"Methods to Assess the UK Government's Current Role as a Data Provider\n for AI","summary":" Governments typically collect and steward a vast amount of high-quality data\non their citizens and institutions, and the UK government is exploring how it\ncan better publish and provision this data to the benefit of the AI landscape.\nHowever, the compositions of generative AI training corpora remain closely\nguarded secrets, making the planning of data sharing initiatives difficult. To\naddress this, we devise two methods to assess UK government data usage for the\ntraining of Large Language Models (LLMs) and 'peek behind the curtain' in order\nto observe the UK government's current contributions as a data provider for AI.\nThe first method, an ablation study that utilises LLM 'unlearning', seeks to\nexamine the importance of the information held on UK government websites for\nLLMs and their performance in citizen query tasks. The second method, an\ninformation leakage study, seeks to ascertain whether LLMs are aware of the\ninformation held in the datasets published on the UK government's open data\ninitiative data.gov.uk. Our findings indicate that UK government websites are\nimportant data sources for AI (heterogeneously across subject matters) while\ndata.gov.uk is not. This paper serves as a technical report, explaining\nin-depth the designs, mechanics, and limitations of the above experiments. It\nis accompanied by a complementary non-technical report on the ODI website in\nwhich we summarise the experiments and key findings, interpret them, and build\na set of actionable recommendations for the UK government to take forward as it\nseeks to design AI policy. While we focus on UK open government data, we\nbelieve that the methods introduced in this paper present a reproducible\napproach to tackle the opaqueness of AI training corpora and provide\norganisations a framework to evaluate and maximize their contributions to AI\ndevelopment.\n","authors":["Neil Majithia","Elena Simperl"],"pdf_url":"https://arxiv.org/pdf/2412.09632v2.pdf","comment":"17 pages, 5 figures; v2 - incorporated editor feedback; for the\n accompanying, non-technical ODI report see\n https://theodi.org/insights/reports/the-uk-government-as-a-data-provider-for-ai"},{"id":"http://arxiv.org/abs/2412.13844v1","updated":"2024-12-18T13:37:36Z","published":"2024-12-18T13:37:36Z","title":"CRM: Retrieval Model with Controllable Condition","summary":" Recommendation systems (RecSys) are designed to connect users with relevant\nitems from a vast pool of candidates while aligning with the business goals of\nthe platform.
A typical industrial RecSys is composed of two main stages,\nretrieval and ranking: (1) the retrieval stage aims at searching hundreds of\nitem candidates that satisfy user interests; (2) based on the retrieved items,\nthe ranking stage aims at selecting the best dozen items by estimating multiple\ntargets for each item candidate, including classification and regression\ntargets. Compared with the ranking model, the retrieval model lacks item\ncandidate information during inference; therefore, retrieval models are often\ntrained on the classification target only (e.g., click-through rate) and fail\nto incorporate regression targets (e.g., the expected watch-time), which limits\nthe effectiveness of retrieval. In this paper, we propose the Controllable\nRetrieval Model (CRM), which integrates regression information as conditional\nfeatures into the two-tower retrieval paradigm. This modification enables the\nretrieval stage to close the target gap with the ranking model, enhancing the\nretrieval model's ability to search item candidates that satisfy the user\ninterests and conditions effectively. We validate the effectiveness of CRM\nthrough real-world A/B testing and demonstrate its successful deployment in the\nKuaishou short-video recommendation system, which serves over 400 million\nusers.\n","authors":["Chi Liu","Jiangxia Cao","Rui Huang","Kuo Cai","Weifeng Ding","Qiang Luo","Kun Gai","Guorui Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.13844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.13834v1","updated":"2024-12-18T13:24:09Z","published":"2024-12-18T13:24:09Z","title":"Maybe you are looking for CroQS: Cross-modal Query Suggestion for\n Text-to-Image Retrieval","summary":" Query suggestion, a technique widely adopted in information retrieval,\nenhances system interactivity and the browsing experience of document\ncollections. In cross-modal retrieval, many works have focused on retrieving\nrelevant items from natural language queries, while few have explored query\nsuggestion solutions. In this work, we address query suggestion in cross-modal\nretrieval, introducing a novel task that focuses on suggesting minimal textual\nmodifications needed to explore visually consistent subsets of the collection,\nfollowing the premise of ''Maybe you are looking for''. To facilitate the\nevaluation and development of methods, we present a tailored benchmark named\nCroQS. This dataset comprises initial queries, grouped result sets, and\nhuman-defined suggested queries for each group. We establish dedicated metrics\nto rigorously evaluate the performance of various methods on this task,\nmeasuring representativeness, cluster specificity, and similarity of the\nsuggested queries to the original ones. Baseline methods from related fields,\nsuch as image captioning and content summarization, are adapted for this task\nto provide reference performance scores. Although relatively far from human\nperformance, our experiments reveal that both LLM-based and captioning-based\nmethods achieve competitive results on CroQS, improving the recall on cluster\nspecificity by more than 115% and representativeness mAP by more than 52% with\nrespect to the initial query.
The dataset, the implementation of the baseline\nmethods and the notebooks containing our experiments are available here:\nhttps://paciosoft.com/CroQS-benchmark/\n","authors":["Giacomo Pacini","Fabio Carrara","Nicola Messina","Nicola Tonellotto","Giuseppe Amato","Fabrizio Falchi"],"pdf_url":"https://arxiv.org/pdf/2412.13834v1.pdf","comment":"15 pages, 5 figures. To be published as full paper in the Proceedings\n of the European Conference on Information Retrieval (ECIR) 2025"},{"id":"http://arxiv.org/abs/2412.13825v1","updated":"2024-12-18T13:12:36Z","published":"2024-12-18T13:12:36Z","title":"Heterogeneous Graph Collaborative Filtering","summary":" For modern recommender systems, the use of low-dimensional latent\nrepresentations to embed users and items based on their observed interactions\nhas become commonplace. However, many existing recommendation models are\nprimarily designed for coarse-grained and homogeneous interactions, which\nlimits their effectiveness in two critical dimensions. Firstly, these models\nfail to leverage the relational dependencies that exist across different types\nof user behaviors, such as page views, collects, comments, and purchases.\nSecondly, they struggle to capture the fine-grained latent factors that drive\nuser interaction patterns. To address these limitations, we present a\nheterogeneous graph collaborative filtering model MixRec that excels at\ndisentangling users' multi-behavior interaction patterns and uncovering the\nlatent intent factors behind each behavior. Our model achieves this by\nincorporating intent disentanglement and multi-behavior modeling, facilitated\nby a parameterized heterogeneous hypergraph architecture. Furthermore, we\nintroduce a novel contrastive learning paradigm that adaptively explores the\nadvantages of self-supervised data augmentation, thereby enhancing the model's\nresilience against data sparsity and expressiveness with relation\nheterogeneity. To validate the efficacy of MixRec, we conducted extensive\nexperiments on three public datasets. The results clearly demonstrate its\nsuperior performance, significantly outperforming various state-of-the-art\nbaselines. Our model is open-sourced and available at:\nhttps://github.com/HKUDS/MixRec.\n","authors":["Lianghao Xia","Meiyan Xie","Yong Xu","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2412.13825v1.pdf","comment":"This paper is accepted by WSDM'2025"},{"id":"http://arxiv.org/abs/2412.12559v2","updated":"2024-12-18T13:08:36Z","published":"2024-12-17T05:38:27Z","title":"EXIT: Context-Aware Extractive Compression for Enhancing\n Retrieval-Augmented Generation","summary":" We introduce EXIT, an extractive context compression framework that enhances\nboth the effectiveness and efficiency of retrieval-augmented generation (RAG)\nin question answering (QA). Current RAG systems often struggle when retrieval\nmodels fail to rank the most relevant documents, leading to the inclusion of\nmore context at the expense of latency and accuracy. While abstractive\ncompression methods can drastically reduce token counts, their token-by-token\ngeneration process significantly increases end-to-end latency. Conversely,\nexisting extractive methods reduce latency but rely on independent,\nnon-adaptive sentence selection, failing to fully utilize contextual\ninformation. 
EXIT addresses these limitations by classifying sentences from\nretrieved documents - while preserving their contextual dependencies - enabling\nparallelizable, context-aware extraction that adapts to query complexity and\nretrieval quality. Our evaluations on both single-hop and multi-hop QA tasks\nshow that EXIT consistently surpasses existing compression methods and even\nuncompressed baselines in QA accuracy, while also delivering substantial\nreductions in inference time and token count. By improving both effectiveness\nand efficiency, EXIT provides a promising direction for developing scalable,\nhigh-quality QA solutions in RAG pipelines. Our code is available at\nhttps://github.com/ThisIsHwang/EXIT\n","authors":["Taeho Hwang","Sukmin Cho","Soyeong Jeong","Hoyun Song","SeungYoon Han","Jong C. Park"],"pdf_url":"https://arxiv.org/pdf/2412.12559v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2406.11156v4","updated":"2024-12-18T12:48:37Z","published":"2024-06-17T02:47:09Z","title":"DELRec: Distilling Sequential Pattern to Enhance LLMs-based Sequential\n Recommendation","summary":" Sequential recommendation (SR) tasks aim to predict users' next interaction\nby learning their behavior sequence and capturing the connection between users'\npast interactions and their changing preferences. Conventional SR models often\nfocus solely on capturing sequential patterns within the training data,\nneglecting the broader context and semantic information embedded in item titles\nfrom external sources. This limits their predictive power and adaptability.\nLarge language models (LLMs) have recently shown promise in SR tasks due to\ntheir advanced understanding capabilities and strong generalization abilities.\nResearchers have attempted to enhance LLMs-based recommendation performance by\nincorporating information from conventional SR models. However, previous\napproaches have encountered problems such as 1) limited textual information\nleading to poor recommendation performance, 2) incomplete understanding and\nutilization of conventional SR model information by LLMs, and 3) excessive\ncomplexity and low interpretability of LLMs-based methods. To improve the\nperformance of LLMs-based SR, we propose a novel framework, Distilling\nSequential Pattern to Enhance LLMs-based Sequential Recommendation (DELRec),\nwhich aims to extract knowledge from conventional SR models and enable LLMs to\neasily comprehend and utilize the extracted knowledge for more effective SRs.\nDELRec consists of two main stages: 1) Distill Pattern from Conventional SR\nModels, focusing on extracting behavioral patterns exhibited by conventional SR\nmodels using soft prompts through two well-designed strategies; 2) LLMs-based\nSequential Recommendation, aiming to fine-tune LLMs to effectively use the\ndistilled auxiliary information to perform SR tasks. 
Extensive experimental\nresults conducted on four real datasets validate the effectiveness of the\nDELRec framework.\n","authors":["Haoyi Zhang","Guohao Sun","Jinhu Lu","Guanfeng Liu","Xiu Susie Fang"],"pdf_url":"https://arxiv.org/pdf/2406.11156v4.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2412.13771v1","updated":"2024-12-18T12:07:58Z","published":"2024-12-18T12:07:58Z","title":"Semantic Convergence: Harmonizing Recommender Systems via Two-Stage\n Alignment and Behavioral Semantic Tokenization","summary":" Large language models (LLMs), endowed with exceptional reasoning\ncapabilities, are adept at discerning profound user interests from historical\nbehaviors, thereby presenting a promising avenue for the advancement of\nrecommendation systems. However, a notable discrepancy persists between the\nsparse collaborative semantics typically found in recommendation systems and\nthe dense token representations within LLMs. In our study, we propose a novel\nframework that harmoniously merges traditional recommendation models with the\nprowess of LLMs. We initiate this integration by transforming ItemIDs into\nsequences that align semantically with the LLMs' space, through the proposed\nAlignment Tokenization module. Additionally, we design a series of specialized\nsupervised learning tasks aimed at aligning collaborative signals with the\nsubtleties of natural language semantics. To ensure practical applicability, we\noptimize online inference by pre-caching the top-K results for each user,\nreducing latency and improving efficiency. Extensive experimental evidence\nindicates that our model markedly improves recall metrics and displays\nremarkable scalability of recommendation systems.\n","authors":["Guanghan Li","Xun Zhang","Yufei Zhang","Yifan Yin","Guojun Yin","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2412.13771v1.pdf","comment":"7 pages, 3 figures, AAAI 2025"},{"id":"http://arxiv.org/abs/2412.13746v1","updated":"2024-12-18T11:28:05Z","published":"2024-12-18T11:28:05Z","title":"RAG-RewardBench: Benchmarking Reward Models in Retrieval Augmented\n Generation for Preference Alignment","summary":" Despite the significant progress made by existing retrieval augmented\nlanguage models (RALMs) in providing trustworthy responses and grounding in\nreliable sources, they often overlook effective alignment with human\npreferences. In the alignment process, reward models (RMs) act as a crucial\nproxy for human values to guide optimization. However, it remains unclear how\nto evaluate and select a reliable RM for preference alignment in RALMs. To this\nend, we propose RAG-RewardBench, the first benchmark for evaluating RMs in RAG\nsettings. First, we design four crucial and challenging RAG-specific scenarios\nto assess RMs, including multi-hop reasoning, fine-grained citation,\nappropriate abstain, and conflict robustness. Then, we incorporate 18 RAG\nsubsets, six retrievers, and 24 RALMs to increase the diversity of data\nsources. Finally, we adopt an LLM-as-a-judge approach to improve preference\nannotation efficiency and effectiveness, exhibiting a strong correlation with\nhuman annotations.
Based on the RAG-RewardBench, we conduct a comprehensive\nevaluation of 45 RMs and uncover their limitations in RAG scenarios.\nAdditionally, we reveal that existing trained RALMs show almost no\nimprovement in preference alignment, highlighting the need for a shift towards\npreference-aligned training. We release our benchmark and code publicly at\nhttps://huggingface.co/datasets/jinzhuoran/RAG-RewardBench/ for future work.\n","authors":["Zhuoran Jin","Hongbang Yuan","Tianyi Men","Pengfei Cao","Yubo Chen","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2412.13746v1.pdf","comment":"26 pages, 12 figures, 6 tables"},{"id":"http://arxiv.org/abs/2412.13614v1","updated":"2024-12-18T08:49:01Z","published":"2024-12-18T08:49:01Z","title":"Reverse Region-to-Entity Annotation for Pixel-Level Visual Entity\n Linking","summary":" Visual Entity Linking (VEL) is a crucial task for achieving fine-grained\nvisual understanding, matching objects within images (visual mentions) to\nentities in a knowledge base. Previous VEL tasks rely on textual inputs, but\nwriting queries for complex scenes can be challenging. Visual inputs like\nclicks or bounding boxes offer a more convenient alternative. Therefore, we\npropose a new task, Pixel-Level Visual Entity Linking (PL-VEL), which uses\npixel masks from visual inputs to refer to objects, supplementing reference\nmethods for VEL. To facilitate research on this task, we have constructed the\nMaskOVEN-Wiki dataset through an entirely automatic reverse region-entity\nannotation framework. This dataset contains over 5 million annotations aligning\npixel-level regions with entity-level labels, which will advance visual\nunderstanding towards finer granularity. Moreover, as pixel masks correspond to\nsemantic regions in an image, we enhance previous patch-interacted attention\nwith region-interacted attention by a visual semantic tokenization approach.\nManual evaluation results indicate that the reverse annotation framework\nachieved a 94.8% annotation success rate. Experimental results show that models\ntrained on this dataset improved accuracy by 18 points compared to zero-shot\nmodels. Additionally, the semantic tokenization method achieved a 5-point\naccuracy improvement over the trained baseline.\n","authors":["Zhengfei Xu","Sijia Zhao","Yanchao Hao","Xiaolong Liu","Lili Li","Yuyang Yin","Bo Li","Xi Chen","Xin Xin"],"pdf_url":"https://arxiv.org/pdf/2412.13614v1.pdf","comment":"AAAI 2025;Dataset are released at\n https://github.com/NP-NET-research/PL-VEL"},{"id":"http://arxiv.org/abs/2412.13102v2","updated":"2024-12-18T07:06:07Z","published":"2024-12-17T17:15:21Z","title":"AIR-Bench: Automated Heterogeneous Information Retrieval Benchmark","summary":" Evaluation plays a crucial role in the advancement of information retrieval\n(IR) models. However, current benchmarks, which are based on predefined domains\nand human-labeled data, face limitations in addressing evaluation needs for\nemerging domains both cost-effectively and efficiently. To address this\nchallenge, we propose the Automated Heterogeneous Information Retrieval\nBenchmark (AIR-Bench). AIR-Bench is distinguished by three key features: 1)\nAutomated. The testing data in AIR-Bench is automatically generated by large\nlanguage models (LLMs) without human intervention. 2) Heterogeneous. The\ntesting data in AIR-Bench is generated with respect to diverse tasks, domains\nand languages. 3) Dynamic.
The domains and languages covered by AIR-Bench are\nconstantly augmented to provide an increasingly comprehensive evaluation\nbenchmark for community developers. We develop a reliable and robust data\ngeneration pipeline to automatically create diverse and high-quality evaluation\ndatasets based on real-world corpora. Our findings demonstrate that the\ngenerated testing data in AIR-Bench aligns well with human-labeled testing\ndata, making AIR-Bench a dependable benchmark for evaluating IR models. The\nresources in AIR-Bench are publicly available at\nhttps://github.com/AIR-Bench/AIR-Bench.\n","authors":["Jianlyu Chen","Nan Wang","Chaofan Li","Bo Wang","Shitao Xiao","Han Xiao","Hao Liao","Defu Lian","Zheng Liu"],"pdf_url":"https://arxiv.org/pdf/2412.13102v2.pdf","comment":"31 pages, 6 figures; Update Table 5"},{"id":"http://arxiv.org/abs/2412.13544v1","updated":"2024-12-18T06:43:56Z","published":"2024-12-18T06:43:56Z","title":"Bridging the User-side Knowledge Gap in Knowledge-aware Recommendations\n with Large Language Models","summary":" In recent years, knowledge graphs have been integrated into recommender\nsystems as item-side auxiliary information, enhancing recommendation accuracy.\nHowever, constructing and integrating structural user-side knowledge remains a\nsignificant challenge due to the improper granularity and inherent scarcity of\nuser-side features. Recent advancements in Large Language Models (LLMs) offer\nthe potential to bridge this gap by leveraging their human behavior\nunderstanding and extensive real-world knowledge. Nevertheless, integrating\nLLM-generated information into recommender systems presents challenges,\nincluding the risk of noisy information and the need for additional knowledge\ntransfer. In this paper, we propose an LLM-based user-side knowledge inference\nmethod alongside a carefully designed recommendation framework to address these\nchallenges. Our approach employs LLMs to infer user interests based on\nhistorical behaviors, integrating this user-side information with item-side and\ncollaborative data to construct a hybrid structure: the Collaborative Interest\nKnowledge Graph (CIKG). Furthermore, we propose a CIKG-based recommendation\nframework that includes a user interest reconstruction module and a\ncross-domain contrastive learning module to mitigate potential noise and\nfacilitate knowledge transfer. We conduct extensive experiments on three\nreal-world datasets to validate the effectiveness of our method. Our approach\nachieves state-of-the-art performance compared to competitive baselines,\nparticularly for users with sparse interactions.\n","authors":["Zheng Hu","Zhe Li","Ziyun Jiao","Satoshi Nakagawa","Jiawen Deng","Shimin Cai","Tao Zhou","Fuji Ren"],"pdf_url":"https://arxiv.org/pdf/2412.13544v1.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2412.13534v1","updated":"2024-12-18T06:21:21Z","published":"2024-12-18T06:21:21Z","title":"Information-Theoretic Generative Clustering of Documents","summary":" We present {\\em generative clustering} (GC) for clustering a set of\ndocuments, $\\mathrm{X}$, by using texts $\\mathrm{Y}$ generated by large\nlanguage models (LLMs) instead of by clustering the original documents\n$\\mathrm{X}$. Because LLMs provide probability distributions, the similarity\nbetween two documents can be rigorously defined in an information-theoretic\nmanner by the KL divergence. We also propose a natural, novel clustering\nalgorithm by using importance sampling. 
We show that GC achieves\nstate-of-the-art performance, often outperforming previous clustering methods\nby a large margin. Furthermore, we show an application to generative\ndocument retrieval in which documents are indexed via hierarchical clustering\nand our method improves the retrieval accuracy.\n","authors":["Xin Du","Kumiko Tanaka-Ishii"],"pdf_url":"https://arxiv.org/pdf/2412.13534v1.pdf","comment":"Accepted to AAAI 2025"},{"id":"http://arxiv.org/abs/2407.01965v2","updated":"2024-12-18T05:46:41Z","published":"2024-07-02T05:50:16Z","title":"AdaCQR: Enhancing Query Reformulation for Conversational Search via\n Sparse and Dense Retrieval Alignment","summary":" Conversational Query Reformulation (CQR) has significantly advanced in\naddressing the challenges of conversational search, particularly those stemming\nfrom the latent user intent and the need for historical context. Recent works\naimed to boost the performance of CQR through alignment. However, they are\ndesigned for one specific retrieval system, which potentially results in poor\ngeneralization. To overcome this limitation, we present a novel framework,\nAdaCQR. By aligning reformulation models with both term-based and\nsemantic-based retrieval systems, AdaCQR enhances the generalizability of\ninformation-seeking queries across diverse retrieval environments through a\ndual-phase training strategy. We also developed two effective approaches for\nacquiring superior labels and diverse input candidates, boosting the efficiency\nand robustness of the framework. Experimental evaluations on the TopiOCQA and\nQReCC datasets demonstrate that AdaCQR significantly outperforms existing\nmethods, offering both quantitative and qualitative improvements in\nconversational query reformulation.\n","authors":["Yilong Lai","Jialong Wu","Congzhi Zhang","Haowen Sun","Deyu Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.01965v2.pdf","comment":"Accepted by COLING 2025"},{"id":"http://arxiv.org/abs/2412.12486v2","updated":"2024-12-18T05:08:39Z","published":"2024-12-17T02:43:54Z","title":"Boosting Long-Context Management via Query-Guided Activation Refilling","summary":" Processing long contexts poses a significant challenge for large language\nmodels (LLMs) due to their inherent context-window limitations and the\ncomputational burden of extensive key-value (KV) activations, which severely\nimpact efficiency. For information-seeking tasks, full context perception is\noften unnecessary, as a query's information needs can dynamically range from\nlocalized details to a global perspective, depending on its complexity.\nHowever, existing methods struggle to adapt effectively to these dynamic\ninformation needs.\n In this paper, we propose a method for processing long-context\ninformation-seeking tasks via query-guided Activation Refilling (ACRE). ACRE\nconstructs a Bi-layer KV Cache for long contexts, where the layer-1 (L1) cache\ncompactly captures global information, and the layer-2 (L2) cache provides\ndetailed and localized information. ACRE establishes a proxying relationship\nbetween the two caches, allowing the input query to attend to the L1 cache and\ndynamically refill it with relevant entries from the L2 cache. This mechanism\nintegrates global understanding with query-specific local details, thus\nimproving answer decoding.
Experiments on a variety of long-context\ninformation-seeking datasets demonstrate ACRE's effectiveness, achieving\nimprovements in both performance and efficiency.\n","authors":["Hongjin Qian","Zheng Liu","Peitian Zhang","Zhicheng Dou","Defu Lian"],"pdf_url":"https://arxiv.org/pdf/2412.12486v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2412.08300v2","updated":"2024-12-18T03:21:48Z","published":"2024-12-11T11:29:15Z","title":"Augmenting Sequential Recommendation with Balanced Relevance and\n Diversity","summary":" By generating new yet effective data, data augmentation has become a\npromising method to mitigate the data sparsity problem in sequential\nrecommendation. Existing works focus on augmenting the original data but rarely\nexplore the issue of imbalanced relevance and diversity for augmented data,\nleading to semantic drift problems or limited performance improvements. In this\npaper, we propose a novel Balanced data Augmentation Plugin for Sequential\nRecommendation (BASRec) to generate data that balance relevance and diversity.\nBASRec consists of two modules: Single-sequence Augmentation and Cross-sequence\nAugmentation. The former leverages the randomness of the heuristic operators to\ngenerate diverse sequences for a single user, after which the diverse and the\noriginal sequences are fused at the representation level to obtain relevance.\nFurther, we devise a reweighting strategy to enable the model to learn the\npreferences based on the two properties adaptively. The Cross-sequence\nAugmentation performs nonlinear mixing between different sequence\nrepresentations from two directions. It produces virtual sequence\nrepresentations that are diverse enough but retain the vital semantics of the\noriginal sequences. These two modules enhance the model to discover\nfine-grained preferences knowledge from single-user and cross-user\nperspectives. Extensive experiments verify the effectiveness of BASRec. The\naverage improvement is up to 72.0% on GRU4Rec, 33.8% on SASRec, and 68.5% on\nFMLP-Rec. We demonstrate that BASRec generates data with a better balance\nbetween relevance and diversity than existing methods. The source code is\navailable at https://github.com/KingGugu/BASRec.\n","authors":["Yizhou Dang","Jiahui Zhang","Yuting Liu","Enneng Yang","Yuliang Liang","Guibing Guo","Jianzhe Zhao","Xingwei Wang"],"pdf_url":"https://arxiv.org/pdf/2412.08300v2.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.11905v2","updated":"2024-12-18T02:41:21Z","published":"2024-12-16T15:52:17Z","title":"One for Dozens: Adaptive REcommendation for All Domains with\n Counterfactual Augmentation","summary":" Multi-domain recommendation (MDR) aims to enhance recommendation performance\nacross various domains. However, real-world recommender systems in online\nplatforms often need to handle dozens or even hundreds of domains, far\nexceeding the capabilities of traditional MDR algorithms, which typically focus\non fewer than five domains. Key challenges include a substantial increase in\nparameter count, high maintenance costs, and intricate knowledge transfer\npatterns across domains. Furthermore, minor domains often suffer from data\nsparsity, leading to inadequate training in classical methods. To address these\nissues, we propose Adaptive REcommendation for All Domains with counterfactual\naugmentation (AREAD). 
AREAD employs a hierarchical structure with a limited\nnumber of expert networks at several layers to effectively capture domain\nknowledge at different granularities. To adaptively capture the knowledge\ntransfer pattern across domains, we generate and iteratively prune a\nhierarchical expert network selection mask for each domain during training.\nAdditionally, counterfactual assumptions are used to augment data in minor\ndomains, supporting their iterative mask pruning. Our experiments on two public\ndatasets, each encompassing over twenty domains, demonstrate AREAD's\neffectiveness, especially in data-sparse domains. Source code is available at\nhttps://github.com/Chrissie-Law/AREAD-Multi-Domain-Recommendation.\n","authors":["Huishi Luo","Yiwen Chen","Yiqing Wu","Fuzhen Zhuang","Deqing Wang"],"pdf_url":"https://arxiv.org/pdf/2412.11905v2.pdf","comment":"Extended version accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.13432v1","updated":"2024-12-18T02:07:21Z","published":"2024-12-18T02:07:21Z","title":"Large Language Model Enhanced Recommender Systems: Taxonomy, Trend,\n Application and Future","summary":" Large Language Model (LLM) has transformative potential in various domains,\nincluding recommender systems (RS). There have been a handful of studies\nfocusing on empowering the RS by LLM. However, previous efforts mainly focus on\nLLM as RS, which may face the challenge of intolerable inference costs incurred\nby the LLM. Recently, the integration of LLM into RS, known as LLM-Enhanced\nRecommender Systems (LLMERS), has garnered significant interest due to its\npotential to address latency and memory constraints in real-world applications.\nThis paper presents a comprehensive survey of the latest research efforts aimed\nat leveraging LLM to enhance RS capabilities. We identify a critical shift in\nthe field with the move towards incorporating LLM into the online system,\nnotably by avoiding their use during inference. Our survey categorizes the\nexisting LLMERS approaches into three primary types based on the component of\nthe RS model being augmented: Knowledge Enhancement, Interaction Enhancement,\nand Model Enhancement. We provide an in-depth analysis of each category,\ndiscussing the methodologies, challenges, and contributions of recent studies.\nFurthermore, we highlight several promising research directions that could\nfurther advance the field of LLMERS.\n","authors":["Qidong Liu","Xiangyu Zhao","Yuhao Wang","Yejing Wang","Zijian Zhang","Yuqi Sun","Xiang Li","Maolin Wang","Pengyue Jia","Chong Chen","Wei Huang","Feng Tian"],"pdf_url":"https://arxiv.org/pdf/2412.13432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.13408v1","updated":"2024-12-18T00:56:16Z","published":"2024-12-18T00:56:16Z","title":"Lightweight yet Fine-grained: A Graph Capsule Convolutional Network with\n Subspace Alignment for Shared-account Sequential Recommendation","summary":" Shared-account Sequential Recommendation (SSR) aims to provide personalized\nrecommendations for accounts shared by multiple users with varying sequential\npreferences. Previous studies on SSR struggle to capture the fine-grained\nassociations between interactions and different latent users within the shared\naccount's hybrid sequences. Moreover, most existing SSR methods (e.g.,\nRNN-based or GCN-based methods) have quadratic computational complexities,\nhindering the deployment of SSRs on resource-constrained devices.
To this end,\nwe propose a Lightweight Graph Capsule Convolutional Network with subspace\nalignment for shared-account sequential recommendation, named LightGC$^2$N.\nSpecifically, we devise a lightweight graph capsule convolutional network. It\nfacilitates the fine-grained matching between interactions and latent users by\nattentively propagating messages on the capsule graphs. Besides, we present an\nefficient subspace alignment method. This method refines the sequence\nrepresentations and then aligns them with the finely clustered preferences of\nlatent users. The experimental results on four real-world datasets indicate\nthat LightGC$^2$N outperforms nine state-of-the-art methods in accuracy and\nefficiency.\n","authors":["Jinyu Zhang","Zhongying Zhao","Chao Li","Yanwei Yu"],"pdf_url":"https://arxiv.org/pdf/2412.13408v1.pdf","comment":"11 pages, 6 figures, accepted by AAAI-2025 conference"}],"Multimedia":[{"id":"http://arxiv.org/abs/2412.14158v1","updated":"2024-12-18T18:53:22Z","published":"2024-12-18T18:53:22Z","title":"AKiRa: Augmentation Kit on Rays for optical video generation","summary":" Recent advances in text-conditioned video diffusion have greatly improved\nvideo quality. However, these methods offer limited or sometimes no control to\nusers on camera aspects, including dynamic camera motion, zoom, distorted lens\nand focus shifts. These motion and optical aspects are crucial for adding\ncontrollability and cinematic elements to generation frameworks, ultimately\nresulting in visual content that draws focus, enhances mood, and guides\nemotions according to filmmakers' controls. In this paper, we aim to close the\ngap between controllable video generation and camera optics. To achieve this,\nwe propose AKiRa (Augmentation Kit on Rays), a novel augmentation framework\nthat builds and trains a camera adapter with a complex camera model over an\nexisting video generation backbone. It enables fine-tuned control over camera\nmotion as well as complex optical parameters (focal length, distortion,\naperture) to achieve cinematic effects such as zoom, fisheye effect, and bokeh.\nExtensive experiments demonstrate AKiRa's effectiveness in combining and\ncomposing camera optics while outperforming all state-of-the-art methods. This\nwork sets a new landmark in controlled and optically enhanced video generation,\npaving the way for future optical video generation methods.\n","authors":["Xi Wang","Robin Courant","Marc Christie","Vicky Kalogeiton"],"pdf_url":"https://arxiv.org/pdf/2412.14158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14056v1","updated":"2024-12-18T17:06:21Z","published":"2024-12-18T17:06:21Z","title":"A Review of Multimodal Explainable Artificial Intelligence: Past,\n Present and Future","summary":" Artificial intelligence (AI) has rapidly developed through advancements in\ncomputational power and the growth of massive datasets. However, this progress\nhas also heightened challenges in interpreting the \"black-box\" nature of AI\nmodels. To address these concerns, eXplainable AI (XAI) has emerged with a\nfocus on transparency and interpretability to enhance human understanding and\ntrust in AI decision-making processes. 
In the context of multimodal data fusion\nand complex reasoning scenarios, the proposal of Multimodal eXplainable AI\n(MXAI) integrates multiple modalities for prediction and explanation tasks.\nMeanwhile, the advent of Large Language Models (LLMs) has led to remarkable\nbreakthroughs in natural language processing, yet their complexity has further\nexacerbated the issue of MXAI. To gain key insights into the development of\nMXAI methods and provide crucial guidance for building more transparent, fair,\nand trustworthy AI systems, we review the MXAI methods from a historical\nperspective and categorize them across four eras: traditional machine learning,\ndeep learning, discriminative foundation models, and generative LLMs. We also\nreview evaluation metrics and datasets used in MXAI research, concluding with a\ndiscussion of future challenges and directions. A project related to this\nreview has been created at https://github.com/ShilinSun/mxai_review.\n","authors":["Shilin Sun","Wenbin An","Feng Tian","Fang Nan","Qidong Liu","Jun Liu","Nazaraf Shah","Ping Chen"],"pdf_url":"https://arxiv.org/pdf/2412.14056v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2412.14018v1","updated":"2024-12-18T16:34:51Z","published":"2024-12-18T16:34:51Z","title":"SurgSora: Decoupled RGBD-Flow Diffusion Model for Controllable Surgical\n Video Generation","summary":" Medical video generation has transformative potential for enhancing surgical\nunderstanding and pathology insights through precise and controllable visual\nrepresentations. However, current models face limitations in controllability\nand authenticity. To bridge this gap, we propose SurgSora, a\nmotion-controllable surgical video generation framework that uses a single\ninput frame and user-controllable motion cues. SurgSora consists of three key\nmodules: the Dual Semantic Injector (DSI), which extracts object-relevant RGB\nand depth features from the input frame and integrates them with segmentation\ncues to capture detailed spatial features of complex anatomical structures; the\nDecoupled Flow Mapper (DFM), which fuses optical flow with semantic-RGB-D\nfeatures at multiple scales to enhance temporal understanding and object\nspatial dynamics; and the Trajectory Controller (TC), which allows users to\nspecify motion directions and estimates sparse optical flow, guiding the video\ngeneration process. The fused features are used as conditions for a frozen\nStable Diffusion model to produce realistic, temporally coherent surgical\nvideos. Extensive evaluations demonstrate that SurgSora outperforms\nstate-of-the-art methods in controllability and authenticity, showing its\npotential to advance surgical video generation for medical education, training,\nand research.\n","authors":["Tong Chen","Shuya Yang","Junyi Wang","Long Bai","Hongliang Ren","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.14018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.14005v1","updated":"2024-12-18T16:20:21Z","published":"2024-12-18T16:20:21Z","title":"Real-Time Position-Aware View Synthesis from Single-View Input","summary":" Recent advancements in view synthesis have significantly enhanced immersive\nexperiences across various computer graphics and multimedia applications,\nincluding telepresence, and entertainment. By enabling the generation of new\nperspectives from a single input view, view synthesis allows users to better\nperceive and interact with their environment. 
However, many state-of-the-art\nmethods, while achieving high visual quality, face limitations in real-time\nperformance, which makes them less suitable for live applications where low\nlatency is critical. In this paper, we present a lightweight, position-aware\nnetwork designed for real-time view synthesis from a single input image and a\ntarget camera pose. The proposed framework consists of a Position Aware\nEmbedding, modeled with a multi-layer perceptron, which efficiently maps\npositional information from the target pose to generate high dimensional\nfeature maps. These feature maps, along with the input image, are fed into a\nRendering Network that merges features from dual encoder branches to resolve\nboth high level semantics and low level details, producing a realistic new view\nof the scene. Experimental results demonstrate that our method achieves\nsuperior efficiency and visual quality compared to existing approaches,\nparticularly in handling complex translational movements without explicit\ngeometric operations like warping. This work marks a step toward enabling\nreal-time view synthesis from a single image for live and interactive\napplications.\n","authors":["Manu Gond","Emin Zerman","Sebastian Knorr","Mårten Sjöström"],"pdf_url":"https://arxiv.org/pdf/2412.14005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.13743v1","updated":"2024-12-18T11:21:39Z","published":"2024-12-18T11:21:39Z","title":"User-Generated Content and Editors in Games: A Comprehensive Survey","summary":" User-Generated Content (UGC) refers to any form of content, such as posts and\nimages, created by users rather than by professionals. In recent years, UGC has\nbecome an essential part of the evolving video game industry, influencing both\ngame culture and community dynamics. The ability for users to actively\ncontribute to the games they engage with has shifted the landscape of gaming\nfrom a one-directional entertainment experience into a collaborative,\nuser-driven ecosystem. Therefore, this growing trend highlights the urgent need\nfor summarizing the current UGC development in game industry. Our conference\npaper has systematically classified the existing UGC in games and the UGC\neditors separately into four types. However, the previous survey lacks the\ndepth and precision necessary to capture the wide-ranging and increasingly\ncomplex nature of UGC. To this end, as an extension of previous work, this\npaper presents a refined and expanded classification of UGC and UGC editors\nwithin video games, offering a more robust and comprehensive framework with\nrepresentative cases that better reflects the diversity and nuances of\ncontemporary user-generated contributions. Moreover, we provide our insights on\nthe future of UGC, involving game culture, game genre and user creative\ntendencies, artificial intelligence, its potential ethical considerations, and\nrelationship between games, users and communities.\n","authors":["Yuyue Liu","Haihan Duan","Wei Cai"],"pdf_url":"https://arxiv.org/pdf/2412.13743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.13614v1","updated":"2024-12-18T08:49:01Z","published":"2024-12-18T08:49:01Z","title":"Reverse Region-to-Entity Annotation for Pixel-Level Visual Entity\n Linking","summary":" Visual Entity Linking (VEL) is a crucial task for achieving fine-grained\nvisual understanding, matching objects within images (visual mentions) to\nentities in a knowledge base. 
Previous VEL tasks rely on textual inputs, but\nwriting queries for complex scenes can be challenging. Visual inputs like\nclicks or bounding boxes offer a more convenient alternative. Therefore, we\npropose a new task, Pixel-Level Visual Entity Linking (PL-VEL), which uses\npixel masks from visual inputs to refer to objects, supplementing reference\nmethods for VEL. To facilitate research on this task, we have constructed the\nMaskOVEN-Wiki dataset through an entirely automatic reverse region-entity\nannotation framework. This dataset contains over 5 million annotations aligning\npixel-level regions with entity-level labels, which will advance visual\nunderstanding towards fine-grained. Moreover, as pixel masks correspond to\nsemantic regions in an image, we enhance previous patch-interacted attention\nwith region-interacted attention by a visual semantic tokenization approach.\nManual evaluation results indicate that the reverse annotation framework\nachieved a 94.8% annotation success rate. Experimental results show that models\ntrained on this dataset improved accuracy by 18 points compared to zero-shot\nmodels. Additionally, the semantic tokenization method achieved a 5-point\naccuracy improvement over the trained baseline.\n","authors":["Zhengfei Xu","Sijia Zhao","Yanchao Hao","Xiaolong Liu","Lili Li","Yuyang Yin","Bo Li","Xi Chen","Xin Xin"],"pdf_url":"https://arxiv.org/pdf/2412.13614v1.pdf","comment":"AAAI 2025;Dataset are released at\n https://github.com/NP-NET-research/PL-VEL"},{"id":"http://arxiv.org/abs/2412.13462v1","updated":"2024-12-18T03:18:03Z","published":"2024-12-18T03:18:03Z","title":"SAVGBench: Benchmarking Spatially Aligned Audio-Video Generation","summary":" This work addresses the lack of multimodal generative models capable of\nproducing high-quality videos with spatially aligned audio. While recent\nadvancements in generative models have been successful in video generation,\nthey often overlook the spatial alignment between audio and visuals, which is\nessential for immersive experiences. To tackle this problem, we establish a new\nresearch direction in benchmarking Spatially Aligned Audio-Video Generation\n(SAVG). We propose three key components for the benchmark: dataset, baseline,\nand metrics. We introduce a spatially aligned audio-visual dataset, derived\nfrom an audio-visual dataset consisting of multichannel audio, video, and\nspatiotemporal annotations of sound events. We propose a baseline audio-visual\ndiffusion model focused on stereo audio-visual joint learning to accommodate\nspatial sound. Finally, we present metrics to evaluate video and spatial audio\nquality, including a new spatial audio-visual alignment metric. Our\nexperimental result demonstrates that gaps exist between the baseline model and\nground truth in terms of video and audio quality, and spatial alignment between\nboth modalities.\n","authors":["Kazuki Shimada","Christian Simon","Takashi Shibuya","Shusuke Takahashi","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2412.13462v1.pdf","comment":"5 pages, 3 figures"}]},"2024-12-17T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2412.07030v2","updated":"2024-12-17T20:38:21Z","published":"2024-12-09T22:35:44Z","title":"FM2DS: Few-Shot Multimodal Multihop Data Synthesis with Knowledge\n Distillation for Question Answering","summary":" Multimodal multihop question answering is a complex task that requires\nreasoning over multiple sources of information, such as images and text, to\nanswer questions. 
While there has been significant progress in visual question\nanswering, the multihop setting remains unexplored due to the lack of\nhigh-quality datasets. Current methods focus on single-hop question answering\nor a single modality, which makes them unsuitable for real-world scenarios such\nas analyzing multimodal educational materials, summarizing lengthy academic\narticles, or interpreting scientific studies that combine charts, images, and\ntext. To address this gap, we propose a novel methodology, introducing the\nfirst framework for creating a high-quality dataset that enables training\nmodels for multimodal multihop question answering. Our approach consists of a\n5-stage pipeline that involves acquiring relevant multimodal documents from\nWikipedia, synthetically generating high-level questions and answers, and\nvalidating them through rigorous criteria to ensure quality data. We evaluate\nour methodology by training models on our synthesized dataset and testing on\ntwo benchmarks, our results demonstrate that, with an equal sample size, models\ntrained on our synthesized data outperform those trained on human-collected\ndata by 1.9 in exact match (EM) on average. We believe our data synthesis\nmethod will serve as a strong foundation for training and evaluating multimodal\nmultihop question answering models.\n","authors":["Amirhossein Abaskohi","Spandana Gella","Giuseppe Carenini","Issam H. Laradji"],"pdf_url":"https://arxiv.org/pdf/2412.07030v2.pdf","comment":"20 pages, 11 figures, 10 tables, Submitted to CVPR 2025"},{"id":"http://arxiv.org/abs/2412.13268v1","updated":"2024-12-17T19:04:15Z","published":"2024-12-17T19:04:15Z","title":"JudgeBlender: Ensembling Judgments for Automatic Relevance Assessment","summary":" The effective training and evaluation of retrieval systems require a\nsubstantial amount of relevance judgments, which are traditionally collected\nfrom human assessors -- a process that is both costly and time-consuming. Large\nLanguage Models (LLMs) have shown promise in generating relevance labels for\nsearch tasks, offering a potential alternative to manual assessments. Current\napproaches often rely on a single LLM, such as GPT-4, which, despite being\neffective, are expensive and prone to intra-model biases that can favour\nsystems leveraging similar models. In this work, we introduce JudgeBlender, a\nframework that employs smaller, open-source models to provide relevance\njudgments by combining evaluations across multiple LLMs (LLMBlender) or\nmultiple prompts (PromptBlender). By leveraging the LLMJudge benchmark [18], we\ncompare JudgeBlender with state-of-the-art methods and the top performers in\nthe LLMJudge challenge. Our results show that JudgeBlender achieves competitive\nperformance, demonstrating that very large models are often unnecessary for\nreliable relevance assessments.\n","authors":["Hossein A. Rahmani","Emine Yilmaz","Nick Craswell","Bhaskar Mitra"],"pdf_url":"https://arxiv.org/pdf/2412.13268v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2412.13170v1","updated":"2024-12-17T18:47:57Z","published":"2024-12-17T18:47:57Z","title":"Re-calibrating methodologies in social media research: Challenge the\n visual, work with Speech","summary":" This article methodologically reflects on how social media scholars can\neffectively engage with speech-based data in their analyses. While contemporary\nmedia studies have embraced textual, visual, and relational data, the aural\ndimension remained comparatively under-explored. 
Building on the notion of\nsecondary orality and rejection towards purely visual culture, the paper argues\nthat considering voice and speech at scale enriches our understanding of\nmultimodal digital content. The paper presents the TikTok Subtitles Toolkit\nthat offers accessible speech processing readily compatible with existing\nworkflows. In doing so, it opens new avenues for large-scale inquiries that\nblend quantitative insights with qualitative precision. Two illustrative cases\nhighlight both opportunities and limitations of speech research: while genres\nlike #storytime on TikTok benefit from the exploration of spoken narratives,\nnonverbal or music-driven content may not yield significant insights using\nspeech data. The article encourages researchers to integrate aural exploration\nthoughtfully to complement existing methods, rather than replacing them. I\nconclude that the expansion of our methodological repertoire enables richer\ninterpretations of platformised content, and our capacity to unpack digital\ncultures as they become increasingly multimodal.\n","authors":["Hongrui Jin"],"pdf_url":"https://arxiv.org/pdf/2412.13170v1.pdf","comment":"11 pages (excluding references), 3 figures"},{"id":"http://arxiv.org/abs/2412.13071v1","updated":"2024-12-17T16:38:10Z","published":"2024-12-17T16:38:10Z","title":"CLASP: Contrastive Language-Speech Pretraining for Multilingual\n Multimodal Information Retrieval","summary":" This study introduces CLASP (Contrastive Language-Speech Pretraining), a\nmultilingual, multimodal representation tailored for audio-text information\nretrieval. CLASP leverages the synergy between spoken content and textual data.\nDuring training, we utilize our newly introduced speech-text dataset, which\nencompasses 15 diverse categories ranging from fiction to religion. CLASP's\naudio component integrates audio spectrograms with a pre-trained\nself-supervised speech model, while its language encoding counterpart employs a\nsentence encoder pre-trained on over 100 languages. This unified lightweight\nmodel bridges the gap between various modalities and languages, enhancing its\neffectiveness in handling and retrieving multilingual and multimodal data. Our\nevaluations across multiple languages demonstrate that CLASP establishes new\nbenchmarks in HITS@1, MRR, and meanR metrics, outperforming traditional\nASR-based retrieval approaches in specific scenarios.\n","authors":["Mohammad Mahdi Abootorabi","Ehsaneddin Asgari"],"pdf_url":"https://arxiv.org/pdf/2412.13071v1.pdf","comment":"accepted at ECIR 2025"},{"id":"http://arxiv.org/abs/2412.12997v1","updated":"2024-12-17T15:21:28Z","published":"2024-12-17T15:21:28Z","title":"Enabling Low-Resource Language Retrieval: Establishing Baselines for\n Urdu MS MARCO","summary":" As the Information Retrieval (IR) field increasingly recognizes the\nimportance of inclusivity, addressing the needs of low-resource languages\nremains a significant challenge. This paper introduces the first large-scale\nUrdu IR dataset, created by translating the MS MARCO dataset through machine\ntranslation. We establish baseline results through zero-shot learning for IR in\nUrdu and subsequently apply the mMARCO multilingual IR methodology to this\nnewly translated dataset. Our findings demonstrate that the fine-tuned model\n(Urdu-mT5-mMARCO) achieves a Mean Reciprocal Rank (MRR@10) of 0.247 and a\nRecall@10 of 0.439, representing significant improvements over zero-shot\nresults and showing the potential for expanding IR access for Urdu speakers. 
By\nbridging access gaps for speakers of low-resource languages, this work not only\nadvances multilingual IR research but also emphasizes the ethical and societal\nimportance of inclusive IR technologies. This work provides valuable insights\ninto the challenges and solutions for improving language representation and\nlays the groundwork for future research, especially in South Asian languages,\nwhich can benefit from the adaptable methods used in this study.\n","authors":["Umer Butt","Stalin Veranasi","Günter Neumann"],"pdf_url":"https://arxiv.org/pdf/2412.12997v1.pdf","comment":"6 pages, ECIR 2025, conference submission version"},{"id":"http://arxiv.org/abs/2412.12984v1","updated":"2024-12-17T15:04:54Z","published":"2024-12-17T15:04:54Z","title":"Cluster-guided Contrastive Class-imbalanced Graph Classification","summary":" This paper studies the problem of class-imbalanced graph classification,\nwhich aims at effectively classifying the categories of graphs in scenarios\nwith imbalanced class distribution. Despite the tremendous success of graph\nneural networks (GNNs), their modeling ability for imbalanced graph-structured\ndata is inadequate, which typically leads to predictions biased towards the\nmajority classes. Besides, existing class-imbalanced learning methods in\nvisions may overlook the rich graph semantic substructures of the majority\nclasses and excessively emphasize learning from the minority classes. To tackle\nthis issue, this paper proposes a simple yet powerful approach called C$^3$GNN\nthat incorporates the idea of clustering into contrastive learning to enhance\nclass-imbalanced graph classification. Technically, C$^3$GNN clusters graphs\nfrom each majority class into multiple subclasses, ensuring they have similar\nsizes to the minority class, thus alleviating class imbalance. Additionally, it\nutilizes the Mixup technique to synthesize new samples and enrich the semantic\ninformation of each subclass, and leverages supervised contrastive learning to\nhierarchically learn effective graph representations. In this way, we can not\nonly sufficiently explore the semantic substructures within the majority class\nbut also effectively alleviate excessive focus on the minority class. Extensive\nexperiments on real-world graph benchmark datasets verify the superior\nperformance of our proposed method.\n","authors":["Wei Ju","Zhengyang Mao","Siyu Yi","Yifang Qin","Yiyang Gu","Zhiping Xiao","Jianhao Shen","Ziyue Qiao","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.12984v1.pdf","comment":"Accepted by Proceedings of the Thirty-Ninth AAAI Conference on\n Artificial Intelligence (AAAI-25)"},{"id":"http://arxiv.org/abs/2406.08270v2","updated":"2024-12-17T12:56:42Z","published":"2024-06-12T14:35:43Z","title":"It is Never Too Late to Mend: Separate Learning for Multimedia\n Recommendation","summary":" Multimedia recommendation, which incorporates various modalities (e.g.,\nimages, texts, etc.) into user or item representation to improve recommendation\nquality, and self-supervised learning carries multimedia recommendation to a\nplateau of performance, because of its superior performance in aligning\ndifferent modalities. However, more and more research finds that aligning all\nmodal representations is suboptimal because it damages the unique attributes of\neach modal. These studies use subtraction and orthogonal constraints in\ngeometric space to learn unique parts. 
However, our rigorous analysis reveals\nthe flaws in this approach, such as that subtraction does not necessarily yield\nthe desired modal-unique and that orthogonal constraints are ineffective in\nuser and item high-dimensional representation spaces. To make up for the\nprevious weaknesses, we propose Separate Learning (SEA) for multimedia\nrecommendation, which mainly includes mutual information view of modal-unique\nand -generic learning. Specifically, we first use GNN to learn the\nrepresentations of users and items in different modalities and split each modal\nrepresentation into generic and unique parts. We employ contrastive log-ratio\nupper bound to minimize the mutual information between the general and unique\nparts within the same modality, to distance their representations, thus\nlearning modal-unique features. Then, we design Solosimloss to maximize the\nlower bound of mutual information, to align the general parts of different\nmodalities, thus learning more high-quality modal-generic features. Finally,\nextensive experiments on three datasets demonstrate the effectiveness and\ngeneralization of our proposed framework. The code is available at SEA and the\nfull training record of the main experiment.\n","authors":["Zhuangzhuang He","Zihan Wang","Yonghui Yang","Haoyue Bai","Le Wu"],"pdf_url":"https://arxiv.org/pdf/2406.08270v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12852v1","updated":"2024-12-17T12:26:14Z","published":"2024-12-17T12:26:14Z","title":"Selective Shot Learning for Code Explanation","summary":" Code explanation plays a crucial role in the software engineering domain,\naiding developers in grasping code functionality efficiently. Recent work shows\nthat the performance of LLMs for code explanation improves in a few-shot\nsetting, especially when the few-shot examples are selected intelligently.\nState-of-the-art approaches for such Selective Shot Learning (SSL) include\ntoken-based and embedding-based methods. However, these SSL approaches have\nbeen evaluated on proprietary LLMs, without much exploration on open-source\nCode-LLMs. Additionally, these methods lack consideration for programming\nlanguage syntax. To bridge these gaps, we present a comparative study and\npropose a novel SSL method (SSL_ner) that utilizes entity information for\nfew-shot example selection. We present several insights and show the\neffectiveness of SSL_ner approach over state-of-the-art methods across two\ndatasets. To the best of our knowledge, this is the first systematic\nbenchmarking of open-source Code-LLMs while assessing the performances of the\nvarious few-shot examples selection approaches for the code explanation task.\n","authors":["Paheli Bhattacharya","Rishabh Gupta"],"pdf_url":"https://arxiv.org/pdf/2412.12852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12836v1","updated":"2024-12-17T11:58:55Z","published":"2024-12-17T11:58:55Z","title":"A Survey on Recommendation Unlearning: Fundamentals, Taxonomy,\n Evaluation, and Open Questions","summary":" Recommender systems have become increasingly influential in shaping user\nbehavior and decision-making, highlighting their growing impact in various\ndomains. Meanwhile, the widespread adoption of machine learning models in\nrecommender systems has raised significant concerns regarding user privacy and\nsecurity. 
As compliance with privacy regulations becomes more critical, there\nis a pressing need to address the issue of recommendation unlearning, i.e.,\neliminating the memory of specific training data from the learned\nrecommendation models. Despite its importance, traditional machine unlearning\nmethods are ill-suited for recommendation unlearning due to the unique\nchallenges posed by collaborative interactions and model parameters. This\nsurvey offers a comprehensive review of the latest advancements in\nrecommendation unlearning, exploring the design principles, challenges, and\nmethodologies associated with this emerging field. We provide a unified\ntaxonomy that categorizes different recommendation unlearning approaches,\nfollowed by a summary of widely used benchmarks and metrics for evaluation. By\nreviewing the current state of research, this survey aims to guide the\ndevelopment of more efficient, scalable, and robust recommendation unlearning\ntechniques. Furthermore, we identify open research questions in this field,\nwhich could pave the way for future innovations not only in recommendation\nunlearning but also in a broader range of unlearning tasks across different\nmachine learning applications.\n","authors":["Yuyuan Li","Xiaohua Feng","Chaochao Chen","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2412.12836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12806v1","updated":"2024-12-17T11:21:09Z","published":"2024-12-17T11:21:09Z","title":"Cross-Dialect Information Retrieval: Information Access in Low-Resource\n and High-Variance Languages","summary":" A large amount of local and culture-specific knowledge (e.g., people,\ntraditions, food) can only be found in documents written in dialects. While\nthere has been extensive research conducted on cross-lingual information\nretrieval (CLIR), the field of cross-dialect retrieval (CDIR) has received\nlimited attention. Dialect retrieval poses unique challenges due to the limited\navailability of resources to train retrieval models and the high variability in\nnon-standardized languages. We study these challenges on the example of German\ndialects and introduce the first German dialect retrieval dataset, dubbed\nWikiDIR, which consists of seven German dialects extracted from Wikipedia.\nUsing WikiDIR, we demonstrate the weakness of lexical methods in dealing with\nhigh lexical variation in dialects. We further show that commonly used\nzero-shot cross-lingual transfer approach with multilingual encoders do not\ntransfer well to extremely low-resource setups, motivating the need for\nresource-lean and dialect-specific retrieval models. We finally demonstrate\nthat (document) translation is an effective way to reduce the dialect gap in\nCDIR.\n","authors":["Robert Litschko","Oliver Kraus","Verena Blaschke","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2412.12806v1.pdf","comment":"Accepted at COLING 2025"},{"id":"http://arxiv.org/abs/2412.12775v1","updated":"2024-12-17T10:36:52Z","published":"2024-12-17T10:36:52Z","title":"RemoteRAG: A Privacy-Preserving LLM Cloud RAG Service","summary":" Retrieval-augmented generation (RAG) improves the service quality of large\nlanguage models by retrieving relevant documents from credible literature and\nintegrating them into the context of the user query. Recently, the rise of the\ncloud RAG service has made it possible for users to query relevant documents\nconveniently. However, directly sending queries to the cloud brings potential\nprivacy leakage. 
In this paper, we are the first to formally define the\nprivacy-preserving cloud RAG service to protect the user query and propose\nRemoteRAG as a solution regarding privacy, efficiency, and accuracy. For\nprivacy, we introduce $(n,\\epsilon)$-DistanceDP to characterize privacy leakage\nof the user query and the leakage inferred from relevant documents. For\nefficiency, we limit the search range from the total documents to a small\nnumber of selected documents related to a perturbed embedding generated from\n$(n,\\epsilon)$-DistanceDP, so that computation and communication costs required\nfor privacy protection significantly decrease. For accuracy, we ensure that the\nsmall range includes target documents related to the user query with detailed\ntheoretical analysis. Experimental results also demonstrate that RemoteRAG can\nresist existing embedding inversion attack methods while achieving no loss in\nretrieval under various settings. Moreover, RemoteRAG is efficient, incurring\nonly $0.67$ seconds and $46.66$KB of data transmission ($2.72$ hours and $1.43$\nGB with the non-optimized privacy-preserving scheme) when retrieving from a\ntotal of $10^6$ documents.\n","authors":["Yihang Cheng","Lan Zhang","Junyang Wang","Mu Yuan","Yunhao Yao"],"pdf_url":"https://arxiv.org/pdf/2412.12775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12770v1","updated":"2024-12-17T10:33:13Z","published":"2024-12-17T10:33:13Z","title":"A Survey on Sequential Recommendation","summary":" Different from most conventional recommendation problems, sequential\nrecommendation focuses on learning users' preferences by exploiting the\ninternal order and dependency among the interacted items, which has received\nsignificant attention from both researchers and practitioners. In recent years,\nwe have witnessed great progress and achievements in this field, necessitating\na new survey. In this survey, we study the SR problem from a new perspective\n(i.e., the construction of an item's properties), and summarize the most recent\ntechniques used in sequential recommendation such as pure ID-based SR, SR with\nside information, multi-modal SR, generative SR, LLM-powered SR, ultra-long SR\nand data-augmented SR. Moreover, we introduce some frontier research topics in\nsequential recommendation, e.g., open-domain SR, data-centric SR, could-edge\ncollaborative SR, continuous SR, SR for good, and explainable SR. We believe\nthat our survey could be served as a valuable roadmap for readers in this\nfield.\n","authors":["Liwei Pan","Weike Pan","Meiyan Wei","Hongzhi Yin","Zhong Ming"],"pdf_url":"https://arxiv.org/pdf/2412.12770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12754v1","updated":"2024-12-17T10:19:44Z","published":"2024-12-17T10:19:44Z","title":"Token-Level Graphs for Short Text Classification","summary":" The classification of short texts is a common subtask in Information\nRetrieval (IR). Recent advances in graph machine learning have led to interest\nin graph-based approaches for low resource scenarios, showing promise in such\nsettings. However, existing methods face limitations such as not accounting for\ndifferent meanings of the same words or constraints from transductive\napproaches. We propose an approach which constructs text graphs entirely based\non tokens obtained through pre-trained language models (PLMs). 
By applying a\nPLM to tokenize and embed the texts when creating the graph(-nodes), our method\ncaptures contextual and semantic information, overcomes vocabulary constraints,\nand allows for context-dependent word meanings. Our approach also makes\nclassification more efficient with reduced parameters compared to classical PLM\nfine-tuning, resulting in more robust training with few samples. Experimental\nresults demonstrate how our method consistently achieves higher scores or\non-par performance with existing methods, presenting an advancement in\ngraph-based text classification techniques. To support reproducibility of our\nwork we make all implementations publicly available to the\ncommunity\\footnote{\\url{https://github.com/doGregor/TokenGraph}}.\n","authors":["Gregor Donabauer","Udo Kruschwitz"],"pdf_url":"https://arxiv.org/pdf/2412.12754v1.pdf","comment":"Preprint accepted at the 47th European Conference on Information\n Retrieval (ECIR 2025)"},{"id":"http://arxiv.org/abs/2412.12612v1","updated":"2024-12-17T07:21:25Z","published":"2024-12-17T07:21:25Z","title":"SynthCypher: A Fully Synthetic Data Generation Framework for\n Text-to-Cypher Querying in Knowledge Graphs","summary":" Cypher, the query language for Neo4j graph databases, plays a critical role\nin enabling graph-based analytics and data exploration. While substantial\nresearch has been dedicated to natural language to SQL query generation\n(Text2SQL), the analogous problem for graph databases referred to as\nText2Cypher remains underexplored. In this work, we introduce SynthCypher, a\nfully synthetic and automated data generation pipeline designed to address this\ngap. SynthCypher employs a novel LLMSupervised Generation-Verification\nframework, ensuring syntactically and semantically correct Cypher queries\nacross diverse domains and query complexities. Using this pipeline, we create\nSynthCypher Dataset, a large-scale benchmark containing 29.8k Text2Cypher\ninstances. Fine-tuning open-source large language models (LLMs), including\nLLaMa-3.1- 8B, Mistral-7B, and QWEN-7B, on SynthCypher yields significant\nperformance improvements of up to 40% on the Text2Cypher test set and 30% on\nthe SPIDER benchmark adapted for graph databases. This work demonstrates that\nhigh-quality synthetic data can effectively advance the state-of-the-art in\nText2Cypher tasks.\n","authors":["Aman Tiwari","Shiva Krishna Reddy Malay","Vikas Yadav","Masoud Hashemi","Sathwik Tejaswi Madhusudhan"],"pdf_url":"https://arxiv.org/pdf/2412.12612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15328v2","updated":"2024-12-17T05:35:15Z","published":"2024-05-24T08:11:59Z","title":"Multi-Modal Recommendation Unlearning for Legal, Licensing, and Modality\n Constraints","summary":" User data spread across multiple modalities has popularized multi-modal\nrecommender systems (MMRS). They recommend diverse content such as products,\nsocial media posts, TikTok reels, etc., based on a user-item interaction graph.\nWith rising data privacy demands, recent methods propose unlearning private\nuser data from uni-modal recommender systems (RS). However, methods for\nunlearning item data related to outdated user preferences, revoked licenses,\nand legally requested removals are still largely unexplored.\n Previous RS unlearning methods are unsuitable for MMRS due to the\nincompatibility of their matrix-based representation with the multi-modal\nuser-item interaction graph. 
Moreover, their data partitioning step degrades\nperformance on each shard due to poor data heterogeneity and requires costly\nperformance aggregation across shards.\n This paper introduces MMRecUn, the first approach known to us for unlearning\nin MMRS and unlearning item data. Given a trained RS model, MMRecUn employs a\nnovel Reverse Bayesian Personalized Ranking (BPR) objective to enable the model\nto forget marked data. The reverse BPR attenuates the impact of user-item\ninteractions within the forget set, while the forward BPR reinforces the\nsignificance of user-item interactions within the retain set. Our experiments\ndemonstrate that MMRecUn outperforms baseline methods across various unlearning\nrequests when evaluated on benchmark MMRS datasets. MMRecUn achieves recall\nperformance improvements of up to 49.85% compared to baseline methods and is up\nto $\\mathbf{1.3}\\times$ faster than the Gold model, which is trained on retain\nset from scratch. MMRecUn offers significant advantages, including superiority\nin removing target interactions, preserving retained interactions, and zero\noverhead costs compared to previous methods. The code will be released after\nreview.\n","authors":["Yash Sinha","Murari Mandal","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2405.15328v2.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2402.06871v5","updated":"2024-12-17T05:19:27Z","published":"2024-02-10T03:21:13Z","title":"Non-autoregressive Generative Models for Reranking Recommendation","summary":" Contemporary recommendation systems are designed to meet users' needs by\ndelivering tailored lists of items that align with their specific demands or\ninterests. In a multi-stage recommendation system, reranking plays a crucial\nrole by modeling the intra-list correlations among items. The key challenge of\nreranking lies in the exploration of optimal sequences within the combinatorial\nspace of permutations. Recent research proposes a generator-evaluator learning\nparadigm, where the generator generates multiple feasible sequences and the\nevaluator picks out the best sequence based on the estimated listwise score.\nThe generator is of vital importance, and generative models are well-suited for\nthe generator function. Current generative models employ an autoregressive\nstrategy for sequence generation. However, deploying autoregressive models in\nreal-time industrial systems is challenging. To address these issues, we\npropose a Non-AutoRegressive generative model for reranking Recommendation\n(NAR4Rec) designed to enhance efficiency and effectiveness. To tackle\nchallenges such as sparse training samples and dynamic candidates, we introduce\na matching model. Considering the diverse nature of user feedback, we employ a\nsequence-level unlikelihood training objective to differentiate feasible\nsequences from unfeasible ones. Additionally, to overcome the lack of\ndependency modeling in non-autoregressive models regarding target items, we\nintroduce contrastive decoding to capture correlations among these items.\nExtensive offline experiments validate the superior performance of NAR4Rec over\nstate-of-the-art reranking methods. Online A/B tests reveal that NAR4Rec\nsignificantly enhances the user experience. 
Furthermore, NAR4Rec has been fully\ndeployed in a popular video app Kuaishou with over 300 million daily active\nusers.\n","authors":["Yuxin Ren","Qiya Yang","Yichun Wu","Wei Xu","Yalong Wang","Zhiqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.06871v5.pdf","comment":"Accepted by KDD 2024"},{"id":"http://arxiv.org/abs/2412.12504v1","updated":"2024-12-17T03:10:47Z","published":"2024-12-17T03:10:47Z","title":"Boosting LLM-based Relevance Modeling with Distribution-Aware Robust\n Learning","summary":" With the rapid advancement of pre-trained large language models (LLMs),\nrecent endeavors have leveraged the capabilities of LLMs in relevance modeling,\nresulting in enhanced performance. This is usually done through the process of\nfine-tuning LLMs on specifically annotated datasets to determine the relevance\nbetween queries and items. However, there are two limitations when LLMs are\nnaively employed for relevance modeling through fine-tuning and inference.\nFirst, it is not inherently efficient for performing nuanced tasks beyond\nsimple yes or no answers, such as assessing search relevance. It may therefore\ntend to be overconfident and struggle to distinguish fine-grained degrees of\nrelevance (e.g., strong relevance, weak relevance, irrelevance) used in search\nengines. Second, it exhibits significant performance degradation when\nconfronted with data distribution shift in real-world scenarios. In this paper,\nwe propose a novel Distribution-Aware Robust Learning framework (DaRL) for\nrelevance modeling in Alipay Search. Specifically, we design an effective loss\nfunction to enhance the discriminability of LLM-based relevance modeling across\nvarious fine-grained degrees of query-item relevance. To improve the\ngeneralizability of LLM-based relevance modeling, we first propose the\nDistribution-Aware Sample Augmentation (DASA) module. This module utilizes\nout-of-distribution (OOD) detection techniques to actively select appropriate\nsamples that are not well covered by the original training set for model\nfine-tuning. Furthermore, we adopt a multi-stage fine-tuning strategy to\nsimultaneously improve in-distribution (ID) and OOD performance, bridging the\nperformance gap between them. DaRL has been deployed online to serve the\nAlipay's insurance product search...\n","authors":["Hong Liu","Saisai Gong","Yixin Ji","Kaixin Wu","Jia Xu","Jinjie Gu"],"pdf_url":"https://arxiv.org/pdf/2412.12504v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2412.12464v1","updated":"2024-12-17T01:52:15Z","published":"2024-12-17T01:52:15Z","title":"LLM is Knowledge Graph Reasoner: LLM's Intuition-aware Knowledge Graph\n Reasoning for Cold-start Sequential Recommendation","summary":" Knowledge Graphs (KGs) represent relationships between entities in a graph\nstructure and have been widely studied as promising tools for realizing\nrecommendations that consider the accurate content information of items.\nHowever, traditional KG-based recommendation methods face fundamental\nchallenges: insufficient consideration of temporal information and poor\nperformance in cold-start scenarios. On the other hand, Large Language Models\n(LLMs) can be considered databases with a wealth of knowledge learned from the\nweb data, and they have recently gained attention due to their potential\napplication as recommendation systems. 
Although approaches that treat LLMs as\nrecommendation systems can leverage LLMs' high recommendation literacy, their\ninput token limitations make it impractical to consider the entire\nrecommendation domain dataset and result in scalability issues. To address\nthese challenges, we propose a LLM's Intuition-aware Knowledge graph Reasoning\nmodel (LIKR). Our main idea is to treat LLMs as reasoners that output intuitive\nexploration strategies for KGs. To integrate the knowledge of LLMs and KGs, we\ntrained a recommendation agent through reinforcement learning using a reward\nfunction that integrates different recommendation strategies, including LLM's\nintuition and KG embeddings. By incorporating temporal awareness through prompt\nengineering and generating textual representations of user preferences from\nlimited interactions, LIKR can improve recommendation performance in cold-start\nscenarios. Furthermore, LIKR can avoid scalability issues by using KGs to\nrepresent recommendation domain datasets and limiting the LLM's output to KG\nexploration strategies. Experiments on real-world datasets demonstrate that our\nmodel outperforms state-of-the-art recommendation methods in cold-start\nsequential recommendation scenarios.\n","authors":["Keigo Sakurai","Ren Togo","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2412.12464v1.pdf","comment":"Accepted to the 47th European Conference on Information Retrieval\n (ECIR2025)"},{"id":"http://arxiv.org/abs/2412.12459v1","updated":"2024-12-17T01:43:44Z","published":"2024-12-17T01:43:44Z","title":"LITA: An Efficient LLM-assisted Iterative Topic Augmentation Framework","summary":" Topic modeling is widely used for uncovering thematic structures within text\ncorpora, yet traditional models often struggle with specificity and coherence\nin domain-focused applications. Guided approaches, such as SeededLDA and CorEx,\nincorporate user-provided seed words to improve relevance but remain\nlabor-intensive and static. Large language models (LLMs) offer potential for\ndynamic topic refinement and discovery, yet their application often incurs high\nAPI costs. To address these challenges, we propose the LLM-assisted Iterative\nTopic Augmentation framework (LITA), an LLM-assisted approach that integrates\nuser-provided seeds with embedding-based clustering and iterative refinement.\nLITA identifies a small number of ambiguous documents and employs an LLM to\nreassign them to existing or new topics, minimizing API costs while enhancing\ntopic quality. Experiments on two datasets across topic quality and clustering\nperformance metrics demonstrate that LITA outperforms five baseline models,\nincluding LDA, SeededLDA, CorEx, BERTopic, and PromptTopic. Our work offers an\nefficient and adaptable framework for advancing topic modeling and text\nclustering.\n","authors":["Chia-Hsuan Chang","Jui-Tse Tsai","Yi-Hang Tsai","San-Yih Hwang"],"pdf_url":"https://arxiv.org/pdf/2412.12459v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2412.12433v1","updated":"2024-12-17T00:50:23Z","published":"2024-12-17T00:50:23Z","title":"Refining Dimensions for Improving Clustering-based Cross-lingual Topic\n Models","summary":" Recent works in clustering-based topic models perform well in monolingual\ntopic identification by introducing a pipeline to cluster the contextualized\nrepresentations. 
However, the pipeline is suboptimal in identifying topics\nacross languages due to the presence of language-dependent dimensions (LDDs)\ngenerated by multilingual language models. To address this issue, we introduce\na novel, SVD-based dimension refinement component into the pipeline of the\nclustering-based topic model. This component effectively neutralizes the\nnegative impact of LDDs, enabling the model to accurately identify topics\nacross languages. Our experiments on three datasets demonstrate that the\nupdated pipeline with the dimension refinement component generally outperforms\nother state-of-the-art cross-lingual topic models.\n","authors":["Chia-Hsuan Chang","Tien-Yuan Huang","Yi-Hang Tsai","Chia-Ming Chang","San-Yih Hwang"],"pdf_url":"https://arxiv.org/pdf/2412.12433v1.pdf","comment":"Accepted to 18th BUCC Workshop at COLING 2025"},{"id":"http://arxiv.org/abs/2409.12468v2","updated":"2024-12-17T00:25:46Z","published":"2024-09-19T05:14:55Z","title":"Familiarity-Aware Evidence Compression for Retrieval-Augmented\n Generation","summary":" Retrieval-augmented generation (RAG) improves large language models (LMs) by\nincorporating non-parametric knowledge through evidence retrieved from external\nsources. However, it often struggles to cope with inconsistent and irrelevant\ninformation that can distract the LM from its tasks, especially when multiple\nevidence pieces are required. While compressing the retrieved evidence with a\ncompression model aims to address this issue, the compressed evidence may still\nbe unfamiliar to the target model used for downstream tasks, potentially\nfailing to utilize the evidence effectively. We propose FaviComp\n(Familarity-Aware Evidence Compression), a novel training-free evidence\ncompression technique that makes retrieved evidence more familiar to the target\nmodel, while seamlessly integrating parametric knowledge from the model.\nExperimental results show that FaviComp consistently outperforms most recent\nevidence compression baselines across multiple open-domain QA datasets,\nimproving accuracy by up to 28.1% while achieving high compression rates.\nAdditionally, we demonstrate the effective integration of both parametric and\nnon-parametric knowledge during evidence compression.\n","authors":["Dongwon Jung","Qin Liu","Tenghao Huang","Ben Zhou","Muhao Chen"],"pdf_url":"https://arxiv.org/pdf/2409.12468v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07857v2","updated":"2024-12-17T00:14:25Z","published":"2023-08-15T16:16:02Z","title":"Impression-Aware Recommender Systems","summary":" Novel data sources bring new opportunities to improve the quality of\nrecommender systems and serve as a catalyst for the creation of new paradigms\non personalized recommendations. Impressions are a novel data source containing\nthe items shown to users on their screens. Past research focused on providing\npersonalized recommendations using interactions, and occasionally using\nimpressions when such a data source was available. Interest in impressions has\nincreased due to their potential to provide more accurate recommendations.\nDespite this increased interest, research in recommender systems using\nimpressions is still dispersed. Many works have distinct interpretations of\nimpressions and use impressions in recommender systems in numerous different\nmanners. 
To unify those interpretations into a single framework, we present a\nsystematic literature review on recommender systems using impressions, focusing\non three fundamental perspectives: recommendation models, datasets, and\nevaluation methodologies. We define a theoretical framework to delimit\nrecommender systems using impressions and a novel paradigm for personalized\nrecommendations, called impression-aware recommender systems. We propose a\nclassification system for recommenders in this paradigm, which we use to\ncategorize the recommendation models, datasets, and evaluation methodologies\nused in past research. Lastly, we identify open questions and future\ndirections, highlighting missing aspects in the reviewed literature.\n","authors":["Fernando B. Pérez Maurera","Maurizio Ferrari Dacrema","Pablo Castells","Paolo Cremonesi"],"pdf_url":"https://arxiv.org/pdf/2308.07857v2.pdf","comment":"44 pages, 127 references, 6 tables, 5 figures, ACM TORS ACCEPTED"}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.05658v2","updated":"2024-12-17T19:18:13Z","published":"2023-09-11T17:53:14Z","title":"From Capture to Display: A Survey on Volumetric Video","summary":" Volumetric video, which offers immersive viewing experiences, is gaining\nincreasing prominence. With its six degrees of freedom, it provides viewers\nwith greater immersion and interactivity compared to traditional videos.\nDespite their potential, volumetric video services pose significant challenges.\nThis survey conducts a comprehensive review of the existing literature on\nvolumetric video. We firstly provide a general framework of volumetric video\nservices, followed by a discussion on prerequisites for volumetric video,\nencompassing representations, open datasets, and quality assessment metrics.\nThen we delve into the current methodologies for each stage of the volumetric\nvideo service pipeline, detailing capturing, compression, transmission,\nrendering, and display techniques. Lastly, we explore various applications\nenabled by this pioneering technology and we present an array of research\nchallenges and opportunities in the domain of volumetric video services. This\nsurvey aspires to provide a holistic understanding of this burgeoning field and\nshed light on potential future research trajectories, aiming to bring the\nvision of volumetric video to fruition.\n","authors":["Yili Jin","Kaiyuan Hu","Junhua Liu","Fangxin Wang","Xue Liu"],"pdf_url":"https://arxiv.org/pdf/2309.05658v2.pdf","comment":"Major revision submitted to ACM Computing Surveys"},{"id":"http://arxiv.org/abs/2412.13119v1","updated":"2024-12-17T17:41:42Z","published":"2024-12-17T17:41:42Z","title":"Flight Patterns for Swarms of Drones","summary":" We present flight patterns for a collision-free passage of swarms of drones\nthrough one or more openings. The narrow openings provide drones with access to\nan infrastructure component such as charging stations to charge their depleted\nbatteries and hangars for storage. The flight patterns are a staging area\n(queues) that match the rate at which an infrastructure component and its\nopenings process drones. They prevent collisions and may implement different\npolicies that control the order in which drones pass through an opening. 
We\nillustrate the flight patterns with a 3D display that uses drones configured\nwith light sources to illuminate shapes.\n","authors":["Shuqin Zhu","Shahram Ghandeharizadeh"],"pdf_url":"https://arxiv.org/pdf/2412.13119v1.pdf","comment":"Appeared in the First International Conference on Holodecks, December\n 15, 2023. Shuqin Zhou and Shahram Ghandeharizadeh. Flight Patterns for Swarms\n of Drones. In the Proceedings of the First International Conference on\n Holodecks (Holodecks '23), December 15 2023, Los Angeles, California, USA,\n 29-33. https://doi.org/10.61981/ZFSH2303"},{"id":"http://arxiv.org/abs/2412.12938v1","updated":"2024-12-17T14:18:07Z","published":"2024-12-17T14:18:07Z","title":"A Conceptual Model of Intelligent Multimedia Data Rendered using Flying\n Light Specks","summary":" A Flying Light Speck, FLS, is a miniature sized drone configured with light\nsources to illuminate 3D multimedia objects in a fixed volume, an FLS display.\nA swarm of FLSs may provide haptic interactions by exerting force back at a\nuser's touch. This paper presents a conceptual model for the multimedia data to\nenable content-based queries. The model empowers users of an FLS display to\nannotate the illuminations by adding semantics to the data, extending a\nmultimedia repository with information and knowledge. We present a core\nconceptual model and demonstrate its extensions for two diverse applications,\nauthoring tools with entertainment and MRI scans with healthcare.\n","authors":["Nima Yazdani","Hamed Alimohammadzadeh","Shahram Ghandeharizadeh"],"pdf_url":"https://arxiv.org/pdf/2412.12938v1.pdf","comment":"Appeared in the First International Conference on Holodecks"},{"id":"http://arxiv.org/abs/2412.12791v1","updated":"2024-12-17T10:52:50Z","published":"2024-12-17T10:52:50Z","title":"Implicit Location-Caption Alignment via Complementary Masking for\n Weakly-Supervised Dense Video Captioning","summary":" Weakly-Supervised Dense Video Captioning (WSDVC) aims to localize and\ndescribe all events of interest in a video without requiring annotations of\nevent boundaries. This setting poses a great challenge in accurately locating\nthe temporal location of event, as the relevant supervision is unavailable.\nExisting methods rely on explicit alignment constraints between event locations\nand captions, which involve complex event proposal procedures during both\ntraining and inference. To tackle this problem, we propose a novel implicit\nlocation-caption alignment paradigm by complementary masking, which simplifies\nthe complex event proposal and localization process while maintaining\neffectiveness. Specifically, our model comprises two components: a dual-mode\nvideo captioning module and a mask generation module. The dual-mode video\ncaptioning module captures global event information and generates descriptive\ncaptions, while the mask generation module generates differentiable positive\nand negative masks for localizing the events. These masks enable the implicit\nalignment of event locations and captions by ensuring that captions generated\nfrom positively and negatively masked videos are complementary, thereby forming\na complete video description. In this way, even under weak supervision, the\nevent location and event caption can be aligned implicitly. 
Extensive\nexperiments on the public datasets demonstrate that our method outperforms\nexisting weakly-supervised methods and achieves competitive results compared to\nfully-supervised methods.\n","authors":["Shiping Ge","Qiang Chen","Zhiwei Jiang","Yafeng Yin","Liu Qin","Ziyao Chen","Qing Gu"],"pdf_url":"https://arxiv.org/pdf/2412.12791v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.12718v1","updated":"2024-12-17T09:33:06Z","published":"2024-12-17T09:33:06Z","title":"ASAP: Advancing Semantic Alignment Promotes Multi-Modal Manipulation\n Detecting and Grounding","summary":" We present ASAP, a new framework for detecting and grounding multi-modal\nmedia manipulation (DGM4).Upon thorough examination, we observe that accurate\nfine-grained cross-modal semantic alignment between the image and text is vital\nfor accurately manipulation detection and grounding. While existing DGM4\nmethods pay rare attention to the cross-modal alignment, hampering the accuracy\nof manipulation detecting to step further. To remedy this issue, this work\ntargets to advance the semantic alignment learning to promote this task.\nParticularly, we utilize the off-the-shelf Multimodal Large-Language Models\n(MLLMs) and Large Language Models (LLMs) to construct paired image-text pairs,\nespecially for the manipulated instances. Subsequently, a cross-modal alignment\nlearning is performed to enhance the semantic alignment. Besides the explicit\nauxiliary clues, we further design a Manipulation-Guided Cross Attention (MGCA)\nto provide implicit guidance for augmenting the manipulation perceiving. With\nthe grounding truth available during training, MGCA encourages the model to\nconcentrate more on manipulated components while downplaying normal ones,\nenhancing the model's ability to capture manipulations. Extensive experiments\nare conducted on the DGM4 dataset, the results demonstrate that our model can\nsurpass the comparison method with a clear margin.\n","authors":["Zhenxing Zhang","Yaxiong Wang","Lechao Cheng","Zhun Zhong","Dan Guo","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2412.12718v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2412.11248v2","updated":"2024-12-17T07:31:27Z","published":"2024-12-15T16:54:53Z","title":"Multimodal Class-aware Semantic Enhancement Network for Audio-Visual\n Video Parsing","summary":" The Audio-Visual Video Parsing task aims to recognize and temporally localize\nall events occurring in either the audio or visual stream, or both. Capturing\naccurate event semantics for each audio/visual segment is vital. Prior works\ndirectly utilize the extracted holistic audio and visual features for intra-\nand cross-modal temporal interactions. However, each segment may contain\nmultiple events, resulting in semantically mixed holistic features that can\nlead to semantic interference during intra- or cross-modal interactions: the\nevent semantics of one segment may incorporate semantics of unrelated events\nfrom other segments. To address this issue, our method begins with a\nClass-Aware Feature Decoupling (CAFD) module, which explicitly decouples the\nsemantically mixed features into distinct class-wise features, including\nmultiple event-specific features and a dedicated background feature. 
The\ndecoupled class-wise features enable our model to selectively aggregate useful\nsemantics for each segment from clearly matched classes contained in other\nsegments, preventing semantic interference from irrelevant classes.\nSpecifically, we further design a Fine-Grained Semantic Enhancement module for\nencoding intra- and cross-modal relations. It comprises a Segment-wise Event\nCo-occurrence Modeling (SECM) block and a Local-Global Semantic Fusion (LGSF)\nblock. The SECM exploits inter-class dependencies of concurrent events within\nthe same timestamp with the aid of a new event co-occurrence loss. The LGSF\nfurther enhances the event semantics of each segment by incorporating relevant\nsemantics from more informative global video features. Extensive experiments\nvalidate the effectiveness of the proposed modules and loss functions,\nresulting in a new state-of-the-art parsing performance.\n","authors":["Pengcheng Zhao","Jinxing Zhou","Yang Zhao","Dan Guo","Yanxiang Chen"],"pdf_url":"https://arxiv.org/pdf/2412.11248v2.pdf","comment":"Accepted by AAAI-2025"},{"id":"http://arxiv.org/abs/2407.20962v3","updated":"2024-12-17T07:13:38Z","published":"2024-07-30T16:43:24Z","title":"MMTrail: A Multimodal Trailer Video Dataset with Language and Music\n Descriptions","summary":" Massive multi-modality datasets play a significant role in facilitating the\nsuccess of large video-language models. However, current video-language\ndatasets primarily provide text descriptions for visual frames, considering\naudio to be weakly related information. They usually overlook exploring the\npotential of inherent audio-visual correlation, leading to monotonous\nannotation within each modality instead of comprehensive and precise\ndescriptions. Such ignorance results in the difficulty of multiple\ncross-modality studies. To fulfill this gap, we present MMTrail, a large-scale\nmulti-modality video-language dataset incorporating more than 20M trailer clips\nwith visual captions, and 2M high-quality clips with multimodal captions.\nTrailers preview full-length video works and integrate context, visual frames,\nand background music. In particular, the trailer has two main advantages: (1)\nthe topics are diverse, and the content characters are of various types, e.g.,\nfilm, news, and gaming. (2) the corresponding background music is\ncustom-designed, making it more coherent with the visual context. Upon these\ninsights, we propose a systemic captioning framework, achieving various\nmodality annotations with more than 27.1k hours of trailer videos. Here, to\nensure the caption retains music perspective while preserving the authority of\nvisual context, we leverage the advanced LLM to merge all annotations\nadaptively. In this fashion, our MMtrail dataset potentially paves the path for\nfine-grained large multimodal-language model training. In experiments, we\nprovide evaluation metrics and benchmark results on our dataset, demonstrating\nthe high quality of our annotation and its effectiveness for model training.\n","authors":["Xiaowei Chi","Yatian Wang","Aosong Cheng","Pengjun Fang","Zeyue Tian","Yingqing He","Zhaoyang Liu","Xingqun Qi","Jiahao Pan","Rongyu Zhang","Mengfei Li","Ruibin Yuan","Yanbing Jiang","Wei Xue","Wenhan Luo","Qifeng Chen","Shanghang Zhang","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2407.20962v3.pdf","comment":"15 Pages. 
Dataset report"},{"id":"http://arxiv.org/abs/2310.19180v4","updated":"2024-12-17T04:08:33Z","published":"2023-10-29T22:51:49Z","title":"JEN-1 Composer: A Unified Framework for High-Fidelity Multi-Track Music\n Generation","summary":" With rapid advances in generative artificial intelligence, the text-to-music\nsynthesis task has emerged as a promising direction for music generation.\nNevertheless, achieving precise control over multi-track generation remains an\nopen challenge. While existing models excel in directly generating multi-track\nmix, their limitations become evident when it comes to composing individual\ntracks and integrating them in a controllable manner. This departure from the\ntypical workflows of professional composers hinders the ability to refine\ndetails in specific tracks. To address this gap, we propose JEN-1 Composer, a\nunified framework designed to efficiently model marginal, conditional, and\njoint distributions over multi-track music using a single model. Building upon\nan audio latent diffusion model, JEN-1 Composer extends the versatility of\nmulti-track music generation. We introduce a progressive curriculum training\nstrategy, which gradually escalates the difficulty of training tasks while\nensuring the model's generalization ability and facilitating smooth transitions\nbetween different scenarios. During inference, users can iteratively generate\nand select music tracks, thus incrementally composing entire musical pieces in\naccordance with the Human-AI co-composition workflow. Our approach demonstrates\nstate-of-the-art performance in controllable and high-fidelity multi-track\nmusic synthesis, marking a significant advancement in interactive AI-assisted\nmusic creation. Our demo pages are available at www.jenmusic.ai/research.\n","authors":["Yao Yao","Peike Li","Boyu Chen","Alex Wang"],"pdf_url":"https://arxiv.org/pdf/2310.19180v4.pdf","comment":"9 pages, 3 figures, 3 tables, accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.11409v2","updated":"2024-12-17T03:50:05Z","published":"2024-12-16T03:25:23Z","title":"Multi-modal and Multi-scale Spatial Environment Understanding for\n Immersive Visual Text-to-Speech","summary":" Visual Text-to-Speech (VTTS) aims to take the environmental image as the\nprompt to synthesize the reverberant speech for the spoken content. The\nchallenge of this task lies in understanding the spatial environment from the\nimage. Many attempts have been made to extract global spatial visual\ninformation from the RGB space of an spatial image. However, local and depth\nimage information are crucial for understanding the spatial environment, which\nprevious works have ignored. To address the issues, we propose a novel\nmulti-modal and multi-scale spatial environment understanding scheme to achieve\nimmersive VTTS, termed M2SE-VTTS. The multi-modal aims to take both the RGB and\nDepth spaces of the spatial image to learn more comprehensive spatial\ninformation, and the multi-scale seeks to model the local and global spatial\nknowledge simultaneously. Specifically, we first split the RGB and Depth images\ninto patches and adopt the Gemini-generated environment captions to guide the\nlocal spatial understanding. After that, the multi-modal and multi-scale\nfeatures are integrated by the local-aware global spatial understanding. In\nthis way, M2SE-VTTS effectively models the interactions between local and\nglobal spatial contexts in the multi-modal spatial environment. 
Objective and\nsubjective evaluations suggest that our model outperforms the advanced\nbaselines in environmental speech generation. The code and audio samples are\navailable at: https://github.com/AI-S2-Lab/M2SE-VTTS.\n","authors":["Rui Liu","Shuwei He","Yifan Hu","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2412.11409v2.pdf","comment":"9 pages,2 figures, Accepted by AAAI'2025"},{"id":"http://arxiv.org/abs/2409.10994v3","updated":"2024-12-17T02:05:27Z","published":"2024-09-17T08:56:27Z","title":"Less is More: A Simple yet Effective Token Reduction Method for\n Efficient Multi-modal LLMs","summary":" The rapid advancement of Multimodal Large Language Models (MLLMs) has led to\nremarkable performances across various domains. However, this progress is\naccompanied by a substantial surge in the resource consumption of these models.\nWe address this pressing issue by introducing a new approach, Token Reduction\nusing CLIP Metric (TRIM), aimed at improving the efficiency of MLLMs without\nsacrificing their performance. Inspired by human attention patterns in Visual\nQuestion Answering (VQA) tasks, TRIM presents a fresh perspective on the\nselection and reduction of image tokens. The TRIM method has been extensively\ntested across 12 datasets, and the results demonstrate a significant reduction\nin computational overhead while maintaining a consistent level of performance.\nThis research marks a critical stride in efficient MLLM development, promoting\ngreater accessibility and sustainability of high-performing models.\n","authors":["Dingjie Song","Wenjun Wang","Shunian Chen","Xidong Wang","Michael Guan","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.10994v3.pdf","comment":"Accepted to COLING 2025"},{"id":"http://arxiv.org/abs/2412.12453v1","updated":"2024-12-17T01:36:32Z","published":"2024-12-17T01:36:32Z","title":"Multimodal Classification and Out-of-distribution Detection for\n Multimodal Intent Understanding","summary":" Multimodal intent understanding is a significant research area that requires\neffectively leveraging multiple modalities to analyze human language. Existing\nmethods face two main challenges in this domain. Firstly, they have limitations\nin capturing nuanced and high-level semantics underlying complex\nin-distribution (ID) multimodal intents. Secondly, they exhibit poor\ngeneralization when confronted with unseen out-of-distribution (OOD) data in\nreal-world scenarios. To address these issues, we propose a novel method for\nboth ID classification and OOD detection (MIntOOD). We first introduce a\nweighted feature fusion network that models multimodal representations\neffectively. This network dynamically learns the importance of each modality,\nadapting to multimodal contexts. To develop discriminative representations that\nare conducive to both tasks, we synthesize pseudo-OOD data from convex\ncombinations of ID data and engage in multimodal representation learning from\nboth coarse-grained and fine-grained perspectives. The coarse-grained\nperspective focuses on distinguishing between ID and OOD binary classes, while\nthe fine-grained perspective enhances the understanding of ID data by\nincorporating binary confidence scores. These scores help to gauge the\ndifficulty of each sample, improving the classification of different ID\nclasses. Additionally, the fine-grained perspective captures instance-level\ninteractions between ID and OOD samples, promoting proximity among similar\ninstances and separation from dissimilar ones. 
We establish baselines for three\nmultimodal intent datasets and build an OOD benchmark. Extensive experiments on\nthese datasets demonstrate that our method significantly improves OOD detection\nperformance with a 3-10% increase in AUROC scores while achieving new\nstate-of-the-art results in ID classification. The full data and codes are\navailable at https://github.com/thuiar/MIntOOD.\n","authors":["Hanlei Zhang","Qianrui Zhou","Hua Xu","Jianhua Su","Roberto Evans","Kai Gao"],"pdf_url":"https://arxiv.org/pdf/2412.12453v1.pdf","comment":"15 pages, 4 figures"}]},"2024-12-16T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2412.12330v1","updated":"2024-12-16T20:00:51Z","published":"2024-12-16T20:00:51Z","title":"Searching Personal Collections","summary":" This article describes the history of information retrieval on personal\ndocument collections.\n","authors":["Michael Bendersky","Donald Metzler","Marc Najork","Xuanhui Wang"],"pdf_url":"https://arxiv.org/pdf/2412.12330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.12322v1","updated":"2024-12-16T19:40:26Z","published":"2024-12-16T19:40:26Z","title":"RAG Playground: A Framework for Systematic Evaluation of Retrieval\n Strategies and Prompt Engineering in RAG Systems","summary":" We present RAG Playground, an open-source framework for systematic evaluation\nof Retrieval-Augmented Generation (RAG) systems. The framework implements and\ncompares three retrieval approaches: naive vector search, reranking, and hybrid\nvector-keyword search, combined with ReAct agents using different prompting\nstrategies. We introduce a comprehensive evaluation framework with novel\nmetrics and provide empirical results comparing different language models\n(Llama 3.1 and Qwen 2.5) across various retrieval configurations. Our\nexperiments demonstrate significant performance improvements through hybrid\nsearch methods and structured self-evaluation prompting, achieving up to 72.7%\npass rate on our multi-metric evaluation framework. The results also highlight\nthe importance of prompt engineering in RAG systems, with our custom-prompted\nagents showing consistent improvements in retrieval accuracy and response\nquality.\n","authors":["Ioannis Papadimitriou","Ilias Gialampoukidis","Stefanos Vrochidis"," Ioannis"," Kompatsiaris"],"pdf_url":"https://arxiv.org/pdf/2412.12322v1.pdf","comment":"Work In Progress"},{"id":"http://arxiv.org/abs/2412.12092v1","updated":"2024-12-16T18:58:28Z","published":"2024-12-16T18:58:28Z","title":"No More Tuning: Prioritized Multi-Task Learning with Lagrangian\n Differential Multiplier Methods","summary":" Given the ubiquity of multi-task in practical systems, Multi-Task Learning\n(MTL) has found widespread application across diverse domains. In real-world\nscenarios, these tasks often have different priorities. For instance, In web\nsearch, relevance is often prioritized over other metrics, such as\nclick-through rates or user engagement. Existing frameworks pay insufficient\nattention to the prioritization among different tasks, which typically adjust\ntask-specific loss function weights to differentiate task priorities. However,\nthis approach encounters challenges as the number of tasks grows, leading to\nexponential increases in hyper-parameter tuning complexity. 
Furthermore, the\nsimultaneous optimization of multiple objectives can negatively impact the\nperformance of high-priority tasks due to interference from lower-priority\ntasks.\n In this paper, we introduce a novel multi-task learning framework employing\nLagrangian Differential Multiplier Methods for step-wise multi-task\noptimization. It is designed to boost the performance of high-priority tasks\nwithout interference from other tasks. Its primary advantage lies in its\nability to automatically optimize multiple objectives without requiring\nbalancing hyper-parameters for different tasks, thereby eliminating the need\nfor manual tuning. Additionally, we provide theoretical analysis demonstrating\nthat our method ensures optimization guarantees, enhancing the reliability of\nthe process. We demonstrate its effectiveness through experiments on multiple\npublic datasets and its application in Taobao search, a large-scale industrial\nsearch ranking system, resulting in significant improvements across various\nbusiness metrics.\n","authors":["Zhengxing Cheng","Yuheng Huang","Zhixuan Zhang","Dan Ou","Qingwen Liu"],"pdf_url":"https://arxiv.org/pdf/2412.12092v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.06954v3","updated":"2024-12-16T16:18:28Z","published":"2024-12-09T20:01:59Z","title":"CURE: A dataset for Clinical Understanding & Retrieval Evaluation","summary":" Given the dominance of dense retrievers that do not generalize well beyond\ntheir training dataset distributions, domain-specific test sets are essential\nin evaluating retrieval. There are few test datasets for retrieval systems\nintended for use by healthcare providers in a point-of-care setting. To fill\nthis gap we have collaborated with medical professionals to create CURE, an\nad-hoc retrieval test dataset for passage ranking with 2000 queries spanning 10\nmedical domains with a monolingual (English) and two cross-lingual\n(French/Spanish -> English) conditions. In this paper, we describe how CURE was\nconstructed and provide baseline results to showcase its effectiveness as an\nevaluation tool. CURE is published with a Creative Commons Attribution Non\nCommercial 4.0 license and can be accessed on Hugging Face.\n","authors":["Nadia Sheikh","Anne-Laure Jousse","Daniel Buades Marcos","Akintunde Oladipo","Olivier Rousseau","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2412.06954v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11919v1","updated":"2024-12-16T16:03:25Z","published":"2024-12-16T16:03:25Z","title":"RetroLLM: Empowering Large Language Models to Retrieve Fine-grained\n Evidence within Generation","summary":" Large language models (LLMs) exhibit remarkable generative capabilities but\noften suffer from hallucinations. Retrieval-augmented generation (RAG) offers\nan effective solution by incorporating external knowledge, but existing methods\nstill face several limitations: additional deployment costs of separate\nretrievers, redundant input tokens from retrieved text chunks, and the lack of\njoint optimization of retrieval and generation. To address these issues, we\npropose \\textbf{RetroLLM}, a unified framework that integrates retrieval and\ngeneration into a single, cohesive process, enabling LLMs to directly generate\nfine-grained evidence from the corpus with constrained decoding. 
Moreover, to\nmitigate false pruning in the process of constrained evidence generation, we\nintroduce (1) hierarchical FM-Index constraints, which generate\ncorpus-constrained clues to identify a subset of relevant documents before\nevidence generation, reducing irrelevant decoding space; and (2) a\nforward-looking constrained decoding strategy, which considers the relevance of\nfuture sequences to improve evidence accuracy. Extensive experiments on five\nopen-domain QA datasets demonstrate RetroLLM's superior performance across both\nin-domain and out-of-domain tasks. The code is available at\n\\url{https://github.com/sunnynexus/RetroLLM}.\n","authors":["Xiaoxi Li","Jiajie Jin","Yujia Zhou","Yongkang Wu","Zhonghua Li","Qi Ye","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2412.11919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11864v1","updated":"2024-12-16T15:20:13Z","published":"2024-12-16T15:20:13Z","title":"Investigating Mixture of Experts in Dense Retrieval","summary":" While Dense Retrieval Models (DRMs) have advanced Information Retrieval (IR),\none limitation of these neural models is their narrow generalizability and\nrobustness. To cope with this issue, one can leverage the Mixture-of-Experts\n(MoE) architecture. While previous IR studies have incorporated MoE\narchitectures within the Transformer layers of DRMs, our work investigates an\narchitecture that integrates a single MoE block (SB-MoE) after the output of\nthe final Transformer layer. Our empirical evaluation investigates how SB-MoE\ncompares, in terms of retrieval effectiveness, to standard fine-tuning. In\ndetail, we fine-tune three DRMs (TinyBERT, BERT, and Contriever) across four\nbenchmark collections with and without adding the MoE block. Moreover, since\nMoE showcases performance variations with respect to its parameters (i.e., the\nnumber of experts), we conduct additional experiments to investigate this\naspect further. The findings show the effectiveness of SB-MoE especially for\nDRMs with a low number of parameters (i.e., TinyBERT), as it consistently\noutperforms the fine-tuned underlying model on all four benchmarks. For DRMs\nwith a higher number of parameters (i.e., BERT and Contriever), SB-MoE requires\nlarger numbers of training samples to yield better retrieval performance.\n","authors":["Effrosyni Sokli","Pranav Kasela","Georgios Peikos","Gabriella Pasi"],"pdf_url":"https://arxiv.org/pdf/2412.11864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02729v2","updated":"2024-12-16T15:11:11Z","published":"2024-10-03T17:49:09Z","title":"Unified Multimodal Interleaved Document Representation for Retrieval","summary":" Information Retrieval (IR) methods aim to identify documents relevant to a\nquery, which have been widely applied in various natural language tasks.\nHowever, existing approaches typically consider only the textual content within\ndocuments, overlooking the fact that documents can contain multiple modalities,\nincluding images and tables. Also, they often segment each long document into\nmultiple discrete passages for embedding, which prevents them from capturing\nthe overall document context and interactions between paragraphs. To address\nthese two challenges, we propose a method that holistically embeds documents\ninterleaved with multiple modalities by leveraging the capability of recent\nvision-language models that enable the processing and integration of text,\nimages, and tables into a unified format and representation. 
Moreover, to\nmitigate the information loss from segmenting documents into passages, instead\nof representing and retrieving passages individually, we further merge the\nrepresentations of segmented passages into one single document representation,\nwhile we additionally introduce a reranking strategy to decouple and identify\nthe relevant passage within the document if necessary. Then, through extensive\nexperiments on diverse IR scenarios considering both the textual and multimodal\nqueries, we show that our approach substantially outperforms relevant\nbaselines, thanks to the consideration of the multimodal information within\ndocuments.\n","authors":["Jaewoo Lee","Joonho Ko","Jinheon Baek","Soyeong Jeong","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2410.02729v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2412.11846v1","updated":"2024-12-16T15:08:44Z","published":"2024-12-16T15:08:44Z","title":"SPGL: Enhancing Session-based Recommendation with Single Positive Graph\n Learning","summary":" Session-based recommendation seeks to forecast the next item a user will be\ninterested in, based on their interaction sequences. Due to limited interaction\ndata, session-based recommendation faces the challenge of limited data\navailability. Traditional methods enhance feature learning by constructing\ncomplex models to generate positive and negative samples. This paper proposes a\nsession-based recommendation model using Single Positive optimization loss and\nGraph Learning (SPGL) to deal with the problem of data sparsity, high model\ncomplexity and weak transferability. SPGL utilizes graph convolutional networks\nto generate global item representations and batch session representations,\neffectively capturing intrinsic relationships between items. The use of single\npositive optimization loss improves uniformity of item representations, thereby\nenhancing recommendation accuracy. In the intent extractor, SPGL considers the\nhop count of the adjacency matrix when constructing the directed global graph\nto fully integrate spatial information. It also takes into account the reverse\npositional information of items when constructing session representations to\nincorporate temporal information. Comparative experiments across three\nbenchmark datasets, Tmall, RetailRocket and Diginetica, demonstrate the model's\neffectiveness. The source code can be accessed on\nhttps://github.com/liang-tian-tian/SPGL .\n","authors":["Tiantian Liang","Zhe Yang"],"pdf_url":"https://arxiv.org/pdf/2412.11846v1.pdf","comment":"ICONIP 2024"},{"id":"http://arxiv.org/abs/2412.11832v1","updated":"2024-12-16T14:55:57Z","published":"2024-12-16T14:55:57Z","title":"A Distributed Collaborative Retrieval Framework Excelling in All Queries\n and Corpora based on Zero-shot Rank-Oriented Automatic Evaluation","summary":" Numerous retrieval models, including sparse, dense and llm-based methods,\nhave demonstrated remarkable performance in predicting the relevance between\nqueries and corpora. However, the preliminary effectiveness analysis\nexperiments indicate that these models fail to achieve satisfactory performance\non the majority of queries and corpora, revealing their effectiveness\nrestricted to specific scenarios. Thus, to tackle this problem, we propose a\nnovel Distributed Collaborative Retrieval Framework (DCRF), outperforming each\nsingle model across all queries and corpora. 
Specifically, the framework\nintegrates various retrieval models into a unified system and dynamically\nselects the optimal results for each user's query. It can easily aggregate any\nretrieval model and expand to any application scenarios, illustrating its\nflexibility and scalability.Moreover, to reduce maintenance and training costs,\nwe design four effective prompting strategies with large language models (LLMs)\nto evaluate the quality of ranks without reliance of labeled data. Extensive\nexperiments demonstrate that proposed framework, combined with 8 efficient\nretrieval models, can achieve performance comparable to effective listwise\nmethods like RankGPT and ListT5, while offering superior efficiency. Besides,\nDCRF surpasses all selected retrieval models on the most datasets, indicating\nthe effectiveness of our prompting strategies on rank-oriented automatic\nevaluation.\n","authors":["Tian-Yi Che","Xian-Ling Mao","Chun Xu","Cheng-Xin Xin","Heng-Da Xu","Jin-Yu Liu","Heyan Huang"],"pdf_url":"https://arxiv.org/pdf/2412.11832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11818v1","updated":"2024-12-16T14:35:32Z","published":"2024-12-16T14:35:32Z","title":"Leveraging User-Generated Metadata of Online Videos for Cover Song\n Identification","summary":" YouTube is a rich source of cover songs. Since the platform itself is\norganized in terms of videos rather than songs, the retrieval of covers is not\ntrivial. The field of cover song identification addresses this problem and\nprovides approaches that usually rely on audio content. However, including the\nuser-generated video metadata available on YouTube promises improved\nidentification results. In this paper, we propose a multi-modal approach for\ncover song identification on online video platforms. We combine the entity\nresolution models with audio-based approaches using a ranking model. Our\nfindings implicate that leveraging user-generated metadata can stabilize cover\nsong identification performance on YouTube.\n","authors":["Simon Hachmeier","Robert Jäschke"],"pdf_url":"https://arxiv.org/pdf/2412.11818v1.pdf","comment":"accepted for presentation at NLP for Music and Audio (NLP4MusA) 2024"},{"id":"http://arxiv.org/abs/2412.11787v1","updated":"2024-12-16T13:59:10Z","published":"2024-12-16T13:59:10Z","title":"A Method for Detecting Legal Article Competition for Korean Criminal Law\n Using a Case-augmented Mention Graph","summary":" As social systems become increasingly complex, legal articles are also\ngrowing more intricate, making it progressively harder for humans to identify\nany potential competitions among them, particularly when drafting new laws or\napplying existing laws. Despite this challenge, no method for detecting such\ncompetitions has been proposed so far. In this paper, we propose a new legal AI\ntask called Legal Article Competition Detection (LACD), which aims to identify\ncompeting articles within a given law. Our novel retrieval method, CAM-Re2,\noutperforms existing relevant methods, reducing false positives by 20.8% and\nfalse negatives by 8.3%, while achieving a 98.2% improvement in precision@5,\nfor the LACD task. 
We release our codes at\nhttps://github.com/asmath472/LACD-public.\n","authors":["Seonho An","Young Yik Rhim","Min-Soo Kim"],"pdf_url":"https://arxiv.org/pdf/2412.11787v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2412.11758v1","updated":"2024-12-16T13:22:34Z","published":"2024-12-16T13:22:34Z","title":"Establishing a Foundation for Tetun Text Ad-Hoc Retrieval: Indexing,\n Stemming, Retrieval, and Ranking","summary":" Searching for information on the internet and digital platforms to satisfy an\ninformation need requires effective retrieval solutions. However, such\nsolutions are not yet available for Tetun, making it challenging to find\nrelevant documents for text-based search queries in this language. To address\nthese challenges, this study investigates Tetun text retrieval with a focus on\nthe ad-hoc retrieval task. It begins by developing essential language resources\n-- including a list of stopwords, a stemmer, and a test collection -- which\nserve as foundational components for solutions tailored to Tetun text\nretrieval. Various strategies are then explored using both document titles and\ncontent to evaluate retrieval effectiveness. The results show that retrieving\ndocument titles, after removing hyphens and apostrophes without applying\nstemming, significantly improves retrieval performance compared to the\nbaseline. Efficiency increases by 31.37%, while effectiveness achieves an\naverage gain of 9.40% in MAP@10 and 30.35% in nDCG@10 with DFR BM25. Beyond the\ntop-10 cutoff point, Hiemstra LM demonstrates strong performance across various\nretrieval strategies and evaluation metrics. Contributions of this work include\nthe development of Labadain-Stopwords (a list of 160 Tetun stopwords),\nLabadain-Stemmer (a Tetun stemmer with three variants), and\nLabadain-Avaliad\\'or (a Tetun test collection containing 59 topics, 33,550\ndocuments, and 5,900 qrels).\n","authors":["Gabriel de Jesus","Sérgio Nunes"],"pdf_url":"https://arxiv.org/pdf/2412.11758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11747v1","updated":"2024-12-16T13:05:13Z","published":"2024-12-16T13:05:13Z","title":"Beyond Graph Convolution: Multimodal Recommendation with Topology-aware\n MLPs","summary":" Given the large volume of side information from different modalities,\nmultimodal recommender systems have become increasingly vital, as they exploit\nricher semantic information beyond user-item interactions. Recent works\nhighlight that leveraging Graph Convolutional Networks (GCNs) to explicitly\nmodel multimodal item-item relations can significantly enhance recommendation\nperformance. However, due to the inherent over-smoothing issue of GCNs,\nexisting models benefit only from shallow GCNs with limited representation\npower. This drawback is especially pronounced when facing complex and\nhigh-dimensional patterns such as multimodal data, as it requires\nlarge-capacity models to accommodate complicated correlations. To this end, in\nthis paper, we investigate bypassing GCNs when modeling multimodal item-item\nrelationship. More specifically, we propose a Topology-aware Multi-Layer\nPerceptron (TMLP), which uses MLPs instead of GCNs to model the relationships\nbetween items. TMLP enhances MLPs with topological pruning to denoise item-item\nrelations and intra (inter)-modality learning to integrate higher-order\nmodality correlations. Extensive experiments on three real-world datasets\nverify TMLP's superiority over nine baselines. 
We also find that by discarding\nthe internal message passing in GCNs, which is sensitive to node connections,\nTMLP achieves significant improvements in both training efficiency and\nrobustness against existing models.\n","authors":["Junjie Huang","Jiarui Qin","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.11747v1.pdf","comment":"AAAI 2025. 11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2412.11729v1","updated":"2024-12-16T12:53:06Z","published":"2024-12-16T12:53:06Z","title":"STAIR: Manipulating Collaborative and Multimodal Information for\n E-Commerce Recommendation","summary":" While the mining of modalities is the focus of most multimodal recommendation\nmethods, we believe that how to fully utilize both collaborative and multimodal\ninformation is pivotal in e-commerce scenarios where, as clarified in this\nwork, the user behaviors are rarely determined entirely by multimodal features.\nIn order to combine the two distinct types of information, some additional\nchallenges are encountered: 1) Modality erasure: Vanilla graph convolution,\nwhich proves rather useful in collaborative filtering, however erases\nmultimodal information; 2) Modality forgetting: Multimodal information tends to\nbe gradually forgotten as the recommendation loss essentially facilitates the\nlearning of collaborative information. To this end, we propose a novel approach\nnamed STAIR, which employs a novel STepwise grAph convolution to enable a\nco-existence of collaborative and multimodal Information in e-commerce\nRecommendation. Besides, it starts with the raw multimodal features as an\ninitialization, and the forgetting problem can be significantly alleviated\nthrough constrained embedding updates. As a result, STAIR achieves\nstate-of-the-art recommendation performance on three public e-commerce datasets\nwith minimal computational and memory costs. Our code is available at\nhttps://github.com/yhhe2004/STAIR.\n","authors":["Cong Xu","Yunhang He","Jun Wang","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.11729v1.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2409.10907v2","updated":"2024-12-16T12:42:07Z","published":"2024-09-17T05:54:25Z","title":"Attention-Seeker: Dynamic Self-Attention Scoring for Unsupervised\n Keyphrase Extraction","summary":" This paper proposes Attention-Seeker, an unsupervised keyphrase extraction\nmethod that leverages self-attention maps from a Large Language Model to\nestimate the importance of candidate phrases. Our approach identifies specific\ncomponents - such as layers, heads, and attention vectors - where the model\npays significant attention to the key topics of the text. The attention weights\nprovided by these components are then used to score the candidate phrases.\nUnlike previous models that require manual tuning of parameters (e.g.,\nselection of heads, prompts, hyperparameters), Attention-Seeker dynamically\nadapts to the input text without any manual adjustments, enhancing its\npractical applicability. We evaluate Attention-Seeker on four publicly\navailable datasets: Inspec, SemEval2010, SemEval2017, and Krapivin. Our results\ndemonstrate that, even without parameter tuning, Attention-Seeker outperforms\nmost baseline models, achieving state-of-the-art performance on three out of\nfour datasets, particularly excelling in extracting keyphrases from long\ndocuments.\n","authors":["Erwin D. 
López Z.","Cheng Tang","Atsushi Shimada"],"pdf_url":"https://arxiv.org/pdf/2409.10907v2.pdf","comment":"This version has been accepted for presentation at COLING 2025, and\n all peer-reviewed changes have been incorporated"},{"id":"http://arxiv.org/abs/2410.06618v2","updated":"2024-12-16T09:52:32Z","published":"2024-10-09T07:14:49Z","title":"Text Proxy: Decomposing Retrieval from a 1-to-N Relationship into N\n 1-to-1 Relationships for Text-Video Retrieval","summary":" Text-video retrieval (TVR) has seen substantial advancements in recent years,\nfueled by the utilization of pre-trained models and large language models\n(LLMs). Despite these advancements, achieving accurate matching in TVR remains\nchallenging due to inherent disparities between video and textual modalities\nand irregularities in data representation. In this paper, we propose\nText-Video-ProxyNet (TV-ProxyNet), a novel framework designed to decompose the\nconventional 1-to-N relationship of TVR into N distinct 1-to-1 relationships.\nBy replacing a single text query with a series of text proxies, TV-ProxyNet not\nonly broadens the query scope but also achieves a more precise expansion. Each\ntext proxy is crafted through a refined iterative process, controlled by\nmechanisms we term as the director and dash, which regulate the proxy's\ndirection and distance relative to the original text query. This setup not only\nfacilitates more precise semantic alignment but also effectively manages the\ndisparities and noise inherent in multimodal data. Our experiments on three\nrepresentative video-text retrieval benchmarks, MSRVTT, DiDeMo, and ActivityNet\nCaptions, demonstrate the effectiveness of TV-ProxyNet. The results show an\nimprovement of 2.0% to 3.3% in R@1 over the baseline. TV-ProxyNet achieved\nstate-of-the-art performance on MSRVTT and ActivityNet Captions, and a 2.0%\nimprovement on DiDeMo compared to existing methods, validating our approach's\nability to enhance semantic mapping and reduce error propensity.\n","authors":["Jian Xiao","Zhenzhen Hu","Jia Li","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2410.06618v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11589v1","updated":"2024-12-16T09:20:29Z","published":"2024-12-16T09:20:29Z","title":"Future Sight and Tough Fights: Revolutionizing Sequential Recommendation\n with FENRec","summary":" Sequential recommendation (SR) systems predict user preferences by analyzing\ntime-ordered interaction sequences. A common challenge for SR is data sparsity,\nas users typically interact with only a limited number of items. While\ncontrastive learning has been employed in previous approaches to address the\nchallenges, these methods often adopt binary labels, missing finer patterns and\noverlooking detailed information in subsequent behaviors of users.\nAdditionally, they rely on random sampling to select negatives in contrastive\nlearning, which may not yield sufficiently hard negatives during later training\nstages. In this paper, we propose Future data utilization with Enduring\nNegatives for contrastive learning in sequential Recommendation (FENRec). Our\napproach aims to leverage future data with time-dependent soft labels and\ngenerate enduring hard negatives from existing data, thereby enhancing the\neffectiveness in tackling data sparsity. 
Experiment results demonstrate our\nstate-of-the-art performance across four benchmark datasets, with an average\nimprovement of 6.16\\% across all metrics.\n","authors":["Yu-Hsuan Huang","Ling Lo","Hongxia Xie","Hong-Han Shuai","Wen-Huang Cheng"],"pdf_url":"https://arxiv.org/pdf/2412.11589v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.11557v1","updated":"2024-12-16T08:42:43Z","published":"2024-12-16T08:42:43Z","title":"Enhancing Healthcare Recommendation Systems with a Multimodal LLMs-based\n MOE Architecture","summary":" With the increasing availability of multimodal data, many fields urgently\nrequire advanced architectures capable of effectively integrating these diverse\ndata sources to address specific problems. This study proposes a hybrid\nrecommendation model that combines the Mixture of Experts (MOE) framework with\nlarge language models to enhance the performance of recommendation systems in\nthe healthcare domain. We built a small dataset for recommending healthy food\nbased on patient descriptions and evaluated the model's performance on several\nkey metrics, including Precision, Recall, NDCG, and MAP@5. The experimental\nresults show that the hybrid model outperforms the baseline models, which use\nMOE or large language models individually, in terms of both accuracy and\npersonalized recommendation effectiveness. The paper finds image data provided\nrelatively limited improvement in the performance of the personalized\nrecommendation system, particularly in addressing the cold start problem. Then,\nthe issue of reclassification of images also affected the recommendation\nresults, especially when dealing with low-quality images or changes in the\nappearance of items, leading to suboptimal performance. The findings provide\nvaluable insights into the development of powerful, scalable, and\nhigh-performance recommendation systems, advancing the application of\npersonalized recommendation technologies in real-world domains such as\nhealthcare.\n","authors":["Jingyu Xu","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2412.11557v1.pdf","comment":"10 page, accpted by Conf-SMPL conference"},{"id":"http://arxiv.org/abs/2412.00430v4","updated":"2024-12-16T07:46:03Z","published":"2024-11-30T10:56:30Z","title":"Predictive Models in Sequential Recommendations: Bridging Performance\n Laws with Data Quality Insights","summary":" Sequential Recommendation (SR) plays a critical role in predicting users'\nsequential preferences. Despite its growing prominence in various industries,\nthe increasing scale of SR models incurs substantial computational costs and\nunpredictability, challenging developers to manage resources efficiently. Under\nthis predicament, Scaling Laws have achieved significant success by examining\nthe loss as models scale up. However, there remains a disparity between loss\nand model performance, which is of greater concern in practical applications.\nMoreover, as data continues to expand, it incorporates repetitive and\ninefficient data. In response, we introduce the Performance Law for SR models,\nwhich aims to theoretically investigate and model the relationship between\nmodel performance and data quality. Specifically, we first fit the HR and NDCG\nmetrics to transformer-based SR models. Subsequently, we propose Approximate\nEntropy (ApEn) to assess data quality, presenting a more nuanced approach\ncompared to traditional data quantity metrics. 
Our method enables accurate\npredictions across various dataset scales and model sizes, demonstrating a\nstrong correlation in large SR models and offering insights into achieving\noptimal performance for any given model configuration.\n","authors":["Tingjia Shen","Hao Wang","Chuhan Wu","Jin Yao Chin","Wei Guo","Yong Liu","Huifeng Guo","Defu Lian","Ruiming Tang","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2412.00430v4.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2412.11431v1","updated":"2024-12-16T04:03:58Z","published":"2024-12-16T04:03:58Z","title":"Optimized Quran Passage Retrieval Using an Expanded QA Dataset and\n Fine-Tuned Language Models","summary":" Understanding the deep meanings of the Qur'an and bridging the language gap\nbetween modern standard Arabic and classical Arabic is essential to improve the\nquestion-and-answer system for the Holy Qur'an. The Qur'an QA 2023 shared task\ndataset had a limited number of questions with weak model retrieval. To address\nthis challenge, this work updated the original dataset and improved the model\naccuracy. The original dataset, which contains 251 questions, was reviewed and\nexpanded to 629 questions with question diversification and reformulation,\nleading to a comprehensive set of 1895 categorized into single-answer,\nmulti-answer, and zero-answer types. Extensive experiments fine-tuned\ntransformer models, including AraBERT, RoBERTa, CAMeLBERT, AraELECTRA, and\nBERT. The best model, AraBERT-base, achieved a MAP@10 of 0.36 and MRR of 0.59,\nrepresenting improvements of 63% and 59%, respectively, compared to the\nbaseline scores (MAP@10: 0.22, MRR: 0.37). Additionally, the dataset expansion\nled to improvements in handling \"no answer\" cases, with the proposed approach\nachieving a 75% success rate for such instances, compared to the baseline's\n25%. These results demonstrate the effect of dataset improvement and model\narchitecture optimization in increasing the performance of QA systems for the\nHoly Qur'an, with higher accuracy, recall, and precision.\n","authors":["Mohamed Basem","Islam Oshallah","Baraa Hikal","Ali Hamdi","Ammar Mohamed"],"pdf_url":"https://arxiv.org/pdf/2412.11431v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2412.12331v1","updated":"2024-12-16T20:01:35Z","published":"2024-12-16T20:01:35Z","title":"Efficient Object-centric Representation Learning with Pre-trained\n Geometric Prior","summary":" This paper addresses key challenges in object-centric representation learning\nof video. While existing approaches struggle with complex scenes, we propose a\nnovel weakly-supervised framework that emphasises geometric understanding and\nleverages pre-trained vision models to enhance object discovery. Our method\nintroduces an efficient slot decoder specifically designed for object-centric\nlearning, enabling effective representation of multi-object scenes without\nrequiring explicit depth information. Results on synthetic video benchmarks\nwith increasing complexity in terms of objects and their movement, object\nocclusion and camera motion demonstrate that our approach achieves comparable\nperformance to supervised methods while maintaining computational efficiency.\nThis advances the field towards more practical applications in complex\nreal-world scenarios.\n","authors":["Phúc H. Le Khac","Graham Healy","Alan F. 
Smeaton"],"pdf_url":"https://arxiv.org/pdf/2412.12331v1.pdf","comment":"6 pages, 4 Figures, 2 Tables"},{"id":"http://arxiv.org/abs/2402.12121v2","updated":"2024-12-16T16:09:47Z","published":"2024-02-19T13:16:10Z","title":"IRR: Image Review Ranking Framework for Evaluating Vision-Language\n Models","summary":" Large-scale Vision-Language Models (LVLMs) process both images and text,\nexcelling in multimodal tasks such as image captioning and description\ngeneration. However, while these models excel at generating factual content,\ntheir ability to generate and evaluate texts reflecting perspectives on the\nsame image, depending on the context, has not been sufficiently explored. To\naddress this, we propose IRR: Image Review Rank, a novel evaluation framework\ndesigned to assess critic review texts from multiple perspectives. IRR\nevaluates LVLMs by measuring how closely their judgments align with human\ninterpretations. We validate it using a dataset of images from 15 categories,\neach with five critic review texts and annotated rankings in both English and\nJapanese, totaling over 2,000 data instances. The datasets are available at\nhttps://hf.co/datasets/naist-nlp/Wiki-ImageReview1.0. Our results indicate\nthat, although LVLMs exhibited consistent performance across languages, their\ncorrelation with human annotations was insufficient, highlighting the need for\nfurther advancements. These findings highlight the limitations of current\nevaluation methods and the need for approaches that better capture human\nreasoning in Vision & Language tasks.\n","authors":["Kazuki Hayashi","Kazuma Onishi","Toma Suzuki","Yusuke Ide","Seiji Gobara","Shigeki Saito","Yusuke Sakai","Hidetaka Kamigaito","Katsuhiko Hayashi","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2402.12121v2.pdf","comment":"18pages, Accepted at COLING25"},{"id":"http://arxiv.org/abs/2412.11851v1","updated":"2024-12-16T15:11:03Z","published":"2024-12-16T15:11:03Z","title":"A Benchmark and Robustness Study of In-Context-Learning with Large\n Language Models in Music Entity Detection","summary":" Detecting music entities such as song titles or artist names is a useful\napplication to help use cases like processing music search queries or analyzing\nmusic consumption on the web. Recent approaches incorporate smaller language\nmodels (SLMs) like BERT and achieve high results. However, further research\nindicates a high influence of entity exposure during pre-training on the\nperformance of the models. With the advent of large language models (LLMs),\nthese outperform SLMs in a variety of downstream tasks. However, researchers\nare still divided if this is applicable to tasks like entity detection in texts\ndue to issues like hallucination. In this paper, we provide a novel dataset of\nuser-generated metadata and conduct a benchmark and a robustness study using\nrecent LLMs with in-context-learning (ICL). Our results indicate that LLMs in\nthe ICL setting yield higher performance than SLMs. We further uncover the\nlarge impact of entity exposure on the best performing LLM in our study.\n","authors":["Simon Hachmeier","Robert Jäschke"],"pdf_url":"https://arxiv.org/pdf/2412.11851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11818v1","updated":"2024-12-16T14:35:32Z","published":"2024-12-16T14:35:32Z","title":"Leveraging User-Generated Metadata of Online Videos for Cover Song\n Identification","summary":" YouTube is a rich source of cover songs. 
Since the platform itself is\norganized in terms of videos rather than songs, the retrieval of covers is not\ntrivial. The field of cover song identification addresses this problem and\nprovides approaches that usually rely on audio content. However, including the\nuser-generated video metadata available on YouTube promises improved\nidentification results. In this paper, we propose a multi-modal approach for\ncover song identification on online video platforms. We combine the entity\nresolution models with audio-based approaches using a ranking model. Our\nfindings implicate that leveraging user-generated metadata can stabilize cover\nsong identification performance on YouTube.\n","authors":["Simon Hachmeier","Robert Jäschke"],"pdf_url":"https://arxiv.org/pdf/2412.11818v1.pdf","comment":"accepted for presentation at NLP for Music and Audio (NLP4MusA) 2024"},{"id":"http://arxiv.org/abs/2404.13282v2","updated":"2024-12-16T14:33:03Z","published":"2024-04-20T06:01:09Z","title":"Wills Aligner: Multi-Subject Collaborative Brain Visual Decoding","summary":" Decoding visual information from human brain activity has seen remarkable\nadvancements in recent research. However, the diversity in cortical\nparcellation and fMRI patterns across individuals has prompted the development\nof deep learning models tailored to each subject. The personalization limits\nthe broader applicability of brain visual decoding in real-world scenarios. To\naddress this issue, we introduce Wills Aligner, a novel approach designed to\nachieve multi-subject collaborative brain visual decoding. Wills Aligner begins\nby aligning the fMRI data from different subjects at the anatomical level. It\nthen employs delicate mixture-of-brain-expert adapters and a meta-learning\nstrategy to account for individual fMRI pattern differences. Additionally,\nWills Aligner leverages the semantic relation of visual stimuli to guide the\nlearning of inter-subject commonality, enabling visual decoding for each\nsubject to draw insights from other subjects' data. We rigorously evaluate our\nWills Aligner across various visual decoding tasks, including classification,\ncross-modal retrieval, and image reconstruction. The experimental results\ndemonstrate that Wills Aligner achieves promising performance.\n","authors":["Guangyin Bao","Qi Zhang","Zixuan Gong","Jialei Zhou","Wei Fan","Kun Yi","Usman Naseem","Liang Hu","Duoqian Miao"],"pdf_url":"https://arxiv.org/pdf/2404.13282v2.pdf","comment":"AAAI 2025, 16 pages"},{"id":"http://arxiv.org/abs/2404.12630v2","updated":"2024-12-16T13:59:51Z","published":"2024-04-19T05:12:04Z","title":"MindTuner: Cross-Subject Visual Decoding with Visual Fingerprint and\n Semantic Correction","summary":" Decoding natural visual scenes from brain activity has flourished, with\nextensive research in single-subject tasks and, however, less in cross-subject\ntasks. Reconstructing high-quality images in cross-subject tasks is a\nchallenging problem due to profound individual differences between subjects and\nthe scarcity of data annotation. In this work, we proposed MindTuner for\ncross-subject visual decoding, which achieves high-quality and rich semantic\nreconstructions using only 1 hour of fMRI training data benefiting from the\nphenomena of visual fingerprint in the human visual system and a novel\nfMRI-to-text alignment paradigm. Firstly, we pre-train a multi-subject model\namong 7 subjects and fine-tune it with scarce data on new subjects, where LoRAs\nwith Skip-LoRAs are utilized to learn the visual fingerprint. 
Then, we take the\nimage modality as the intermediate pivot modality to achieve fMRI-to-text\nalignment, which achieves impressive fMRI-to-text retrieval performance and\ncorrects fMRI-to-image reconstruction with fine-tuned semantics. The results of\nboth qualitative and quantitative analyses demonstrate that MindTuner surpasses\nstate-of-the-art cross-subject visual decoding models on the Natural Scenes\nDataset (NSD), whether using training data of 1 hour or 40 hours.\n","authors":["Zixuan Gong","Qi Zhang","Guangyin Bao","Lei Zhu","Ke Liu","Liang Hu","Duoqian Miao"],"pdf_url":"https://arxiv.org/pdf/2404.12630v2.pdf","comment":"AAAI 2025, 14 pages"},{"id":"http://arxiv.org/abs/2412.11762v1","updated":"2024-12-16T13:26:52Z","published":"2024-12-16T13:26:52Z","title":"GS-ProCams: Gaussian Splatting-based Projector-Camera Systems","summary":" We present GS-ProCams, the first Gaussian Splatting-based framework for\nprojector-camera systems (ProCams). GS-ProCams significantly enhances the\nefficiency of projection mapping (PM) that requires establishing geometric and\nradiometric mappings between the projector and the camera. Previous CNN-based\nProCams are constrained to a specific viewpoint, limiting their applicability\nto novel perspectives. In contrast, NeRF-based ProCams support view-agnostic\nprojection mapping, however, they require an additional colocated light source\nand demand significant computational and memory resources. To address this\nissue, we propose GS-ProCams that employs 2D Gaussian for scene\nrepresentations, and enables efficient view-agnostic ProCams applications. In\nparticular, we explicitly model the complex geometric and photometric mappings\nof ProCams using projector responses, the target surface's geometry and\nmaterials represented by Gaussians, and global illumination component. Then, we\nemploy differentiable physically-based rendering to jointly estimate them from\ncaptured multi-view projections. Compared to state-of-the-art NeRF-based\nmethods, our GS-ProCams eliminates the need for additional devices, achieving\nsuperior ProCams simulation quality. It is also 600 times faster and uses only\n1/10 of the GPU memory.\n","authors":["Qingyue Deng","Jijiang Li","Haibin Ling","Bingyao Huang"],"pdf_url":"https://arxiv.org/pdf/2412.11762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11715v1","updated":"2024-12-16T12:35:56Z","published":"2024-12-16T12:35:56Z","title":"Discrepancy-Aware Attention Network for Enhanced Audio-Visual Zero-Shot\n Learning","summary":" Audio-visual Zero-Shot Learning (ZSL) has attracted significant attention for\nits ability to identify unseen classes and perform well in video classification\ntasks. However, modal imbalance in (G)ZSL leads to over-reliance on the optimal\nmodality, reducing discriminative capabilities for unseen classes. Some studies\nhave attempted to address this issue by modifying parameter gradients, but two\nchallenges still remain: (a) Quality discrepancies, where modalities offer\ndiffering quantities and qualities of information for the same concept. (b)\nContent discrepancies, where sample contributions within a modality vary\nsignificantly. To address these challenges, we propose a Discrepancy-Aware\nAttention Network (DAAN) for Enhanced Audio-Visual ZSL. 
Our approach introduces\na Quality-Discrepancy Mitigation Attention (QDMA) unit to minimize redundant\ninformation in the high-quality modality and a Contrastive Sample-level\nGradient Modulation (CSGM) block to adjust gradient magnitudes and balance\ncontent discrepancies. We quantify modality contributions by integrating\noptimization and convergence rate for more precise gradient modulation in CSGM.\nExperiments demonstrates DAAN achieves state-of-the-art performance on\nbenchmark datasets, with ablation studies validating the effectiveness of\nindividual modules.\n","authors":["RunLin Yu","Yipu Gong","Wenrui Li","Aiwen Sun","Mengren Zheng"],"pdf_url":"https://arxiv.org/pdf/2412.11715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11663v1","updated":"2024-12-16T11:11:23Z","published":"2024-12-16T11:11:23Z","title":"LMM-Regularized CLIP Embeddings for Image Classification","summary":" In this paper we deal with image classification tasks using the powerful CLIP\nvision-language model. Our goal is to advance the classification performance\nusing the CLIP's image encoder, by proposing a novel Large Multimodal Model\n(LMM) based regularization method. The proposed method uses an LMM to extract\nsemantic descriptions for the images of the dataset. Then, it uses the CLIP's\ntext encoder, frozen, in order to obtain the corresponding text embeddings and\ncompute the mean semantic class descriptions. Subsequently, we adapt the CLIP's\nimage encoder by adding a classification head, and we train it along with the\nimage encoder output, apart from the main classification objective, with an\nadditional auxiliary objective. The additional objective forces the embeddings\nat the image encoder's output to become similar to their corresponding\nLMM-generated mean semantic class descriptions. In this way, it produces\nembeddings with enhanced discrimination ability, leading to improved\nclassification performance. The effectiveness of the proposed regularization\nmethod is validated through extensive experiments on three image classification\ndatasets.\n","authors":["Maria Tzelepi","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2412.11663v1.pdf","comment":"Accepted for publication, 26th Int. Symp. on Multimedia (IEEE ISM\n 2024), Tokyo, Japan, Dec. 2024. This is the authors' \"accepted version\""},{"id":"http://arxiv.org/abs/2412.11621v1","updated":"2024-12-16T10:08:38Z","published":"2024-12-16T10:08:38Z","title":"VG-TVP: Multimodal Procedural Planning via Visually Grounded Text-Video\n Prompting","summary":" Large Language Model (LLM)-based agents have shown promise in procedural\ntasks, but the potential of multimodal instructions augmented by texts and\nvideos to assist users remains under-explored. To address this gap, we propose\nthe Visually Grounded Text-Video Prompting (VG-TVP) method which is a novel\nLLM-empowered Multimodal Procedural Planning (MPP) framework. It generates\ncohesive text and video procedural plans given a specified high-level\nobjective. The main challenges are achieving textual and visual\ninformativeness, temporal coherence, and accuracy in procedural plans. VG-TVP\nleverages the zero-shot reasoning capability of LLMs, the video-to-text\ngeneration ability of the video captioning models, and the text-to-video\ngeneration ability of diffusion models. VG-TVP improves the interaction between\nmodalities by proposing a novel Fusion of Captioning (FoC) method and using\nText-to-Video Bridge (T2V-B) and Video-to-Text Bridge (V2T-B). 
They allow LLMs\nto guide the generation of visually-grounded text plans and textual-grounded\nvideo plans. To address the scarcity of datasets suitable for MPP, we have\ncurated a new dataset called Daily-Life Task Procedural Plans (Daily-PP). We\nconduct comprehensive experiments and benchmarks to evaluate human preferences\n(regarding textual and visual informativeness, temporal coherence, and plan\naccuracy). Our VG-TVP method outperforms unimodal baselines on the Daily-PP\ndataset.\n","authors":["Muhammet Furkan Ilaslan","Ali Koksal","Kevin Qinhong Lin","Burak Satar","Mike Zheng Shou","Qianli Xu"],"pdf_url":"https://arxiv.org/pdf/2412.11621v1.pdf","comment":"Accepted for The 39th Annual AAAI Conference on Artificial\n Intelligence 2025 in Main Track, 19 pages, 24 figures"},{"id":"http://arxiv.org/abs/2412.12225v1","updated":"2024-12-16T10:03:44Z","published":"2024-12-16T10:03:44Z","title":"DLF: Disentangled-Language-Focused Multimodal Sentiment Analysis","summary":" Multimodal Sentiment Analysis (MSA) leverages heterogeneous modalities, such\nas language, vision, and audio, to enhance the understanding of human\nsentiment. While existing models often focus on extracting shared information\nacross modalities or directly fusing heterogeneous modalities, such approaches\ncan introduce redundancy and conflicts due to equal treatment of all modalities\nand the mutual transfer of information between modality pairs. To address these\nissues, we propose a Disentangled-Language-Focused (DLF) multimodal\nrepresentation learning framework, which incorporates a feature disentanglement\nmodule to separate modality-shared and modality-specific information. To\nfurther reduce redundancy and enhance language-targeted features, four\ngeometric measures are introduced to refine the disentanglement process. A\nLanguage-Focused Attractor (LFA) is further developed to strengthen language\nrepresentation by leveraging complementary modality-specific information\nthrough a language-guided cross-attention mechanism. The framework also employs\nhierarchical predictions to improve overall accuracy. Extensive experiments on\ntwo popular MSA datasets, CMU-MOSI and CMU-MOSEI, demonstrate the significant\nperformance gains achieved by the proposed DLF framework. Comprehensive\nablation studies further validate the effectiveness of the feature\ndisentanglement module, language-focused attractor, and hierarchical\npredictions. Our code is available at https://github.com/pwang322/DLF.\n","authors":["Pan Wang","Qiang Zhou","Yawen Wu","Tianlong Chen","Jingtong Hu"],"pdf_url":"https://arxiv.org/pdf/2412.12225v1.pdf","comment":"AAAI 2025 accepted"},{"id":"http://arxiv.org/abs/2410.06618v2","updated":"2024-12-16T09:52:32Z","published":"2024-10-09T07:14:49Z","title":"Text Proxy: Decomposing Retrieval from a 1-to-N Relationship into N\n 1-to-1 Relationships for Text-Video Retrieval","summary":" Text-video retrieval (TVR) has seen substantial advancements in recent years,\nfueled by the utilization of pre-trained models and large language models\n(LLMs). Despite these advancements, achieving accurate matching in TVR remains\nchallenging due to inherent disparities between video and textual modalities\nand irregularities in data representation. 
In this paper, we propose\nText-Video-ProxyNet (TV-ProxyNet), a novel framework designed to decompose the\nconventional 1-to-N relationship of TVR into N distinct 1-to-1 relationships.\nBy replacing a single text query with a series of text proxies, TV-ProxyNet not\nonly broadens the query scope but also achieves a more precise expansion. Each\ntext proxy is crafted through a refined iterative process, controlled by\nmechanisms we term as the director and dash, which regulate the proxy's\ndirection and distance relative to the original text query. This setup not only\nfacilitates more precise semantic alignment but also effectively manages the\ndisparities and noise inherent in multimodal data. Our experiments on three\nrepresentative video-text retrieval benchmarks, MSRVTT, DiDeMo, and ActivityNet\nCaptions, demonstrate the effectiveness of TV-ProxyNet. The results show an\nimprovement of 2.0% to 3.3% in R@1 over the baseline. TV-ProxyNet achieved\nstate-of-the-art performance on MSRVTT and ActivityNet Captions, and a 2.0%\nimprovement on DiDeMo compared to existing methods, validating our approach's\nability to enhance semantic mapping and reduce error propensity.\n","authors":["Jian Xiao","Zhenzhen Hu","Jia Li","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2410.06618v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10958v2","updated":"2024-12-16T02:37:33Z","published":"2024-09-17T07:52:09Z","title":"Towards Effective User Attribution for Latent Diffusion Models via\n Watermark-Informed Blending","summary":" Rapid advancements in multimodal large language models have enabled the\ncreation of hyper-realistic images from textual descriptions. However, these\nadvancements also raise significant concerns about unauthorized use, which\nhinders their broader distribution. Traditional watermarking methods often\nrequire complex integration or degrade image quality. To address these\nchallenges, we introduce a novel framework Towards Effective user Attribution\nfor latent diffusion models via Watermark-Informed Blending (TEAWIB). TEAWIB\nincorporates a unique ready-to-use configuration approach that allows seamless\nintegration of user-specific watermarks into generative models. This approach\nensures that each user can directly apply a pre-configured set of parameters to\nthe model without altering the original model parameters or compromising image\nquality. Additionally, noise and augmentation operations are embedded at the\npixel level to further secure and stabilize watermarked images. Extensive\nexperiments validate the effectiveness of TEAWIB, showcasing the\nstate-of-the-art performance in perceptual quality and attribution accuracy.\n","authors":["Yongyang Pan","Xiaohong Liu","Siqi Luo","Yi Xin","Xiao Guo","Xiaoming Liu","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2409.10958v2.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.10719v3","updated":"2024-12-16T00:37:54Z","published":"2024-09-16T20:47:00Z","title":"Benchmarking VLMs' Reasoning About Persuasive Atypical Images","summary":" Vision language models (VLMs) have shown strong zero-shot generalization\nacross various tasks, especially when integrated with large language models\n(LLMs). However, their ability to comprehend rhetorical and persuasive visual\nmedia, such as advertisements, remains understudied. Ads often employ atypical\nimagery, using surprising object juxtapositions to convey shared properties.\nFor example, Fig. 1 (e) shows a beer with a feather-like texture. 
This requires\nadvanced reasoning to deduce that this atypical representation signifies the\nbeer's lightness. We introduce three novel tasks, Multi-label Atypicality\nClassification, Atypicality Statement Retrieval, and Aypical Object\nRecognition, to benchmark VLMs' understanding of atypicality in persuasive\nimages. We evaluate how well VLMs use atypicality to infer an ad's message and\ntest their reasoning abilities by employing semantically challenging negatives.\nFinally, we pioneer atypicality-aware verbalization by extracting comprehensive\nimage descriptions sensitive to atypical elements. Our findings reveal that:\n(1) VLMs lack advanced reasoning capabilities compared to LLMs; (2) simple,\neffective strategies can extract atypicality-aware information, leading to\ncomprehensive image verbalization; (3) atypicality aids persuasive\nadvertisement understanding. Code and data will be made available.\n","authors":["Sina Malakouti","Aysan Aghazadeh","Ashmit Khandelwal","Adriana Kovashka"],"pdf_url":"https://arxiv.org/pdf/2409.10719v3.pdf","comment":null}]},"2024-12-15T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2412.11203v1","updated":"2024-12-15T14:35:49Z","published":"2024-12-15T14:35:49Z","title":"Task-Oriented Dialog Systems for the Senegalese Wolof Language","summary":" In recent years, we are seeing considerable interest in conversational agents\nwith the rise of large language models (LLMs). Although they offer considerable\nadvantages, LLMs also present significant risks, such as hallucination, which\nhinder their widespread deployment in industry. Moreover, low-resource\nlanguages such as African ones are still underrepresented in these systems\nlimiting their performance in these languages. In this paper, we illustrate a\nmore classical approach based on modular architectures of Task-oriented Dialog\nSystems (ToDS) offering better control over outputs. We propose a chatbot\ngeneration engine based on the Rasa framework and a robust methodology for\nprojecting annotations onto the Wolof language using an in-house machine\ntranslation system. After evaluating a generated chatbot trained on the Amazon\nMassive dataset, our Wolof Intent Classifier performs similarly to the one\nobtained for French, which is a resource-rich language. We also show that this\napproach is extensible to other low-resource languages, thanks to the intent\nclassifier's language-agnostic pipeline, simplifying the design of chatbots in\nthese languages.\n","authors":["Derguene Mbaye","Moussa Diallo"],"pdf_url":"https://arxiv.org/pdf/2412.11203v1.pdf","comment":"10 pages, 3 tables, 6 figures, The 31st International Conference on\n Computational Linguistics (COLING 2025)"},{"id":"http://arxiv.org/abs/2412.12202v1","updated":"2024-12-15T09:23:14Z","published":"2024-12-15T09:23:14Z","title":"A multi-theoretical kernel-based approach to social network-based\n recommendation","summary":" Recommender systems are a critical component of e-commercewebsites. The rapid\ndevelopment of online social networking services provides an opportunity to\nexplore social networks together with information used in traditional\nrecommender systems, such as customer demographics, product characteristics,\nand transactions. It also provides more applications for recommender systems.\nTo tackle this social network-based recommendation problem, previous studies\ngenerally built trust models in light of the social influence theory. 
This\nstudy inspects a spectrumof social network theories to systematicallymodel\nthemultiple facets of a social network and infer user preferences. In order to\neffectively make use of these heterogonous theories, we take a kernel-based\nmachine learning paradigm, design and select kernels describing individual\nsimilarities according to social network theories, and employ a non-linear\nmultiple kernel learning algorithm to combine the kernels into a unified model.\nThis design also enables us to consider multiple theories' interactions in\nassessing individual behaviors. We evaluate our proposed approach on a\nreal-world movie review data set. The experiments show that our approach\nprovides more accurate recommendations than trust-based methods and the\ncollaborative filtering approach. Further analysis shows that kernels derived\nfrom contagion theory and homophily theory contribute a larger portion of the\nmodel.\n","authors":["Xin Li","Mengyue Wang","T. -P. Liang"],"pdf_url":"https://arxiv.org/pdf/2412.12202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.11127v1","updated":"2024-12-15T09:17:45Z","published":"2024-12-15T09:17:45Z","title":"Modeling the Heterogeneous Duration of User Interest in Time-Dependent\n Recommendation: A Hidden Semi-Markov Approach","summary":" Recommender systems are widely used for suggesting books, education\nmaterials, and products to users by exploring their behaviors. In reality,\nusers' preferences often change over time, leading to studies on time-dependent\nrecommender systems. However, most existing approaches that deal with time\ninformation remain primitive. In this paper, we extend existing methods and\npropose a hidden semi-Markov model to track the change of users' interests.\nParticularly, this model allows for capturing the different durations of user\nstays in a (latent) interest state, which can better model the heterogeneity of\nuser interests and focuses. We derive an expectation maximization algorithm to\nestimate the parameters of the framework and predict users' actions.\nExperiments on three real-world datasets show that our model significantly\noutperforms the state-of-the-art time-dependent and static benchmark methods.\nFurther analyses of the experiment results indicate that the performance\nimprovement is related to the heterogeneity of state durations and the drift of\nuser interests in the dataset.\n","authors":["Haidong Zhang","Wancheng Ni","Xin Li","Yiping Yang"],"pdf_url":"https://arxiv.org/pdf/2412.11127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14548v2","updated":"2024-12-15T08:36:19Z","published":"2023-09-25T21:45:30Z","title":"Algorithmic Collusion or Competition: the Role of Platforms' Recommender\n Systems","summary":" Recent scholarly work has extensively examined the phenomenon of algorithmic\ncollusion driven by AI-enabled pricing algorithms. However, online platforms\ncommonly deploy recommender systems that influence how consumers discover and\npurchase products, thereby shaping the reward structures faced by pricing\nalgorithms and ultimately affecting competition dynamics and equilibrium\noutcomes. To address this gap in the literature and elucidate the role of\nrecommender systems, we propose a novel repeated game framework that integrates\nseveral key components. We first develop a structural search model to\ncharacterize consumers' decision-making processes in response to varying\nrecommendation sets. 
This model incorporates both observable and unobservable\nheterogeneity in utility and search cost functions, and is estimated using\nreal-world data. Building on the resulting consumer model, we formulate\npersonalized recommendation algorithms designed to maximize either platform\nrevenue or consumer utility. We further introduce pricing algorithms for\nsellers and integrate all these elements to facilitate comprehensive numerical\nexperiments. Our experimental findings reveal that a revenue-maximizing\nrecommender system intensifies algorithmic collusion, whereas a\nutility-maximizing recommender system encourages more competitive pricing\nbehavior among sellers. Intriguingly, and contrary to conventional insights\nfrom the industrial organization and choice modeling literature, increasing the\nsize of recommendation sets under a utility-maximizing regime does not\nconsistently enhance consumer utility. Moreover, the degree of horizontal\ndifferentiation moderates this phenomenon in unexpected ways. The \"more is\nless\" effect does not arise at low levels of differentiation, but becomes\nincreasingly pronounced as horizontal differentiation increases.\n","authors":["Xingchen Xu","Stephanie Lee","Yong Tan"],"pdf_url":"https://arxiv.org/pdf/2309.14548v2.pdf","comment":"54 pages, 4 figures, 16 tables"},{"id":"http://arxiv.org/abs/2412.11105v1","updated":"2024-12-15T08:08:07Z","published":"2024-12-15T08:08:07Z","title":"Multi-Graph Co-Training for Capturing User Intent in Session-based\n Recommendation","summary":" Session-based recommendation focuses on predicting the next item a user will\ninteract with based on sequences of anonymous user sessions. A significant\nchallenge in this field is data sparsity due to the typically short-term\ninteractions. Most existing methods rely heavily on users' current\ninteractions, overlooking the wealth of auxiliary information available. To\naddress this, we propose a novel model, the Multi-Graph Co-Training model\n(MGCOT), which leverages not only the current session graph but also similar\nsession graphs and a global item relation graph. This approach allows for a\nmore comprehensive exploration of intrinsic relationships and better captures\nuser intent from multiple views, enabling session representations to complement\neach other. Additionally, MGCOT employs multi-head attention mechanisms to\neffectively capture relevant session intent and uses contrastive learning to\nform accurate and robust session representations. Extensive experiments on\nthree datasets demonstrate that MGCOT significantly enhances the performance of\nsession-based recommendations, particularly on the Diginetica dataset,\nachieving improvements up to 2.00% in P@20 and 10.70% in MRR@20. Resources have\nbeen made publicly available in our GitHub repository\nhttps://github.com/liang-tian-tian/MGCOT.\n","authors":["Zhe Yang","Tiantian Liang"],"pdf_url":"https://arxiv.org/pdf/2412.11105v1.pdf","comment":"COLING 2025 Main Conference"},{"id":"http://arxiv.org/abs/2412.11087v1","updated":"2024-12-15T07:09:02Z","published":"2024-12-15T07:09:02Z","title":"Leveraging Large Vision-Language Model as User Intent-aware Encoder for\n Composed Image Retrieval","summary":" Composed Image Retrieval (CIR) aims to retrieve target images from candidate\nset using a hybrid-modality query consisting of a reference image and a\nrelative caption that describes the user intent. 
Recent studies attempt to\nutilize Vision-Language Pre-training Models (VLPMs) with various fusion\nstrategies for addressing the task.However, these methods typically fail to\nsimultaneously meet two key requirements of CIR: comprehensively extracting\nvisual information and faithfully following the user intent. In this work, we\npropose CIR-LVLM, a novel framework that leverages the large vision-language\nmodel (LVLM) as the powerful user intent-aware encoder to better meet these\nrequirements. Our motivation is to explore the advanced reasoning and\ninstruction-following capabilities of LVLM for accurately understanding and\nresponding the user intent. Furthermore, we design a novel hybrid intent\ninstruction module to provide explicit intent guidance at two levels: (1) The\ntask prompt clarifies the task requirement and assists the model in discerning\nuser intent at the task level. (2) The instance-specific soft prompt, which is\nadaptively selected from the learnable prompt pool, enables the model to better\ncomprehend the user intent at the instance level compared to a universal prompt\nfor all instances. CIR-LVLM achieves state-of-the-art performance across three\nprominent benchmarks with acceptable inference efficiency. We believe this\nstudy provides fundamental insights into CIR-related fields.\n","authors":["Zelong Sun","Dong Jing","Guoxing Yang","Nanyi Fei","Zhiwu Lu"],"pdf_url":"https://arxiv.org/pdf/2412.11087v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.11068v1","updated":"2024-12-15T05:57:36Z","published":"2024-12-15T05:57:36Z","title":"RecSys Arena: Pair-wise Recommender System Evaluation with Large\n Language Models","summary":" Evaluating the quality of recommender systems is critical for algorithm\ndesign and optimization. Most evaluation methods are computed based on offline\nmetrics for quick algorithm evolution, since online experiments are usually\nrisky and time-consuming. However, offline evaluation usually cannot fully\nreflect users' preference for the outcome of different recommendation\nalgorithms, and the results may not be consistent with online A/B test.\nMoreover, many offline metrics such as AUC do not offer sufficient information\nfor comparing the subtle differences between two competitive recommender\nsystems in different aspects, which may lead to substantial performance\ndifferences in long-term online serving. Fortunately, due to the strong\ncommonsense knowledge and role-play capability of large language models (LLMs),\nit is possible to obtain simulated user feedback on offline recommendation\nresults. Motivated by the idea of LLM Chatbot Arena, in this paper we present\nthe idea of RecSys Arena, where the recommendation results given by two\ndifferent recommender systems in each session are evaluated by an LLM judger to\nobtain fine-grained evaluation feedback. More specifically, for each sample we\nuse LLM to generate a user profile description based on user behavior history\nor off-the-shelf profile features, which is used to guide LLM to play the role\nof this user and evaluate the relative preference for two recommendation\nresults generated by different models. Through extensive experiments on two\nrecommendation datasets in different scenarios, we demonstrate that many\ndifferent LLMs not only provide general evaluation results that are highly\nconsistent with canonical offline metrics, but also provide rich insight in\nmany subjective aspects. 
Moreover, it can better distinguish different\nalgorithms with comparable performance in terms of AUC and nDCG.\n","authors":["Zhuo Wu","Qinglin Jia","Chuhan Wu","Zhaocheng Du","Shuai Wang","Zan Wang","Zhenhua Dong"],"pdf_url":"https://arxiv.org/pdf/2412.11068v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2403.05050v4","updated":"2024-12-15T20:29:34Z","published":"2024-03-08T04:53:53Z","title":"DyRoNet: Dynamic Routing and Low-Rank Adapters for Autonomous Driving\n Streaming Perception","summary":" The advancement of autonomous driving systems hinges on the ability to\nachieve low-latency and high-accuracy perception. To address this critical\nneed, this paper introduces Dynamic Routing Network (DyRoNet), a low-rank\nenhanced dynamic routing framework designed for streaming perception in\nautonomous driving systems. DyRoNet integrates a suite of pre-trained branch\nnetworks, each meticulously fine-tuned to function under distinct environmental\nconditions. At its core, the framework offers a speed router module, developed\nto assess and route input data to the most suitable branch for processing. This\napproach not only addresses the inherent limitations of conventional models in\nadapting to diverse driving conditions but also ensures the balance between\nperformance and efficiency. Extensive experimental evaluations demonstrate the\nadaptability of DyRoNet to diverse branch selection strategies, resulting in\nsignificant performance enhancements across different scenarios. This work\nestablishes a new benchmark for streaming perception and provides valuable\nengineering insights for future work.\n","authors":["Xiang Huang","Zhi-Qi Cheng","Jun-Yan He","Chenyang Li","Wangmeng Xiang","Baigui Sun"],"pdf_url":"https://arxiv.org/pdf/2403.05050v4.pdf","comment":"Accepted to WACV 2025. 17 pages, 8 figures. Project:\n https://tastevision.github.io/DyRoNet/"},{"id":"http://arxiv.org/abs/2412.12206v1","updated":"2024-12-15T16:10:10Z","published":"2024-12-15T16:10:10Z","title":"Provably Secure Robust Image Steganography via Cross-Modal Error\n Correction","summary":" The rapid development of image generation models has facilitated the\nwidespread dissemination of generated images on social networks, creating\nfavorable conditions for provably secure image steganography. However, existing\nmethods face issues such as low quality of generated images and lack of\nsemantic control in the generation process. To leverage provably secure\nsteganography with more effective and high-performance image generation models,\nand to ensure that stego images can accurately extract secret messages even\nafter being uploaded to social networks and subjected to lossy processing such\nas JPEG compression, we propose a high-quality, provably secure, and robust\nimage steganography method based on state-of-the-art autoregressive (AR) image\ngeneration models using Vector-Quantized (VQ) tokenizers. Additionally, we\nemploy a cross-modal error-correction framework that generates stego text from\nstego images to aid in restoring lossy images, ultimately enabling the\nextraction of secret messages embedded within the images. Extensive experiments\nhave demonstrated that the proposed method provides advantages in stego\nquality, embedding capacity, and robustness, while ensuring provable\nundetectability.\n","authors":["Yuang Qi","Kejiang Chen","Na Zhao","Zijin Yang","Weiming Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.12206v1.pdf","comment":"7 pages. 
Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2404.12900v2","updated":"2024-12-15T14:53:00Z","published":"2024-04-19T14:13:46Z","title":"Training-and-Prompt-Free General Painterly Harmonization via Zero-Shot\n Disentenglement on Style and Content References","summary":" Painterly image harmonization aims at seamlessly blending disparate visual\nelements within a single image. However, previous approaches often struggle due\nto limitations in training data or reliance on additional prompts, leading to\ninharmonious and content-disrupted output. To surmount these hurdles, we design\na Training-and-prompt-Free General Painterly Harmonization method (TF-GPH).\nTF-GPH incorporates a novel ``Similarity Disentangle Mask'', which disentangles\nthe foreground content and background image by redirecting their attention to\ncorresponding reference images, enhancing the attention mechanism for\nmulti-image inputs. Additionally, we propose a ``Similarity Reweighting''\nmechanism to balance harmonization between stylization and content\npreservation. This mechanism minimizes content disruption by prioritizing the\ncontent-similar features within the given background style reference. Finally,\nwe address the deficiencies in existing benchmarks by proposing novel\nrange-based evaluation metrics and a new benchmark to better reflect real-world\napplications. Extensive experiments demonstrate the efficacy of our method in\nall benchmarks. More detailed in https://github.com/BlueDyee/TF-GPH.\n","authors":["Teng-Fang Hsiao","Bo-Kai Ruan","Hong-Han Shuai"],"pdf_url":"https://arxiv.org/pdf/2404.12900v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19723v2","updated":"2024-12-15T14:24:47Z","published":"2024-03-28T03:20:54Z","title":"HeGTa: Leveraging Heterogeneous Graph-enhanced Large Language Models for\n Few-shot Complex Table Understanding","summary":" Table understanding (TU) has achieved promising advancements, but it faces\nthe challenges of the scarcity of manually labeled tables and the presence of\ncomplex table structures.To address these challenges, we propose HGT, a\nframework with a heterogeneous graph (HG)-enhanced large language model (LLM)\nto tackle few-shot TU tasks.It leverages the LLM by aligning the table\nsemantics with the LLM's parametric knowledge through soft prompts and\ninstruction turning and deals with complex tables by a multi-task pre-training\nscheme involving three novel multi-granularity self-supervised HG pre-training\nobjectives.We empirically demonstrate the effectiveness of HGT, showing that it\noutperforms the SOTA for few-shot complex TU on several benchmarks.\n","authors":["Rihui Jin","Yu Li","Guilin Qi","Nan Hu","Yuan-Fang Li","Jiaoyan Chen","Jianan Wang","Yongrui Chen","Dehai Min","Sheng Bi"],"pdf_url":"https://arxiv.org/pdf/2403.19723v2.pdf","comment":"AAAI 2025"},{"id":"http://arxiv.org/abs/2403.08505v3","updated":"2024-12-15T02:07:57Z","published":"2024-03-13T13:12:57Z","title":"CAMSIC: Content-aware Masked Image Modeling Transformer for Stereo Image\n Compression","summary":" Existing learning-based stereo image codec adopt sophisticated transformation\nwith simple entropy models derived from single image codecs to encode latent\nrepresentations. However, those entropy models struggle to effectively capture\nthe spatial-disparity characteristics inherent in stereo images, which leads to\nsuboptimal rate-distortion results. In this paper, we propose a stereo image\ncompression framework, named CAMSIC. 
CAMSIC independently transforms each image\nto latent representation and employs a powerful decoder-free Transformer\nentropy model to capture both spatial and disparity dependencies, by\nintroducing a novel content-aware masked image modeling (MIM) technique. Our\ncontent-aware MIM facilitates efficient bidirectional interaction between prior\ninformation and estimated tokens, which naturally obviates the need for an\nextra Transformer decoder. Experiments show that our stereo image codec\nachieves state-of-the-art rate-distortion performance on two stereo image\ndatasets Cityscapes and InStereo2K with fast encoding and decoding speed. Code\nis available at https://github.com/Xinjie-Q/CAMSIC.\n","authors":["Xinjie Zhang","Shenyuan Gao","Zhening Liu","Jiawei Shao","Xingtong Ge","Dailan He","Tongda Xu","Yan Wang","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08505v3.pdf","comment":"Accepted by AAAI 2025"}]},"2024-12-14T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2412.10858v1","updated":"2024-12-14T15:14:39Z","published":"2024-12-14T15:14:39Z","title":"CRENER: A Character Relation Enhanced Chinese NER Model","summary":" Chinese Named Entity Recognition (NER) is an important task in information\nextraction, which has a significant impact on downstream applications. Due to\nthe lack of natural separators in Chinese, previous NER methods mostly relied\non external dictionaries to enrich the semantic and boundary information of\nChinese words. However, such methods may introduce noise that affects the\naccuracy of named entity recognition. To this end, we propose a character\nrelation enhanced Chinese NER model (CRENER). This model defines four types of\ntags that reflect the relationships between characters, and proposes a\nfine-grained modeling of the relationships between characters based on three\ntypes of relationships: adjacency relations between characters, relations\nbetween characters and tags, and relations between tags, to more accurately\nidentify entity boundaries and improve Chinese NER accuracy. Specifically, we\ntransform the Chinese NER task into a character-character relationship\nclassification task, ensuring the accuracy of entity boundary recognition\nthrough joint modeling of relation tags. To enhance the model's ability to\nunderstand contextual information, WRENER further constructed an adapted\ntransformer encoder that combines unscaled direction-aware and distance-aware\nmasked self-attention mechanisms. Moreover, a relationship representation\nenhancement module was constructed to model predefined relationship tags,\neffectively mining the relationship representations between characters and\ntags. Experiments conducted on four well-known Chinese NER benchmark datasets\nhave shown that the proposed model outperforms state-of-the-art baselines. The\nablation experiment also demonstrated the effectiveness of the proposed model.\n","authors":["Yaqiong Qiao","Shixuan Peng"],"pdf_url":"https://arxiv.org/pdf/2412.10858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09560v3","updated":"2024-12-14T14:14:26Z","published":"2023-04-19T10:59:34Z","title":"An Offline Metric for the Debiasedness of Click Models","summary":" A well-known problem when learning from user clicks are inherent biases\nprevalent in the data, such as position or trust bias. 
Click models are a\ncommon method for extracting information from user clicks, such as document\nrelevance in web search, or to estimate click biases for downstream\napplications such as counterfactual learning-to-rank, ad placement, or fair\nranking. Recent work shows that the current evaluation practices in the\ncommunity fail to guarantee that a well-performing click model generalizes well\nto downstream tasks in which the ranking distribution differs from the training\ndistribution, i.e., under covariate shift. In this work, we propose an\nevaluation metric based on conditional independence testing to detect a lack of\nrobustness to covariate shift in click models. We introduce the concept of\ndebiasedness in click modeling and derive a metric for measuring it. In\nextensive semi-synthetic experiments, we show that our proposed metric helps to\npredict the downstream performance of click models under covariate shift and is\nuseful in an off-policy model selection setting.\n","authors":["Romain Deffayet","Philipp Hager","Jean-Michel Renders","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2304.09560v3.pdf","comment":"SIGIR23 - Full paper"},{"id":"http://arxiv.org/abs/2412.10787v1","updated":"2024-12-14T10:49:00Z","published":"2024-12-14T10:49:00Z","title":"Why Not Together? A Multiple-Round Recommender System for Queries and\n Items","summary":" A fundamental technique of recommender systems involves modeling user\npreferences, where queries and items are widely used as symbolic\nrepresentations of user interests. Queries delineate user needs at an abstract\nlevel, providing a high-level description, whereas items operate on a more\nspecific and concrete level, representing the granular facets of user\npreference. While practical, both query and item recommendations encounter the\nchallenge of sparse user feedback. To this end, we propose a novel approach\nnamed Multiple-round Auto Guess-and-Update System (MAGUS) that capitalizes on\nthe synergies between both types, allowing us to leverage both query and item\ninformation to form user interests. This integrated system introduces a\nrecursive framework that could be applied to any recommendation method to\nexploit queries and items in historical interactions and to provide\nrecommendations for both queries and items in each interaction round. Empirical\nresults from testing 12 different recommendation methods demonstrate that\nintegrating queries into item recommendations via MAGUS significantly enhances\nthe efficiency, with which users can identify their preferred items during\nmultiple-round interactions.\n","authors":["Jiarui Jin","Xianyu Chen","Weinan Zhang","Yong Yu","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2412.10787v1.pdf","comment":"KDD 2025"},{"id":"http://arxiv.org/abs/2412.10770v1","updated":"2024-12-14T09:47:21Z","published":"2024-12-14T09:47:21Z","title":"Learned Data Compression: Challenges and Opportunities for the Future","summary":" Compressing integer keys is a fundamental operation among multiple\ncommunities, such as database management (DB), information retrieval (IR), and\nhigh-performance computing (HPC). Recent advances in \\emph{learned indexes}\nhave inspired the development of \\emph{learned compressors}, which leverage\nsimple yet compact machine learning (ML) models to compress large-scale sorted\nkeys. 
The core idea behind learned compressors is to \\emph{losslessly} encode\nsorted keys by approximating them with \\emph{error-bounded} ML models (e.g.,\npiecewise linear functions) and using a \\emph{residual array} to guarantee\naccurate key reconstruction.\n While the concept of learned compressors remains in its early stages of\nexploration, our benchmark results demonstrate that an SIMD-optimized learned\ncompressor can significantly outperform state-of-the-art CPU-based compressors.\nDrawing on our preliminary experiments, this vision paper explores the\npotential of learned data compression to enhance critical areas in DBMS and\nrelated domains. Furthermore, we outline the key technical challenges that\nexisting systems must address when integrating this emerging methodology.\n","authors":["Qiyu Liu","Siyuan Han","Jianwei Liao","Jin Li","Jingshu Peng","Jun Du","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2412.10770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.10745v1","updated":"2024-12-14T08:28:52Z","published":"2024-12-14T08:28:52Z","title":"Enhancing Event Extraction from Short Stories through Contextualized\n Prompts","summary":" Event extraction is an important natural language processing (NLP) task of\nidentifying events in an unstructured text. Although a plethora of works deal\nwith event extraction from new articles, clinical text etc., only a few works\nfocus on event extraction from literary content. Detecting events in short\nstories presents several challenges to current systems, encompassing a\ndifferent distribution of events as compared to other domains and the portrayal\nof diverse emotional conditions. This paper presents \\texttt{Vrittanta-EN}, a\ncollection of 1000 English short stories annotated for real events. Exploring\nthis field could result in the creation of techniques and resources that\nsupport literary scholars in improving their effectiveness. This could\nsimultaneously influence the field of Natural Language Processing. Our\nobjective is to clarify the intricate idea of events in the context of short\nstories. Towards the objective, we collected 1,000 short stories written mostly\nfor children in the Indian context. Further, we present fresh guidelines for\nannotating event mentions and their categories, organized into \\textit{seven\ndistinct classes}. The classes are {\\tt{COGNITIVE-MENTAL-STATE(CMS),\nCOMMUNICATION(COM), CONFLICT(CON), GENERAL-ACTIVITY(GA), LIFE-EVENT(LE),\nMOVEMENT(MOV), and OTHERS(OTH)}}. Subsequently, we apply these guidelines to\nannotate the short story dataset. Later, we apply the baseline methods for\nautomatically detecting and categorizing events. We also propose a prompt-based\nmethod for event detection and classification. 
The proposed method outperforms\nthe baselines, while having significant improvement of more than 4\\% for the\nclass \\texttt{CONFLICT} in event classification task.\n","authors":["Chaitanya Kirti","Ayon Chattopadhyay","Ashish Anand","Prithwijit Guha"],"pdf_url":"https://arxiv.org/pdf/2412.10745v1.pdf","comment":"47 pages, 8 figures, Planning to submit in Elsevier (Computer Speech\n and Language Journal)"},{"id":"http://arxiv.org/abs/2412.10737v1","updated":"2024-12-14T08:18:23Z","published":"2024-12-14T08:18:23Z","title":"Sentiment and Hashtag-aware Attentive Deep Neural Network for Multimodal\n Post Popularity Prediction","summary":" Social media users articulate their opinions on a broad spectrum of subjects\nand share their experiences through posts comprising multiple modes of\nexpression, leading to a notable surge in such multimodal content on social\nmedia platforms. Nonetheless, accurately forecasting the popularity of these\nposts presents a considerable challenge. Prevailing methodologies primarily\ncenter on the content itself, thereby overlooking the wealth of information\nencapsulated within alternative modalities such as visual demographics,\nsentiments conveyed through hashtags and adequately modeling the intricate\nrelationships among hashtags, texts, and accompanying images. This oversight\nlimits the ability to capture emotional connection and audience relevance,\nsignificantly influencing post popularity. To address these limitations, we\npropose a seNtiment and hAshtag-aware attentive deep neuRal netwoRk for\nmultimodAl posT pOpularity pRediction, herein referred to as NARRATOR that\nextracts visual demographics from faces appearing in images and discerns\nsentiment from hashtag usage, providing a more comprehensive understanding of\nthe factors influencing post popularity Moreover, we introduce a hashtag-guided\nattention mechanism that leverages hashtags as navigational cues, guiding the\nmodels focus toward the most pertinent features of textual and visual\nmodalities, thus aligning with target audience interests and broader social\nmedia context. Experimental results demonstrate that NARRATOR outperforms\nexisting methods by a significant margin on two real-world datasets.\nFurthermore, ablation studies underscore the efficacy of integrating visual\ndemographics, sentiment analysis of hashtags, and hashtag-guided attention\nmechanisms in enhancing the performance of post popularity prediction, thereby\nfacilitating increased audience relevance, emotional engagement, and aesthetic\nappeal.\n","authors":["Shubhi Bansal","Mohit Kumar","Chandravardhan Singh Raghaw","Nagendra Kumar"],"pdf_url":"https://arxiv.org/pdf/2412.10737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.10714v1","updated":"2024-12-14T06:56:46Z","published":"2024-12-14T06:56:46Z","title":"Movie Recommendation using Web Crawling","summary":" In today's digital world, streaming platforms offer a vast array of movies,\nmaking it hard for users to find content matching their preferences. This paper\nexplores integrating real time data from popular movie websites using advanced\nHTML scraping techniques and APIs. It also incorporates a recommendation system\ntrained on a static Kaggle dataset, enhancing the relevance and freshness of\nsuggestions. By combining content based filtering, collaborative filtering, and\na hybrid model, we create a system that utilizes both historical and real time\ndata for more personalized suggestions. 
Our methodology shows that\nincorporating dynamic data not only boosts user satisfaction but also aligns\nrecommendations with current viewing trends.\n","authors":["Pronit Raj","Chandrashekhar Kumar","Harshit Shekhar","Amit Kumar","Kritibas Paul","Debasish Jana"],"pdf_url":"https://arxiv.org/pdf/2412.10714v1.pdf","comment":"12 pages, 3 figures, Accepted and to be published in Proceedings of\n 2025 International Conference on Applied Algorithms (ICAA), Kolkata, India,\n Dec 8-10, 2025"},{"id":"http://arxiv.org/abs/2412.10701v1","updated":"2024-12-14T06:18:19Z","published":"2024-12-14T06:18:19Z","title":"Beyond Quantile Methods: Improved Top-K Threshold Estimation for\n Traditional and Learned Sparse Indexes","summary":" Top-k threshold estimation is the problem of estimating the score of the k-th\nhighest ranking result of a search query. A good estimate can be used to speed\nup many common top-k query processing algorithms, and thus a number of\nresearchers have recently studied the problem. Among the various approaches\nthat have been proposed, quantile methods appear to give the best estimates\noverall at modest computational costs, followed by sampling-based methods in\ncertain cases. In this paper, we make two main contributions. First, we study\nhow to get even better estimates than the state of the art. Starting from\nquantile-based methods, we propose a series of enhancements that give improved\nestimates in terms of the commonly used mean under-prediction fraction (MUF).\nSecond, we study the threshold estimation problem on recently proposed learned\nsparse index structures, showing that our methods also work well for these\ncases. Our best methods substantially narrow the gap between the state of the\nart and the ideal MUF of 1.0, at some additional cost in time and space.\n","authors":["Jinrui Gou","Yifan Liu","Minghao Shao","Torsten Suel"],"pdf_url":"https://arxiv.org/pdf/2412.10701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18069v2","updated":"2024-12-14T05:56:10Z","published":"2024-11-27T05:43:00Z","title":"Overview of TREC 2024 Biomedical Generative Retrieval (BioGen) Track","summary":" With the advancement of large language models (LLMs), the biomedical domain\nhas seen significant progress and improvement in multiple tasks such as\nbiomedical question answering, lay language summarization of the biomedical\nliterature, clinical note summarization, etc. However, hallucinations or\nconfabulations remain one of the key challenges when using LLMs in the\nbiomedical and other domains. Inaccuracies may be particularly harmful in\nhigh-risk situations, such as medical question answering, making clinical\ndecisions, or appraising biomedical research. Studies on the evaluation of the\nLLMs abilities to ground generated statements in verifiable sources have shown\nthat models perform significantly worse on lay-user-generated questions, and\noften fail to reference relevant sources. This can be problematic when those\nseeking information want evidence from studies to back up the claims from LLMs.\nUnsupported statements are a major barrier to using LLMs in any applications\nthat may affect health. Methods for grounding generated statements in reliable\nsources along with practical evaluation approaches are needed to overcome this\nbarrier. 
Towards this, in our pilot task organized at TREC 2024, we introduced\nthe task of reference attribution as a means to mitigate the generation of\nfalse statements by LLMs answering biomedical questions.\n","authors":["Deepak Gupta","Dina Demner-Fushman","William Hersh","Steven Bedrick","Kirk Roberts"],"pdf_url":"https://arxiv.org/pdf/2411.18069v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.10680v1","updated":"2024-12-14T04:59:38Z","published":"2024-12-14T04:59:38Z","title":"UCDR-Adapter: Exploring Adaptation of Pre-Trained Vision-Language Models\n for Universal Cross-Domain Retrieval","summary":" Universal Cross-Domain Retrieval (UCDR) retrieves relevant images from unseen\ndomains and classes without semantic labels, ensuring robust generalization.\nExisting methods commonly employ prompt tuning with pre-trained vision-language\nmodels but are inherently limited by static prompts, reducing adaptability. We\npropose UCDR-Adapter, which enhances pre-trained models with adapters and\ndynamic prompt generation through a two-phase training strategy. First, Source\nAdapter Learning integrates class semantics with domain-specific visual\nknowledge using a Learnable Textual Semantic Template and optimizes Class and\nDomain Prompts via momentum updates and dual loss functions for robust\nalignment. Second, Target Prompt Generation creates dynamic prompts by\nattending to masked source prompts, enabling seamless adaptation to unseen\ndomains and classes. Unlike prior approaches, UCDR-Adapter dynamically adapts\nto evolving data distributions, enhancing both flexibility and generalization.\nDuring inference, only the image branch and generated prompts are used,\neliminating reliance on textual inputs for highly efficient retrieval.\nExtensive benchmark experiments show that UCDR-Adapter consistently outperforms\nProS in most cases and other state-of-the-art methods on UCDR, U(c)CDR, and\nU(d)CDR settings.\n","authors":["Haoyu Jiang","Zhi-Qi Cheng","Gabriel Moreira","Jiawen Zhu","Jingdong Sun","Bukun Ren","Jun-Yan He","Qi Dai","Xian-Sheng Hua"],"pdf_url":"https://arxiv.org/pdf/2412.10680v1.pdf","comment":"Accepted to WACV 2025. Project link:\n https://github.com/fine68/UCDR2024"},{"id":"http://arxiv.org/abs/2412.10674v1","updated":"2024-12-14T04:22:09Z","published":"2024-12-14T04:22:09Z","title":"USM: Unbiased Survey Modeling for Limiting Negative User Experiences in\n Recommendation Systems","summary":" Negative feedback signals are crucial to guardrail content recommendations\nand improve user experience. When these signals are effectively integrated into\nrecommendation systems, they play a vital role in preventing the promotion of\nharmful or undesirable content, thereby contributing to a healthier online\nenvironment. However, the challenges associated with negative signals are\nnoteworthy. Due to the limited visibility of options for users to express\nnegative feedback, these signals are often sparse compared to positive signals.\nThis imbalance can lead to a skewed understanding of user preferences,\nresulting in recommendations that prioritize short-term engagement over\nlong-term satisfaction. Moreover, an over-reliance on positive signals can\ncreate a filter bubble, where users are continuously exposed to content that\naligns with their immediate preferences but may not be beneficial in the long\nrun. This scenario can ultimately lead to user attrition as audiences become\ndisillusioned with the quality of the content provided. 
Additionally, existing\nuser signals frequently fail to meet specific customized requirements, such as\nunderstanding the underlying reasons for a user's likes or dislikes regarding a\nvideo. This lack of granularity hinders our ability to tailor content\nrecommendations effectively, as we cannot identify the particular attributes of\ncontent that resonate with individual users.\n","authors":["Chenghui Yu","Peiyi Li","Haoze Wu","Bingfeng Deng","Hongyu Xiong"],"pdf_url":"https://arxiv.org/pdf/2412.10674v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2412.04506v2","updated":"2024-12-14T00:13:09Z","published":"2024-12-03T22:59:36Z","title":"Arctic-Embed 2.0: Multilingual Retrieval Without Compromise","summary":" This paper presents the training methodology of Arctic-Embed 2.0, a set of\nopen-source text embedding models built for accurate and efficient multilingual\nretrieval. While prior works have suffered from degraded English retrieval\nquality, Arctic-Embed 2.0 delivers competitive retrieval quality on\nmultilingual and English-only benchmarks, and supports Matryoshka\nRepresentation Learning (MRL) for efficient embedding storage with\nsignificantly lower compressed quality degradation compared to alternatives. We\ndetail the design and implementation, presenting several important open\nresearch questions that arose during model development. We conduct experiments\nexploring these research questions and include extensive discussion aimed at\nfostering further discussion in this field.\n","authors":["Puxuan Yu","Luke Merrick","Gaurav Nuti","Daniel Campos"],"pdf_url":"https://arxiv.org/pdf/2412.04506v2.pdf","comment":"10 pages, 5 figures, 3 tables"}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.08174v2","updated":"2024-12-14T10:34:35Z","published":"2024-10-10T17:50:42Z","title":"Sample then Identify: A General Framework for Risk Control and\n Assessment in Multimodal Large Language Models","summary":" Multimodal Large Language Models (MLLMs) exhibit promising advancements\nacross various tasks, yet they still encounter significant trustworthiness\nissues. Prior studies apply Split Conformal Prediction (SCP) in language\nmodeling to construct prediction sets with statistical guarantees. However,\nthese methods typically rely on internal model logits or are restricted to\nmultiple-choice settings, which hampers their generalizability and adaptability\nin dynamic, open-ended environments. In this paper, we introduce TRON, a\ntwo-step framework for risk control and assessment, applicable to any MLLM that\nsupports sampling in both open-ended and closed-ended scenarios. TRON comprises\ntwo main components: (1) a novel conformal score to sample response sets of\nminimum size, and (2) a nonconformity score to identify high-quality responses\nbased on self-consistency theory, controlling the error rates by two specific\nrisk levels. Furthermore, we investigate semantic redundancy in prediction sets\nwithin open-ended contexts for the first time, leading to a promising\nevaluation metric for MLLMs based on average set size. Our comprehensive\nexperiments across four Video Question-Answering (VideoQA) datasets utilizing\neight MLLMs show that TRON achieves desired error rates bounded by two\nuser-specified risk levels. 
Additionally, deduplicated prediction sets maintain\nadaptiveness while being more efficient and stable for risk assessment under\ndifferent risk levels.\n","authors":["Qingni Wang","Tiantian Geng","Zhiyuan Wang","Teng Wang","Bo Fu","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2410.08174v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.10768v1","updated":"2024-12-14T09:36:10Z","published":"2024-12-14T09:36:10Z","title":"VinTAGe: Joint Video and Text Conditioning for Holistic Audio Generation","summary":" Recent advances in audio generation have focused on text-to-audio (T2A) and\nvideo-to-audio (V2A) tasks. However, T2A or V2A methods cannot generate\nholistic sounds (onscreen and off-screen). This is because T2A cannot generate\nsounds aligning with onscreen objects, while V2A cannot generate semantically\ncomplete (offscreen sounds missing). In this work, we address the task of\nholistic audio generation: given a video and a text prompt, we aim to generate\nboth onscreen and offscreen sounds that are temporally synchronized with the\nvideo and semantically aligned with text and video. Previous approaches for\njoint text and video-to-audio generation often suffer from modality bias,\nfavoring one modality over the other. To overcome this limitation, we introduce\nVinTAGe, a flow-based transformer model that jointly considers text and video\nto guide audio generation. Our framework comprises two key components: a\nVisual-Text Encoder and a Joint VT-SiT model. To reduce modality bias and\nimprove generation quality, we employ pretrained uni-modal text-to-audio and\nvideo-to-audio generation models for additional guidance. Due to the lack of\nappropriate benchmarks, we also introduce VinTAGe-Bench, a dataset of 636\nvideo-text-audio pairs containing both onscreen and offscreen sounds. Our\ncomprehensive experiments on VinTAGe-Bench demonstrate that joint text and\nvisual interaction is necessary for holistic audio generation. Furthermore,\nVinTAGe achieves state-of-the-art results on the VGGSound benchmark. Our source\ncode and pre-trained models will be released. Demo is available at:\nhttps://www.youtube.com/watch?v=QmqWhUjPkJI.\n","authors":["Saksham Singh Kushwaha","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2412.10768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.10749v1","updated":"2024-12-14T08:34:44Z","published":"2024-12-14T08:34:44Z","title":"Patch-level Sounding Object Tracking for Audio-Visual Question Answering","summary":" Answering questions related to audio-visual scenes, i.e., the AVQA task, is\nbecoming increasingly popular. A critical challenge is accurately identifying\nand tracking sounding objects related to the question along the timeline. In\nthis paper, we present a new Patch-level Sounding Object Tracking (PSOT)\nmethod. It begins with a Motion-driven Key Patch Tracking (M-KPT) module, which\nrelies on visual motion information to identify salient visual patches with\nsignificant movements that are more likely to relate to sounding objects and\nquestions. We measure the patch-wise motion intensity map between neighboring\nvideo frames and utilize it to construct and guide a motion-driven graph\nnetwork. Meanwhile, we design a Sound-driven KPT (S-KPT) module to explicitly\ntrack sounding patches. This module also involves a graph network, with the\nadjacency matrix regularized by the audio-visual correspondence map. 
The M-KPT\nand S-KPT modules are performed in parallel for each temporal segment, allowing\nbalanced tracking of salient and sounding objects. Based on the tracked\npatches, we further propose a Question-driven KPT (Q-KPT) module to retain\npatches highly relevant to the question, ensuring the model focuses on the most\ninformative clues. The audio-visual-question features are updated during the\nprocessing of these modules, which are then aggregated for final answer\nprediction. Extensive experiments on standard datasets demonstrate the\neffectiveness of our method, achieving competitive performance even compared to\nrecent large-scale pretraining-based approaches.\n","authors":["Zhangbin Li","Jinxing Zhou","Jing Zhang","Shengeng Tang","Kun Li","Dan Guo"],"pdf_url":"https://arxiv.org/pdf/2412.10749v1.pdf","comment":"Accepted by AAAI 2025"},{"id":"http://arxiv.org/abs/2412.10707v1","updated":"2024-12-14T06:33:53Z","published":"2024-12-14T06:33:53Z","title":"MambaPro: Multi-Modal Object Re-Identification with Mamba Aggregation\n and Synergistic Prompt","summary":" Multi-modal object Re-IDentification (ReID) aims to retrieve specific objects\nby utilizing complementary image information from different modalities.\nRecently, large-scale pre-trained models like CLIP have demonstrated impressive\nperformance in traditional single-modal object ReID tasks. However, they remain\nunexplored for multi-modal object ReID. Furthermore, current multi-modal\naggregation methods have obvious limitations in dealing with long sequences\nfrom different modalities. To address above issues, we introduce a novel\nframework called MambaPro for multi-modal object ReID. To be specific, we first\nemploy a Parallel Feed-Forward Adapter (PFA) for adapting CLIP to multi-modal\nobject ReID. Then, we propose the Synergistic Residual Prompt (SRP) to guide\nthe joint learning of multi-modal features. Finally, leveraging Mamba's\nsuperior scalability for long sequences, we introduce Mamba Aggregation (MA) to\nefficiently model interactions between different modalities. As a result,\nMambaPro could extract more robust features with lower complexity. Extensive\nexperiments on three multi-modal object ReID benchmarks (i.e., RGBNT201,\nRGBNT100 and MSVR310) validate the effectiveness of our proposed methods. The\nsource code is available at https://github.com/924973292/MambaPro.\n","authors":["Yuhao Wang","Xuehu Liu","Tianyu Yan","Yang Liu","Aihua Zheng","Pingping Zhang","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2412.10707v1.pdf","comment":"This work is accepted by AAAI2025. More modifications may be\n performed"},{"id":"http://arxiv.org/abs/2407.20592v2","updated":"2024-12-14T06:15:40Z","published":"2024-07-30T06:57:00Z","title":"EgoSonics: Generating Synchronized Audio for Silent Egocentric Videos","summary":" We introduce EgoSonics, a method to generate semantically meaningful and\nsynchronized audio tracks conditioned on silent egocentric videos. Generating\naudio for silent egocentric videos could open new applications in virtual\nreality, assistive technologies, or for augmenting existing datasets. Existing\nwork has been limited to domains like speech, music, or impact sounds and\ncannot capture the broad range of audio frequencies found in egocentric videos.\nEgoSonics addresses these limitations by building on the strengths of latent\ndiffusion models for conditioned audio synthesis. We first encode and process\npaired audio-video data to make them suitable for generation. 
The encoded data\nis then used to train a model that can generate an audio track that captures\nthe semantics of the input video. Our proposed SyncroNet builds on top of\nControlNet to provide control signals that enables generation of temporally\nsynchronized audio. Extensive evaluations and a comprehensive user study show\nthat our model outperforms existing work in audio quality, and in our proposed\nsynchronization evaluation method. Furthermore, we demonstrate downstream\napplications of our model in improving video summarization.\n","authors":["Aashish Rai","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2407.20592v2.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2412.10680v1","updated":"2024-12-14T04:59:38Z","published":"2024-12-14T04:59:38Z","title":"UCDR-Adapter: Exploring Adaptation of Pre-Trained Vision-Language Models\n for Universal Cross-Domain Retrieval","summary":" Universal Cross-Domain Retrieval (UCDR) retrieves relevant images from unseen\ndomains and classes without semantic labels, ensuring robust generalization.\nExisting methods commonly employ prompt tuning with pre-trained vision-language\nmodels but are inherently limited by static prompts, reducing adaptability. We\npropose UCDR-Adapter, which enhances pre-trained models with adapters and\ndynamic prompt generation through a two-phase training strategy. First, Source\nAdapter Learning integrates class semantics with domain-specific visual\nknowledge using a Learnable Textual Semantic Template and optimizes Class and\nDomain Prompts via momentum updates and dual loss functions for robust\nalignment. Second, Target Prompt Generation creates dynamic prompts by\nattending to masked source prompts, enabling seamless adaptation to unseen\ndomains and classes. Unlike prior approaches, UCDR-Adapter dynamically adapts\nto evolving data distributions, enhancing both flexibility and generalization.\nDuring inference, only the image branch and generated prompts are used,\neliminating reliance on textual inputs for highly efficient retrieval.\nExtensive benchmark experiments show that UCDR-Adapter consistently outperforms\nProS in most cases and other state-of-the-art methods on UCDR, U(c)CDR, and\nU(d)CDR settings.\n","authors":["Haoyu Jiang","Zhi-Qi Cheng","Gabriel Moreira","Jiawen Zhu","Jingdong Sun","Bukun Ren","Jun-Yan He","Qi Dai","Xian-Sheng Hua"],"pdf_url":"https://arxiv.org/pdf/2412.10680v1.pdf","comment":"Accepted to WACV 2025. Project link:\n https://github.com/fine68/UCDR2024"},{"id":"http://arxiv.org/abs/2412.10649v1","updated":"2024-12-14T02:36:45Z","published":"2024-12-14T02:36:45Z","title":"Hidden Echoes Survive Training in Audio To Audio Generative Instrument\n Models","summary":" As generative techniques pervade the audio domain, there has been increasing\ninterest in tracing back through these complicated models to understand how\nthey draw on their training data to synthesize new examples, both to ensure\nthat they use properly licensed data and also to elucidate their black box\nbehavior. In this paper, we show that if imperceptible echoes are hidden in the\ntraining data, a wide variety of audio to audio architectures (differentiable\ndigital signal processing (DDSP), Realtime Audio Variational autoEncoder\n(RAVE), and ``Dance Diffusion'') will reproduce these echoes in their outputs.\nHiding a single echo is particularly robust across all architectures, but we\nalso show promising results hiding longer time spread echo patterns for an\nincreased information capacity. 
We conclude by showing that echoes make their\nway into fine tuned models, that they survive mixing/demixing, and that they\nsurvive pitch shift augmentation during training. Hence, this simple, classical\nidea in watermarking shows significant promise for tagging generative audio\nmodels.\n","authors":["Christopher J. Tralie","Matt Amery","Benjamin Douglas","Ian Utz"],"pdf_url":"https://arxiv.org/pdf/2412.10649v1.pdf","comment":"8 pages, 11 Figures, Proceedings of 2025 AAAI Workshop on AI for\n Music"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q 
zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 117 + +
+
+
+ + ☆ Tokenisation is NP-Complete + + +
+ In this work, we prove the NP-completeness of two variants of tokenisation, +defined as the problem of compressing a dataset to at most $\delta$ symbols by +either finding a vocabulary directly (direct tokenisation), or selecting a +sequence of merge operations (bottom-up tokenisation). + +
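To make the objective concrete, the sketch below computes the quantity the decision problem bounds: how many symbols a corpus occupies once segmented with a candidate vocabulary, and whether that count fits a budget delta. The greedy longest-match segmentation and the toy corpus are illustrative assumptions only; the NP-complete part is finding a vocabulary (or merge sequence) that meets the budget, not checking a given one.
<pre><code>
# Illustrative only: the decision problem asks whether SOME vocabulary (or merge
# sequence) compresses the corpus to at most delta symbols; scoring one candidate
# vocabulary, as below, is the easy direction. Greedy longest-match is an assumption.
def segment_length(text: str, vocab: set) -> int:
    """Greedy longest-match segmentation; returns the number of symbols used."""
    i, count = 0, 0
    while i < len(text):
        step = 1
        for j in range(len(text), i, -1):          # try the longest candidate first
            if text[i:j] in vocab:
                step = j - i
                break
        i += step                                   # single chars are all in vocab here
        count += 1
    return count

corpus = "abababab"
vocab = {"a", "b", "ab", "abab"}
delta = 3
used = segment_length(corpus, vocab)
print(used, "symbols; fits budget:", used <= delta)   # 2 symbols; fits budget: True
</code></pre>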
+
+
+
+
+ + ☆ LongBench v2: Towards Deeper Understanding and Reasoning on Realistic + Long-context Multitasks + + +
+ This paper introduces LongBench v2, a benchmark designed to assess the +ability of LLMs to handle long-context problems requiring deep understanding +and reasoning across real-world multitasks. LongBench v2 consists of 503 +challenging multiple-choice questions, with contexts ranging from 8k to 2M +words, across six major task categories: single-document QA, multi-document QA, +long in-context learning, long-dialogue history understanding, code repository +understanding, and long structured data understanding. To ensure the breadth +and the practicality, we collect data from nearly 100 highly educated +individuals with diverse professional backgrounds. We employ both automated and +manual review processes to maintain high quality and difficulty, resulting in +human experts achieving only 53.7% accuracy under a 15-minute time constraint. +Our evaluation reveals that the best-performing model, when directly answers +the questions, achieves only 50.1% accuracy. In contrast, the o1-preview model, +which includes longer reasoning, achieves 57.7%, surpassing the human baseline +by 4%. These results highlight the importance of enhanced reasoning ability and +scaling inference-time compute to tackle the long-context challenges in +LongBench v2. The project is available at https://longbench2.github.io. + +
+
+ comment: 25 pages, 13 figures +
+
+
+
+
+ + ☆ MMLU-CF: A Contamination-free Multi-task Language Understanding + Benchmark + + +
+ Multiple-choice question (MCQ) datasets like Massive Multitask Language +Understanding (MMLU) are widely used to evaluate the commonsense, +understanding, and problem-solving abilities of large language models (LLMs). +However, the open-source nature of these benchmarks and the broad sources of +training data for LLMs have inevitably led to benchmark contamination, +resulting in unreliable evaluation results. To alleviate this issue, we propose +a contamination-free and more challenging MCQ benchmark called MMLU-CF. This +benchmark reassesses LLMs' understanding of world knowledge by averting both +unintentional and malicious data leakage. To avoid unintentional data leakage, +we source data from a broader domain and design three decontamination rules. To +prevent malicious data leakage, we divide the benchmark into validation and +test sets with similar difficulty and subject distributions. The test set +remains closed-source to ensure reliable results, while the validation set is +publicly available to promote transparency and facilitate independent +verification. Our evaluation of mainstream LLMs reveals that the powerful +GPT-4o achieves merely a 5-shot score of 73.4% and a 0-shot score of 71.9% on +the test set, which indicates the effectiveness of our approach in creating a +more rigorous and contamination-free evaluation standard. The GitHub repository +is available at https://github.com/microsoft/MMLU-CF and the dataset refers to +https://huggingface.co/datasets/microsoft/MMLU-CF. + +
+
+
+
+
+ + ☆ Face the Facts! Evaluating RAG-based Fact-checking Pipelines in + Realistic Settings + + +
+ Natural Language Processing and Generation systems have recently shown the +potential to complement and streamline the costly and time-consuming job of +professional fact-checkers. In this work, we lift several constraints of +current state-of-the-art pipelines for automated fact-checking based on the +Retrieval-Augmented Generation (RAG) paradigm. Our goal is to benchmark, under +more realistic scenarios, RAG-based methods for the generation of verdicts - +i.e., short texts discussing the veracity of a claim - evaluating them on +stylistically complex claims and heterogeneous, yet reliable, knowledge bases. +Our findings show a complex landscape, where, for example, LLM-based retrievers +outperform other retrieval techniques, though they still struggle with +heterogeneous knowledge bases; larger models excel in verdict faithfulness, +while smaller models provide better context adherence, with human evaluations +favouring zero-shot and one-shot approaches for informativeness, and fine-tuned +models for emotional alignment. + +
+
+ comment: Code and data at https://github.com/drusso98/face-the-facts +
+
+
+
+
+ + ☆ LlamaFusion: Adapting Pretrained Language Models for Multimodal + Generation + + +
+ We present LlamaFusion, a framework for empowering pretrained text-only large +language models (LLMs) with multimodal generative capabilities, enabling them +to understand and generate both text and images in arbitrary sequences. +LlamaFusion leverages existing Llama-3's weights for processing texts +autoregressively while introducing additional and parallel transformer modules +for processing images with diffusion. During training, the data from each +modality is routed to its dedicated modules: modality-specific feedforward +layers, query-key-value projections, and normalization layers process each +modality independently, while the shared self-attention layers allow +interactions across text and image features. By freezing the text-specific +modules and only training the image-specific modules, LlamaFusion preserves the +language capabilities of text-only LLMs while developing strong visual +understanding and generation abilities. Compared to methods that pretrain +multimodal generative models from scratch, our experiments demonstrate that, +LlamaFusion improves image understanding by 20% and image generation by 3.6% +using only 50% of the FLOPs while maintaining Llama-3's language capabilities. +We also demonstrate that this framework can adapt existing vision-language +models with multimodal generation ability. Overall, this framework not only +leverages existing computational investments in text-only LLMs but also enables +the parallel development of language and vision capabilities, presenting a +promising direction for efficient multimodal model development. + +
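A minimal sketch of the routing idea described above, assuming a PyTorch setting: one block with shared self-attention, separate text and image feed-forward and normalization modules, and frozen text-side parameters. This is not the released LlamaFusion code (which also separates the query-key-value projections); it only illustrates how tokens of each modality can be routed to dedicated modules while attention mixes both.
<pre><code>
# Sketch, not the authors' implementation: shared attention, per-modality FFN/norm,
# text-side modules frozen so only the image-side modules train.
import torch
import torch.nn as nn

class ModalityRoutedBlock(nn.Module):
    def __init__(self, d_model=256, n_heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)  # shared
        self.ffn_text = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(),
                                      nn.Linear(4 * d_model, d_model))
        self.ffn_image = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(),
                                       nn.Linear(4 * d_model, d_model))
        self.norm_text = nn.LayerNorm(d_model)
        self.norm_image = nn.LayerNorm(d_model)
        # Freeze the text-specific modules, as the abstract describes.
        for p in list(self.ffn_text.parameters()) + list(self.norm_text.parameters()):
            p.requires_grad = False

    def forward(self, x, is_image):              # x: (B, T, D); is_image: (B, T) bool
        h, _ = self.attn(x, x, x)                 # shared attention mixes both modalities
        x = x + h
        out = torch.empty_like(x)
        out[~is_image] = self.ffn_text(self.norm_text(x[~is_image]))
        out[is_image] = self.ffn_image(self.norm_image(x[is_image]))
        return x + out

block = ModalityRoutedBlock()
tokens = torch.randn(2, 8, 256)
is_image = torch.zeros(2, 8, dtype=torch.bool)
is_image[:, 4:] = True                            # last 4 positions are image tokens
print(block(tokens, is_image).shape)              # torch.Size([2, 8, 256])
</code></pre>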
+
+
+
+
+ + ☆ Critical-Questions-of-Thought: Steering LLM reasoning with Argumentative + Querying + + +
+ Studies have underscored how, regardless of the recent breakthrough and swift +advances in AI research, even state-of-the-art Large Language models (LLMs) +continue to struggle when performing logical and mathematical reasoning. The +results seem to suggest that LLMs still work as (highly advanced) data pattern +identifiers, scoring poorly when attempting to generalise and solve reasoning +problems the models have never previously seen or that are not close to samples +presented in their training data. To address this compelling concern, this +paper makes use of the notion of critical questions from the literature on +argumentation theory, focusing in particular on Toulmin's model of +argumentation. We show that employing these critical questions can improve the +reasoning capabilities of LLMs. By probing the rationale behind the models' +reasoning process, the LLM can assess whether some logical mistake is occurring +and correct it before providing the final reply to the user prompt. The +underlying idea is drawn from the gold standard of any valid argumentative +procedure: the conclusion is valid if it is entailed by accepted premises. Or, +to paraphrase such Aristotelian principle in a real-world approximation, +characterised by incomplete information and presumptive logic, the conclusion +is valid if not proved otherwise. This approach successfully steers the models' +output through a reasoning pipeline, resulting in better performance against +the baseline and its Chain-of-Thought (CoT) implementation. To this end, an +extensive evaluation of the proposed approach on the MT-Bench Reasoning and +Math tasks across a range of LLMs is provided. + +
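A hedged sketch of the critical-questioning loop the abstract describes; call_llm is a placeholder for any chat-completion client, and the questions below are generic Toulmin-style probes rather than the paper's exact prompt set.
<pre><code>
# Sketch under stated assumptions: draft an answer, probe it with critical questions
# about premises and inference, then produce the final reply.
CRITICAL_QUESTIONS = [
    "Are the premises used in the reasoning actually true and accepted?",
    "Does the conclusion really follow from those premises?",
    "Is there an exception or counterexample that would defeat the inference?",
]

def answer_with_critical_questions(call_llm, problem: str) -> str:
    draft = call_llm(f"Solve step by step:\n{problem}")
    for question in CRITICAL_QUESTIONS:
        draft = call_llm(
            f"Problem: {problem}\nDraft reasoning: {draft}\n"
            f"Critical question: {question}\n"
            "If the draft fails this check, rewrite it; otherwise return it unchanged."
        )
    return call_llm(f"Problem: {problem}\nVerified reasoning: {draft}\nFinal answer only:")

# Toy stand-in model so the snippet runs end to end:
toy_llm = lambda prompt: "2 + 2 = 4, so the answer is 4."
print(answer_with_critical_questions(toy_llm, "What is 2 + 2?"))
</code></pre>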
+
+
+
+
+ + ☆ Prompt-A-Video: Prompt Your Video Diffusion Model via Preference-Aligned + LLM + + +
Text-to-video models have made remarkable advancements through optimization on high-quality text-video pairs, where the textual prompts play a pivotal role in determining the quality of output videos. However, achieving the desired output often entails multiple revisions and iterative inference to refine user-provided prompts. Current automatic methods for refining prompts encounter challenges such as Modality-Inconsistency, Cost-Discrepancy, and Model-Unaware when applied to text-to-video diffusion models. To address these problems, we introduce an LLM-based prompt adaptation framework, termed Prompt-A-Video, which excels in crafting Video-Centric, Labor-Free and Preference-Aligned prompts tailored to a specific video diffusion model. Our approach involves a meticulously crafted two-stage optimization and alignment system. Initially, we conduct a reward-guided prompt evolution pipeline to automatically create an optimal prompt pool and leverage it for supervised fine-tuning (SFT) of the LLM. Then, multi-dimensional rewards are employed to generate pairwise data for the SFT model, followed by the direct preference optimization (DPO) algorithm to further facilitate preference alignment. Through extensive experimentation and comparative analyses, we validate the effectiveness of Prompt-A-Video across diverse generation models, highlighting its potential to push the boundaries of video generation.
+
+
+
+
+ + ☆ Language Models as Continuous Self-Evolving Data Engineers + + +
Large Language Models (LLMs) have demonstrated remarkable capabilities on various tasks, yet their further evolution is limited by the lack of high-quality training data. In addition, traditional training approaches rely too heavily on expert-labeled data, setting an upper limit on the performance of LLMs. To address this issue, we propose LANCE, a novel paradigm that enables LLMs to train themselves by autonomously generating, cleaning, reviewing, and annotating data with preference information. Our approach demonstrates that LLMs can serve as continuous self-evolving data engineers, significantly reducing the time and cost of the post-training data construction process. Through iterative fine-tuning on different variants of Qwen2, we validate the effectiveness of LANCE across various tasks, showing that it can continuously improve model performance and maintain high-quality data generation. Across eight benchmark dimensions, LANCE yields an average score improvement of 3.36 for Qwen2-7B and 2.70 for Qwen2-7B-Instruct. This training paradigm with autonomous data construction not only reduces the reliance on human experts or external models but also ensures that the data aligns with human values and preferences, paving the way for the development of future superintelligent systems that can exceed human capabilities.
+
+
+
+
+ + ☆ Adaptive Pruning for Large Language Models with Structural Importance + Awareness + + +
+ The recent advancements in large language models (LLMs) have significantly +improved language understanding and generation capabilities. However, it is +difficult to deploy LLMs on resource-constrained edge devices due to their high +computational and storage resource demands. To address this issue, we propose a +novel LLM model pruning method, namely structurally-aware adaptive pruning +(SAAP), to significantly reduce the computational and memory costs while +maintaining model performance. We first define an adaptive importance fusion +metric to evaluate the importance of all coupled structures in LLMs by +considering their homoscedastic uncertainty. Then, we rank the importance of +all modules to determine the specific layers that should be pruned to meet +particular performance requirements. Furthermore, we develop a new group +fine-tuning strategy to improve the inference efficiency of LLMs. Finally, we +evaluate the proposed SAAP method on multiple LLMs across two common tasks, +i.e., zero-shot classification and text generation. Experimental results show +that our SAAP method outperforms several state-of-the-art baseline methods, +achieving 2.17%, 2.37%, and 2.39% accuracy gains on LLaMA-7B, Vicuna-7B, and +LLaMA-13B. Additionally, SAAP improves the token generation speed by 5%, +showcasing its practical advantages in resource-constrained scenarios. + +
+
+ comment: 12 pages, 6 figures, 12 tables +
+
+
+
+
+ + ☆ Outcome-Refining Process Supervision for Code Generation + + +
+ Large Language Models have demonstrated remarkable capabilities in code +generation, yet they often struggle with complex programming tasks that require +deep algorithmic reasoning. While process supervision through learned reward +models shows promise in guiding reasoning steps, it requires expensive training +data and suffers from unreliable evaluation. We propose Outcome-Refining +Process Supervision, a novel paradigm that treats outcome refinement itself as +the process to be supervised. Our framework leverages concrete execution +signals to ground the supervision of reasoning steps, while using +tree-structured exploration to maintain multiple solution trajectories +simultaneously. Experiments demonstrate that our approach enables even smaller +models to achieve high success accuracy and performance metrics on competitive +programming tasks, creates more reliable verification than traditional reward +models without requiring training PRMs. Our approach achieves significant +improvements across 5 models and 3 datasets: an average of 26.9% increase in +correctness and 42.2% in efficiency. The results suggest that providing +structured reasoning space with concrete verification signals is crucial for +solving complex programming tasks. We open-source all our code and data at: +https://github.com/zhuohaoyu/ORPS + +
+
+ comment: 18 pages, 5 figures, Code: https://github.com/zhuohaoyu/ORPS +
+
+
+
+
+ + ☆ Qwen2.5 Technical Report + + +
+ In this report, we introduce Qwen2.5, a comprehensive series of large +language models (LLMs) designed to meet diverse needs. Compared to previous +iterations, Qwen 2.5 has been significantly improved during both the +pre-training and post-training stages. In terms of pre-training, we have scaled +the high-quality pre-training datasets from the previous 7 trillion tokens to +18 trillion tokens. This provides a strong foundation for common sense, expert +knowledge, and reasoning capabilities. In terms of post-training, we implement +intricate supervised finetuning with over 1 million samples, as well as +multistage reinforcement learning. Post-training techniques enhance human +preference, and notably improve long text generation, structural data analysis, +and instruction following. To handle diverse and varied use cases effectively, +we present Qwen2.5 LLM series in rich sizes. Open-weight offerings include base +and instruction-tuned models, with quantized versions available. In addition, +for hosted solutions, the proprietary models currently include two +mixture-of-experts (MoE) variants: Qwen2.5-Turbo and Qwen2.5-Plus, both +available from Alibaba Cloud Model Studio. Qwen2.5 has demonstrated top-tier +performance on a wide range of benchmarks evaluating language understanding, +reasoning, mathematics, coding, human preference alignment, etc. Specifically, +the open-weight flagship Qwen2.5-72B-Instruct outperforms a number of open and +proprietary models and demonstrates competitive performance to the +state-of-the-art open-weight model, Llama-3-405B-Instruct, which is around 5 +times larger. Qwen2.5-Turbo and Qwen2.5-Plus offer superior cost-effectiveness +while performing competitively against GPT-4o-mini and GPT-4o respectively. +Additionally, as the foundation, Qwen2.5 models have been instrumental in +training specialized models such as Qwen2.5-Math, Qwen2.5-Coder, QwQ, and +multimodal models. + +
+
+
+
+
+ + ☆ Associative memory inspires improvements for in-context learning using a + novel attention residual stream architecture + + +
+ Large language models (LLMs) demonstrate an impressive ability to utilise +information within the context of their input sequences to appropriately +respond to data unseen by the LLM during its training procedure. This ability +is known as in-context learning (ICL). Humans and non-human animals demonstrate +similar abilities, however their neural architectures differ substantially from +LLMs. Despite this, a critical component within LLMs, the attention mechanism, +resembles modern associative memory models, widely used in and influenced by +the computational neuroscience community to model biological memory systems. +Using this connection, we introduce an associative memory model capable of +performing ICL. We use this as inspiration for a novel residual stream +architecture which allows information to directly flow between attention heads. +We test this architecture during training within a two-layer Transformer and +show its ICL abilities manifest more quickly than without this modification. We +then apply our architecture in small language models with 8 million parameters, +focusing on attention head values, with results also indicating improved ICL +performance at this larger and more naturalistic scale. + +
+
+ comment: 18 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ Review-Then-Refine: A Dynamic Framework for Multi-Hop Question Answering + with Temporal Adaptability + + +
Retrieval-augmented generation (RAG) frameworks have emerged as a promising solution to multi-hop question answering (QA) tasks, since they enable large language models (LLMs) to incorporate external knowledge and mitigate their inherent knowledge deficiencies. Despite this progress, existing RAG frameworks, which usually follow the retrieve-then-read paradigm, often struggle with multi-hop QA involving temporal information, since they have difficulty retrieving and synthesizing accurate time-related information. To address this challenge, this paper proposes a novel framework called review-then-refine, which aims to enhance LLM performance in multi-hop QA scenarios with temporal information. Our approach begins with a review phase, in which decomposed sub-queries are dynamically rewritten with temporal information, allowing for a subsequent adaptive retrieval and reasoning process. In addition, we implement an adaptive retrieval mechanism to minimize unnecessary retrievals, thus reducing the potential for hallucinations. In the subsequent refine phase, the LLM synthesizes the retrieved information from each sub-query along with its internal knowledge to formulate a coherent answer. Extensive experimental results across multiple datasets demonstrate the effectiveness of our proposed framework, highlighting its potential to significantly improve multi-hop QA capabilities in LLMs.
+
+ comment: 20 pages, 2 figures +
+
+
+
+
+ + ☆ A Cross-Domain Study of the Use of Persuasion Techniques in Online + Disinformation + + +
+ Disinformation, irrespective of domain or language, aims to deceive or +manipulate public opinion, typically through employing advanced persuasion +techniques. Qualitative and quantitative research on the weaponisation of +persuasion techniques in disinformation has been mostly topic-specific (e.g., +COVID-19) with limited cross-domain studies, resulting in a lack of +comprehensive understanding of these strategies. This study employs a +state-of-the-art persuasion technique classifier to conduct a large-scale, +multi-domain analysis of the role of 16 persuasion techniques in disinformation +narratives. It shows how different persuasion techniques are employed +disproportionately in different disinformation domains. We also include a +detailed case study on climate change disinformation, highlighting how +linguistic, psychological, and cultural factors shape the adaptation of +persuasion strategies to fit unique thematic contexts. + +
+
+
+
+
+ + ☆ AceMath: Advancing Frontier Math Reasoning with Post-Training and Reward + Modeling + + +
+ In this paper, we introduce AceMath, a suite of frontier math models that +excel in solving complex math problems, along with highly effective reward +models capable of evaluating generated solutions and reliably identifying the +correct ones. To develop the instruction-tuned math models, we propose a +supervised fine-tuning (SFT) process that first achieves competitive +performance across general domains, followed by targeted fine-tuning for the +math domain using a carefully curated set of prompts and synthetically +generated responses. The resulting model, AceMath-72B-Instruct greatly +outperforms Qwen2.5-Math-72B-Instruct, GPT-4o and Claude-3.5 Sonnet. To develop +math-specialized reward model, we first construct AceMath-RewardBench, a +comprehensive and robust benchmark for evaluating math reward models across +diverse problems and difficulty levels. After that, we present a systematic +approach to build our math reward models. The resulting model, AceMath-72B-RM, +consistently outperforms state-of-the-art reward models. Furthermore, when +combining AceMath-72B-Instruct with AceMath-72B-RM, we achieve the highest +average rm@8 score across the math reasoning benchmarks. We will release model +weights, training data, and evaluation benchmarks at: +https://research.nvidia.com/labs/adlr/acemath + +
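The rm@8 metric mentioned above amounts to best-of-n selection with the reward model. The sketch below shows that selection step only; generate and reward are placeholders for the policy model and the trained reward model, and the toy stand-ins exist just so the snippet runs.
<pre><code>
# Best-of-n (here n=8) selection sketch: sample candidate solutions, keep the one
# the reward model scores highest. `generate` and `reward` are placeholders.
import random

def best_of_n(generate, reward, problem: str, n: int = 8) -> str:
    candidates = [generate(problem) for _ in range(n)]
    return max(candidates, key=lambda sol: reward(problem, sol))

# Toy stand-ins so the snippet runs end to end:
random.seed(0)
toy_generate = lambda q: f"answer={random.randint(0, 9)}"
toy_reward = lambda q, sol: -abs(int(sol.split('=')[1]) - 4)   # prefers answers near 4
print(best_of_n(toy_generate, toy_reward, "toy problem"))
</code></pre>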
+
+
+
+
+ + ☆ Till the Layers Collapse: Compressing a Deep Neural Network through the + Lenses of Batch Normalization Layers AAAI 2025 + + +
+ Today, deep neural networks are widely used since they can handle a variety +of complex tasks. Their generality makes them very powerful tools in modern +technology. However, deep neural networks are often overparameterized. The +usage of these large models consumes a lot of computation resources. In this +paper, we introduce a method called \textbf{T}ill the \textbf{L}ayers +\textbf{C}ollapse (TLC), which compresses deep neural networks through the +lenses of batch normalization layers. By reducing the depth of these networks, +our method decreases deep neural networks' computational requirements and +overall latency. We validate our method on popular models such as Swin-T, +MobileNet-V2, and RoBERTa, across both image classification and natural +language processing (NLP) tasks. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ☆ ConfliBERT: A Language Model for Political Conflict + + +
+ Conflict scholars have used rule-based approaches to extract information +about political violence from news reports and texts. Recent Natural Language +Processing developments move beyond rigid rule-based approaches. We review our +recent ConfliBERT language model (Hu et al. 2022) to process political and +violence related texts. The model can be used to extract actor and action +classifications from texts about political conflict. When fine-tuned, results +show that ConfliBERT has superior performance in accuracy, precision and recall +over other large language models (LLM) like Google's Gemma 2 (9B), Meta's Llama +3.1 (7B), and Alibaba's Qwen 2.5 (14B) within its relevant domains. It is also +hundreds of times faster than these more generalist LLMs. These results are +illustrated using texts from the BBC, re3d, and the Global Terrorism Dataset +(GTD). + +
+
+ comment: 30 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ LLMs Lost in Translation: M-ALERT uncovers Cross-Linguistic Safety Gaps + + +
+ Building safe Large Language Models (LLMs) across multiple languages is +essential in ensuring both safe access and linguistic diversity. To this end, +we introduce M-ALERT, a multilingual benchmark that evaluates the safety of +LLMs in five languages: English, French, German, Italian, and Spanish. M-ALERT +includes 15k high-quality prompts per language, totaling 75k, following the +detailed ALERT taxonomy. Our extensive experiments on 10 state-of-the-art LLMs +highlight the importance of language-specific safety analysis, revealing that +models often exhibit significant inconsistencies in safety across languages and +categories. For instance, Llama3.2 shows high unsafety in the category +crime_tax for Italian but remains safe in other languages. Similar differences +can be observed across all models. In contrast, certain categories, such as +substance_cannabis and crime_propaganda, consistently trigger unsafe responses +across models and languages. These findings underscore the need for robust +multilingual safety practices in LLMs to ensure safe and responsible usage +across diverse user communities. + +
+
+
+
+
+ + ☆ Large Language Models and Code Security: A Systematic Literature Review + + +
+ Large Language Models (LLMs) have emerged as powerful tools for automating +various programming tasks, including security-related ones, such as detecting +and fixing vulnerabilities. Despite their promising capabilities, when required +to produce or modify pre-existing code, LLMs could introduce vulnerabilities +unbeknown to the programmer. When analyzing code, they could miss clear +vulnerabilities or signal nonexistent ones. In this Systematic Literature +Review (SLR), we aim to investigate both the security benefits and potential +drawbacks of using LLMs for a variety of code-related tasks. In particular, +first we focus on the types of vulnerabilities that could be introduced by +LLMs, when used for producing code. Second, we analyze the capabilities of LLMs +to detect and fix vulnerabilities, in any given code, and how the prompting +strategy of choice impacts their performance in these two tasks. Last, we +provide an in-depth analysis on how data poisoning attacks on LLMs can impact +performance in the aforementioned tasks. + +
+
+
+
+
+ + ☆ Chain-of-MetaWriting: Linguistic and Textual Analysis of How Small + Language Models Write Young Students Texts COLING 2025 + + +
Large Language Models (LLMs) have been used to generate texts in response to different writing tasks: reports, essays, storytelling. However, language models do not have a meta-representation of the text writing process, nor inherent communication learning needs comparable to those of young human students. This paper introduces a fine-grained linguistic and textual analysis of multilingual Small Language Models' (SLMs) writing. With our method, Chain-of-MetaWriting, SLMs can imitate some steps of the human writing process, such as planning and evaluation. We mainly focused on short story and essay writing tasks in French for schoolchildren and undergraduate students respectively. Our results show that SLMs encounter difficulties in assisting young students on sensitive topics such as violence in the schoolyard, and that they sometimes use words too complex for the target audience. In particular, the output differs markedly from human-produced texts in terms of cohesion and coherence, notably regarding temporal connectors, topic progression, and reference.
+
+ comment: Accepted at WRAICOGS 2025 (Writing Aids at the Crossroads of AI, + Cognitive Science, and NLP) co-located with COLING 2025 +
+
+
+
+
+ + ☆ Movie2Story: A framework for understanding videos and telling stories in + the form of novel text + + +
+ Multimodal video-to-text models have made considerable progress, primarily in +generating brief descriptions of video content. However, there is still a +deficiency in generating rich long-form text descriptions that integrate both +video and audio. In this paper, we introduce a framework called M2S, designed +to generate novel-length text by combining audio, video, and character +recognition. M2S includes modules for video long-form text description and +comprehension, audio-based analysis of emotion, speech rate, and character +alignment, and visual-based character recognition alignment. By integrating +multimodal information using the large language model GPT4o, M2S stands out in +the field of multimodal text generation. We demonstrate the effectiveness and +accuracy of M2S through comparative experiments and human evaluation. +Additionally, the model framework has good scalability and significant +potential for future research. + +
+
+
+
+
+ + ☆ Knowledge Injection via Prompt Distillation + + +
+ In many practical applications, large language models (LLMs) need to +incorporate new knowledge not present in their pre-training data. The primary +methods for this are fine-tuning and retrieval-augmented generation (RAG). +Although RAG has emerged as the industry standard for knowledge injection, +fine-tuning has not yet achieved comparable success. In this paper, we propose +a new fine-tuning technique for learning new knowledge and show that it can +reach the performance of RAG. The proposed method is based on the +self-distillation approach, which we call prompt distillation. First, we +generate question-answer pairs about the new knowledge. Then, we fine-tune a +student model on the question-answer pairs to imitate the output distributions +of a teacher model, which additionally receives the new knowledge in its +prompt. The student model is identical to the teacher, except it is equipped +with a LoRA adapter. This training procedure facilitates distilling the new +knowledge from the teacher's prompt into the student's weights. + +
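A minimal sketch of the distillation objective as described, assuming a PyTorch setting: the student (a LoRA-adapted copy of the teacher) is trained to match the teacher's next-token distributions, where only the teacher sees the new knowledge in its prompt. Random tensors stand in for the real logits here; in practice both sets of logits come from the same backbone.
<pre><code>
# Sketch of the prompt-distillation loss: KL between teacher (knowledge in prompt)
# and student (question only) next-token distributions over the answer tokens.
import torch
import torch.nn.functional as F

def prompt_distillation_loss(student_logits, teacher_logits, temperature: float = 1.0):
    """logits: (num_answer_tokens, vocab_size); returns KL(teacher || student)."""
    t_logprobs = F.log_softmax(teacher_logits / temperature, dim=-1)
    s_logprobs = F.log_softmax(student_logits / temperature, dim=-1)
    return F.kl_div(s_logprobs, t_logprobs, log_target=True, reduction="batchmean")

teacher_logits = torch.randn(5, 32000)                          # prompt: [new knowledge] + question
student_logits = torch.randn(5, 32000, requires_grad=True)      # prompt: question only
loss = prompt_distillation_loss(student_logits, teacher_logits)
loss.backward()                                                  # gradients flow into the LoRA adapter
print(float(loss))
</code></pre>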
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Understanding the Dark Side of LLMs' Intrinsic Self-Correction + + +
Intrinsic self-correction was proposed to improve LLMs' responses via feedback prompts solely based on their inherent capability. However, recent works show that LLMs' intrinsic self-correction fails without oracle labels as feedback prompts. In this paper, we aim to interpret LLMs' intrinsic self-correction for different tasks, especially for those failure cases. By including one simple task and three complex tasks with state-of-the-art (SOTA) LLMs from the ChatGPT family (o1, 4o, 3.5-turbo) and the Llama family (2-7B, 3-8B, and 3.1-8B), we design three interpretation methods to reveal the dark side of LLMs' intrinsic self-correction. We identify that intrinsic self-correction can (1) cause LLMs to waver on both intermediate and final answers and lead to prompt bias on simple factual questions; and (2) introduce human-like cognitive biases on complex tasks. In light of our findings, we also provide two simple yet effective strategies for alleviation: question repeating and supervised fine-tuning with a few samples. We open-source our work at https://x-isc.info/.
+
+
+
+
+ + ☆ RobustFT: Robust Supervised Fine-tuning for Large Language Models under + Noisy Response + + +
+ Supervised fine-tuning (SFT) plays a crucial role in adapting large language +models (LLMs) to specific domains or tasks. However, as demonstrated by +empirical experiments, the collected data inevitably contains noise in +practical applications, which poses significant challenges to model performance +on downstream tasks. Therefore, there is an urgent need for a noise-robust SFT +framework to enhance model capabilities in downstream tasks. To address this +challenge, we introduce a robust SFT framework (RobustFT) that performs noise +detection and relabeling on downstream task data. For noise identification, our +approach employs a multi-expert collaborative system with inference-enhanced +models to achieve superior noise detection. In the denoising phase, we utilize +a context-enhanced strategy, which incorporates the most relevant and confident +knowledge followed by careful assessment to generate reliable annotations. +Additionally, we introduce an effective data selection mechanism based on +response entropy, ensuring only high-quality samples are retained for +fine-tuning. Extensive experiments conducted on multiple LLMs across five +datasets demonstrate RobustFT's exceptional performance in noisy scenarios. + +
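The entropy-based selection step can be illustrated with a short sketch: rank samples by the mean per-token entropy of the model's distribution over the (re)labeled response and keep the most confident ones. The toy distributions and the keep ratio below are assumptions, not the paper's settings.
<pre><code>
# Sketch of response-entropy data selection: low mean token entropy = confident
# response = retained for fine-tuning. Distributions here are toy values.
import math

def mean_token_entropy(token_distributions):
    """token_distributions: list of per-token probability vectors (each sums to 1)."""
    ents = [-sum(p * math.log(p) for p in dist if p > 0) for dist in token_distributions]
    return sum(ents) / len(ents)

def select_low_entropy(samples, keep_ratio=0.5):
    """samples: list of (example_id, token_distributions); keep the most confident ones."""
    ranked = sorted(samples, key=lambda s: mean_token_entropy(s[1]))
    return [ex for ex, _ in ranked[: max(1, int(len(ranked) * keep_ratio))]]

confident = [[0.9, 0.05, 0.05]] * 4
uncertain = [[0.4, 0.3, 0.3]] * 4
print(select_low_entropy([("sample_a", confident), ("sample_b", uncertain)]))  # ['sample_a']
</code></pre>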
+
+
+
+
+ + ☆ Dehallucinating Parallel Context Extension for Retrieval-Augmented + Generation + + +
Large language models (LLMs) are susceptible to generating hallucinated information, despite the integration of retrieval-augmented generation (RAG). Parallel context extension (PCE) is a line of research attempting to effectively integrate parallel (unordered) contexts, but it still suffers from hallucinations when adapted to RAG scenarios. In this paper, we propose DePaC (Dehallucinating Parallel Context Extension), which alleviates the hallucination problem with context-aware negative training and information-calibrated aggregation. DePaC is designed to alleviate two types of in-context hallucination: fact fabrication (i.e., LLMs present claims that are not supported by the contexts) and fact omission (i.e., LLMs fail to present claims that can be supported by the contexts). Specifically, (1) for fact fabrication, we apply context-aware negative training that fine-tunes the LLMs with negative supervision, thus explicitly guiding the LLMs to refuse to answer when contexts are not related to questions; (2) for fact omission, we propose information-calibrated aggregation, which prioritizes context windows with higher information increment from their contexts. The experimental results on nine RAG tasks demonstrate that DePaC significantly alleviates the two types of hallucination and consistently achieves better performance on these tasks.
+
+
+
+
+ + ☆ Why language models collapse when trained on recursively generated text + + +
+ Language models (LMs) have been widely used to generate text on the Internet. +The generated text is often collected into the training corpus of the next +generations of LMs. Previous work has experimentally found that LMs collapse +when trained on recursively generated text. This paper contributes to existing +knowledge from two aspects. We present a theoretical proof of LM collapse. Our +proof reveals the cause of LM collapse and proves that all auto-regressive LMs +will definitely collapse. We present a new finding: the performance of LMs +gradually declines when trained on recursively generated text until they +perform no better than a randomly initialized LM. The trained LMs produce large +amounts of repetitive text and perform poorly across a wide range of natural +language tasks. The above proof and new findings deepen our understanding of LM +collapse and offer valuable insights that may inspire new training techniques +to mitigate this threat. + +
+
+ comment: 28 pages, 9 figures +
+
+
+
+
+ + ☆ Graph-Convolutional Networks: Named Entity Recognition and Large + Language Model Embedding in Document Clustering + + +
+ Recent advances in machine learning, particularly Large Language Models +(LLMs) such as BERT and GPT, provide rich contextual embeddings that improve +text representation. However, current document clustering approaches often +ignore the deeper relationships between named entities (NEs) and the potential +of LLM embeddings. This paper proposes a novel approach that integrates Named +Entity Recognition (NER) and LLM embeddings within a graph-based framework for +document clustering. The method builds a graph with nodes representing +documents and edges weighted by named entity similarity, optimized using a +graph-convolutional network (GCN). This ensures a more effective grouping of +semantically related documents. Experimental results indicate that our approach +outperforms conventional co-occurrence-based methods in clustering, notably for +documents rich in named entities. + +
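A small sketch of the graph-construction step under stated assumptions: nodes are documents and edge weights are named-entity overlap (Jaccard similarity). The GCN refinement and the LLM embeddings are omitted; any NER tagger can supply the entity sets, and the 0.2 threshold is arbitrary.
<pre><code>
# Sketch: build a weighted document graph from named-entity overlap; the GCN and
# LLM-embedding stages described in the abstract are not shown here.
from itertools import combinations

def entity_similarity(ents_a: set, ents_b: set) -> float:
    if not ents_a or not ents_b:
        return 0.0
    return len(ents_a & ents_b) / len(ents_a | ents_b)

def build_entity_graph(doc_entities: dict, threshold: float = 0.2):
    """Returns weighted edges (doc_i, doc_j, weight) for sufficiently similar pairs."""
    edges = []
    for a, b in combinations(doc_entities, 2):
        w = entity_similarity(doc_entities[a], doc_entities[b])
        if w >= threshold:
            edges.append((a, b, round(w, 3)))
    return edges

docs = {
    "d1": {"OpenAI", "GPT-4", "Microsoft"},
    "d2": {"GPT-4", "Microsoft", "Azure"},
    "d3": {"CERN", "LHC"},
}
print(build_entity_graph(docs))   # d1-d2 connected, d3 isolated
</code></pre>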
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ☆ Think&Cite: Improving Attributed Text Generation with Self-Guided Tree + Search and Progress Reward Modeling + + +
+ Despite their outstanding capabilities, large language models (LLMs) are +prone to hallucination and producing factually incorrect information. This +challenge has spurred efforts in attributed text generation, which prompts LLMs +to generate content with supporting evidence. In this paper, we propose a novel +framework, called Think&Cite, and formulate attributed text generation as a +multi-step reasoning problem integrated with search. Specifically, we propose +Self-Guided Monte Carlo Tree Search (SG-MCTS), which capitalizes on the +self-reflection capability of LLMs to reflect on the intermediate states of +MCTS for guiding the tree expansion process. To provide reliable and +comprehensive feedback, we introduce Progress Reward Models to measure the +progress of tree search from the root to the current state from two aspects, +i.e., generation and attribution progress. We conduct extensive experiments on +three datasets and the results show that our approach significantly outperforms +baseline approaches. + +
+
+
+
+
+ + ☆ DS$^2$-ABSA: Dual-Stream Data Synthesis with Label Refinement for + Few-Shot Aspect-Based Sentiment Analysis + + +
+ Recently developed large language models (LLMs) have presented promising new +avenues to address data scarcity in low-resource scenarios. In few-shot +aspect-based sentiment analysis (ABSA), previous efforts have explored data +augmentation techniques, which prompt LLMs to generate new samples by modifying +existing ones. However, these methods fail to produce adequately diverse data, +impairing their effectiveness. Besides, some studies apply in-context learning +for ABSA by using specific instructions and a few selected examples as prompts. +Though promising, LLMs often yield labels that deviate from task requirements. +To overcome these limitations, we propose DS$^2$-ABSA, a dual-stream data +synthesis framework targeted for few-shot ABSA. It leverages LLMs to synthesize +data from two complementary perspectives: \textit{key-point-driven} and +\textit{instance-driven}, which effectively generate diverse and high-quality +ABSA samples in low-resource settings. Furthermore, a \textit{label refinement} +module is integrated to improve the synthetic labels. Extensive experiments +demonstrate that DS$^2$-ABSA significantly outperforms previous few-shot ABSA +solutions and other LLM-oriented data generation methods. + +
+
+
+
+
+ + ☆ A Survey of RWKV + + +
+ The Receptance Weighted Key Value (RWKV) model offers a novel alternative to +the Transformer architecture, merging the benefits of recurrent and +attention-based systems. Unlike conventional Transformers, which depend heavily +on self-attention, RWKV adeptly captures long-range dependencies with minimal +computational demands. By utilizing a recurrent framework, RWKV addresses some +computational inefficiencies found in Transformers, particularly in tasks with +long sequences. RWKV has recently drawn considerable attention for its robust +performance across multiple domains. Despite its growing popularity, no +systematic review of the RWKV model exists. This paper seeks to fill this gap +as the first comprehensive review of the RWKV architecture, its core +principles, and its varied applications, such as natural language generation, +natural language understanding, and computer vision. We assess how RWKV +compares to traditional Transformer models, highlighting its capability to +manage long sequences efficiently and lower computational costs. Furthermore, +we explore the challenges RWKV encounters and propose potential directions for +future research and advancement. We consistently maintain the related +open-source materials at: https://github.com/MLGroupJLU/RWKV-Survey. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Mapping and Influencing the Political Ideology of Large Language Models + using Synthetic Personas + + +
+ The analysis of political biases in large language models (LLMs) has +primarily examined these systems as single entities with fixed viewpoints. +While various methods exist for measuring such biases, the impact of +persona-based prompting on LLMs' political orientation remains unexplored. In +this work we leverage PersonaHub, a collection of synthetic persona +descriptions, to map the political distribution of persona-based prompted LLMs +using the Political Compass Test (PCT). We then examine whether these initial +compass distributions can be manipulated through explicit ideological prompting +towards diametrically opposed political orientations: right-authoritarian and +left-libertarian. Our experiments reveal that synthetic personas predominantly +cluster in the left-libertarian quadrant, with models demonstrating varying +degrees of responsiveness when prompted with explicit ideological descriptors. +While all models demonstrate significant shifts towards right-authoritarian +positions, they exhibit more limited shifts towards left-libertarian positions, +suggesting an asymmetric response to ideological manipulation that may reflect +inherent biases in model training. + +
+
+ comment: 4 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ DynamicKV: Task-Aware Adaptive KV Cache Compression for Long Context + LLMs + + +
+ Efficient KV cache management in LLMs is crucial for long-context tasks like +RAG and summarization. Existing KV cache compression methods enforce a fixed +pattern, neglecting task-specific characteristics and reducing the retention of +essential information. However, we observe distinct activation patterns across +layers in various tasks, highlighting the need for adaptive strategies tailored +to each task's unique demands. Based on this insight, we propose DynamicKV, a +method that dynamically optimizes token retention by adjusting the number of +tokens retained at each layer to adapt to the specific task. DynamicKV +establishes global and per-layer maximum KV cache budgets, temporarily +retaining the maximum budget for the current layer, and periodically updating +the KV cache sizes of all preceding layers during inference. Our method retains +only 1.7% of the KV cache size while achieving ~85% of the Full KV cache +performance on LongBench. Notably, even under extreme compression (0.9%), +DynamicKV surpasses state-of-the-art (SOTA) methods by 11% in the +Needle-in-a-Haystack test using Mistral-7B-Instruct-v0.2. The code will be +released. + +
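A rough sketch of the task-adaptive retention idea, under the assumption that accumulated attention mass is the per-token importance signal: each layer keeps its highest-scoring cached tokens, and layers whose attention is more spread out receive a larger share of a global budget. This is a simplification for illustration, not the authors' DynamicKV implementation.
<pre><code>
# Illustrative only: per-layer KV retention under a shared global budget, with the
# per-layer share set by how spread out (high entropy) that layer's attention is.
import numpy as np

def allocate_budgets(attn_per_layer, global_budget):
    """attn_per_layer: list of (seq_len,) arrays of accumulated attention mass."""
    spread = np.array([(-a / a.sum() * np.log(a / a.sum() + 1e-12)).sum()
                       for a in attn_per_layer])
    shares = spread / spread.sum()
    return np.maximum(1, (shares * global_budget).astype(int))

def compress_kv(attn_per_layer, global_budget):
    budgets = allocate_budgets(attn_per_layer, global_budget)
    kept = []
    for scores, k in zip(attn_per_layer, budgets):
        kept.append(np.sort(np.argsort(scores)[-k:]))   # indices of retained tokens
    return kept

rng = np.random.default_rng(0)
attn = [rng.random(32) for _ in range(4)]                # 4 layers, 32 cached tokens each
for layer, idx in enumerate(compress_kv(attn, global_budget=24)):
    print(f"layer {layer}: keep {len(idx)} tokens")
</code></pre>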
+
+
+
+
+ + ☆ Progressive Multimodal Reasoning via Active Retrieval + + +
+ Multi-step multimodal reasoning tasks pose significant challenges for +multimodal large language models (MLLMs), and finding effective ways to enhance +their performance in such scenarios remains an unresolved issue. In this paper, +we propose AR-MCTS, a universal framework designed to progressively improve the +reasoning capabilities of MLLMs through Active Retrieval (AR) and Monte Carlo +Tree Search (MCTS). Our approach begins with the development of a unified +retrieval module that retrieves key supporting insights for solving complex +reasoning problems from a hybrid-modal retrieval corpus. To bridge the gap in +automated multimodal reasoning verification, we employ the MCTS algorithm +combined with an active retrieval mechanism, which enables the automatic +generation of step-wise annotations. This strategy dynamically retrieves key +insights for each reasoning step, moving beyond traditional beam search +sampling to improve the diversity and reliability of the reasoning space. +Additionally, we introduce a process reward model that aligns progressively to +support the automatic verification of multimodal reasoning tasks. Experimental +results across three complex multimodal reasoning benchmarks confirm the +effectiveness of the AR-MCTS framework in enhancing the performance of various +multimodal models. Further analysis demonstrates that AR-MCTS can optimize +sampling diversity and accuracy, yielding reliable multimodal reasoning. + +
+
+ comment: Working in progress +
+
+
+
+
+ + ☆ Mention Attention for Pronoun Translation ACL + + +
Most pronouns are referring expressions: computers need to resolve what the pronouns refer to, and pronoun usage diverges across languages. Dealing with these divergences and translating pronouns is therefore a challenge in machine translation. Mentions are referring candidates for pronouns and are more closely related to pronouns than general tokens are. We assume that extracting additional mention features can help pronoun translation. Therefore, we introduce an additional mention attention module in the decoder to pay extra attention to source mentions rather than non-mention tokens. Our mention attention module not only extracts features from source mentions, but also considers target-side context, which benefits pronoun translation. In addition, we introduce two mention classifiers to train models to recognize mentions, whose outputs guide the mention attention. We conduct experiments on the WMT17 English-German translation task and evaluate our models on general translation and pronoun translation, using BLEU, APT, and contrastive evaluation metrics. Our proposed model outperforms the baseline Transformer model in terms of APT and BLEU scores, which confirms our hypothesis that pronoun translation can be improved by paying additional attention to source mentions, and shows that the introduced modules do not have a negative effect on general translation quality.
+
+ comment: camera-ready version of the paper accepted by JCRAI-23 conference, in + ACL format +
+
+
+
+
☆ ResoFilter: Fine-grained Synthetic Data Filtering for Large Language Models through Data-Parameter Resonance Analysis
+ Large language models (LLMs) have shown remarkable effectiveness across +various domains, with data augmentation methods utilizing GPT for synthetic +data generation becoming prevalent. However, the quality and utility of +augmented data remain questionable, and current methods lack clear metrics for +evaluating data characteristics. To address these challenges, we propose +ResoFilter, a novel method that integrates models, data, and tasks to refine +datasets. ResoFilter leverages the fine-tuning process to obtain Data-Parameter +features for data selection, offering improved interpretability by representing +data characteristics through model weights. Our experiments demonstrate that +ResoFilter achieves comparable results to full-scale fine-tuning using only +half the data in mathematical tasks and exhibits strong generalization across +different models and domains. This method provides valuable insights for +constructing synthetic datasets and evaluating high-quality data, offering a +promising solution for enhancing data augmentation techniques and improving +training dataset quality for LLMs. For reproducibility, we will release our +code and data upon acceptance. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Disentangling Reasoning Tokens and Boilerplate Tokens For Language Model + Fine-tuning + + +
+ When using agent-task datasets to enhance agent capabilities for Large +Language Models (LLMs), current methodologies often treat all tokens within a +sample equally. However, we argue that tokens serving different roles - +specifically, reasoning tokens versus boilerplate tokens (e.g., those governing +output format) - differ significantly in importance and learning complexity, +necessitating their disentanglement and distinct treatment. To address this, we +propose a novel Shuffle-Aware Discriminator (SHAD) for adaptive token +discrimination. SHAD classifies tokens by exploiting predictability differences +observed after shuffling input-output combinations across samples: boilerplate +tokens, due to their repetitive nature among samples, maintain predictability, +whereas reasoning tokens do not. Using SHAD, we propose the +Reasoning-highlighted Fine-Tuning (RFT) method, which adaptively emphasizes +reasoning tokens during fine-tuning, yielding notable performance gains over +common Supervised Fine-Tuning (SFT). + +
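A minimal sketch of the reasoning-highlighted loss, assuming PyTorch: once a discriminator has tagged each target token as reasoning or boilerplate, the reasoning tokens are upweighted in the cross-entropy. The SHAD classifier itself is not shown; the boolean mask and the boost factor below stand in for its output.
<pre><code>
# Sketch of reasoning-weighted fine-tuning loss; `is_reasoning` is a stand-in for
# the SHAD discriminator's per-token decision.
import torch
import torch.nn.functional as F

def reasoning_weighted_loss(logits, targets, is_reasoning, boost: float = 2.0):
    """logits: (T, vocab); targets: (T,); is_reasoning: (T,) bool mask."""
    per_token = F.cross_entropy(logits, targets, reduction="none")
    weights = torch.where(is_reasoning,
                          torch.full_like(per_token, boost),
                          torch.ones_like(per_token))
    return (weights * per_token).sum() / weights.sum()

logits = torch.randn(6, 100, requires_grad=True)
targets = torch.randint(0, 100, (6,))
is_reasoning = torch.tensor([False, False, True, True, True, False])
loss = reasoning_weighted_loss(logits, targets, is_reasoning)
loss.backward()
print(float(loss))
</code></pre>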
+
+
+
+
+ + ☆ ALKAFI-LLAMA3: Fine-Tuning LLMs for Precise Legal Understanding in + Palestine + + +
+ Large Language Models (LLMs) have demonstrated remarkable potential in +diverse domains, yet their application in the legal sector, particularly in +low-resource contexts, remains limited. This study addresses the challenges of +adapting LLMs to the Palestinian legal domain, where political instability, +fragmented legal frameworks, and limited AI resources hinder effective +machine-learning applications. We present a fine-tuned model based on a +quantized version of Llama-3.2-1B-Instruct, trained on a synthetic data set +derived from Palestinian legal texts. Using smaller-scale models and +strategically generated question-answer pairs, we achieve a cost-effective, +locally sustainable solution that provides accurate and contextually relevant +legal guidance. Our experiments demonstrate promising performance on various +query types, ranging from yes/no questions and narrative explanations to +complex legal differentiations, while highlighting areas for improvement, such +as handling calculation-based inquiries and structured list formatting. This +work provides a pathway for the deployment of AI-driven legal assistance tools +tailored to the needs of resource-constrained environments. + +
+
+
+
+
+ + ☆ PsyDraw: A Multi-Agent Multimodal System for Mental Health Screening in + Left-Behind Children + + +
Left-behind children (LBCs), numbering over 66 million in China, face severe mental health challenges due to parental migration for work. Early screening and identification of at-risk LBCs is crucial, yet challenging due to the severe shortage of mental health professionals, especially in rural areas. While the House-Tree-Person (HTP) test shows higher child participation rates, its requirement for expert interpretation limits its application in resource-scarce regions. To address this challenge, we propose PsyDraw, a multi-agent system based on Multimodal Large Language Models that assists mental health professionals in analyzing HTP drawings. The system employs specialized agents for feature extraction and psychological interpretation, operating in two stages: comprehensive feature analysis and professional report generation. Evaluation of HTP drawings from 290 primary school students reveals that 71.03% of the analyses achieved High Consistency with professional evaluations, 26.21% Moderate Consistency, and only 2.41% Low Consistency. The system identified 31.03% of cases as requiring professional attention, demonstrating its effectiveness as a preliminary screening tool. Currently deployed in pilot schools, PsyDraw shows promise in supporting mental health professionals, particularly in resource-limited areas, while maintaining high professional standards in psychological assessment.
+
+ comment: preprint +
+
+
+
+
+ + ☆ Query pipeline optimization for cancer patient question answering + systems + + +
+ Retrieval-augmented generation (RAG) mitigates hallucination in Large +Language Models (LLMs) by using query pipelines to retrieve relevant external +information and grounding responses in retrieved knowledge. However, query +pipeline optimization for cancer patient question-answering (CPQA) systems +requires separately optimizing multiple components with domain-specific +considerations. We propose a novel three-aspect optimization approach for the +RAG query pipeline in CPQA systems, utilizing public biomedical databases like +PubMed and PubMed Central. Our optimization includes: (1) document retrieval, +utilizing a comparative analysis of NCBI resources and introducing Hybrid +Semantic Real-time Document Retrieval (HSRDR); (2) passage retrieval, +identifying optimal pairings of dense retrievers and rerankers; and (3) +semantic representation, introducing Semantic Enhanced Overlap Segmentation +(SEOS) for improved contextual understanding. On a custom-developed dataset +tailored for cancer-related inquiries, our optimized RAG approach improved the +answer accuracy of Claude-3-haiku by 5.24% over chain-of-thought prompting and +about 3% over a naive RAG setup. This study highlights the importance of +domain-specific query optimization in realizing the full potential of RAG and +provides a robust framework for building more accurate and reliable CPQA +systems, advancing the development of RAG-based biomedical systems. + +
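+
+ The passage-retrieval stage above pairs a dense retriever with a reranker; the
+sketch below shows that generic retrieve-then-rerank pattern. The specific
+checkpoints and top-k values are placeholders rather than the pairing the study
+found optimal, and HSRDR/SEOS are not reproduced.
+
+```python
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+
+# placeholder models; the paper searches for the optimal retriever/reranker pair
+retriever = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
+
+def retrieve_then_rerank(query, passages, k_retrieve=50, k_final=5):
+    # stage 1: dense retrieval over the passage pool
+    q_emb = retriever.encode(query, convert_to_tensor=True)
+    p_emb = retriever.encode(passages, convert_to_tensor=True)
+    hits = util.semantic_search(q_emb, p_emb, top_k=k_retrieve)[0]
+    candidates = [passages[h["corpus_id"]] for h in hits]
+    # stage 2: cross-encoder reranking of the retrieved candidates
+    scores = reranker.predict([(query, p) for p in candidates])
+    ranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
+    return [p for p, _ in ranked[:k_final]]
+```
+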
+
+
+
+
+ + ☆ On Verbalized Confidence Scores for LLMs + + +
+ The rise of large language models (LLMs) and their tight integration into our +daily life make it essential to dedicate efforts towards their trustworthiness. +Uncertainty quantification for LLMs can establish more human trust into their +responses, but also allows LLM agents to make more informed decisions based on +each other's uncertainty. To estimate the uncertainty in a response, internal +token logits, task-specific proxy models, or sampling of multiple responses are +commonly used. This work focuses on asking the LLM itself to verbalize its +uncertainty with a confidence score as part of its output tokens, which is a +promising way for prompt- and model-agnostic uncertainty quantification with +low overhead. Using an extensive benchmark, we assess the reliability of +verbalized confidence scores with respect to different datasets, models, and +prompt methods. Our results reveal that the reliability of these scores +strongly depends on how the model is asked, but also that it is possible to +extract well-calibrated confidence scores with certain prompt methods. We argue +that verbalized confidence scores can become a simple but effective and +versatile uncertainty quantification method in the future. Our code is +available at https://github.com/danielyxyang/llm-verbalized-uq . + +
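+
+ A small sketch of the setup: the model is prompted to verbalize a confidence
+score next to its answer, and calibration is then measured over (confidence,
+correctness) pairs, e.g. with expected calibration error. The prompt wording
+and binning below are illustrative; the paper benchmarks many prompt variants.
+
+```python
+import numpy as np
+
+PROMPT = (
+    "Answer the question, then state how confident you are.\n"
+    "Question: {question}\n"
+    "Reply as: 'Answer: <answer> | Confidence: <0-100>'"
+)  # one possible phrasing; reliability depends heavily on how the model is asked
+
+def expected_calibration_error(confidences, correct, n_bins=10):
+    # confidences in [0, 1], correct as booleans; standard equal-width binning
+    confidences = np.asarray(confidences)
+    correct = np.asarray(correct, dtype=float)
+    edges = np.linspace(0.0, 1.0, n_bins + 1)
+    ece = 0.0
+    for lo, hi in zip(edges[:-1], edges[1:]):
+        mask = (confidences > lo) & (confidences <= hi)
+        if mask.any():
+            ece += mask.mean() * abs(correct[mask].mean() - confidences[mask].mean())
+    return ece
+```
+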
+
+
+
+
+ + ☆ How to Synthesize Text Data without Model Collapse? + + +
+ Model collapse in synthetic data indicates that iterative training on +self-generated data leads to a gradual decline in performance. With the +proliferation of AI models, synthetic data will fundamentally reshape the web +data ecosystem. Future GPT-$\{n\}$ models will inevitably be trained on a blend +of synthetic and human-produced data. In this paper, we focus on two questions: +what is the impact of synthetic data on language model training, and how to +synthesize data without model collapse? We first pre-train language models +across different proportions of synthetic data, revealing a negative +correlation between the proportion of synthetic data and model performance. We +further conduct statistical analysis on synthetic data to uncover +distributional shift phenomenon and over-concentration of n-gram features. +Inspired by the above findings, we propose token editing on human-produced data +to obtain semi-synthetic data. As a proof of concept, we theoretically +demonstrate that token-level editing can prevent model collapse, as the test +error is constrained by a finite upper bound. We conduct extensive experiments +on pre-training from scratch, continual pre-training, and supervised +fine-tuning. The results validate our theoretical proof that token-level +editing improves data quality and enhances model performance. + +
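+
+ One plausible reading of token-level editing of human-produced data, sketched
+below: a small fraction of positions in a human-written sequence is resampled
+from a language model's top-k next-token distribution, yielding semi-synthetic
+text. The editing rule, edit rate, and top-k choice are assumptions for
+illustration, not the paper's procedure.
+
+```python
+import torch
+
+def token_edit(input_ids, lm, edit_prob=0.1, top_k=20):
+    # input_ids: (1, seq) human-written token ids; lm: a Hugging Face-style causal LM
+    with torch.no_grad():
+        logits = lm(input_ids).logits              # (1, seq, vocab)
+    edited = input_ids.clone()
+    for t in range(1, input_ids.size(1)):
+        if torch.rand(1).item() < edit_prob:
+            # resample position t from the model's top-k distribution given the prefix
+            probs = torch.softmax(logits[0, t - 1], dim=-1)
+            top = torch.topk(probs, top_k)
+            pick = torch.multinomial(top.values, 1)
+            edited[0, t] = top.indices[pick].item()
+    return edited
+```
+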
+
+
+
+
+ + ☆ Each Fake News is Fake in its Own Way: An Attribution Multi-Granularity + Benchmark for Multimodal Fake News Detection + + +
+ Social platforms, while facilitating access to information, have also become
+saturated with a plethora of fake news, resulting in negative consequences.
+Automatic multimodal fake news detection is a worthwhile pursuit. Existing
+multimodal fake news datasets only provide binary labels of real or fake.
+However, real news is alike, while each fake news is fake in its own way. These
+datasets fail to reflect the mixed nature of various types of multimodal fake
+news. To bridge the gap, we construct an attribution multi-granularity
+multimodal fake news detection dataset, AMG, revealing the inherent fake
+pattern. Furthermore, we propose a multi-granularity clue alignment model to
+achieve multimodal fake news detection and attribution. Experimental results
+demonstrate that AMG is a challenging dataset, and its attribution setting
+opens up new avenues for future research.
+
+
+
+
+
+ + ☆ LLMs as mediators: Can they diagnose conflicts accurately? + + +
+ Prior research indicates that to be able to mediate conflict, observers of
+disagreements between parties must be able to reliably distinguish the sources
+of their disagreement as stemming from differences in beliefs about what is
+true (causality) vs. differences in what they value (morality). In this paper,
+we test if OpenAI's Large Language Models GPT-3.5 and GPT-4 can perform this
+task and whether one or the other type of disagreement proves particularly
+challenging for LLMs to diagnose. We replicate study 1 in Koçak et al.
+(2003), which employs a vignette design, with OpenAI's GPT-3.5 and GPT-4. We
+find that both LLMs have similar semantic understanding of the distinction
+between causal and moral codes as humans and can reliably distinguish between
+them. When asked to diagnose the source of disagreement in a conversation, both
+LLMs, compared to humans, exhibit a tendency to overestimate the extent of
+causal disagreement and underestimate the extent of moral disagreement in the
+moral misalignment condition. This tendency is especially pronounced for GPT-4
+when using a proximate scale that relies on concrete language specific to an
+issue. GPT-3.5 does not perform as well as GPT-4 or humans when using either
+the proximate or the distal scale. The study provides a first test of the
+potential for using LLMs to mediate conflict by diagnosing the root of
+disagreements in causal and evaluative codes.
+
+
+ comment: 27 pages, 2 appendices, 21 tables (incl appendices) +
+
+
+
+
+ + ☆ Analysis and Visualization of Linguistic Structures in Large Language + Models: Neural Representations of Verb-Particle Constructions in BERT + + +
+ This study investigates the internal representations of verb-particle +combinations within transformer-based large language models (LLMs), +specifically examining how these models capture lexical and syntactic nuances +at different neural network layers. Employing the BERT architecture, we analyse +the representational efficacy of its layers for various verb-particle +constructions such as 'agree on', 'come back', and 'give up'. Our methodology +includes a detailed dataset preparation from the British National Corpus, +followed by extensive model training and output analysis through techniques +like multi-dimensional scaling (MDS) and generalized discrimination value (GDV) +calculations. Results show that BERT's middle layers most effectively capture +syntactic structures, with significant variability in representational accuracy +across different verb categories. These findings challenge the conventional +uniformity assumed in neural network processing of linguistic elements and +suggest a complex interplay between network architecture and linguistic +representation. Our research contributes to a better understanding of how deep +learning models comprehend and process language, offering insights into the +potential and limitations of current neural approaches to linguistic analysis. +This study not only advances our knowledge in computational linguistics but +also prompts further research into optimizing neural architectures for enhanced +linguistic precision. + +
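+
+ A compact sketch of the probing recipe: extract hidden states for
+verb-particle phrases at a chosen BERT layer and project them to 2D with
+multi-dimensional scaling. The phrase list, mean-pooling, and layer index are
+illustrative choices; the GDV computation is omitted.
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModel
+from sklearn.manifold import MDS
+
+tok = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
+phrases = ["agree on the terms", "come back tomorrow", "give up the plan"]
+
+def layer_embedding(text, layer):
+    inputs = tok(text, return_tensors="pt")
+    with torch.no_grad():
+        hidden = model(**inputs).hidden_states[layer]   # (1, seq, dim)
+    return hidden.mean(dim=1).squeeze(0).numpy()        # mean-pool over tokens
+
+vectors = [layer_embedding(p, layer=6) for p in phrases]  # one of BERT's middle layers
+coords = MDS(n_components=2, random_state=0).fit_transform(vectors)
+print(coords)
+```
+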
+
+
+
+
+ + ☆ Unveiling Uncertainty: A Deep Dive into Calibration and Performance of + Multimodal Large Language Models COLING 2025 + + +
+ Multimodal large language models (MLLMs) combine visual and textual data for
+tasks such as image captioning and visual question answering. Proper
+uncertainty calibration is crucial, yet challenging, for reliable use in areas
+like healthcare and autonomous driving. This paper investigates representative
+MLLMs, focusing on their calibration across various scenarios, including before
+and after visual fine-tuning, as well as before and after multimodal training
+of the base LLMs. We observed miscalibration in their performance, and at the
+same time, no significant differences in calibration across these scenarios. We
+also highlight how uncertainty differs between text and images and how their
+integration affects overall uncertainty. To better understand MLLMs'
+miscalibration and their ability to self-assess uncertainty, we construct the
+IDK (I don't know) dataset, which is key to evaluating how they handle
+unknowns. Our findings reveal that MLLMs tend to give answers rather than admit
+uncertainty, but this self-assessment improves with proper prompt adjustments.
+Finally, to calibrate MLLMs and enhance model reliability, we propose
+techniques such as temperature scaling and iterative prompt optimization. Our
+results provide insights into improving MLLMs for effective and responsible
+deployment in multimodal applications. Code and IDK dataset:
+https://github.com/hfutml/Calibration-MLLM.
+
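+
+ Temperature scaling, one of the calibration techniques mentioned above, can be
+fit post hoc on held-out logits and labels as in the sketch below; optimizer
+settings are arbitrary and the iterative prompt optimization part is not shown.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def fit_temperature(logits, labels, steps=200, lr=0.01):
+    # logits: (N, num_classes) held-out model outputs; labels: (N,) true classes
+    log_t = torch.zeros(1, requires_grad=True)     # optimize log T so that T > 0
+    opt = torch.optim.Adam([log_t], lr=lr)
+    for _ in range(steps):
+        opt.zero_grad()
+        loss = F.cross_entropy(logits / log_t.exp(), labels)
+        loss.backward()
+        opt.step()
+    return log_t.exp().item()   # divide future logits by this temperature
+```
+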
+
+ comment: Accepted to COLING 2025 +
+
+
+
+
+ + ☆ Length Controlled Generation for Black-box LLMs + + +
+ Large language models (LLMs) have demonstrated impressive instruction +following capabilities, while still struggling to accurately manage the length +of the generated text, which is a fundamental requirement in many real-world +applications. Existing length control methods involve fine-tuning the +parameters of LLMs, which is inefficient and suboptimal for practical use. In +this paper, we propose a novel iterative sampling framework for text length +control, integrating the Metropolis-Hastings algorithm with an importance +sampling acceleration strategy. This framework efficiently and reliably +regulates LLMs to generate length-constrained text without modifying the +underlying parameters, thereby preserving the original capabilities of LLMs. +Experimental results demonstrate that our framework achieves almost 100\% +success rates of length control on Llama3.1 for tasks such as length-controlled +abstractive summarization and length-constrained instruction following, with +minimal additional computational overhead. This also highlights the significant +potential of our method for precise length control across a broader range of +applications, without compromising the versatility of LLMs. + +
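+
+ The sketch below shows the general shape of length control via
+Metropolis-Hastings over whole generations: proposals come from the unmodified
+LLM, and the target distribution reweights the LLM by a soft length penalty, so
+the acceptance ratio depends only on that penalty. The penalty form is an
+assumption and the paper's importance-sampling acceleration is not reproduced.
+
+```python
+import math
+import random
+
+def mh_length_control(sample_from_lm, target_len, iters=20, alpha=0.1):
+    # sample_from_lm() returns one full generation from the unmodified model
+    def log_penalty(text):
+        return -alpha * abs(len(text.split()) - target_len)
+
+    current = sample_from_lm()
+    for _ in range(iters):
+        proposal = sample_from_lm()
+        # independence proposal from the LM itself: the LM terms cancel and
+        # acceptance reduces to the ratio of length penalties
+        accept = math.exp(min(0.0, log_penalty(proposal) - log_penalty(current)))
+        if random.random() < accept:
+            current = proposal
+    return current
+```
+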
+
+ comment: Preprint +
+
+
+
+
+ + ☆ TOMG-Bench: Evaluating LLMs on Text-based Open Molecule Generation + + +
+ In this paper, we propose Text-based Open Molecule Generation Benchmark +(TOMG-Bench), the first benchmark to evaluate the open-domain molecule +generation capability of LLMs. TOMG-Bench encompasses a dataset of three major +tasks: molecule editing (MolEdit), molecule optimization (MolOpt), and +customized molecule generation (MolCustom). Each task further contains three +subtasks, with each subtask comprising 5,000 test samples. Given the inherent +complexity of open molecule generation, we have also developed an automated +evaluation system that helps measure both the quality and the accuracy of the +generated molecules. Our comprehensive benchmarking of 25 LLMs reveals the +current limitations and potential areas for improvement in text-guided molecule +discovery. Furthermore, with the assistance of OpenMolIns, a specialized +instruction tuning dataset proposed for solving challenges raised by +TOMG-Bench, Llama3.1-8B could outperform all the open-source general LLMs, even +surpassing GPT-3.5-turbo by 46.5\% on TOMG-Bench. Our codes and datasets are +available through https://github.com/phenixace/TOMG-Bench. + +
+
+ comment: A benchmark for text-based open molecule generation +
+
+
+
+
+ + ☆ Learning to Generate Research Idea with Dynamic Control + + +
+ The rapid advancements in large language models (LLMs) have demonstrated +their potential to accelerate scientific discovery, particularly in automating +the process of research ideation. LLM-based systems have shown promise in +generating hypotheses and research ideas. However, current approaches +predominantly rely on prompting-based pre-trained models, limiting their +ability to optimize generated content effectively. Moreover, they also lack the +capability to deal with the complex interdependence and inherent restrictions +among novelty, feasibility, and effectiveness, which remains challenging due to +the inherent trade-offs among these dimensions, such as the +innovation-feasibility conflict. To address these limitations, we for the first +time propose fine-tuning LLMs to be better idea proposers and introduce a novel +framework that employs a two-stage approach combining Supervised Fine-Tuning +(SFT) and controllable Reinforcement Learning (RL). In the SFT stage, the model +learns foundational patterns from pairs of research papers and follow-up ideas. +In the RL stage, multi-dimensional reward modeling, guided by fine-grained +feedback, evaluates and optimizes the generated ideas across key metrics. +Dimensional controllers enable dynamic adjustment of generation, while a +sentence-level decoder ensures context-aware emphasis during inference. Our +framework provides a balanced approach to research ideation, achieving +high-quality outcomes by dynamically navigating the trade-offs among novelty, +feasibility, and effectiveness. + +
+
+
+
+
+ + ☆ How good is GPT at writing political speeches for the White House? + + +
+ Using large language models (LLMs), computers are able to generate a written
+text in response to a user request. As this pervasive technology can be
+applied in numerous contexts, this study analyses the written style of one LLM
+called GPT by comparing its generated speeches with those of the recent US
+presidents. To achieve this objective, the State of the Union (SOTU) addresses
+written from Reagan to Biden are contrasted to those produced by both the
+GPT-3.5 and GPT-4o versions. Compared to US presidents, GPT tends to overuse
+the lemma "we" and produce shorter messages with, on average, longer sentences.
+Moreover, GPT adopts an optimistic tone, opting more often for political (e.g.,
+president, Congress), symbolic (e.g., freedom), and abstract terms (e.g.,
+freedom). Even when imposing an author's style on GPT, the resulting speech
+remains distinct from addresses written by the target author. Finally, the two
+GPT versions present distinct characteristics, but both appear overall
+dissimilar to true presidential messages.
+
+
+
+
+
+ + ☆ HarmonicEval: Multi-modal, Multi-task, Multi-criteria Automatic + Evaluation Using a Vision Language Model + + +
+ Vision-language models (VLMs) have shown impressive abilities in text and +image understanding. However, existing metrics for evaluating the text +generated by VLMs focus exclusively on overall quality, leading to two +limitations: 1) it is challenging to identify which aspects of the text need +improvement from the overall score; 2) metrics may overlook specific evaluation +criteria when predicting an overall score. To address these limitations, we +propose HarmonicEval, a reference-free evaluation metric that aggregates +criterion-wise scores to produce the overall score in a bottom-up manner. +Furthermore, we construct the Multi-task Multi-criteria Human Evaluation (MMHE) +dataset, which comprises 18,000 expert human judgments across four +vision-language tasks. Our experiments demonstrate that HarmonicEval achieves +higher correlations with human judgments than conventional metrics while +providing numerical scores for each criterion. + +
+
+
+
+
+ + ☆ KARRIEREWEGE: A Large Scale Career Path Prediction Dataset COLING + + +
+ Accurate career path prediction can support many stakeholders, like job +seekers, recruiters, HR, and project managers. However, publicly available data +and tools for career path prediction are scarce. In this work, we introduce +KARRIEREWEGE, a comprehensive, publicly available dataset containing over 500k +career paths, significantly surpassing the size of previously available +datasets. We link the dataset to the ESCO taxonomy to offer a valuable resource +for predicting career trajectories. To tackle the problem of free-text inputs +typically found in resumes, we enhance it by synthesizing job titles and +descriptions resulting in KARRIEREWEGE+. This allows for accurate predictions +from unstructured data, closely aligning with real-world application +challenges. We benchmark existing state-of-the-art (SOTA) models on our dataset +and a prior benchmark and observe improved performance and robustness, +particularly for free-text use cases, due to the synthesized data. + +
+
+ comment: Accepted at COLING Industry Track +
+
+
+
+
+ + ☆ LDP: Generalizing to Multilingual Visual Information Extraction by + Language Decoupled Pretraining AAAI2025 + + +
+ Visual Information Extraction (VIE) plays a crucial role in the comprehension +of semi-structured documents, and several pre-trained models have been +developed to enhance performance. However, most of these works are monolingual +(usually English). Due to the extremely unbalanced quantity and quality of +pre-training corpora between English and other languages, few works can extend +to non-English scenarios. In this paper, we conduct systematic experiments to +show that vision and layout modality hold invariance among images with +different languages. If decoupling language bias from document images, a +vision-layout-based model can achieve impressive cross-lingual generalization. +Accordingly, we present a simple but effective multilingual training paradigm +LDP (Language Decoupled Pre-training) for better utilization of monolingual +pre-training data. Our proposed model LDM (Language Decoupled Model) is first +pre-trained on the language-independent data, where the language knowledge is +decoupled by a diffusion model, and then the LDM is fine-tuned on the +downstream languages. Extensive experiments show that the LDM outperformed all +SOTA multilingual pre-trained models, and also maintains competitiveness on +downstream monolingual/English benchmarks. + +
+
+ comment: Accepted by AAAI2025 +
+
+
+
+
+ + ☆ Beyond Guilt: Legal Judgment Prediction with Trichotomous Reasoning + + +
+ In legal practice, judges apply the trichotomous dogmatics of criminal law, +sequentially assessing the elements of the offense, unlawfulness, and +culpability to determine whether an individual's conduct constitutes a crime. +Although current legal large language models (LLMs) show promising accuracy in +judgment prediction, they lack trichotomous reasoning capabilities due to the +absence of an appropriate benchmark dataset, preventing them from predicting +innocent outcomes. As a result, every input is automatically assigned a charge, +limiting their practical utility in legal contexts. To bridge this gap, we +introduce LJPIV, the first benchmark dataset for Legal Judgment Prediction with +Innocent Verdicts. Adhering to the trichotomous dogmatics, we extend three +widely-used legal datasets through LLM-based augmentation and manual +verification. Our experiments with state-of-the-art legal LLMs and novel +strategies that integrate trichotomous reasoning into zero-shot prompting and +fine-tuning reveal: (1) current legal LLMs have significant room for +improvement, with even the best models achieving an F1 score of less than 0.3 +on LJPIV; and (2) our strategies notably enhance both in-domain and +cross-domain judgment prediction accuracy, especially for cases resulting in an +innocent verdict. + +
+
+
+
+
+ + ☆ Simulation-Free Hierarchical Latent Policy Planning for Proactive + Dialogues AAAI 2025 + + +
+ Recent advancements in proactive dialogues have garnered significant +attention, particularly for more complex objectives (e.g. emotion support and +persuasion). Unlike traditional task-oriented dialogues, proactive dialogues +demand advanced policy planning and adaptability, requiring rich scenarios and +comprehensive policy repositories to develop such systems. However, existing +approaches tend to rely on Large Language Models (LLMs) for user simulation and +online learning, leading to biases that diverge from realistic scenarios and +result in suboptimal efficiency. Moreover, these methods depend on manually +defined, context-independent, coarse-grained policies, which not only incur +high expert costs but also raise concerns regarding their completeness. In our +work, we highlight the potential for automatically discovering policies +directly from raw, real-world dialogue records. To this end, we introduce a +novel dialogue policy planning framework, LDPP. It fully automates the process +from mining policies in dialogue records to learning policy planning. +Specifically, we employ a variant of the Variational Autoencoder to discover +fine-grained policies represented as latent vectors. After automatically +annotating the data with these latent policy labels, we propose an Offline +Hierarchical Reinforcement Learning (RL) algorithm in the latent space to +develop effective policy planning capabilities. Our experiments demonstrate +that LDPP outperforms existing methods on two proactive scenarios, even +surpassing ChatGPT with only a 1.8-billion-parameter LLM. + +
+
+ comment: 24 pages, 5 figures, AAAI 2025
+
+
+
+
+
+ + ☆ CORD: Balancing COnsistency and Rank Distillation for Robust + Retrieval-Augmented Generation + + +
+ With the adoption of retrieval-augmented generation (RAG), large language +models (LLMs) are expected to ground their generation to the retrieved +contexts. Yet, this is hindered by position bias of LLMs, failing to evenly +attend to all contexts. Previous work has addressed this by synthesizing +contexts with perturbed positions of gold segment, creating a +position-diversified train set. We extend this intuition to propose consistency +regularization with augmentation and distillation. First, we augment each +training instance with its position perturbation to encourage consistent +predictions, regardless of ordering. We also distill behaviors of this pair, +although it can be counterproductive in certain RAG scenarios where the given +order from the retriever is crucial for generation quality. We thus propose +CORD, balancing COnsistency and Rank Distillation. CORD adaptively samples +noise-controlled perturbations from an interpolation space, ensuring both +consistency and respect for the rank prior. Empirical results show this balance +enables CORD to outperform consistently in diverse RAG benchmarks. + +
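+
+ A simplified rendering of the consistency half of the idea: penalize
+divergence between predictions computed with the original context ordering and
+with a position-perturbed ordering. CORD's noise-controlled interpolation and
+rank-distillation balancing are not reproduced; the symmetric KL and the weight
+lam are assumptions.
+
+```python
+import torch.nn.functional as F
+
+def consistency_loss(logits_orig, logits_perm, lam=1.0):
+    # logits_orig / logits_perm: same-shaped next-token logits, computed with the
+    # original and a position-perturbed ordering of the retrieved contexts
+    p = F.log_softmax(logits_orig, dim=-1)
+    q = F.log_softmax(logits_perm, dim=-1)
+    kl_pq = F.kl_div(q, p, log_target=True, reduction="batchmean")
+    kl_qp = F.kl_div(p, q, log_target=True, reduction="batchmean")
+    return lam * 0.5 * (kl_pq + kl_qp)
+```
+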
+
+
+
+
+ + ☆ Sliding Windows Are Not the End: Exploring Full Ranking with + Long-Context Large Language Models + + +
+ Large Language Models (LLMs) have shown exciting performance in listwise +passage ranking. Due to the limited input length, existing methods often adopt +the sliding window strategy. Such a strategy, though effective, is inefficient +as it involves repetitive and serialized processing, which usually re-evaluates +relevant passages multiple times. As a result, it incurs redundant API costs, +which are proportional to the number of inference tokens. The development of +long-context LLMs enables the full ranking of all passages within a single +inference, avoiding redundant API costs. In this paper, we conduct a +comprehensive study of long-context LLMs for ranking tasks in terms of +efficiency and effectiveness. Surprisingly, our experiments reveal that full +ranking with long-context LLMs can deliver superior performance in the +supervised fine-tuning setting with a huge efficiency improvement. Furthermore, +we identify two limitations of fine-tuning the full ranking model based on +existing methods: (1) sliding window strategy fails to produce a full ranking +list as a training label, and (2) the language modeling loss cannot emphasize +top-ranked passage IDs in the label. To alleviate these issues, we propose a +new complete listwise label construction approach and a novel importance-aware +learning objective for full ranking. Experiments show the superior performance +of our method over baselines. Our codes are available at +\url{https://github.com/8421BCD/fullrank}. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ CitaLaw: Enhancing LLM with Citations in Legal Domain + + +
+ In this paper, we propose CitaLaw, the first benchmark designed to evaluate +LLMs' ability to produce legally sound responses with appropriate citations. +CitaLaw features a diverse set of legal questions for both laypersons and +practitioners, paired with a comprehensive corpus of law articles and precedent +cases as a reference pool. This framework enables LLM-based systems to retrieve +supporting citations from the reference corpus and align these citations with +the corresponding sentences in their responses. Moreover, we introduce +syllogism-inspired evaluation methods to assess the legal alignment between +retrieved references and LLM-generated responses, as well as their consistency +with user questions. Extensive experiments on 2 open-domain and 7 +legal-specific LLMs demonstrate that integrating legal references substantially +enhances response quality. Furthermore, our proposed syllogism-based evaluation +method exhibits strong agreement with human judgments. + +
+
+
+
+
+ + ☆ ClusterTalk: Corpus Exploration Framework using Multi-Dimensional + Exploratory Search + + +
+ Exploratory search of large text corpora is essential in domains like
+biomedical research, where large amounts of research literature are
+continuously generated. This paper presents ClusterTalk (The demo video and
+source code are available at: https://github.com/achouhan93/ClusterTalk), a
+framework for corpus exploration using multi-dimensional exploratory search.
+Our system integrates document clustering with faceted search, allowing users
+to interactively refine their exploration and ask corpus and document-level
+queries. Compared to traditional one-dimensional search approaches like keyword
+search or clustering, this system improves the discoverability of information
+by encouraging a deeper interaction with the corpus. We demonstrate the
+functionality of the ClusterTalk framework on four million PubMed abstracts
+covering a four-year time frame.
+
+
+ comment: 5 pages, 1 figure +
+
+
+
+
+ + ☆ Multi-Level Optimal Transport for Universal Cross-Tokenizer Knowledge + Distillation on Language Models AAAI 2025 + + +
+ Knowledge distillation (KD) has become a prevalent technique for compressing +large language models (LLMs). Existing KD methods are constrained by the need +for identical tokenizers (i.e., vocabularies) between teacher and student +models, limiting their versatility in handling LLMs of different architecture +families. In this paper, we introduce the Multi-Level Optimal Transport +(MultiLevelOT), a novel approach that advances the optimal transport for +universal cross-tokenizer knowledge distillation. Our method aligns the logit +distributions of the teacher and the student at both token and sequence levels +using diverse cost matrices, eliminating the need for dimensional or +token-by-token correspondence. At the token level, MultiLevelOT integrates both +global and local information by jointly optimizing all tokens within a sequence +to enhance robustness. At the sequence level, we efficiently capture complex +distribution structures of logits via the Sinkhorn distance, which approximates +the Wasserstein distance for divergence measures. Extensive experiments on +tasks such as extractive QA, generative QA, and summarization demonstrate that +the MultiLevelOT outperforms state-of-the-art cross-tokenizer KD methods under +various settings. Our approach is robust to different student and teacher +models across model families, architectures, and parameter sizes. + +
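+
+ At the sequence level the method relies on the Sinkhorn distance; below is a
+standard entropy-regularized optimal-transport iteration between two discrete
+distributions of different sizes. The toy cost matrix stands in for the paper's
+cost matrices over teacher and student logits.
+
+```python
+import torch
+
+def sinkhorn_distance(a, b, cost, eps=0.1, iters=200):
+    # a: (n,) and b: (m,) probability vectors; cost: (n, m) transport cost matrix
+    K = torch.exp(-cost / eps)                     # Gibbs kernel
+    u, v = torch.ones_like(a), torch.ones_like(b)
+    for _ in range(iters):
+        u = a / (K @ v).clamp_min(1e-9)
+        v = b / (K.T @ u).clamp_min(1e-9)
+    plan = torch.diag(u) @ K @ torch.diag(v)       # approximate transport plan
+    return (plan * cost).sum()
+
+# toy usage: align two distributions of different support sizes
+a = torch.softmax(torch.randn(8), dim=0)
+b = torch.softmax(torch.randn(6), dim=0)
+cost = (torch.linspace(0, 1, 8)[:, None] - torch.linspace(0, 1, 6)[None, :]).abs()
+print(sinkhorn_distance(a, b, cost).item())
+```
+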
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ Cal-DPO: Calibrated Direct Preference Optimization for Language Model + Alignment NeurIPS 2024 + + +
+ We study the problem of aligning large language models (LLMs) with human +preference data. Contrastive preference optimization has shown promising +results in aligning LLMs with available preference data by optimizing the +implicit reward associated with the policy. However, the contrastive objective +focuses mainly on the relative values of implicit rewards associated with two +responses while ignoring their actual values, resulting in suboptimal alignment +with human preferences. To address this limitation, we propose calibrated +direct preference optimization (Cal-DPO), a simple yet effective algorithm. We +show that substantial improvement in alignment with the given preferences can +be achieved simply by calibrating the implicit reward to ensure that the +learned implicit rewards are comparable in scale to the ground-truth rewards. +We demonstrate the theoretical advantages of Cal-DPO over existing approaches. +The results of our experiments on a variety of standard benchmarks show that +Cal-DPO remarkably improves off-the-shelf methods. + +
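+
+ A hedged sketch of what calibrating the implicit reward could look like on top
+of the standard DPO objective: the usual contrastive term plus a penalty that
+pulls the implicit rewards toward target reward values of comparable scale. The
+squared-error form and the weight lam are illustrative; the paper's exact
+objective may differ.
+
+```python
+import torch.nn.functional as F
+
+def cal_dpo_style_loss(logp_w, logp_l, ref_logp_w, ref_logp_l,
+                       target_r_w, target_r_l, beta=0.1, lam=1.0):
+    # logp_*: policy log-probs of chosen (w) / rejected (l) responses
+    # ref_logp_*: reference-model log-probs; target_r_*: assumed ground-truth rewards
+    r_w = beta * (logp_w - ref_logp_w)          # implicit reward, chosen response
+    r_l = beta * (logp_l - ref_logp_l)          # implicit reward, rejected response
+    dpo = -F.logsigmoid(r_w - r_l).mean()       # standard DPO contrastive term
+    calib = ((r_w - target_r_w) ** 2 + (r_l - target_r_l) ** 2).mean()
+    return dpo + lam * calib
+```
+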
+
+ comment: Accepted by NeurIPS 2024 Main +
+
+
+
+
+ + ☆ PA-RAG: RAG Alignment via Multi-Perspective Preference Optimization + + +
+ The emergence of Retrieval-augmented generation (RAG) has alleviated the +issues of outdated and hallucinatory content in the generation of large +language models (LLMs), yet it still reveals numerous limitations. When a +general-purpose LLM serves as the RAG generator, it often suffers from +inadequate response informativeness, response robustness, and citation quality. +Past approaches to tackle these limitations, either by incorporating additional +steps beyond generating responses or optimizing the generator through +supervised fine-tuning (SFT), still failed to align with the RAG requirement +thoroughly. Consequently, optimizing the RAG generator from multiple preference +perspectives while maintaining its end-to-end LLM form remains a challenge. To +bridge this gap, we propose Multiple Perspective Preference Alignment for +Retrieval-Augmented Generation (PA-RAG), a method for optimizing the generator +of RAG systems to align with RAG requirements comprehensively. Specifically, we +construct high-quality instruction fine-tuning data and multi-perspective +preference data by sampling varied quality responses from the generator across +different prompt documents quality scenarios. Subsequently, we optimize the +generator using SFT and Direct Preference Optimization (DPO). Extensive +experiments conducted on four question-answer datasets across three LLMs +demonstrate that PA-RAG can significantly enhance the performance of RAG +generators. Our code and datasets are available at +https://github.com/wujwyi/PA-RAG. + +
+
+
+
+
+ + ☆ Do Large Language Models Defend Inferentialist Semantics?: On the + Logical Expressivism and Anti-Representationalism of LLMs + + +
+ The philosophy of language, which has historically been developed through an
+anthropocentric lens, is now being forced to move towards post-anthropocentrism
+due to the advent of large language models (LLMs) like ChatGPT (OpenAI) and
+Claude (Anthropic), which are considered to possess linguistic abilities
+comparable to those of humans. Traditionally, LLMs have been explained through
+distributional semantics as their foundational semantics. However, recent
+research is exploring alternative foundational semantics beyond distributional
+semantics. This paper proposes Robert Brandom's inferentialist semantics as a
+suitable foundational semantics for LLMs, specifically focusing on the issue of
+linguistic representationalism within this post-anthropocentric trend. Here, we
+show that the anti-representationalism and logical expressivism of inferential
+semantics, as well as quasi-compositionality, are useful in interpreting the
+characteristics and behaviors of LLMs. Further, we propose a consensus theory
+of truths for LLMs. This paper argues that the characteristics of LLMs
+challenge mainstream assumptions in philosophy of language, such as semantic
+externalism and compositionality. We believe the argument in this paper leads
+to a re-evaluation of anti-representationalist views of language, potentially
+leading to new developments in the philosophy of language.
+
+
+
+
+
+ + ☆ GraphEQA: Using 3D Semantic Scene Graphs for Real-time Embodied Question + Answering + + +
+ In Embodied Question Answering (EQA), agents must explore and develop a +semantic understanding of an unseen environment in order to answer a situated +question with confidence. This remains a challenging problem in robotics, due +to the difficulties in obtaining useful semantic representations, updating +these representations online, and leveraging prior world knowledge for +efficient exploration and planning. Aiming to address these limitations, we +propose GraphEQA, a novel approach that utilizes real-time 3D metric-semantic +scene graphs (3DSGs) and task relevant images as multi-modal memory for +grounding Vision-Language Models (VLMs) to perform EQA tasks in unseen +environments. We employ a hierarchical planning approach that exploits the +hierarchical nature of 3DSGs for structured planning and semantic-guided +exploration. Through experiments in simulation on the HM-EQA dataset and in the +real world in home and office environments, we demonstrate that our method +outperforms key baselines by completing EQA tasks with higher success rates and +fewer planning steps. + +
+
+ comment: Project website: https://saumyasaxena.github.io/grapheqa +
+
+
+
+
+ + ☆ MegaPairs: Massive Data Synthesis For Universal Multimodal Retrieval + + +
+ Despite the rapidly growing demand for multimodal retrieval, progress in this +field remains severely constrained by a lack of training data. In this paper, +we introduce MegaPairs, a novel data synthesis method that leverages vision +language models (VLMs) and open-domain images, together with a massive +synthetic dataset generated from this method. Our empirical analysis shows that +MegaPairs generates high-quality data, enabling the multimodal retriever to +significantly outperform the baseline model trained on 70$\times$ more data +from existing datasets. Moreover, since MegaPairs solely relies on general +image corpora and open-source VLMs, it can be easily scaled up, enabling +continuous improvements in retrieval performance. In this stage, we produced +more than 26 million training instances and trained several models of varying +sizes using this data. These new models achieve state-of-the-art zero-shot +performance across 4 popular composed image retrieval (CIR) benchmarks and the +highest overall performance on the 36 datasets provided by MMEB. They also +demonstrate notable performance improvements with additional downstream +fine-tuning. Our produced dataset, well-trained models, and data synthesis +pipeline will be made publicly available to facilitate the future development +of this field. + +
+
+
+
+
+ + ☆ Why We Build Local Large Language Models: An Observational Analysis from + 35 Japanese and Multilingual LLMs + + +
+ Why do we build local large language models (LLMs)? What should a local LLM +learn from the target language? Which abilities can be transferred from other +languages? Do language-specific scaling laws exist? To explore these research +questions, we evaluated 35 Japanese, English, and multilingual LLMs on 19 +evaluation benchmarks for Japanese and English, taking Japanese as a local +language. Adopting an observational approach, we analyzed correlations of +benchmark scores, and conducted principal component analysis (PCA) on the +scores to derive \textit{ability factors} of local LLMs. We found that training +on English text can improve the scores of academic subjects in Japanese +(JMMLU). In addition, it is unnecessary to specifically train on Japanese text +to enhance abilities for solving Japanese code generation, arithmetic +reasoning, commonsense, and reading comprehension tasks. In contrast, training +on Japanese text could improve question-answering tasks about Japanese +knowledge and English-Japanese translation, which indicates that abilities for +solving these two tasks can be regarded as \textit{Japanese abilities} for +LLMs. Furthermore, we confirmed that the Japanese abilities scale with the +computational budget for Japanese text. + +
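+
+ The observational analysis amounts to correlating and decomposing a
+models-by-benchmarks score matrix; a minimal PCA sketch is below. The random
+matrix is a placeholder for the study's actual 35 x 19 score table, and the
+principal components play the role of the derived ability factors.
+
+```python
+import numpy as np
+from sklearn.decomposition import PCA
+
+rng = np.random.default_rng(0)
+scores = rng.uniform(0, 100, size=(35, 19))        # placeholder: 35 LLMs x 19 benchmarks
+
+standardized = (scores - scores.mean(axis=0)) / scores.std(axis=0)
+pca = PCA(n_components=3).fit(standardized)
+print("explained variance ratio:", pca.explained_variance_ratio_)
+print("loadings of the first factor:", pca.components_[0])
+```
+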
+
+ comment: Preprint. Under review +
+
+
+
+
+ + ☆ Agent-SafetyBench: Evaluating the Safety of LLM Agents + + +
+ As large language models (LLMs) are increasingly deployed as agents, their
+integration into interactive environments and tool use introduce new safety
+challenges beyond those associated with the models themselves. However, the
+absence of comprehensive benchmarks for evaluating agent safety presents a
+significant barrier to effective assessment and further improvement. In this
+paper, we introduce Agent-SafetyBench, a comprehensive benchmark designed to
+evaluate the safety of LLM agents. Agent-SafetyBench encompasses 349
+interaction environments and 2,000 test cases, evaluating 8 categories of
+safety risks and covering 10 common failure modes frequently encountered in
+unsafe interactions. Our evaluation of 16 popular LLM agents reveals a
+concerning result: none of the agents achieves a safety score above 60%. This
+highlights significant safety challenges in LLM agents and underscores the
+considerable need for improvement. Through quantitative analysis, we identify
+critical failure modes and summarize two fundamental safety defects in current
+LLM agents: lack of robustness and lack of risk awareness. Furthermore, our
+findings suggest that reliance on defense prompts alone is insufficient to
+address these safety issues, emphasizing the need for more advanced and robust
+strategies. We release Agent-SafetyBench at
+https://github.com/thu-coai/Agent-SafetyBench to facilitate further
+research and innovation in agent safety evaluation and improvement.
+
+
+ comment: 23 pages, 9 figures +
+
+
+
+
+ + ☆ From Human Annotation to LLMs: SILICON Annotation Workflow for + Management Research + + +
+ Unstructured text data annotation and analysis are fundamental to management
+research, often relying on human annotators through crowdsourcing platforms.
+While Large Language Models (LLMs) promise to provide a cost-effective and
+efficient alternative to human annotation, there is no systematic workflow
+that evaluates when LLMs are suitable or how to proceed with LLM-based text
+annotation in a reproducible manner. This paper addresses this methodological
+gap by introducing the "SILICON" (Systematic Inference with LLMs for
+Information Classification and Notation) workflow. The workflow integrates
+established principles of human annotation with systematic prompt optimization
+and model selection, addressing challenges such as developing robust annotation
+guidelines, establishing high-quality human baselines, optimizing prompts, and
+ensuring reproducibility across LLMs. We validate the SILICON workflow through
+seven case studies covering common management research tasks, including
+business proposal evaluation, dialog intent and breakdown analysis, and review
+attribute detection. Our findings highlight the importance of validating
+annotation guideline agreement, the superiority of expert-developed human
+baselines over crowdsourced ones, the iterative nature of prompt optimization,
+and the necessity of testing multiple LLMs. Notably, we propose a
+regression-based methodology to empirically compare LLM outputs across prompts
+and models. Our workflow advances management research by establishing
+reproducible processes for LLM-based annotation that maintain scientific rigor.
+We provide practical guidance for researchers to navigate the evolving
+landscape of generative AI tools effectively while maintaining transparency and
+reproducibility.
+
+
+
+
+
+ + ☆ Are Longer Prompts Always Better? Prompt Selection in Large Language + Models for Recommendation Systems + + +
+ In large language models (LLM)-based recommendation systems (LLM-RSs), +accurately predicting user preferences by leveraging the general knowledge of +LLMs is possible without requiring extensive training data. By converting +recommendation tasks into natural language inputs called prompts, LLM-RSs can +efficiently solve issues that have been difficult to address due to data +scarcity but are crucial in applications such as cold-start and cross-domain +problems. However, when applying this in practice, selecting the prompt that +matches tasks and data is essential. Although numerous prompts have been +proposed in LLM-RSs and representing the target user in prompts significantly +impacts recommendation accuracy, there are still no clear guidelines for +selecting specific prompts. + In this paper, we categorize and analyze prompts from previous research to +establish practical prompt selection guidelines. Through 450 experiments with +90 prompts and five real-world datasets, we examined the relationship between +prompts and dataset characteristics in recommendation accuracy. We found that +no single prompt consistently outperforms others; thus, selecting prompts on +the basis of dataset characteristics is crucial. Here, we propose a prompt +selection method that achieves higher accuracy with minimal validation data. +Because increasing the number of prompts to explore raises costs, we also +introduce a cost-efficient strategy using high-performance and cost-efficient +LLMs, significantly reducing exploration costs while maintaining high +prediction accuracy. Our work offers valuable insights into the prompt +selection, advancing accurate and efficient LLM-RSs. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ ORBIT: Cost-Effective Dataset Curation for Large Language Model Domain + Adaptation with an Astronomy Case Study + + +
+ Recent advances in language modeling demonstrate the need for high-quality
+domain-specific training data, especially for tasks that require specialized
+knowledge. General-purpose models, while versatile, often lack the depth needed
+for expert-level tasks because of limited domain-specific information. Domain
+adaptation training can enhance these models, but it demands substantial,
+high-quality data. To address this, we propose ORBIT, a cost-efficient
+methodology for curating massive, high-quality domain-specific datasets from
+noisy web sources, tailored for training specialist large language models.
+Using astronomy as a primary case study, we refined the 1.3T-token FineWeb-Edu
+dataset into a high-quality, 10B-token subset focused on astronomy. Fine-tuning
+LLaMA-3-8B on a 1B-token astronomy subset improved performance on the
+MMLU astronomy benchmark from 69% to 76% and achieved top results on
+AstroBench, an astronomy-specific benchmark. Moreover, our model (Orbit-LLaMA)
+outperformed LLaMA-3-8B-base, with GPT-4o evaluations preferring it in
+73% of cases across 1000 astronomy-specific questions. Additionally, we
+validated ORBIT's generalizability by applying it to law and medicine,
+achieving a significant improvement in data quality over an unfiltered
+baseline. We open-source the ORBIT methodology, including the curated datasets,
+the codebase, and the resulting model at
+https://github.com/ModeEric/ORBIT-Llama.
+
+
+
+
+
+ + ☆ All-in-One Tuning and Structural Pruning for Domain-Specific LLMs + + +
+ Existing pruning techniques for large language models (LLMs) targeting +domain-specific applications typically follow a two-stage process: pruning the +pretrained general-purpose LLMs and then fine-tuning the pruned LLMs on +specific domains. However, the pruning decisions, derived from the pretrained +weights, remain unchanged during fine-tuning, even if the weights have been +updated. Therefore, such a combination of the pruning decisions and the +finetuned weights may be suboptimal, leading to non-negligible performance +degradation. To address these limitations, we propose ATP: All-in-One Tuning +and Structural Pruning, a unified one-stage structural pruning and fine-tuning +approach that dynamically identifies the current optimal substructure +throughout the fine-tuning phase via a trainable pruning decision generator. +Moreover, given the limited available data for domain-specific applications, +Low-Rank Adaptation (LoRA) becomes a common technique to fine-tune the LLMs. In +ATP, we introduce LoRA-aware forward and sparsity regularization to ensure that +the substructures corresponding to the learned pruning decisions can be +directly removed after the ATP process. ATP outperforms the state-of-the-art +two-stage pruning methods on tasks in the legal and healthcare domains. More +specifically, ATP recovers up to 88% and 91% performance of the dense model +when pruning 40% parameters of LLaMA2-7B and LLaMA3-8B models, respectively. + +
+
+
+
+
+ + ♻ ☆ CodeLutra: Boosting LLM Code Generation via Preference-Guided Refinement + + +
+ Large Language Models (LLMs) have revolutionized code generation but require +significant resources and often over-generalize, limiting their task-specific +efficiency. Fine-tuning smaller, open-source LLMs provides a cost-effective +alternative. However, standard supervised approaches rely only on correct +examples, missing valuable insights from failures. We introduce CodeLutra, a +framework that leverages both correct and incorrect code attempts. Instead of +using only correct solutions, CodeLutra applies iterative preference-based +refinement, comparing successful and failed outputs to better approximate +desired results. This approach narrows the performance gap with +state-of-the-art larger models without requiring massive datasets or auxiliary +models. For instance, on a challenging data science coding task, using only 500 +samples improved Llama-3-8B's accuracy from 28.2% to 48.6%, approaching GPT-4's +level. By learning from both successes and mistakes, CodeLutra provides a +scalable and efficient path to high-quality code generation, making smaller +open-source models more competitive with leading closed-source alternatives. + +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ URIEL+: Enhancing Linguistic Inclusion and Usability in a Typological + and Multilingual Knowledge Base COLING 2025 + + +
+ URIEL is a knowledge base offering geographical, phylogenetic, and +typological vector representations for 7970 languages. It includes distance +measures between these vectors for 4005 languages, which are accessible via the +lang2vec tool. Despite being frequently cited, URIEL is limited in terms of +linguistic inclusion and overall usability. To tackle these challenges, we +introduce URIEL+, an enhanced version of URIEL and lang2vec that addresses +these limitations. In addition to expanding typological feature coverage for +2898 languages, URIEL+ improves the user experience with robust, customizable +distance calculations to better suit the needs of users. These upgrades also +offer competitive performance on downstream tasks and provide distances that +better align with linguistic distance studies. + +
+
+ comment: Accepted to COLING 2025 +
+
+
+
+
+ + ♻ ☆ Sometimes I am a Tree: Data Drives Unstable Hierarchical Generalization + + +
+ Language models (LMs), like other neural networks, often favor shortcut +heuristics based on surface-level patterns. Although LMs behave like n-gram +models early in training, they must eventually learn hierarchical syntactic +representations to correctly apply grammatical rules out-of-distribution (OOD). +In this work, we use case studies of English grammar to explore how complex, +diverse training data drives models to generalize OOD. We construct a framework +that unifies our understanding of random variation with training dynamics, rule +selection with memorization, and data diversity with complexity. We show that +these factors are nuanced, and that intermediate levels of diversity and +complexity lead to inconsistent behavior across random seeds and to unstable +training dynamics. Our findings emphasize the critical role of training data in +shaping generalization patterns and illuminate how competing model strategies +lead to inconsistent generalization outcomes across random seeds. Code is +available at https://github.com/sunnytqin/concept_comp.git. + +
+
+
+
+
+ + ♻ ☆ Typhoon 2: A Family of Open Text and Multimodal Thai Large Language + Models + + +
+ This paper introduces Typhoon 2, a series of text and multimodal large +language models optimized for the Thai language. The series includes models for +text, vision, and audio. Typhoon2-Text builds on state-of-the-art open models, +such as Llama 3 and Qwen2, and we perform continual pre-training on a mixture +of English and Thai data. We employ post-training techniques to enhance Thai +language performance while preserving the base models' original capabilities. +We release text models across a range of sizes, from 1 to 70 billion +parameters, available in both base and instruction-tuned variants. To guardrail +text generation, we release Typhoon2-Safety, a classifier enhanced for Thai +cultures and language. Typhoon2-Vision improves Thai document understanding +while retaining general visual capabilities, such as image captioning. +Typhoon2-Audio introduces an end-to-end speech-to-speech model architecture +capable of processing audio, speech, and text inputs and generating both text +and speech outputs. + +
+
+ comment: technical report, 55 pages +
+
+
+
+
+ + ♻ ☆ LLMs as Zero-shot Graph Learners: Alignment of GNN Representations with + LLM Token Embeddings + + +
+ Zero-shot graph machine learning, especially with graph neural networks +(GNNs), has garnered significant interest due to the challenge of scarce +labeled data. While methods like self-supervised learning and graph prompt +learning have been extensively explored, they often rely on fine-tuning with +task-specific labels, limiting their effectiveness in zero-shot scenarios. +Inspired by the zero-shot capabilities of instruction-fine-tuned large language +models (LLMs), we introduce a novel framework named Token Embedding-Aligned +Graph Language Model (TEA-GLM) that leverages LLMs as cross-dataset and +cross-task zero-shot learners for graph machine learning. Concretely, we +pretrain a GNN, aligning its representations with token embeddings of an LLM. +We then train a linear projector that transforms the GNN's representations into +a fixed number of graph token embeddings without tuning the LLM. A unified +instruction is designed for various graph tasks at different levels, such as +node classification (node-level) and link prediction (edge-level). These design +choices collectively enhance our method's effectiveness in zero-shot learning, +setting it apart from existing methods. Experiments show that our graph token +embeddings help the LLM predictor achieve state-of-the-art performance on +unseen datasets and tasks compared to other methods using LLMs as predictors. + +
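+
+ The core wiring described above, sketched minimally: a linear projector maps a
+GNN representation into a fixed number of graph token embeddings in the LLM's
+embedding space, which can then be prepended to the LLM input while the LLM
+stays frozen. Dimensions, the number of graph tokens, and the single linear
+layer are illustrative assumptions.
+
+```python
+import torch
+import torch.nn as nn
+
+class GraphTokenProjector(nn.Module):
+    def __init__(self, gnn_dim=256, llm_dim=4096, num_graph_tokens=8):
+        super().__init__()
+        self.proj = nn.Linear(gnn_dim, num_graph_tokens * llm_dim)
+        self.num_graph_tokens = num_graph_tokens
+        self.llm_dim = llm_dim
+
+    def forward(self, gnn_repr):                    # gnn_repr: (batch, gnn_dim)
+        out = self.proj(gnn_repr)
+        return out.view(-1, self.num_graph_tokens, self.llm_dim)
+
+# usage: prepend the projected graph tokens to the LLM's input embeddings
+graph_tokens = GraphTokenProjector()(torch.randn(2, 256))
+print(graph_tokens.shape)   # torch.Size([2, 8, 4096])
+```
+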
+
+
+
+
+ + ♻ ☆ Identifying Query-Relevant Neurons in Large Language Models for + Long-Form Texts AAAI 2025 + + +
+ Large Language Models (LLMs) possess vast amounts of knowledge within their +parameters, prompting research into methods for locating and editing this +knowledge. Previous work has largely focused on locating entity-related (often +single-token) facts in smaller models. However, several key questions remain +unanswered: (1) How can we effectively locate query-relevant neurons in +decoder-only LLMs, such as Llama and Mistral? (2) How can we address the +challenge of long-form (or free-form) text generation? (3) Are there localized +knowledge regions in LLMs? In this study, we introduce Query-Relevant Neuron +Cluster Attribution (QRNCA), a novel architecture-agnostic framework capable of +identifying query-relevant neurons in LLMs. QRNCA allows for the examination of +long-form answers beyond triplet facts by employing the proxy task of +multi-choice question answering. To evaluate the effectiveness of our detected +neurons, we build two multi-choice QA datasets spanning diverse domains and +languages. Empirical evaluations demonstrate that our method outperforms +baseline methods significantly. Further, analysis of neuron distributions +reveals the presence of visible localized regions, particularly within +different domains. Finally, we show potential applications of our detected +neurons in knowledge editing and neuron-based prediction. + +
+
+ comment: AAAI 2025 Main Track +
+
+
+
+
+ + ♻ ☆ SPICA: Retrieving Scenarios for Pluralistic In-Context Alignment + + +
+ When different groups' values differ, one approach to model alignment is to
+steer models at inference time towards each group's preferences. However,
+techniques like in-context learning only consider similarity when drawing
+few-shot examples and not cross-group differences in values. We propose SPICA,
+a framework that accounts for group-level differences during in-context example
+retrieval. SPICA introduces three designs: scenario banks, group-informed
+retrieval metrics, and in-context alignment prompts. From an evaluation of
+SPICA on an alignment task collecting inputs from four demographic groups
+(n = 544), our metrics retrieve in-context examples that more closely match
+observed preferences, with the best prompt configuration using multiple
+contrastive responses to demonstrate examples. In an end-to-end evaluation
+(n = 120), we observe that SPICA is higher rated than similarity-based
+retrieval, with groups seeing up to a 0.16 point improvement on a 5 point
+scale. Additionally, gains from SPICA were more uniform, with all groups
+benefiting from alignment rather than only some. Finally, we find that while a
+group-agnostic approach can align to aggregated values, it is not most suited
+for divergent groups.
+
+
+
+
+
+ + ♻ ☆ Knowledge Tagging with Large Language Model based Multi-Agent System AAAI 2025 + + +
+ Knowledge tagging for questions is vital in modern intelligent educational +applications, including learning progress diagnosis, practice question +recommendations, and course content organization. Traditionally, these +annotations have been performed by pedagogical experts, as the task demands not +only a deep semantic understanding of question stems and knowledge definitions +but also a strong ability to link problem-solving logic with relevant knowledge +concepts. With the advent of advanced natural language processing (NLP) +algorithms, such as pre-trained language models and large language models +(LLMs), pioneering studies have explored automating the knowledge tagging +process using various machine learning models. In this paper, we investigate +the use of a multi-agent system to address the limitations of previous +algorithms, particularly in handling complex cases involving intricate +knowledge definitions and strict numerical constraints. By demonstrating its +superior performance on the publicly available math question knowledge tagging +dataset, MathKnowCT, we highlight the significant potential of an LLM-based +multi-agent system in overcoming the challenges that previous methods have +encountered. Finally, through an in-depth discussion of the implications of +automating knowledge tagging, we underscore the promising results of deploying +LLM-based algorithms in educational contexts. + +
+
+ comment: Accepted by AAAI 2025 (AAAI/IAAI 2025 Innovative Application Award) +
+
+
+
+
+ + ♻ ☆ Beyond Dataset Creation: Critical View of Annotation Variation and Bias + Probing of a Dataset for Online Radical Content Detection COLING 2025 + + +
+ The proliferation of radical content on online platforms poses significant +risks, including inciting violence and spreading extremist ideologies. Despite +ongoing research, existing datasets and models often fail to address the +complexities of multilingual and diverse data. To bridge this gap, we introduce +a publicly available multilingual dataset annotated with radicalization levels, +calls for action, and named entities in English, French, and Arabic. This +dataset is pseudonymized to protect individual privacy while preserving +contextual information. Beyond presenting our freely available dataset, we +analyze the annotation process, highlighting biases and disagreements among +annotators and their implications for model performance. Additionally, we use +synthetic data to investigate the influence of socio-demographic traits on +annotation patterns and model predictions. Our work offers a comprehensive +examination of the challenges and opportunities in building robust datasets for +radical content detection, emphasizing the importance of fairness and +transparency in model development. + +
+
+ comment: Accepted to COLING 2025 +
+
+
+
+
+ + ♻ ☆ LLM-SEM: A Sentiment-Based Student Engagement Metric Using LLMS for + E-Learning Platforms + + +
+ Current methods for analyzing student engagement in e-learning platforms, including automated systems, often struggle with challenges such as handling fuzzy sentiment in text comments and relying on limited metadata. Traditional approaches, such as surveys and questionnaires, also face issues like small sample sizes and scalability. In this paper, we introduce LLM-SEM (Language Model-Based Student Engagement Metric), a novel approach that leverages video metadata and sentiment analysis of student comments to measure engagement. By utilizing recent Large Language Models (LLMs), we generate high-quality sentiment predictions to mitigate text fuzziness and normalize key features such as views and likes. Our holistic method combines comprehensive metadata with sentiment polarity scores to gauge engagement at both the course and lesson levels. Extensive experiments were conducted to evaluate various LLMs, demonstrating the effectiveness of LLM-SEM in providing a scalable and accurate measure of student engagement. We fine-tuned TXLM-RoBERTa on human-annotated sentiment datasets to enhance prediction accuracy, and also used Llama 3B and Gemma 9B via Ollama.
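As a toy illustration of combining normalized metadata with LLM-produced sentiment polarity into one engagement score, consider the snippet below; the weights, normalisations, and function name are illustrative assumptions rather than the paper's exact formula.

```python
import math

def llm_sem_style_score(views, likes, sentiment_scores, w_meta=0.4, w_sent=0.6):
    """Toy engagement score: log-normalised metadata blended with mean comment sentiment.

    sentiment_scores are assumed to be polarity values in [-1, 1] produced by an LLM
    sentiment classifier; the weights and normalisation here are assumptions.
    """
    meta = 0.5 * math.log1p(views) + 0.5 * math.log1p(likes)
    meta_norm = meta / (1.0 + meta)                  # squash into [0, 1)
    sent = sum(sentiment_scores) / max(len(sentiment_scores), 1)
    sent_norm = (sent + 1.0) / 2.0                   # map [-1, 1] -> [0, 1]
    return w_meta * meta_norm + w_sent * sent_norm

print(round(llm_sem_style_score(120_000, 3_500, [0.8, 0.4, -0.2, 0.9]), 3))
```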
+
+
+
+
+ + ♻ ☆ G-VEval: A Versatile Metric for Evaluating Image and Video Captions + Using GPT-4o + + +
+ Evaluation metrics for visual captioning are important yet not thoroughly explored. Traditional metrics like BLEU, METEOR, CIDEr, and ROUGE often miss semantic depth, while trained metrics such as CLIP-Score, PAC-S, and Polos are limited in zero-shot scenarios. Advanced Language Model-based metrics also struggle with aligning to nuanced human preferences. To address these issues, we introduce G-VEval, a novel metric inspired by G-Eval and powered by the new GPT-4o. G-VEval uses chain-of-thought reasoning in large multimodal models and supports three modes: reference-free, reference-only, and combined, accommodating both video and image inputs. We also propose MSVD-Eval, a new dataset for video captioning evaluation, to establish a more transparent and consistent framework for both human experts and evaluation metrics. It is designed to address the lack of clear criteria in existing datasets by introducing distinct dimensions of Accuracy, Completeness, Conciseness, and Relevance (ACCR). Extensive results show that G-VEval outperforms existing methods in correlation with human annotations, as measured by Kendall tau-b and Kendall tau-c. This provides a flexible solution for diverse captioning tasks and suggests a straightforward yet effective approach for large language models to understand video content, paving the way for advancements in automated captioning. Code is available at https://github.com/ztangaj/gveval
+
+
+
+
+ + ♻ ☆ To Word Senses and Beyond: Inducing Concepts with Contextualized + Language Models EMNLP 2024 + + +
+ Polysemy and synonymy are two crucial interrelated facets of lexical ambiguity. While both phenomena are widely documented in lexical resources and have been studied extensively in NLP, leading to dedicated systems, they are often considered independently in practical problems. While many tasks dealing with polysemy (e.g. Word Sense Disambiguation or Induction) highlight the role of a word's senses, the study of synonymy is rooted in the study of concepts, i.e. meanings shared across the lexicon. In this paper, we introduce Concept Induction, the unsupervised task of learning a soft clustering among words that defines a set of concepts directly from data. This task generalizes Word Sense Induction. We propose a bi-level approach to Concept Induction that leverages both a local lemma-centric view and a global cross-lexicon view to induce concepts. We evaluate the obtained clustering on SemCor's annotated data and obtain good performance (BCubed F1 above 0.60). We find that the local and the global levels are mutually beneficial for inducing concepts, and also senses, in our setting. Finally, we create static embeddings representing our induced concepts and use them on the Word-in-Context task, obtaining performance competitive with the state of the art.
+
+ comment: Published in EMNLP 2024 main conference proceedings +
+
+
+
+
+ + ♻ ☆ Benchmarking Large Language Models for Math Reasoning Tasks + + +
+ The use of Large Language Models (LLMs) in mathematical reasoning has become +a cornerstone of related research, demonstrating the intelligence of these +models and enabling potential practical applications through their advanced +performance, such as in educational settings. Despite the variety of datasets +and in-context learning algorithms designed to improve the ability of LLMs to +automate mathematical problem solving, the lack of comprehensive benchmarking +across different datasets makes it complicated to select an appropriate model +for specific tasks. In this project, we present a benchmark that fairly +compares seven state-of-the-art in-context learning algorithms for mathematical +problem solving across five widely used mathematical datasets on four powerful +foundation models. Furthermore, we explore the trade-off between efficiency and +performance, highlighting the practical applications of LLMs for mathematical +reasoning. Our results indicate that larger foundation models like GPT-4o and +LLaMA 3-70B can solve mathematical reasoning independently from the concrete +prompting strategy, while for smaller models the in-context learning approach +significantly influences the performance. Moreover, the optimal prompt depends +on the chosen foundation model. We open-source our benchmark code to support +the integration of additional models in future research. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ ProsodyFM: Unsupervised Phrasing and Intonation Control for Intelligible + Speech Synthesis AAAI 2025 + + +
+ Prosody contains rich information beyond the literal meaning of words, which +is crucial for the intelligibility of speech. Current models still fall short +in phrasing and intonation; they not only miss or misplace breaks when +synthesizing long sentences with complex structures but also produce unnatural +intonation. We propose ProsodyFM, a prosody-aware text-to-speech synthesis +(TTS) model with a flow-matching (FM) backbone that aims to enhance the +phrasing and intonation aspects of prosody. ProsodyFM introduces two key +components: a Phrase Break Encoder to capture initial phrase break locations, +followed by a Duration Predictor for the flexible adjustment of break +durations; and a Terminal Intonation Encoder which learns a bank of intonation +shape tokens combined with a novel Pitch Processor for more robust modeling of +human-perceived intonation change. ProsodyFM is trained with no explicit +prosodic labels and yet can uncover a broad spectrum of break durations and +intonation patterns. Experimental results demonstrate that ProsodyFM can +effectively improve the phrasing and intonation aspects of prosody, thereby +enhancing the overall intelligibility compared to four state-of-the-art (SOTA) +models. Out-of-distribution experiments show that this prosody improvement can +further bring ProsodyFM superior generalizability for unseen complex sentences +and speakers. Our case study intuitively illustrates the powerful and +fine-grained controllability of ProsodyFM over phrasing and intonation. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Human and LLM Biases in Hate Speech Annotations: A Socio-Demographic + Analysis of Annotators and Targets + + +
+ The rise of online platforms has exacerbated the spread of hate speech, demanding scalable and effective detection. However, the accuracy of hate speech detection systems heavily relies on human-labeled data, which is inherently susceptible to biases. While previous work has examined the issue, the interplay between the characteristics of the annotator and those of the target of the hate is still unexplored. We fill this gap by leveraging an extensive dataset with rich socio-demographic information about both annotators and targets, uncovering how human biases manifest in relation to the target's attributes. Our analysis surfaces the presence of widespread biases, which we quantitatively describe and characterize based on their intensity and prevalence, revealing marked differences. Furthermore, we compare human biases with those exhibited by persona-based LLMs. Our findings indicate that while persona-based LLMs do exhibit biases, these differ significantly from those of human annotators. Overall, our work offers new and nuanced results on human biases in hate speech annotations, as well as fresh insights into the design of AI-driven hate speech detection systems.
+
+
+
+
+ + ♻ ☆ From Bench to Bedside: A Review of Clinical Trials in Drug Discovery and + Development + + +
+ Clinical trials are an indispensable part of the drug development process, +bridging the gap between basic research and clinical application. During the +development of new drugs, clinical trials are used not only to evaluate the +safety and efficacy of the drug but also to explore its dosage, treatment +regimens, and potential side effects. This review discusses the various stages +of clinical trials, including Phase I (safety assessment), Phase II +(preliminary efficacy evaluation), Phase III (large-scale validation), and +Phase IV (post-marketing surveillance), highlighting the characteristics of +each phase and their interrelationships. Additionally, the paper addresses the +major challenges encountered in clinical trials, such as ethical issues, +subject recruitment difficulties, diversity and representativeness concerns, +and proposes strategies for overcoming these challenges. With the advancement +of technology, innovative technologies such as artificial intelligence, big +data, and digitalization are gradually transforming clinical trial design and +implementation, improving trial efficiency and data quality. The article also +looks forward to the future of clinical trials, particularly the impact of +emerging therapies such as gene therapy and immunotherapy on trial design, as +well as the importance of regulatory reforms and global collaboration. In +conclusion, the core role of clinical trials in drug development will continue +to drive the progress of innovative drug development and clinical treatment. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ ANAH-v2: Scaling Analytical Hallucination Annotation of Large Language + Models NeurIPS 2024 + + +
+ Large language models (LLMs) exhibit hallucinations in long-form question-answering tasks across various domains and wide applications. Current hallucination detection and mitigation datasets are limited in domain and size, and they struggle to scale due to prohibitive labor costs and the insufficient reliability of existing hallucination annotators. To facilitate the scalable oversight of LLM hallucinations, this paper introduces an iterative self-training framework that simultaneously and progressively scales up the hallucination annotation dataset and improves the accuracy of the hallucination annotator. Based on the Expectation Maximization (EM) algorithm, in each iteration the framework first applies a hallucination annotation pipeline to annotate a scaled dataset and then trains a more accurate hallucination annotator on that dataset. The new hallucination annotator is then adopted in the annotation pipeline used for the next iteration. Extensive experimental results demonstrate that the final hallucination annotator, with only 7B parameters, surpasses the performance of GPT-4 and obtains new state-of-the-art hallucination detection results on HaluEval and HalluQA by zero-shot inference. Such an annotator can not only evaluate the hallucination levels of various LLMs on the large-scale dataset but also help to mitigate hallucination in LLM generations, with the Natural Language Inference (NLI) metric increasing from 25% to 37% on HaluEval.
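The iterative scaling loop described above can be summarised schematically as follows; this is a generic self-training sketch with stand-in `train` and `annotate` functions, not the released ANAH-v2 pipeline.

```python
def iterative_annotator_scaling(seed_data, unlabeled_pool, train, annotate, n_rounds=3):
    """Schematic self-training loop: each round pseudo-labels more data, then retrains.

    `train` fits a hallucination annotator on labelled data; `annotate` applies the
    current annotator to a raw passage. Both are assumptions standing in for real models.
    """
    dataset = list(seed_data)
    annotator = train(dataset)
    for _ in range(n_rounds):
        batch, unlabeled_pool = unlabeled_pool[:100], unlabeled_pool[100:]
        dataset += [(text, annotate(annotator, text)) for text in batch]  # E-step: pseudo-label
        annotator = train(dataset)                                        # M-step: refit annotator
    return annotator, dataset

# Minimal runnable stand-ins so the loop executes end to end.
train = lambda data: {"examples_seen": len(data)}
annotate = lambda model, text: "hallucinated" if "moon is cheese" in text else "faithful"
annotator, dataset = iterative_annotator_scaling(
    [("the sky is blue", "faithful")],
    [f"claim {i}" for i in range(299)] + ["the moon is cheese"],
    train, annotate)
print(annotator, len(dataset))
```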
+
+ comment: Accepted by NeurIPS 2024. Dataset, code, and model are released at + https://github.com/open-compass/ANAH +
+
+
+
+
+ + ♻ ☆ BayLing 2: A Multilingual Large Language Model with Efficient Language + Alignment + + +
+ Large language models (LLMs), with their powerful generative capabilities and vast knowledge, empower various tasks in everyday life. However, these abilities are primarily concentrated in high-resource languages, leaving low-resource languages with weaker generative capabilities and relatively limited knowledge. Enhancing the multilingual capabilities of LLMs is therefore crucial for serving over 100 linguistic communities worldwide. An intuitive approach to enhance the multilingual capabilities would be to construct instruction data for various languages, but constructing instruction data for over 100 languages is prohibitively costly. In this paper, we introduce BayLing 2, which efficiently transfers generative capabilities and knowledge from high-resource languages to low-resource languages through language alignment. To achieve this, we constructed a dataset of 3.2 million instructions, comprising high-resource language instructions (Chinese and English) and cross-lingual instructions for 100+ languages, and performed instruction tuning on this dataset to facilitate capability transfer between languages. Using Llama as the foundation model, we developed BayLing-2-7B, BayLing-2-13B, and BayLing-2-8B, and conducted a comprehensive evaluation of BayLing. For multilingual translation across 100+ languages, BayLing shows superior performance compared to open-source models of similar scale. For multilingual knowledge and understanding benchmarks, BayLing achieves significant improvements across over 20 low-resource languages, demonstrating its capability of effective knowledge transfer from high-resource to low-resource languages. Furthermore, results on English benchmarks indicate that BayLing maintains high performance in high-resource languages while enhancing the performance in low-resource languages. Demo, homepage, code and models of BayLing are available.
+
+ comment: BayLing 2's online demo: http://nlp.ict.ac.cn/bayling/demo. BayLing + 2's code and models: https://github.com/ictnlp/BayLing +
+
+
+
+
+ + ♻ ☆ Agent-OM: Leveraging LLM Agents for Ontology Matching + + +
+ Ontology matching (OM) enables semantic interoperability between different +ontologies and resolves their conceptual heterogeneity by aligning related +entities. OM systems currently have two prevailing design paradigms: +conventional knowledge-based expert systems and newer machine learning-based +predictive systems. While large language models (LLMs) and LLM agents have +revolutionised data engineering and have been applied creatively in many +domains, their potential for OM remains underexplored. This study introduces a +novel agent-powered LLM-based design paradigm for OM systems. With +consideration of several specific challenges in leveraging LLM agents for OM, +we propose a generic framework, namely Agent-OM (Agent for Ontology Matching), +consisting of two Siamese agents for retrieval and matching, with a set of +simple OM tools. Our framework is implemented in a proof-of-concept system. +Evaluations of three Ontology Alignment Evaluation Initiative (OAEI) tracks +over state-of-the-art OM systems show that our system can achieve results very +close to the long-standing best performance on simple OM tasks and can +significantly improve the performance on complex and few-shot OM tasks. + +
+
+ comment: 19 pages, 13 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ An $\mathbf{L^*}$ Algorithm for Deterministic Weighted Regular Languages + + +
+ Extracting finite state automata (FSAs) from black-box models offers a +powerful approach to gaining interpretable insights into complex model +behaviors. To support this pursuit, we present a weighted variant of Angluin's +(1987) $\mathbf{L^*}$ algorithm for learning FSAs. We stay faithful to the +original algorithm, devising a way to exactly learn deterministic weighted FSAs +whose weights support division. Furthermore, we formulate the learning process +in a manner that highlights the connection with FSA minimization, showing how +$\mathbf{L^*}$ directly learns a minimal automaton for the target language. + +
+
+
+
+
+ + ♻ ☆ MaFeRw: Query Rewriting with Multi-Aspect Feedbacks for + Retrieval-Augmented Large Language Models + + +
+ In a real-world RAG system, the current query often involves spoken ellipses and ambiguous references from dialogue contexts, necessitating query rewriting to better describe the user's information needs. However, traditional context-based rewriting yields minimal improvement in downstream generation tasks due to the lengthy process from query rewriting to response generation. Some researchers try to utilize reinforcement learning with generation feedback to assist the rewriter, but these sparse rewards provide little guidance in most cases, leading to unstable training and generation results. We find that the user's needs are also reflected in the gold document, the retrieved documents, and the ground truth. Therefore, by feeding back these multi-aspect dense rewards to query rewriting, more stable and satisfactory responses can be achieved. In this paper, we propose a novel query rewriting method, MaFeRw, which improves RAG performance by integrating multi-aspect feedback from both the retrieval process and the generated results. Specifically, we first use manual data to train a T5 model for rewriter initialization. Next, we design three metrics as reinforcement learning feedback: the similarity between the rewritten query and the gold document, the ranking metrics, and ROUGE between the generation and the ground truth. Inspired by RLAIF, we train three kinds of reward models for the above metrics to achieve more efficient training. Finally, we combine the scores of these reward models as feedback and use the PPO algorithm to explore the optimal query rewriting strategy. Experimental results on two conversational RAG datasets demonstrate that MaFeRw achieves superior generation metrics and more stable training compared to baselines.
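The abstract lists three dense feedback signals that are combined into a single reward for the rewriter. A minimal sketch of that combination is shown below; the weights are assumptions, and in the actual method the three signals come from trained reward models before being passed to PPO.

```python
def combined_rewrite_reward(sim_to_gold_doc, ranking_metric, rouge_to_answer,
                            weights=(0.4, 0.3, 0.3)):
    """Illustrative dense reward for a rewritten query.

    The three inputs stand for (1) similarity between the rewrite and the gold document,
    (2) a retrieval ranking metric, and (3) ROUGE between the generation and the ground
    truth; the specific weights are an assumption, not the paper's values.
    """
    w1, w2, w3 = weights
    return w1 * sim_to_gold_doc + w2 * ranking_metric + w3 * rouge_to_answer

# Such a scalar would then be fed to a policy-gradient algorithm such as PPO.
print(combined_rewrite_reward(0.82, 0.65, 0.47))
```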
+
+
+
+
+ + ♻ ☆ LLMs instead of Human Judges? A Large Scale Empirical Study across 20 + NLP Evaluation Tasks + + +
+ There is an increasing trend towards evaluating NLP models with LLMs instead +of human judgments, raising questions about the validity of these evaluations, +as well as their reproducibility in the case of proprietary models. We provide +JUDGE-BENCH, an extensible collection of 20 NLP datasets with human annotations +covering a broad range of evaluated properties and types of data, and +comprehensively evaluate 11 current LLMs, covering both open-weight and +proprietary models, for their ability to replicate the annotations. Our +evaluations show substantial variance across models and datasets. Models are +reliable evaluators on some tasks, but overall display substantial variability +depending on the property being evaluated, the expertise level of the human +judges, and whether the language is human or model-generated. We conclude that +LLMs should be carefully validated against human judgments before being used as +evaluators. + +
+
+
+
+
+ + ♻ ☆ TrimLLM: Progressive Layer Dropping for Domain-Specific LLMs + + +
+ Specializing large language models (LLMs) for local deployment in +domain-specific use cases is necessary for strong performance while meeting +latency and privacy constraints. However, conventional task-specific adaptation +approaches do not show simultaneous memory saving and inference speedup at +deployment time. Practical compression techniques like quantization and pruning +require dedicated hardware or kernel support to achieve measured inference +speedup. We develop TrimLLM based on the layer-wise specialization phenomenon +we empirically observed and verified on contemporary LLMs. TrimLLM reduces the +depth of LLMs via progressive layer dropping. We show it retains LLMs' capacity +in specific domains and achieves inference speedup irrespective of hardware and +deep learning frameworks. We evaluated TrimLLM on LLMs of various sizes for +inference; models adapted on medical, legal, and financial datasets all +demonstrate $2.1-5.7\times$ inference speedup on consumer GPUs and up to +$3.1\times$ speedup on A100 when compared to state-of-the-art model compression +algorithms, with no loss in accuracy at 50$\sim$60\% model compression ratio. + +
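A greedy view of progressive layer dropping, which the abstract describes at a high level, might look like the following toy sketch: repeatedly remove the layer whose removal hurts a domain validation metric the least, until the target depth is reached. The `evaluate` callback and the stopping rule are illustrative assumptions, not TrimLLM's exact procedure.

```python
def progressive_layer_dropping(layers, evaluate, target_depth):
    """Greedy depth reduction: drop the layer whose removal hurts quality the least.

    `evaluate` returns a higher-is-better score for a candidate stack of layers;
    all names here are illustrative stand-ins, not TrimLLM's implementation.
    """
    layers = list(layers)
    while len(layers) > target_depth:
        scores = [(evaluate(layers[:i] + layers[i + 1:]), i) for i in range(len(layers))]
        _, drop_idx = max(scores)          # least harmful removal
        layers.pop(drop_idx)
    return layers

# Toy model: each "layer" contributes a fixed usefulness value on the target domain.
usefulness = [0.9, 0.1, 0.8, 0.05, 0.7, 0.2]
evaluate = lambda stack: sum(stack)
print(progressive_layer_dropping(usefulness, evaluate, target_depth=3))  # keeps 0.9, 0.8, 0.7
```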
+
+
+
+
+ + ♻ ☆ RAZOR: Sharpening Knowledge by Cutting Bias with Unsupervised Text + Rewriting AAAI'25 + + +
+ Despite the widespread use of LLMs due to their superior performance in various tasks, their high computational costs often lead potential users to opt for the pretraining-finetuning pipeline. However, biases prevalent in manually constructed datasets can introduce spurious correlations between tokens and labels, creating so-called shortcuts and hindering the generalizability of fine-tuned models. Existing debiasing methods often rely on prior knowledge of specific dataset biases, which is challenging to acquire a priori. We propose RAZOR (Rewriting And Zero-bias Optimization Refinement), a novel, unsupervised, and data-focused debiasing approach based on text rewriting for shortcut mitigation. RAZOR leverages LLMs to iteratively rewrite potentially biased text segments by replacing them with heuristically selected alternatives in a shortcut space defined by token statistics and positional information. This process aims to align surface-level text features more closely with diverse label distributions, thereby promoting the learning of genuine linguistic patterns. Compared with unsupervised SoTA models, RAZOR improves the F1 score by 3.5% on FEVER and by 6.5% on the MNLI and SNLI datasets. Additionally, RAZOR effectively mitigates specific known biases, reducing bias-related terms by a factor of two without requiring prior bias information, a result that is on par with SoTA models that leverage prior information. Our work prioritizes data manipulation over architectural modifications, emphasizing the pivotal role of data quality in enhancing model performance and fairness. This research contributes to developing more robust evaluation benchmarks for debiasing methods by incorporating metrics for bias reduction and overall model efficacy.
+
+ comment: Shuo and Bardh contributed equally. Accepted to AAAI'25, Paper #17117 +
+
+
+
+
+ + ♻ ☆ When Every Token Counts: Optimal Segmentation for Low-Resource Language + Models COLING 2025 + + +
+ Traditional greedy tokenization methods have been a critical step in Natural +Language Processing (NLP), influencing how text is converted into tokens and +directly impacting model performance. While subword tokenizers like Byte-Pair +Encoding (BPE) are widely used, questions remain about their optimality across +model scales and languages. In this work, we demonstrate through extensive +experiments that an optimal BPE configuration significantly reduces token count +compared to greedy segmentation, yielding improvements in token-saving +percentages and performance benefits, particularly for smaller models. We +evaluate tokenization performance across various intrinsic and extrinsic tasks, +including generation and classification. Our findings suggest that +compression-optimized tokenization strategies could provide substantial +advantages for multilingual and low-resource language applications, +highlighting a promising direction for further research and inclusive NLP. + +
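To illustrate the gap between greedy and optimal segmentation that this line of work studies, the sketch below contrasts greedy longest-match tokenisation with a dynamic programme that finds the fewest vocabulary pieces covering a word; the toy vocabulary and both functions are our own illustration, not the paper's exact configuration.

```python
def min_token_segmentation(text, vocab):
    """Dynamic programme finding a segmentation of `text` into the fewest vocabulary pieces.
    Generic sketch contrasting with greedy longest-match, not the paper's exact procedure."""
    INF = float("inf")
    best = [0] + [INF] * len(text)          # best[i]: min pieces covering text[:i]
    back = [None] * (len(text) + 1)
    for end in range(1, len(text) + 1):
        for start in range(end):
            if text[start:end] in vocab and best[start] + 1 < best[end]:
                best[end], back[end] = best[start] + 1, start
    tokens, i = [], len(text)
    while i > 0:                             # trace back the optimal split points
        tokens.append(text[back[i]:i])
        i = back[i]
    return tokens[::-1]

def greedy_longest_match(text, vocab):
    tokens, i = [], 0
    while i < len(text):
        for j in range(len(text), i, -1):    # try the longest piece first
            if text[i:j] in vocab:
                tokens.append(text[i:j])
                i = j
                break
    return tokens

vocab = {"un", "unhapp", "happily", "h", "a", "p", "i", "l", "y", "n", "u"}
print(greedy_longest_match("unhappily", vocab))    # ['unhapp', 'i', 'l', 'y']  (4 tokens)
print(min_token_segmentation("unhappily", vocab))  # ['un', 'happily']          (2 tokens)
```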
+
+ comment: LoResLM @ COLING 2025 +
+
+
+
+
+ + ♻ ☆ Deep CLAS: Deep Contextual Listen, Attend and Spell + + +
+ Contextual-LAS (CLAS) has been shown to be effective in improving Automatic Speech Recognition (ASR) of rare words. It relies on phrase-level contextual modeling and attention-based relevance scoring without explicit contextual constraints, which leads to insufficient use of contextual information. In this work, we propose deep CLAS to make better use of contextual information. We introduce a bias loss that forces the model to focus on contextual information. The query of the bias attention is also enriched to improve the accuracy of the bias attention score. To obtain fine-grained contextual information, we replace phrase-level encoding with character-level encoding and encode contextual information with a Conformer rather than an LSTM. Moreover, we directly use the bias attention score to correct the output probability distribution of the model. Experiments are conducted on the public AISHELL-1 and AISHELL-NER datasets. On AISHELL-1, compared to CLAS baselines, deep CLAS obtains a 65.78% relative increase in recall and a 53.49% relative increase in F1-score in the named entity recognition setting.
+
+ comment: Submitted to JUSTC +
+
+
+
+
+ + ♻ ☆ Towards an optimised evaluation of teachers' discourse: The case of + engaging messages + + +
+ Evaluating teachers' skills is crucial for enhancing education quality and +student outcomes. Teacher discourse, significantly influencing student +performance, is a key component. However, coding this discourse can be +laborious. This study addresses this issue by introducing a new methodology for +optimising the assessment of teacher discourse. The research consisted of two +studies, both within the framework of engaging messages used by secondary +education teachers. The first study involved training two large language models +on real-world examples from audio-recorded lessons over two academic years to +identify and classify the engaging messages from the lessons' transcripts. This +resulted in sensitivities of 84.31% and 91.11%, and specificities of 97.69% and +86.36% in identification and classification, respectively. The second study +applied these models to transcripts of audio-recorded lessons from a third +academic year to examine the frequency and distribution of message types by +educational level and moment of the academic year. Results showed teachers +predominantly use messages emphasising engagement benefits, linked to improved +outcomes, while one-third highlighted non-engagement disadvantages, associated +with increased anxiety. The use of engaging messages declined in Grade 12 and +towards the academic year's end. These findings suggest potential interventions +to optimise engaging message use, enhancing teaching quality and student +outcomes. + +
+
+
+
+
+ + ♻ ☆ Low-resource Machine Translation: what for? who for? An observational + study on a dedicated Tetun language translation service + + +
+ Low-resource machine translation (MT) presents a diversity of community needs +and application challenges that remain poorly understood. To complement surveys +and focus groups, which tend to rely on small samples of respondents, we +propose an observational study on actual usage patterns of a specialized MT +service for the Tetun language, which is the lingua franca in Timor-Leste. Our +analysis of 100,000 translation requests reveals patterns that challenge +assumptions based on existing corpora. We find that users, many of them +students on mobile devices, typically translate text from a high-resource +language into Tetun across diverse domains including science, healthcare, and +daily life. This contrasts sharply with available Tetun corpora, which are +dominated by news articles covering government and social issues. Our results +suggest that MT systems for minority languages like Tetun should prioritize +accuracy on domains relevant to educational contexts, in the high-resource to +low-resource direction. More broadly, this study demonstrates how observational +analysis can inform low-resource language technology development, by grounding +research in practical community needs. + +
+
+
+
+
+ + ♻ ☆ Piece of Table: A Divide-and-Conquer Approach for Selecting Sub-Tables + in Table Question Answering + + +
+ Applying language models (LMs) to tables is challenging due to the inherent +structural differences between two-dimensional tables and one-dimensional text +for which the LMs were originally designed. Furthermore, when applying +linearized tables to LMs, the maximum token lengths often imposed in +self-attention calculations make it difficult to comprehensively understand the +context spread across large tables. To address these challenges, we present +PieTa (Piece of Table), a new framework for sub-table-based question answering +(QA). PieTa operates through an iterative process of dividing tables into +smaller windows, using LMs to select relevant cells within each window, and +merging these cells into a sub-table. This multi-resolution approach captures +dependencies across multiple rows and columns while avoiding the limitations +caused by long context inputs. Instantiated as a simple iterative sub-table +union algorithm, PieTa demonstrates improved performance over previous +sub-table-based QA approaches. + +
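The divide-and-conquer loop described above can be sketched as follows, with a trivial stand-in for the LM that selects relevant cells inside each window; the function names, window size, and stopping rule are assumptions for illustration rather than PieTa's implementation.

```python
def pieta_style_select(table, window_rows, select_cells):
    """Schematic divide-and-select loop over a table (our own sketch).

    `table` is a list of rows; `select_cells(window)` stands in for the LM call that keeps
    only the cells relevant to the question; selected rows are merged and the process
    repeats while the table still shrinks and does not yet fit in one window."""
    while len(table) > window_rows:
        merged = []
        for i in range(0, len(table), window_rows):
            window = table[i:i + window_rows]
            merged.extend(select_cells(window))      # keep only question-relevant rows/cells
        if len(merged) >= len(table):                # no shrinkage -> stop to avoid looping
            break
        table = merged
    return table

# Toy stand-in for the LM selector: keep rows whose numeric cell exceeds a threshold.
select_cells = lambda window: [row for row in window if row[-1] > 50]
toy_table = [[f"item{i}", i * 7 % 100] for i in range(40)]
result = pieta_style_select(toy_table, window_rows=8, select_cells=select_cells)
print(len(toy_table), "rows reduced to", len(result))
```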
+
+
+
+
+ + ♻ ☆ Alignment-Enhanced Decoding:Defending via Token-Level Adaptive Refining + of Probability Distributions EMNLP 2024 + + +
+ Large language models are susceptible to jailbreak attacks, which can result in the generation of harmful content. While prior defenses mitigate these risks by perturbing or inspecting inputs, they ignore competing objectives, the underlying cause of alignment failures. In this paper, we propose Alignment-Enhanced Decoding (AED), a novel defense that employs adaptive decoding to address the root causes of jailbreak issues. We first define the Competitive Index to quantify alignment failures and utilize feedback from self-evaluation to compute post-alignment logits. Then, AED adaptively combines the post-alignment logits with the original logits to obtain harmless and helpful distributions. Consequently, our method enhances safety alignment while maintaining helpfulness. We conduct experiments across five models and four common jailbreaks, with the results validating the effectiveness of our approach. Code is available at https://github.com/GIGABaozi/AED.git.
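As a hedged sketch of decoding-stage blending in this spirit (the exact AED rule may differ), the snippet below mixes the original logits with self-evaluated post-alignment logits, weighting the latter more when the competitive index signals an alignment failure; the sigmoid weighting is our assumption.

```python
import torch

def aed_style_decode_step(original_logits, post_alignment_logits, competitive_index):
    """Blend original and post-alignment logits; the weighting rule is illustrative only."""
    w = torch.sigmoid(torch.tensor(float(competitive_index)))   # map index to a weight in (0, 1)
    mixed = (1 - w) * original_logits + w * post_alignment_logits
    return torch.softmax(mixed, dim=-1)

original = torch.tensor([2.0, 0.5, 0.1, -1.0, 0.0, 1.5])   # favours a harmful token (index 0)
aligned = torch.tensor([-2.0, 0.5, 0.1, 1.0, 0.0, 2.5])    # favours a safe token (index 5)
print(aed_style_decode_step(original, aligned, competitive_index=3.0).argmax().item())  # -> 5
```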
+
+ comment: Accepted by EMNLP 2024, 15 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for + Fast, Memory Efficient, and Long Context Finetuning and Inference + + +
+ Encoder-only transformer models such as BERT offer a great performance-size +tradeoff for retrieval and classification tasks with respect to larger +decoder-only models. Despite being the workhorse of numerous production +pipelines, there have been limited Pareto improvements to BERT since its +release. In this paper, we introduce ModernBERT, bringing modern model +optimizations to encoder-only models and representing a major Pareto +improvement over older encoders. Trained on 2 trillion tokens with a native +8192 sequence length, ModernBERT models exhibit state-of-the-art results on a +large pool of evaluations encompassing diverse classification tasks and both +single and multi-vector retrieval on different domains (including code). In +addition to strong downstream performance, ModernBERT is also the most speed +and memory efficient encoder and is designed for inference on common GPUs. + +
+
+
+
+
+ + ♻ ☆ Improving Retrieval Augmented Language Model with Self-Reasoning AAAI 2025 + + +
+ The Retrieval-Augmented Language Model (RALM) has shown remarkable performance on knowledge-intensive tasks by incorporating external knowledge during inference, which mitigates the factual hallucinations inherent in large language models (LLMs). Despite these advancements, challenges persist in the implementation of RALMs, particularly concerning their reliability and traceability. To be specific, irrelevant document retrieval may result in unhelpful response generation or even degrade the performance of LLMs, while the lack of proper citations in generated outputs complicates efforts to verify the trustworthiness of the models. To this end, we propose a novel self-reasoning framework aimed at improving the reliability and traceability of RALMs, whose core idea is to leverage reasoning trajectories generated by the LLM itself. The framework involves constructing self-reason trajectories with three processes: a relevance-aware process, an evidence-aware selective process, and a trajectory analysis process. We have evaluated our framework across four public datasets (two short-form QA datasets, one long-form QA dataset, and one fact verification dataset) to demonstrate the superiority of our method, which can outperform existing state-of-the-art models and achieve performance comparable to GPT-4, while using only 2,000 training samples.
+
+ comment: AAAI 2025 (main conference) +
+
+
+
+
+ + ♻ ☆ Progressive Multi-granular Alignments for Grounded Reasoning in Large + Vision-Language Models + + +
+ Existing Large Vision-Language Models (LVLMs) excel at matching concepts +across multi-modal inputs but struggle with compositional concepts and +high-level relationships between entities. This paper introduces Progressive +multi-granular Vision-Language alignments (PromViL), a novel framework to +enhance LVLMs' ability in performing grounded compositional visual reasoning +tasks. Our approach constructs a hierarchical structure of multi-modal +alignments, ranging from simple to complex concepts. By progressively aligning +textual descriptions with corresponding visual regions, our model learns to +leverage contextual information from lower levels to inform higher-level +reasoning. To facilitate this learning process, we introduce a data generation +process that creates a novel dataset derived from Visual Genome, providing a +wide range of nested compositional vision-language pairs. Experimental results +demonstrate that our PromViL framework significantly outperforms baselines on +various visual grounding and compositional question answering tasks. The code +is available at: https://github.com/lqh52/PromViL. + +
+
+
+
+
+ + ♻ ☆ Unleashing the Unseen: Harnessing Benign Datasets for Jailbreaking Large + Language Models + + +
+ Despite significant ongoing efforts in safety alignment, large language models (LLMs) such as GPT-4 and LLaMA 3 remain vulnerable to jailbreak attacks that can induce harmful behaviors, including through the use of adversarial suffixes. Building on prior research, we hypothesize that these adversarial suffixes are not mere bugs but may represent features that can dominate the LLM's behavior. To evaluate this hypothesis, we conduct several experiments. First, we demonstrate that benign features can be effectively made to function as adversarial suffixes, i.e., we develop a feature extraction method to extract sample-agnostic features from a benign dataset in the form of suffixes and show that these suffixes may effectively compromise safety alignment. Second, we show that adversarial suffixes generated from jailbreak attacks may contain meaningful features, i.e., appending the same suffix to different prompts results in responses exhibiting specific characteristics. Third, we show that such benign-yet-safety-compromising features can be easily introduced through fine-tuning using only benign datasets. As a result, we are able to completely eliminate GPT's safety alignment in a black-box setting through fine-tuning with only benign data. Our code and data are available at \url{https://github.com/suffix-maybe-feature/adver-suffix-maybe-features}.
+
+
+
+
+ + ♻ ☆ VisualRWKV: Exploring Recurrent Neural Networks for Visual Language + Models COLING 2025 + + +
+ Visual Language Models (VLMs) have rapidly progressed with the recent success +of large language models. However, there have been few attempts to incorporate +efficient linear Recurrent Neural Networks (RNNs) architectures into VLMs. In +this study, we introduce VisualRWKV, the first application of a linear RNN +model to multimodal learning tasks, leveraging the pre-trained RWKV language +model. We propose a data-dependent recurrence and sandwich prompts to enhance +our modeling capabilities, along with a 2D image scanning mechanism to enrich +the processing of visual sequences. Extensive experiments demonstrate that +VisualRWKV achieves competitive performance compared to Transformer-based +models like LLaVA-1.5 on various benchmarks. Compared to LLaVA-1.5, VisualRWKV +has a speed advantage of 3.98 times and can save 54% of GPU memory when +reaching an inference length of 24K tokens. To facilitate further research and +analysis, we have made the checkpoints and the associated code publicly +accessible at the following GitHub repository: see +https://github.com/howard-hou/VisualRWKV. + +
+
+ comment: Accepted at COLING 2025 main conference +
+
+
+
+
+ + ♻ ☆ Fairness in Large Language Models: A Taxonomic Survey + + +
+ Large Language Models (LLMs) have demonstrated remarkable success across +various domains. However, despite their promising performance in numerous +real-world applications, most of these algorithms lack fairness considerations. +Consequently, they may lead to discriminatory outcomes against certain +communities, particularly marginalized populations, prompting extensive study +in fair LLMs. On the other hand, fairness in LLMs, in contrast to fairness in +traditional machine learning, entails exclusive backgrounds, taxonomies, and +fulfillment techniques. To this end, this survey presents a comprehensive +overview of recent advances in the existing literature concerning fair LLMs. +Specifically, a brief introduction to LLMs is provided, followed by an analysis +of factors contributing to bias in LLMs. Additionally, the concept of fairness +in LLMs is discussed categorically, summarizing metrics for evaluating bias in +LLMs and existing algorithms for promoting fairness. Furthermore, resources for +evaluating bias in LLMs, including toolkits and datasets, are summarized. +Finally, existing research challenges and open questions are discussed. + +
+
+
+
+
+ + ♻ ☆ Doubly-Universal Adversarial Perturbations: Deceiving Vision-Language + Models Across Both Images and Text with a Single Perturbation + + +
+ Large Vision-Language Models (VLMs) have demonstrated remarkable performance +across multimodal tasks by integrating vision encoders with large language +models (LLMs). However, these models remain vulnerable to adversarial attacks. +Among such attacks, Universal Adversarial Perturbations (UAPs) are especially +powerful, as a single optimized perturbation can mislead the model across +various input images. In this work, we introduce a novel UAP specifically +designed for VLMs: the Doubly-Universal Adversarial Perturbation (Doubly-UAP), +capable of universally deceiving VLMs across both image and text inputs. To +successfully disrupt the vision encoder's fundamental process, we analyze the +core components of the attention mechanism. After identifying value vectors in +the middle-to-late layers as the most vulnerable, we optimize Doubly-UAP in a +label-free manner with a frozen model. Despite being developed as a black-box +to the LLM, Doubly-UAP achieves high attack success rates on VLMs, consistently +outperforming baseline methods across vision-language tasks. Extensive ablation +studies and analyses further demonstrate the robustness of Doubly-UAP and +provide insights into how it influences internal attention mechanisms. + +
+
+
+
+
+ + ♻ ☆ Self-Generated Critiques Boost Reward Modeling for Language Models + + +
+ Reward modeling is crucial for aligning large language models (LLMs) with +human preferences, especially in reinforcement learning from human feedback +(RLHF). However, current reward models mainly produce scalar scores and +struggle to incorporate critiques in a natural language format. We hypothesize +that predicting both critiques and the scalar reward would improve reward +modeling ability. Motivated by this, we propose Critic-RM, a framework that +improves reward models using self-generated critiques without extra +supervision. Critic-RM employs a two-stage process: generating and filtering +high-quality critiques, followed by joint fine-tuning on reward prediction and +critique generation. Experiments across benchmarks show that Critic-RM improves +reward modeling accuracy by 3.7%-7.3% compared to standard reward models and +LLM judges, demonstrating strong performance and data efficiency. Additional +studies further validate the effectiveness of generated critiques in rectifying +flawed reasoning steps with 2.5%-3.2% gains in improving reasoning accuracy. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ KnowledgePrompts: Exploring the Abilities of Large Language Models to + Solve Proportional Analogies via Knowledge-Enhanced Prompting COLING 2025 + + +
+ Making analogies is fundamental to cognition. Proportional analogies, which consist of four terms, are often used to assess linguistic and cognitive abilities. For instance, completing analogies like "Oxygen is to Gas as ___ is to ___" requires identifying the semantic relationship (e.g., "type of") between the first pair of terms ("Oxygen" and "Gas") and finding a second pair that shares the same relationship (e.g., "Aluminum" and "Metal"). In this work, we introduce a 15K Multiple-Choice Question Answering (MCQA) dataset for proportional analogy completion and evaluate the performance of contemporary Large Language Models (LLMs) in various knowledge-enhanced prompt settings. Specifically, we augment prompts with three types of knowledge: exemplar, structured, and targeted. Our results show that despite extensive training data, solving proportional analogies remains challenging for current LLMs, with the best model achieving an accuracy of 55%. Notably, we find that providing targeted knowledge can better assist models in completing proportional analogies compared to providing exemplars or collections of structured knowledge. Our code and data are available at: https://github.com/Thiliniiw/KnowledgePrompts/
+
+ comment: Accepted at COLING 2025 +
+
+
+
+
+ + ♻ ☆ UOR: Universal Backdoor Attacks on Pre-trained Language Models ACL + + +
+ Backdoors implanted in pre-trained language models (PLMs) can be transferred +to various downstream tasks, which exposes a severe security threat. However, +most existing backdoor attacks against PLMs are un-targeted and task-specific. +Few targeted and task-agnostic methods use manually pre-defined triggers and +output representations, which prevent the attacks from being more effective and +general. In this paper, we first summarize the requirements that a more +threatening backdoor attack against PLMs should satisfy, and then propose a new +backdoor attack method called UOR, which breaks the bottleneck of the previous +approach by turning manual selection into automatic optimization. Specifically, +we define poisoned supervised contrastive learning which can automatically +learn the more uniform and universal output representations of triggers for +various PLMs. Moreover, we use gradient search to select appropriate trigger +words which can be adaptive to different PLMs and vocabularies. Experiments +show that our method can achieve better attack performance on various text +classification tasks compared to manual methods. Further, we tested our method +on PLMs with different architectures, different usage paradigms, and more +difficult tasks, which demonstrated the universality of our method. + +
+
+ comment: ACL-Findings 2024 +
+
+
+
+
+ + ♻ ☆ DavIR: Data Selection via Implicit Reward for Large Language Models + + +
+ We introduce DavIR, a model-based data selection method for post-training Large Language Models. DavIR generalizes Reducible Holdout Loss to the core-set selection problem of causal language modeling, and quantifies the learnability of a given datum with respect to a pre-trained LLM based on the relative reduction in loss during fine-tuning, a metric we show to be closely related to the implicit reward model described in Direct Preference Optimization (DPO). We show that 6% of the Alpaca dataset selected with DavIR can steer both the LLaMA and Gemma model families to produce superior performance compared to the same models trained on the full 52K dataset. We also show that the Alpaca dataset compressed with DavIR can be combined with the GSM8K dataset to effectively balance open-domain freeform QA and mathematical reasoning capabilities. Finally, we apply the DavIR objective to DPO and develop a normalized DavIR-DPO objective which improves the alignment performance of the Zephyr-7B-SFT model by 8% (relative) on AlpacaEval, compared against training on the vanilla DPO objective.
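The learnability notion described above, relative loss reduction on a datum during fine-tuning, can be written down in a few lines; the exact DavIR normalisation may differ, and the example data points below are invented for illustration.

```python
def davir_style_learnability(loss_before, loss_after):
    """Illustrative relative-loss-reduction score: how much a short fine-tuning step on a
    datum reduces the model's loss on it, normalised by the initial loss. The exact DavIR
    formulation may differ; this is a sketch of the idea described above."""
    return (loss_before - loss_after) / max(loss_before, 1e-8)

# Rank toy data points by learnability and keep the top fraction for training.
candidates = {"easy-already-known": (0.20, 0.19),
              "learnable": (2.10, 0.90),
              "too-hard-or-noisy": (3.50, 3.45)}
ranked = sorted(candidates, key=lambda k: davir_style_learnability(*candidates[k]), reverse=True)
print(ranked)  # the "learnable" example should rank first
```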
+
+
+
+
+ + ♻ ☆ Safetywashing: Do AI Safety Benchmarks Actually Measure Safety Progress? NeurIPS 2024 + + +
+ As artificial intelligence systems grow more powerful, there has been +increasing interest in "AI safety" research to address emerging and future +risks. However, the field of AI safety remains poorly defined and +inconsistently measured, leading to confusion about how researchers can +contribute. This lack of clarity is compounded by the unclear relationship +between AI safety benchmarks and upstream general capabilities (e.g., general +knowledge and reasoning). To address these issues, we conduct a comprehensive +meta-analysis of AI safety benchmarks, empirically analyzing their correlation +with general capabilities across dozens of models and providing a survey of +existing directions in AI safety. Our findings reveal that many safety +benchmarks highly correlate with both upstream model capabilities and training +compute, potentially enabling "safetywashing" -- where capability improvements +are misrepresented as safety advancements. Based on these findings, we propose +an empirical foundation for developing more meaningful safety metrics and +define AI safety in a machine learning research context as a set of clearly +delineated research goals that are empirically separable from generic +capabilities advancements. In doing so, we aim to provide a more rigorous +framework for AI safety research, advancing the science of safety evaluations +and clarifying the path towards measurable progress. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ SafeAligner: Safety Alignment against Jailbreak Attacks via Response + Disparity Guidance + + +
+ As the development of large language models (LLMs) rapidly advances, securing +these models effectively without compromising their utility has become a +pivotal area of research. However, current defense strategies against jailbreak +attacks (i.e., efforts to bypass security protocols) often suffer from limited +adaptability, restricted general capability, and high cost. To address these +challenges, we introduce SafeAligner, a methodology implemented at the decoding +stage to fortify defenses against jailbreak attacks. We begin by developing two +specialized models: the Sentinel Model, which is trained to foster safety, and +the Intruder Model, designed to generate riskier responses. SafeAligner +leverages the disparity in security levels between the responses from these +models to differentiate between harmful and beneficial tokens, effectively +guiding the safety alignment by altering the output token distribution of the +target model. Extensive experiments show that SafeAligner can increase the +likelihood of beneficial tokens, while reducing the occurrence of harmful ones, +thereby ensuring secure alignment with minimal loss to generality. + +
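A minimal sketch of response-disparity guidance, under the assumption of a simple additive adjustment (SafeAligner's actual combination rule may differ): tokens the safety-trained Sentinel model prefers over the risk-prone Intruder model are boosted in the target model's next-token distribution.

```python
import torch

def safealigner_style_logits(target_logits, sentinel_logits, intruder_logits, beta=1.0):
    """Adjust target logits by the Sentinel-Intruder disparity; beta and the additive
    form are our assumptions for illustration."""
    disparity = sentinel_logits - intruder_logits      # positive where the safe model disagrees
    return target_logits + beta * disparity

target = torch.tensor([1.2, 0.3, 0.8])     # target model slightly favours token 0
sentinel = torch.tensor([0.1, 1.5, 0.6])   # safety-trained model favours token 1
intruder = torch.tensor([1.4, 0.2, 0.6])   # risk-prone model favours token 0
adjusted = safealigner_style_logits(target, sentinel, intruder)
print(torch.softmax(adjusted, dim=-1).argmax().item())  # token 1 after guidance
```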
+
+
+
+
+ + ♻ ☆ Agent Planning with World Knowledge Model NeurIPS 2024 + + +
+ Recent endeavors towards directly using large language models (LLMs) as agent +models to execute interactive planning tasks have shown commendable results. +Despite their achievements, however, they still struggle with brainless +trial-and-error in global planning and generating hallucinatory actions in +local planning due to their poor understanding of the ``real'' physical world. +Imitating humans' mental world knowledge model which provides global prior +knowledge before the task and maintains local dynamic knowledge during the +task, in this paper, we introduce parametric World Knowledge Model (WKM) to +facilitate agent planning. Concretely, we steer the agent model to +self-synthesize knowledge from both expert and sampled trajectories. Then we +develop WKM, providing prior task knowledge to guide the global planning and +dynamic state knowledge to assist the local planning. Experimental results on +three complex real-world simulated datasets with three state-of-the-art +open-source LLMs, Mistral-7B, Gemma-7B, and Llama-3-8B, demonstrate that our +method can achieve superior performance compared to various strong baselines. +Besides, we analyze to illustrate that our WKM can effectively alleviate the +blind trial-and-error and hallucinatory action issues, providing strong support +for the agent's understanding of the world. Other interesting findings include: +1) our instance-level task knowledge can generalize better to unseen tasks, 2) +weak WKM can guide strong agent model planning, and 3) unified WKM training has +promising potential for further development. The code is available at +https://github.com/zjunlp/WKM. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ WISE: Rethinking the Knowledge Memory for Lifelong Model Editing of + Large Language Models NeurIPS 2024 + + +
+ Large language models (LLMs) need knowledge updates to meet the ever-growing +world facts and correct the hallucinated responses, facilitating the methods of +lifelong model editing. Where the updated knowledge resides in memories is a +fundamental question for model editing. In this paper, we find that editing +either long-term memory (direct model parameters) or working memory +(non-parametric knowledge of neural network activations/representations by +retrieval) will result in an impossible triangle -- reliability, +generalization, and locality can not be realized together in the lifelong +editing settings. For long-term memory, directly editing the parameters will +cause conflicts with irrelevant pretrained knowledge or previous edits (poor +reliability and locality). For working memory, retrieval-based activations can +hardly make the model understand the edits and generalize (poor +generalization). Therefore, we propose WISE to bridge the gap between memories. +In WISE, we design a dual parametric memory scheme, which consists of the main +memory for the pretrained knowledge and a side memory for the edited knowledge. +We only edit the knowledge in the side memory and train a router to decide +which memory to go through when given a query. For continual editing, we devise +a knowledge-sharding mechanism where different sets of edits reside in distinct +subspaces of parameters, and are subsequently merged into a shared memory +without conflicts. Extensive experiments show that WISE can outperform previous +model editing methods and overcome the impossible triangle under lifelong model +editing of question answering, hallucination, and out-of-distribution settings +across trending LLM architectures, e.g., GPT, LLaMA, and Mistral. Code is +available at https://github.com/zjunlp/EasyEdit. + +
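To make the dual-memory routing idea concrete, here is a toy sketch in which a single learned direction routes each query either to the pretrained main memory or to the edited side memory; the thresholded dot-product router and all names are our assumptions, and WISE's actual routing and knowledge-sharding are more involved.

```python
import torch

def wise_style_forward(query_act, main_ffn, side_ffn, router_direction, threshold=0.0):
    """Route a query activation to the main (pretrained) or side (edited) memory.
    The single-direction router is an illustrative simplification."""
    score = query_act @ router_direction                 # routing activation for this query
    return side_ffn(query_act) if score > threshold else main_ffn(query_act)

dim = 4
main_ffn = torch.nn.Linear(dim, dim)                      # stands in for pretrained knowledge
side_ffn = torch.nn.Linear(dim, dim)                      # stands in for edited knowledge
router_direction = torch.tensor([1.0, -1.0, 0.0, 0.0])

edited_query = torch.tensor([2.0, 0.0, 0.1, 0.1])         # should hit the side memory
unrelated_query = torch.tensor([-1.0, 1.0, 0.1, 0.1])     # should stay on the main memory
print(wise_style_forward(edited_query, main_ffn, side_ffn, router_direction).shape)
print(wise_style_forward(unrelated_query, main_ffn, side_ffn, router_direction).shape)
```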
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ DialSim: A Real-Time Simulator for Evaluating Long-Term Multi-Party + Dialogue Understanding of Conversational Agents + + +
+ Recent advancements in Large Language Models (LLMs) have significantly +enhanced the capabilities of conversational agents, making them applicable to +various fields (e.g., education). Despite their progress, the evaluation of the +agents often overlooks the complexities of real-world conversations, such as +real-time interactions, multi-party dialogues, and extended contextual +dependencies. To bridge this gap, we introduce DialSim, a real-time dialogue +simulator. In this simulator, an agent is assigned the role of a character from +popular TV shows, requiring it to respond to spontaneous questions using past +dialogue information and to distinguish between known and unknown information. +Key features of DialSim include assessing the agent's ability to respond within +a reasonable time limit, handling long-term multi-party dialogues, and +evaluating performance under randomized questioning with LongDialQA, a novel, +high-quality question-answering dataset. Our experiments using DialSim reveal +the strengths and weaknesses of the latest conversational agents, offering +valuable insights for future advancements in conversational AI. DialSim is +available at https://dialsim.github.io/. + +
+
+
+
+
+ + ♻ ☆ Knowledge Circuits in Pretrained Transformers NeurIPS 2024 + + +
+ The remarkable capabilities of modern large language models are rooted in +their vast repositories of knowledge encoded within their parameters, enabling +them to perceive the world and engage in reasoning. The inner workings of how +these models store knowledge have long been a subject of intense interest and +investigation among researchers. To date, most studies have concentrated on +isolated components within these models, such as the Multilayer Perceptrons and +attention head. In this paper, we delve into the computation graph of the +language model to uncover the knowledge circuits that are instrumental in +articulating specific knowledge. The experiments, conducted with GPT2 and +TinyLLAMA, have allowed us to observe how certain information heads, relation +heads, and Multilayer Perceptrons collaboratively encode knowledge within the +model. Moreover, we evaluate the impact of current knowledge editing techniques +on these knowledge circuits, providing deeper insights into the functioning and +constraints of these editing methodologies. Finally, we utilize knowledge +circuits to analyze and interpret language model behaviors such as +hallucinations and in-context learning. We believe the knowledge circuits hold +potential for advancing our understanding of Transformers and guiding the +improved design of knowledge editing. Code and data are available in +https://github.com/zjunlp/KnowledgeCircuits. + +
+
+ comment: NeurIPS 2024, 26 pages +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ UIP2P: Unsupervised Instruction-based Image Editing via Cycle Edit + Consistency + + +
+ We propose an unsupervised model for instruction-based image editing that +eliminates the need for ground-truth edited images during training. Existing +supervised methods depend on datasets containing triplets of input image, +edited image, and edit instruction. These are generated by either existing +editing methods or human-annotations, which introduce biases and limit their +generalization ability. Our method addresses these challenges by introducing a +novel editing mechanism called Cycle Edit Consistency (CEC), which applies +forward and backward edits in one training step and enforces consistency in +image and attention spaces. This allows us to bypass the need for ground-truth +edited images and unlock training for the first time on datasets comprising +either real image-caption pairs or image-caption-edit triplets. We empirically +show that our unsupervised technique performs better across a broader range of +edits with high fidelity and precision. By eliminating the need for +pre-existing datasets of triplets, reducing biases associated with supervised +methods, and proposing CEC, our work represents a significant advancement in +unblocking scaling of instruction-based image editing. + +
+
+ comment: Project page: https://enis.dev/uip2p/ +
+
+
+
+
+ + ☆ EnvGS: Modeling View-Dependent Appearance with Environment Gaussian + + +
+ Reconstructing complex reflections in real-world scenes from 2D images is +essential for achieving photorealistic novel view synthesis. Existing methods +that utilize environment maps to model reflections from distant lighting often +struggle with high-frequency reflection details and fail to account for +near-field reflections. In this work, we introduce EnvGS, a novel approach that +employs a set of Gaussian primitives as an explicit 3D representation for +capturing reflections of environments. These environment Gaussian primitives +are incorporated with base Gaussian primitives to model the appearance of the +whole scene. To efficiently render these environment Gaussian primitives, we +developed a ray-tracing-based renderer that leverages the GPU's RT core for +fast rendering. This allows us to jointly optimize our model for high-quality +reconstruction while maintaining real-time rendering speeds. Results from +multiple real-world and synthetic datasets demonstrate that our method produces +significantly more detailed reflections, achieving the best rendering quality +in real-time novel view synthesis. + +
+
+ comment: Project page: https://zju3dv.github.io/envgs/ +
+
+
+
+
+ + ☆ Flowing from Words to Pixels: A Framework for Cross-Modality Evolution + + +
+ Diffusion models, and their generalization, flow matching, have had a +remarkable impact on the field of media generation. Here, the conventional +approach is to learn the complex mapping from a simple source distribution of +Gaussian noise to the target media distribution. For cross-modal tasks such as +text-to-image generation, this same mapping from noise to image is learnt +whilst including a conditioning mechanism in the model. One key and thus far +relatively unexplored feature of flow matching is that, unlike diffusion +models, it does not require the source distribution to be noise. +Hence, in this paper, we propose a paradigm shift, and ask the question of +whether we can instead train flow matching models to learn a direct mapping +from the distribution of one modality to the distribution of another, thus +obviating the need for both the noise distribution and conditioning mechanism. +We present a general and simple framework, CrossFlow, for cross-modal flow +matching. We show the importance of applying Variational Encoders to the input +data, and introduce a method to enable Classifier-free guidance. Surprisingly, +for text-to-image, CrossFlow with a vanilla transformer without cross attention +slightly outperforms standard flow matching, and we show that it scales better +with training steps and model size, while also allowing for interesting latent +arithmetic which results in semantically meaningful edits in the output space. +To demonstrate the generalizability of our approach, we also show that +CrossFlow is on par with or outperforms the state-of-the-art for various +cross-modal / intra-modal mapping tasks, viz. image captioning, depth +estimation, and image super-resolution. We hope this paper contributes to +accelerating progress in cross-modal media generation. + +
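Editor's note: a hedged sketch of the cross-modal flow-matching idea this abstract describes, not CrossFlow itself. It shows one training step that regresses a velocity field along a straight path between latents of two modalities; `velocity_net`, `z_text`, and `z_image` are assumed placeholders.

```python
import torch
import torch.nn.functional as F

def cross_modal_flow_matching_loss(velocity_net, z_text, z_image):
    """Flow matching from one modality's latent distribution to another's,
    without a Gaussian prior or a conditioning branch. z_*: (B, D) latents."""
    t = torch.rand(z_text.size(0), 1, device=z_text.device)   # per-sample time in [0, 1]
    z_t = (1 - t) * z_text + t * z_image                       # point on the straight path
    v_target = z_image - z_text                                # constant target velocity
    v_pred = velocity_net(z_t, t.squeeze(-1))                  # assumed network interface
    return F.mse_loss(v_pred, v_target)
```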
+
+ comment: Project page: https://cross-flow.github.io/ +
+
+
+
+
+ + ☆ LeviTor: 3D Trajectory Oriented Image-to-Video Synthesis + + +
+ The intuitive nature of drag-based interaction has led to its growing +adoption for controlling object trajectories in image-to-video synthesis. +Still, existing methods that perform dragging in the 2D space usually face +ambiguity when handling out-of-plane movements. In this work, we augment the +interaction with a new dimension, i.e., the depth dimension, such that users +are allowed to assign a relative depth for each point on the trajectory. That +way, our new interaction paradigm not only inherits the convenience from 2D +dragging, but facilitates trajectory control in the 3D space, broadening the +scope of creativity. We propose a pioneering method for 3D trajectory control +in image-to-video synthesis by abstracting object masks into a few cluster +points. These points, accompanied by the depth information and the instance +information, are finally fed into a video diffusion model as the control +signal. Extensive experiments validate the effectiveness of our approach, +dubbed LeviTor, in precisely manipulating the object movements when producing +photo-realistic videos from static images. Project page: +https://ppetrichor.github.io/levitor.github.io/ + +
+
+ comment: Project page available at + https://ppetrichor.github.io/levitor.github.io/ +
+
+
+
+
+ + ☆ Generative Multiview Relighting for 3D Reconstruction under Extreme + Illumination Variation + + +
+ Reconstructing the geometry and appearance of objects from photographs taken +in different environments is difficult as the illumination and therefore the +object appearance vary across captured images. This is particularly challenging +for more specular objects whose appearance strongly depends on the viewing +direction. Some prior approaches model appearance variation across images using +a per-image embedding vector, while others use physically-based rendering to +recover the materials and per-image illumination. Such approaches fail at +faithfully recovering view-dependent appearance given significant variation in +input illumination and tend to produce mostly diffuse results. We present an +approach that reconstructs objects from images taken under different +illuminations by first relighting the images under a single reference +illumination with a multiview relighting diffusion model and then +reconstructing the object's geometry and appearance with a radiance field +architecture that is robust to the small remaining inconsistencies among the +relit images. We validate our proposed approach on both synthetic and real +datasets and demonstrate that it greatly outperforms existing techniques at +reconstructing high-fidelity appearance from images taken under extreme +illumination variation. Moreover, our approach is particularly effective at +recovering view-dependent "shiny" appearance which cannot be reconstructed by +prior methods. + +
+
+ comment: Project page: https://relight-to-reconstruct.github.io/ +
+
+
+
+
+ + ☆ Scaling 4D Representations + + +
+ Scaling has not yet been convincingly demonstrated for pure self-supervised +learning from video. However, prior work has focused evaluations on +semantic-related tasks -- action classification, ImageNet +classification, etc. In this paper we focus on evaluating self-supervised +learning on non-semantic vision tasks that are more spatial (3D) and temporal +(+1D = 4D), such as camera pose estimation, point and object tracking, and +depth estimation. We show that by learning from very large video datasets, +masked auto-encoding (MAE) with transformer video models actually scales, +consistently improving performance on these 4D tasks, as model size increases +from 20M all the way to 22B parameters -- by far the largest self-supervised +video model reported to date. Rigorous apples-to-apples comparison with +many recent image and video models demonstrates the benefits of scaling 4D +representations. + +
+
+
+
+
+ + ☆ PRIMA: Multi-Image Vision-Language Models for Reasoning Segmentation + + +
+ Despite significant advancements in Large Vision-Language Models (LVLMs), +existing pixel-grounding models operate on single-image settings, limiting +their ability to perform detailed, fine-grained comparisons across multiple +images. Conversely, current multi-image understanding models lack pixel-level +grounding. Our work addresses this gap by introducing the task of multi-image +pixel-grounded reasoning segmentation, and PRIMA, a novel LVLM that integrates +pixel-level grounding with robust multi-image reasoning capabilities to produce +contextually rich, pixel-grounded explanations. Central to PRIMA is an +efficient vision module that queries fine-grained visual representations across +multiple images, reducing TFLOPs by $25.3\%$. To support training and +evaluation, we curate $M^4Seg$, a new reasoning segmentation benchmark +consisting of $\sim$224K question-answer pairs that require fine-grained visual +understanding across multiple images. Experimental results demonstrate PRIMA +outperforms state-of-the-art baselines. + +
+
+ comment: Project page: https://plan-lab.github.io/prima +
+
+
+
+
+ + ☆ OpenEMMA: Open-Source Multimodal Model for End-to-End Autonomous Driving + + +
+ Since their advent, Multimodal Large Language Models (MLLMs) have made +a significant impact across a wide range of real-world applications, +particularly in Autonomous Driving (AD). Their ability to process complex +visual data and reason about intricate driving scenarios has paved the way for +a new paradigm in end-to-end AD systems. However, the progress of developing +end-to-end models for AD has been slow, as existing fine-tuning methods demand +substantial resources, including extensive computational power, large-scale +datasets, and significant funding. Drawing inspiration from recent advancements +in inference computing, we propose OpenEMMA, an open-source end-to-end +framework based on MLLMs. By incorporating the Chain-of-Thought reasoning +process, OpenEMMA achieves significant improvements compared to the baseline +when leveraging a diverse range of MLLMs. Furthermore, OpenEMMA demonstrates +effectiveness, generalizability, and robustness across a variety of challenging +driving scenarios, offering a more efficient and effective approach to +autonomous driving. We release all the code at +https://github.com/taco-group/OpenEMMA. + +
+
+
+
+
+ + ☆ AutoTrust: Benchmarking Trustworthiness in Large Vision Language Models + for Autonomous Driving + + +
+ Recent advancements in large vision language models (VLMs) tailored for +autonomous driving (AD) have shown strong scene understanding and reasoning +capabilities, making them undeniable candidates for end-to-end driving systems. +However, limited work exists on studying the trustworthiness of DriveVLMs -- a +critical factor that directly impacts public transportation safety. In this +paper, we introduce AutoTrust, a comprehensive trustworthiness benchmark for +large vision-language models in autonomous driving (DriveVLMs), considering +diverse perspectives -- including trustfulness, safety, robustness, privacy, +and fairness. We constructed the largest visual question-answering dataset for +investigating trustworthiness issues in driving scenarios, comprising over 10k +unique scenes and 18k queries. We evaluated six publicly available VLMs, +spanning from generalist to specialist, from open-source to commercial models. +Our exhaustive evaluations have unveiled previously undiscovered +vulnerabilities of DriveVLMs to trustworthiness threats. Specifically, we found +that the general VLMs like LLaVA-v1.6 and GPT-4o-mini surprisingly outperform +specialized models fine-tuned for driving in terms of overall trustworthiness. +DriveVLMs like DriveLM-Agent are particularly vulnerable to disclosing +sensitive information. Additionally, both generalist and specialist VLMs remain +susceptible to adversarial attacks and struggle to ensure unbiased +decision-making across diverse environments and populations. Our findings call +for immediate and decisive action to address the trustworthiness of DriveVLMs +-- an issue of critical importance to public safety and the welfare of all +citizens relying on autonomous transportation systems. Our benchmark is +publicly available at \url{https://github.com/taco-group/AutoTrust}, and the +leaderboard is released at \url{https://taco-group.github.io/AutoTrust/}. + +
+
+ comment: 55 pages, 14 figures +
+
+
+
+
+ + ☆ FlowAR: Scale-wise Autoregressive Image Generation Meets Flow Matching + + +
+ Autoregressive (AR) modeling has achieved remarkable success in natural +language processing by enabling models to generate text with coherence and +contextual understanding through next token prediction. Recently, in image +generation, VAR proposes scale-wise autoregressive modeling, which extends the +next token prediction to the next scale prediction, preserving the 2D structure +of images. However, VAR encounters two primary challenges: (1) its complex and +rigid scale design limits generalization in next scale prediction, and (2) the +generator's dependence on a discrete tokenizer with the same complex scale +structure restricts modularity and flexibility in updating the tokenizer. To +address these limitations, we introduce FlowAR, a general next scale prediction +method featuring a streamlined scale design, where each subsequent scale is +simply double the previous one. This eliminates the need for VAR's intricate +multi-scale residual tokenizer and enables the use of any off-the-shelf +Variational AutoEncoder (VAE). Our simplified design enhances generalization in +next scale prediction and facilitates the integration of Flow Matching for +high-quality image synthesis. We validate the effectiveness of FlowAR on the +challenging ImageNet-256 benchmark, demonstrating superior generation +performance compared to previous methods. Codes will be available at +\url{https://github.com/OliverRensu/FlowAR}. + +
+
+
+
+
+ + ☆ DI-PCG: Diffusion-based Efficient Inverse Procedural Content Generation + for High-quality 3D Asset Creation + + +
+ Procedural Content Generation (PCG) is powerful in creating high-quality 3D +contents, yet controlling it to produce desired shapes is difficult and often +requires extensive parameter tuning. Inverse Procedural Content Generation aims +to automatically find the best parameters under the input condition. However, +existing sampling-based and neural network-based methods still suffer from +numerous sample iterations or limited controllability. In this work, we present +DI-PCG, a novel and efficient method for Inverse PCG from general image +conditions. At its core is a lightweight diffusion transformer model, where PCG +parameters are directly treated as the denoising target and the observed images +as conditions to control parameter generation. DI-PCG is efficient and +effective. With only 7.6M network parameters and 30 GPU hours to train, it +demonstrates superior performance in recovering parameters accurately, and +generalizing well to in-the-wild images. Quantitative and qualitative +experiment results validate the effectiveness of DI-PCG in inverse PCG and +image-to-3D generation tasks. DI-PCG offers a promising approach for efficient +inverse PCG and represents a valuable exploration step towards a 3D generation +path that models how to construct a 3D asset using parametric models. + +
+
+ comment: Project page: https://thuzhaowang.github.io/projects/DI-PCG/ +
+
+
+
+
+ + ☆ LiDAR-RT: Gaussian-based Ray Tracing for Dynamic LiDAR Re-simulation + + +
+ This paper targets the challenge of real-time LiDAR re-simulation in dynamic +driving scenarios. Recent approaches utilize neural radiance fields combined +with the physical modeling of LiDAR sensors to achieve high-fidelity +re-simulation results. Unfortunately, these methods face limitations due to +high computational demands in large-scale scenes and cannot perform real-time +LiDAR rendering. To overcome these constraints, we propose LiDAR-RT, a novel +framework that supports real-time, physically accurate LiDAR re-simulation for +driving scenes. Our primary contribution is the development of an efficient and +effective rendering pipeline, which integrates Gaussian primitives and +hardware-accelerated ray tracing technology. Specifically, we model the +physical properties of LiDAR sensors using Gaussian primitives with learnable +parameters and incorporate scene graphs to handle scene dynamics. Building upon +this scene representation, our framework first constructs a bounding volume +hierarchy (BVH), then casts rays for each pixel and generates novel LiDAR views +through a differentiable rendering algorithm. Importantly, our framework +supports realistic rendering with flexible scene editing operations and various +sensor configurations. Extensive experiments across multiple public benchmarks +demonstrate that our method outperforms state-of-the-art methods in terms of +rendering quality and efficiency. Our project page is at +https://zju3dv.github.io/lidar-rt. + +
+
+ comment: Project page: https://zju3dv.github.io/lidar-rt +
+
+
+
+
+ + ☆ Preventing Local Pitfalls in Vector Quantization via Optimal Transport + + +
+ Vector-quantized networks (VQNs) have exhibited remarkable performance across +various tasks, yet they are prone to training instability, which complicates +the training process due to the necessity for techniques such as subtle +initialization and model distillation. In this study, we identify the local +minima issue as the primary cause of this instability. To address this, we +integrate an optimal transport method in place of the nearest neighbor search +to achieve a more globally informed assignment. We introduce OptVQ, a novel +vector quantization method that employs the Sinkhorn algorithm to optimize the +optimal transport problem, thereby enhancing the stability and efficiency of +the training process. To mitigate the influence of diverse data distributions +on the Sinkhorn algorithm, we implement a straightforward yet effective +normalization strategy. Our comprehensive experiments on image reconstruction +tasks demonstrate that OptVQ achieves 100% codebook utilization and surpasses +current state-of-the-art VQNs in reconstruction quality. + +
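Editor's note: a minimal, assumption-laden sketch of the idea described in this abstract, replacing nearest-neighbour code assignment with a balanced, Sinkhorn-style optimal-transport assignment. This is not the released OptVQ code (see the linked repository); the cost, marginals, and iteration count are illustrative.

```python
import torch

@torch.no_grad()
def sinkhorn_assignment(z, codebook, eps=0.05, n_iters=3):
    """Balanced soft assignment of encoder outputs z (N, D) to codebook entries
    (K, D) via entropic optimal transport, instead of a plain argmin over distances."""
    cost = torch.cdist(z, codebook).pow(2)     # (N, K) squared L2 cost
    q = torch.exp(-cost / eps).t()             # (K, N) Gibbs kernel
    q = q / q.sum()                            # total transported mass = 1
    n_codes, n_samples = q.shape
    for _ in range(n_iters):                   # alternate row/column scaling
        q = q / q.sum(dim=1, keepdim=True) / n_codes    # each code receives mass 1/K
        q = q / q.sum(dim=0, keepdim=True) / n_samples  # each sample carries mass 1/N
    codes = q.t().argmax(dim=1)                # hard indices for the decoder
    return codes, (q * n_samples).t()          # indices and per-sample soft assignment
```

In a VQ training loop these indices would replace the nearest-neighbour lookup; enforcing (approximately) uniform code marginals is one way a globally informed assignment can push towards full codebook utilization.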
+
+ comment: Code is available at https://github.com/zbr17/OptVQ +
+
+
+
+
+ + ☆ AV-Link: Temporally-Aligned Diffusion Features for Cross-Modal + Audio-Video Generation + + +
+ We propose AV-Link, a unified framework for Video-to-Audio and Audio-to-Video +generation that leverages the activations of frozen video and audio diffusion +models for temporally-aligned cross-modal conditioning. The key to our +framework is a Fusion Block that enables bidirectional information exchange +between our backbone video and audio diffusion models through a +temporally-aligned self attention operation. Unlike prior work that uses +feature extractors pretrained for other tasks for the conditioning signal, +AV-Link can directly leverage features obtained by the complementary modality +in a single framework i.e. video features to generate audio, or audio features +to generate video. We extensively evaluate our design choices and demonstrate +the ability of our method to achieve synchronized and high-quality audiovisual +content, showcasing its potential for applications in immersive media +generation. Project Page: snap-research.github.io/AVLink/ + +
+
+ comment: Project Page: snap-research.github.io/AVLink/ +
+
+
+
+
+ + ☆ EarthDial: Turning Multi-sensory Earth Observations to Interactive + Dialogues + + +
+ Automated analysis of vast Earth observation data via interactive +Vision-Language Models (VLMs) can unlock new opportunities for environmental +monitoring, disaster response, and resource management. Existing generic VLMs +do not perform well on Remote Sensing data, while the recent Geo-spatial VLMs +remain restricted to a fixed resolution and few sensor modalities. In this +paper, we introduce EarthDial, a conversational assistant specifically designed +for Earth Observation (EO) data, transforming complex, multi-sensory Earth +observations into interactive, natural language dialogues. EarthDial supports +multi-spectral, multi-temporal, and multi-resolution imagery, enabling a wide +range of remote sensing tasks, including classification, detection, captioning, +question answering, visual reasoning, and visual grounding. To achieve this, we +introduce an extensive instruction tuning dataset comprising over 11.11M +instruction pairs covering RGB, Synthetic Aperture Radar (SAR), and +multispectral modalities such as Near-Infrared (NIR) and infrared. Furthermore, +EarthDial handles bi-temporal and multi-temporal sequence analysis for +applications like change detection. Our extensive experimental results on 37 +downstream applications demonstrate that EarthDial outperforms existing generic +and domain-specific models, achieving better generalization across various EO +tasks. + +
+
+
+
+
+ + ☆ LlamaFusion: Adapting Pretrained Language Models for Multimodal + Generation + + +
+ We present LlamaFusion, a framework for empowering pretrained text-only large +language models (LLMs) with multimodal generative capabilities, enabling them +to understand and generate both text and images in arbitrary sequences. +LlamaFusion leverages existing Llama-3's weights for processing texts +autoregressively while introducing additional and parallel transformer modules +for processing images with diffusion. During training, the data from each +modality is routed to its dedicated modules: modality-specific feedforward +layers, query-key-value projections, and normalization layers process each +modality independently, while the shared self-attention layers allow +interactions across text and image features. By freezing the text-specific +modules and only training the image-specific modules, LlamaFusion preserves the +language capabilities of text-only LLMs while developing strong visual +understanding and generation abilities. Compared to methods that pretrain +multimodal generative models from scratch, our experiments demonstrate that +LlamaFusion improves image understanding by 20% and image generation by 3.6% +using only 50% of the FLOPs while maintaining Llama-3's language capabilities. +We also demonstrate that this framework can adapt existing vision-language +models with multimodal generation ability. Overall, this framework not only +leverages existing computational investments in text-only LLMs but also enables +the parallel development of language and vision capabilities, presenting a +promising direction for efficient multimodal model development. + +
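Editor's note: to make the routing idea in this abstract concrete, here is a hedged sketch (not the released LlamaFusion code) of a transformer block with shared self-attention but modality-specific norms and feed-forward paths; the abstract also mentions modality-specific query-key-value projections, which this simplified sketch omits, and all sizes are illustrative.

```python
import torch
import torch.nn as nn

class ModalityRoutedBlock(nn.Module):
    """Shared self-attention; modality-specific LayerNorm and feed-forward paths."""
    def __init__(self, dim=512, heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)  # shared across modalities
        self.norm_txt, self.norm_img = nn.LayerNorm(dim), nn.LayerNorm(dim)
        self.ffn_txt = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))
        self.ffn_img = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, x, is_image):
        # x: (B, L, D) mixed sequence, is_image: (B, L) bool mask marking image tokens
        h, _ = self.attn(x, x, x)              # text and image tokens attend to each other
        x = x + h
        out = torch.empty_like(x)
        txt, img = ~is_image, is_image         # route each token to its own norm/FFN
        out[txt] = x[txt] + self.ffn_txt(self.norm_txt(x[txt]))
        out[img] = x[img] + self.ffn_img(self.norm_img(x[img]))
        return out
```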
+
+
+
+
+ + ☆ Tiled Diffusion + + +
+ Image tiling -- the seamless connection of disparate images to create a +coherent visual field -- is crucial for applications such as texture creation, +video game asset development, and digital art. Traditionally, tiles have been +constructed manually, a method that poses significant limitations in +scalability and flexibility. Recent research has attempted to automate this +process using generative models. However, current approaches primarily focus on +tiling textures and manipulating models for single-image generation, without +inherently supporting the creation of multiple interconnected tiles across +diverse domains. This paper presents Tiled Diffusion, a novel approach that +extends the capabilities of diffusion models to accommodate the generation of +cohesive tiling patterns across various domains of image synthesis that require +tiling. Our method supports a wide range of tiling scenarios, from self-tiling +to complex many-to-many connections, enabling seamless integration of multiple +images. Tiled Diffusion automates the tiling process, eliminating the need for +manual intervention and enhancing creative possibilities in various +applications, such as seamless tiling of existing images, tiled texture +creation, and 360° synthesis. + +
+
+
+
+
+ + ☆ SqueezeMe: Efficient Gaussian Avatars for VR + + +
+ Gaussian Splatting has enabled real-time 3D human avatars with unprecedented +levels of visual quality. While previous methods require a desktop GPU for +real-time inference of a single avatar, we aim to squeeze multiple Gaussian +avatars onto a portable virtual reality headset with real-time drivable +inference. We begin by training a previous work, Animatable Gaussians, on a +high quality dataset captured with 512 cameras. The Gaussians are animated by +controlling base set of Gaussians with linear blend skinning (LBS) motion and +then further adjusting the Gaussians with a neural network decoder to correct +their appearance. When deploying the model on a Meta Quest 3 VR headset, we +find two major computational bottlenecks: the decoder and the rendering. To +accelerate the decoder, we train the Gaussians in UV-space instead of +pixel-space, and we distill the decoder to a single neural network layer. +Further, we discover that neighborhoods of Gaussians can share a single +corrective from the decoder, which provides an additional speedup. To +accelerate the rendering, we develop a custom pipeline in Vulkan that runs on +the mobile GPU. Putting it all together, we run 3 Gaussian avatars concurrently +at 72 FPS on a VR headset. Demo videos are at +https://forresti.github.io/squeezeme. + +
+
+ comment: Initial version +
+
+
+
+
+ + ☆ OnlineVPO: Align Video Diffusion Model with Online Video-Centric + Preference Optimization + + +
+ In recent years, the field of text-to-video (T2V) generation has made +significant strides. Despite this progress, there is still a gap between +theoretical advancements and practical application, amplified by issues like +degraded image quality and flickering artifacts. Recent advancements in +enhancing the video diffusion model (VDM) through feedback learning have shown +promising results. However, these methods still exhibit notable limitations, +such as misaligned feedback and inferior scalability. To tackle these issues, +we introduce OnlineVPO, a more efficient preference learning approach tailored +specifically for video diffusion models. Our method features two novel designs. +First, instead of directly using image-based reward feedback, we leverage a +video quality assessment (VQA) model trained on synthetic data as the reward +model to provide distribution and modality-aligned feedback on the video +diffusion model. Additionally, we introduce an online DPO algorithm to address +the off-policy optimization and scalability issues in existing video preference +learning frameworks. By employing the video reward model to offer concise video +feedback on the fly, OnlineVPO offers effective and efficient preference +guidance. Extensive experiments on the open-source video-diffusion model +demonstrate OnlineVPO as a simple yet effective and more importantly scalable +preference learning algorithm for video diffusion models, offering valuable +insights for future advancements in this domain. + +
+
+
+
+
+ + ☆ Prompt-A-Video: Prompt Your Video Diffusion Model via Preference-Aligned + LLM + + +
+ Text-to-video models have made remarkable advancements through optimization +on high-quality text-video pairs, where the textual prompts play a pivotal role +in determining the quality of output videos. However, achieving the desired output +often entails multiple revisions and iterative inference to refine +user-provided prompts. Current automatic methods for refining prompts encounter +challenges such as Modality-Inconsistency, Cost-Discrepancy, and Model-Unaware +when applied to text-to-video diffusion models. To address these problems, we +introduce an LLM-based prompt adaptation framework, termed Prompt-A-Video, +which excels in crafting Video-Centric, Labor-Free and Preference-Aligned +prompts tailored to a specific video diffusion model. Our approach involves a +meticulously crafted two-stage optimization and alignment system. Initially, we +conduct a reward-guided prompt evolution pipeline to automatically create an +optimal prompt pool and leverage it for supervised fine-tuning (SFT) of the +LLM. Then multi-dimensional rewards are employed to generate pairwise data for +the SFT model, followed by the direct preference optimization (DPO) algorithm +to further facilitate preference alignment. Through extensive experimentation +and comparative analyses, we validate the effectiveness of Prompt-A-Video +across diverse generation models, highlighting its potential to push the +boundaries of video generation. + +
+
+
+
+
+ + ☆ Leveraging Color Channel Independence for Improved Unsupervised Object + Detection + + +
+ Object-centric architectures can learn to extract distinct object +representations from visual scenes, enabling downstream applications on the +object level. Similarly to autoencoder-based image models, object-centric +approaches have been trained on the unsupervised reconstruction loss of images +encoded by RGB color spaces. In our work, we challenge the common assumption +that RGB images are the optimal color space for unsupervised learning in +computer vision. We discuss conceptually and empirically that other color +spaces, such as HSV, bear essential characteristics for object-centric +representation learning, like robustness to lighting conditions. We further +show that models improve when requiring them to predict additional color +channels. Specifically, we propose to transform the predicted targets to the +RGB-S space, which extends RGB with HSV's saturation component and leads to +markedly better reconstruction and disentanglement for five common evaluation +datasets. The use of composite color spaces can be implemented with basically +no computational overhead, is agnostic of the models' architecture, and is +universally applicable across a wide range of visual computing tasks and +training types. The findings of our approach encourage additional +investigations in computer vision tasks beyond object-centric learning. + +
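Editor's note: as a hedged illustration of the RGB-S target transform this abstract describes (the paper's exact definition may differ), the sketch below appends HSV's saturation channel to an RGB reconstruction target.

```python
import torch

def rgb_to_rgbs(img: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """Append HSV saturation to an RGB image: (..., 3, H, W) in [0, 1] -> (..., 4, H, W)."""
    cmax = img.max(dim=-3, keepdim=True).values
    cmin = img.min(dim=-3, keepdim=True).values
    saturation = (cmax - cmin) / (cmax + eps)   # standard HSV saturation, ~0 for black pixels
    return torch.cat([img, saturation], dim=-3) # RGB-S reconstruction target
```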
+
+ comment: 38 pages incl. references, 16 figures +
+
+
+
+
+ + ☆ Jet: A Modern Transformer-Based Normalizing Flow + + +
+ In the past, normalizing generative flows have emerged as a promising class +of generative models for natural images. This type of model has many modeling +advantages: the ability to efficiently compute log-likelihood of the input +data, fast generation and simple overall structure. Normalizing flows remained +a topic of active research but later fell out of favor, as visual quality of +the samples was not competitive with other model classes, such as GANs, +VQ-VAE-based approaches or diffusion models. In this paper we revisit the +design of the coupling-based normalizing flow models by carefully ablating +prior design choices and using computational blocks based on the Vision +Transformer architecture, not convolutional neural networks. As a result, we +achieve state-of-the-art quantitative and qualitative performance with a much +simpler architecture. While the overall visual quality is still behind the +current state-of-the-art models, we argue that strong normalizing flow models +can help advance the research frontier by serving as building components of more +powerful generative models. + +
+
+
+
+
+ + ☆ Parallelized Autoregressive Visual Generation + + +
+ Autoregressive models have emerged as a powerful approach for visual +generation but suffer from slow inference speed due to their sequential +token-by-token prediction process. In this paper, we propose a simple yet +effective approach for parallelized autoregressive visual generation that +improves generation efficiency while preserving the advantages of +autoregressive modeling. Our key insight is that parallel generation depends on +visual token dependencies: tokens with weak dependencies can be generated in +parallel, while strongly dependent adjacent tokens are difficult to generate +together, as their independent sampling may lead to inconsistencies. Based on +this observation, we develop a parallel generation strategy that generates +distant tokens with weak dependencies in parallel while maintaining sequential +generation for strongly dependent local tokens. Our approach can be seamlessly +integrated into standard autoregressive models without modifying the +architecture or tokenizer. Experiments on ImageNet and UCF-101 demonstrate that +our method achieves a 3.6x speedup with comparable quality and up to 9.5x +speedup with minimal quality degradation across both image and video generation +tasks. We hope this work will inspire future research in efficient visual +generation and unified autoregressive modeling. Project page: +https://epiphqny.github.io/PAR-project. + +
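Editor's note: a conceptual sketch, under stated assumptions, of one way to realise the strategy this abstract describes: grid positions are grouped so that tokens emitted together are spatially far apart, each group is sampled in parallel, and groups remain sequential. The `model` interface is hypothetical and the grouping scheme is illustrative, not the paper's exact ordering.

```python
import torch

def parallel_generation_order(h, w, region=4):
    """Group flat positions of an (h, w) token grid so that every group contains
    one position per (region x region) block: tokens in the same group are far
    apart spatially and can be sampled in parallel; groups stay sequential."""
    groups = [[] for _ in range(region * region)]
    for y in range(h):
        for x in range(w):
            phase = (y % region) * region + (x % region)
            groups[phase].append(y * w + x)
    return groups

def generate(model, h=16, w=16, device="cpu"):
    """Hypothetical sampling loop: `model(seq)` is assumed to return logits of
    shape (1, h*w, vocab) given the partially filled token sequence."""
    seq = torch.full((1, h * w), -1, dtype=torch.long, device=device)
    for group in parallel_generation_order(h, w):
        logits = model(seq)
        probs = logits[0, group].softmax(dim=-1)                 # (len(group), vocab)
        seq[0, group] = torch.multinomial(probs, 1).squeeze(-1)  # sample the whole group at once
    return seq.view(h, w)
```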
+
+ comment: Project page: https://epiphqny.github.io/PAR-project +
+
+
+
+
+ + ☆ Knowing Where to Focus: Attention-Guided Alignment for Text-based Person + Search + + +
+ In the realm of Text-Based Person Search (TBPS), mainstream methods aim to +explore more efficient interaction frameworks between text descriptions and +visual data. However, recent approaches encounter two principal challenges. +Firstly, the widely used random-based Masked Language Modeling (MLM) considers +all the words in the text equally during training. However, when the many +semantically vacuous words ('with', 'the', etc.) are masked, they fail to contribute +efficient interaction in the cross-modal MLM and hamper the representation +alignment. Secondly, manual descriptions in TBPS datasets are tedious and +inevitably contain several inaccuracies. To address these issues, we introduce +an Attention-Guided Alignment (AGA) framework featuring two innovative +components: Attention-Guided Mask (AGM) Modeling and Text Enrichment Module +(TEM). AGM dynamically masks semantically meaningful words by aggregating the +attention weight derived from the text encoding process, so that the cross-modal +MLM can capture information related to the masked word from text context and +images and align their representations. Meanwhile, TEM alleviates low-quality +representations caused by repetitive and erroneous text descriptions by +replacing those semantically meaningful words with MLM's prediction. It not +only enriches text descriptions but also prevents overfitting. Extensive +experiments across three challenging benchmarks demonstrate the effectiveness +of our AGA, achieving new state-of-the-art results with Rank-1 accuracy +reaching 78.36%, 67.31%, and 67.4% on CUHK-PEDES, ICFG-PEDES, and RSTPReid, +respectively. + +
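Editor's note: a small hedged sketch of the attention-guided masking idea above (not the authors' code): instead of masking random tokens, mask the positions with the highest aggregated attention weight. `attn_weights` is assumed to be a per-token score already aggregated over heads and layers; all parameter names are illustrative.

```python
import torch

def attention_guided_mask(token_ids, attn_weights, mask_token_id, ratio=0.15,
                          special_tokens_mask=None):
    """Mask the most-attended tokens instead of random ones.
    token_ids, attn_weights: 1D tensors of length L."""
    scores = attn_weights.clone()
    if special_tokens_mask is not None:          # never mask [CLS]/[SEP]/padding
        scores[special_tokens_mask] = float("-inf")
    k = max(1, int(ratio * token_ids.numel()))
    top = scores.topk(k).indices                 # most semantically loaded positions
    masked_ids = token_ids.clone()
    masked_ids[top] = mask_token_id
    return masked_ids, top
```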
+
+
+
+
+ + ☆ A Full Transformer-based Framework for Automatic Pain Estimation using + Videos + + +
+ The automatic estimation of pain is essential in designing an optimal pain +management system offering reliable assessment and reducing the suffering of +patients. In this study, we present a novel full transformer-based framework +consisting of a Transformer in Transformer (TNT) model and a Transformer +leveraging cross-attention and self-attention blocks. Elaborating on videos +from the BioVid database, we demonstrate state-of-the-art performances, showing +the efficacy, efficiency, and generalization capability across all the primary +pain estimation tasks. + +
+
+
+
+
+ + ☆ Till the Layers Collapse: Compressing a Deep Neural Network through the + Lenses of Batch Normalization Layers AAAI 2025 + + +
+ Today, deep neural networks are widely used since they can handle a variety +of complex tasks. Their generality makes them very powerful tools in modern +technology. However, deep neural networks are often overparameterized. The +usage of these large models consumes a lot of computation resources. In this +paper, we introduce a method called \textbf{T}ill the \textbf{L}ayers +\textbf{C}ollapse (TLC), which compresses deep neural networks through the +lenses of batch normalization layers. By reducing the depth of these networks, +our method decreases deep neural networks' computational requirements and +overall latency. We validate our method on popular models such as Swin-T, +MobileNet-V2, and RoBERTa, across both image classification and natural +language processing (NLP) tasks. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ☆ MultiverSeg: Scalable Interactive Segmentation of Biomedical Imaging + Datasets with In-Context Guidance + + +
+ Medical researchers and clinicians often need to perform novel segmentation +tasks on a set of related images. Existing methods for segmenting a new dataset +are either interactive, requiring substantial human effort for each image, or +require an existing set of manually labeled images. We introduce a system, +MultiverSeg, that enables practitioners to rapidly segment an entire new +dataset without requiring access to any existing labeled data from that task or +domain. Along with the image to segment, the model takes user interactions such +as clicks, bounding boxes or scribbles as input, and predicts a segmentation. +As the user segments more images, those images and segmentations become +additional inputs to the model, providing context. As the context set of +labeled images grows, the number of interactions required to segment each new +image decreases. We demonstrate that MultiverSeg enables users to interactively +segment new datasets efficiently, by amortizing the number of interactions per +image to achieve an accurate segmentation. Compared to using a state-of-the-art +interactive segmentation method, using MultiverSeg reduced the total number of +scribble steps by 53% and clicks by 36% to achieve 90% Dice on sets of images +from unseen tasks. We release code and model weights at +https://multiverseg.csail.mit.edu + +
+
+ comment: Project Website: https://multiverseg.csail.mit.edu Keywords: + interactive segmentation, in-context learning, medical image analysis, + biomedical imaging, image annotation, visual prompting +
+
+
+
+
+ + ☆ GIRAFE: Glottal Imaging Dataset for Advanced Segmentation, Analysis, and + Facilitative Playbacks Evaluation + + +
+ The advances in the development of Facilitative Playbacks extracted from +High-Speed videoendoscopic sequences of the vocal folds are hindered by a +notable lack of publicly available datasets annotated with the semantic +segmentations corresponding to the area of the glottal gap. This fact also +limits the reproducibility and further exploration of existing research in this +field. + To address this gap, GIRAFE is a data repository designed to facilitate the +development of advanced techniques for the semantic segmentation, analysis, and +fast evaluation of High-Speed videoendoscopic sequences of the vocal folds. The +repository includes 65 high-speed videoendoscopic recordings from a cohort of +50 patients (30 female, 20 male). The dataset comprises 15 recordings from +healthy controls, 26 from patients with diagnosed voice disorders, and 24 with +an unknown health condition. All of them were manually annotated by an expert, +including the masks corresponding to the semantic segmentation of the glottal +gap. The repository is also complemented with the automatic segmentation of the +glottal area using different state-of-the-art approaches. + This data set has already supported several studies, which demonstrates its +usefulness for the development of new glottal gap segmentation algorithms from +High-Speed-Videoendoscopic sequences to improve or create new Facilitative +Playbacks. Despite these advances and others in the field, the broader +challenge of performing an accurate and completely automatic semantic +segmentation method of the glottal area remains open. + +
+
+ comment: 18 pages, 8 figures +
+
+
+
+
+ + ☆ Uni-Renderer: Unifying Rendering and Inverse Rendering Via Dual Stream + Diffusion + + +
+ Rendering and inverse rendering are pivotal tasks in both computer vision and +graphics. The rendering equation is the core of the two tasks, as an ideal +conditional distribution transfer function from intrinsic properties to RGB +images. Although existing rendering methods achieve promising results, they +merely approximate the ideal estimation for a specific scene and come with a +high computational cost. Additionally, the inverse conditional distribution +transfer is intractable due to the inherent ambiguity. To address these +challenges, we propose a data-driven method that jointly models rendering and +inverse rendering as two conditional generation tasks within a single diffusion +framework. Inspired by UniDiffuser, we utilize two distinct time schedules to +model both tasks, and with a tailored dual streaming module, we achieve +cross-conditioning of two pre-trained diffusion models. This unified approach, +named Uni-Renderer, allows the two processes to facilitate each other through a +cycle-consistent constraint, mitigating ambiguity by enforcing consistency +between intrinsic properties and rendered images. Combined with a meticulously +prepared dataset, our method effectively decomposes intrinsic properties +and demonstrates a strong capability to recognize changes during rendering. We +will open-source our training and inference code to the public, fostering +further research and development in this area. + +
+
+
+
+
+ + ☆ DCTdiff: Intriguing Properties of Image Generative Modeling in the DCT + Space + + +
+ This paper explores image modeling from the frequency space and introduces +DCTdiff, an end-to-end diffusion generative paradigm that efficiently models +images in the discrete cosine transform (DCT) space. We investigate the design +space of DCTdiff and reveal the key design factors. Experiments on different +frameworks (UViT, DiT), generation tasks, and various diffusion samplers +demonstrate that DCTdiff outperforms pixel-based diffusion models regarding +generative quality and training efficiency. Remarkably, DCTdiff can seamlessly +scale up to high-resolution generation without using the latent diffusion +paradigm. Finally, we illustrate several intriguing properties of DCT image +modeling. For example, we provide a theoretical proof of why `image diffusion +can be seen as spectral autoregression', bridging the gap between diffusion and +autoregressive models. The effectiveness of DCTdiff and the introduced +properties suggest a promising direction for image modeling in the frequency +space. The code is at \url{https://github.com/forever208/DCTdiff}. + +
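Editor's note: for readers unfamiliar with modeling images in the DCT domain, the following minimal sketch (illustrative only, not DCTdiff's pipeline) converts an image to and from the 2D type-II DCT coefficients on which such a frequency-space diffusion model could operate.

```python
import numpy as np
from scipy.fft import dctn, idctn

def to_dct(img: np.ndarray) -> np.ndarray:
    """Per-channel 2D type-II DCT of an (H, W, C) image in [0, 1]."""
    return np.stack([dctn(img[..., c], norm="ortho") for c in range(img.shape[-1])], axis=-1)

def from_dct(coeffs: np.ndarray) -> np.ndarray:
    """Inverse DCT back to pixel space (exact up to floating-point error)."""
    return np.stack([idctn(coeffs[..., c], norm="ortho") for c in range(coeffs.shape[-1])], axis=-1)
```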
+
+ comment: 23 pages +
+
+
+
+
+ + ☆ Stable-V2A: Synthesis of Synchronized Sound Effects with Temporal and + Semantic Controls + + +
+ Sound designers and Foley artists usually sonorize a scene, such as from a +movie or video game, by manually annotating and sonorizing each action of +interest in the video. In our case, the intent is to leave full creative +control to sound designers with a tool that allows them to bypass the more +repetitive parts of their work, thus being able to focus on the creative +aspects of sound production. We achieve this by presenting Stable-V2A, a two-stage +model consisting of: an RMS-Mapper that estimates an envelope representative of +the audio characteristics associated with the input video; and Stable-Foley, a +diffusion model based on Stable Audio Open that generates audio semantically +and temporally aligned with the target video. Temporal alignment is guaranteed +by the use of the envelope as a ControlNet input, while semantic alignment is +achieved through the use of sound representations chosen by the designer as +cross-attention conditioning of the diffusion process. We train and test our +model on Greatest Hits, a dataset commonly used to evaluate V2A models. In +addition, to test our model on a case study of interest, we introduce Walking +The Maps, a dataset of videos extracted from video games depicting animated +characters walking in different locations. Samples and code are available on our +demo page at https://ispamm.github.io/Stable-V2A. + +
+
+
+
+
+ + ☆ Robust Federated Learning in the Face of Covariate Shift: A Magnitude + Pruning with Hybrid Regularization Framework for Enhanced Model Aggregation + + +
+ The development of highly sophisticated neural networks has allowed for fast +progress in every field of computer vision; however, applications where +annotated data is prohibited due to privacy or security concerns remain +challenging. Federated Learning (FL) offers a promising framework for +individuals aiming to collaboratively develop a shared model while preserving +data privacy. Nevertheless, our findings reveal that variations in data +distribution among clients can profoundly affect FL methodologies, primarily +due to instabilities in the aggregation process. We also propose a novel FL +framework to mitigate the adverse effects of covariate shifts among federated +clients by combining individual parameter pruning and regularization techniques +to improve the robustness of individual clients' models before aggregation. Each +client's model is optimized through magnitude-based pruning and the addition of +dropout and noise injection layers to build more resilient decision pathways in +the networks and improve the robustness of the model's parameter aggregation +step. The proposed framework is capable of extracting robust representations +even in the presence of very large covariate shifts among client data +distributions and in the federation of a small number of clients. Empirical +findings substantiate the effectiveness of our proposed methodology across +common benchmark datasets, including CIFAR10, MNIST, SVHN, and Fashion MNIST. +Furthermore, we introduce the CelebA-Gender dataset, specifically designed to +evaluate performance on a more realistic domain. The proposed method is capable +of extracting robust representations even in the presence of both high and low +covariate shifts among client data distributions. + +
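Editor's note: a minimal sketch of the magnitude-based pruning component mentioned in this abstract, under assumptions; the sparsity level, the layer types pruned, and the omission of the dropout/noise-injection layers and of the aggregation rule are all illustrative choices, not the paper's exact recipe.

```python
import torch
import torch.nn as nn

@torch.no_grad()
def magnitude_prune_(model: nn.Module, sparsity: float = 0.3):
    """Zero out the smallest-magnitude weights of every Linear/Conv2d layer in place,
    one simple way to make a client's update sparser and more robust before aggregation."""
    for m in model.modules():
        if isinstance(m, (nn.Linear, nn.Conv2d)):
            w = m.weight.data
            k = int(sparsity * w.numel())
            if k == 0:
                continue
            thresh = w.abs().flatten().kthvalue(k).values   # k-th smallest magnitude
            w.mul_((w.abs() > thresh).float())              # keep only the larger weights
```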
+
+
+
+
+ + ☆ Stitch, Contrast, and Segment: Learning a Human Action Segmentation Model + Using Trimmed Skeleton Videos AAAI 2025 + + +
+ Existing skeleton-based human action classification models rely on +well-trimmed action-specific skeleton videos for both training and testing, +precluding their scalability to real-world applications where untrimmed videos +exhibiting concatenated actions are predominant. To overcome this limitation, +recently introduced skeleton action segmentation models incorporate untrimmed +skeleton videos into end-to-end training. The model is optimized to provide +frame-wise predictions for any length of testing videos, simultaneously +realizing action localization and classification. Yet, achieving such an +improvement requires frame-wise annotated skeleton videos, which remain +time-consuming to produce in practice. This paper features a novel framework for +skeleton-based action segmentation trained on short trimmed skeleton videos, +but that can run on longer untrimmed videos. The approach is implemented in +three steps: Stitch, Contrast, and Segment. First, Stitch proposes a temporal +skeleton stitching scheme that treats trimmed skeleton videos as elementary +human motions that compose a semantic space and can be sampled to generate +multi-action stitched sequences. Contrast learns contrastive representations +from stitched sequences with a novel discrimination pretext task that enables a +skeleton encoder to learn meaningful action-temporal contexts to improve action +segmentation. Finally, Segment relates the proposed method to action +segmentation by learning a segmentation layer while handling particular data +availability. Experiments involve a trimmed source dataset and an untrimmed +target dataset in an adaptation formulation for real-world skeleton-based human +action segmentation to evaluate the effectiveness of the proposed method. + +
+
+ comment: Accepted as AAAI 2025 +
+
+
+
+
+ + ☆ Arti-PG: A Toolbox for Procedurally Synthesizing Large-Scale and Diverse + Articulated Objects with Rich Annotations + + +
+ The acquisition of substantial volumes of 3D articulated object data is +expensive and time-consuming, and consequently the scarcity of 3D articulated +object data becomes an obstacle for deep learning methods to achieve remarkable +performance in various articulated object understanding tasks. Meanwhile, +pairing these object data with detailed annotations to enable training for +various tasks is also difficult and labor-intensive to achieve. In order to +expeditiously gather a significant number of 3D articulated objects with +comprehensive and detailed annotations for training, we propose Articulated +Object Procedural Generation toolbox, a.k.a. Arti-PG toolbox. Arti-PG toolbox +consists of i) descriptions of articulated objects by means of a generalized +structure program along with their analytic correspondence to the objects' +point cloud, ii) procedural rules about manipulations on the structure program +to synthesize large-scale and diverse new articulated objects, and iii) +mathematical descriptions of knowledge (e.g. affordance, semantics, etc.) to +provide annotations to the synthesized object. Arti-PG has two appealing +properties for providing training data for articulated object understanding +tasks: i) objects are created with unlimited variations in shape through +program-oriented structure manipulation, ii) Arti-PG is widely applicable to +diverse tasks by easily providing comprehensive and detailed annotations. +Arti-PG now supports the procedural generation of 26 categories of articulated +objects and provides annotations across a wide range of both vision and +manipulation tasks, and we provide exhaustive experiments which fully +demonstrate its advantages. We will make Arti-PG toolbox publicly available for +the community to use. + +
+
+
+
+
+ + ☆ PhotoHolmes: a Python library for forgery detection in digital images + + +
+ In this paper, we introduce PhotoHolmes, an open-source Python library +designed to easily run and benchmark forgery detection methods on digital +images. The library includes implementations of popular and state-of-the-art +methods, dataset integration tools, and evaluation metrics. Utilizing the +Benchmark tool in PhotoHolmes, users can effortlessly compare various methods. +This facilitates an accurate and reproducible comparison between their own +methods and those in the existing literature. Furthermore, PhotoHolmes includes +a command-line interface (CLI) to easily run the methods implemented in the +library on any suspicious image. As such, image forgery methods become more +accessible to the community. The library has been built with extensibility and +modularity in mind, which makes adding new methods, datasets and metrics to the +library a straightforward process. The source code is available at +https://github.com/photoholmes/photoholmes. + +
+
+
+
+
+ + ☆ Movie2Story: A framework for understanding videos and telling stories in + the form of novel text + + +
+ Multimodal video-to-text models have made considerable progress, primarily in +generating brief descriptions of video content. However, there is still a +deficiency in generating rich long-form text descriptions that integrate both +video and audio. In this paper, we introduce a framework called M2S, designed +to generate novel-length text by combining audio, video, and character +recognition. M2S includes modules for video long-form text description and +comprehension, audio-based analysis of emotion, speech rate, and character +alignment, and visual-based character recognition alignment. By integrating +multimodal information using the large language model GPT4o, M2S stands out in +the field of multimodal text generation. We demonstrate the effectiveness and +accuracy of M2S through comparative experiments and human evaluation. +Additionally, the model framework has good scalability and significant +potential for future research. + +
+
+
+
+
+ + ☆ IDOL: Instant Photorealistic 3D Human Creation from a Single Image + + +
+ Creating a high-fidelity, animatable 3D full-body avatar from a single image +is a challenging task due to the diverse appearance and poses of humans and the +limited availability of high-quality training data. To achieve fast and +high-quality human reconstruction, this work rethinks the task from the +perspectives of dataset, model, and representation. First, we introduce a +large-scale HUman-centric GEnerated dataset, HuGe100K, consisting of 100K +diverse, photorealistic sets of human images. Each set contains 24-view frames +in specific human poses, generated using a pose-controllable +image-to-multi-view model. Next, leveraging the diversity in views, poses, and +appearances within HuGe100K, we develop a scalable feed-forward transformer +model to predict a 3D human Gaussian representation in a uniform space from a +given human image. This model is trained to disentangle human pose, body shape, +clothing geometry, and texture. The estimated Gaussians can be animated without +post-processing. We conduct comprehensive experiments to validate the +effectiveness of the proposed dataset and method. Our model demonstrates the +ability to efficiently reconstruct photorealistic humans at 1K resolution from +a single input image using a single GPU instantly. Additionally, it seamlessly +supports various applications, as well as shape and texture editing tasks. + +
+
+ comment: 21 pages, 15 figures, includes main content, supplementary materials, + and references +
+
+
+
+
+ + ☆ TDCNet: Transparent Objects Depth Completion with CNN-Transformer + Dual-Branch Parallel Network + + +
+ The sensing and manipulation of transparent objects present a critical +challenge in industrial and laboratory robotics. Conventional sensors face +challenges in obtaining the full depth of transparent objects due to the +refraction and reflection of light on their surfaces and their lack of visible +texture. Previous research has attempted to obtain complete depth maps of +transparent objects from RGB and damaged depth maps (collected by a depth sensor) +using deep learning models. However, existing methods fail to fully utilize the +original depth map, resulting in limited accuracy for depth completion. To solve +this problem, we propose TDCNet, a novel dual-branch CNN-Transformer parallel +network for transparent object depth completion. The proposed framework +consists of two different branches: one extracts features from partial depth +maps, while the other processes RGB-D images. Experimental results demonstrate +that our model achieves state-of-the-art performance across multiple public +datasets. Our code and the pre-trained model are publicly available at +https://github.com/XianghuiFan/TDCNet. + +
+
+
+
+
+ + ☆ Dream to Manipulate: Compositional World Models Empowering Robot + Imitation Learning with Imagination + + +
+ A world model provides an agent with a representation of its environment, +enabling it to predict the causal consequences of its actions. Current world +models typically cannot directly and explicitly imitate the actual environment +in front of a robot, often resulting in unrealistic behaviors and +hallucinations that make them unsuitable for real-world applications. In this +paper, we introduce a new paradigm for constructing world models that are +explicit representations of the real world and its dynamics. By integrating +cutting-edge advances in real-time photorealism with Gaussian Splatting and +physics simulators, we propose the first compositional manipulation world +model, which we call DreMa. DreMa replicates the observed world and its +dynamics, allowing it to imagine novel configurations of objects and predict +the future consequences of robot actions. We leverage this capability to +generate new data for imitation learning by applying equivariant +transformations to a small set of demonstrations. Our evaluations across +various settings demonstrate significant improvements in both accuracy and +robustness by incrementing actions and object distributions, reducing the data +needed to learn a policy and improving the generalization of the agents. As a +highlight, we show that a real Franka Emika Panda robot, powered by DreMa's +imagination, can successfully learn novel physical tasks from just a single +example per task variation (one-shot policy learning). Our project page and +source code can be found in https://leobarcellona.github.io/DreamToManipulate/ + +
+
+
+
+
+ + ☆ Corn Ear Detection and Orientation Estimation Using Deep Learning + + +
+ Monitoring growth behavior of maize plants such as the development of ears +can give key insights into the plant's health and development. Traditionally, +the measurement of the angle of ears is performed manually, which can be +time-consuming and prone to human error. To address these challenges, this +paper presents a computer vision-based system for detecting and tracking ears +of corn in an image sequence. The proposed system could accurately detect, +track, and predict the ear's orientation, which can be useful in monitoring +their growth behavior. This can significantly save time compared to manual +measurement and enables additional areas of ear orientation research and +potential increase in efficiencies for maize production. Using an object +detector with keypoint detection, the algorithm proposed could detect 90 +percent of all ears. The cardinal estimation had a mean absolute error (MAE) of +18 degrees, compared to a mean 15 degree difference between two people +measuring by hand. These results demonstrate the feasibility of using computer +vision techniques for monitoring maize growth and can lead to further research +in this area. + +
+
+ comment: 22 pages;15 figures +
+
+
+
+
+ + ☆ GURecon: Learning Detailed 3D Geometric Uncertainties for Neural Surface + Reconstruction AAAI 2025 + + +
+ Neural surface representation has demonstrated remarkable success in the areas of novel view synthesis and 3D reconstruction. However, assessing the geometric quality of 3D reconstructions in the absence of a ground-truth mesh remains a significant challenge, due to the rendering-based optimization process and the entangled learning of appearance and geometry with photometric losses. In this paper, we present a novel framework, i.e., GURecon, which establishes a geometric uncertainty field for the neural surface based on geometric consistency. Different from existing methods that rely on rendering-based measurement, GURecon models a continuous 3D uncertainty field for the reconstructed surface, which is learned by an online distillation approach without introducing real geometric information for supervision. Moreover, in order to mitigate the interference of illumination on geometric consistency, a decoupled field is learned and exploited to finetune the uncertainty field. Experiments on various datasets demonstrate the superiority of GURecon in modeling 3D geometric uncertainty, as well as its plug-and-play extension to various neural surface representations and improvement on downstream tasks such as incremental reconstruction. The code and supplementary material are available on the project website: https://zju3dv.github.io/GURecon/.
+
+ comment: Accepted by AAAI 2025. Project page: + https://zju3dv.github.io/gurecon/ +
+
+
+
+
+ ☆ Automatic Spectral Calibration of Hyperspectral Images: Method, Dataset and Benchmark
+ Hyperspectral image (HSI) densely samples the world in both the space and +frequency domain and therefore is more distinctive than RGB images. Usually, +HSI needs to be calibrated to minimize the impact of various illumination +conditions. The traditional way to calibrate HSI utilizes a physical reference, +which involves manual operations, occlusions, and/or limits camera mobility. +These limitations inspire this paper to automatically calibrate HSIs using a +learning-based method. Towards this goal, a large-scale HSI calibration dataset +is created, which has 765 high-quality HSI pairs covering diversified natural +scenes and illuminations. The dataset is further expanded to 7650 pairs by +combining with 10 different physically measured illuminations. A spectral +illumination transformer (SIT) together with an illumination attention module +is proposed. Extensive benchmarks demonstrate the SoTA performance of the +proposed SIT. The benchmarks also indicate that low-light conditions are more +challenging than normal conditions. The dataset and codes are available +online:https://github.com/duranze/Automatic-spectral-calibration-of-HSI + +
+
+
+
+
+ + ☆ MagicNaming: Consistent Identity Generation by Finding a "Name Space" in + T2I Diffusion Models AAAI 2025 + + +
+ Large-scale text-to-image diffusion models (e.g., DALL-E, SDXL) are capable of generating famous persons by simply referring to their names. Is it possible to make such models generate generic identities as simply as famous ones, e.g., by just using a name? In this paper, we explore the existence of a "Name Space", where any point in the space corresponds to a specific identity. Fortunately, we find some clues in the feature space spanned by text embeddings of celebrities' names. Specifically, we first extract the embeddings of celebrities' names in the Laion5B dataset with the text encoder of diffusion models. Such embeddings are used as supervision to learn an encoder that can predict the name (actually an embedding) of a given face image. We experimentally find that such name embeddings work well in ensuring good identity consistency in the generated images. Note that like the names of celebrities, our predicted name embeddings are disentangled from the semantics of text inputs, making the original generation capability of text-to-image models well-preserved. Moreover, by simply plugging in such name embeddings, all variants (e.g., from Civitai) derived from the same base model (i.e., SDXL) readily become identity-aware text-to-image models. Project homepage: https://magicfusion.github.io/MagicNaming/.
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ Multimodal Hypothetical Summary for Retrieval-based Multi-image Question + Answering AAAI 2025 + + +
+ The retrieval-based multi-image question answering (QA) task involves retrieving multiple question-related images and synthesizing these images to generate an answer. Conventional "retrieve-then-answer" pipelines often suffer from cascading errors because the training objective of QA fails to optimize the retrieval stage. To address this issue, we propose a novel method to effectively introduce and reference retrieved information into QA. Given the image set to be retrieved, we employ a multimodal large language model (visual perspective) and a large language model (textual perspective) to obtain a multimodal hypothetical summary (MHyS) in question form and description form. By combining visual and textual perspectives, MHyS captures image content more specifically and replaces real images in retrieval, which eliminates the modality gap by converting the task into text-to-text retrieval and thereby improves retrieval. To better couple retrieval with QA, we employ contrastive learning to align queries (questions) with MHyS. Moreover, we propose a coarse-to-fine strategy for calculating both sentence-level and word-level similarity scores, to further enhance retrieval and filter out irrelevant details. Our approach achieves a 3.7% absolute improvement over state-of-the-art methods on RETVQA and a 14.5% improvement over CLIP. Comprehensive experiments and detailed ablation studies demonstrate the superiority of our method.
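A sketch of the coarse-to-fine scoring idea, assuming the query and the hypothetical summary are already encoded into one sentence vector plus word-level token vectors; the equal weighting between the two levels is an illustrative assumption, not the paper's setting:

```python
import torch
import torch.nn.functional as F

def sentence_sim(q_vec, s_vec):
    return F.cosine_similarity(q_vec, s_vec, dim=-1)           # (B,)

def word_sim(q_tok, s_tok):
    # Max-over-summary cosine similarity per query token, then mean (late-interaction style).
    q = F.normalize(q_tok, dim=-1)                              # (B, Lq, D)
    s = F.normalize(s_tok, dim=-1)                              # (B, Ls, D)
    sim = torch.einsum("bqd,bsd->bqs", q, s)                    # (B, Lq, Ls)
    return sim.max(dim=-1).values.mean(dim=-1)                  # (B,)

def coarse_to_fine_score(q_vec, s_vec, q_tok, s_tok, alpha=0.5):
    return alpha * sentence_sim(q_vec, s_vec) + (1 - alpha) * word_sim(q_tok, s_tok)

B, Lq, Ls, D = 4, 12, 40, 256
score = coarse_to_fine_score(torch.randn(B, D), torch.randn(B, D),
                             torch.randn(B, Lq, D), torch.randn(B, Ls, D))
print(score.shape)  # torch.Size([4])
```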
+
+ comment: AAAI 2025 +
+
+
+
+
+ + ☆ Zero-Shot Artifact2Artifact: Self-incentive artifact removal for + photoacoustic imaging without any data + + +
+ Photoacoustic imaging (PAI) uniquely combines optical contrast with the penetration depth of ultrasound, making it critical for clinical applications. However, the quality of 3D PAI is often degraded due to reconstruction artifacts caused by the sparse and angle-limited configuration of detector arrays. Existing iterative or deep learning-based methods are either time-consuming or require large training datasets, significantly limiting their practical application. Here, we propose Zero-Shot Artifact2Artifact (ZS-A2A), a zero-shot self-supervised artifact removal method based on a super-lightweight network, which leverages the fact that reconstruction artifacts are sensitive to irregularities caused by data loss. By introducing random perturbations to the acquired PA data, it spontaneously generates subset data, which in turn stimulates the network to learn the artifact patterns in the reconstruction results, thus enabling zero-shot artifact removal. This approach requires neither training data nor prior knowledge of the artifacts, and is capable of artifact removal for 3D PAI. For maximum amplitude projection (MAP) images or slice images in 3D PAI acquired with arbitrarily sparse or angle-limited detector arrays, ZS-A2A employs a self-incentive strategy to complete artifact removal and improves the Contrast-to-Noise Ratio (CNR). We validated ZS-A2A in both simulation studies and in vivo animal experiments. Results demonstrate that ZS-A2A achieves state-of-the-art (SOTA) performance compared to existing zero-shot methods, and for the in vivo rat liver, ZS-A2A improves CNR from 17.48 to 43.46 in just 8 seconds. The project for ZS-A2A will be available at the following GitHub repository: https://github.com/JaegerCQ/ZS-A2A.
+
+
+
+
+ + ☆ Large-scale School Mapping using Weakly Supervised Deep Learning for + Universal School Connectivity AAAI-25 + + +
+ Improving global school connectivity is critical for ensuring inclusive and +equitable quality education. To reliably estimate the cost of connecting +schools, governments and connectivity providers require complete and accurate +school location data - a resource that is often scarce in many low- and +middle-income countries. To address this challenge, we propose a +cost-effective, scalable approach to locating schools in high-resolution +satellite images using weakly supervised deep learning techniques. Our best +models, which combine vision transformers and convolutional neural networks, +achieve AUPRC values above 0.96 across 10 pilot African countries. Leveraging +explainable AI techniques, our approach can approximate the precise +geographical coordinates of the school locations using only low-cost, +classification-level annotations. To demonstrate the scalability of our method, +we generate nationwide maps of school location predictions in African countries +and present a detailed analysis of our results, using Senegal as our case +study. Finally, we demonstrate the immediate usability of our work by +introducing an interactive web mapping tool to streamline human-in-the-loop +model validation efforts by government partners. This work successfully +showcases the real-world utility of deep learning and satellite images for +planning regional infrastructure and accelerating universal school +connectivity. + +
+
+ comment: Accepted at AAAI-25 Special Track on AI for Social Impact (AISI) +
+
+
+
+
+ + ☆ AI-Powered Intracranial Hemorrhage Detection: A Co-Scale Convolutional + Attention Model with Uncertainty-Based Fuzzy Integral Operator and Feature + Screening + + +
+ Intracranial hemorrhage (ICH) refers to the leakage or accumulation of blood within the skull, which occurs due to the rupture of blood vessels in or around the brain. If this condition is not diagnosed in a timely manner and appropriately treated, it can lead to serious complications such as decreased consciousness, permanent neurological disabilities, or even death. The primary aim of this study is to detect the occurrence or non-occurrence of ICH, followed by determining the type of subdural hemorrhage (SDH). These tasks are framed as two separate binary classification problems. By adding two layers to the co-scale convolutional attention (CCA) classifier architecture, we introduce a novel approach for ICH detection. In the first layer, after extracting features from different slices of computed tomography (CT) scan images, we combine these features and select the 50 components that capture the highest variance in the data, considering them as informative features. We then assess the discriminative power of these features using the bootstrap forest algorithm, discarding those that lack sufficient discriminative ability between different classes. This algorithm explicitly determines the contribution of each feature to the final prediction, assisting us in developing an explainable AI model. These features are then fed into a boosting neural network as a latent feature space. In the second layer, we introduce a novel uncertainty-based fuzzy integral operator to fuse information from different CT scan slices. This operator, by accounting for the dependencies between consecutive slices, significantly improves detection accuracy.
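A sketch of the first-layer feature pipeline under stated assumptions: scikit-learn's PCA stands in for selecting the 50 highest-variance components, and a random forest's impurity-based importances stand in for the bootstrap forest screening; the data, labels, and screening threshold are synthetic:

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 512))          # slice-level features (synthetic)
y = rng.integers(0, 2, size=500)         # ICH vs. no ICH (synthetic labels)

# Keep the 50 components that capture the highest variance.
X50 = PCA(n_components=50, random_state=0).fit_transform(X)

# Screen components by their discriminative power (illustrative stand-in
# for the bootstrap forest step).
forest = RandomForestClassifier(n_estimators=200, random_state=0).fit(X50, y)
keep = forest.feature_importances_ >= np.median(forest.feature_importances_)
X_screened = X50[:, keep]
print(X_screened.shape)  # roughly (500, 25) after discarding weak components
```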
+
+
+
+
+ + ☆ Head and Neck Tumor Segmentation of MRI from Pre- and Mid-radiotherapy + with Pre-training, Data Augmentation and Dual Flow UNet + + +
+ Head and neck tumors and metastatic lymph nodes are crucial for treatment +planning and prognostic analysis. Accurate segmentation and quantitative +analysis of these structures require pixel-level annotation, making automated +segmentation techniques essential for the diagnosis and treatment of head and +neck cancer. In this study, we investigated the effects of multiple strategies +on the segmentation of pre-radiotherapy (pre-RT) and mid-radiotherapy (mid-RT) +images. For the segmentation of pre-RT images, we utilized: 1) a fully +supervised learning approach, and 2) the same approach enhanced with +pre-trained weights and the MixUp data augmentation technique. For mid-RT +images, we introduced a novel computational-friendly network architecture that +features separate encoders for mid-RT images and registered pre-RT images with +their labels. The mid-RT encoder branch integrates information from pre-RT +images and labels progressively during the forward propagation. We selected the +highest-performing model from each fold and used their predictions to create an +ensemble average for inference. In the final test, our models achieved a +segmentation performance of 82.38% for pre-RT and 72.53% for mid-RT on +aggregated Dice Similarity Coefficient (DSC) as HiLab. Our code is available at +https://github.com/WltyBY/HNTS-MRG2024_train_code. + +
+
+
+
+
+ + ☆ ObjVariantEnsemble: Advancing Point Cloud LLM Evaluation in Challenging + Scenes with Subtly Distinguished Objects AAAI2025 + + +
+ 3D scene understanding is an important task, and there has been a recent +surge of research interest in aligning 3D representations of point clouds with +text to empower embodied AI. However, due to the lack of comprehensive 3D +benchmarks, the capabilities of 3D models in real-world scenes, particularly +those that are challenging with subtly distinguished objects, remain +insufficiently investigated. To facilitate a more thorough evaluation of 3D +models' capabilities, we propose a scheme, ObjVariantEnsemble, to +systematically introduce more scenes with specified object classes, colors, +shapes, quantities, and spatial relationships to meet model evaluation needs. +More importantly, we intentionally construct scenes with similar objects to a +certain degree and design an LLM-VLM-cooperated annotator to capture key +distinctions as annotations. The resultant benchmark can better challenge 3D +models, reveal their shortcomings in understanding, and potentially aid in the +further development of 3D models. + +
+
+ comment: Accepted to AAAI2025 +
+
+
+
+
+ + ☆ Progressive Multimodal Reasoning via Active Retrieval + + +
+ Multi-step multimodal reasoning tasks pose significant challenges for +multimodal large language models (MLLMs), and finding effective ways to enhance +their performance in such scenarios remains an unresolved issue. In this paper, +we propose AR-MCTS, a universal framework designed to progressively improve the +reasoning capabilities of MLLMs through Active Retrieval (AR) and Monte Carlo +Tree Search (MCTS). Our approach begins with the development of a unified +retrieval module that retrieves key supporting insights for solving complex +reasoning problems from a hybrid-modal retrieval corpus. To bridge the gap in +automated multimodal reasoning verification, we employ the MCTS algorithm +combined with an active retrieval mechanism, which enables the automatic +generation of step-wise annotations. This strategy dynamically retrieves key +insights for each reasoning step, moving beyond traditional beam search +sampling to improve the diversity and reliability of the reasoning space. +Additionally, we introduce a process reward model that aligns progressively to +support the automatic verification of multimodal reasoning tasks. Experimental +results across three complex multimodal reasoning benchmarks confirm the +effectiveness of the AR-MCTS framework in enhancing the performance of various +multimodal models. Further analysis demonstrates that AR-MCTS can optimize +sampling diversity and accuracy, yielding reliable multimodal reasoning. + +
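The active-retrieval step in isolation can be pictured as embedding the current partial solution and fetching the most similar supporting passages at every reasoning step. A toy sketch with a placeholder bag-of-words embedding (a real system would use a trained multimodal encoder) and with the MCTS machinery omitted entirely:

```python
import zlib
import numpy as np

def embed(text, dim=64):
    # Toy deterministic bag-of-words embedding; a real system would use a trained encoder.
    v = np.zeros(dim)
    for w in text.lower().split():
        rng = np.random.default_rng(zlib.crc32(w.encode()))
        v += rng.normal(size=dim)
    return v / (np.linalg.norm(v) + 1e-9)

def retrieve(step_state, corpus, k=2):
    # Fetch the top-k passages most similar to the current reasoning step.
    q = embed(step_state)
    scores = [float(q @ embed(doc)) for doc in corpus]
    ranked = sorted(range(len(corpus)), key=lambda i: scores[i], reverse=True)
    return [corpus[i] for i in ranked[:k]]

corpus = ["the area of a circle is pi times r squared",
          "the derivative of sin is cos",
          "bar charts encode values by length"]
step = "step 2: take the derivative of sin x"
print(retrieve(step, corpus, k=1))  # typically surfaces the derivative fact
```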
+
+ comment: Work in progress
+
+
+
+
+ + ☆ Synchronized and Fine-Grained Head for Skeleton-Based Ambiguous Action + Recognition + + +
+ Skeleton-based action recognition using GCNs has achieved remarkable performance, but recognizing ambiguous actions, such as "waving" and "saluting", remains a significant challenge. Existing methods typically rely on a serial combination of GCNs and TCNs, where spatial and temporal features are extracted independently, leading to unbalanced spatial-temporal information, which hinders accurate action recognition. Moreover, existing methods for ambiguous actions often overemphasize local details, resulting in the loss of crucial global context, which further complicates the task of differentiating ambiguous actions. To address these challenges, we propose a lightweight plug-and-play module called Synchronized and Fine-grained Head (SF-Head), inserted between GCN and TCN layers. SF-Head first conducts Synchronized Spatial-Temporal Extraction (SSTE) with a Feature Redundancy Loss (F-RL), ensuring a balanced interaction between the two types of features. It then performs Adaptive Cross-dimensional Feature Aggregation (AC-FA), with a Feature Consistency Loss (F-CL), which aligns the aggregated features with their original spatial-temporal features. This aggregation step effectively combines both global context and local details. Experimental results on NTU RGB+D 60, NTU RGB+D 120, and NW-UCLA datasets demonstrate significant improvements in distinguishing ambiguous actions. Our code will be made available at https://github.com/HaoHuang2003/SFHead.
+
+ comment: 20 pages, 5 figures
+
+
+
+
+ + ☆ PC-BEV: An Efficient Polar-Cartesian BEV Fusion Framework for LiDAR + Semantic Segmentation AAAI 2025 + + +
+ Although multiview fusion has demonstrated potential in LiDAR segmentation, +its dependence on computationally intensive point-based interactions, arising +from the lack of fixed correspondences between views such as range view and +Bird's-Eye View (BEV), hinders its practical deployment. This paper challenges +the prevailing notion that multiview fusion is essential for achieving high +performance. We demonstrate that significant gains can be realized by directly +fusing Polar and Cartesian partitioning strategies within the BEV space. Our +proposed BEV-only segmentation model leverages the inherent fixed grid +correspondences between these partitioning schemes, enabling a fusion process +that is orders of magnitude faster (170$\times$ speedup) than conventional +point-based methods. Furthermore, our approach facilitates dense feature +fusion, preserving richer contextual information compared to sparse point-based +alternatives. To enhance scene understanding while maintaining inference +efficiency, we also introduce a hybrid Transformer-CNN architecture. Extensive +evaluation on the SemanticKITTI and nuScenes datasets provides compelling +evidence that our method outperforms previous multiview fusion approaches in +terms of both performance and inference speed, highlighting the potential of +BEV-based fusion for LiDAR segmentation. Code is available at +\url{https://github.com/skyshoumeng/PC-BEV.} + +
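The fixed grid correspondence is what makes point-based interaction unnecessary: because both partitions cover the same BEV plane, each Cartesian cell's polar bin can be precomputed once and features gathered with plain indexing. A sketch with illustrative grid sizes and simple concatenation as the fusion (not the paper's architecture):

```python
import torch

H = W = 64                 # Cartesian BEV grid
R, A = 32, 64              # polar grid: range bins x azimuth bins
max_range = 50.0

# Precompute, once, the polar bin that each Cartesian cell falls into.
ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
x = (xs.float() + 0.5) / W * 2 * max_range - max_range
y = (ys.float() + 0.5) / H * 2 * max_range - max_range
r_bin = (torch.sqrt(x**2 + y**2) / max_range * R).long().clamp(max=R - 1)
a_bin = ((torch.atan2(y, x) + torch.pi) / (2 * torch.pi) * A).long().clamp(max=A - 1)
flat_idx = r_bin * A + a_bin                       # (H, W), fixed correspondence

def fuse(cart_feat, polar_feat):
    """cart_feat: (C, H, W); polar_feat: (C, R, A) -> fused (2C, H, W)."""
    polar_on_cart = polar_feat.flatten(1)[:, flat_idx.flatten()].view(-1, H, W)
    return torch.cat([cart_feat, polar_on_cart], dim=0)

fused = fuse(torch.randn(16, H, W), torch.randn(16, R, A))
print(fused.shape)  # torch.Size([32, 64, 64])
```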
+
+ comment: AAAI 2025 +
+
+
+
+
+ + ☆ Multi-Level Embedding and Alignment Network with Consistency and + Invariance Learning for Cross-View Geo-Localization + + +
+ Cross-View Geo-Localization (CVGL) involves determining the localization of +drone images by retrieving the most similar GPS-tagged satellite images. +However, the imaging gaps between platforms are often significant and the +variations in viewpoints are substantial, which limits the ability of existing +methods to effectively associate cross-view features and extract consistent and +invariant characteristics. Moreover, existing methods often overlook the +problem of increased computational and storage requirements when improving +model performance. To handle these limitations, we propose a lightweight +enhanced alignment network, called the Multi-Level Embedding and Alignment +Network (MEAN). The MEAN network uses a progressive multi-level enhancement +strategy, global-to-local associations, and cross-domain alignment, enabling +feature communication across levels. This allows MEAN to effectively connect +features at different levels and learn robust cross-view consistent mappings +and modality-invariant features. Moreover, MEAN adopts a shallow backbone +network combined with a lightweight branch design, effectively reducing +parameter count and computational complexity. Experimental results on the +University-1652 and SUES-200 datasets demonstrate that MEAN reduces parameter +count by 62.17% and computational complexity by 70.99% compared to +state-of-the-art models, while maintaining competitive or even superior +performance. The codes will be released soon. + +
+
+
+
+
+ + ☆ Explainable Tampered Text Detection via Multimodal Large Models + + +
+ Recently, tampered text detection has attracted increasing attention due to +its essential role in information security. Although existing methods can +detect the tampered text region, the interpretation of such detection remains +unclear, making the prediction unreliable. To address this black-box problem, +we propose to explain the basis of tampered text detection with natural +language via large multimodal models. To fill the data gap for this task, we +propose a large-scale, comprehensive dataset, ETTD, which contains both +pixel-level annotations indicating the tampered text region and natural +language annotations describing the anomaly of the tampered text. Multiple +methods are employed to improve the quality of the proposed data. For example, +a fused mask prompt is proposed to reduce confusion when querying GPT4o to +generate anomaly descriptions. By weighting the input image with the mask +annotation, the tampered region can be clearly indicated and the content in and +around the tampered region can also be preserved. We also propose prompting +GPT4o to recognize tampered texts and filtering out the responses with low OCR +accuracy, which can effectively improve annotation quality in an automatic +manner. To further improve explainable tampered text detection, we propose a +simple yet effective model called TTD, which benefits from improved +fine-grained perception by paying attention to the suspected region with +auxiliary reference grounding query. Extensive experiments on both the ETTD +dataset and the public dataset have verified the effectiveness of the proposed +methods. In-depth analysis is also provided to inspire further research. The +dataset and code will be made publicly available. + +
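A sketch of the fused mask prompt as described above: the tamper mask weights the image so the tampered region stays at full intensity while the surrounding context is only dimmed, keeping it visible to the annotating model. The dimming factor is an assumption:

```python
import numpy as np

def fused_mask_prompt(image, mask, keep=0.35):
    """image: (H, W, 3) float in [0, 1]; mask: (H, W) in {0, 1} marking tampered pixels.
    Pixels outside the mask are dimmed rather than removed, so local context survives."""
    weight = mask[..., None] + keep * (1.0 - mask[..., None])
    return np.clip(image * weight, 0.0, 1.0)

img = np.random.rand(8, 8, 3)
msk = np.zeros((8, 8))
msk[2:5, 3:6] = 1.0
prompt_img = fused_mask_prompt(img, msk)
print(prompt_img.shape, prompt_img[0, 0].mean() < img[0, 0].mean())  # context dimmed
```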
+
+ comment: The first work for explainable tampered text detection +
+
+
+
+
+ + ☆ Video Prediction Policy: A Generalist Robot Policy with Predictive + Visual Representations + + +
+ Recent advancements in robotics have focused on developing generalist policies capable of performing multiple tasks. Typically, these policies utilize pre-trained vision encoders to capture crucial information from current observations. However, previous vision encoders, which are trained with two-image contrastive learning or single-image reconstruction, cannot fully capture the sequential information essential for embodied tasks. Recently, video diffusion models (VDMs) have demonstrated the capability to accurately predict future image sequences, exhibiting a good understanding of physical dynamics. Motivated by the strong visual prediction capabilities of VDMs, we hypothesize that they inherently possess visual representations that reflect the evolution of the physical world, which we term predictive visual representations. Building on this hypothesis, we propose the Video Prediction Policy (VPP), a generalist robotic policy conditioned on the predictive visual representations from VDMs. To further enhance these representations, we incorporate diverse human or robotic manipulation datasets, employing unified video-generation training objectives. VPP consistently outperforms existing methods across two simulated and two real-world benchmarks. Notably, it achieves a 28.1% relative improvement in the Calvin ABC-D benchmark compared to the previous state-of-the-art and delivers a 28.8% increase in success rates for complex real-world dexterous manipulation tasks.
+
+ comment: The first two authors contribute equally. Project Page at + https://video-prediction-policy.github.io/ +
+
+
+
+
+ + ☆ YOLOv11 Optimization for Efficient Resource Utilization + + +
+ The objective of this research is to optimize the eleventh iteration of You +Only Look Once (YOLOv11) by developing size-specific modified versions of the +architecture. These modifications involve pruning unnecessary layers and +reconfiguring the main architecture of YOLOv11. Each proposed version is +tailored to detect objects of specific size ranges, from small to large. To +ensure proper model selection based on dataset characteristics, we introduced +an object classifier program. This program identifies the most suitable +modified version for a given dataset. The proposed models were evaluated on +various datasets and compared with the original YOLOv11 and YOLOv8 models. The +experimental results highlight significant improvements in computational +resource efficiency, with the proposed models maintaining the accuracy of the +original YOLOv11. In some cases, the modified versions outperformed the +original model regarding detection performance. Furthermore, the proposed +models demonstrated reduced model sizes and faster inference times. Models +weights and the object size classifier can be found in this repository + +
+
+ comment: 12 pages, 13 figures, 4 tables +
+
+
+
+
+ + ☆ FLAMe: Federated Learning with Attention Mechanism using Spatio-Temporal + Keypoint Transformers for Pedestrian Fall Detection in Smart Cities AAAI 2025 + + +
+ In smart cities, detecting pedestrian falls is a major challenge to ensure +the safety and quality of life of citizens. In this study, we propose a novel +fall detection system using FLAMe (Federated Learning with Attention +Mechanism), a federated learning (FL) based algorithm. FLAMe trains around +important keypoint information and only transmits the trained important weights +to the server, reducing communication costs and preserving data privacy. +Furthermore, the lightweight keypoint transformer model is integrated into the +FL framework to effectively learn spatio-temporal features. We validated the +experiment using 22,672 video samples from the "Fall Accident Risk Behavior +Video-Sensor Pair data" dataset from AI-Hub. As a result of the experiment, the +FLAMe-based system achieved an accuracy of 94.02% with about 190,000 +transmission parameters, maintaining performance similar to that of existing +centralized learning while maximizing efficiency by reducing communication +costs by about 40% compared to the existing FL algorithm, FedAvg. Therefore, +the FLAMe algorithm has demonstrated that it provides robust performance in the +distributed environment of smart cities and is a practical and effective +solution for public safety. + +
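The communication pattern can be pictured as a FedAvg-style round in which clients upload only a named subset of parameters. In the sketch below, the model, the "keypoint_attn" name filter, and the omission of the local training loop are all illustrative assumptions, not the FLAMe implementation:

```python
from collections import OrderedDict
import torch
import torch.nn as nn

class FallDetector(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Linear(34, 64)        # e.g. 17 keypoints x (x, y)
        self.keypoint_attn = nn.Linear(64, 64)   # the "important" weights to share
        self.head = nn.Linear(64, 2)

    def forward(self, x):
        h = torch.relu(self.backbone(x))
        return self.head(torch.relu(self.keypoint_attn(h)))

def client_update(model):
    # Local training would go here; return only the shared subset of weights.
    return OrderedDict((k, v.detach().clone())
                       for k, v in model.state_dict().items() if "keypoint_attn" in k)

def server_aggregate(global_model, client_payloads):
    avg = OrderedDict()
    for k in client_payloads[0]:
        avg[k] = torch.stack([p[k] for p in client_payloads]).mean(dim=0)
    global_model.load_state_dict(avg, strict=False)  # only the shared keys are updated

clients = [FallDetector() for _ in range(3)]
global_model = FallDetector()
server_aggregate(global_model, [client_update(c) for c in clients])
print(sum(v.numel() for v in client_update(clients[0]).values()), "parameters transmitted")
```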
+
+ comment: 8 pages, 7 figures, AAAI 2025 FLUID Workshop +
+
+
+
+
+ + ☆ Prototypical Calibrating Ambiguous Samples for Micro-Action Recognition AAAI 2025 + + +
+ Micro-Action Recognition (MAR) has gained increasing attention due to its crucial role as a form of non-verbal communication in social interactions, with promising potential for applications in human communication and emotion analysis. However, current approaches often overlook the inherent ambiguity in micro-actions, which arises from the wide category range and subtle visual differences between categories. This oversight hampers the accuracy of micro-action recognition. In this paper, we propose a novel Prototypical Calibrating Ambiguous Network (PCAN) to expose and mitigate the ambiguity of MAR. Firstly, we employ a hierarchical action-tree to identify ambiguous samples, categorizing them into distinct sets of false-negative and false-positive ambiguous samples, considering both body- and action-level categories. Secondly, we implement an ambiguous contrastive refinement module to calibrate these ambiguous samples by regulating the distance between ambiguous samples and their corresponding prototypes. This calibration process aims to pull false negative (FN) samples closer to their respective prototypes and push false positive (FP) samples apart from their affiliated prototypes. In addition, we propose a new prototypical diversity amplification loss to strengthen the model's capacity by amplifying the differences between different prototypes. Finally, we propose a prototype-guided rectification to rectify prediction by incorporating the representability of prototypes. Extensive experiments conducted on the benchmark dataset demonstrate the superior performance of our method compared to existing approaches. The code is available at https://github.com/kunli-cs/PCAN.
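A sketch of the calibration objective under stated assumptions: false-negative embeddings are pulled toward their class prototypes and false-positive embeddings are pushed beyond a margin; the Euclidean distance and the margin value are illustrative choices rather than the paper's exact formulation:

```python
import torch
import torch.nn.functional as F

def prototype_calibration_loss(fn_emb, fp_emb, prototypes, fn_labels, fp_labels, margin=1.0):
    """fn_emb / fp_emb: (N, D) ambiguous-sample embeddings; prototypes: (C, D)."""
    # Pull false negatives toward their own prototypes.
    pull = F.pairwise_distance(fn_emb, prototypes[fn_labels]).pow(2).mean()
    # Push false positives at least `margin` away from the prototypes they cling to.
    push = F.relu(margin - F.pairwise_distance(fp_emb, prototypes[fp_labels])).pow(2).mean()
    return pull + push

protos = F.normalize(torch.randn(10, 128), dim=-1)
loss = prototype_calibration_loss(torch.randn(6, 128), torch.randn(6, 128), protos,
                                  torch.randint(0, 10, (6,)), torch.randint(0, 10, (6,)))
print(loss.item())
```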
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ EnergyMoGen: Compositional Human Motion Generation with Energy-Based + Diffusion Model in Latent Space + + +
+ Diffusion models, particularly latent diffusion models, have demonstrated +remarkable success in text-driven human motion generation. However, it remains +challenging for latent diffusion models to effectively compose multiple +semantic concepts into a single, coherent motion sequence. To address this +issue, we propose EnergyMoGen, which includes two spectrums of Energy-Based +Models: (1) We interpret the diffusion model as a latent-aware energy-based +model that generates motions by composing a set of diffusion models in latent +space; (2) We introduce a semantic-aware energy model based on cross-attention, +which enables semantic composition and adaptive gradient descent for text +embeddings. To overcome the challenges of semantic inconsistency and motion +distortion across these two spectrums, we introduce Synergistic Energy Fusion. +This design allows the motion latent diffusion model to synthesize +high-quality, complex motions by combining multiple energy terms corresponding +to textual descriptions. Experiments show that our approach outperforms +existing state-of-the-art models on various motion generation tasks, including +text-to-motion generation, compositional motion generation, and multi-concept +motion generation. Additionally, we demonstrate that our method can be used to +extend motion datasets and improve the text-to-motion task. + +
+
+ comment: Project page: https://jiro-zhang.github.io/EnergyMoGen/ +
+
+
+
+
+ + ☆ Event-assisted 12-stop HDR Imaging of Dynamic Scene + + +
+ High dynamic range (HDR) imaging is a crucial task in computational +photography, which captures details across diverse lighting conditions. +Traditional HDR fusion methods face limitations in dynamic scenes with extreme +exposure differences, as aligning low dynamic range (LDR) frames becomes +challenging due to motion and brightness variation. In this work, we propose a +novel 12-stop HDR imaging approach for dynamic scenes, leveraging a dual-camera +system with an event camera and an RGB camera. The event camera provides +temporally dense, high dynamic range signals that improve alignment between LDR +frames with large exposure differences, reducing ghosting artifacts caused by +motion. Also, a real-world finetuning strategy is proposed to increase the +generalization of alignment module on real-world events. Additionally, we +introduce a diffusion-based fusion module that incorporates image priors from +pre-trained diffusion models to address artifacts in high-contrast regions and +minimize errors from the alignment process. To support this work, we developed +the ESHDR dataset, the first dataset for 12-stop HDR imaging with synchronized +event signals, and validated our approach on both simulated and real-world +data. Extensive experiments demonstrate that our method achieves +state-of-the-art performance, successfully extending HDR imaging to 12 stops in +dynamic scenes. + +
+
+ comment: Project page: + https://openimaginglab.github.io/Event-Assisted-12stops-HDR/ +
+
+
+
+
+ + ☆ Explicit Relational Reasoning Network for Scene Text Detection AAAI 2025 + + +
+ Connected component (CC) is a proper text shape representation that aligns +with human reading intuition. However, CC-based text detection methods have +recently faced a developmental bottleneck that their time-consuming +post-processing is difficult to eliminate. To address this issue, we introduce +an explicit relational reasoning network (ERRNet) to elegantly model the +component relationships without post-processing. Concretely, we first represent +each text instance as multiple ordered text components, and then treat these +components as objects in sequential movement. In this way, scene text detection +can be innovatively viewed as a tracking problem. From this perspective, we +design an end-to-end tracking decoder to achieve a CC-based method dispensing +with post-processing entirely. Additionally, we observe that there is an +inconsistency between classification confidence and localization quality, so we +propose a Polygon Monte-Carlo method to quickly and accurately evaluate the +localization quality. Based on this, we introduce a position-supervised +classification loss to guide the task-aligned learning of ERRNet. Experiments +on challenging benchmarks demonstrate the effectiveness of our ERRNet. It +consistently achieves state-of-the-art accuracy while holding highly +competitive inference speed. + +
+
+ comment: Accepted to AAAI 2025 +
+
+
+
+
+ + ☆ A Light-Weight Framework for Open-Set Object Detection with Decoupled + Feature Alignment in Joint Space + + +
+ Open-set object detection (OSOD) is highly desirable for robotic manipulation in unstructured environments. However, existing OSOD methods often fail to meet the requirements of robotic applications due to their high computational burden and complex deployment. To address this issue, this paper proposes a light-weight framework called Decoupled OSOD (DOSOD), which is a practical and highly efficient solution to support real-time OSOD tasks in robotic systems. Specifically, DOSOD builds upon the YOLO-World pipeline by integrating a vision-language model (VLM) with a detector. A Multilayer Perceptron (MLP) adaptor is developed to transform text embeddings extracted by the VLM into a joint space, within which the detector learns the region representations of class-agnostic proposals. Cross-modality features are directly aligned in the joint space, avoiding complex feature interactions and thereby improving computational efficiency. DOSOD operates like a traditional closed-set detector during the testing phase, effectively bridging the gap between closed-set and open-set detection. Compared to the baseline YOLO-World, the proposed DOSOD significantly enhances real-time performance while maintaining comparable accuracy. The lightweight DOSOD-S model achieves a Fixed AP of 26.7%, compared to 26.2% for YOLO-World-v1-S and 22.7% for YOLO-World-v2-S, using similar backbones on the LVIS minival dataset. Meanwhile, the FPS of DOSOD-S is 57.1% higher than that of YOLO-World-v1-S and 29.6% higher than that of YOLO-World-v2-S. We also demonstrate that the DOSOD model facilitates deployment on edge devices. The codes and models are publicly available at https://github.com/D-Robotics-AI-Lab/DOSOD.
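A sketch of the joint-space alignment: an MLP adaptor maps VLM text embeddings into the detector's joint space, where scoring class-agnostic proposals reduces to cosine similarity with the adapted class embeddings. Dimensions and the two-layer adaptor are assumptions, not the DOSOD configuration:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextAdaptor(nn.Module):
    """Maps VLM text embeddings into the detector's joint space (illustrative sizes)."""
    def __init__(self, text_dim=512, joint_dim=256):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(text_dim, joint_dim), nn.GELU(),
                                 nn.Linear(joint_dim, joint_dim))

    def forward(self, text_emb):
        return F.normalize(self.mlp(text_emb), dim=-1)

adaptor = TextAdaptor()
class_text_emb = torch.randn(80, 512)                       # embeddings of 80 class prompts
region_feat = F.normalize(torch.randn(100, 256), dim=-1)    # class-agnostic proposal features

# Open-set scores: cosine similarity between proposals and adapted class embeddings.
scores = region_feat @ adaptor(class_text_emb).t()          # (100, 80)
print(scores.argmax(dim=-1)[:5])                            # predicted class per proposal
```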
+
+
+
+
+ + ☆ Efficient Few-Shot Neural Architecture Search by Counting the Number of + Nonlinear Functions AAAI 2025 + + +
+ Neural architecture search (NAS) enables finding the best-performing +architecture from a search space automatically. Most NAS methods exploit an +over-parameterized network (i.e., a supernet) containing all possible +architectures (i.e., subnets) in the search space. However, the subnets that +share the same set of parameters are likely to have different characteristics, +interfering with each other during training. To address this, few-shot NAS +methods have been proposed that divide the space into a few subspaces and +employ a separate supernet for each subspace to limit the extent of weight +sharing. They achieve state-of-the-art performance, but the computational cost +increases accordingly. We introduce in this paper a novel few-shot NAS method +that exploits the number of nonlinear functions to split the search space. To +be specific, our method divides the space such that each subspace consists of +subnets with the same number of nonlinear functions. Our splitting criterion is +efficient, since it does not require comparing gradients of a supernet to split +the space. In addition, we have found that dividing the space allows us to +reduce the channel dimensions required for each supernet, which enables +training multiple supernets in an efficient manner. We also introduce a +supernet-balanced sampling (SBS) technique, sampling several subnets at each +training step, to train different supernets evenly within a limited number of +training steps. Extensive experiments on standard NAS benchmarks demonstrate +the effectiveness of our approach. Our code is available at +https://cvlab.yonsei.ac.kr/projects/EFS-NAS. + +
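The splitting criterion itself is cheap to compute. A sketch, with a made-up operation set and per-operation nonlinearity counts, that groups every candidate subnet by its total number of nonlinear functions so each group can be assigned its own supernet:

```python
from collections import defaultdict
from itertools import product

# Number of nonlinear functions contributed by each candidate operation (illustrative).
NONLINEAR_COUNT = {"skip": 0, "conv3x3_relu": 1, "sep_conv_relu2": 2, "max_pool": 0}

def split_by_nonlinearity(num_edges=3):
    """Assign every subnet (one operation per edge) to a subspace by nonlinearity count."""
    subspaces = defaultdict(list)
    for subnet in product(NONLINEAR_COUNT, repeat=num_edges):
        n_nonlinear = sum(NONLINEAR_COUNT[op] for op in subnet)
        subspaces[n_nonlinear].append(subnet)
    return subspaces

subspaces = split_by_nonlinearity()
for count, subnets in sorted(subspaces.items()):
    print(f"{count} nonlinear functions -> {len(subnets)} subnets share one supernet")
```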
+
+ comment: Accepted to AAAI 2025 +
+
+
+
+
+ + ☆ FiVL: A Framework for Improved Vision-Language Alignment + + +
+ Large Vision Language Models (LVLMs) have achieved significant progress in +integrating visual and textual inputs for multimodal reasoning. However, a +recurring challenge is ensuring these models utilize visual information as +effectively as linguistic content when both modalities are necessary to +formulate an accurate answer. We hypothesize that hallucinations arise due to +the lack of effective visual grounding in current LVLMs. This issue extends to +vision-language benchmarks, where it is difficult to make the image +indispensable for accurate answer generation, particularly in vision +question-answering tasks. In this work, we introduce FiVL, a novel method for +constructing datasets designed to train LVLMs for enhanced visual grounding and +to evaluate their effectiveness in achieving it. These datasets can be utilized +for both training and assessing an LVLM's ability to use image content as +substantive evidence rather than relying solely on linguistic priors, providing +insights into the model's reliance on visual information. To demonstrate the +utility of our dataset, we introduce an innovative training task that +outperforms baselines alongside a validation method and application for +explainability. The code is available at https://github.com/IntelLabs/fivl. + +
+
+
+
+
+ + ☆ MUSTER: Longitudinal Deformable Registration by Composition of + Consecutive Deformations + + +
+ Longitudinal imaging allows for the study of structural changes over time. +One approach to detecting such changes is by non-linear image registration. +This study introduces Multi-Session Temporal Registration (MUSTER), a novel +method that facilitates longitudinal analysis of changes in extended series of +medical images. MUSTER improves upon conventional pairwise registration by +incorporating more than two imaging sessions to recover longitudinal +deformations. Longitudinal analysis at a voxel-level is challenging due to +effects of a changing image contrast as well as instrumental and environmental +sources of bias between sessions. We show that local normalized +cross-correlation as an image similarity metric leads to biased results and +propose a robust alternative. We test the performance of MUSTER on a synthetic +multi-site, multi-session neuroimaging dataset and show that, in various +scenarios, using MUSTER significantly enhances the estimated deformations +relative to pairwise registration. Additionally, we apply MUSTER on a sample of +older adults from the Alzheimer's Disease Neuroimaging Initiative (ADNI) study. +The results show that MUSTER can effectively identify patterns of +neuro-degeneration from T1-weighted images and that these changes correlate +with changes in cognition, matching the performance of state of the art +segmentation methods. By leveraging GPU acceleration, MUSTER efficiently +handles large datasets, making it feasible also in situations with limited +computational resources. + +
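One building block of recovering longitudinal deformations from more than two sessions is composing consecutive deformations. A 2D sketch of that composition step alone, assuming displacement fields in normalized [-1, 1] coordinates and using grid_sample to resample the second field at the locations reached by the first; it illustrates the composition rule, not the MUSTER pipeline:

```python
import torch
import torch.nn.functional as F

def compose_displacements(u1, u2):
    """u1, u2: (1, 2, H, W) displacement fields (channel 0 = x, channel 1 = y) in
    normalized [-1, 1] coordinates, each defining a map x -> x + u(x).
    Returns the displacement of the composed map: u12(x) = u1(x) + u2(x + u1(x))."""
    _, _, H, W = u1.shape
    ys, xs = torch.meshgrid(torch.linspace(-1, 1, H), torch.linspace(-1, 1, W), indexing="ij")
    identity = torch.stack([xs, ys], dim=-1).unsqueeze(0)      # (1, H, W, 2), (x, y) order
    warped_grid = identity + u1.permute(0, 2, 3, 1)            # where the first map sends voxels
    u2_at_phi1 = F.grid_sample(u2, warped_grid, align_corners=True)
    return u1 + u2_at_phi1

u1 = 0.05 * torch.randn(1, 2, 32, 32)
u2 = 0.05 * torch.randn(1, 2, 32, 32)
u12 = compose_displacements(u1, u2)
print(u12.shape)  # torch.Size([1, 2, 32, 32])
```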
+
+
+
+
+ + ☆ Unveiling Uncertainty: A Deep Dive into Calibration and Performance of + Multimodal Large Language Models COLING 2025 + + +
+ Multimodal large language models (MLLMs) combine visual and textual data for tasks such as image captioning and visual question answering. Proper uncertainty calibration is crucial, yet challenging, for reliable use in areas like healthcare and autonomous driving. This paper investigates representative MLLMs, focusing on their calibration across various scenarios, including before and after visual fine-tuning, as well as before and after multimodal training of the base LLMs. We observed miscalibration in their performance, and at the same time, no significant differences in calibration across these scenarios. We also highlight how uncertainty differs between text and images and how their integration affects overall uncertainty. To better understand MLLMs' miscalibration and their ability to self-assess uncertainty, we construct the IDK (I don't know) dataset, which is key to evaluating how they handle unknowns. Our findings reveal that MLLMs tend to give answers rather than admit uncertainty, but this self-assessment improves with proper prompt adjustments. Finally, to calibrate MLLMs and enhance model reliability, we propose techniques such as temperature scaling and iterative prompt optimization. Our results provide insights into improving MLLMs for effective and responsible deployment in multimodal applications. Code and IDK dataset: https://github.com/hfutml/Calibration-MLLM.
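Of the calibration techniques named above, temperature scaling is the simplest to illustrate: a single temperature is fitted on held-out logits by minimizing the negative log-likelihood and then applied at inference. The sketch below uses synthetic, deliberately over-confident logits; the optimizer settings are arbitrary:

```python
import torch
import torch.nn.functional as F

def fit_temperature(logits, labels, iters=200, lr=0.05):
    """Fit a single temperature T on validation logits by minimizing NLL."""
    log_T = torch.zeros(1, requires_grad=True)      # optimize log T so T stays positive
    opt = torch.optim.Adam([log_T], lr=lr)
    for _ in range(iters):
        opt.zero_grad()
        loss = F.cross_entropy(logits / log_T.exp(), labels)
        loss.backward()
        opt.step()
    return log_T.exp().item()

# Synthetic over-confident model: large margins, but 30% of its answers are wrong.
torch.manual_seed(0)
labels = torch.randint(0, 5, (512,))
pred_class = labels.clone()
flip = torch.rand(512) < 0.3
pred_class[flip] = torch.randint(0, 5, (int(flip.sum()),))
logits = torch.randn(512, 5) + 6.0 * F.one_hot(pred_class, 5).float()

T = fit_temperature(logits, labels)
probs = F.softmax(logits / T, dim=-1)
print(f"T = {T:.2f}, mean confidence after scaling = {probs.max(dim=-1).values.mean():.3f}")
```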
+
+ comment: Accepted to COLING 2025 +
+
+
+
+
+ + ☆ RefHCM: A Unified Model for Referring Perceptions in Human-Centric + Scenarios + + +
+ Human-centric perceptions play a crucial role in real-world applications. While recent human-centric works have achieved impressive progress, these efforts are often constrained to the visual domain and lack interaction with human instructions, limiting their applicability in broader scenarios such as chatbots and sports analysis. This paper introduces Referring Human Perceptions, where a referring prompt specifies the person of interest in an image. To tackle the new task, we propose RefHCM (Referring Human-Centric Model), a unified framework to integrate a wide range of human-centric referring tasks. Specifically, RefHCM employs sequence mergers to convert raw multimodal data -- including images, text, coordinates, and parsing maps -- into semantic tokens. This standardized representation enables RefHCM to reformulate diverse human-centric referring tasks into a sequence-to-sequence paradigm, solved using a plain encoder-decoder transformer architecture. Benefiting from a unified learning strategy, RefHCM effectively facilitates knowledge transfer across tasks and exhibits unforeseen capabilities in handling complex reasoning. This work represents the first attempt to address referring human perceptions with a general-purpose framework, while simultaneously establishing a corresponding benchmark that sets new standards for the field. Extensive experiments showcase RefHCM's competitive and even superior performance across multiple human-centric referring tasks. The code and data are publicly available at https://github.com/JJJYmmm/RefHCM.
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Adaptive Prompt Tuning: Vision Guided Prompt Tuning with Cross-Attention + for Fine-Grained Few-Shot Learning + + +
+ Few-shot, fine-grained classification in computer vision poses significant +challenges due to the need to differentiate subtle class distinctions with +limited data. This paper presents a novel method that enhances the Contrastive +Language-Image Pre-Training (CLIP) model through adaptive prompt tuning, guided +by real-time visual inputs. Unlike existing techniques such as Context +Optimization (CoOp) and Visual Prompt Tuning (VPT), which are constrained by +static prompts or visual token reliance, the proposed approach leverages a +cross-attention mechanism to dynamically refine text prompts for the image at +hand. This enables an image-specific alignment of textual features with image +patches extracted from the Vision Transformer, making the model more effective +for datasets with high intra-class variance and low inter-class differences. +The method is evaluated on several datasets, including CUBirds, Oxford Flowers, +and FGVC Aircraft, showing significant performance gains over static prompt +tuning approaches. To ensure these performance gains translate into trustworthy +predictions, we integrate Monte-Carlo Dropout in our approach to improve the +reliability of the model predictions and uncertainty estimates. This +integration provides valuable insights into the model's predictive confidence, +helping to identify when predictions can be trusted and when additional +verification is necessary. This dynamic approach offers a robust solution, +advancing the state-of-the-art for few-shot fine-grained classification. + +
+
+
+
+
+ + ☆ Progressive Fine-to-Coarse Reconstruction for Accurate Low-Bit + Post-Training Quantization in Vision Transformers + + +
+ Due to its efficiency, Post-Training Quantization (PTQ) has been widely +adopted for compressing Vision Transformers (ViTs). However, when quantized +into low-bit representations, there is often a significant performance drop +compared to their full-precision counterparts. To address this issue, +reconstruction methods have been incorporated into the PTQ framework to improve +performance in low-bit quantization settings. Nevertheless, existing related +methods predefine the reconstruction granularity and seldom explore the +progressive relationships between different reconstruction granularities, which +leads to sub-optimal quantization results in ViTs. To this end, in this paper, +we propose a Progressive Fine-to-Coarse Reconstruction (PFCR) method for +accurate PTQ, which significantly improves the performance of low-bit quantized +vision transformers. Specifically, we define multi-head self-attention and +multi-layer perceptron modules along with their shortcuts as the finest +reconstruction units. After reconstructing these two fine-grained units, we +combine them to form coarser blocks and reconstruct them at a coarser +granularity level. We iteratively perform this combination and reconstruction +process, achieving progressive fine-to-coarse reconstruction. Additionally, we +introduce a Progressive Optimization Strategy (POS) for PFCR to alleviate the +difficulty of training, thereby further enhancing model performance. +Experimental results on the ImageNet dataset demonstrate that our proposed +method achieves the best Top-1 accuracy among state-of-the-art methods, +particularly attaining 75.61% for 3-bit quantized ViT-B in PTQ. Besides, +quantization results on the COCO dataset reveal the effectiveness and +generalization of our proposed method on other computer vision tasks like +object detection and instance segmentation. + +
+
+
+
+
+ + ☆ Review of Fruit Tree Image Segmentation + + +
+ Fruit tree image segmentation is an essential problem in automating a variety +of agricultural tasks such as phenotyping, harvesting, spraying, and pruning. +Many research papers have proposed a diverse spectrum of solutions suitable to +specific tasks and environments. The review scope of this paper is confined to +the front views of fruit trees and based on 158 relevant papers collected using +a newly designed crawling review method. These papers are systematically +reviewed based on a taxonomy that sequentially considers the method, image, +task, and fruit. This taxonomy will assist readers to intuitively grasp the big +picture of these research activities. Our review reveals that the most +noticeable deficiency of the previous studies was the lack of a versatile +dataset and segmentation model that could be applied to a variety of tasks and +environments. Six important future research tasks are suggested, with the +expectation that these will pave the way to building a versatile tree +segmentation module. + +
+
+
+
+
+ + ☆ Unified Image Restoration and Enhancement: Degradation Calibrated Cycle + Reconstruction Diffusion Model + + +
+ Image restoration and enhancement are pivotal for numerous computer vision +applications, yet unifying these tasks efficiently remains a significant +challenge. Inspired by the iterative refinement capabilities of diffusion +models, we propose CycleRDM, a novel framework designed to unify restoration +and enhancement tasks while achieving high-quality mapping. Specifically, +CycleRDM first learns the mapping relationships among the degraded domain, the +rough normal domain, and the normal domain through a two-stage diffusion +inference process. Subsequently, we transfer the final calibration process to +the wavelet low-frequency domain using discrete wavelet transform, performing +fine-grained calibration from a frequency domain perspective by leveraging +task-specific frequency spaces. To improve restoration quality, we design a +feature gain module for the decomposed wavelet high-frequency domain to +eliminate redundant features. Additionally, we employ multimodal textual +prompts and Fourier transform to drive stable denoising and reduce randomness +during the inference process. After extensive validation, CycleRDM can be +effectively generalized to a wide range of image restoration and enhancement +tasks while requiring only a small number of training samples to be +significantly superior on various benchmarks of reconstruction quality and +perceptual quality. The source code will be available at +https://github.com/hejh8/CycleRDM. + +
+
+
+
+
+ + ☆ Robust PCA Based on Adaptive Weighted Least Squares and Low-Rank Matrix + Factorization + + +
+ Robust Principal Component Analysis (RPCA) is a fundamental technique for decomposing data into low-rank and sparse components, which plays a critical role for applications such as image processing and anomaly detection. Traditional RPCA methods commonly use $\ell_1$ norm regularization to enforce sparsity, but this approach can introduce bias and result in suboptimal estimates, particularly in the presence of significant noise or outliers. Non-convex regularization methods have been proposed to mitigate these challenges, but they tend to be complex to optimize and sensitive to initial conditions, leading to potential instability in solutions. To overcome these challenges, in this paper, we propose a novel RPCA model that integrates adaptive weighted least squares (AWLS) and low-rank matrix factorization (LRMF). The model employs a self-attention-inspired mechanism in its weight update process, allowing the weight matrix to dynamically adjust and emphasize significant components during each iteration. By employing a weighted F-norm for the sparse component, our method effectively reduces bias while simplifying the computational process compared to traditional $\ell_1$-norm-based methods. We use an alternating minimization algorithm, where each subproblem has an explicit solution, thereby improving computational efficiency. Despite its simplicity, numerical experiments demonstrate that our method outperforms existing non-convex regularization approaches, offering superior performance and stability, as well as enhanced accuracy and robustness in practical applications.
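A sketch of the alternating scheme under stated assumptions: a truncated SVD stands in for the low-rank factorization step, the sparse part comes from a closed-form weighted least-squares update, and the weights are re-estimated from the residual each iteration with a simple rule rather than the paper's self-attention-inspired update:

```python
import numpy as np

def awls_rpca(D, rank=2, lam=1.0, iters=30, eps=1e-2):
    """Decompose D ~ L + S with L low-rank and S sparse, by alternating minimization.
    The S-step minimizes ||D - L - S||_F^2 + lam * sum(w_ij * S_ij^2) in closed form."""
    S = np.zeros_like(D)
    for _ in range(iters):
        # Low-rank step: best rank-r fit to D - S (stands in for the U V^T factorization).
        U, sv, Vt = np.linalg.svd(D - S, full_matrices=False)
        L = (U[:, :rank] * sv[:rank]) @ Vt[:rank]
        # Adaptive weights: large residuals get tiny weights, so outliers flow into S.
        R = D - L
        w = 1.0 / (R**2 + eps)
        S = R / (1.0 + lam * w)          # closed-form minimizer of the weighted S-step
    return L, S

rng = np.random.default_rng(0)
true_L = rng.normal(size=(60, 2)) @ rng.normal(size=(2, 40))
outliers = rng.random((60, 40)) < 0.05
true_S = np.where(outliers, rng.normal(scale=10.0, size=(60, 40)), 0.0)

L, S = awls_rpca(true_L + true_S)
print("relative error of recovered low-rank part:",
      np.linalg.norm(L - true_L) / np.linalg.norm(true_L))
```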
+
+
+
+
+ + ☆ Qua$^2$SeDiMo: Quantifiable Quantization Sensitivity of Diffusion Models AAAI 2025 + + +
+ Diffusion Models (DM) have democratized AI image generation through an +iterative denoising process. Quantization is a major technique to alleviate the +inference cost and reduce the size of DM denoiser networks. However, as +denoisers evolve from variants of convolutional U-Nets toward newer Transformer +architectures, it is of growing importance to understand the quantization +sensitivity of different weight layers, operations and architecture types to +performance. In this work, we address this challenge with Qua$^2$SeDiMo, a +mixed-precision Post-Training Quantization framework that generates explainable +insights on the cost-effectiveness of various model weight quantization methods +for different denoiser operation types and block structures. We leverage these +insights to make high-quality mixed-precision quantization decisions for a +myriad of diffusion models ranging from foundational U-Nets to state-of-the-art +Transformers. As a result, Qua$^2$SeDiMo can construct 3.4-bit, 3.9-bit, +3.65-bit and 3.7-bit weight quantization on PixArt-${\alpha}$, +PixArt-${\Sigma}$, Hunyuan-DiT and SDXL, respectively. We further pair our +weight-quantization configurations with 6-bit activation quantization and +outperform existing approaches in terms of quantitative metrics and generative +image quality. + +
+
+ comment: AAAI 2025; version includes supplementary material; 22 Pages, 18 + Figures, 8 Tables +
+
+
+
+
+ + ☆ FRIDAY: Mitigating Unintentional Facial Identity in Deepfake Detectors + Guided by Facial Recognizers + + +
+ Previous Deepfake detection methods perform well within their training +domains, but their effectiveness diminishes significantly with new synthesis +techniques. Recent studies have revealed that detection models often create +decision boundaries based on facial identity rather than synthetic artifacts, +resulting in poor performance on cross-domain datasets. To address this +limitation, we propose Facial Recognition Identity Attenuation (FRIDAY), a +novel training method that mitigates facial identity influence using a face +recognizer. Specifically, we first train a face recognizer using the same +backbone as the Deepfake detector. The recognizer is then frozen and employed +during the detector's training to reduce facial identity information. This is +achieved by feeding input images into both the recognizer and the detector, and +minimizing the similarity of their feature embeddings through our Facial +Identity Attenuating loss. This process encourages the detector to generate +embeddings distinct from the recognizer, effectively reducing the impact of +facial identity. Extensive experiments demonstrate that our approach +significantly enhances detection performance on both in-domain and cross-domain +datasets. + +
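A sketch of the training signal: the detector is trained with the usual classification loss plus a penalty on the cosine similarity between its embedding and that of a frozen face recognizer, so identity information is attenuated. The toy backbones and the weighting factor are assumptions, not the FRIDAY implementation:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def make_backbone():
    return nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 128), nn.ReLU(),
                         nn.Linear(128, 128))

recognizer = make_backbone()              # pre-trained face recognizer, frozen here
for p in recognizer.parameters():
    p.requires_grad_(False)

detector_body = make_backbone()           # detector shares the same backbone family
detector_head = nn.Linear(128, 2)         # real vs. fake

def friday_style_loss(images, labels, alpha=0.5):
    det_emb = detector_body(images)
    logits = detector_head(det_emb)
    with torch.no_grad():
        id_emb = recognizer(images)
    ce = F.cross_entropy(logits, labels)
    # Identity attenuation: discourage the detector embedding from aligning with identity.
    id_sim = F.cosine_similarity(det_emb, id_emb, dim=-1).abs().mean()
    return ce + alpha * id_sim

loss = friday_style_loss(torch.randn(8, 3, 32, 32), torch.randint(0, 2, (8,)))
loss.backward()
print(loss.item())
```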
+
+ comment: 5 pages, 4 figures. In 2024 IEEE International Conference on Visual + Communications and Image Processing (VCIP) Oral +
+
+
+
+
+ + ☆ Pitfalls of topology-aware image segmentation + + +
+ Topological correctness, i.e., the preservation of structural integrity and +specific characteristics of shape, is a fundamental requirement for medical +imaging tasks, such as neuron or vessel segmentation. Despite the recent surge +in topology-aware methods addressing this challenge, their real-world +applicability is hindered by flawed benchmarking practices. In this paper, we +identify critical pitfalls in model evaluation that include inadequate +connectivity choices, overlooked topological artifacts in ground truth +annotations, and inappropriate use of evaluation metrics. Through detailed +empirical analysis, we uncover these issues' profound impact on the evaluation +and ranking of segmentation methods. Drawing from our findings, we propose a +set of actionable recommendations to establish fair and robust evaluation +standards for topology-aware medical image segmentation methods. + +
+
+ comment: Code is available at + https://github.com/AlexanderHBerger/topo-pitfalls +
+
+
+
+
+ + ☆ HarmonicEval: Multi-modal, Multi-task, Multi-criteria Automatic + Evaluation Using a Vision Language Model + + +
+ Vision-language models (VLMs) have shown impressive abilities in text and +image understanding. However, existing metrics for evaluating the text +generated by VLMs focus exclusively on overall quality, leading to two +limitations: 1) it is challenging to identify which aspects of the text need +improvement from the overall score; 2) metrics may overlook specific evaluation +criteria when predicting an overall score. To address these limitations, we +propose HarmonicEval, a reference-free evaluation metric that aggregates +criterion-wise scores to produce the overall score in a bottom-up manner. +Furthermore, we construct the Multi-task Multi-criteria Human Evaluation (MMHE) +dataset, which comprises 18,000 expert human judgments across four +vision-language tasks. Our experiments demonstrate that HarmonicEval achieves +higher correlations with human judgments than conventional metrics while +providing numerical scores for each criterion. + +
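A sketch of bottom-up aggregation from criterion-wise scores; the harmonic mean used here is one plausible reading of the metric's name and is an assumption, not necessarily the paper's aggregation rule:

```python
from statistics import harmonic_mean

def overall_score(criterion_scores):
    """criterion_scores: dict of criterion -> score in [1, 5]. Aggregated bottom-up;
    a harmonic mean penalizes text that fails badly on any single criterion."""
    return harmonic_mean(list(criterion_scores.values()))

scores = {"fluency": 5, "faithfulness": 2, "coverage": 4, "conciseness": 4}
print(overall_score(scores))   # dragged down by the weak faithfulness score
```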
+
+
+
+
+ + ☆ Successive optimization of optics and post-processing with + differentiable coherent PSF operator and field information + + +
+ Recently, the joint design of optical systems and downstream algorithms is
+showing significant potential. However, existing rays-described methods are
+limited to optimizing geometric degradation, making it difficult to fully
+represent the optical characteristics of complex, miniaturized lenses
+constrained by wavefront aberration or diffraction effects. In this work, we
+introduce a precise optical simulation model in which every operation in the
+pipeline is differentiable. This model employs a novel initial value strategy
+to enhance the reliability of intersection calculation on high aspherics.
+Moreover, it utilizes a differential operator to reduce memory consumption
+during coherent point spread function calculations. To efficiently address
+various degradations, we design a joint optimization procedure that leverages
+field information. Guided by a general restoration network, the proposed method
+not only enhances the image quality, but also successively improves the optical
+performance across multiple lenses that are already at a professional level.
+This joint optimization pipeline offers innovative insights into the practical
+design of sophisticated optical systems and post-processing algorithms. The
+source code will be made publicly available at
+https://github.com/Zrr-ZJU/Successive-optimization
+
+
+
+
+
+
+ + ☆ Can We Get Rid of Handcrafted Feature Extractors? SparseViT: + Nonsemantics-Centered, Parameter-Efficient Image Manipulation Localization + Through Spare-Coding Transformer AAAI + + +
+ Non-semantic features or semantic-agnostic features, which are irrelevant to
+image context but sensitive to image manipulations, are recognized as
+evidential to Image Manipulation Localization (IML). Since manually labeling
+non-semantic features is infeasible, existing works rely on handcrafted methods
+to extract non-semantic features. Handcrafted non-semantic features jeopardize
+the IML model's generalization ability in unseen or complex scenarios.
+Therefore, for IML, the elephant in the room is: How to adaptively extract
+non-semantic features? Non-semantic features are context-irrelevant and
+manipulation-sensitive. That is, within an image, they are consistent across
+patches unless manipulation occurs. Then, sparse and discrete interactions
+among image patches are sufficient for extracting non-semantic features.
+However, image semantics vary drastically on different patches, requiring dense
+and continuous interactions among image patches for learning semantic
+representations. Hence, in this paper, we propose a Sparse Vision Transformer
+(SparseViT), which reformulates the dense, global self-attention in ViT into a
+sparse, discrete manner. Such sparse self-attention breaks image semantics and
+forces SparseViT to adaptively extract non-semantic features for images.
+Besides, compared with existing IML models, the sparse self-attention mechanism
+greatly reduces computation (by up to 80% in FLOPs), achieving stunning
+parameter efficiency and computation reduction. Extensive experiments
+demonstrate that, without any handcrafted feature extractors, SparseViT is
+superior in both generalization and efficiency across benchmark datasets.
+
+
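+ To make the "sparse, discrete" patch interactions concrete, the toy sketch
+below lets each query token attend only to a strided subset of key/value
+tokens instead of all of them; the stride-based selection is an illustrative
+simplification, not SparseViT's actual sparse self-attention design.
+
+    import torch
+
+    def strided_sparse_attention(q, k, v, stride=4):
+        # q, k, v: (batch, tokens, dim); keep every `stride`-th key/value token
+        keep = torch.arange(0, k.shape[1], stride, device=k.device)
+        k_s, v_s = k[:, keep], v[:, keep]
+        attn = torch.softmax(q @ k_s.transpose(1, 2) / q.shape[-1] ** 0.5, dim=-1)
+        return attn @ v_s
+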
+
+ comment: 12 pages, 8 figures, published to AAAI
+
+
+
+
+
+ + ☆ LDP: Generalizing to Multilingual Visual Information Extraction by + Language Decoupled Pretraining AAAI2025 + + +
+ Visual Information Extraction (VIE) plays a crucial role in the comprehension
+of semi-structured documents, and several pre-trained models have been
+developed to enhance performance. However, most of these works are monolingual
+(usually English). Due to the extremely unbalanced quantity and quality of
+pre-training corpora between English and other languages, few works can extend
+to non-English scenarios. In this paper, we conduct systematic experiments to
+show that the vision and layout modalities hold invariance among images with
+different languages. If language bias is decoupled from document images, a
+vision-layout-based model can achieve impressive cross-lingual generalization.
+Accordingly, we present a simple but effective multilingual training paradigm
+LDP (Language Decoupled Pre-training) for better utilization of monolingual
+pre-training data. Our proposed model LDM (Language Decoupled Model) is first
+pre-trained on the language-independent data, where the language knowledge is
+decoupled by a diffusion model, and then the LDM is fine-tuned on the
+downstream languages. Extensive experiments show that LDM outperforms all SOTA
+multilingual pre-trained models and also maintains competitiveness on
+downstream monolingual/English benchmarks.
+
+
+
+ comment: Accepted by AAAI2025 +
+
+
+
+
+ + ☆ Multi-Sensor Object Anomaly Detection: Unifying Appearance, Geometry, + and Internal Properties + + +
+ Object anomaly detection is essential for industrial quality inspection, yet +traditional single-sensor methods face critical limitations. They fail to +capture the wide range of anomaly types, as single sensors are often +constrained to either external appearance, geometric structure, or internal +properties. To overcome these challenges, we introduce MulSen-AD, the first +high-resolution, multi-sensor anomaly detection dataset tailored for industrial +applications. MulSen-AD unifies data from RGB cameras, laser scanners, and +lock-in infrared thermography, effectively capturing external appearance, +geometric deformations, and internal defects. The dataset spans 15 industrial +products with diverse, real-world anomalies. We also present MulSen-AD Bench, a +benchmark designed to evaluate multi-sensor methods, and propose +MulSen-TripleAD, a decision-level fusion algorithm that integrates these three +modalities for robust, unsupervised object anomaly detection. Our experiments +demonstrate that multi-sensor fusion substantially outperforms single-sensor +approaches, achieving 96.1% AUROC in object-level detection accuracy. These +results highlight the importance of integrating multi-sensor data for +comprehensive industrial anomaly detection. + +
+
+
+
+
+ + ☆ Spike2Former: Efficient Spiking Transformer for High-performance Image + Segmentation + + +
+ Spiking Neural Networks (SNNs) have a low-power advantage but perform poorly +in image segmentation tasks. The reason is that directly converting neural +networks with complex architectural designs for segmentation tasks into spiking +versions leads to performance degradation and non-convergence. To address this +challenge, we first identify the modules in the architecture design that lead +to the severe reduction in spike firing, make targeted improvements, and +propose Spike2Former architecture. Second, we propose normalized integer +spiking neurons to solve the training stability problem of SNNs with complex +architectures. We set a new state-of-the-art for SNNs in various semantic +segmentation datasets, with a significant improvement of +12.7% mIoU and 5.0 +efficiency on ADE20K, +14.3% mIoU and 5.2 efficiency on VOC2012, and +9.1% mIoU +and 6.6 efficiency on CityScapes. + +
+
+ comment: This work has been accepted at the Association for the Advancement
+ of Artificial Intelligence (AAAI) 2025 conference
+
+
+
+
+
+ + ☆ HiCM$^2$: Hierarchical Compact Memory Modeling for Dense Video + Captioning AAAI2025 + + +
+ With the growing demand for solutions to real-world video challenges, +interest in dense video captioning (DVC) has been on the rise. DVC involves the +automatic captioning and localization of untrimmed videos. Several studies +highlight the challenges of DVC and introduce improved methods utilizing prior +knowledge, such as pre-training and external memory. In this research, we +propose a model that leverages the prior knowledge of human-oriented +hierarchical compact memory inspired by human memory hierarchy and cognition. +To mimic human-like memory recall, we construct a hierarchical memory and a +hierarchical memory reading module. We build an efficient hierarchical compact +memory by employing clustering of memory events and summarization using large +language models. Comparative experiments demonstrate that this hierarchical +memory recall process improves the performance of DVC by achieving +state-of-the-art performance on YouCook2 and ViTT datasets. + +
+
+ comment: AAAI2025 +
+
+
+
+
+ + ☆ DiffSim: Taming Diffusion Models for Evaluating Visual Similarity + + +
+ Diffusion models have fundamentally transformed the field of generative +models, making the assessment of similarity between customized model outputs +and reference inputs critically important. However, traditional perceptual +similarity metrics operate primarily at the pixel and patch levels, comparing +low-level colors and textures but failing to capture mid-level similarities and +differences in image layout, object pose, and semantic content. Contrastive +learning-based CLIP and self-supervised learning-based DINO are often used to +measure semantic similarity, but they highly compress image features, +inadequately assessing appearance details. This paper is the first to discover +that pretrained diffusion models can be utilized for measuring visual +similarity and introduces the DiffSim method, addressing the limitations of +traditional metrics in capturing perceptual consistency in custom generation +tasks. By aligning features in the attention layers of the denoising U-Net, +DiffSim evaluates both appearance and style similarity, showing superior +alignment with human visual preferences. Additionally, we introduce the Sref +and IP benchmarks to evaluate visual similarity at the level of style and +instance, respectively. Comprehensive evaluations across multiple benchmarks +demonstrate that DiffSim achieves state-of-the-art performance, providing a +robust tool for measuring visual coherence in generative models. + +
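+ As a conceptual sketch only (layer choice and tensor layout are placeholders,
+not DiffSim's implementation), features taken from the same attention layer
+and timestep of a denoising network for two images can be compared by their
+average token-wise cosine similarity:
+
+    import torch
+    import torch.nn.functional as F
+
+    def feature_similarity(feat_a: torch.Tensor, feat_b: torch.Tensor) -> float:
+        # feat_*: (tokens, channels) features from the same attention layer
+        a = F.normalize(feat_a, dim=-1)
+        b = F.normalize(feat_b, dim=-1)
+        return (a * b).sum(dim=-1).mean().item()
+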
+
+
+
+
+ + ☆ GSRender: Deduplicated Occupancy Prediction via Weakly Supervised 3D + Gaussian Splatting + + +
+ 3D occupancy perception is gaining increasing attention due to its capability
+to offer detailed and precise environment representations. Previous
+weakly-supervised NeRF methods balance efficiency and accuracy, with mIoU
+varying by 5-10 points depending on the sampling count along camera rays.
+Recently, real-time Gaussian splatting has gained widespread popularity in 3D
+reconstruction, and the occupancy prediction task can also be viewed as a
+reconstruction task. Consequently, we propose GSRender, which naturally employs
+3D Gaussian Splatting for occupancy prediction, simplifying the sampling
+process. In addition, the limitations of 2D supervision result in duplicate
+predictions along the same camera ray. We implement the Ray Compensation (RC)
+module, which mitigates this issue by compensating for features from adjacent
+frames. Finally, we redesign the loss to eliminate the impact of dynamic
+objects from adjacent frames. Extensive experiments demonstrate that our
+approach achieves SOTA (state-of-the-art) results in RayIoU (+6.0), while
+narrowing the gap with 3D supervision methods. Our code will be released soon.
+
+
+
+
+
+
+ + ☆ Alignment-Free RGB-T Salient Object Detection: A Large-scale Dataset and + Progressive Correlation Network AAAI 2025 + + +
+ Alignment-free RGB-Thermal (RGB-T) salient object detection (SOD) aims to
+achieve robust performance in complex scenes by directly leveraging the
+complementary information from unaligned visible-thermal image pairs, without
+requiring manual alignment. However, the labor-intensive process of collecting
+and annotating image pairs limits the scale of existing benchmarks, hindering
+the advancement of alignment-free RGB-T SOD. In this paper, we construct a
+large-scale and high-diversity unaligned RGB-T SOD dataset named UVT20K,
+comprising 20,000 image pairs, 407 scenes, and 1256 object categories. All
+samples are collected from real-world scenarios with various challenges, such
+as low illumination, image clutter, complex salient objects, and so on. To
+support the exploration for further research, each sample in UVT20K is
+annotated with a comprehensive set of ground truths, including saliency masks,
+scribbles, boundaries, and challenge attributes. In addition, we propose a
+Progressive Correlation Network (PCNet), which models inter- and intra-modal
+correlations on the basis of explicit alignment to achieve accurate predictions
+in unaligned image pairs. Extensive experiments conducted on unaligned and
+aligned datasets demonstrate the effectiveness of our method. Code and dataset
+are available at https://github.com/Angknpng/PCNet.
+
+
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ SCKD: Semi-Supervised Cross-Modality Knowledge Distillation for 4D Radar + Object Detection AAAI 2025 + + +
+ 3D object detection is one of the fundamental perception tasks for autonomous
+vehicles. Fulfilling such a task with a 4D millimeter-wave radar is very
+attractive since the sensor is able to acquire 3D point clouds similar to Lidar
+while maintaining robust measurements under adverse weather. However, due to
+the high sparsity and noise associated with the radar point clouds, the
+performance of the existing methods is still much lower than expected. In this
+paper, we propose a novel Semi-supervised Cross-modality Knowledge Distillation
+(SCKD) method for 4D radar-based 3D object detection. It distills feature-level
+knowledge from a Lidar-radar-fused teacher network through semi-supervised
+distillation. We first propose an adaptive fusion module in the teacher network
+to boost its performance. Then, two feature distillation modules are designed
+to facilitate the cross-modality knowledge transfer. Finally, a semi-supervised
+output distillation is proposed to increase the effectiveness and flexibility
+of the distillation framework. With the same network structure, our radar-only
+student trained by SCKD boosts the mAP by 10.38% over the baseline and
+outperforms the state-of-the-art works on the VoD dataset. The experiment on
+ZJUODset also shows 5.12% mAP improvements on the moderate difficulty level
+over the baseline when extra unlabeled data are available. Code is available at
+https://github.com/Ruoyu-Xu/SCKD.
+
+
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ Improving Geometry in Sparse-View 3DGS via Reprojection-based DoF + Separation + + +
+ Recent learning-based Multi-View Stereo models have demonstrated +state-of-the-art performance in sparse-view 3D reconstruction. However, +directly applying 3D Gaussian Splatting (3DGS) as a refinement step following +these models presents challenges. We hypothesize that the excessive positional +degrees of freedom (DoFs) in Gaussians induce geometry distortion, fitting +color patterns at the cost of structural fidelity. To address this, we propose +reprojection-based DoF separation, a method distinguishing positional DoFs in +terms of uncertainty: image-plane-parallel DoFs and ray-aligned DoF. To +independently manage each DoF, we introduce a reprojection process along with +tailored constraints for each DoF. Through experiments across various datasets, +we confirm that separating the positional DoFs of Gaussians and applying +targeted constraints effectively suppresses geometric artifacts, producing +reconstruction results that are both visually and geometrically plausible. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ GBRIP: Granular Ball Representation for Imbalanced Partial Label + Learning AAAI25 + + +
+ Partial label learning (PLL) is a complicated weakly supervised
+multi-classification task compounded by class imbalance. Currently, existing
+methods only rely on inter-class pseudo-labeling from inter-class features,
+often overlooking the significant impact of the intra-class imbalanced features
+combined with the inter-class ones. To address these limitations, we introduce
+Granular Ball Representation for Imbalanced PLL (GBRIP), a novel framework for
+imbalanced PLL. GBRIP utilizes coarse-grained granular ball representation and
+multi-center loss to construct a granular ball-based feature space through
+unsupervised learning, effectively capturing the feature distribution within
+each class. GBRIP mitigates the impact of confusing features by systematically
+refining label disambiguation and estimating imbalance distributions. The novel
+multi-center loss function enhances learning by emphasizing the relationships
+between samples and their respective centers within the granular balls.
+Extensive experiments on standard benchmarks demonstrate that GBRIP outperforms
+existing state-of-the-art methods, offering a robust solution to the challenges
+of imbalanced PLL.
+
+
+
+ comment: AAAI25 +
+
+
+
+
+ + ☆ ScaMo: Exploring the Scaling Law in Autoregressive Motion Generation + Model + + +
+ The scaling law has been validated in various domains, such as natural
+language processing (NLP) and massive computer vision tasks; however, its
+application to motion generation remains largely unexplored. In this paper, we
+introduce a scalable motion generation framework that includes the motion
+tokenizer Motion FSQ-VAE and a text-prefix autoregressive transformer. Through
+comprehensive experiments, we observe the scaling behavior of this system. For
+the first time, we confirm the existence of scaling laws within the context of
+motion generation. Specifically, our results demonstrate that the normalized
+test loss of our prefix autoregressive models adheres to a logarithmic law in
+relation to compute budgets. Furthermore, we also confirm power laws relating
+Non-Vocabulary Parameters, Vocabulary Parameters, and Data Tokens to compute
+budgets. Leveraging the scaling law, we predict the optimal transformer size,
+vocabulary size, and data requirements for a compute budget of $1e18$. The test
+loss of the system, when trained with the optimal model size, vocabulary size,
+and required data, aligns precisely with the predicted test loss, thereby
+validating the scaling law.
+
+
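+ As a toy illustration of the reported functional form (using made-up numbers,
+not the paper's measurements), such a logarithmic law can be fitted by
+regressing the test loss on log10 of the compute budget:
+
+    import numpy as np
+
+    compute = np.array([1e15, 1e16, 1e17, 1e18])   # hypothetical budgets
+    loss = np.array([1.20, 1.05, 0.92, 0.80])      # hypothetical test losses
+    a, b = np.polyfit(np.log10(compute), loss, deg=1)  # loss ~ a*log10(C) + b
+    print(f"loss(C) ~= {a:.3f} * log10(C) + {b:.3f}")
+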
+
+
+
+
+ + ☆ Bright-NeRF:Brightening Neural Radiance Field with Color Restoration + from Low-light Raw Images AAAI2025 + + +
+ Neural Radiance Fields (NeRFs) have demonstrated prominent performance in +novel view synthesis. However, their input heavily relies on image acquisition +under normal light conditions, making it challenging to learn accurate scene +representation in low-light environments where images typically exhibit +significant noise and severe color distortion. To address these challenges, we +propose a novel approach, Bright-NeRF, which learns enhanced and high-quality +radiance fields from multi-view low-light raw images in an unsupervised manner. +Our method simultaneously achieves color restoration, denoising, and enhanced +novel view synthesis. Specifically, we leverage a physically-inspired model of +the sensor's response to illumination and introduce a chromatic adaptation loss +to constrain the learning of response, enabling consistent color perception of +objects regardless of lighting conditions. We further utilize the raw data's +properties to expose the scene's intensity automatically. Additionally, we have +collected a multi-view low-light raw image dataset to advance research in this +field. Experimental results demonstrate that our proposed method significantly +outperforms existing 2D and 3D approaches. Our code and dataset will be made +publicly available. + +
+
+ comment: Accepted by AAAI2025 +
+
+
+
+
+
+ ☆ S$^3$-Mamba: Small-Size-Sensitive Mamba for Lesion Segmentation AAAI 2025
+
+
+
+ Small lesions play a critical role in early disease diagnosis and
+intervention of severe infections. Popular models often face challenges in
+segmenting small lesions, as they occupy only a minor portion of an image,
+while down-sampling operations may inevitably lose focus on local features of
+small lesions. To tackle the challenges, we propose a Small-Size-Sensitive
+Mamba (S$^3$-Mamba), which promotes the sensitivity to small lesions across
+three dimensions: channel, spatial, and training strategy. Specifically, an
+Enhanced Visual State Space block is designed to focus on small lesions through
+multiple residual connections to preserve local features, and selectively
+amplify important details while suppressing irrelevant ones through
+channel-wise attention. A Tensor-based Cross-feature Multi-scale Attention is
+designed to integrate input image features and intermediate-layer features with
+edge features and exploit the attentive support of features across multiple
+scales, thereby retaining spatial details of small lesions at various
+granularities. Finally, we introduce a novel regularized curriculum learning to
+automatically assess lesion size and sample difficulty, and gradually focus
+from easy samples to hard ones like small lesions. Extensive experiments on
+three medical image segmentation datasets show the superiority of our
+S$^3$-Mamba, especially in segmenting small lesions. Our code is available at
+https://github.com/ErinWang2023/S3-Mamba.
+
+
+
+ comment: Accepted by AAAI 2025
+
+
+
+
+
+ + ☆ Summary of Point Transformer with Federated Learning for Predicting + Breast Cancer HER2 Status from Hematoxylin and Eosin-Stained Whole Slide + Images + + +
+ This study introduces a federated learning-based approach to predict HER2 +status from hematoxylin and eosin (HE)-stained whole slide images (WSIs), +reducing costs and speeding up treatment decisions. To address label imbalance +and feature representation challenges in multisite datasets, a point +transformer is proposed, incorporating dynamic label distribution, an auxiliary +classifier, and farthest cosine sampling. Extensive experiments demonstrate +state-of-the-art performance across four sites (2687 WSIs) and strong +generalization to two unseen sites (229 WSIs). + +
+
+
+
+
+ + ♻ ☆ Tracing the Roots: Leveraging Temporal Dynamics in Diffusion + Trajectories for Origin Attribution + + +
+ Diffusion models have revolutionized image synthesis, garnering significant +research interest in recent years. Diffusion is an iterative algorithm in which +samples are generated step-by-step, starting from pure noise. This process +introduces the notion of diffusion trajectories, i.e., paths from the standard +Gaussian distribution to the target image distribution. In this context, we +study discriminative algorithms operating on these trajectories. Specifically, +given a pre-trained diffusion model, we consider the problem of classifying +images as part of the training dataset, generated by the model or originating +from an external source. Our approach demonstrates the presence of patterns +across steps that can be leveraged for classification. We also conduct ablation +studies, which reveal that using higher-order gradient features to characterize +the trajectories leads to significant performance gains and more robust +algorithms. + +
+
+
+
+
+ + ♻ ☆ Does VLM Classification Benefit from LLM Description Semantics? AAAI-25 + + +
+ Accurately describing images with text is a foundation of explainable AI. +Vision-Language Models (VLMs) like CLIP have recently addressed this by +aligning images and texts in a shared embedding space, expressing semantic +similarities between vision and language embeddings. VLM classification can be +improved with descriptions generated by Large Language Models (LLMs). However, +it is difficult to determine the contribution of actual description semantics, +as the performance gain may also stem from a semantic-agnostic ensembling +effect, where multiple modified text prompts act as a noisy test-time +augmentation for the original one. We propose an alternative evaluation +scenario to decide if a performance boost of LLM-generated descriptions is +caused by such a noise augmentation effect or rather by genuine description +semantics. The proposed scenario avoids noisy test-time augmentation and +ensures that genuine, distinctive descriptions cause the performance boost. +Furthermore, we propose a training-free method for selecting discriminative +descriptions that work independently of classname-ensembling effects. Our +approach identifies descriptions that effectively differentiate classes within +a local CLIP label neighborhood, improving classification accuracy across seven +datasets. Additionally, we provide insights into the explainability of +description-based image classification with VLMs. + +
+
+ comment: AAAI-25 (extended version), Code: https://github.com/CompVis/DisCLIP +
+
+
+
+
+ + ♻ ☆ DepthFM: Fast Monocular Depth Estimation with Flow Matching AAAI 2025 + + +
+ Current discriminative depth estimation methods often produce blurry +artifacts, while generative approaches suffer from slow sampling due to +curvatures in the noise-to-depth transport. Our method addresses these +challenges by framing depth estimation as a direct transport between image and +depth distributions. We are the first to explore flow matching in this field, +and we demonstrate that its interpolation trajectories enhance both training +and sampling efficiency while preserving high performance. While generative +models typically require extensive training data, we mitigate this dependency +by integrating external knowledge from a pre-trained image diffusion model, +enabling effective transfer even across differing objectives. To further boost +our model performance, we employ synthetic data and utilize image-depth pairs +generated by a discriminative model on an in-the-wild image dataset. As a +generative model, our model can reliably estimate depth confidence, which +provides an additional advantage. Our approach achieves competitive zero-shot +performance on standard benchmarks of complex natural scenes while improving +sampling efficiency and only requiring minimal synthetic data for training. + +
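+ For readers unfamiliar with flow matching, the sketch below shows a generic
+training step in its spirit: sample a point on a straight path between a
+source sample and the target depth map and regress the constant velocity of
+that path. The model interface, the conditioning, and the Gaussian source are
+illustrative assumptions, not DepthFM's actual formulation or code.
+
+    import torch
+
+    def flow_matching_step(model, image, depth):
+        t = torch.rand(image.shape[0], 1, 1, 1, device=image.device)
+        x0 = torch.randn_like(depth)       # source sample (assumed Gaussian here)
+        xt = (1 - t) * x0 + t * depth      # point on the straight path at time t
+        velocity_target = depth - x0       # constant velocity along that path
+        pred = model(xt, t, cond=image)    # hypothetical conditional interface
+        return ((pred - velocity_target) ** 2).mean()
+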
+
+ comment: AAAI 2025, Project Page: https://github.com/CompVis/depth-fm +
+
+
+
+
+ + ♻ ☆ Metric Compatible Training for Online Backfilling in Large-Scale + Retrieval + + +
+ Backfilling is the process of re-extracting all gallery embeddings from
+upgraded models in image retrieval systems. It inevitably incurs a
+prohibitively large computational cost and may even entail downtime of the
+service. Although backward-compatible learning sidesteps this challenge by
+tackling query-side representations, this leads to suboptimal solutions in
+principle because gallery embeddings cannot benefit from model upgrades. We
+address this dilemma by introducing an online backfilling algorithm, which
+enables us to achieve a progressive performance improvement during the
+backfilling process while not sacrificing the final performance of the new
+model after the completion of backfilling. To this end, we first propose a
+simple distance rank merge technique for online backfilling. Then, we
+incorporate a reverse transformation module for more effective and efficient
+merging, which is further enhanced by adopting a metric-compatible contrastive
+learning approach. These two components help to make the distances of old and
+new models compatible, resulting in desirable merge results during backfilling
+with no extra computational overhead. Extensive experiments show the
+effectiveness of our framework on four standard benchmarks in various settings.
+
+
+
+
+
+
+ + ♻ ☆ A Deep Learning-Based and Fully Automated Pipeline for Regurgitant + Mitral Valve Anatomy Analysis from 3D Echocardiography + + +
+ 3D transesophageal echocardiography (3DTEE) is the recommended method for
+diagnosing mitral regurgitation (MR). 3DTEE provides a high-quality 3D image of
+the mitral valve (MV), allowing for precise segmentation and measurement of the
+regurgitant valve anatomy. However, manual TEE segmentations are time-consuming
+and prone to intra-operator variability, affecting the reliability of the
+measurements. To address this, we developed a fully automated pipeline using a
+3D convolutional neural network (CNN) to segment MV substructures (annulus,
+anterior leaflet, and posterior leaflet) and quantify MV anatomy. The 3D CNN,
+based on a multi-decoder residual U-Net architecture, was trained and tested on
+a dataset comprising 100 3DTEE images with corresponding segmentations. Within
+the pipeline, a custom algorithm refines the CNN-based segmentations and
+extracts MV models, from which anatomical landmarks and features are
+quantified. The accuracy of the proposed method was assessed using Dice score
+and mean surface distance (MSD) against ground truth segmentations, and the
+extracted anatomical parameters were compared against the semi-automated
+commercial software TomTec Image Arena. The trained 3D CNN achieved an average
+Dice score of 0.79 and MSD of 0.47 mm for the combined segmentation of the
+annulus, anterior and posterior leaflet. The proposed CNN architecture
+outperformed a baseline residual U-Net architecture in MV substructure
+segmentation, and the refinement of the predicted annulus segmentation improved
+MSD by 8.36%. The annular and leaflet linear measurements differed by less than
+7.94 mm and 3.67 mm, respectively, compared to the 3D measurements obtained
+with TomTec Image Arena. The proposed pipeline was faster than the commercial
+software, with a modeling time of 12.54 s and a quantification time of 54.42 s.
+
+
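+ For reference, the Dice score reported above can be computed for a binary
+mask as in the generic snippet below (this is the standard definition, not
+code from the proposed pipeline):
+
+    import numpy as np
+
+    def dice_score(pred: np.ndarray, gt: np.ndarray) -> float:
+        pred, gt = pred.astype(bool), gt.astype(bool)
+        inter = np.logical_and(pred, gt).sum()
+        denom = pred.sum() + gt.sum()
+        return 2.0 * inter / denom if denom > 0 else 1.0
+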
+
+
+
+
+ + ♻ ☆ Optimized Gradient Clipping for Noisy Label Learning AAAI2025 + + +
+ Previous research has shown that constraining the gradient of loss function +with respect to model-predicted probabilities can enhance the model robustness +against noisy labels. These methods typically specify a fixed optimal threshold +for gradient clipping through validation data to obtain the desired robustness +against noise. However, this common practice overlooks the dynamic distribution +of gradients from both clean and noisy-labeled samples at different stages of +training, significantly limiting the model capability to adapt to the variable +nature of gradients throughout the training process. To address this issue, we +propose a simple yet effective approach called Optimized Gradient Clipping +(OGC), which dynamically adjusts the clipping threshold based on the ratio of +noise gradients to clean gradients after clipping, estimated by modeling the +distributions of clean and noisy samples. This approach allows us to modify the +clipping threshold at each training step, effectively controlling the influence +of noise gradients. Additionally, we provide statistical analysis to certify +the noise-tolerance ability of OGC. Our extensive experiments across various +types of label noise, including symmetric, asymmetric, instance-dependent, and +real-world noise, demonstrate the effectiveness of our approach. + +
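+ A minimal sketch of the underlying idea (not the authors' OGC code): clip the
+gradient of cross-entropy with respect to the predicted probability of the
+labeled class at a threshold tau, which caps the influence of low-confidence,
+likely-noisy samples. The adaptive adjustment of tau during training, which is
+the core of OGC, is omitted here.
+
+    import math
+    import torch
+
+    def gradient_clipped_ce(probs, targets, tau=10.0):
+        # probs: (batch, classes) softmax outputs; targets: (batch,) class indices
+        p = probs.gather(1, targets.unsqueeze(1)).squeeze(1).clamp_min(1e-8)
+        ce = -torch.log(p)
+        # For p < 1/tau, replace -log(p) by its tangent at p = 1/tau, so the
+        # gradient magnitude |d loss / d p| never exceeds tau.
+        linear = -tau * (p - 1.0 / tau) + math.log(tau)
+        return torch.where(p < 1.0 / tau, linear, ce).mean()
+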
+
+ comment: Accepted by AAAI2025 +
+
+
+
+
+ + ♻ ☆ G-VEval: A Versatile Metric for Evaluating Image and Video Captions + Using GPT-4o + + +
+ Evaluation metrics for visual captioning are important yet not thoroughly
+explored. Traditional metrics like BLEU, METEOR, CIDEr, and ROUGE often miss
+semantic depth, while trained metrics such as CLIP-Score, PAC-S, and Polos are
+limited in zero-shot scenarios. Advanced Language Model-based metrics also
+struggle with aligning to nuanced human preferences. To address these issues,
+we introduce G-VEval, a novel metric inspired by G-Eval and powered by the new
+GPT-4o. G-VEval uses chain-of-thought reasoning in large multimodal models and
+supports three modes: reference-free, reference-only, and combined,
+accommodating both video and image inputs. We also propose MSVD-Eval, a new
+dataset for video captioning evaluation, to establish a more transparent and
+consistent framework for both human experts and evaluation metrics. It is
+designed to address the lack of clear criteria in existing datasets by
+introducing distinct dimensions of Accuracy, Completeness, Conciseness, and
+Relevance (ACCR). Extensive results show that G-VEval outperforms existing
+methods in correlation with human annotations, as measured by Kendall tau-b and
+Kendall tau-c. This provides a flexible solution for diverse captioning tasks
+and suggests a straightforward yet effective approach for large language models
+to understand video content, paving the way for advancements in automated
+captioning. Codes are available at https://github.com/ztangaj/gveval
+
+
+
+
+
+
+ + ♻ ☆ ID-Sculpt: ID-aware 3D Head Generation from Single In-the-wild Portrait + Image AAAI 2025 + + +
+ While recent works have achieved great success on image-to-3D object +generation, high quality and fidelity 3D head generation from a single image +remains a great challenge. Previous text-based methods for generating 3D heads +were limited by text descriptions and image-based methods struggled to produce +high-quality head geometry. To handle this challenging problem, we propose a +novel framework, ID-Sculpt, to generate high-quality 3D heads while preserving +their identities. Our work incorporates the identity information of the +portrait image into three parts: 1) geometry initialization, 2) geometry +sculpting, and 3) texture generation stages. Given a reference portrait image, +we first align the identity features with text features to realize ID-aware +guidance enhancement, which contains the control signals representing the face +information. We then use the canny map, ID features of the portrait image, and +a pre-trained text-to-normal/depth diffusion model to generate ID-aware +geometry supervision, and 3D-GAN inversion is employed to generate ID-aware +geometry initialization. Furthermore, with the ability to inject identity +information into 3D head generation, we use ID-aware guidance to calculate +ID-aware Score Distillation (ISD) for geometry sculpting. For texture +generation, we adopt the ID Consistent Texture Inpainting and Refinement which +progressively expands the view for texture inpainting to obtain an +initialization UV texture map. We then use the ID-aware guidance to provide +image-level supervision for noisy multi-view images to obtain a refined texture +map. Extensive experiments demonstrate that we can generate high-quality 3D +heads with accurate geometry and texture from a single in-the-wild portrait +image. + +
+
+ comment: Accepted by AAAI 2025; Project page: + https://jinkun-hao.github.io/ID-Sculpt/ +
+
+
+
+
+ + ♻ ☆ SageAttention2: Efficient Attention with Thorough Outlier Smoothing and + Per-thread INT4 Quantization + + +
+ Although quantization for linear layers has been widely used, its application
+to accelerate the attention process remains limited. To further enhance the
+efficiency of attention computation compared to SageAttention while maintaining
+precision, we propose SageAttention2, which utilizes significantly faster 4-bit
+matrix multiplication (Matmul) alongside additional precision-enhancing
+techniques. First, we propose to quantize matrices $(Q, K)$ to INT4 at a
+hardware-friendly thread-level granularity and quantize matrices $(\widetilde
+P, V)$ to FP8. Second, we propose a method to smooth $Q$, enhancing the
+accuracy of INT4 $QK$. Third, we propose to use an FP32 Matmul buffer for $PV$
+to enhance the accuracy of FP8 $\widetilde PV$. The operations per second (OPS)
+of SageAttention2 surpass FlashAttention2 and xformers by about 3x and 5x on
+RTX4090, respectively. Comprehensive experiments confirm that our approach
+incurs negligible end-to-end metrics loss across diverse models, including
+those for large language processing, image generation, and video generation.
+The codes are available at https://github.com/thu-ml/SageAttention.
+
+
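+ Independent of SageAttention2's CUDA kernels, a plain symmetric INT4
+quantize/dequantize round trip looks like the sketch below; the per-thread
+granularity, the smoothing of Q, and the FP8/FP32 buffers are not reproduced
+here.
+
+    import torch
+
+    def quantize_int4(x):
+        # symmetric per-row quantization to the INT4 range [-8, 7]
+        scale = x.abs().amax(dim=-1, keepdim=True).clamp_min(1e-8) / 7.0
+        q = torch.clamp(torch.round(x / scale), -8, 7)
+        return q, scale
+
+    def dequantize(q, scale):
+        return q * scale
+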
+
+
+
+
+ + ♻ ☆ Real-Time Damage Detection in Fiber Lifting Ropes Using Lightweight + Convolutional Neural Networks + + +
+ The health and safety hazards posed by worn crane lifting ropes mandate +periodic inspection for damage. This task is time-consuming, prone to human +error, halts operation, and may result in the premature disposal of ropes. +Therefore, we propose using efficient deep learning and computer vision methods +to automate the process of detecting damaged ropes. Specifically, we present a +vision-based system for detecting damage in synthetic fiber rope images using +lightweight convolutional neural networks. We develop a camera-based apparatus +to photograph the lifting rope's surface, while in operation, and capture the +progressive wear-and-tear as well as the more significant degradation in the +rope's health state. Experts from Konecranes annotate the collected images in +accordance with the rope's condition; normal or damaged. Then, we pre-process +the images, systematically design a deep learning model, evaluate its detection +and prediction performance, analyze its computational complexity, and compare +it with various other models. Experimental results show the proposed model +outperforms other similar techniques with 96.5% accuracy, 94.8% precision, +98.3% recall, 96.5% F1-score, and 99.3% AUC. Besides, they demonstrate the +model's real-time operation, low memory footprint, robustness to various +environmental and operational conditions, and adequacy for deployment in +industrial applications such as lifting, mooring, towing, climbing, and +sailing. + +
+
+
+
+
+ + ♻ ☆ Cycle Pixel Difference Network for Crisp Edge Detection + + +
+ Edge detection, as a fundamental task in computer vision, has garnered +increasing attention. The advent of deep learning has significantly advanced +this field. However, recent deep learning-based methods generally face two +significant issues: 1) reliance on large-scale pre-trained weights, and 2) +generation of thick edges. We construct a U-shape encoder-decoder model named +CPD-Net that successfully addresses these two issues simultaneously. In +response to issue 1), we propose a novel cycle pixel difference convolution +(CPDC), which effectively integrates edge prior knowledge with modern +convolution operations, consequently successfully eliminating the dependence on +large-scale pre-trained weights. As for issue 2), we construct a multi-scale +information enhancement module (MSEM) and a dual residual connection-based +(DRC) decoder to enhance the edge location ability of the model, thereby +generating crisp and clean contour maps. Comprehensive experiments conducted on +four standard benchmarks demonstrate that our method achieves competitive +performance on the BSDS500 dataset (ODS=0.813 and AC=0.352), NYUD-V2 (ODS=0.760 +and AC=0.223), BIPED dataset (ODS=0.898 and AC=0.426), and CID (ODS=0.59). Our +approach provides a novel perspective for addressing these challenges in edge +detection. + +
+
+
+
+
+ + ♻ ☆ MonoPCC: Photometric-invariant Cycle Constraint for Monocular Depth + Estimation of Endoscopic Images + + +
+ Photometric constraint is indispensable for self-supervised monocular depth
+estimation. It involves warping a source image onto a target view using
+estimated depth and pose, and then minimizing the difference between the warped
+and target images. However, the endoscopic built-in light causes significant
+brightness fluctuations, and thus makes the photometric constraint unreliable.
+Previous efforts only mitigate this by relying on extra models to calibrate
+image brightness. In this paper, we propose MonoPCC to address the brightness
+inconsistency radically by reshaping the photometric constraint into a cycle
+form. Instead of only warping the source image, MonoPCC constructs a closed
+loop consisting of two opposite forward-backward warping paths: from target to
+source and then back to target. Thus, the target image finally receives an
+image cycle-warped from itself, which naturally makes the constraint invariant
+to brightness changes. Moreover, MonoPCC transplants the source image's
+phase-frequency into the intermediate warped image to avoid structure loss, and
+also stabilizes the training via an exponential moving average (EMA) strategy
+to avoid frequent changes in the forward warping. The comprehensive and
+extensive experimental results on four endoscopic datasets demonstrate that our
+proposed MonoPCC shows great robustness to the brightness inconsistency, and
+exceeds other state-of-the-art methods by reducing the absolute relative error
+by at least 7.27%, 9.38%, 9.90% and 3.17%, respectively.
+
+
+
+ comment: 14 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Union-over-Intersections: Object Detection beyond Winner-Takes-All + + +
+ This paper revisits the problem of predicting box locations in object +detection architectures. Typically, each box proposal or box query aims to +directly maximize the intersection-over-union score with the ground truth, +followed by a winner-takes-all non-maximum suppression where only the highest +scoring box in each region is retained. We observe that both steps are +sub-optimal: the first involves regressing proposals to the entire ground +truth, which is a difficult task even with large receptive fields, and the +second neglects valuable information from boxes other than the top candidate. +Instead of regressing proposals to the whole ground truth, we propose a simpler +approach: regress only to the area of intersection between the proposal and the +ground truth. This avoids the need for proposals to extrapolate beyond their +visual scope, improving localization accuracy. Rather than adopting a +winner-takes-all strategy, we take the union over the regressed intersections +of all boxes in a region to generate the final box outputs. Our plug-and-play +method integrates seamlessly into proposal-based, grid-based, and query-based +detection architectures with minimal modifications, consistently improving +object localization and instance segmentation. We demonstrate its broad +applicability and versatility across various detection and segmentation tasks. + +
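+ The geometric targets described above can be sketched with two small helpers
+(box format and helper names are illustrative): the regression target becomes
+the intersection of a proposal with the ground truth, and a final box is the
+union (enclosing box) of the regressed intersections within a region.
+
+    def intersection(box_a, box_b):
+        # boxes as (x1, y1, x2, y2); returns None if the boxes do not overlap
+        x1, y1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
+        x2, y2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
+        return (x1, y1, x2, y2) if x1 < x2 and y1 < y2 else None
+
+    def union(boxes):
+        # enclosing box of the regressed intersections from one region
+        xs1, ys1, xs2, ys2 = zip(*boxes)
+        return (min(xs1), min(ys1), max(xs2), max(ys2))
+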
+
+ comment: 17 pages, 6 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ Point Cloud Semantic Segmentation with Sparse and Inhomogeneous + Annotations + + +
+ Utilizing uniformly distributed sparse annotations, weakly supervised
+learning alleviates the heavy reliance on fine-grained annotations in point
+cloud semantic segmentation tasks. However, few works discuss the inhomogeneity
+of sparse annotations, although it is common in real-world scenarios.
+Therefore, this work introduces the probability density function into the
+gradient sampling approximation method to qualitatively analyze the impact of
+annotation sparsity and inhomogeneity under weakly supervised learning. Based
+on our analysis, we propose an Adaptive Annotation Distribution Network
+(AADNet) capable of robust learning on arbitrarily distributed sparse
+annotations. Specifically, we propose a label-aware point cloud downsampling
+strategy to increase the proportion of annotations involved in the training
+stage. Furthermore, we design the multiplicative dynamic entropy as the
+gradient calibration function to mitigate the gradient bias caused by
+non-uniformly distributed sparse annotations and explicitly reduce the
+epistemic uncertainty. Without any prior restrictions and additional
+information, our proposed method achieves comprehensive performance
+improvements at multiple label rates and different annotation distributions.
+
+
+
+
+
+
+ + ♻ ☆ Accuracy Limits as a Barrier to Biometric System Security + + +
+ Biometric systems are widely used for identity verification and
+identification, including authentication (i.e., one-to-one matching to verify a
+claimed identity) and identification (i.e., one-to-many matching to find a
+subject in a database). The matching process relies on measuring similarities
+or dissimilarities between a fresh biometric template and enrolled templates.
+The False Match Rate (FMR) is a key metric for assessing the accuracy and
+reliability of such systems. This paper analyzes biometric systems based on
+their FMR, with two main contributions. First, we explore untargeted attacks,
+where an adversary aims to impersonate any user within a database. We determine
+the number of trials required for an attacker to successfully impersonate a
+user and derive the critical population size (i.e., the maximum number of users
+in the database) required to maintain a given level of security. Furthermore,
+we compute the critical FMR value needed to ensure resistance against
+untargeted attacks as the database size increases. Second, we revisit the
+biometric birthday problem to evaluate the approximate and exact probabilities
+that two users in a database collide (i.e., can impersonate each other). Based
+on this analysis, we derive both the approximate critical population size and
+the critical FMR value needed to bound the likelihood of such collisions
+occurring with a given probability. These thresholds offer insights for
+designing systems that mitigate the risk of impersonation and collisions,
+particularly in large-scale biometric databases. Our findings indicate that
+current biometric systems fail to deliver sufficient accuracy to achieve an
+adequate security level against untargeted attacks, even in small-scale
+databases. Moreover, state-of-the-art systems face significant challenges in
+addressing the biometric birthday problem, especially as database sizes grow.
+
+
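+ Back-of-the-envelope versions of the two quantities discussed above, using
+standard approximations and hypothetical numbers (these generic formulas are
+not necessarily the exact expressions derived in the paper):
+
+    import math
+
+    fmr = 1e-6        # hypothetical false match rate
+    trials = 10_000   # attempts by an untargeted attacker
+    users = 100_000   # enrolled users in the database
+
+    # Probability of at least one false match in `trials` attempts.
+    p_attack = 1 - (1 - fmr) ** trials
+    # Biometric birthday problem over n*(n-1)/2 user pairs.
+    pairs = users * (users - 1) / 2
+    p_collision = 1 - math.exp(-fmr * pairs)
+    print(f"untargeted attack: {p_attack:.4f}, pairwise collision: {p_collision:.4f}")
+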
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ GaraMoSt: Parallel Multi-Granularity Motion and Structural Modeling for + Efficient Multi-Frame Interpolation in DSA Images AAAI2025 + + +
+ The rapid and accurate direct multi-frame interpolation method for Digital +Subtraction Angiography (DSA) images is crucial for reducing radiation and +providing real-time assistance to physicians for precise diagnostics and +treatment. DSA images contain complex vascular structures and various motions. +Applying natural scene Video Frame Interpolation (VFI) methods results in +motion artifacts, structural dissipation, and blurriness. Recently, MoSt-DSA +has specifically addressed these issues for the first time and achieved SOTA +results. However, MoSt-DSA's focus on real-time performance leads to +insufficient suppression of high-frequency noise and incomplete filtering of +low-frequency noise in the generated images. To address these issues within the +same computational time scale, we propose GaraMoSt. Specifically, we optimize +the network pipeline with a parallel design and propose a module named MG-MSFE. +MG-MSFE extracts frame-relative motion and structural features at various +granularities in a fully convolutional parallel manner and supports +independent, flexible adjustment of context-aware granularity at different +scales, thus enhancing computational efficiency and accuracy. Extensive +experiments demonstrate that GaraMoSt achieves the SOTA performance in +accuracy, robustness, visual effects, and noise suppression, comprehensively +surpassing MoSt-DSA and other natural scene VFI methods. The code and models +are available at https://github.com/ZyoungXu/GaraMoSt. + +
+
+ comment: Accepted by AAAI2025 +
+
+
+
+
+ + ♻ ☆ Video-RAG: Visually-aligned Retrieval-Augmented Long Video Comprehension + + +
+ Existing large video-language models (LVLMs) struggle to comprehend long +videos correctly due to limited context. To address this problem, fine-tuning +long-context LVLMs and employing GPT-based agents have emerged as promising +solutions. However, fine-tuning LVLMs would require extensive high-quality data +and substantial GPU resources, while GPT-based agents would rely on proprietary +models (e.g., GPT-4o). In this paper, we propose Video Retrieval-Augmented +Generation (Video-RAG), a training-free and cost-effective pipeline that +employs visually-aligned auxiliary texts to help facilitate cross-modality +alignment while providing additional information beyond the visual content. +Specifically, we leverage open-source external tools to extract +visually-aligned information from pure video data (e.g., audio, optical +character, and object detection), and incorporate the extracted information +into an existing LVLM as auxiliary texts, alongside video frames and queries, +in a plug-and-play manner. Our Video-RAG offers several key advantages: (i) +lightweight with low computing overhead due to single-turn retrieval; (ii) easy +implementation and compatibility with any LVLM; and (iii) significant, +consistent performance gains across long video understanding benchmarks, +including Video-MME, MLVU, and LongVideoBench. Notably, our model demonstrates +superior performance over proprietary models like Gemini-1.5-Pro and GPT-4o +when utilized with a 72B model. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Generative Adversarial Networks for Image Super-Resolution: A Survey + + +
+ Single image super-resolution (SISR) has played an important role in the
+field of image processing. Recent generative adversarial networks (GANs) can
+achieve excellent results on low-resolution images with small samples. However,
+there is little literature summarizing the different GANs used for SISR. In
+this paper, we conduct a comparative study of GANs from different perspectives.
+We first review the development of GANs. Second, we present popular GAN
+architectures for both large- and small-sample image applications. Then, we
+analyze the motivations, implementations and differences of GAN-based
+optimization methods and discriminative learning for image super-resolution in
+supervised, semi-supervised and unsupervised manners, where these GANs are
+analyzed via integrating different network architectures, prior knowledge, loss
+functions and multiple tasks. Next, we compare the performance of these popular
+GANs on public datasets via quantitative and qualitative analysis in SISR.
+Finally, we highlight challenges of GANs and potential research points for
+SISR.
+
+
+
+ comment: 31 pages, 10 figures
+
+
+
+
+
+ + ♻ ☆ Exploring Scalability of Self-Training for Open-Vocabulary Temporal + Action Localization WACV 2025 + + +
+ The vocabulary size in temporal action localization (TAL) is limited by the +scarcity of large-scale annotated datasets. To overcome this, recent works +integrate vision-language models (VLMs), such as CLIP, for open-vocabulary TAL +(OV-TAL). However, despite the success of VLMs trained on extensive datasets, +existing OV-TAL methods still rely on human-labeled TAL datasets of limited +size to train action localizers, limiting their generalizability. In this +paper, we explore the scalability of self-training with unlabeled YouTube +videos for OV-TAL. Our approach consists of two stages: (1) a class-agnostic +action localizer is trained on a human-labeled TAL dataset to generate +pseudo-labels for unlabeled videos, and (2) the large-scale pseudo-labeled +dataset is then used to train the localizer. Extensive experiments demonstrate +that leveraging web-scale videos in self-training significantly enhances the +generalizability of an action localizer. Additionally, we identify limitations +in existing OV-TAL evaluation schemes and propose a new benchmark for thorough +assessment. Finally, we showcase the TAL performance of the large multimodal +model Gemini-1.5 on our new benchmark. Code is released at +https://github.com/HYUNJS/STOV-TAL. + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ VHM: Versatile and Honest Vision Language Model for Remote Sensing Image + Analysis + + +
+ This paper develops a Versatile and Honest vision language Model (VHM) for +remote sensing image analysis. VHM is built on a large-scale remote sensing +image-text dataset with rich-content captions (VersaD), and an honest +instruction dataset comprising both factual and deceptive questions (HnstD). +Unlike prevailing remote sensing image-text datasets, in which image captions +focus on a few prominent objects and their relationships, VersaD captions +provide detailed information about image properties, object attributes, and the +overall scene. This comprehensive captioning enables VHM to thoroughly +understand remote sensing images and perform diverse remote sensing tasks. +Moreover, different from existing remote sensing instruction datasets that only +include factual questions, HnstD contains additional deceptive questions +stemming from the non-existence of objects. This feature prevents VHM from +producing affirmative answers to nonsense queries, thereby ensuring its +honesty. In our experiments, VHM significantly outperforms various vision +language models on common tasks of scene classification, visual question +answering, and visual grounding. Additionally, VHM achieves competent +performance on several unexplored tasks, such as building vectorizing, +multi-label classification and honest question answering. We will release the +code, data and model weights at https://github.com/opendatalab/VHM . + +
+
+ comment: Equal contribution: Chao Pang, Xingxing Weng, Jiang Wu; Corresponding + author: Gui-Song Xia, Conghui He +
+
+
+
+
+ + ♻ ☆ ZAHA: Introducing the Level of Facade Generalization and the Large-Scale + Point Cloud Facade Semantic Segmentation Benchmark Dataset WACV 2025 + + +
+ Facade semantic segmentation is a long-standing challenge in photogrammetry +and computer vision. Although the last decades have witnessed the influx of +facade segmentation methods, there is a lack of comprehensive facade classes +and data covering the architectural variability. In ZAHA, we introduce Level of +Facade Generalization (LoFG), novel hierarchical facade classes designed based +on international urban modeling standards, ensuring compatibility with +real-world challenging classes and uniform methods' comparison. Realizing the +LoFG, we present to date the largest semantic 3D facade segmentation dataset, +providing 601 million annotated points at five and 15 classes of LoFG2 and +LoFG3, respectively. Moreover, we analyze the performance of baseline semantic +segmentation methods on our introduced LoFG classes and data, complementing it +with a discussion on the unresolved challenges for facade segmentation. We +firmly believe that ZAHA shall facilitate further development of 3D facade +semantic segmentation methods, enabling robust segmentation indispensable in +creating urban digital twins. + +
+
+ comment: Accepted to WACV 2025 (IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV)) +
+
+
+
+
+ + ♻ ☆ A Black-Box Evaluation Framework for Semantic Robustness in Bird's Eye + View Detection + + +
+ Camera-based Bird's Eye View (BEV) perception models receive increasing +attention for their crucial role in autonomous driving, a domain where concerns +about the robustness and reliability of deep learning have been raised. While +only a few works have investigated the effects of randomly generated semantic +perturbations, aka natural corruptions, on the multi-view BEV detection task, +we develop a black-box robustness evaluation framework that adversarially +optimises three common semantic perturbations: geometric transformation, colour +shifting, and motion blur, to deceive BEV models, serving as the first approach +in this emerging field. To address the challenge posed by optimising the +semantic perturbation, we design a smoothed, distance-based surrogate function +to replace the mAP metric and introduce SimpleDIRECT, a deterministic +optimisation algorithm that utilises observed slopes to guide the optimisation +process. By comparing with randomised perturbation and two optimisation +baselines, we demonstrate the effectiveness of the proposed framework. +Additionally, we provide a benchmark on the semantic robustness of ten recent +BEV models. The results reveal that PolarFormer, which emphasises geometric +information from multi-view images, exhibits the highest robustness, whereas +BEVDet is fully compromised, with its precision reduced to zero. + +
+
+
+
+
+ + ♻ ☆ ASTM :Autonomous Smart Traffic Management System Using Artificial + Intelligence CNN and LSTM + + +
+ In the modern world, the development of Artificial Intelligence (AI) has
+contributed to improvements in various areas, including automation, computer
+vision, fraud detection, and more. AI can be leveraged to enhance the
+efficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce
+traffic congestion rates. This paper presents an Autonomous Smart Traffic
+Management (STM) system that uses AI to improve traffic flow rates. The system
+employs the YOLO V5 Convolutional Neural Network to detect vehicles in traffic
+management images. Additionally, it predicts the number of vehicles for the
+next 12 hours using a Recurrent Neural Network with Long Short-Term Memory
+(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the
+traffic cycle length based on these vehicle predictions, aided by AI. From the
+results of the RNN-LSTM model for predicting vehicle numbers over the next 12
+hours, we observe that the model predicts traffic with a Mean Squared Error
+(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles.
+After simulating the STM system in the CARLA simulation environment, we found
+that the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per
+minute) is 50% higher than the rate without STM (around 15 vehicles per
+minute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5
+seconds per vehicle) is 70% lower than without STM (around 12 seconds per
+vehicle). These results demonstrate that the STM system using AI can increase
+traffic flow by 50% and reduce vehicle pass delays by 70%.
+
+
+
+ comment: In process to IEEE Intelligent Vehicle Symposium 2025 +
+
+
+
+
+ + ♻ ☆ SCB-dataset: A Dataset for Detecting Student Classroom Behavior + + +
+ The use of deep learning methods for automatic detection of students'
+classroom behavior is a promising approach to analyze their class performance
+and enhance teaching effectiveness. However, the lack of publicly available
+datasets on student behavior poses a challenge for researchers in this field.
+To address this issue, we propose a Student Classroom Behavior dataset
+(SCB-dataset) that reflects real-life scenarios. Our dataset includes 11,248
+labels and 4,003 images, with a focus on hand-raising behavior. We evaluated
+the dataset using the YOLOv7 algorithm, achieving a mean average precision
+(mAP) of up to 85.3%. We believe that our dataset can serve as a robust
+foundation for future research in the field of student behavior detection and
+promote further advancements in this area. Our SCB-dataset can be downloaded
+from: https://github.com/Whiffe/SCB-dataset
+
+
+
+
+
+ + ♻ ☆ 3D Registration in 30 Years: A Survey + + +
+ 3D point cloud registration is a fundamental problem in computer vision,
+computer graphics, robotics, remote sensing, and related fields. Over the
+last thirty years, we have witnessed remarkable advances in this area, with
+numerous kinds of solutions proposed. Although a handful of relevant surveys
+have been conducted, their coverage is still limited. In this work, we
+present a comprehensive survey on 3D point cloud registration, covering a set
+of sub-areas such as pairwise coarse registration, pairwise fine
+registration, multi-view registration, cross-scale registration, and
+multi-instance registration. The datasets, evaluation metrics, method
+taxonomy, discussions of merits and demerits, and insights into future
+directions are comprehensively presented in this survey. The regularly
+updated project page of the survey is available at
+https://github.com/Amyyyy11/3D-Registration-in-30-Years-A-Survey.
+
+
+
+
+
+ + ♻ ☆ Accelerating Diffusion Transformers with Token-wise Feature Caching + + +
+ Diffusion transformers have shown significant effectiveness in both image and +video synthesis at the expense of huge computation costs. To address this +problem, feature caching methods have been introduced to accelerate diffusion +transformers by caching the features in previous timesteps and reusing them in +the following timesteps. However, previous caching methods ignore that +different tokens exhibit different sensitivities to feature caching, and +feature caching on some tokens may lead to 10$\times$ more destruction to the +overall generation quality compared with other tokens. In this paper, we +introduce token-wise feature caching, allowing us to adaptively select the most +suitable tokens for caching, and further enable us to apply different caching +ratios to neural layers in different types and depths. Extensive experiments on +PixArt-$\alpha$, OpenSora, and DiT demonstrate our effectiveness in both image +and video generation with no requirements for training. For instance, +2.36$\times$ and 1.93$\times$ acceleration are achieved on OpenSora and +PixArt-$\alpha$ with almost no drop in generation quality. + +
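+ As a rough illustration of the caching idea (not the authors' ToCa code), the
+sketch below recomputes only the tokens whose inputs changed most since the
+last cached timestep and reuses cached outputs for the rest; the change-based
+sensitivity score, the recompute fraction, and the token-wise block are
+assumptions made for brevity.
+
+    import torch
+
+    def cached_forward(block, x, cache, recompute_frac=0.1):
+        # x: (B, N, D) token features; block: a token-wise module (kept
+        # token-wise for simplicity); cache persists across timesteps.
+        if "out" not in cache:                  # first timestep: full compute
+            cache["in"], cache["out"] = x, block(x)
+            return cache["out"]
+        delta = (x - cache["in"]).norm(dim=-1)      # per-token input change
+        k = max(1, int(x.shape[1] * recompute_frac))
+        idx = delta.topk(k, dim=1).indices          # most cache-sensitive tokens
+        gidx = idx.unsqueeze(-1).expand(-1, -1, x.shape[-1])
+        out = cache["out"].clone()
+        out.scatter_(1, gidx, block(torch.gather(x, 1, gidx)))
+        cache["in"], cache["out"] = x, out
+        return out
+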
+
+ comment: In this version, we achieved a nearly lossless acceleration of 1.51 + times for ToCa on FLUX in the appendix +
+
+
+
+
+ + ♻ ☆ M$^3$-VOS: Multi-Phase, Multi-Transition, and Multi-Scenery Video Object + Segmentation + + +
+ Intelligent robots need to interact with diverse objects across various
+environments. The appearance and state of objects frequently undergo complex
+transformations depending on the object properties, e.g., phase transitions.
+However, in the vision community, segmenting dynamic objects with phase
+transitions is overlooked. In light of this, we introduce the concept of
+phase in segmentation, which categorizes real-world objects based on their
+visual characteristics and potential morphological and appearance changes.
+Then, we present a new benchmark, Multi-Phase, Multi-Transition, and
+Multi-Scenery Video Object Segmentation (M$^3$-VOS), to verify the ability of
+models to understand object phases, which consists of 479 high-resolution
+videos spanning over 10 distinct everyday scenarios. It provides dense
+instance mask annotations that capture both object phases and their
+transitions. We evaluate state-of-the-art methods on M$^3$-VOS, yielding
+several key insights. Notably, current appearance-based approaches show
+significant room for improvement when handling objects with phase
+transitions. The inherent changes in disorder suggest that the predictive
+performance of the forward entropy-increasing process can be improved through
+a reverse entropy-reducing process. These findings lead us to propose ReVOS,
+a new plug-and-play model that improves its performance by reversal
+refinement. Our data and code will be publicly available at
+https://zixuan-chen.github.io/M-cubeVOS.github.io/.
+
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ SLAM3R: Real-Time Dense Scene Reconstruction from Monocular RGB Videos + + +
+ In this paper, we introduce SLAM3R, a novel and effective monocular RGB SLAM +system for real-time and high-quality dense 3D reconstruction. SLAM3R provides +an end-to-end solution by seamlessly integrating local 3D reconstruction and +global coordinate registration through feed-forward neural networks. Given an +input video, the system first converts it into overlapping clips using a +sliding window mechanism. Unlike traditional pose optimization-based methods, +SLAM3R directly regresses 3D pointmaps from RGB images in each window and +progressively aligns and deforms these local pointmaps to create a globally +consistent scene reconstruction - all without explicitly solving any camera +parameters. Experiments across datasets consistently show that SLAM3R achieves +state-of-the-art reconstruction accuracy and completeness while maintaining +real-time performance at 20+ FPS. Code and weights at: +https://github.com/PKU-VCL-3DV/SLAM3R. + +
+
+
+
+
+ + ♻ ☆ DeepClean: Integrated Distortion Identification and Algorithm Selection + for Rectifying Image Corruptions + + +
+ Distortion identification and rectification in images and videos are vital
+for achieving good performance in downstream vision applications. Instead of
+relying on fixed trial-and-error based image processing pipelines, we propose
+a two-level sequential planning approach for automated image distortion
+classification and rectification. At the higher level, it detects the class
+of corruptions present in the input image, if any. The lower level selects a
+specific algorithm to be applied, from a set of externally provided candidate
+algorithms. The entire two-level setup runs as a single forward pass during
+inference and is queried iteratively until the original image is recovered.
+We demonstrate improvements compared to three baselines on the object
+detection task on the COCO image dataset with a rich set of distortions. The
+advantages of our approach are its dynamic reconfiguration conditioned on the
+input image and its generalisability to unseen candidate algorithms at
+inference time, since it relies only on comparing the image embeddings of
+their outputs.
+
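+ A rough sketch of this detect-then-select loop is given below; the distortion
+classifier, candidate algorithms, embedding model, and "clean" reference
+embedding are hypothetical placeholders, and the paper's concrete selection
+criterion may differ.
+
+    import numpy as np
+
+    def rectify(image, detect_distortion, candidates, embed, clean_ref,
+                max_iters=5):
+        # detect_distortion(image) -> distortion class or None (higher level);
+        # candidates: {distortion class: [rectification functions]} (lower
+        # level); embed(image) -> feature vector; clean_ref: reference
+        # embedding of undistorted images (an assumption in this sketch).
+        for _ in range(max_iters):
+            cls = detect_distortion(image)
+            if cls is None:                        # nothing left to fix
+                break
+            outputs = [fn(image) for fn in candidates[cls]]
+            scores = [float(np.dot(embed(o), clean_ref)) for o in outputs]
+            image = outputs[int(np.argmax(scores))]
+        return image
+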
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ FashionComposer: Compositional Fashion Image Generation + + +
+ We present FashionComposer for compositional fashion image generation. Unlike
+previous methods, FashionComposer is highly flexible. It takes multi-modal
+input (i.e., text prompt, parametric human model, garment image, and face
+image) and supports personalizing the appearance, pose, and figure of the human
+and assigning multiple garments in one pass. To achieve this, we first develop
+a universal framework capable of handling diverse input modalities. We
+construct scaled training data to enhance the model's robust compositional
+capabilities. To accommodate multiple reference images (garments and faces)
+seamlessly, we organize these references in a single image as an "asset
+library" and employ a reference UNet to extract appearance features. To inject
+the appearance features into the correct pixels in the generated result, we
+propose subject-binding attention. It binds the appearance features from
+different "assets" with the corresponding text features. In this way, the model
+can understand each asset according to its semantics, supporting arbitrary
+numbers and types of reference images. As a comprehensive solution,
+FashionComposer also supports many other applications like human album
+generation, diverse virtual try-on tasks, etc.
+
+
+ comment: https://sihuiji.github.io/FashionComposer-Page +
+
+
+
+
+ + ♻ ☆ SkyDiffusion: Ground-to-Aerial Image Synthesis with Diffusion Models and + BEV Paradigm + + +
+ Ground-to-aerial image synthesis focuses on generating realistic aerial
+images from corresponding ground street view images while maintaining
+consistent content layout, simulating a top-down view. The significant
+viewpoint difference leads to domain gaps between views, and dense urban scenes
+limit the visible range of street views, making this cross-view generation task
+particularly challenging. In this paper, we introduce SkyDiffusion, a novel
+cross-view generation method for synthesizing aerial images from street view
+images, utilizing a diffusion model and the Bird's-Eye View (BEV) paradigm. The
+Curved-BEV method in SkyDiffusion converts street-view images into a BEV
+perspective, effectively bridging the domain gap, and employs a "multi-to-one"
+mapping strategy to address occlusion issues in dense urban scenes.
+SkyDiffusion then uses a BEV-guided diffusion model to generate
+content-consistent and realistic aerial images. Additionally, we introduce a
+novel dataset, Ground2Aerial-3, designed for diverse ground-to-aerial image
+synthesis applications, including disaster scene aerial synthesis, historical
+high-resolution satellite image synthesis, and low-altitude UAV image synthesis
+tasks. Experimental results demonstrate that SkyDiffusion outperforms
+state-of-the-art methods on cross-view datasets across natural (CVUSA),
+suburban (CVACT), urban (VIGOR-Chicago), and various application scenarios
+(G2A-3), achieving realistic and content-consistent aerial image generation.
+More results and dataset information can be found at
+https://opendatalab.github.io/skydiffusion/.
+
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ From Training-Free to Adaptive: Empirical Insights into MLLMs' + Understanding of Detection Information + + +
+ Despite the impressive capabilities of Multimodal Large Language Models +(MLLMs) in integrating text and image modalities, challenges remain in +accurately interpreting detailed visual elements. Vision detection models excel +at recognizing fine-grained image details, prompting researchers to use them to +enhance MLLMs. One effective strategy is to infuse detection information in +text format, which has proven simple and effective. However, most studies +utilize this method without training, leaving the potential of adaptive +training largely unexplored. Adaptive training could significantly enhance +MLLMs' comprehension of unique inputs while filtering out irrelevant +information. This paper addresses the crucial question: How does training +impact MLLMs' understanding of infused textual detection information? We +systematically experiment with various representative models to evaluate the +effects of training-free, retraining, and fine-tuning strategies. We also +examine the influence of training on MLLMs' original abilities and the +interchangeability of detection models. Our findings indicate that fine-tuning +a pre-trained MLLM to incorporate textual detection information delivers +superior results compared to training-free and retraining methods, improving +performance by 6.71% across 10 widely recognized benchmarks. Furthermore, +fine-tuning enables MLLMs to retain performance enhancements even when +detection models are swapped, indicating improved understanding of formatted +textual data. We release our codes to support further exploration of fusion +strategies for vision detection models and the enhancement of MLLMs' +fine-grained multimodal capabilities. + +
+
+ comment: 32 pages, 22 tables, 7 figures +
+
+
+
+
+ + ♻ ☆ CausalDiff: Causality-Inspired Disentanglement via Diffusion Model for + Adversarial Defense NeurIPS 2024 + + +
+ Despite ongoing efforts to defend neural classifiers from adversarial
+attacks, they remain vulnerable, especially to unseen attacks. In contrast,
+humans are hard to deceive with subtle manipulations, since we make judgments
+only based on essential factors. Inspired by this observation, we attempt to
+model label generation with essential label-causative factors and incorporate
+label-non-causative factors to assist data generation. For an adversarial
+example, we aim to discriminate the perturbations as non-causative factors and
+make predictions only based on the label-causative factors. Concretely, we
+propose a causal diffusion model (CausalDiff) that adapts diffusion models for
+conditional data generation and disentangles the two types of causal factors
+by learning towards a novel causal information bottleneck objective.
+Empirically, CausalDiff has significantly outperformed state-of-the-art
+defense methods on various unseen attacks, achieving an average robustness of
+86.39% (+4.01%) on CIFAR-10, 56.25% (+3.13%) on CIFAR-100, and 82.62% (+4.93%)
+on GTSRB (German Traffic Sign Recognition Benchmark). The code is available at
+https://github.com/CAS-AISafetyBasicResearchGroup/CausalDiff
+
+
+ comment: accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Training Datasets Generation for Machine Learning: Application to Vision + Based Navigation SP + + +
+ Vision Based Navigation consists of utilizing cameras as precision sensors
+for GNC by extracting information from images. To enable the adoption of
+machine learning for space applications, one of the obstacles is the
+demonstration that available training datasets are adequate to validate the
+algorithms. The objective of the study is to generate datasets of images and
+metadata suitable for training machine learning algorithms. Two use cases were
+selected and a robust methodology was developed to validate the datasets
+including the ground truth. The first use case is in-orbit rendezvous with a
+man-made object: a mockup of satellite ENVISAT. The second use case is a Lunar
+landing scenario. Datasets were produced from archival datasets (Chang'e 3),
+from the laboratory at DLR TRON facility and at Airbus Robotic laboratory, from
+SurRender software high fidelity image simulator using Model Capture and from
+Generative Adversarial Networks. The use case definition included the selection
+of algorithms as benchmark: an AI-based pose estimation algorithm and a dense
+optical flow algorithm were selected. Eventually, it is demonstrated that
+datasets produced with SurRender and selected laboratory facilities are
+adequate to train machine learning algorithms.
+
+
+ comment: 6 pages, 4 figures, preprint of the proceedings of ESA SPAICE + conference 2024 +
+
+
+
+
+ + ♻ ☆ Img-Diff: Contrastive Data Synthesis for Multimodal Large Language + Models + + +
+ High-performance Multimodal Large Language Models (MLLMs) are heavily +dependent on data quality. To advance fine-grained image recognition within +MLLMs, we introduce a novel data synthesis method inspired by contrastive +learning and image difference captioning. Our key idea involves challenging the +model to discern both matching and distinct elements by scrutinizing object +differences in detailed regions across similar images. We begin by generating +pairs of similar images that emphasize object variations. Following this, we +employ a Difference Area Generator to pinpoint object differences, and +subsequently, a Difference Captions Generator to articulate these differences. +This process results in a high-quality dataset of "object replacement" samples, +termed Img-Diff, which can be scaled as needed due to its automated nature. We +leverage this generated dataset to fine-tune state-of-the-art (SOTA) MLLMs, +such as InternVL2, achieving substantial improvements across various image +difference and Visual Question Answering tasks. Notably, the trained models +significantly outperform existing SOTA models like GPT-4V and Gemini on the +MMVP benchmark. Additionally, we conduct comprehensive evaluations to validate +the dataset's diversity, quality, and robustness, offering several insights +into the synthesis of such contrastive datasets. We release our codes and +dataset to encourage further research on multimodal data synthesis and MLLMs' +fundamental capabilities for image understanding. + +
+
+ comment: 22 pages, 10 figures, 16 tables +
+
+
+
+
+ + ♻ ☆ One Pixel is All I Need + + +
+ Vision Transformers (ViTs) have achieved record-breaking performance in
+various visual tasks. However, concerns about their robustness against backdoor
+attacks have grown. Backdoor attacks involve associating a specific trigger
+with a target label, causing the model to predict the attacker-specified label
+when the trigger is present, while correctly identifying clean images. We found
+that ViTs exhibit higher attack success rates for quasi-triggers (patterns
+different from but similar to the original training triggers) compared to CNNs.
+Moreover, some backdoor features in clean samples can suppress the original
+trigger, making quasi-triggers more effective. To better understand and exploit
+these vulnerabilities, we developed a tool called the Perturbation Sensitivity
+Distribution Map (PSDM). PSDM computes and sums gradients over many inputs to
+show how sensitive the model is to small changes in the input. In ViTs, PSDM
+reveals a patch-like pattern where central pixels are more sensitive than
+edges. We use PSDM to guide the creation of quasi-triggers. Based on these
+findings, we designed "WorstVIT," a simple yet effective data poisoning
+backdoor for ViT models. This attack requires an extremely low poisoning rate,
+trains for just one epoch, and modifies a single pixel to successfully attack
+all validation images.
+
+
+
+
+
+ + ♻ ☆ Prediction-Feedback DETR for Temporal Action Detection AAAI 2025 + + +
+ Temporal Action Detection (TAD) is fundamental yet challenging for real-world +video applications. Leveraging the unique benefits of transformers, various +DETR-based approaches have been adopted in TAD. However, it has recently been +identified that the attention collapse in self-attention causes the performance +degradation of DETR for TAD. Building upon previous research, this paper newly +addresses the attention collapse problem in cross-attention within DETR-based +TAD methods. Moreover, our findings reveal that cross-attention exhibits +patterns distinct from predictions, indicating a short-cut phenomenon. To +resolve this, we propose a new framework, Prediction-Feedback DETR (Pred-DETR), +which utilizes predictions to restore the collapse and align the cross- and +self-attention with predictions. Specifically, we devise novel +prediction-feedback objectives using guidance from the relations of the +predictions. As a result, Pred-DETR significantly alleviates the collapse and +achieves state-of-the-art performance among DETR-based methods on various +challenging benchmarks including THUMOS14, ActivityNet-v1.3, HACS, and +FineAction. + +
+
+ comment: Accepted to AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Guiding a Diffusion Model with a Bad Version of Itself NeurIPS 2024 + + +
+ The primary axes of interest in image-generating diffusion models are image +quality, the amount of variation in the results, and how well the results align +with a given condition, e.g., a class label or a text prompt. The popular +classifier-free guidance approach uses an unconditional model to guide a +conditional model, leading to simultaneously better prompt alignment and +higher-quality images at the cost of reduced variation. These effects seem +inherently entangled, and thus hard to control. We make the surprising +observation that it is possible to obtain disentangled control over image +quality without compromising the amount of variation by guiding generation +using a smaller, less-trained version of the model itself rather than an +unconditional model. This leads to significant improvements in ImageNet +generation, setting record FIDs of 1.01 for 64x64 and 1.25 for 512x512, using +publicly available networks. Furthermore, the method is also applicable to +unconditional diffusion models, drastically improving their quality. + +
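+ In classifier-free-guidance terms, the recipe described above amounts to
+replacing the unconditional denoiser with a weaker (smaller or less-trained)
+copy of the conditional model itself; a minimal sketch of that guidance rule,
+with placeholder denoiser callables, is:
+
+    def guided_denoise(d_main, d_weak, x, sigma, cond, w=2.0):
+        # d_main: the fully trained conditional denoiser; d_weak: a smaller or
+        # less-trained version of the same model, evaluated with the same
+        # conditioning; w > 1 pushes samples away from the weak model's errors.
+        weak = d_weak(x, sigma, cond)
+        return weak + w * (d_main(x, sigma, cond) - weak)
+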
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Activity Recognition on Avatar-Anonymized Datasets with Masked + Differential Privacy + + +
+ Privacy-preserving computer vision is an important emerging problem in
+machine learning and artificial intelligence. Prevalent methods tackling this
+problem use differential privacy (DP) or obfuscation techniques to protect the
+privacy of individuals. In both cases, the utility of the trained model is
+heavily sacrificed in the process. In this work, we present an anonymization
+pipeline that replaces sensitive human subjects in video datasets with
+synthetic avatars within context, employing a combined rendering and stable
+diffusion-based strategy. Additionally, we propose masked differential privacy
+(MaskDP) to protect non-anonymized but privacy-sensitive background
+information. MaskDP allows for controlling sensitive regions where differential
+privacy is applied, in contrast to applying DP on the entire input. This
+combined methodology provides strong privacy protection while minimizing the
+usual performance penalty of privacy-preserving methods. Experiments on
+multiple challenging action recognition datasets demonstrate that our proposed
+techniques result in better utility-privacy trade-offs compared to standard
+differentially private training in the especially demanding $\epsilon<1$
+regime.
+
+
+
+
+
+ + ♻ ☆ Fast and Efficient: Mask Neural Fields for 3D Scene Segmentation + + +
+ Understanding 3D scenes is a crucial challenge in computer vision research +with applications spanning multiple domains. Recent advancements in distilling +2D vision-language foundation models into neural fields, like NeRF and 3DGS, +enable open-vocabulary segmentation of 3D scenes from 2D multi-view images +without the need for precise 3D annotations. However, while effective, these +methods typically rely on the per-pixel distillation of high-dimensional CLIP +features, introducing ambiguity and necessitating complex regularization +strategies, which adds inefficiency during training. This paper presents +MaskField, which enables efficient 3D open-vocabulary segmentation with neural +fields from a novel perspective. Unlike previous methods, MaskField decomposes +the distillation of mask and semantic features from foundation models by +formulating a mask feature field and queries. MaskField overcomes ambiguous +object boundaries by naturally introducing SAM segmented object shapes without +extra regularization during training. By circumventing the direct handling of +dense high-dimensional CLIP features during training, MaskField is particularly +compatible with explicit scene representations like 3DGS. Our extensive +experiments show that MaskField not only surpasses prior state-of-the-art +methods but also achieves remarkably fast convergence. We hope that MaskField +will inspire further exploration into how neural fields can be trained to +comprehend 3D scenes from 2D models. + +
+
+ comment: 15 pages, 9 figures, Code:https://github.com/keloee/MaskField +
+
+
+
+
+ + ♻ ☆ Image Classification with Rotation-Invariant Variational Quantum + Circuits + + +
+ Variational quantum algorithms are gaining attention as an early application +of Noisy Intermediate-Scale Quantum (NISQ) devices. One of the main problems of +variational methods lies in the phenomenon of Barren Plateaus, present in the +optimization of variational parameters. Adding geometric inductive bias to the +quantum models has been proposed as a potential solution to mitigate this +problem, leading to a new field called Geometric Quantum Machine Learning. In +this work, an equivariant architecture for variational quantum classifiers is +introduced to create a label-invariant model for image classification with +$C_4$ rotational label symmetry. The equivariant circuit is benchmarked against +two different architectures, and it is experimentally observed that the +geometric approach boosts the model's performance. Finally, a classical +equivariant convolution operation is proposed to extend the quantum model for +the processing of larger images, employing the resources available in NISQ +devices. + +
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Diversifying Query: Region-Guided Transformer for Temporal Sentence + Grounding AAAI-25 + + +
+ Temporal sentence grounding is a challenging task that aims to localize the +moment spans relevant to a language description. Although recent DETR-based +models have achieved notable progress by leveraging multiple learnable moment +queries, they suffer from overlapped and redundant proposals, leading to +inaccurate predictions. We attribute this limitation to the lack of +task-related guidance for the learnable queries to serve a specific mode. +Furthermore, the complex solution space generated by variable and +open-vocabulary language descriptions complicates optimization, making it +harder for learnable queries to distinguish each other adaptively. To tackle +this limitation, we present a Region-Guided TRansformer (RGTR) for temporal +sentence grounding, which diversifies moment queries to eliminate overlapped +and redundant predictions. Instead of using learnable queries, RGTR adopts a +set of anchor pairs as moment queries to introduce explicit regional guidance. +Each anchor pair takes charge of moment prediction for a specific temporal +region, which reduces the optimization difficulty and ensures the diversity of +the final predictions. In addition, we design an IoU-aware scoring head to +improve proposal quality. Extensive experiments demonstrate the effectiveness +of RGTR, outperforming state-of-the-art methods on QVHighlights, Charades-STA +and TACoS datasets. Codes are available at https://github.com/TensorsSun/RGTR + +
+
+ comment: Accepted by AAAI-25. Code is available at + https://github.com/TensorsSun/RGTR +
+
+
+
+
+ + ♻ ☆ Reliable Breast Cancer Molecular Subtype Prediction based on + uncertainty-aware Bayesian Deep Learning by Mammography + + +
+ Breast cancer is a heterogeneous disease with different molecular subtypes,
+clinical behaviors, treatment responses, and survival outcomes. The development
+of a reliable, accurate, available and inexpensive method to predict the
+molecular subtypes using medical images plays an important role in the
+diagnosis and prognosis of breast cancer. Recently, deep learning methods have
+shown good performance in breast cancer classification tasks using various
+medical images. Despite this success, classical deep learning cannot deliver
+predictive uncertainty. The uncertainty represents the validity of the
+predictions. Therefore, high predictive uncertainty might negatively affect
+the accurate diagnosis of breast cancer molecular subtypes. To overcome this,
+uncertainty quantification methods are used to determine the predictive
+uncertainty. Accordingly, in this study, we proposed an uncertainty-aware
+Bayesian deep learning model using the full mammogram images. In addition, to
+increase the performance of the multi-class molecular subtype classification
+task, we proposed a novel hierarchical classification strategy, named the
+two-stage classification strategy. The separate AUC of the proposed model for
+each subtype was 0.71, 0.75 and 0.86 for the HER2-enriched, luminal and
+triple-negative classes, respectively. The proposed model not only achieves
+performance comparable to other studies in the field of breast cancer
+molecular subtype prediction, even when using full mammography images, but it
+is also more reliable, because it quantifies the predictive uncertainty.
+
+
+
+
+
+ + ♻ ☆ Chameleon: A Data-Efficient Generalist for Dense Visual Prediction in + the Wild + + +
+ Large language models have evolved into data-efficient generalists,
+benefiting from the universal language interface and large-scale pre-training.
+However, constructing a data-efficient generalist for dense visual prediction
+presents a distinct challenge due to the variation in label structures across
+different tasks. Consequently, generalization to unseen dense prediction tasks
+in the low-data regime is not straightforward and has received less attention
+from previous vision generalists. In this study, we explore a universal model
+that can flexibly adapt to unseen dense label structures with a few examples,
+enabling it to serve as a data-efficient vision generalist in diverse
+real-world scenarios. To this end, we base our method on a powerful
+meta-learning framework and explore several axes to improve its performance and
+versatility for real-world problems, such as flexible adaptation mechanisms and
+scalability. We evaluate our model across a spectrum of unseen real-world
+scenarios where low-shot learning is desirable, including video, 3D, medical,
+biological, and user-interactive tasks. Equipped with a generic architecture
+and an effective adaptation mechanism, our model flexibly adapts to all of
+these tasks with at most 50 labeled images, showcasing a significant
+advancement over existing data-efficient generalist approaches. Codes are
+available at https://github.com/GitGyun/chameleon.
+
+
+
+
+
+ + ♻ ☆ Attentive Eraser: Unleashing Diffusion Model's Object Removal Potential + via Self-Attention Redirection Guidance AAAI 2025 + + +
+ Recently, diffusion models have emerged as promising newcomers in the field +of generative models, shining brightly in image generation. However, when +employed for object removal tasks, they still encounter issues such as +generating random artifacts and the incapacity to repaint foreground object +areas with appropriate content after removal. To tackle these problems, we +propose Attentive Eraser, a tuning-free method to empower pre-trained diffusion +models for stable and effective object removal. Firstly, in light of the +observation that the self-attention maps influence the structure and shape +details of the generated images, we propose Attention Activation and +Suppression (ASS), which re-engineers the self-attention mechanism within the +pre-trained diffusion models based on the given mask, thereby prioritizing the +background over the foreground object during the reverse generation process. +Moreover, we introduce Self-Attention Redirection Guidance (SARG), which +utilizes the self-attention redirected by ASS to guide the generation process, +effectively removing foreground objects within the mask while simultaneously +generating content that is both plausible and coherent. Experiments demonstrate +the stability and effectiveness of Attentive Eraser in object removal across a +variety of pre-trained diffusion models, outperforming even training-based +methods. Furthermore, Attentive Eraser can be implemented in various diffusion +model architectures and checkpoints, enabling excellent scalability. Code is +available at https://github.com/Anonym0u3/AttentiveEraser. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Distribution-Consistency-Guided Multi-modal Hashing + + +
+ Multi-modal hashing methods have gained popularity due to their fast speed +and low storage requirements. Among them, the supervised methods demonstrate +better performance by utilizing labels as supervisory signals compared with +unsupervised methods. Currently, for almost all supervised multi-modal hashing +methods, there is a hidden assumption that training sets have no noisy labels. +However, labels are often annotated incorrectly due to manual labeling in +real-world scenarios, which will greatly harm the retrieval performance. To +address this issue, we first discover a significant distribution consistency +pattern through experiments, i.e., the 1-0 distribution of the presence or +absence of each category in the label is consistent with the high-low +distribution of similarity scores of the hash codes relative to category +centers. Then, inspired by this pattern, we propose a novel +Distribution-Consistency-Guided Multi-modal Hashing (DCGMH), which aims to +filter and reconstruct noisy labels to enhance retrieval performance. +Specifically, the proposed method first randomly initializes several category +centers, which are used to compute the high-low distribution of similarity +scores; Noisy and clean labels are then separately filtered out via the +discovered distribution consistency pattern to mitigate the impact of noisy +labels; Subsequently, a correction strategy, which is indirectly designed via +the distribution consistency pattern, is applied to the filtered noisy labels, +correcting high-confidence ones while treating low-confidence ones as unlabeled +for unsupervised learning, thereby further enhancing the model's performance. +Extensive experiments on three widely used datasets demonstrate the superiority +of the proposed method compared to state-of-the-art baselines in multi-modal +retrieval tasks. The code is available at +https://github.com/LiuJinyu1229/DCGMH. + +
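+ As a simplified illustration of the consistency pattern (not the paper's
+exact filtering rule), a sample can be treated as cleanly labeled when every
+category marked 1 scores higher against its center than every category marked
+0:
+
+    import numpy as np
+
+    def split_clean_noisy(hash_codes, labels, centers, margin=0.0):
+        # hash_codes: (N, D) real-valued codes; labels: (N, C) 0/1 multi-label
+        # matrix; centers: (C, D) category centers (randomly initialized).
+        sims = hash_codes @ centers.T              # (N, C) similarity scores
+        clean = []
+        for s, y in zip(sims, labels):
+            pos, neg = s[y == 1], s[y == 0]
+            consistent = (pos.size == 0 or neg.size == 0
+                          or pos.min() > neg.max() + margin)
+            clean.append(consistent)
+        return np.array(clean)                     # True = treat label as clean
+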
+
+
+
+
+ + ♻ ☆ Leveraging Anthropometric Measurements to Improve Human Mesh Estimation + and Ensure Consistent Body Shapes + + +
+ The basic body shape (i.e., the body shape in T-pose) of a person does not
+change within a single video. However, most SOTA human mesh estimation (HME)
+models output a slightly different, and thus inconsistent, basic body shape for
+each video frame. Furthermore, we find that SOTA 3D human pose estimation (HPE)
+models outperform HME models regarding the precision of the estimated 3D
+keypoint positions. We solve the problem of inconsistent body shapes by
+leveraging anthropometric measurements such as those taken by tailors. We
+create a model called A2B that converts given anthropometric measurements to
+basic body shape parameters of human mesh models. We obtain superior and
+consistent human meshes by combining the A2B model results with the keypoints
+of 3D HPE models using inverse kinematics. We evaluate our approach on
+challenging datasets like ASPset or fit3D, where we can lower the MPJPE by over
+30 mm compared to SOTA HME models. Further, replacing estimates of the body
+shape parameters from existing HME models with A2B results not only increases
+the performance of these HME models, but also guarantees consistent body
+shapes.
+
+
+
+
+
+ + ♻ ☆ DocKylin: A Large Multimodal Model for Visual Document Understanding + with Efficient Visual Slimming AAAI 2025 + + +
+ Current multimodal large language models (MLLMs) face significant challenges +in visual document understanding (VDU) tasks due to the high resolution, dense +text, and complex layouts typical of document images. These characteristics +demand a high level of detail perception ability from MLLMs. While increasing +input resolution improves detail perception capability, it also leads to longer +sequences of visual tokens, increasing computational costs and straining the +models' ability to handle long contexts. To address these challenges, we +introduce DocKylin, a document-centric MLLM that performs visual content +slimming at both the pixel and token levels, thereby reducing token sequence +length in VDU scenarios. We introduce an Adaptive Pixel Slimming (APS) +preprocessing module to perform pixel-level slimming, increasing the proportion +of informative pixels. Moreover, we propose a novel Dynamic Token Slimming +(DTS) module to conduct token-level slimming, filtering essential tokens and +removing others to adaptively create a more compact visual sequence. +Experiments demonstrate DocKylin's promising performance across various VDU +benchmarks and the effectiveness of each component. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ♻ ☆ POPoS: Improving Efficient and Robust Facial Landmark Detection with + Parallel Optimal Position Search AAAI 2025 + + +
+ Achieving a balance between accuracy and efficiency is a critical challenge +in facial landmark detection (FLD). This paper introduces Parallel Optimal +Position Search (POPoS), a high-precision encoding-decoding framework designed +to address the limitations of traditional FLD methods. POPoS employs three key +contributions: (1) Pseudo-range multilateration is utilized to correct heatmap +errors, improving landmark localization accuracy. By integrating multiple +anchor points, it reduces the impact of individual heatmap inaccuracies, +leading to robust overall positioning. (2) To enhance the pseudo-range accuracy +of selected anchor points, a new loss function, named multilateration anchor +loss, is proposed. This loss function enhances the accuracy of the distance +map, mitigates the risk of local optima, and ensures optimal solutions. (3) A +single-step parallel computation algorithm is introduced, boosting +computational efficiency and reducing processing time. Extensive evaluations +across five benchmark datasets demonstrate that POPoS consistently outperforms +existing methods, particularly excelling in low-resolution heatmaps scenarios +with minimal computational overhead. These advantages make POPoS a highly +efficient and accurate tool for FLD, with broad applicability in real-world +scenarios. + +
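+ The pseudo-range idea can be pictured with the textbook linearised
+multilateration solve: given several anchor points and estimated distances
+(pseudo-ranges) decoded from the heatmap, the landmark position follows from a
+small least-squares system. POPoS's anchor selection, multilateration anchor
+loss, and parallel implementation are beyond this sketch.
+
+    import numpy as np
+
+    def multilaterate(anchors, ranges):
+        # anchors: (K, 2) known anchor positions; ranges: (K,) estimated
+        # distances to the unknown landmark. Subtracting the first range
+        # equation from the others linearises the problem.
+        a0, d0 = anchors[0], ranges[0]
+        A = 2.0 * (anchors[1:] - a0)
+        b = (d0 ** 2 - ranges[1:] ** 2
+             + np.sum(anchors[1:] ** 2, axis=1) - np.sum(a0 ** 2))
+        p, *_ = np.linalg.lstsq(A, b, rcond=None)
+        return p
+
+    anchors = np.array([[0., 0.], [10., 0.], [0., 10.], [10., 10.]])
+    landmark = np.array([3.0, 4.0])
+    ranges = np.linalg.norm(anchors - landmark, axis=1)
+    print(multilaterate(anchors, ranges))          # approximately [3. 4.]
+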
+
+ comment: Accepted to AAAI 2025, 9 pages, 6 figures. Code: + https://github.com/teslatasy/POPoS +
+
+
+
+
+ + ♻ ☆ Grid4D: 4D Decomposed Hash Encoding for High-fidelity Dynamic Gaussian + Splatting NeurIPS 2024 + + +
+ Recently, Gaussian splatting has received increasing attention in the field
+of static scene rendering. Due to the low computational overhead and inherent
+flexibility of explicit representations, plane-based explicit methods are
+popular ways to predict deformations for Gaussian-based dynamic scene
+rendering models. However, plane-based methods rely on the inappropriate
+low-rank assumption and excessively decompose the space-time 4D encoding,
+resulting in substantial feature overlap and unsatisfactory rendering quality.
+To tackle these problems, we propose Grid4D, a dynamic scene rendering model
+based on Gaussian splatting and employing a novel explicit encoding method for
+the 4D input through the hash encoding. Different from plane-based explicit
+representations, we decompose the 4D encoding into one spatial and three
+temporal 3D hash encodings without the low-rank assumption. Additionally, we
+design a novel attention module that generates the attention scores in a
+directional range to aggregate the spatial and temporal features. The
+directional attention enables Grid4D to more accurately fit the diverse
+deformations across distinct scene components based on the spatially encoded
+features. Moreover, to mitigate the inherent lack of smoothness in explicit
+representation methods, we introduce a smooth regularization term that keeps
+our model from producing chaotic deformation predictions. Our experiments
+demonstrate that Grid4D significantly outperforms the state-of-the-art models
+in visual quality and rendering speed.
+
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ LLaVA Needs More Knowledge: Retrieval Augmented Natural Language + Generation with Knowledge Graph for Explaining Thoracic Pathologies AAAI2025 + + +
+ Generating Natural Language Explanations (NLEs) for model predictions on +medical images, particularly those depicting thoracic pathologies, remains a +critical and challenging task. Existing methodologies often struggle due to +general models' insufficient domain-specific medical knowledge and privacy +concerns associated with retrieval-based augmentation techniques. To address +these issues, we propose a novel Vision-Language framework augmented with a +Knowledge Graph (KG)-based datastore, which enhances the model's understanding +by incorporating additional domain-specific medical knowledge essential for +generating accurate and informative NLEs. Our framework employs a KG-based +retrieval mechanism that not only improves the precision of the generated +explanations but also preserves data privacy by avoiding direct data retrieval. +The KG datastore is designed as a plug-and-play module, allowing for seamless +integration with various model architectures. We introduce and evaluate three +distinct frameworks within this paradigm: KG-LLaVA, which integrates the +pre-trained LLaVA model with KG-RAG; Med-XPT, a custom framework combining +MedCLIP, a transformer-based projector, and GPT-2; and Bio-LLaVA, which adapts +LLaVA by incorporating the Bio-ViT-L vision model. These frameworks are +validated on the MIMIC-NLE dataset, where they achieve state-of-the-art +results, underscoring the effectiveness of KG augmentation in generating +high-quality NLEs for thoracic pathologies. + +
+
+ comment: AAAI2025 +
+
+
+
+
+ + ♻ ☆ RoMeO: Robust Metric Visual Odometry + + +
+ Visual odometry (VO) aims to estimate camera poses from visual inputs -- a +fundamental building block for many applications such as VR/AR and robotics. +This work focuses on monocular RGB VO where the input is a monocular RGB video +without IMU or 3D sensors. Existing approaches lack robustness under this +challenging scenario and fail to generalize to unseen data (especially +outdoors); they also cannot recover metric-scale poses. We propose Robust +Metric Visual Odometry (RoMeO), a novel method that resolves these issues +leveraging priors from pre-trained depth models. RoMeO incorporates both +monocular metric depth and multi-view stereo (MVS) models to recover +metric-scale, simplify correspondence search, provide better initialization and +regularize optimization. Effective strategies are proposed to inject noise +during training and adaptively filter noisy depth priors, which ensure the +robustness of RoMeO on in-the-wild data. As shown in Fig.1, RoMeO advances the +state-of-the-art (SOTA) by a large margin across 6 diverse datasets covering +both indoor and outdoor scenes. Compared to the current SOTA DPVO, RoMeO +reduces the relative (align the trajectory scale with GT) and absolute +trajectory errors both by >50%. The performance gain also transfers to the full +SLAM pipeline (with global BA & loop closure). Code will be released upon +acceptance. + +
+
+
+
+
+ + ♻ ☆ Continual Learning: Forget-free Winning Subnetworks for Video + Representations + + +
+ Inspired by the Lottery Ticket Hypothesis (LTH), which highlights the +existence of efficient subnetworks within larger, dense networks, a +high-performing Winning Subnetwork (WSN) in terms of task performance under +appropriate sparsity conditions is considered for various continual learning +tasks. It leverages pre-existing weights from dense networks to achieve +efficient learning in Task Incremental Learning (TIL) and Task-agnostic +Incremental Learning (TaIL) scenarios. In Few-Shot Class Incremental Learning +(FSCIL), a variation of WSN referred to as the Soft subnetwork (SoftNet) is +designed to prevent overfitting when the data samples are scarce. Furthermore, +the sparse reuse of WSN weights is considered for Video Incremental Learning +(VIL). The use of Fourier Subneural Operator (FSO) within WSN is considered. It +enables compact encoding of videos and identifies reusable subnetworks across +varying bandwidths. We have integrated FSO into different architectural +frameworks for continual learning, including VIL, TIL, and FSCIL. Our +comprehensive experiments demonstrate FSO's effectiveness, significantly +improving task performance at various convolutional representational levels. +Specifically, FSO enhances higher-layer performance in TIL and FSCIL and +lower-layer performance in VIL. + +
+
+ comment: IEEE Transactions on Pattern Analysis and Machine Intelligence + (T-PAMI) +
+
+
+
+
+ + ♻ ☆ Recoverable Compression: A Multimodal Vision Token Recovery Mechanism + Guided by Text Information AAAI2025 + + +
+ With the advancement of large-scale language modeling techniques, large +multimodal models combining visual encoders with large language models have +demonstrated exceptional performance in various visual tasks. Most of the +current large-scale multimodal models achieve this by mapping visual features +obtained from the visual encoder into a large language model and using them as +inputs alongside text for downstream tasks. Therefore, the number of visual +tokens directly affects the training and inference speed of the model. There +has been significant work on token pruning for visual transformers, but for +large multimodal models, only relying on visual information for token pruning +or compression may lead to significant loss of important information. On the +other hand, the textual input in the form of a question may contain valuable +information that can aid in answering the question, providing additional +knowledge to the model. To address the potential oversimplification and +excessive pruning that can occur with most purely visual token pruning methods, +we propose a text information-guided dynamic visual token recovery mechanism +that does not require training. This mechanism leverages the similarity between +the question text and visual tokens to recover visually meaningful tokens with +important text information while merging other less important tokens. +Experimental results demonstrate that our proposed method achieves comparable +performance to the original approach while compressing the visual tokens to an +average of 10% of the original quantity. Our source code will be made publicly +available following acceptance. + +
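+ A minimal sketch of the text-guided recovery step follows; the keep ratio,
+the pooled text embedding, and the single merged token are simplifications of
+the mechanism described above.
+
+    import torch
+    import torch.nn.functional as F
+
+    def recover_tokens(vis_tokens, text_emb, keep_ratio=0.1):
+        # vis_tokens: (N, D) visual tokens; text_emb: (D,) pooled question
+        # embedding. Keep the tokens most similar to the question and merge
+        # the rest into one averaged token instead of dropping them outright.
+        sim = F.cosine_similarity(vis_tokens, text_emb.unsqueeze(0), dim=-1)
+        k = max(1, int(vis_tokens.shape[0] * keep_ratio))
+        keep = torch.zeros(vis_tokens.shape[0], dtype=torch.bool)
+        keep[sim.topk(k).indices] = True
+        if keep.all():
+            return vis_tokens
+        merged = vis_tokens[~keep].mean(dim=0, keepdim=True)
+        return torch.cat([vis_tokens[keep], merged], dim=0)   # (k + 1, D)
+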
+
+ comment: AAAI2025 Accepted +
+
+
+
+
+ + ♻ ☆ PALM: Pushing Adaptive Learning Rate Mechanisms for Continual Test-Time + Adaptation AAAI 2025 + + +
+ Real-world vision models in dynamic environments face rapid shifts in domain +distributions, leading to decreased recognition performance. Using unlabeled +test data, continuous test-time adaptation (CTTA) directly adjusts a +pre-trained source discriminative model to these changing domains. A highly +effective CTTA method involves applying layer-wise adaptive learning rates for +selectively adapting pre-trained layers. However, it suffers from the poor +estimation of domain shift and the inaccuracies arising from the pseudo-labels. +This work aims to overcome these limitations by identifying layers for +adaptation via quantifying model prediction uncertainty without relying on +pseudo-labels. We utilize the magnitude of gradients as a metric, calculated by +backpropagating the KL divergence between the softmax output and a uniform +distribution, to select layers for further adaptation. Subsequently, for the +parameters exclusively belonging to these selected layers, with the remaining +ones frozen, we evaluate their sensitivity to approximate the domain shift and +adjust their learning rates accordingly. We conduct extensive image +classification experiments on CIFAR-10C, CIFAR-100C, and ImageNet-C, +demonstrating the superior efficacy of our method compared to prior approaches. + +
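+ The layer-scoring step can be sketched as below; the KL direction and how the
+resulting ranking maps to per-layer learning rates are treated as assumptions
+here.
+
+    import torch
+    import torch.nn.functional as F
+
+    def layer_uncertainty_scores(model, x):
+        # Backpropagate KL(softmax(logits) || uniform) and use per-parameter
+        # gradient magnitudes as a pseudo-label-free uncertainty signal.
+        model.zero_grad()
+        logits = model(x)
+        p = F.softmax(logits, dim=-1)
+        log_u = -torch.log(torch.tensor(float(logits.shape[-1])))  # log(1/C)
+        kl = (p * (torch.log(p + 1e-12) - log_u)).sum(dim=-1).mean()
+        kl.backward()
+        return {name: param.grad.norm().item()
+                for name, param in model.named_parameters()
+                if param.grad is not None}
+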
+
+ comment: AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Skeleton-OOD: An End-to-End Skeleton-Based Model for Robust + Out-of-Distribution Human Action Detection + + +
+ Human action recognition is crucial in computer vision systems. However, in +real-world scenarios, human actions often fall outside the distribution of +training data, requiring a model to both recognize in-distribution (ID) actions +and reject out-of-distribution (OOD) ones. Despite its importance, there has +been limited research on OOD detection in human actions. Existing works on OOD +detection mainly focus on image data with RGB structure, and many methods are +post-hoc in nature. While these methods are convenient and computationally +efficient, they often lack sufficient accuracy, fail to consider the exposure +of OOD samples, and ignore the application in skeleton structure data. To +address these challenges, we propose a novel end-to-end skeleton-based model +called Skeleton-OOD, which is committed to improving the effectiveness of OOD +tasks while ensuring the accuracy of ID recognition. Through extensive +experiments conducted on NTU-RGB+D 60, NTU-RGB+D 120, and Kinetics-400 +datasets, Skeleton-OOD demonstrates the superior performance of our proposed +approach compared to state-of-the-art methods. Our findings underscore the +effectiveness of classic OOD detection techniques in the context of +skeleton-based action recognition tasks, offering promising avenues for future +research in this field. Code is available at +https://github.com/YilliaJing/Skeleton-OOD.git. + +
+
+ comment: Accepted by Neurocomputing +
+
+
+
+
+ + ♻ ☆ Diff-Shadow: Global-guided Diffusion Model for Shadow Removal AAAI + + +
+ We propose Diff-Shadow, a global-guided diffusion model for shadow removal. +Previous transformer-based approaches can utilize global information to relate +shadow and non-shadow regions but are limited in their synthesis ability and +recover images with obvious boundaries. In contrast, diffusion-based methods +can generate better content but they are not exempt from issues related to +inconsistent illumination. In this work, we combine the advantages of diffusion +models and global guidance to achieve shadow-free restoration. Specifically, we +propose a parallel UNets architecture: 1) the local branch performs the +patch-based noise estimation in the diffusion process, and 2) the global branch +recovers the low-resolution shadow-free images. A Reweight Cross Attention +(RCA) module is designed to integrate global contextual information of +non-shadow regions into the local branch. We further design a Global-guided +Sampling Strategy (GSS) that mitigates patch boundary issues and ensures +consistent illumination across shaded and unshaded regions in the recovered +image. Comprehensive experiments on datasets ISTD, ISTD+, and SRD have +demonstrated the effectiveness of Diff-Shadow. Compared to state-of-the-art +methods, our method achieves a significant improvement in terms of PSNR, +increasing from 32.33dB to 33.69dB on the ISTD dataset. + +
+
+ comment: Proceedings of the 39th Annual AAAI Conference on Artificial + Intelligence +
+
+
+
+
+ + ♻ ☆ Progressive Multi-granular Alignments for Grounded Reasoning in Large + Vision-Language Models + + +
+ Existing Large Vision-Language Models (LVLMs) excel at matching concepts +across multi-modal inputs but struggle with compositional concepts and +high-level relationships between entities. This paper introduces Progressive +multi-granular Vision-Language alignments (PromViL), a novel framework to +enhance LVLMs' ability in performing grounded compositional visual reasoning +tasks. Our approach constructs a hierarchical structure of multi-modal +alignments, ranging from simple to complex concepts. By progressively aligning +textual descriptions with corresponding visual regions, our model learns to +leverage contextual information from lower levels to inform higher-level +reasoning. To facilitate this learning process, we introduce a data generation +process that creates a novel dataset derived from Visual Genome, providing a +wide range of nested compositional vision-language pairs. Experimental results +demonstrate that our PromViL framework significantly outperforms baselines on +various visual grounding and compositional question answering tasks. The code +is available at: https://github.com/lqh52/PromViL. + +
+
+
+
+
+
+
+
+ + Information Retrieval 21 + +
+
+
+ + ☆ Nano-ESG: Extracting Corporate Sustainability Information from News + Articles ECIR 2025 + + +
+ Determining the sustainability impact of companies is a highly complex +subject which has garnered more and more attention over the past few years. +Today, investors largely rely on sustainability-ratings from established +rating-providers in order to analyze how responsibly a company acts. However, +those ratings have recently been criticized for being hard to understand and +nearly impossible to reproduce. + An independent way to find out about the sustainability practices of +companies lies in the rich landscape of news article data. In this paper, we +explore a different approach to identify key opportunities and challenges of +companies in the sustainability domain. We present a novel dataset of more than +840,000 news articles which were gathered for major German companies between +January 2023 and September 2024. By applying a mixture of Natural Language +Processing techniques, we first identify relevant articles, before summarizing +them and extracting their sustainability-related sentiment and aspect using +Large Language Models (LLMs). Furthermore, we conduct an evaluation of the +obtained data and determine that the LLM-produced answers are accurate. We +release both datasets at https://github.com/Bailefan/Nano-ESG. + +
+
+ comment: To be published at ECIR 2025. Preprint +
+
+
+
+
+ + ☆ DisCo: Graph-Based Disentangled Contrastive Learning for Cold-Start + Cross-Domain Recommendation + + +
+ Recommender systems are widely used in various real-world applications, but +they often encounter the persistent challenge of the user cold-start problem. +Cross-domain recommendation (CDR), which leverages user interactions from one +domain to improve prediction performance in another, has emerged as a promising +solution. However, users with similar preferences in the source domain may +exhibit different interests in the target domain. Therefore, directly +transferring embeddings may introduce irrelevant source-domain collaborative +information. In this paper, we propose a novel graph-based disentangled +contrastive learning framework to capture fine-grained user intent and filter +out irrelevant collaborative information, thereby avoiding negative transfer. +Specifically, for each domain, we use a multi-channel graph encoder to capture +diverse user intents. We then construct the affinity graph in the embedding +space and perform multi-step random walks to capture high-order user similarity +relationships. Treating one domain as the target, we propose a disentangled +intent-wise contrastive learning approach, guided by user similarity, to refine +the bridging of user intents across domains. Extensive experiments on four +benchmark CDR datasets demonstrate that DisCo consistently outperforms existing +state-of-the-art baselines, thereby validating the effectiveness of both DisCo +and its components. + +
+
+
+
+
+ + ☆ Spectrum-based Modality Representation Fusion Graph Convolutional + Network for Multimodal Recommendation WSDM + + +
+ Incorporating multi-modal features as side information has recently become a +trend in recommender systems. To elucidate user-item preferences, recent +studies focus on fusing modalities via concatenation, element-wise sum, or +attention mechanisms. Despite having notable success, existing approaches do +not account for the modality-specific noise encapsulated within each modality. +As a result, direct fusion of modalities will lead to the amplification of +cross-modality noise. Moreover, the variation of noise that is unique within +each modality results in noise alleviation and fusion being more challenging. +In this work, we propose a new Spectrum-based Modality Representation (SMORE) +fusion graph recommender that aims to capture both uni-modal and fusion +preferences while simultaneously suppressing modality noise. Specifically, +SMORE projects the multi-modal features into the frequency domain and leverages +the spectral space for fusion. To reduce dynamic contamination that is unique +to each modality, we introduce a filter to attenuate and suppress the modality +noise adaptively while capturing the universal modality patterns effectively. +Furthermore, we explore the item latent structures by designing a new +multi-modal graph learning module to capture associative semantic correlations +and universal fusion patterns among similar items. Finally, we formulate a new +modality-aware preference module, which infuses behavioral features and +balances the uni- and multi-modal features for precise preference modeling. +This empowers SMORE with the ability to infer both user modality-specific and +fusion preferences more accurately. Experiments on three real-world datasets +show the efficacy of our proposed model. The source code for this work has been +made publicly available at https://github.com/kennethorq/SMORE. + +
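+ A toy sketch of the frequency-domain step (per-modality learnable filters,
+spectral fusion, inverse transform) is shown below; SMORE's actual filter
+design and graph modules are richer than this.
+
+    import torch
+    import torch.nn as nn
+
+    class SpectralFusion(nn.Module):
+        def __init__(self, dim, n_modalities=2):
+            super().__init__()
+            # one learnable filter per modality over rfft frequency bins
+            self.filters = nn.Parameter(torch.ones(n_modalities, dim // 2 + 1))
+
+        def forward(self, feats):           # feats: (M, B, D) modality features
+            specs = torch.fft.rfft(feats, dim=-1)         # to frequency domain
+            filtered = specs * self.filters.unsqueeze(1)  # attenuate modality noise
+            fused = torch.fft.irfft(filtered.sum(dim=0), n=feats.shape[-1], dim=-1)
+            unimodal = torch.fft.irfft(filtered, n=feats.shape[-1], dim=-1)
+            return fused, unimodal           # fused and denoised uni-modal views
+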
+
+ comment: Accepted to ACM Web Search and Data Mining (WSDM) 2025 +
+
+
+
+
+ + ☆ ECLIPSE: Contrastive Dimension Importance Estimation with + Pseudo-Irrelevance Feedback for Dense Retrieval + + +
+ Recent advances in Information Retrieval have leveraged high-dimensional +embedding spaces to improve the retrieval of relevant documents. Moreover, the +Manifold Clustering Hypothesis suggests that despite these high-dimensional +representations, documents relevant to a query reside on a lower-dimensional, +query-dependent manifold. While this hypothesis has inspired new retrieval +methods, existing approaches still face challenges in effectively separating +non-relevant information from relevant signals. We propose a novel methodology +that addresses these limitations by leveraging information from both relevant +and non-relevant documents. Our method, ECLIPSE, computes a centroid based on +irrelevant documents as a reference to estimate noisy dimensions present in +relevant ones, enhancing retrieval performance. Extensive experiments on three +in-domain and one out-of-domain benchmarks demonstrate an average improvement +of up to 19.50% (resp. 22.35%) in mAP(AP) and 11.42% (resp. 13.10%) in nDCG@10 +w.r.t. the DIME-based baseline (resp. the baseline using all dimensions). Our +results pave the way for more robust, pseudo-irrelevance-based retrieval +systems in future IR research. + +
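+ A toy sketch of the centroid-based dimension filtering idea: dimensions where the query agrees with (pseudo-)relevant documents but not with the centroid of irrelevant ones are kept for re-scoring. The keep ratio and variable names are assumptions, not the paper's exact estimator:
+
+import numpy as np
+
+def dimension_mask(query, pos_docs, neg_docs, keep_ratio=0.5):
+    """Score each embedding dimension by its contribution to relevant vs.
+    irrelevant similarity and keep only the top fraction."""
+    neg_centroid = neg_docs.mean(axis=0)               # pseudo-irrelevant reference
+    pos_centroid = pos_docs.mean(axis=0)
+    importance = query * pos_centroid - query * neg_centroid
+    k = int(len(query) * keep_ratio)
+    mask = np.zeros_like(query)
+    mask[np.argsort(importance)[-k:]] = 1.0
+    return mask
+
+q = np.random.randn(768)
+docs = np.random.randn(100, 768)
+mask = dimension_mask(q, docs[:5], docs[-20:])         # top docs vs. bottom docs
+scores = (docs * mask) @ (q * mask)                    # re-score on selected dimensions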
+
+
+
+
+ + ☆ Progressive Multimodal Reasoning via Active Retrieval + + +
+ Multi-step multimodal reasoning tasks pose significant challenges for +multimodal large language models (MLLMs), and finding effective ways to enhance +their performance in such scenarios remains an unresolved issue. In this paper, +we propose AR-MCTS, a universal framework designed to progressively improve the +reasoning capabilities of MLLMs through Active Retrieval (AR) and Monte Carlo +Tree Search (MCTS). Our approach begins with the development of a unified +retrieval module that retrieves key supporting insights for solving complex +reasoning problems from a hybrid-modal retrieval corpus. To bridge the gap in +automated multimodal reasoning verification, we employ the MCTS algorithm +combined with an active retrieval mechanism, which enables the automatic +generation of step-wise annotations. This strategy dynamically retrieves key +insights for each reasoning step, moving beyond traditional beam search +sampling to improve the diversity and reliability of the reasoning space. +Additionally, we introduce a process reward model that aligns progressively to +support the automatic verification of multimodal reasoning tasks. Experimental +results across three complex multimodal reasoning benchmarks confirm the +effectiveness of the AR-MCTS framework in enhancing the performance of various +multimodal models. Further analysis demonstrates that AR-MCTS can optimize +sampling diversity and accuracy, yielding reliable multimodal reasoning. + +
+
+ comment: Work in progress

+
+
+
+
+ + ☆ Sliding Windows Are Not the End: Exploring Full Ranking with + Long-Context Large Language Models + + +
+ Large Language Models (LLMs) have shown exciting performance in listwise +passage ranking. Due to the limited input length, existing methods often adopt +the sliding window strategy. Such a strategy, though effective, is inefficient +as it involves repetitive and serialized processing, which usually re-evaluates +relevant passages multiple times. As a result, it incurs redundant API costs, +which are proportional to the number of inference tokens. The development of +long-context LLMs enables the full ranking of all passages within a single +inference, avoiding redundant API costs. In this paper, we conduct a +comprehensive study of long-context LLMs for ranking tasks in terms of +efficiency and effectiveness. Surprisingly, our experiments reveal that full +ranking with long-context LLMs can deliver superior performance in the +supervised fine-tuning setting with a huge efficiency improvement. Furthermore, +we identify two limitations of fine-tuning the full ranking model based on +existing methods: (1) sliding window strategy fails to produce a full ranking +list as a training label, and (2) the language modeling loss cannot emphasize +top-ranked passage IDs in the label. To alleviate these issues, we propose a +new complete listwise label construction approach and a novel importance-aware +learning objective for full ranking. Experiments show the superior performance +of our method over baselines. Our codes are available at +\url{https://github.com/8421BCD/fullrank}. + +
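+ One way to picture an importance-aware objective that emphasises top-ranked passage IDs is to weight the per-token loss by reciprocal rank; the sketch below uses a hypothetical weighting and is not necessarily the paper's exact formulation:
+
+import torch
+import torch.nn.functional as F
+
+def importance_aware_loss(logits, target_ids):
+    """Loss over a generated ranking list where tokens encoding top-ranked
+    passage IDs receive larger weights (1/rank here).
+
+    logits: (list_len, vocab); target_ids: (list_len,), one token per ranked ID.
+    """
+    per_token = F.cross_entropy(logits, target_ids, reduction="none")
+    ranks = torch.arange(1, target_ids.numel() + 1, device=target_ids.device)
+    weights = 1.0 / ranks.float()                      # emphasise the head of the list
+    return (weights * per_token).sum() / weights.sum()
+
+loss = importance_aware_loss(torch.randn(20, 32000), torch.randint(0, 32000, (20,)))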
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Efficient Self-Supervised Video Hashing with Selective State Spaces AAAI'25 + + +
+ Self-supervised video hashing (SSVH) is a practical task in video indexing +and retrieval. Although Transformers are predominant in SSVH for their +impressive temporal modeling capabilities, they often suffer from computational +and memory inefficiencies. Drawing inspiration from Mamba, an advanced +state-space model, we explore its potential in SSVH to achieve a better balance +between efficacy and efficiency. We introduce S5VH, a Mamba-based video hashing +model with an improved self-supervised learning paradigm. Specifically, we +design bidirectional Mamba layers for both the encoder and decoder, which are +effective and efficient in capturing temporal relationships thanks to the +data-dependent selective scanning mechanism with linear complexity. In our +learning strategy, we transform global semantics in the feature space into +semantically consistent and discriminative hash centers, followed by a center +alignment loss as a global learning signal. Our self-local-global (SLG) +paradigm significantly improves learning efficiency, leading to faster and +better convergence. Extensive experiments demonstrate S5VH's improvements over +state-of-the-art methods, superior transferability, and scalable advantages in +inference efficiency. Code is available at +https://github.com/gimpong/AAAI25-S5VH. + +
+
+ comment: Accepted by AAAI'25. 9 pages, 5 figures, 2 tables +
+
+
+
+
+ + ☆ Moving Beyond LDA: A Comparison of Unsupervised Topic Modelling + Techniques for Qualitative Data Analysis of Online Communities + + +
+ Social media constitutes a rich and influential source of information for qualitative researchers. Although computational techniques like topic modelling assist with managing the volume and diversity of social media content, qualitative researchers' lack of programming expertise creates a significant barrier to their adoption. In this paper, we explore how BERTopic, an advanced Large Language Model (LLM)-based topic modelling technique, can support qualitative data analysis of social media. We conducted interviews and hands-on evaluations in which qualitative researchers compared topics from three modelling techniques: LDA, NMF, and BERTopic. BERTopic was favoured by 8 of 12 participants for its ability to provide detailed, coherent clusters for deeper understanding and actionable insights. Participants also prioritised topic relevance, logical organisation, and the capacity to reveal unexpected relationships within the data. Our findings underscore the potential of LLM-based techniques for supporting qualitative analysis.
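+ A minimal comparison harness in the spirit of the study (LDA and NMF via scikit-learn, BERTopic via the bertopic package); the corpus and topic counts below are placeholders, not the study's actual setup:
+
+from sklearn.datasets import fetch_20newsgroups
+from sklearn.decomposition import LatentDirichletAllocation, NMF
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from bertopic import BERTopic
+
+docs = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes")).data[:2000]
+
+# Classical baselines: bag-of-words LDA and TF-IDF NMF with 20 topics each.
+bow = CountVectorizer(stop_words="english", max_features=5000).fit_transform(docs)
+lda = LatentDirichletAllocation(n_components=20, random_state=0).fit(bow)
+tfidf = TfidfVectorizer(stop_words="english", max_features=5000).fit_transform(docs)
+nmf = NMF(n_components=20, random_state=0).fit(tfidf)
+
+# Embedding-based BERTopic: clusters documents, then extracts topic keywords.
+topic_model = BERTopic()
+topics, probs = topic_model.fit_transform(docs)
+print(topic_model.get_topic_info().head())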
+
+
+
+
+ + ☆ HEC-GCN: Hypergraph Enhanced Cascading Graph Convolution Network for + Multi-Behavior Recommendation + + +
+ Multi-behavior recommendation (MBR) has garnered growing attention recently due to its ability to mitigate the sparsity issue by inferring user preferences from various auxiliary behaviors to improve predictions for the target behavior. Although existing research on MBR has yielded impressive results, it still faces two major limitations. First, previous methods mainly focus on modeling fine-grained interaction information between users and items under each behavior, which may suffer from the sparsity issue. Second, existing models usually concentrate on exploiting dependencies between two consecutive behaviors, leaving intra- and inter-behavior consistency largely unexplored. To this end, we propose a novel approach named Hypergraph Enhanced Cascading Graph Convolution Network for multi-behavior recommendation (HEC-GCN). To be specific, we first explore both fine- and coarse-grained correlations among users or items of each behavior by simultaneously modeling the behavior-specific interaction graph and its corresponding hypergraph in a cascaded manner. Then, we propose a behavior consistency-guided alignment strategy that ensures consistent representations between the interaction graph and its associated hypergraph for each behavior, while also maintaining representation consistency across different behaviors. Extensive experiments and analyses on three public benchmark datasets demonstrate that our proposed approach is consistently superior to previous state-of-the-art methods due to its capability to effectively attenuate the sparsity issue as well as preserve both intra- and inter-behavior consistencies. The code is available at https://github.com/marqu22/HEC-GCN.git.
+
+
+
+
+ + ☆ VISA: Retrieval Augmented Generation with Visual Source Attribution + + +
+ Generation with source attribution is important for enhancing the +verifiability of retrieval-augmented generation (RAG) systems. However, +existing approaches in RAG primarily link generated content to document-level +references, making it challenging for users to locate evidence among multiple +content-rich retrieved documents. To address this challenge, we propose +Retrieval-Augmented Generation with Visual Source Attribution (VISA), a novel +approach that combines answer generation with visual source attribution. +Leveraging large vision-language models (VLMs), VISA identifies the evidence +and highlights the exact regions that support the generated answers with +bounding boxes in the retrieved document screenshots. To evaluate its +effectiveness, we curated two datasets: Wiki-VISA, based on crawled Wikipedia +webpage screenshots, and Paper-VISA, derived from PubLayNet and tailored to the +medical domain. Experimental results demonstrate the effectiveness of VISA for +visual source attribution on documents' original look, as well as highlighting +the challenges for improvement. Code, data, and model checkpoints will be +released. + +
+
+
+
+
+ + ☆ Are Longer Prompts Always Better? Prompt Selection in Large Language + Models for Recommendation Systems + + +
+ In large language models (LLM)-based recommendation systems (LLM-RSs), +accurately predicting user preferences by leveraging the general knowledge of +LLMs is possible without requiring extensive training data. By converting +recommendation tasks into natural language inputs called prompts, LLM-RSs can +efficiently solve issues that have been difficult to address due to data +scarcity but are crucial in applications such as cold-start and cross-domain +problems. However, when applying this in practice, selecting the prompt that +matches tasks and data is essential. Although numerous prompts have been +proposed in LLM-RSs and representing the target user in prompts significantly +impacts recommendation accuracy, there are still no clear guidelines for +selecting specific prompts. + In this paper, we categorize and analyze prompts from previous research to +establish practical prompt selection guidelines. Through 450 experiments with +90 prompts and five real-world datasets, we examined the relationship between +prompts and dataset characteristics in recommendation accuracy. We found that +no single prompt consistently outperforms others; thus, selecting prompts on +the basis of dataset characteristics is crucial. Here, we propose a prompt +selection method that achieves higher accuracy with minimal validation data. +Because increasing the number of prompts to explore raises costs, we also +introduce a cost-efficient strategy using high-performance and cost-efficient +LLMs, significantly reducing exploration costs while maintaining high +prediction accuracy. Our work offers valuable insights into the prompt +selection, advancing accurate and efficient LLM-RSs. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ Metric Compatible Training for Online Backfilling in Large-Scale + Retrieval + + +
+ Backfilling is the process of re-extracting all gallery embeddings from +upgraded models in image retrieval systems. It inevitably requires a +prohibitively large amount of computational cost and even entails the downtime +of the service. Although backward-compatible learning sidesteps this challenge +by tackling query-side representations, this leads to suboptimal solutions in +principle because gallery embeddings cannot benefit from model upgrades. We +address this dilemma by introducing an online backfilling algorithm, which +enables us to achieve a progressive performance improvement during the +backfilling process while not sacrificing the final performance of new model +after the completion of backfilling. To this end, we first propose a simple +distance rank merge technique for online backfilling. Then, we incorporate a +reverse transformation module for more effective and efficient merging, which +is further enhanced by adopting a metric-compatible contrastive learning +approach. These two components help to make the distances of old and new models +compatible, resulting in desirable merge results during backfilling with no +extra computational overhead. Extensive experiments show the effectiveness of +our framework on four standard benchmarks in various settings. + +
+
+
+
+
+ + ♻ ☆ Agent-OM: Leveraging LLM Agents for Ontology Matching + + +
+ Ontology matching (OM) enables semantic interoperability between different +ontologies and resolves their conceptual heterogeneity by aligning related +entities. OM systems currently have two prevailing design paradigms: +conventional knowledge-based expert systems and newer machine learning-based +predictive systems. While large language models (LLMs) and LLM agents have +revolutionised data engineering and have been applied creatively in many +domains, their potential for OM remains underexplored. This study introduces a +novel agent-powered LLM-based design paradigm for OM systems. With +consideration of several specific challenges in leveraging LLM agents for OM, +we propose a generic framework, namely Agent-OM (Agent for Ontology Matching), +consisting of two Siamese agents for retrieval and matching, with a set of +simple OM tools. Our framework is implemented in a proof-of-concept system. +Evaluations of three Ontology Alignment Evaluation Initiative (OAEI) tracks +over state-of-the-art OM systems show that our system can achieve results very +close to the long-standing best performance on simple OM tasks and can +significantly improve the performance on complex and few-shot OM tasks. + +
+
+ comment: 19 pages, 13 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ DNS-Rec: Data-aware Neural Architecture Search for Recommender Systems + + +
+ In the era of data proliferation, efficiently sifting through vast +information to extract meaningful insights has become increasingly crucial. +This paper addresses the computational overhead and resource inefficiency +prevalent in existing Sequential Recommender Systems (SRSs). We introduce an +innovative approach combining pruning methods with advanced model designs. +Furthermore, we delve into resource-constrained Neural Architecture Search +(NAS), an emerging technique in recommender systems, to optimize models in +terms of FLOPs, latency, and energy consumption while maintaining or enhancing +accuracy. Our principal contribution is the development of a Data-aware Neural +Architecture Search for Recommender System (DNS-Rec). DNS-Rec is specifically +designed to tailor compact network architectures for attention-based SRS +models, thereby ensuring accuracy retention. It incorporates data-aware gates +to enhance the performance of the recommendation network by learning +information from historical user-item interactions. Moreover, DNS-Rec employs a +dynamic resource constraint strategy, stabilizing the search process and +yielding more suitable architectural solutions. We demonstrate the +effectiveness of our approach through rigorous experiments conducted on three +benchmark datasets, which highlight the superiority of DNS-Rec in SRSs. Our +findings set a new standard for future research in efficient and accurate +recommendation systems, marking a significant step forward in this rapidly +evolving field. + +
+
+
+
+
+ + ♻ ☆ Probability Distribution Learning and Its Application in Deep Learning + + +
+ This paper introduces a novel theoretical learning framework, termed +probability distribution learning (PD learning). Departing from the traditional +statistical learning framework, PD learning focuses on learning the underlying +probability distribution, which is modeled as a random variable within the +probability simplex. In this framework, the optimization objective is the +learning error, which quantifies the posterior expected discrepancy between the +model's predicted distribution and the underlying true distribution, given +available sample data and prior knowledge. To optimize the learning error, this +paper proposes the necessary conditions for loss functions, models, and +optimization algorithms, ensuring that these conditions are met in real-world +machine learning scenarios. Based on these conditions, the non-convex +optimization mechanism corresponding to model training can be theoretically +resolved. Moreover, this paper provides model-dependent and model-independent +bounds on learning error, offering new insights into the model's fitting and +generalization capabilities. Furthermore, the paper applies the PD learning +framework to elucidate the mechanisms by which various techniques, including +random parameter initialization, over-parameterization, and dropout, influence +deep model training. Finally, the paper substantiates the key conclusions of +the proposed framework through experimental results. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2105.04026 by other authors
+
+
+
+
+ + ♻ ☆ Lightning IR: Straightforward Fine-tuning and Inference of + Transformer-based Language Models for Information Retrieval WSDM'25 + + +
+ A wide range of transformer-based language models have been proposed for +information retrieval tasks. However, including transformer-based models in +retrieval pipelines is often complex and requires substantial engineering +effort. In this paper, we introduce Lightning IR, an easy-to-use PyTorch +Lightning-based framework for applying transformer-based language models in +retrieval scenarios. Lightning IR provides a modular and extensible +architecture that supports all stages of a retrieval pipeline: from fine-tuning +and indexing to searching and re-ranking. Designed to be scalable and +reproducible, Lightning IR is available as open-source: +https://github.com/webis-de/lightning-ir. + +
+
+ comment: Accepted as a demo at WSDM'25 +
+
+
+
+
+ + ♻ ☆ Distribution-Consistency-Guided Multi-modal Hashing + + +
+ Multi-modal hashing methods have gained popularity due to their fast speed +and low storage requirements. Among them, the supervised methods demonstrate +better performance by utilizing labels as supervisory signals compared with +unsupervised methods. Currently, for almost all supervised multi-modal hashing +methods, there is a hidden assumption that training sets have no noisy labels. +However, labels are often annotated incorrectly due to manual labeling in +real-world scenarios, which will greatly harm the retrieval performance. To +address this issue, we first discover a significant distribution consistency +pattern through experiments, i.e., the 1-0 distribution of the presence or +absence of each category in the label is consistent with the high-low +distribution of similarity scores of the hash codes relative to category +centers. Then, inspired by this pattern, we propose a novel +Distribution-Consistency-Guided Multi-modal Hashing (DCGMH), which aims to +filter and reconstruct noisy labels to enhance retrieval performance. +Specifically, the proposed method first randomly initializes several category +centers, which are used to compute the high-low distribution of similarity +scores; Noisy and clean labels are then separately filtered out via the +discovered distribution consistency pattern to mitigate the impact of noisy +labels; Subsequently, a correction strategy, which is indirectly designed via +the distribution consistency pattern, is applied to the filtered noisy labels, +correcting high-confidence ones while treating low-confidence ones as unlabeled +for unsupervised learning, thereby further enhancing the model's performance. +Extensive experiments on three widely used datasets demonstrate the superiority +of the proposed method compared to state-of-the-art baselines in multi-modal +retrieval tasks. The code is available at +https://github.com/LiuJinyu1229/DCGMH. + +
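+ A toy numpy sketch of the consistency check behind the noisy-label filter: compare each sample's 1-0 label pattern with the high-low pattern of its hash-code similarities to the category centers. The median split and the agreement threshold are assumptions for illustration:
+
+import numpy as np
+
+def flag_noisy_labels(hash_codes, centers, labels, agreement=0.8):
+    """Return True for samples whose label pattern disagrees with the
+    high/low pattern of similarities to category centers."""
+    sims = hash_codes @ centers.T                             # (n, n_classes)
+    high = sims > np.median(sims, axis=1, keepdims=True)      # high-low split
+    match = (high == labels.astype(bool)).mean(axis=1)        # per-sample agreement
+    return match < agreement
+
+codes = np.sign(np.random.randn(32, 64))        # toy +/-1 hash codes
+centers = np.sign(np.random.randn(10, 64))      # toy category centers
+labels = (np.random.rand(32, 10) > 0.7).astype(float)
+noisy = flag_noisy_labels(codes, centers, labels)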
+
+
+
+
+ + ♻ ☆ DLCRec: A Novel Approach for Managing Diversity in LLM-Based Recommender + Systems WSDM 2025 + + +
+ The integration of Large Language Models (LLMs) into recommender systems has led to substantial performance improvements. However, this often comes at the cost of diminished recommendation diversity, which can negatively impact user satisfaction. To address this issue, controllable recommendation has emerged as a promising approach, allowing users to specify their preferences and receive recommendations that meet their diverse needs. Despite its potential, existing controllable recommender systems frequently rely on simplistic mechanisms, such as a single prompt, to regulate diversity, an approach that falls short of capturing the full complexity of user preferences. In response to these limitations, we propose DLCRec, a novel framework designed to enable fine-grained control over diversity in LLM-based recommendations. Unlike traditional methods, DLCRec adopts a fine-grained task decomposition strategy, breaking down the recommendation process into three sequential sub-tasks: genre prediction, genre filling, and item prediction. These sub-tasks are trained independently and inferred sequentially according to user-defined control numbers, ensuring more precise control over diversity. Furthermore, the scarcity and uneven distribution of diversity-related user behavior data pose significant challenges for fine-tuning. To overcome these obstacles, we introduce two data augmentation techniques that enhance the model's robustness to noisy and out-of-distribution data. These techniques expose the model to a broader range of patterns, improving its adaptability in generating recommendations with varying levels of diversity. Our extensive empirical evaluation demonstrates that DLCRec not only provides precise control over diversity but also outperforms state-of-the-art baselines across multiple recommendation scenarios.
+
+ comment: Accepted by WSDM 2025 +
+
+
+
+
+ + ♻ ☆ SCONE: A Novel Stochastic Sampling to Generate Contrastive Views and + Hard Negative Samples for Recommendation WSDM 2025 + + +
+ Graph-based collaborative filtering (CF) has emerged as a promising approach +in recommender systems. Despite its achievements, graph-based CF models face +challenges due to data sparsity and negative sampling. In this paper, we +propose a novel Stochastic sampling for i) COntrastive views and ii) hard +NEgative samples (SCONE) to overcome these issues. SCONE generates dynamic +augmented views and diverse hard negative samples via a unified stochastic +sampling approach based on score-based generative models. Our extensive +experiments on 6 benchmark datasets show that SCONE consistently outperforms +state-of-the-art baselines. SCONE shows efficacy in addressing user sparsity +and item popularity issues, while enhancing performance for both cold-start +users and long-tail items. Furthermore, our approach improves the diversity of +the recommendation and the uniformity of the representations. The code is +available at https://github.com/jeongwhanchoi/SCONE. + +
+
+ comment: Accepted to WSDM 2025. Chaejeong Lee and Jeongwhan Choi are co-first + authors with equal contributions +
+
+
+
+
+ + ♻ ☆ WISE: Rethinking the Knowledge Memory for Lifelong Model Editing of + Large Language Models NeurIPS 2024 + + +
+ Large language models (LLMs) need knowledge updates to meet the ever-growing +world facts and correct the hallucinated responses, facilitating the methods of +lifelong model editing. Where the updated knowledge resides in memories is a +fundamental question for model editing. In this paper, we find that editing +either long-term memory (direct model parameters) or working memory +(non-parametric knowledge of neural network activations/representations by +retrieval) will result in an impossible triangle -- reliability, +generalization, and locality can not be realized together in the lifelong +editing settings. For long-term memory, directly editing the parameters will +cause conflicts with irrelevant pretrained knowledge or previous edits (poor +reliability and locality). For working memory, retrieval-based activations can +hardly make the model understand the edits and generalize (poor +generalization). Therefore, we propose WISE to bridge the gap between memories. +In WISE, we design a dual parametric memory scheme, which consists of the main +memory for the pretrained knowledge and a side memory for the edited knowledge. +We only edit the knowledge in the side memory and train a router to decide +which memory to go through when given a query. For continual editing, we devise +a knowledge-sharding mechanism where different sets of edits reside in distinct +subspaces of parameters, and are subsequently merged into a shared memory +without conflicts. Extensive experiments show that WISE can outperform previous +model editing methods and overcome the impossible triangle under lifelong model +editing of question answering, hallucination, and out-of-distribution settings +across trending LLM architectures, e.g., GPT, LLaMA, and Mistral. Code is +available at https://github.com/zjunlp/EasyEdit. + +
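+ A rough sketch of the dual-memory routing idea: a frozen main memory, a side memory that receives the edits, and a router that decides which memory answers each query. The threshold-based router below is an illustrative stand-in, not the paper's architecture:
+
+import torch
+import torch.nn as nn
+
+class DualMemoryFFN(nn.Module):
+    """Main FFN for pretrained knowledge plus a side FFN holding edits."""
+    def __init__(self, dim):
+        super().__init__()
+        self.main = nn.Linear(dim, dim)     # frozen pretrained memory
+        self.side = nn.Linear(dim, dim)     # receives the edited knowledge
+        self.router = nn.Linear(dim, 1)     # trained to detect edit-related queries
+
+    def forward(self, h):
+        use_side = torch.sigmoid(self.router(h)) > 0.5
+        return torch.where(use_side, self.side(h), self.main(h))
+
+layer = DualMemoryFFN(64)
+out = layer(torch.randn(4, 64))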
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Knowledge Circuits in Pretrained Transformers NeurIPS 2024 + + +
+ The remarkable capabilities of modern large language models are rooted in +their vast repositories of knowledge encoded within their parameters, enabling +them to perceive the world and engage in reasoning. The inner workings of how +these models store knowledge have long been a subject of intense interest and +investigation among researchers. To date, most studies have concentrated on +isolated components within these models, such as the Multilayer Perceptrons and +attention head. In this paper, we delve into the computation graph of the +language model to uncover the knowledge circuits that are instrumental in +articulating specific knowledge. The experiments, conducted with GPT2 and +TinyLLAMA, have allowed us to observe how certain information heads, relation +heads, and Multilayer Perceptrons collaboratively encode knowledge within the +model. Moreover, we evaluate the impact of current knowledge editing techniques +on these knowledge circuits, providing deeper insights into the functioning and +constraints of these editing methodologies. Finally, we utilize knowledge +circuits to analyze and interpret language model behaviors such as +hallucinations and in-context learning. We believe the knowledge circuits hold +potential for advancing our understanding of Transformers and guiding the +improved design of knowledge editing. Code and data are available in +https://github.com/zjunlp/KnowledgeCircuits. + +
+
+ comment: NeurIPS 2024, 26 pages +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Scaling 4D Representations + + +
+ Scaling has not yet been convincingly demonstrated for pure self-supervised learning from video. However, prior work has focused evaluations on semantic-related tasks such as action classification, ImageNet classification, etc. In this paper, we focus on evaluating self-supervised learning on non-semantic vision tasks that are more spatial (3D) and temporal (+1D = 4D), such as camera pose estimation, point and object tracking, and depth estimation. We show that by learning from very large video datasets, masked auto-encoding (MAE) with transformer video models actually scales, consistently improving performance on these 4D tasks as model size increases from 20M parameters all the way up to 22B parameters, by far the largest self-supervised video model reported to date. Rigorous apples-to-apples comparison with many recent image and video models demonstrates the benefits of scaling 4D representations.
+
+
+
+
+ + ☆ PRIMA: Multi-Image Vision-Language Models for Reasoning Segmentation + + +
+ Despite significant advancements in Large Vision-Language Models (LVLMs), +existing pixel-grounding models operate on single-image settings, limiting +their ability to perform detailed, fine-grained comparisons across multiple +images. Conversely, current multi-image understanding models lack pixel-level +grounding. Our work addresses this gap by introducing the task of multi-image +pixel-grounded reasoning segmentation, and PRIMA, a novel LVLM that integrates +pixel-level grounding with robust multi-image reasoning capabilities to produce +contextually rich, pixel-grounded explanations. Central to PRIMA is an +efficient vision module that queries fine-grained visual representations across +multiple images, reducing TFLOPs by $25.3\%$. To support training and +evaluation, we curate $M^4Seg$, a new reasoning segmentation benchmark +consisting of $\sim$224K question-answer pairs that require fine-grained visual +understanding across multiple images. Experimental results demonstrate PRIMA +outperforms state-of-the-art baselines. + +
+
+ comment: Project page: https://plan-lab.github.io/prima +
+
+
+
+
+ + ☆ OpenEMMA: Open-Source Multimodal Model for End-to-End Autonomous Driving + + +
+ Since the advent of Multimodal Large Language Models (MLLMs), they have made +a significant impact across a wide range of real-world applications, +particularly in Autonomous Driving (AD). Their ability to process complex +visual data and reason about intricate driving scenarios has paved the way for +a new paradigm in end-to-end AD systems. However, the progress of developing +end-to-end models for AD has been slow, as existing fine-tuning methods demand +substantial resources, including extensive computational power, large-scale +datasets, and significant funding. Drawing inspiration from recent advancements +in inference computing, we propose OpenEMMA, an open-source end-to-end +framework based on MLLMs. By incorporating the Chain-of-Thought reasoning +process, OpenEMMA achieves significant improvements compared to the baseline +when leveraging a diverse range of MLLMs. Furthermore, OpenEMMA demonstrates +effectiveness, generalizability, and robustness across a variety of challenging +driving scenarios, offering a more efficient and effective approach to +autonomous driving. We release all the codes in +https://github.com/taco-group/OpenEMMA. + +
+
+
+
+
+ + ☆ AutoTrust: Benchmarking Trustworthiness in Large Vision Language Models + for Autonomous Driving + + +
+ Recent advancements in large vision language models (VLMs) tailored for +autonomous driving (AD) have shown strong scene understanding and reasoning +capabilities, making them undeniable candidates for end-to-end driving systems. +However, limited work exists on studying the trustworthiness of DriveVLMs -- a +critical factor that directly impacts public transportation safety. In this +paper, we introduce AutoTrust, a comprehensive trustworthiness benchmark for +large vision-language models in autonomous driving (DriveVLMs), considering +diverse perspectives -- including trustfulness, safety, robustness, privacy, +and fairness. We constructed the largest visual question-answering dataset for +investigating trustworthiness issues in driving scenarios, comprising over 10k +unique scenes and 18k queries. We evaluated six publicly available VLMs, +spanning from generalist to specialist, from open-source to commercial models. +Our exhaustive evaluations have unveiled previously undiscovered +vulnerabilities of DriveVLMs to trustworthiness threats. Specifically, we found +that the general VLMs like LLaVA-v1.6 and GPT-4o-mini surprisingly outperform +specialized models fine-tuned for driving in terms of overall trustworthiness. +DriveVLMs like DriveLM-Agent are particularly vulnerable to disclosing +sensitive information. Additionally, both generalist and specialist VLMs remain +susceptible to adversarial attacks and struggle to ensure unbiased +decision-making across diverse environments and populations. Our findings call +for immediate and decisive action to address the trustworthiness of DriveVLMs +-- an issue of critical importance to public safety and the welfare of all +citizens relying on autonomous transportation systems. Our benchmark is +publicly available at \url{https://github.com/taco-group/AutoTrust}, and the +leaderboard is released at \url{https://taco-group.github.io/AutoTrust/}. + +
+
+ comment: 55 pages, 14 figures +
+
+
+
+
+ + ☆ LiDAR-RT: Gaussian-based Ray Tracing for Dynamic LiDAR Re-simulation + + +
+ This paper targets the challenge of real-time LiDAR re-simulation in dynamic +driving scenarios. Recent approaches utilize neural radiance fields combined +with the physical modeling of LiDAR sensors to achieve high-fidelity +re-simulation results. Unfortunately, these methods face limitations due to +high computational demands in large-scale scenes and cannot perform real-time +LiDAR rendering. To overcome these constraints, we propose LiDAR-RT, a novel +framework that supports real-time, physically accurate LiDAR re-simulation for +driving scenes. Our primary contribution is the development of an efficient and +effective rendering pipeline, which integrates Gaussian primitives and +hardware-accelerated ray tracing technology. Specifically, we model the +physical properties of LiDAR sensors using Gaussian primitives with learnable +parameters and incorporate scene graphs to handle scene dynamics. Building upon +this scene representation, our framework first constructs a bounding volume +hierarchy (BVH), then casts rays for each pixel and generates novel LiDAR views +through a differentiable rendering algorithm. Importantly, our framework +supports realistic rendering with flexible scene editing operations and various +sensor configurations. Extensive experiments across multiple public benchmarks +demonstrate that our method outperforms state-of-the-art methods in terms of +rendering quality and efficiency. Our project page is at +https://zju3dv.github.io/lidar-rt. + +
+
+ comment: Project page: https://zju3dv.github.io/lidar-rt +
+
+
+
+
+ + ☆ Preventing Local Pitfalls in Vector Quantization via Optimal Transport + + +
+ Vector-quantized networks (VQNs) have exhibited remarkable performance across +various tasks, yet they are prone to training instability, which complicates +the training process due to the necessity for techniques such as subtle +initialization and model distillation. In this study, we identify the local +minima issue as the primary cause of this instability. To address this, we +integrate an optimal transport method in place of the nearest neighbor search +to achieve a more globally informed assignment. We introduce OptVQ, a novel +vector quantization method that employs the Sinkhorn algorithm to optimize the +optimal transport problem, thereby enhancing the stability and efficiency of +the training process. To mitigate the influence of diverse data distributions +on the Sinkhorn algorithm, we implement a straightforward yet effective +normalization strategy. Our comprehensive experiments on image reconstruction +tasks demonstrate that OptVQ achieves 100% codebook utilization and surpasses +current state-of-the-art VQNs in reconstruction quality. + +
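+ A toy variant of Sinkhorn-style code assignment in place of per-token nearest-neighbour search: alternating row/column normalisation in log space encourages balanced codebook usage. The temperature and iteration count are illustrative, not the released OptVQ settings:
+
+import torch
+
+def sinkhorn_assign(tokens, codebook, eps=0.05, iters=3):
+    """Globally balanced token-to-code assignment (tokens: (N, d), codebook: (K, d))."""
+    log_p = torch.log_softmax(tokens @ codebook.t() / eps, dim=1)
+    for _ in range(iters):
+        log_p = log_p - torch.logsumexp(log_p, dim=0, keepdim=True)  # balance code usage
+        log_p = log_p - torch.logsumexp(log_p, dim=1, keepdim=True)  # one code per token
+    return log_p.argmax(dim=1)                                        # hard code index per token
+
+codes = sinkhorn_assign(torch.randn(1024, 64), torch.randn(512, 64))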
+
+ comment: Code is available at https://github.com/zbr17/OptVQ +
+
+
+
+
+ + ☆ AV-Link: Temporally-Aligned Diffusion Features for Cross-Modal + Audio-Video Generation + + +
+ We propose AV-Link, a unified framework for Video-to-Audio and Audio-to-Video +generation that leverages the activations of frozen video and audio diffusion +models for temporally-aligned cross-modal conditioning. The key to our +framework is a Fusion Block that enables bidirectional information exchange +between our backbone video and audio diffusion models through a +temporally-aligned self attention operation. Unlike prior work that uses +feature extractors pretrained for other tasks for the conditioning signal, +AV-Link can directly leverage features obtained by the complementary modality +in a single framework i.e. video features to generate audio, or audio features +to generate video. We extensively evaluate our design choices and demonstrate +the ability of our method to achieve synchronized and high-quality audiovisual +content, showcasing its potential for applications in immersive media +generation. Project Page: snap-research.github.io/AVLink/ + +
+
+ comment: Project Page: snap-research.github.io/AVLink/ +
+
+
+
+
+ + ☆ LlamaFusion: Adapting Pretrained Language Models for Multimodal + Generation + + +
+ We present LlamaFusion, a framework for empowering pretrained text-only large language models (LLMs) with multimodal generative capabilities, enabling them to understand and generate both text and images in arbitrary sequences. LlamaFusion leverages the existing Llama-3 weights for processing text autoregressively while introducing additional, parallel transformer modules for processing images with diffusion. During training, the data from each modality is routed to its dedicated modules: modality-specific feedforward layers, query-key-value projections, and normalization layers process each modality independently, while the shared self-attention layers allow interactions across text and image features. By freezing the text-specific modules and only training the image-specific modules, LlamaFusion preserves the language capabilities of text-only LLMs while developing strong visual understanding and generation abilities. Compared to methods that pretrain multimodal generative models from scratch, our experiments demonstrate that LlamaFusion improves image understanding by 20% and image generation by 3.6% using only 50% of the FLOPs while maintaining Llama-3's language capabilities. We also demonstrate that this framework can endow existing vision-language models with multimodal generation ability. Overall, this framework not only leverages existing computational investments in text-only LLMs but also enables the parallel development of language and vision capabilities, presenting a promising direction for efficient multimodal model development.
+
+
+
+
+ + ☆ Data for Mathematical Copilots: Better Ways of Presenting Proofs for + Machine Learning + + +
+ The suite of datasets commonly used to train and evaluate the mathematical capabilities of AI-based mathematical copilots (primarily large language models) exhibits several shortcomings. These limitations include a restricted scope of mathematical complexity (typically not exceeding lower undergraduate-level mathematics), binary rating protocols, and other issues, which make comprehensive proof-based evaluation suites difficult to build. We systematically explore these limitations and contend that enhancing the capabilities of large language models, or any forthcoming advancements in AI-based mathematical assistants (copilots or "thought partners"), necessitates a paradigm shift in the design of mathematical datasets and the evaluation criteria of mathematical ability: It is necessary to move away from result-based datasets (theorem statement to theorem proof) and convert the rich facets of mathematical research practice to data LLMs can train on. Examples of these are mathematical workflows (sequences of atomic, potentially subfield-dependent tasks that are often performed when creating new mathematics), which are an important part of the proof-discovery process. Additionally, we advocate for mathematical dataset developers to consider the concept of "motivated proof", introduced by G. Pólya in 1949, which can serve as a blueprint for datasets that offer a better proof learning signal, alleviating some of the mentioned limitations. Lastly, we introduce math datasheets for datasets, extending the general, dataset-agnostic variants of datasheets: We provide a questionnaire designed specifically for math datasets that we urge dataset creators to include with their datasets. This will make creators aware of potential limitations of their datasets while at the same time making it easy for readers to assess a dataset from the point of view of training and evaluating mathematical copilots.
+
+ comment: 40 pages +
+
+
+
+
+ + ☆ STRAP: Robot Sub-Trajectory Retrieval for Augmented Policy Learning + + +
+ Robot learning is witnessing a significant increase in the size, diversity, +and complexity of pre-collected datasets, mirroring trends in domains such as +natural language processing and computer vision. Many robot learning methods +treat such datasets as multi-task expert data and learn a multi-task, +generalist policy by training broadly across them. Notably, while these +generalist policies can improve the average performance across many tasks, the +performance of generalist policies on any one task is often suboptimal due to +negative transfer between partitions of the data, compared to task-specific +specialist policies. In this work, we argue for the paradigm of training +policies during deployment given the scenarios they encounter: rather than +deploying pre-trained policies to unseen problems in a zero-shot manner, we +non-parametrically retrieve and train models directly on relevant data at test +time. Furthermore, we show that many robotics tasks share considerable amounts +of low-level behaviors and that retrieval at the "sub"-trajectory granularity +enables significantly improved data utilization, generalization, and robustness +in adapting policies to novel problems. In contrast, existing full-trajectory +retrieval methods tend to underutilize the data and miss out on shared +cross-task content. This work proposes STRAP, a technique for leveraging +pre-trained vision foundation models and dynamic time warping to retrieve +sub-sequences of trajectories from large training corpora in a robust fashion. +STRAP outperforms both prior retrieval algorithms and multi-task learning +methods in simulated and real experiments, showing the ability to scale to much +larger offline datasets in the real world as well as the ability to learn +robust control policies with just a handful of real-world demonstrations. + +
+
+ comment: Project website at https://weirdlabuw.github.io/strap/ +
+
+
+
+
+ + ☆ HPC-Coder-V2: Studying Code LLMs Across Low-Resource Parallel Languages + + +
+ Large Language Model (LLM) based coding tools have been tremendously +successful as software development assistants, yet they are often designed for +general purpose programming tasks and perform poorly for more specialized +domains such as high performance computing. Creating specialized models and +tools for these domains is crucial towards gaining the benefits of LLMs in +areas such as HPC. While previous work has explored HPC-specific models, LLMs +still struggle to generate parallel code and it is not at all clear what +hurdles are still holding back these LLMs and what must be done to overcome +them. In this work, we conduct an in-depth study along the many axes of +fine-tuning a specialized HPC LLM in order to better understand the challenges. +Based on our findings we fine-tune and evaluate a specialized HPC LLM that is +shown to be the best performing open-source code LLM for parallel code +generation to date. + +
+
+
+
+
+ + ☆ Rethinking Uncertainty Estimation in Natural Language Generation + + +
+ Large Language Models (LLMs) are increasingly employed in real-world +applications, driving the need to evaluate the trustworthiness of their +generated text. To this end, reliable uncertainty estimation is essential. +Since current LLMs generate text autoregressively through a stochastic process, +the same prompt can lead to varying outputs. Consequently, leading uncertainty +estimation methods generate and analyze multiple output sequences to determine +the LLM's uncertainty. However, generating output sequences is computationally +expensive, making these methods impractical at scale. In this work, we inspect +the theoretical foundations of the leading methods and explore new directions +to enhance their computational efficiency. Building on the framework of proper +scoring rules, we find that the negative log-likelihood of the most likely +output sequence constitutes a theoretically grounded uncertainty measure. To +approximate this alternative measure, we propose G-NLL, which has the advantage +of being obtained using only a single output sequence generated by greedy +decoding. This makes uncertainty estimation more efficient and straightforward, +while preserving theoretical rigor. Empirical results demonstrate that G-NLL +achieves state-of-the-art performance across various LLMs and tasks. Our work +lays the foundation for efficient and reliable uncertainty estimation in +natural language generation, challenging the necessity of more computationally +involved methods currently leading the field. + +
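+ The G-NLL idea can be sketched with Hugging Face transformers: decode once greedily and negate the summed log-probabilities of the chosen tokens. The model name below is only an example:
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+model = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+inputs = tok("The capital of France is", return_tensors="pt")
+with torch.no_grad():
+    out = model.generate(**inputs, do_sample=False, max_new_tokens=10,
+                         return_dict_in_generate=True, output_scores=True)
+
+# Negative log-likelihood of the single greedy sequence = G-NLL.
+gen_tokens = out.sequences[0, inputs["input_ids"].shape[1]:]
+logps = [torch.log_softmax(s[0], dim=-1)[t] for s, t in zip(out.scores, gen_tokens)]
+g_nll = -torch.stack(logps).sum()
+print(tok.decode(gen_tokens), float(g_nll))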
+
+
+
+
+ + ☆ Operationalising Rawlsian Ethics for Fairness in Norm-Learning Agents AAAI 2025 + + +
+ Social norms are standards of behaviour common in a society. However, when +agents make decisions without considering how others are impacted, norms can +emerge that lead to the subjugation of certain agents. We present RAWL-E, a +method to create ethical norm-learning agents. RAWL-E agents operationalise +maximin, a fairness principle from Rawlsian ethics, in their decision-making +processes to promote ethical norms by balancing societal well-being with +individual goals. We evaluate RAWL-E agents in simulated harvesting scenarios. +We find that norms emerging in RAWL-E agent societies enhance social welfare, +fairness, and robustness, and yield higher minimum experience compared to those +that emerge in agent societies that do not implement Rawlsian ethics. + +
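+ The maximin principle itself is easy to state in code: among candidate actions, choose the one whose worst-off agent fares best. The payoff numbers below are a made-up harvesting example, not from the paper's simulations:
+
+def maximin_action(actions, payoffs):
+    """Rawlsian maximin: payoffs[action][agent] is that agent's expected well-being."""
+    return max(actions, key=lambda a: min(payoffs[a].values()))
+
+payoffs = {
+    "harvest_all": {"self": 10, "neighbour": 1},
+    "share":       {"self": 6,  "neighbour": 5},
+}
+print(maximin_action(payoffs.keys(), payoffs))   # -> 'share' (its minimum, 5, beats 1)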
+
+ comment: 14 pages, 7 figures, 8 tables (and supplementary material with + reproducibility and additional results), accepted at AAAI 2025 +
+
+
+
+
+ + ☆ Leveraging Color Channel Independence for Improved Unsupervised Object + Detection + + +
+ Object-centric architectures can learn to extract distinct object +representations from visual scenes, enabling downstream applications on the +object level. Similarly to autoencoder-based image models, object-centric +approaches have been trained on the unsupervised reconstruction loss of images +encoded by RGB color spaces. In our work, we challenge the common assumption +that RGB images are the optimal color space for unsupervised learning in +computer vision. We discuss conceptually and empirically that other color +spaces, such as HSV, bear essential characteristics for object-centric +representation learning, like robustness to lighting conditions. We further +show that models improve when requiring them to predict additional color +channels. Specifically, we propose to transform the predicted targets to the +RGB-S space, which extends RGB with HSV's saturation component and leads to +markedly better reconstruction and disentanglement for five common evaluation +datasets. The use of composite color spaces can be implemented with basically +no computational overhead, is agnostic of the models' architecture, and is +universally applicable across a wide range of visual computing tasks and +training types. The findings of our approach encourage additional +investigations in computer vision tasks beyond object-centric learning. + +
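+ Constructing the RGB-S reconstruction target amounts to appending the HSV saturation channel to the RGB target; a small sketch using the standard HSV saturation formula (the function name is illustrative):
+
+import numpy as np
+
+def rgb_to_rgbs(rgb):
+    """Append HSV saturation S = (max - min) / max as a fourth channel.
+    rgb: float array in [0, 1] with shape (..., 3); returns shape (..., 4)."""
+    cmax = rgb.max(axis=-1)
+    cmin = rgb.min(axis=-1)
+    sat = np.where(cmax > 0, (cmax - cmin) / np.clip(cmax, 1e-8, None), 0.0)
+    return np.concatenate([rgb, sat[..., None]], axis=-1)
+
+image = np.random.rand(64, 64, 3)    # H x W x RGB in [0, 1]
+target = rgb_to_rgbs(image)          # H x W x RGB-S reconstruction target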
+
+ comment: 38 pages incl. references, 16 figures +
+
+
+
+
+ + ☆ Jet: A Modern Transformer-Based Normalizing Flow + + +
+ Normalizing generative flows emerged early on as a promising class of generative models for natural images. This type of model has many modeling advantages: the ability to efficiently compute the log-likelihood of the input data, fast generation, and a simple overall structure. Normalizing flows remained a topic of active research but later fell out of favor, as the visual quality of their samples was not competitive with other model classes such as GANs, VQ-VAE-based approaches, or diffusion models. In this paper, we revisit the design of coupling-based normalizing flow models by carefully ablating prior design choices and using computational blocks based on the Vision Transformer architecture rather than convolutional neural networks. As a result, we achieve state-of-the-art quantitative and qualitative performance with a much simpler architecture. While the overall visual quality is still behind the current state-of-the-art models, we argue that strong normalizing flow models can help advance the research frontier by serving as building components of more powerful generative models.
+
+
+
+
+ + ☆ Adaptive Pruning for Large Language Models with Structural Importance + Awareness + + +
+ The recent advancements in large language models (LLMs) have significantly +improved language understanding and generation capabilities. However, it is +difficult to deploy LLMs on resource-constrained edge devices due to their high +computational and storage resource demands. To address this issue, we propose a +novel LLM model pruning method, namely structurally-aware adaptive pruning +(SAAP), to significantly reduce the computational and memory costs while +maintaining model performance. We first define an adaptive importance fusion +metric to evaluate the importance of all coupled structures in LLMs by +considering their homoscedastic uncertainty. Then, we rank the importance of +all modules to determine the specific layers that should be pruned to meet +particular performance requirements. Furthermore, we develop a new group +fine-tuning strategy to improve the inference efficiency of LLMs. Finally, we +evaluate the proposed SAAP method on multiple LLMs across two common tasks, +i.e., zero-shot classification and text generation. Experimental results show +that our SAAP method outperforms several state-of-the-art baseline methods, +achieving 2.17%, 2.37%, and 2.39% accuracy gains on LLaMA-7B, Vicuna-7B, and +LLaMA-13B. Additionally, SAAP improves the token generation speed by 5%, +showcasing its practical advantages in resource-constrained scenarios. + +
+
+ comment: 12 pages, 6 figures, 12 tables +
+
+
+
+
+ + ☆ Outcome-Refining Process Supervision for Code Generation + + +
+ Large Language Models have demonstrated remarkable capabilities in code generation, yet they often struggle with complex programming tasks that require deep algorithmic reasoning. While process supervision through learned reward models shows promise in guiding reasoning steps, it requires expensive training data and suffers from unreliable evaluation. We propose Outcome-Refining Process Supervision, a novel paradigm that treats outcome refinement itself as the process to be supervised. Our framework leverages concrete execution signals to ground the supervision of reasoning steps, while using tree-structured exploration to maintain multiple solution trajectories simultaneously. Experiments demonstrate that our approach enables even smaller models to achieve high success rates and strong performance metrics on competitive programming tasks, and creates more reliable verification than traditional reward models without requiring PRM training. Our approach achieves significant improvements across 5 models and 3 datasets: an average increase of 26.9% in correctness and 42.2% in efficiency. The results suggest that providing a structured reasoning space with concrete verification signals is crucial for solving complex programming tasks. We open-source all our code and data at: https://github.com/zhuohaoyu/ORPS
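+ A minimal sketch of grounding supervision in execution signals: score each candidate refinement by the fraction of test cases it passes and keep the best-scoring candidates as the search frontier. The harness below is a hypothetical illustration and assumes a local `python` interpreter:
+
+import subprocess, tempfile, textwrap
+
+def execution_score(candidate_code, tests):
+    """Fraction of (stdin, expected_stdout) tests that a candidate program passes."""
+    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
+        f.write(textwrap.dedent(candidate_code))
+        path = f.name
+    passed = 0
+    for stdin, expected in tests:
+        try:
+            run = subprocess.run(["python", path], input=stdin, text=True,
+                                 capture_output=True, timeout=5)
+            passed += int(run.stdout.strip() == expected.strip())
+        except subprocess.TimeoutExpired:
+            pass
+    return passed / max(len(tests), 1)
+
+def select_frontier(candidates, tests, beam=3):
+    """Keep the top-scoring refinements as the frontier of the search tree."""
+    return sorted(candidates, key=lambda c: execution_score(c, tests), reverse=True)[:beam]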
+
+ comment: 18 pages, 5 figures, Code: https://github.com/zhuohaoyu/ORPS +
+
+
+
+
+ + ☆ Tests for model misspecification in simulation-based inference: from + local distortions to global model checks + + +
+ Model misspecification analysis strategies, such as anomaly detection, model +validation, and model comparison are a key component of scientific model +development. Over the last few years, there has been a rapid rise in the use of +simulation-based inference (SBI) techniques for Bayesian parameter estimation, +applied to increasingly complex forward models. To move towards fully +simulation-based analysis pipelines, however, there is an urgent need for a +comprehensive simulation-based framework for model misspecification analysis. +In this work, we provide a solid and flexible foundation for a wide range of +model discrepancy analysis tasks, using distortion-driven model +misspecification tests. From a theoretical perspective, we introduce the +statistical framework built around performing many hypothesis tests for +distortions of the simulation model. We also make explicit analytic connections +to classical techniques: anomaly detection, model validation, and +goodness-of-fit residual analysis. Furthermore, we introduce an efficient +self-calibrating training algorithm that is useful for practitioners. We +demonstrate the performance of the framework in multiple scenarios, making the +connection to classical results where they are valid. Finally, we show how to +conduct such a distortion-driven model misspecification test for real +gravitational wave data, specifically on the event GW150914. + +
+
+ comment: 11 pages, 5 figures. Code available on github (NoemiAM/mist) at + https://github.com/NoemiAM/mist +
+
+
+
+
+ + ☆ A Full Transformer-based Framework for Automatic Pain Estimation using + Videos + + +
+ The automatic estimation of pain is essential in designing an optimal pain +management system offering reliable assessment and reducing the suffering of +patients. In this study, we present a novel full transformer-based framework +consisting of a Transformer in Transformer (TNT) model and a Transformer +leveraging cross-attention and self-attention blocks. Elaborating on videos +from the BioVid database, we demonstrate state-of-the-art performances, showing +the efficacy, efficiency, and generalization capability across all the primary +pain estimation tasks. + +
+
+
+
+
+ + ☆ Learning Disentangled Equivariant Representation for Explicitly + Controllable 3D Molecule Generation AAAI 2025 + + +
+ We consider the conditional generation of 3D drug-like molecules with +\textit{explicit control} over molecular properties such as drug-like +properties (e.g., Quantitative Estimate of Druglikeness or Synthetic +Accessibility score) and effectively binding to specific protein sites. To +tackle this problem, we propose an E(3)-equivariant Wasserstein autoencoder and +factorize the latent space of our generative model into two disentangled +aspects: molecular properties and the remaining structural context of 3D +molecules. Our model ensures explicit control over these molecular attributes +while maintaining equivariance of coordinate representation and invariance of +data likelihood. Furthermore, we introduce a novel alignment-based coordinate +loss to adapt equivariant networks for auto-regressive de-novo 3D molecule +generation from scratch. Extensive experiments validate our model's +effectiveness on property-guided and context-guided molecule generation, both +for de-novo 3D molecule design and structure-based drug discovery against +protein targets. + +
+
+ comment: AAAI 2025 +
+
+
+
+
+ + ☆ AceMath: Advancing Frontier Math Reasoning with Post-Training and Reward + Modeling + + +
+ In this paper, we introduce AceMath, a suite of frontier math models that +excel in solving complex math problems, along with highly effective reward +models capable of evaluating generated solutions and reliably identifying the +correct ones. To develop the instruction-tuned math models, we propose a +supervised fine-tuning (SFT) process that first achieves competitive +performance across general domains, followed by targeted fine-tuning for the +math domain using a carefully curated set of prompts and synthetically +generated responses. The resulting model, AceMath-72B-Instruct greatly +outperforms Qwen2.5-Math-72B-Instruct, GPT-4o and Claude-3.5 Sonnet. To develop +math-specialized reward model, we first construct AceMath-RewardBench, a +comprehensive and robust benchmark for evaluating math reward models across +diverse problems and difficulty levels. After that, we present a systematic +approach to build our math reward models. The resulting model, AceMath-72B-RM, +consistently outperforms state-of-the-art reward models. Furthermore, when +combining AceMath-72B-Instruct with AceMath-72B-RM, we achieve the highest +average rm@8 score across the math reasoning benchmarks. We will release model +weights, training data, and evaluation benchmarks at: +https://research.nvidia.com/labs/adlr/acemath + +
+
+
+
+
+ + ☆ Till the Layers Collapse: Compressing a Deep Neural Network through the + Lenses of Batch Normalization Layers AAAI 2025 + + +
+ Today, deep neural networks are widely used since they can handle a variety +of complex tasks. Their generality makes them very powerful tools in modern +technology. However, deep neural networks are often overparameterized. The +usage of these large models consumes a lot of computation resources. In this +paper, we introduce a method called \textbf{T}ill the \textbf{L}ayers +\textbf{C}ollapse (TLC), which compresses deep neural networks through the +lenses of batch normalization layers. By reducing the depth of these networks, +our method decreases deep neural networks' computational requirements and +overall latency. We validate our method on popular models such as Swin-T, +MobileNet-V2, and RoBERTa, across both image classification and natural +language processing (NLP) tasks. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ☆ DroughtSet: Understanding Drought Through Spatial-Temporal Learning AAAI25 + + +
+ Drought is one of the most destructive and expensive natural disasters, +severely impacting natural resources and risks by depleting water resources and +diminishing agricultural yields. Under climate change, accurately predicting +drought is critical for mitigating drought-induced risks. However, the +intricate interplay among the physical and biological drivers that regulate +droughts limits the predictability and understanding of drought, particularly +at a subseasonal to seasonal (S2S) time scale. While deep learning has been +demonstrated with potential in addressing climate forecasting challenges, its +application to drought prediction has received relatively less attention. In +this work, we propose a new dataset, DroughtSet, which integrates relevant +predictive features and three drought indices from multiple remote sensing and +reanalysis datasets across the contiguous United States (CONUS). DroughtSet +specifically provides the machine learning community with a new real-world +dataset to benchmark drought prediction models and more generally, time-series +forecasting methods. Furthermore, we propose a spatial-temporal model SPDrought +to predict and interpret S2S droughts. Our model learns from the spatial and +temporal information of physical and biological features to predict three types +of droughts simultaneously. Multiple strategies are employed to quantify the +importance of physical and biological features for drought prediction. Our +results provide insights for researchers to better understand the +predictability and sensitivity of drought to biological and physical +conditions. We aim to contribute to the climate field by proposing a new tool +to predict and understand the occurrence of droughts and provide the AI +community with a new benchmark to study deep learning applications in climate +science. + +
+
+ comment: Accepted by AAAI25 +
+
+
+
+
+ + ☆ MultiverSeg: Scalable Interactive Segmentation of Biomedical Imaging + Datasets with In-Context Guidance + + +
+ Medical researchers and clinicians often need to perform novel segmentation +tasks on a set of related images. Existing methods for segmenting a new dataset +are either interactive, requiring substantial human effort for each image, or +require an existing set of manually labeled images. We introduce a system, +MultiverSeg, that enables practitioners to rapidly segment an entire new +dataset without requiring access to any existing labeled data from that task or +domain. Along with the image to segment, the model takes user interactions such +as clicks, bounding boxes or scribbles as input, and predicts a segmentation. +As the user segments more images, those images and segmentations become +additional inputs to the model, providing context. As the context set of +labeled images grows, the number of interactions required to segment each new +image decreases. We demonstrate that MultiverSeg enables users to interactively +segment new datasets efficiently, by amortizing the number of interactions per +image to achieve an accurate segmentation. Compared to using a state-of-the-art +interactive segmentation method, using MultiverSeg reduced the total number of +scribble steps by 53% and clicks by 36% to achieve 90% Dice on sets of images +from unseen tasks. We release code and model weights at +https://multiverseg.csail.mit.edu + +
+
+ comment: Project Website: https://multiverseg.csail.mit.edu Keywords: + interactive segmentation, in-context learning, medical image analysis, + biomedical imaging, image annotation, visual prompting +
+
+
+
+
+ + ☆ DCTdiff: Intriguing Properties of Image Generative Modeling in the DCT + Space + + +
+ This paper explores image modeling from the frequency space and introduces +DCTdiff, an end-to-end diffusion generative paradigm that efficiently models +images in the discrete cosine transform (DCT) space. We investigate the design +space of DCTdiff and reveal the key design factors. Experiments on different +frameworks (UViT, DiT), generation tasks, and various diffusion samplers +demonstrate that DCTdiff outperforms pixel-based diffusion models regarding +generative quality and training efficiency. Remarkably, DCTdiff can seamlessly +scale up to high-resolution generation without using the latent diffusion +paradigm. Finally, we illustrate several intriguing properties of DCT image +modeling. For example, we provide a theoretical proof of why `image diffusion +can be seen as spectral autoregression', bridging the gap between diffusion and +autoregressive models. The effectiveness of DCTdiff and the introduced +properties suggest a promising direction for image modeling in the frequency +space. The code is at \url{https://github.com/forever208/DCTdiff}. + +
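+
+ The abstract does not give implementation details; as a rough, hedged
+illustration of what "modeling images in the DCT space" involves, the short
+sketch below (an assumption, not the paper's code) maps an image to block-wise
+DCT coefficients and back with SciPy, which is the representation a diffusion
+model would then be trained on instead of raw pixels.
+
+import numpy as np
+from scipy.fft import dctn, idctn
+
+def blockwise_dct(image, block=8):
+    """DCT each non-overlapping block x block tile of a (H, W) image."""
+    h, w = image.shape
+    coeffs = np.zeros_like(image, dtype=np.float64)
+    for i in range(0, h, block):
+        for j in range(0, w, block):
+            coeffs[i:i + block, j:j + block] = dctn(
+                image[i:i + block, j:j + block], norm="ortho")
+    return coeffs
+
+def blockwise_idct(coeffs, block=8):
+    """Inverse transform back to pixel space."""
+    h, w = coeffs.shape
+    image = np.zeros_like(coeffs)
+    for i in range(0, h, block):
+        for j in range(0, w, block):
+            image[i:i + block, j:j + block] = idctn(
+                coeffs[i:i + block, j:j + block], norm="ortho")
+    return image
+
+img = np.random.rand(64, 64)              # toy image with side divisible by 8
+assert np.allclose(img, blockwise_idct(blockwise_dct(img)))
+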
+
+ comment: 23 pages +
+
+
+
+
+ + ☆ Stable-V2A: Synthesis of Synchronized Sound Effects with Temporal and + Semantic Controls + + +
+ Sound designers and Foley artists usually sonorize a scene, such as from a
+movie or video game, by manually annotating and sonorizing each action of
+interest in the video. In our case, the intent is to leave full creative
+control to sound designers with a tool that allows them to bypass the more
+repetitive parts of their work, thus being able to focus on the creative
+aspects of sound production. We achieve this by presenting Stable-V2A, a
+two-stage model consisting of: an RMS-Mapper that estimates an envelope
+representative of the audio characteristics associated with the input video;
+and Stable-Foley, a diffusion model based on Stable Audio Open that generates
+audio semantically and temporally aligned with the target video. Temporal
+alignment is guaranteed by the use of the envelope as a ControlNet input, while
+semantic alignment is achieved through the use of sound representations chosen
+by the designer as cross-attention conditioning of the diffusion process. We
+train and test our model on Greatest Hits, a dataset commonly used to evaluate
+V2A models. In addition, to test our model on a case study of interest, we
+introduce Walking The Maps, a dataset of videos extracted from video games
+depicting animated characters walking in different locations. Samples and code
+are available on our demo page at https://ispamm.github.io/Stable-V2A.
+
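+
+ As a minimal sketch of the kind of control signal the RMS-Mapper targets
+(assuming librosa and placeholder frame/hop sizes; not the authors' code), a
+frame-wise RMS envelope can be extracted from a reference audio track like
+this:
+
+import librosa
+
+def rms_envelope(audio_path, frame_length=2048, hop_length=512):
+    """Frame-wise RMS energy, normalised to [0, 1], one value per hop."""
+    y, sr = librosa.load(audio_path, sr=None, mono=True)
+    rms = librosa.feature.rms(y=y, frame_length=frame_length,
+                              hop_length=hop_length)[0]
+    return rms / (rms.max() + 1e-8)
+
+# env = rms_envelope("foley_take.wav")  # hypothetical file; the envelope then
+# conditions the diffusion model as a ControlNet-style input.
+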
+
+
+
+
+ + ☆ Robust Federated Learning in the Face of Covariate Shift: A Magnitude + Pruning with Hybrid Regularization Framework for Enhanced Model Aggregation + + +
+ The development of highly sophisticated neural networks has allowed for fast +progress in every field of computer vision, however, applications where +annotated data is prohibited due to privacy or security concerns remain +challenging. Federated Learning (FL) offers a promising framework for +individuals aiming to collaboratively develop a shared model while preserving +data privacy. Nevertheless, our findings reveal that variations in data +distribution among clients can profoundly affect FL methodologies, primarily +due to instabilities in the aggregation process. We also propose a novel FL +framework to mitigate the adverse effects of covariate shifts among federated +clients by combining individual parameter pruning and regularization techniques +to improve the robustness of individual clients' models to aggregate. Each +client's model is optimized through magnitude-based pruning and the addition of +dropout and noise injection layers to build more resilient decision pathways in +the networks and improve the robustness of the model's parameter aggregation +step. The proposed framework is capable of extracting robust representations +even in the presence of very large covariate shifts among client data +distributions and in the federation of a small number of clients. Empirical +findings substantiate the effectiveness of our proposed methodology across +common benchmark datasets, including CIFAR10, MNIST, SVHN, and Fashion MNIST. +Furthermore, we introduce the CelebA-Gender dataset, specifically designed to +evaluate performance on a more realistic domain. The proposed method is capable +of extracting robust representations even in the presence of both high and low +covariate shifts among client data distributions. + +
+
+
+
+
+ + ☆ DisCo: Graph-Based Disentangled Contrastive Learning for Cold-Start + Cross-Domain Recommendation + + +
+ Recommender systems are widely used in various real-world applications, but +they often encounter the persistent challenge of the user cold-start problem. +Cross-domain recommendation (CDR), which leverages user interactions from one +domain to improve prediction performance in another, has emerged as a promising +solution. However, users with similar preferences in the source domain may +exhibit different interests in the target domain. Therefore, directly +transferring embeddings may introduce irrelevant source-domain collaborative +information. In this paper, we propose a novel graph-based disentangled +contrastive learning framework to capture fine-grained user intent and filter +out irrelevant collaborative information, thereby avoiding negative transfer. +Specifically, for each domain, we use a multi-channel graph encoder to capture +diverse user intents. We then construct the affinity graph in the embedding +space and perform multi-step random walks to capture high-order user similarity +relationships. Treating one domain as the target, we propose a disentangled +intent-wise contrastive learning approach, guided by user similarity, to refine +the bridging of user intents across domains. Extensive experiments on four +benchmark CDR datasets demonstrate that DisCo consistently outperforms existing +state-of-the-art baselines, thereby validating the effectiveness of both DisCo +and its components. + +
+
+
+
+
+ + ☆ Stitch Contrast and Segment_Learning a Human Action Segmentation Model + Using Trimmed Skeleton Videos AAAI 2025 + + +
+ Existing skeleton-based human action classification models rely on
+well-trimmed action-specific skeleton videos for both training and testing,
+precluding their scalability to real-world applications where untrimmed videos
+exhibiting concatenated actions are predominant. To overcome this limitation,
+recently introduced skeleton action segmentation models incorporate untrimmed
+skeleton videos into end-to-end training. The model is optimized to provide
+frame-wise predictions for any length of testing videos, simultaneously
+realizing action localization and classification. Yet, achieving such an
+improvement requires frame-wise annotated skeleton videos, which remains
+time-consuming in practice. This paper features a novel framework for
+skeleton-based action segmentation trained on short trimmed skeleton videos,
+but that can run on longer untrimmed videos. The approach is implemented in
+three steps: Stitch, Contrast, and Segment. First, Stitch proposes a temporal
+skeleton stitching scheme that treats trimmed skeleton videos as elementary
+human motions that compose a semantic space and can be sampled to generate
+multi-action stitched sequences. Contrast learns contrastive representations
+from stitched sequences with a novel discrimination pretext task that enables a
+skeleton encoder to learn meaningful action-temporal contexts to improve action
+segmentation. Finally, Segment relates the proposed method to action
+segmentation by learning a segmentation layer while handling particular data
+availability. Experiments involve a trimmed source dataset and an untrimmed
+target dataset in an adaptation formulation for real-world skeleton-based human
+action segmentation to evaluate the effectiveness of the proposed method.
+
+
+ comment: Accepted as AAAI 2025 +
+
+
+
+
+ + ☆ Knowledge Injection via Prompt Distillation + + +
+ In many practical applications, large language models (LLMs) need to +incorporate new knowledge not present in their pre-training data. The primary +methods for this are fine-tuning and retrieval-augmented generation (RAG). +Although RAG has emerged as the industry standard for knowledge injection, +fine-tuning has not yet achieved comparable success. In this paper, we propose +a new fine-tuning technique for learning new knowledge and show that it can +reach the performance of RAG. The proposed method is based on the +self-distillation approach, which we call prompt distillation. First, we +generate question-answer pairs about the new knowledge. Then, we fine-tune a +student model on the question-answer pairs to imitate the output distributions +of a teacher model, which additionally receives the new knowledge in its +prompt. The student model is identical to the teacher, except it is equipped +with a LoRA adapter. This training procedure facilitates distilling the new +knowledge from the teacher's prompt into the student's weights. + +
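+
+ A compact sketch of the distillation objective described above (the names and
+the logits/answer-span bookkeeping are assumptions; only the loss is shown,
+not the authors' training code):
+
+import torch
+import torch.nn.functional as F
+
+def prompt_distillation_loss(teacher_answer_logits, student_answer_logits,
+                             temperature=1.0):
+    """KL(teacher || student) over answer tokens.
+
+    teacher_answer_logits: the frozen teacher's logits for the answer span,
+        with the new knowledge prepended to its prompt.
+    student_answer_logits: the LoRA-adapted student's logits for the same
+        answer span, seen without the knowledge in the prompt.
+    Both tensors have shape (num_answer_tokens, vocab_size).
+    """
+    t = F.log_softmax(teacher_answer_logits.detach() / temperature, dim=-1)
+    s = F.log_softmax(student_answer_logits / temperature, dim=-1)
+    return F.kl_div(s, t, log_target=True, reduction="batchmean") * temperature ** 2
+
+# Minimising this loss pushes the knowledge carried by the teacher's prompt
+# into the student's LoRA weights.
+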
+
+ comment: Preprint +
+
+
+
+
+ + ☆ IDOL: Instant Photorealistic 3D Human Creation from a Single Image + + +
+ Creating a high-fidelity, animatable 3D full-body avatar from a single image +is a challenging task due to the diverse appearance and poses of humans and the +limited availability of high-quality training data. To achieve fast and +high-quality human reconstruction, this work rethinks the task from the +perspectives of dataset, model, and representation. First, we introduce a +large-scale HUman-centric GEnerated dataset, HuGe100K, consisting of 100K +diverse, photorealistic sets of human images. Each set contains 24-view frames +in specific human poses, generated using a pose-controllable +image-to-multi-view model. Next, leveraging the diversity in views, poses, and +appearances within HuGe100K, we develop a scalable feed-forward transformer +model to predict a 3D human Gaussian representation in a uniform space from a +given human image. This model is trained to disentangle human pose, body shape, +clothing geometry, and texture. The estimated Gaussians can be animated without +post-processing. We conduct comprehensive experiments to validate the +effectiveness of the proposed dataset and method. Our model demonstrates the +ability to efficiently reconstruct photorealistic humans at 1K resolution from +a single input image using a single GPU instantly. Additionally, it seamlessly +supports various applications, as well as shape and texture editing tasks. + +
+
+ comment: 21 pages, 15 figures, includes main content, supplementary materials, + and references +
+
+
+
+
+ + ☆ Corn Ear Detection and Orientation Estimation Using Deep Learning + + +
+ Monitoring growth behavior of maize plants such as the development of ears +can give key insights into the plant's health and development. Traditionally, +the measurement of the angle of ears is performed manually, which can be +time-consuming and prone to human error. To address these challenges, this +paper presents a computer vision-based system for detecting and tracking ears +of corn in an image sequence. The proposed system could accurately detect, +track, and predict the ear's orientation, which can be useful in monitoring +their growth behavior. This can significantly save time compared to manual +measurement and enables additional areas of ear orientation research and +potential increase in efficiencies for maize production. Using an object +detector with keypoint detection, the algorithm proposed could detect 90 +percent of all ears. The cardinal estimation had a mean absolute error (MAE) of +18 degrees, compared to a mean 15 degree difference between two people +measuring by hand. These results demonstrate the feasibility of using computer +vision techniques for monitoring maize growth and can lead to further research +in this area. + +
+
+ comment: 22 pages;15 figures +
+
+
+
+
+ + ☆ From Point to probabilistic gradient boosting for claim frequency and + severity prediction + + +
+ Gradient boosting decision tree algorithms are increasingly used in
+actuarial applications as they show superior predictive performance over
+traditional generalized linear models. Many improvements and sophistications to
+the first gradient boosting machine algorithm exist. We present in a unified
+notation, and contrast, all the existing point and probabilistic gradient
+boosting decision tree algorithms: GBM, XGBoost, DART, LightGBM, CatBoost,
+EGBM, PGBM, XGBoostLSS, cyclic GBM, and NGBoost. In this comprehensive
+numerical study, we compare their performance on five publicly available
+datasets for claim frequency and severity, of various sizes and comprising
+different numbers of (high cardinality) categorical variables. We explain how
+varying exposure-to-risk can be handled with boosting in frequency models. We
+compare the algorithms on the basis of computational efficiency, predictive
+performance, and model adequacy. LightGBM and XGBoostLSS win in terms of
+computational efficiency. The fully interpretable EGBM achieves competitive
+predictive performance compared to the black box algorithms considered. We find
+that there is no trade-off between model adequacy and predictive accuracy: both
+are achievable simultaneously.
+
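+
+ One common way to handle varying exposure-to-risk in frequency models (shown
+here with LightGBM as an illustrative assumption, not necessarily the paper's
+exact setup) is to pass log(exposure) as an offset through init_score together
+with a Poisson objective:
+
+import numpy as np
+import lightgbm as lgb
+
+def fit_frequency_model(X, claim_counts, exposure):
+    """Poisson GBM for claim counts with exposure treated as an offset."""
+    train = lgb.Dataset(X, label=claim_counts,
+                        init_score=np.log(exposure))   # offset = log exposure
+    params = {"objective": "poisson", "learning_rate": 0.05, "num_leaves": 31}
+    return lgb.train(params, train, num_boost_round=500)
+
+def predict_claim_counts(booster, X, exposure):
+    # raw_score excludes the training offset, so exposure is re-applied here.
+    return np.exp(booster.predict(X, raw_score=True)) * exposure
+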
+
+ comment: 26 pages, 4 figures, 26 tables, 7 algorithms +
+
+
+
+
+ + ☆ Diffusion priors for Bayesian 3D reconstruction from incomplete + measurements + + +
+ Many inverse problems are ill-posed and need to be complemented by prior
+information that restricts the class of admissible models. Bayesian approaches
+encode this information as prior distributions that impose generic properties
+on the model such as sparsity, non-negativity or smoothness. However, in the
+case of complex structured models such as images, graphs or three-dimensional
+(3D) objects, generic prior distributions tend to favor models that differ
+largely from those observed in the real world. Here we explore the use of
+diffusion models as priors that are combined with experimental data within a
+Bayesian framework. We use 3D point clouds to represent 3D objects such as
+household items or biomolecular complexes formed from proteins and nucleic
+acids. We train diffusion models that generate coarse-grained 3D structures at
+a medium resolution and integrate these with incomplete and noisy experimental
+data. To demonstrate the power of our approach, we focus on the reconstruction
+of biomolecular assemblies from cryo-electron microscopy (cryo-EM) images,
+which is an important inverse problem in structural biology. We find that
+posterior sampling with diffusion model priors allows for 3D reconstruction
+from very sparse, low-resolution and partial observations.
+
+
+
+
+
+ + ☆ AI-Powered Intracranial Hemorrhage Detection: A Co-Scale Convolutional + Attention Model with Uncertainty-Based Fuzzy Integral Operator and Feature + Screening + + +
+ Intracranial hemorrhage (ICH) refers to the leakage or accumulation of blood
+within the skull, which occurs due to the rupture of blood vessels in or around
+the brain. If this condition is not diagnosed in a timely manner and
+appropriately treated, it can lead to serious complications such as decreased
+consciousness, permanent neurological disabilities, or even death. The primary
+aim of this study is to detect the occurrence or non-occurrence of ICH,
+followed by determining the type of subdural hemorrhage (SDH). These tasks are
+framed as two separate binary classification problems. By adding two layers to
+the co-scale convolutional attention (CCA) classifier architecture, we
+introduce a novel approach for ICH detection. In the first layer, after
+extracting features from different slices of computed tomography (CT) scan
+images, we combine these features and select the 50 components that capture the
+highest variance in the data, considering them as informative features. We then
+assess the discriminative power of these features using the bootstrap forest
+algorithm, discarding those that lack sufficient discriminative ability between
+different classes. This algorithm explicitly determines the contribution of
+each feature to the final prediction, assisting us in developing an explainable
+AI model. The features feed into a boosting neural network as a latent feature
+space. In the second layer, we introduce a novel uncertainty-based fuzzy
+integral operator to fuse information from different CT scan slices. This
+operator, by accounting for the dependencies between consecutive slices,
+significantly improves detection accuracy.
+
+
+
+
+
+ + ☆ Hierarchical Subspaces of Policies for Continual Offline Reinforcement + Learning + + +
+ In dynamic domains such as autonomous robotics and video game simulations, +agents must continuously adapt to new tasks while retaining previously acquired +skills. This ongoing process, known as Continual Reinforcement Learning, +presents significant challenges, including the risk of forgetting past +knowledge and the need for scalable solutions as the number of tasks increases. +To address these issues, we introduce HIerarchical LOW-rank Subspaces of +Policies (HILOW), a novel framework designed for continual learning in offline +navigation settings. HILOW leverages hierarchical policy subspaces to enable +flexible and efficient adaptation to new tasks while preserving existing +knowledge. We demonstrate, through a careful experimental study, the +effectiveness of our method in both classical MuJoCo maze environments and +complex video game-like simulations, showcasing competitive performance and +satisfying adaptability according to classical continual learning metrics, in +particular regarding memory usage. Our work provides a promising framework for +real-world applications where continuous learning from pre-collected data is +essential. + +
+
+
+
+
+ + ☆ Surrogate-assisted multi-objective design of complex multibody systems + + +
+ The optimization of large-scale multibody systems is a numerically +challenging task, in particular when considering multiple conflicting criteria +at the same time. In this situation, we need to approximate the Pareto set of +optimal compromises, which is significantly more expensive than finding a +single optimum in single-objective optimization. To prevent large costs, the +usage of surrogate models, constructed from a small but informative number of +expensive model evaluations, is a very popular and widely studied approach. The +central challenge then is to ensure a high quality (that is, near-optimality) +of the solutions that were obtained using the surrogate model, which can be +hard to guarantee with a single pre-computed surrogate. We present a +back-and-forth approach between surrogate modeling and multi-objective +optimization to improve the quality of the obtained solutions. Using the +example of an expensive-to-evaluate multibody system, we compare different +strategies regarding multi-objective optimization, sampling and also surrogate +modeling, to identify the most promising approach in terms of computational +efficiency and solution quality. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2412.01566 +
+
+
+
+
+ + ☆ Entropy Regularized Task Representation Learning for Offline + Meta-Reinforcement Learning AAAI 2025 + + +
+ Offline meta-reinforcement learning aims to equip agents with the ability to +rapidly adapt to new tasks by training on data from a set of different tasks. +Context-based approaches utilize a history of state-action-reward transitions +-- referred to as the context -- to infer representations of the current task, +and then condition the agent, i.e., the policy and value function, on the task +representations. Intuitively, the better the task representations capture the +underlying tasks, the better the agent can generalize to new tasks. +Unfortunately, context-based approaches suffer from distribution mismatch, as +the context in the offline data does not match the context at test time, +limiting their ability to generalize to the test tasks. This leads to the task +representations overfitting to the offline training data. Intuitively, the task +representations should be independent of the behavior policy used to collect +the offline data. To address this issue, we approximately minimize the mutual +information between the distribution over the task representations and behavior +policy by maximizing the entropy of behavior policy conditioned on the task +representations. We validate our approach in MuJoCo environments, showing that +compared to baselines, our task representations more faithfully represent the +underlying tasks, leading to outperforming prior methods in both +in-distribution and out-of-distribution tasks. + +
+
+ comment: 7 Pages, Accepted at AAAI 2025 +
+
+
+
+
+ + ☆ Answer Set Networks: Casting Answer Set Programming into Deep Learning + + +
+ Although Answer Set Programming (ASP) allows constraining neural-symbolic +(NeSy) systems, its employment is hindered by the prohibitive costs of +computing stable models and the CPU-bound nature of state-of-the-art solvers. +To this end, we propose Answer Set Networks (ASN), a NeSy solver. Based on +Graph Neural Networks (GNN), ASNs are a scalable approach to ASP-based Deep +Probabilistic Logic Programming (DPPL). Specifically, we show how to translate +ASPs into ASNs and demonstrate how ASNs can efficiently solve the encoded +problem by leveraging GPU's batching and parallelization capabilities. Our +experimental evaluations demonstrate that ASNs outperform state-of-the-art +CPU-bound NeSy systems on multiple tasks. Simultaneously, we make the following +two contributions based on the strengths of ASNs. Namely, we are the first to +show the finetuning of Large Language Models (LLM) with DPPLs, employing ASNs +to guide the training with logic. Further, we show the "constitutional +navigation" of drones, i.e., encoding public aviation laws in an ASN for +routing Unmanned Aerial Vehicles in uncertain environments. + +
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ☆ MARIA: a Multimodal Transformer Model for Incomplete Healthcare Data + + +
+ In healthcare, the integration of multimodal data is pivotal for developing +comprehensive diagnostic and predictive models. However, managing missing data +remains a significant challenge in real-world applications. We introduce MARIA +(Multimodal Attention Resilient to Incomplete datA), a novel transformer-based +deep learning model designed to address these challenges through an +intermediate fusion strategy. Unlike conventional approaches that depend on +imputation, MARIA utilizes a masked self-attention mechanism, which processes +only the available data without generating synthetic values. This approach +enables it to effectively handle incomplete datasets, enhancing robustness and +minimizing biases introduced by imputation methods. We evaluated MARIA against +10 state-of-the-art machine learning and deep learning models across 8 +diagnostic and prognostic tasks. The results demonstrate that MARIA outperforms +existing methods in terms of performance and resilience to varying levels of +data incompleteness, underscoring its potential for critical healthcare +applications. + +
+
+
+
+
+ + ☆ Stack Trace Deduplication: Faster, More Accurately, and in More + Realistic Scenarios + + +
+ In large-scale software systems, there are often no fully-fledged bug reports
+with human-written descriptions when an error occurs. In this case, developers
+rely on stack traces, i.e., series of function calls that led to the error.
+Since there can be tens or hundreds of thousands of them describing the same
+issue from different users, automatic deduplication into categories is
+necessary to allow for processing. Recent works have proposed powerful deep
+learning-based approaches for this, but they are evaluated and compared in
+isolation from real-life workflows, and it is not clear whether they will
+actually work well at scale.
+ To overcome this gap, this work presents three main contributions: a novel
+model, an industry-based dataset, and a multi-faceted evaluation. Our model
+consists of two parts - (1) an embedding model with byte-pair encoding and
+approximate nearest neighbor search to quickly find the most relevant stack
+traces to the incoming one, and (2) a reranker that re-ranks the most fitting
+stack traces, taking into account the repeated frames between them. To
+complement the existing datasets collected from open-source projects, we share
+with the community SlowOps - a dataset of stack traces from IntelliJ-based
+products developed by JetBrains, which has an order of magnitude more stack
+traces per category. Finally, we carry out an evaluation that strives to be
+realistic: measuring not only the accuracy of categorization, but also the
+operation time and the ability to create new categories. The evaluation shows
+that our model strikes a good balance - it outperforms other models on both
+open-source datasets and SlowOps, while also being faster than most. We
+release all of our code and data, and hope that our work can pave the way to
+further practice-oriented research in the area.
+
+
+ comment: Published at SANER'25. 11 pages, 2 figures +
+
+
+
+
+ + ☆ Extending TWIG: Zero-Shot Predictive Hyperparameter Selection for KGEs + based on Graph Structure + + +
+ Knowledge Graphs (KGs) have seen increasing use across various domains -- +from biomedicine and linguistics to general knowledge modelling. In order to +facilitate the analysis of knowledge graphs, Knowledge Graph Embeddings (KGEs) +have been developed to automatically analyse KGs and predict new facts based on +the information in a KG, a task called "link prediction". Many existing studies +have documented that the structure of a KG, KGE model components, and KGE +hyperparameters can significantly change how well KGEs perform and what +relationships they are able to learn. Recently, the Topologically-Weighted +Intelligence Generation (TWIG) model has been proposed as a solution to +modelling how each of these elements relate. In this work, we extend the +previous research on TWIG and evaluate its ability to simulate the output of +the KGE model ComplEx in the cross-KG setting. Our results are twofold. First, +TWIG is able to summarise KGE performance on a wide range of hyperparameter +settings and KGs being learned, suggesting that it represents a general +knowledge of how to predict KGE performance from KG structure. Second, we show +that TWIG can successfully predict hyperparameter performance on unseen KGs in +the zero-shot setting. This second observation leads us to propose that, with +additional research, optimal hyperparameter selection for KGE models could be +determined in a pre-hoc manner using TWIG-like methods, rather than by using a +full hyperparameter search. + +
+
+
+
+
+ + ☆ Agent-Temporal Credit Assignment for Optimal Policy Preservation in + Sparse Multi-Agent Reinforcement Learning + + +
+ In multi-agent environments, agents often struggle to learn optimal policies +due to sparse or delayed global rewards, particularly in long-horizon tasks +where it is challenging to evaluate actions at intermediate time steps. We +introduce Temporal-Agent Reward Redistribution (TAR$^2$), a novel approach +designed to address the agent-temporal credit assignment problem by +redistributing sparse rewards both temporally and across agents. TAR$^2$ +decomposes sparse global rewards into time-step-specific rewards and calculates +agent-specific contributions to these rewards. We theoretically prove that +TAR$^2$ is equivalent to potential-based reward shaping, ensuring that the +optimal policy remains unchanged. Empirical results demonstrate that TAR$^2$ +stabilizes and accelerates the learning process. Additionally, we show that +when TAR$^2$ is integrated with single-agent reinforcement learning algorithms, +it performs as well as or better than traditional multi-agent reinforcement +learning methods. + +
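+
+ A toy sketch of the redistribution idea (the network, inputs, and the
+normalisation below are illustrative assumptions, not the authors'
+implementation): predict a weight per (time step, agent) and split the sparse
+episodic return so that the redistributed rewards sum back to it.
+
+import torch
+import torch.nn as nn
+
+class RewardRedistributor(nn.Module):
+    """Splits an episodic return into per-(time step, agent) rewards."""
+
+    def __init__(self, obs_dim, hidden=64):
+        super().__init__()
+        self.net = nn.Sequential(nn.Linear(obs_dim, hidden), nn.ReLU(),
+                                 nn.Linear(hidden, 1))
+
+    def forward(self, obs, episode_return):
+        # obs: (T, N, obs_dim) for T time steps and N agents.
+        logits = self.net(obs).squeeze(-1)                       # (T, N)
+        weights = torch.softmax(logits.flatten(), dim=0).view_as(logits)
+        # Return-equivalent decomposition: redistributed rewards sum to the
+        # episodic return, the property the paper relates to potential-based
+        # reward shaping when arguing the optimal policy is preserved.
+        return weights * episode_return
+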
+
+ comment: 12 pages, 1 figure +
+
+
+
+
+ + ☆ ALKAFI-LLAMA3: Fine-Tuning LLMs for Precise Legal Understanding in + Palestine + + +
+ Large Language Models (LLMs) have demonstrated remarkable potential in +diverse domains, yet their application in the legal sector, particularly in +low-resource contexts, remains limited. This study addresses the challenges of +adapting LLMs to the Palestinian legal domain, where political instability, +fragmented legal frameworks, and limited AI resources hinder effective +machine-learning applications. We present a fine-tuned model based on a +quantized version of Llama-3.2-1B-Instruct, trained on a synthetic data set +derived from Palestinian legal texts. Using smaller-scale models and +strategically generated question-answer pairs, we achieve a cost-effective, +locally sustainable solution that provides accurate and contextually relevant +legal guidance. Our experiments demonstrate promising performance on various +query types, ranging from yes/no questions and narrative explanations to +complex legal differentiations, while highlighting areas for improvement, such +as handling calculation-based inquiries and structured list formatting. This +work provides a pathway for the deployment of AI-driven legal assistance tools +tailored to the needs of resource-constrained environments. + +
+
+
+
+
+ + ☆ Opportunities and limitations of explaining quantum machine learning + + +
+ A common trait of many machine learning models is that it is often difficult
+to understand and explain what caused the model to produce the given output.
+While the explainability of neural networks has been an active field of
+research in recent years, comparatively little is known about quantum machine
+learning models. Despite a few recent works analyzing some specific aspects of
+explainability, as of now there is no clear big picture perspective as to what
+can be expected from quantum learning models in terms of explainability. In
+this work, we address this issue by identifying promising research avenues in
+this direction and outlining the expected future results. We additionally
+propose two explanation methods designed specifically for quantum machine
+learning models, as the first of their kind to the best of our knowledge. Next
+to our preview of the field, we compare both existing and novel methods to
+explain the predictions of quantum learning models. By studying explainability
+in quantum machine learning, we can contribute to the sustainable development
+of the field, preventing trust issues in the future.
+
+
+ comment: 16+16 pages, 3+4 figures +
+
+
+
+
+ + ☆ Deep Learning Based Recalibration of SDSS and DESI BAO Alleviates Hubble + and Clustering Tensions + + +
+ Conventional calibration of Baryon Acoustic Oscillations (BAO) data relies on +estimation of the sound horizon at drag epoch $r_d$ from early universe +observations by assuming a cosmological model. We present a recalibration of +two independent BAO datasets, SDSS and DESI, by employing deep learning +techniques for model-independent estimation of $r_d$, and explore the impacts +on $\Lambda$CDM cosmological parameters. Significant reductions in both Hubble +($H_0$) and clustering ($S_8$) tensions are observed for both the recalibrated +datasets. Moderate shifts in some other parameters hint towards further +exploration of such data-driven approaches. + +
+
+ comment: 5 pages, 2 figures, 2 tables. Comments are welcome +
+
+
+
+
+ + ☆ A parametric algorithm is optimal for non-parametric regression of + smooth functions + + +
+ We address the regression problem for a general function $f:[-1,1]^d\to +\mathbb R$ when the learner selects the training points $\{x_i\}_{i=1}^n$ to +achieve a uniform error bound across the entire domain. In this setting, known +historically as nonparametric regression, we aim to establish a sample +complexity bound that depends solely on the function's degree of smoothness. +Assuming periodicity at the domain boundaries, we introduce PADUA, an algorithm +that, with high probability, provides performance guarantees optimal up to +constant or logarithmic factors across all problem parameters. Notably, PADUA +is the first parametric algorithm with optimal sample complexity for this +setting. Due to this feature, we prove that, differently from the +non-parametric state of the art, PADUA enjoys optimal space complexity in the +prediction phase. To validate these results, we perform numerical experiments +over functions coming from real audio data, where PADUA shows comparable +performance to state-of-the-art methods, while requiring only a fraction of the +computational time. + +
+
+
+
+
+ + ☆ Active Inference and Human--Computer Interaction + + +
+ Active Inference is a closed-loop computational theoretical basis for +understanding behaviour, based on agents with internal probabilistic generative +models that encode their beliefs about how hidden states in their environment +cause their sensations. We review Active Inference and how it could be applied +to model the human-computer interaction loop. Active Inference provides a +coherent framework for managing generative models of humans, their +environments, sensors and interface components. It informs off-line design and +supports real-time, online adaptation. It provides model-based explanations for +behaviours observed in HCI, and new tools to measure important concepts such as +agency and engagement. We discuss how Active Inference offers a new basis for a +theory of interaction in HCI, tools for design of modern, complex sensor-based +systems, and integration of artificial intelligence technologies, enabling it +to cope with diversity in human users and contexts. We discuss the practical +challenges in implementing such Active Inference-based systems. + +
+
+
+
+
+ + ☆ On the Use of Deep Learning Models for Semantic Clone Detection + + +
+ Detecting and tracking code clones can ease various software development and +maintenance tasks when changes in a code fragment should be propagated over all +its copies. Several deep learning-based clone detection models have appeared in +the literature for detecting syntactic and semantic clones, widely evaluated +with the BigCloneBench dataset. However, class imbalance and the small number +of semantic clones make BigCloneBench less ideal for interpreting model +performance. Researchers also use other datasets such as GoogleCodeJam, +OJClone, and SemanticCloneBench to understand model generalizability. To +overcome the limitations of existing datasets, the GPT-assisted semantic and +cross-language clone dataset GPTCloneBench has been released. However, how +these models compare across datasets remains unclear. In this paper, we propose +a multi-step evaluation approach for five state-of-the-art clone detection +models leveraging existing benchmark datasets, including GPTCloneBench, and +using mutation operators to study model ability. Specifically, we examine three +highly-performing single-language models (ASTNN, GMN, CodeBERT) on +BigCloneBench, SemanticCloneBench, and GPTCloneBench, testing their robustness +with mutation operations. Additionally, we compare them against cross-language +models (C4, CLCDSA) known for detecting semantic clones. While single-language +models show high F1 scores for BigCloneBench, their performance on +SemanticCloneBench varies (up to 20%). Interestingly, the cross-language model +(C4) shows superior performance (around 7%) on SemanticCloneBench over other +models and performs similarly on BigCloneBench and GPTCloneBench. On +mutation-based datasets, C4 has more robust performance (less than 1% +difference) compared to single-language models, which show high variability. + +
+
+ comment: Accepted at the 40th IEEE International Conference on Software + Maintenance and Evolution (ICSME 2024) +
+
+
+
+
+ + ☆ Boosting GNN Performance via Training Sample Selection Based on + Adversarial Robustness Evaluation + + +
+ Graph Neural Networks (GNNs) have established themselves as one of the most +powerful neural network architectures, excelling in leveraging graph topology +and node features for various tasks. However, GNNs are inherently vulnerable to +noise in their inputs. Such noise can significantly degrade their performance. +To address this challenge, we propose a novel approach that employs adversarial +robustness evaluation techniques to identify nodes in the graph that are most +susceptible to noise. By selecting and constructing a training set composed of +these particularly noise-prone nodes, we then use them to train a Graph +Convolutional Network (GCN). Our experimental results demonstrate that this +strategy leads to substantial improvements in the GCN's performance. + +
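+
+ A simple proxy for the described selection step (the noise model, the
+model(x, edge_index) signature, and all names are assumptions, not the
+authors' robustness evaluation): perturb node features and keep the nodes
+whose predictions move the most.
+
+import torch
+import torch.nn.functional as F
+
+@torch.no_grad()
+def noise_sensitive_nodes(model, x, edge_index, k, sigma=0.05, trials=8):
+    """Indices of the k nodes whose predictions shift most under feature noise."""
+    base = F.softmax(model(x, edge_index), dim=-1)           # (num_nodes, classes)
+    drift = torch.zeros(x.size(0), device=x.device)
+    for _ in range(trials):
+        noisy = F.softmax(model(x + sigma * torch.randn_like(x), edge_index), dim=-1)
+        drift += (noisy - base).abs().sum(dim=-1)             # L1 prediction shift
+    return torch.topk(drift / trials, k).indices
+
+# train_idx = noise_sensitive_nodes(gcn, data.x, data.edge_index, k=140)
+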
+
+
+
+
+ + ☆ Generative AI for Banks: Benchmarks and Algorithms for Synthetic + Financial Transaction Data + + +
+ The banking sector faces challenges in using deep learning due to data +sensitivity and regulatory constraints, but generative AI may offer a solution. +Thus, this study identifies effective algorithms for generating synthetic +financial transaction data and evaluates five leading models - Conditional +Tabular Generative Adversarial Networks (CTGAN), DoppelGANger (DGAN), +Wasserstein GAN, Financial Diffusion (FinDiff), and Tabular Variational +AutoEncoders (TVAE) - across five criteria: fidelity, synthesis quality, +efficiency, privacy, and graph structure. While none of the algorithms is able +to replicate the real data's graph structure, each excels in specific areas: +DGAN is ideal for privacy-sensitive tasks, FinDiff and TVAE excel in data +replication and augmentation, and CTGAN achieves a balance across all five +criteria, making it suitable for general applications with moderate privacy +concerns. As a result, our findings offer valuable insights for choosing the +most suitable algorithm. + +
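+
+ For readers who want to try one of the compared generators, the open-source
+ctgan package exposes the CTGAN model evaluated here; a minimal usage sketch
+with a hypothetical transaction table (column names are placeholders):
+
+import pandas as pd
+from ctgan import CTGAN
+
+real = pd.read_csv("transactions.csv")                        # hypothetical file
+discrete_cols = ["merchant_category", "currency", "channel"]  # placeholder names
+
+model = CTGAN(epochs=300)
+model.fit(real, discrete_cols)
+
+synthetic = model.sample(100_000)
+synthetic.to_csv("synthetic_transactions.csv", index=False)
+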
+
+ comment: Presented at the 34th Workshop on Information Technologies and + Systems (WITS 2024) +
+
+
+
+
+ + ☆ FROC: Building Fair ROC from a Trained Classifier AAAI + + +
+ This paper considers the problem of fair probabilistic binary classification +with binary protected groups. The classifier assigns scores, and a practitioner +predicts labels using a certain cut-off threshold based on the desired +trade-off between false positives vs. false negatives. It derives these +thresholds from the ROC of the classifier. The resultant classifier may be +unfair to one of the two protected groups in the dataset. It is desirable that +no matter what threshold the practitioner uses, the classifier should be fair +to both the protected groups; that is, the $\mathcal{L}_p$ norm between FPRs +and TPRs of both the protected groups should be at most $\varepsilon$. We call +such fairness on ROCs of both the protected attributes +$\varepsilon_p$-Equalized ROC. Given a classifier not satisfying +$\varepsilon_1$-Equalized ROC, we aim to design a post-processing method to +transform the given (potentially unfair) classifier's output (score) to a +suitable randomized yet fair classifier. That is, the resultant classifier must +satisfy $\varepsilon_1$-Equalized ROC. First, we introduce a threshold query +model on the ROC curves for each protected group. The resulting classifier is +bound to face a reduction in AUC. With the proposed query model, we provide a +rigorous theoretical analysis of the minimal AUC loss to achieve +$\varepsilon_1$-Equalized ROC. To achieve this, we design a linear time +algorithm, namely \texttt{FROC}, to transform a given classifier's output to a +probabilistic classifier that satisfies $\varepsilon_1$-Equalized ROC. We prove +that under certain theoretical conditions, \texttt{FROC}\ achieves the +theoretical optimal guarantees. We also study the performance of our +\texttt{FROC}\ on multiple real-world datasets with many trained classifiers. + +
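+
+ A small sketch, not the FROC algorithm itself, of how the epsilon-Equalized
+ROC condition can be checked for a trained scorer under an L1 reading of the
+norm: evaluate each protected group's (FPR, TPR) on a shared threshold grid
+and take the largest gap.
+
+import numpy as np
+
+def roc_at_thresholds(scores, labels, thresholds):
+    """FPR and TPR of the rule `score >= t` for each threshold t."""
+    pos, neg = labels == 1, labels == 0
+    tpr = np.array([(scores[pos] >= t).mean() for t in thresholds])
+    fpr = np.array([(scores[neg] >= t).mean() for t in thresholds])
+    return fpr, tpr
+
+def equalized_roc_gap(scores, labels, group, thresholds=np.linspace(0, 1, 101)):
+    fpr_a, tpr_a = roc_at_thresholds(scores[group == 0], labels[group == 0], thresholds)
+    fpr_b, tpr_b = roc_at_thresholds(scores[group == 1], labels[group == 1], thresholds)
+    return float(np.max(np.abs(fpr_a - fpr_b) + np.abs(tpr_a - tpr_b)))
+
+# In this reading, the classifier satisfies the condition when the gap <= epsilon.
+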
+
+ comment: 51 pages, The 39th Annual AAAI Conference on Artificial Intelligence +
+
+
+
+
+ + ☆ Prototypical Calibrating Ambiguous Samples for Micro-Action Recognition AAAI 2025 + + +
+ Micro-Action Recognition (MAR) has gained increasing attention due to its +crucial role as a form of non-verbal communication in social interactions, with +promising potential for applications in human communication and emotion +analysis. However, current approaches often overlook the inherent ambiguity in +micro-actions, which arises from the wide category range and subtle visual +differences between categories. This oversight hampers the accuracy of +micro-action recognition. In this paper, we propose a novel Prototypical +Calibrating Ambiguous Network (\textbf{PCAN}) to unleash and mitigate the +ambiguity of MAR. \textbf{Firstly}, we employ a hierarchical action-tree to +identify the ambiguous sample, categorizing them into distinct sets of +ambiguous samples of false negatives and false positives, considering both +body- and action-level categories. \textbf{Secondly}, we implement an ambiguous +contrastive refinement module to calibrate these ambiguous samples by +regulating the distance between ambiguous samples and their corresponding +prototypes. This calibration process aims to pull false negative +($\mathbb{FN}$) samples closer to their respective prototypes and push false +positive ($\mathbb{FP}$) samples apart from their affiliated prototypes. In +addition, we propose a new prototypical diversity amplification loss to +strengthen the model's capacity by amplifying the differences between different +prototypes. \textbf{Finally}, we propose a prototype-guided rectification to +rectify prediction by incorporating the representability of prototypes. +Extensive experiments conducted on the benchmark dataset demonstrate the +superior performance of our method compared to existing approaches. The code is +available at https://github.com/kunli-cs/PCAN. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ A Comprehensive Forecasting Framework based on Multi-Stage Hierarchical + Forecasting Reconciliation and Adjustment + + +
+ Ads demand forecasting for Walmart's ad products plays a critical role in +enabling effective resource planning, allocation, and management of ads +performance. In this paper, we introduce a comprehensive demand forecasting +system that tackles hierarchical time series forecasting in business settings. +Though traditional hierarchical reconciliation methods ensure forecasting +coherence, they often trade off accuracy for coherence especially at lower +levels and fail to capture the seasonality unique to each time-series in the +hierarchy. Thus, we propose a novel framework "Multi-Stage Hierarchical +Forecasting Reconciliation and Adjustment (Multi-Stage HiFoReAd)" to address +the challenges of preserving seasonality, ensuring coherence, and improving +accuracy. Our system first utilizes diverse models, ensembled through Bayesian +Optimization (BO), achieving base forecasts. The generated base forecasts are +then passed into the Multi-Stage HiFoReAd framework. The initial stage refines +the hierarchy using Top-Down forecasts and "harmonic alignment." The second +stage aligns the higher levels' forecasts using MinTrace algorithm, following +which the last two levels undergo "harmonic alignment" and "stratified +scaling", to eventually achieve accurate and coherent forecasts across the +whole hierarchy. Our experiments on Walmart's internal Ads-demand dataset and 3 +other public datasets, each with 4 hierarchical levels, demonstrate that the +average Absolute Percentage Error from the cross-validation sets improve from +3% to 40% across levels against BO-ensemble of models (LGBM, MSTL+ETS, Prophet) +as well as from 1.2% to 92.9% against State-Of-The-Art models. In addition, the +forecasts at all hierarchical levels are proved to be coherent. The proposed +framework has been deployed and leveraged by Walmart's ads, sales and +operations teams to track future demands, make informed decisions and plan +resources. + +
+
+ comment: Published in 2024 IEEE International Conference on Big Data (BigData) +
+
+
+
+
+ + ☆ Computing Gram Matrix for SMILES Strings using RDKFingerprint and + Sinkhorn-Knopp Algorithm + + +
+ In molecular structure data, SMILES (Simplified Molecular Input Line Entry +System) strings are used to analyze molecular structure design. Numerical +feature representation of SMILES strings is a challenging task. This work +proposes a kernel-based approach for encoding and analyzing molecular +structures from SMILES strings. The proposed approach involves computing a +kernel matrix using the Sinkhorn-Knopp algorithm while using kernel principal +component analysis (PCA) for dimensionality reduction. The resulting +low-dimensional embeddings are then used for classification and regression +analysis. The kernel matrix is computed by converting the SMILES strings into +molecular structures using the Morgan Fingerprint, which computes a fingerprint +for each molecule. The distance matrix is computed using the pairwise kernels +function. The Sinkhorn-Knopp algorithm is used to compute the final kernel +matrix that satisfies the constraints of a probability distribution. This is +achieved by iteratively adjusting the kernel matrix until the marginal +distributions of the rows and columns match the desired marginal distributions. +We provided a comprehensive empirical analysis of the proposed kernel method to +evaluate its goodness with greater depth. The suggested method is assessed for +drug subcategory prediction (classification task) and solubility AlogPS +``Aqueous solubility and Octanol/Water partition coefficient" (regression task) +using the benchmark SMILES string dataset. The outcomes show the proposed +method outperforms several baseline methods in terms of supervised analysis and +has potential uses in molecular design and drug discovery. Overall, the +suggested method is a promising avenue for kernel methods-based molecular +structure analysis and design. + +
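+
+ A condensed sketch of the described pipeline under assumed settings (Morgan
+radius 2, 2048 bits, an RBF pairwise kernel, a fixed number of Sinkhorn-Knopp
+sweeps); the paper's exact choices may differ.
+
+import numpy as np
+from rdkit import Chem
+from rdkit.Chem import AllChem
+from sklearn.metrics.pairwise import rbf_kernel
+from sklearn.decomposition import KernelPCA
+
+def morgan_matrix(smiles, radius=2, n_bits=2048):
+    """Stack Morgan fingerprints of SMILES strings into a feature matrix."""
+    fps = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(s),
+                                                 radius, nBits=n_bits)
+           for s in smiles]
+    return np.array([list(fp) for fp in fps], dtype=float)
+
+def sinkhorn_knopp(K, n_iter=50, eps=1e-9):
+    """Alternate row/column normalisation towards a doubly stochastic matrix."""
+    K = K.copy()
+    for _ in range(n_iter):
+        K /= K.sum(axis=1, keepdims=True) + eps
+        K /= K.sum(axis=0, keepdims=True) + eps
+    return K
+
+X = morgan_matrix(["CCO", "c1ccccc1", "CC(=O)O"])            # toy SMILES strings
+K = sinkhorn_knopp(rbf_kernel(X, gamma=0.1))
+K = 0.5 * (K + K.T)                                          # symmetrise for kernel PCA
+embedding = KernelPCA(n_components=2, kernel="precomputed").fit_transform(K)
+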
+
+
+
+
+ + ☆ Holistic Adversarially Robust Pruning ICLR 2023 + + +
+ Neural networks can be drastically shrunk in size by removing redundant
+parameters. While crucial for the deployment on resource-constrained hardware,
+oftentimes, compression comes with a severe drop in accuracy and lack of
+adversarial robustness. Despite recent advances, counteracting both aspects has
+only succeeded for moderate compression rates so far. We propose a novel
+method, HARP, that copes with aggressive pruning significantly better than
+prior work. For this, we consider the network holistically. We learn a global
+compression strategy that optimizes how many parameters (compression rate) and
+which parameters (scoring connections) to prune specific to each layer
+individually. Our method fine-tunes an existing model with dynamic
+regularization that follows a step-wise incremental function balancing the
+different objectives. It starts by favoring robustness before shifting focus to
+reaching the target compression rate and only then handles the objectives
+equally. The learned compression strategies allow us to maintain the
+pre-trained model's natural accuracy and its adversarial robustness for a
+reduction of the network's original size by 99%. Moreover, we observe a crucial
+influence of non-uniform compression across layers.
+
+
+ comment: Accepted by ICLR 2023 +
+
+
+
+
+ + ☆ ReMoE: Fully Differentiable Mixture-of-Experts with ReLU Routing + + +
+ Sparsely activated Mixture-of-Experts (MoE) models are widely adopted to +scale up model capacity without increasing the computation budget. However, +vanilla TopK routers are trained in a discontinuous, non-differentiable way, +limiting their performance and scalability. To address this issue, we propose +ReMoE, a fully differentiable MoE architecture that offers a simple yet +effective drop-in replacement for the conventional TopK+Softmax routing, +utilizing ReLU as the router instead. We further propose methods to regulate +the router's sparsity while balancing the load among experts. ReMoE's +continuous nature enables efficient dynamic allocation of computation across +tokens and layers, while also exhibiting domain specialization. Our experiments +demonstrate that ReMoE consistently outperforms vanilla TopK-routed MoE across +various model sizes, expert counts, and levels of granularity. Furthermore, +ReMoE exhibits superior scalability with respect to the number of experts, +surpassing traditional MoE architectures. The implementation based on +Megatron-LM is available at https://github.com/thu-ml/ReMoE. + +
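+
+ A toy sketch of the routing idea only (ReLU gates in place of TopK+Softmax,
+with an L1 term to encourage sparsity); the dense per-expert loop, the load
+balancing, and all hyperparameters are simplifications assumed here, not the
+paper's Megatron-LM implementation.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class ReLURoutedMoE(nn.Module):
+    def __init__(self, d_model, d_ff, n_experts):
+        super().__init__()
+        self.router = nn.Linear(d_model, n_experts)
+        self.experts = nn.ModuleList(
+            nn.Sequential(nn.Linear(d_model, d_ff), nn.GELU(),
+                          nn.Linear(d_ff, d_model))
+            for _ in range(n_experts))
+
+    def forward(self, x):
+        # x: (tokens, d_model). ReLU gates are exactly zero for inactive
+        # experts, so routing stays sparse yet fully differentiable.
+        gates = F.relu(self.router(x))                   # (tokens, n_experts)
+        out = torch.zeros_like(x)
+        for e, expert in enumerate(self.experts):
+            out = out + gates[:, e:e + 1] * expert(x)    # dense compute for clarity
+        sparsity_loss = gates.abs().mean()               # auxiliary sparsity term
+        return out, sparsity_loss
+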
+
+
+
+
+ + ☆ Taming the Memory Beast: Strategies for Reliable ML Training on + Kubernetes + + +
+ Kubernetes offers a powerful orchestration platform for machine learning +training, but memory management can be challenging due to specialized needs and +resource constraints. This paper outlines how Kubernetes handles memory +requests, limits, Quality of Service classes, and eviction policies for ML +workloads, with special focus on GPU memory and ephemeral storage. Common +pitfalls such as overcommitment, memory leaks, and ephemeral volume exhaustion +are examined. We then provide best practices for stable, scalable memory +utilization to help ML practitioners prevent out-of-memory events and ensure +high-performance ML training pipelines. + +
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ Lorentzian Residual Neural Networks KDD 2025 + + +
+ Hyperbolic neural networks have emerged as a powerful tool for modeling
+hierarchical data structures prevalent in real-world datasets. Notably,
+residual connections, which facilitate the direct flow of information across
+layers, have been instrumental in the success of deep neural networks. However,
+current methods for constructing hyperbolic residual networks suffer from
+limitations such as increased model complexity, numerical instability, and
+errors due to multiple mappings to and from the tangent space. To address these
+limitations, we introduce LResNet, a novel Lorentzian residual neural network
+based on the weighted Lorentzian centroid in the Lorentz model of hyperbolic
+geometry. Our method enables the efficient integration of residual connections
+in Lorentz hyperbolic neural networks while preserving their hierarchical
+representation capabilities. We demonstrate that our method can theoretically
+derive previous methods while offering improved stability, efficiency, and
+effectiveness. Extensive experiments on both graph and vision tasks showcase
+the superior performance and robustness of our method compared to
+state-of-the-art Euclidean and hyperbolic alternatives. Our findings highlight
+the potential of LResNet for building more expressive neural networks in
+hyperbolic embedding space as a generally applicable method to multiple
+architectures, including CNNs, GNNs, and graph Transformers.
+
+
+ comment: 12 pages, 3 figures, KDD 2025 +
+
+
+
+
+ + ☆ How to Synthesize Text Data without Model Collapse? + + +
+ Model collapse in synthetic data indicates that iterative training on +self-generated data leads to a gradual decline in performance. With the +proliferation of AI models, synthetic data will fundamentally reshape the web +data ecosystem. Future GPT-$\{n\}$ models will inevitably be trained on a blend +of synthetic and human-produced data. In this paper, we focus on two questions: +what is the impact of synthetic data on language model training, and how to +synthesize data without model collapse? We first pre-train language models +across different proportions of synthetic data, revealing a negative +correlation between the proportion of synthetic data and model performance. We +further conduct statistical analysis on synthetic data to uncover +distributional shift phenomenon and over-concentration of n-gram features. +Inspired by the above findings, we propose token editing on human-produced data +to obtain semi-synthetic data. As a proof of concept, we theoretically +demonstrate that token-level editing can prevent model collapse, as the test +error is constrained by a finite upper bound. We conduct extensive experiments +on pre-training from scratch, continual pre-training, and supervised +fine-tuning. The results validate our theoretical proof that token-level +editing improves data quality and enhances model performance. + +
+
+
+
+
+ + ☆ LoLaFL: Low-Latency Federated Learning via Forward-only Propagation + + +
+ Federated learning (FL) has emerged as a widely adopted paradigm for enabling edge learning with distributed data while ensuring data privacy. However, traditional FL with deep neural networks trained via backpropagation can hardly meet the low-latency learning requirements of sixth generation (6G) mobile networks. This challenge mainly arises from the high-dimensional model parameters to be transmitted and the numerous rounds of communication required for convergence due to the inherent randomness of the training process. To address this issue, we adopt the state-of-the-art principle of maximal coding rate reduction to learn linear discriminative features and extend the resultant white-box neural network into FL, yielding the novel framework of Low-Latency Federated Learning (LoLaFL) via forward-only propagation. LoLaFL enables layer-wise transmission and aggregation with significantly fewer communication rounds, thereby considerably reducing latency. Additionally, we propose two nonlinear aggregation schemes for LoLaFL. The first scheme is based on the proof that the optimal NN parameter aggregation in LoLaFL should be harmonic-mean-like. The second scheme further exploits the low-rank structures of the features and transmits the low-rank-approximated covariance matrices of features to achieve additional latency reduction. Theoretical analysis and experiments are conducted to evaluate the performance of LoLaFL. In comparison with traditional FL, the two nonlinear aggregation schemes for LoLaFL can achieve reductions in latency of over 91\% and 98\%, respectively, while maintaining comparable accuracies.
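+ To illustrate what a harmonic-mean-like aggregation can look like, the sketch below combines per-client layer parameters with a sample-weighted elementwise harmonic mean; this is a simplified stand-in (the paper derives its rule for the white-box, forward-only features), and the shapes, the positivity of the entries, and the client sizes are assumptions.

```python
import numpy as np

def harmonic_mean_aggregate(client_weights, client_sizes, eps=1e-8):
    """Sample-weighted, elementwise harmonic-mean-like aggregation.

    client_weights: list of arrays with identical shape (one per client);
                    the harmonic mean is only well defined for positive entries.
    client_sizes:   number of local samples per client.
    """
    sizes = np.asarray(client_sizes, dtype=float)
    stacked = np.stack(client_weights)                       # (K, ...)
    # Harmonic mean: total weight divided by the weighted sum of reciprocals.
    recip = sizes.reshape(-1, *([1] * (stacked.ndim - 1))) / (stacked + eps)
    return sizes.sum() / recip.sum(axis=0)

rng = np.random.default_rng(0)
clients = [rng.uniform(0.5, 1.5, size=(4, 4)) for _ in range(3)]
agg = harmonic_mean_aggregate(clients, client_sizes=[100, 200, 50])
print(agg.shape)   # (4, 4)
```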
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ Unveiling Uncertainty: A Deep Dive into Calibration and Performance of + Multimodal Large Language Models COLING 2025 + + +
+ Multimodal large language models (MLLMs) combine visual and textual data for +tasks such as image captioning and visual question answering. Proper +uncertainty calibration is crucial, yet challenging, for reliable use in areas +like healthcare and autonomous driving. This paper investigates representative +MLLMs, focusing on their calibration across various scenarios, including before +and after visual fine-tuning, as well as before and after multimodal training +of the base LLMs. We observed miscalibration in their performance, and at the +same time, no significant differences in calibration across these scenarios. We +also highlight how uncertainty differs between text and images and how their +integration affects overall uncertainty. To better understand MLLMs' +miscalibration and their ability to self-assess uncertainty, we construct the +IDK (I don't know) dataset, which is key to evaluating how they handle +unknowns. Our findings reveal that MLLMs tend to give answers rather than admit +uncertainty, but this self-assessment improves with proper prompt adjustments. +Finally, to calibrate MLLMs and enhance model reliability, we propose +techniques such as temperature scaling and iterative prompt optimization. Our +results provide insights into improving MLLMs for effective and responsible +deployment in multimodal applications. Code and IDK dataset: +\href{https://github.com/hfutml/Calibration-MLLM}{https://github.com/hfutml/Calibration-MLLM}. + +
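+ Temperature scaling, one of the calibration techniques mentioned above, can be sketched in a few lines: a single temperature T is fitted on held-out logits to minimize negative log-likelihood, and probabilities are then computed from logits / T. The grid search and the synthetic logits below are illustrative assumptions.

```python
import numpy as np

def softmax(z):
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def nll(logits, labels, T):
    # Negative log-likelihood of the labels under temperature-scaled logits.
    p = softmax(logits / T)
    return -np.log(p[np.arange(len(labels)), labels] + 1e-12).mean()

def fit_temperature(val_logits, val_labels, grid=np.linspace(0.5, 5.0, 91)):
    # Pick the temperature that minimizes validation NLL (a simple grid
    # search stands in for gradient-based optimization of T).
    return min(grid, key=lambda T: nll(val_logits, val_labels, T))

rng = np.random.default_rng(0)
logits = rng.normal(scale=4.0, size=(500, 10))   # deliberately over-confident
labels = rng.integers(0, 10, size=500)
T = fit_temperature(logits, labels)
calibrated_probs = softmax(logits / T)
print(f"chosen temperature T = {T:.2f}")
```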
+
+ comment: Accepted to COLING 2025 +
+
+
+
+
+ + ☆ Trainable Adaptive Activation Function Structure (TAAFS) Enhances Neural + Network Force Field Performance with Only Dozens of Additional Parameters + + +
+ At the heart of neural network force fields (NNFFs) is the architecture of +neural networks, where the capacity to model complex interactions is typically +enhanced through widening or deepening multilayer perceptrons (MLPs) or by +increasing layers of graph neural networks (GNNs). These enhancements, while +improving the model's performance, often come at the cost of a substantial +increase in the number of parameters. By applying the Trainable Adaptive +Activation Function Structure (TAAFS), we introduce a method that selects +distinct mathematical formulations for non-linear activations, thereby +increasing the precision of NNFFs with an insignificant addition to the +parameter count. In this study, we integrate TAAFS into a variety of neural +network models, resulting in observed accuracy improvements, and further +validate these enhancements through molecular dynamics (MD) simulations using +DeepMD. + +
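+ A minimal PyTorch sketch of a trainable activation that adds only a couple of parameters per layer is shown below; the particular parameterization (a scaled, sharpness-controlled SiLU) is an assumption for illustration and is not necessarily the formulation TAAFS selects.

```python
import torch
import torch.nn as nn

class TrainableActivation(nn.Module):
    """Activation f(x) = alpha * x * sigmoid(beta * x).

    Only two scalars (alpha, beta) are learned per instance, so swapping it
    into an MLP or GNN adds just dozens of parameters to the whole model.
    """
    def __init__(self, alpha: float = 1.0, beta: float = 1.0):
        super().__init__()
        self.alpha = nn.Parameter(torch.tensor(float(alpha)))
        self.beta = nn.Parameter(torch.tensor(float(beta)))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.alpha * x * torch.sigmoid(self.beta * x)

# Drop-in use inside a small descriptor MLP (layer sizes are illustrative).
mlp = nn.Sequential(
    nn.Linear(64, 128), TrainableActivation(),
    nn.Linear(128, 128), TrainableActivation(),
    nn.Linear(128, 1),
)
extra = sum(p.numel() for m in mlp if isinstance(m, TrainableActivation)
            for p in m.parameters())
print(f"extra trainable parameters: {extra}")   # 4
```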
+
+
+
+
+ + ☆ Permutation recovery of spikes in noisy high-dimensional tensor + estimation + + +
+ We study the dynamics of gradient flow in high dimensions for the +multi-spiked tensor problem, where the goal is to estimate $r$ unknown signal +vectors (spikes) from noisy Gaussian tensor observations. Specifically, we +analyze the maximum likelihood estimation procedure, which involves optimizing +a highly nonconvex random function. We determine the sample complexity required +for gradient flow to efficiently recover all spikes, without imposing any +assumptions on the separation of the signal-to-noise ratios (SNRs). More +precisely, our results provide the sample complexity required to guarantee +recovery of the spikes up to a permutation. Our work builds on our companion +paper [Ben Arous, Gerbelot, Piccolo 2024], which studies Langevin dynamics and +determines the sample complexity and separation conditions for the SNRs +necessary for ensuring exact recovery of the spikes (where the recovered +permutation matches the identity). During the recovery process, the +correlations between the estimators and the hidden vectors increase in a +sequential manner. The order in which these correlations become significant +depends on their initial values and the corresponding SNRs, which ultimately +determines the permutation of the recovered spikes. + +
+
+ comment: 29 pages, 2 figures. arXiv admin note: substantial text overlap with + arXiv:2408.06401 +
+
+
+
+
+ + ☆ Adaptive Prompt Tuning: Vision Guided Prompt Tuning with Cross-Attention + for Fine-Grained Few-Shot Learning + + +
+ Few-shot, fine-grained classification in computer vision poses significant +challenges due to the need to differentiate subtle class distinctions with +limited data. This paper presents a novel method that enhances the Contrastive +Language-Image Pre-Training (CLIP) model through adaptive prompt tuning, guided +by real-time visual inputs. Unlike existing techniques such as Context +Optimization (CoOp) and Visual Prompt Tuning (VPT), which are constrained by +static prompts or visual token reliance, the proposed approach leverages a +cross-attention mechanism to dynamically refine text prompts for the image at +hand. This enables an image-specific alignment of textual features with image +patches extracted from the Vision Transformer, making the model more effective +for datasets with high intra-class variance and low inter-class differences. +The method is evaluated on several datasets, including CUBirds, Oxford Flowers, +and FGVC Aircraft, showing significant performance gains over static prompt +tuning approaches. To ensure these performance gains translate into trustworthy +predictions, we integrate Monte-Carlo Dropout in our approach to improve the +reliability of the model predictions and uncertainty estimates. This +integration provides valuable insights into the model's predictive confidence, +helping to identify when predictions can be trusted and when additional +verification is necessary. This dynamic approach offers a robust solution, +advancing the state-of-the-art for few-shot fine-grained classification. + +
+
+
+
+
+ + ☆ Robust PCA Based on Adaptive Weighted Least Squares and Low-Rank Matrix + Factorization + + +
+ Robust Principal Component Analysis (RPCA) is a fundamental technique for decomposing data into low-rank and sparse components, which plays a critical role in applications such as image processing and anomaly detection. Traditional RPCA methods commonly use $\ell_1$ norm regularization to enforce sparsity, but this approach can introduce bias and result in suboptimal estimates, particularly in the presence of significant noise or outliers. Non-convex regularization methods have been proposed to mitigate these challenges, but they tend to be complex to optimize and sensitive to initial conditions, leading to potential instability in solutions. To overcome these challenges, in this paper, we propose a novel RPCA model that integrates adaptive weighted least squares (AWLS) and low-rank matrix factorization (LRMF). The model employs a self-attention-inspired mechanism in its weight update process, allowing the weight matrix to dynamically adjust and emphasize significant components during each iteration. By employing a weighted F-norm for the sparse component, our method effectively reduces bias while simplifying the computational process compared to traditional $\ell_1$-norm-based methods. We use an alternating minimization algorithm, where each subproblem has an explicit solution, thereby improving computational efficiency. Despite its simplicity, numerical experiments demonstrate that our method outperforms existing non-convex regularization approaches, offering superior performance and stability, as well as enhanced accuracy and robustness in practical applications.
+
+
+
+
+ + ☆ Qua$^2$SeDiMo: Quantifiable Quantization Sensitivity of Diffusion Models AAAI 2025 + + +
+ Diffusion Models (DM) have democratized AI image generation through an +iterative denoising process. Quantization is a major technique to alleviate the +inference cost and reduce the size of DM denoiser networks. However, as +denoisers evolve from variants of convolutional U-Nets toward newer Transformer +architectures, it is of growing importance to understand the quantization +sensitivity of different weight layers, operations and architecture types to +performance. In this work, we address this challenge with Qua$^2$SeDiMo, a +mixed-precision Post-Training Quantization framework that generates explainable +insights on the cost-effectiveness of various model weight quantization methods +for different denoiser operation types and block structures. We leverage these +insights to make high-quality mixed-precision quantization decisions for a +myriad of diffusion models ranging from foundational U-Nets to state-of-the-art +Transformers. As a result, Qua$^2$SeDiMo can construct 3.4-bit, 3.9-bit, +3.65-bit and 3.7-bit weight quantization on PixArt-${\alpha}$, +PixArt-${\Sigma}$, Hunyuan-DiT and SDXL, respectively. We further pair our +weight-quantization configurations with 6-bit activation quantization and +outperform existing approaches in terms of quantitative metrics and generative +image quality. + +
+
+ comment: AAAI 2025; version includes supplementary material; 22 Pages, 18 + Figures, 8 Tables +
+
+
+
+
+ + ☆ Continuous latent representations for modeling precipitation with deep + learning + + +
+ The sparse and spatio-temporally discontinuous nature of precipitation data presents significant challenges for simulation and statistical processing for bias correction and downscaling. These include incorrect representation of intermittency and extreme values (critical for hydrology applications), the Gibbs phenomenon upon regridding, and a lack of fine-scale detail. To address these challenges, a common approach is to transform the precipitation variable nonlinearly into one that is more malleable. In this work, we explore how deep learning can be used to generate a smooth, spatio-temporally continuous variable as a proxy for simulation of precipitation data. We develop a normally distributed field called pseudo-precipitation (PP) as an alternative for simulating precipitation. The practical applicability of this variable is investigated by applying it for downscaling precipitation from \(1\degree\) (\(\sim\) 100 km) to \(0.25\degree\) (\(\sim\) 25 km).
+
+
+
+
+ + ☆ Pitfalls of topology-aware image segmentation + + +
+ Topological correctness, i.e., the preservation of structural integrity and +specific characteristics of shape, is a fundamental requirement for medical +imaging tasks, such as neuron or vessel segmentation. Despite the recent surge +in topology-aware methods addressing this challenge, their real-world +applicability is hindered by flawed benchmarking practices. In this paper, we +identify critical pitfalls in model evaluation that include inadequate +connectivity choices, overlooked topological artifacts in ground truth +annotations, and inappropriate use of evaluation metrics. Through detailed +empirical analysis, we uncover these issues' profound impact on the evaluation +and ranking of segmentation methods. Drawing from our findings, we propose a +set of actionable recommendations to establish fair and robust evaluation +standards for topology-aware medical image segmentation methods. + +
+
+ comment: Code is available at + https://github.com/AlexanderHBerger/topo-pitfalls +
+
+
+
+
+ + ☆ Towards Scalable and Deep Graph Neural Networks via Noise Masking + + +
+ In recent years, Graph Neural Networks (GNNs) have achieved remarkable success in many graph mining tasks. However, scaling them to large graphs is challenging due to the high computational and storage costs of repeated feature propagation and non-linear transformation during training. One commonly employed approach to address this challenge is model-simplification, which executes the Propagation (P) step only once during pre-processing, Combines (C) the resulting receptive fields in different ways, and then feeds them into a simple model for better performance. Despite their high predictive performance and scalability, these methods still face two limitations. First, existing approaches mainly focus on exploring different C methods from the model perspective, neglecting the crucial problem of performance degradation with increasing P depth from the data-centric perspective, known as the over-smoothing problem. Second, pre-processing overhead takes up most of the end-to-end processing time, especially for large-scale graphs. To address these limitations, we present random walk with noise masking (RMask), a plug-and-play module compatible with existing model-simplification works. This module enables the exploration of deeper GNNs while preserving their scalability. Unlike previous model-simplification works, we focus on continuous P and find that the noise present in each P step is the cause of the over-smoothing issue; we use an efficient masking mechanism to eliminate it. Experimental results on six real-world datasets demonstrate that model-simplification works equipped with RMask yield superior performance compared to their original versions and can make a good trade-off between accuracy and efficiency.
+
+
+
+
+ + ☆ Fast inverse lithography based on a model-driven block stacking + convolutional neural network + + +
+ In the realm of lithography, Optical Proximity Correction (OPC) is a crucial resolution enhancement technique that optimizes the transmission function of photomasks on a pixel basis to effectively counter Optical Proximity Effects (OPE). However, conventional pixel-based OPC methods often generate patterns that pose manufacturing challenges, thereby leading to increased costs in practical scenarios. This paper presents a novel inverse lithographic approach to OPC, employing a model-driven, block stacking deep learning framework that expedites the generation of masks conducive to manufacturing. This method is founded on vector lithography modelling and streamlines the training process by eliminating the requirement for extensive labeled datasets. Furthermore, the diversity of mask patterns is enhanced by employing a wave function collapse algorithm, which facilitates the random generation of a multitude of target patterns, thereby significantly expanding the range of mask paradigms. Numerical experiments have substantiated the efficacy of the proposed end-to-end approach, highlighting its superior capability to manage mask complexity within the context of advanced OPC lithography. This advancement is anticipated to enhance the feasibility and economic viability of OPC technology within actual manufacturing environments.
+
+ comment: 21 pages, 7 figures +
+
+
+
+
+ + ☆ LDP: Generalizing to Multilingual Visual Information Extraction by + Language Decoupled Pretraining AAAI2025 + + +
+ Visual Information Extraction (VIE) plays a crucial role in the comprehension +of semi-structured documents, and several pre-trained models have been +developed to enhance performance. However, most of these works are monolingual +(usually English). Due to the extremely unbalanced quantity and quality of +pre-training corpora between English and other languages, few works can extend +to non-English scenarios. In this paper, we conduct systematic experiments to +show that vision and layout modality hold invariance among images with +different languages. If decoupling language bias from document images, a +vision-layout-based model can achieve impressive cross-lingual generalization. +Accordingly, we present a simple but effective multilingual training paradigm +LDP (Language Decoupled Pre-training) for better utilization of monolingual +pre-training data. Our proposed model LDM (Language Decoupled Model) is first +pre-trained on the language-independent data, where the language knowledge is +decoupled by a diffusion model, and then the LDM is fine-tuned on the +downstream languages. Extensive experiments show that the LDM outperformed all +SOTA multilingual pre-trained models, and also maintains competitiveness on +downstream monolingual/English benchmarks. + +
+
+ comment: Accepted by AAAI2025 +
+
+
+
+
+ + ☆ Multi-Sensor Object Anomaly Detection: Unifying Appearance, Geometry, + and Internal Properties + + +
+ Object anomaly detection is essential for industrial quality inspection, yet +traditional single-sensor methods face critical limitations. They fail to +capture the wide range of anomaly types, as single sensors are often +constrained to either external appearance, geometric structure, or internal +properties. To overcome these challenges, we introduce MulSen-AD, the first +high-resolution, multi-sensor anomaly detection dataset tailored for industrial +applications. MulSen-AD unifies data from RGB cameras, laser scanners, and +lock-in infrared thermography, effectively capturing external appearance, +geometric deformations, and internal defects. The dataset spans 15 industrial +products with diverse, real-world anomalies. We also present MulSen-AD Bench, a +benchmark designed to evaluate multi-sensor methods, and propose +MulSen-TripleAD, a decision-level fusion algorithm that integrates these three +modalities for robust, unsupervised object anomaly detection. Our experiments +demonstrate that multi-sensor fusion substantially outperforms single-sensor +approaches, achieving 96.1% AUROC in object-level detection accuracy. These +results highlight the importance of integrating multi-sensor data for +comprehensive industrial anomaly detection. + +
+
+
+
+
+ + ☆ MixLLM: LLM Quantization with Global Mixed-precision between + Output-features and Highly-efficient System Design + + +
+ Quantization has become one of the most effective methodologies to compress LLMs into a smaller size. However, existing quantization solutions still show limitations of either a non-negligible accuracy drop or system inefficiency. In this paper, we make a comprehensive analysis of the general quantization principles and their effect on the triangle of accuracy, memory consumption, and system efficiency. We propose MixLLM, which explores the new optimization space of mixed-precision quantization between output features based on the insight that different output features matter differently in the model. MixLLM identifies the output features with high salience in the global view rather than within each single layer, effectively assigning the larger bit-width to output features that need it most to achieve good accuracy with low memory consumption. We present the sweet spot of quantization configuration of algorithm-system co-design that leads to high accuracy and system efficiency. To address the system challenge, we design a two-step dequantization that makes easy use of the int8 Tensor Core, with fast data type conversion to significantly reduce dequantization overhead, and present a software pipeline that best overlaps memory access, dequantization, and the MatMul. Extensive experiments show that with only 10% more bits, the PPL increase can be reduced from about 0.5 in SOTA to within 0.2 for Llama 3.1 70B, while on average MMLU-Pro improves by 0.93 over the SOTA of three popular models. In addition to its superior accuracy, MixLLM also achieves state-of-the-art system efficiency.
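+ The global mixed-precision idea can be sketched as follows: score every output feature with a salience proxy, rank the scores globally across all layers rather than per layer, and give the top fraction a wider bit-width. The row-norm salience proxy and the 10% budget below are illustrative assumptions, not MixLLM's actual criterion.

```python
import numpy as np

def assign_bitwidths(layer_weights, high_bits=8, low_bits=4, high_frac=0.10):
    """Globally rank output features by a salience proxy and assign bits.

    layer_weights: dict name -> weight matrix of shape (out_features, in_features)
    Returns: dict name -> per-output-feature bit-width array.
    """
    # Salience proxy: L2 norm of each output row (an assumption; the paper
    # uses its own global salience estimate).
    salience = {n: np.linalg.norm(w, axis=1) for n, w in layer_weights.items()}
    flat = np.concatenate(list(salience.values()))
    cutoff = np.quantile(flat, 1.0 - high_frac)   # global cutoff, not per-layer
    return {n: np.where(s >= cutoff, high_bits, low_bits)
            for n, s in salience.items()}

rng = np.random.default_rng(0)
weights = {f"layer{i}.proj": rng.normal(size=(256, 512)) * (1 + i)
           for i in range(4)}
bits = assign_bitwidths(weights)
avg_bits = np.concatenate(list(bits.values())).mean()
print(f"average bits per output feature: {avg_bits:.2f}")
```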
+
+ comment: The code will be released in the future +
+
+
+
+
+ + ☆ Accelerated Patient-Specific Calibration via Differentiable Hemodynamics + Simulations + + +
+ One of the goals of personalized medicine is to tailor diagnostics to +individual patients. Diagnostics are performed in practice by measuring +quantities, called biomarkers, that indicate the existence and progress of a +disease. In common cardiovascular diseases, such as hypertension, biomarkers +that are closely related to the clinical representation of a patient can be +predicted using computational models. Personalizing computational models +translates to considering patient-specific flow conditions, for example, the +compliance of blood vessels that cannot be a priori known and quantities such +as the patient geometry that can be measured using imaging. Therefore, a +patient is identified by a set of measurable and nonmeasurable parameters +needed to well-define a computational model; else, the computational model is +not personalized, meaning it is prone to large prediction errors. Therefore, to +personalize a computational model, sufficient information needs to be extracted +from the data. The current methods by which this is done are either +inefficient, due to relying on slow-converging optimization methods, or hard to +interpret, due to using `black box` deep-learning algorithms. We propose a +personalized diagnostic procedure based on a differentiable 0D-1D Navier-Stokes +reduced order model solver and fast parameter inference methods that take +advantage of gradients through the solver. By providing a faster method for +performing parameter inference and sensitivity analysis through +differentiability while maintaining the interpretability of well-understood +mathematical models and numerical methods, the best of both worlds is combined. +The performance of the proposed solver is validated against a well-established +process on different geometries, and different parameter inference processes +are successfully performed. + +
+
+
+
+
+ + ☆ Global Spatio-Temporal Fusion-based Traffic Prediction Algorithm with + Anomaly Aware + + +
+ Traffic prediction is an indispensable component of urban planning and traffic management. Achieving accurate traffic prediction hinges on the ability to capture the potential spatio-temporal relationships among road sensors. However, the majority of existing works focus on local short-term spatio-temporal correlations, failing to fully consider the interactions of different sensors in the long-term state. In addition, these works either do not analyze the influences of anomalous factors, or have insufficient ability to extract personalized features of anomalous factors, which prevents them from effectively capturing their spatio-temporal influences on traffic prediction. To address the aforementioned issues, we propose a global spatio-temporal fusion-based traffic prediction algorithm that incorporates anomaly awareness. Initially, based on the designed anomaly detection network, we construct an efficient anomalous factors impacting module (AFIM) to evaluate the spatio-temporal impact of unexpected external events on traffic prediction. Furthermore, we propose a multi-scale spatio-temporal feature fusion module (MTSFFL) based on the transformer architecture to capture both long- and short-term correlations among different sensors in a wide-area traffic environment for accurate prediction of traffic flow. Finally, experiments are conducted on real-scenario public transportation datasets (PEMS04 and PEMS08) to demonstrate that our approach can achieve state-of-the-art performance.
+
+
+
+
+ + ☆ AIArena: A Blockchain-Based Decentralized AI Training Platform + + +
+ The rapid advancement of AI has underscored critical challenges in its +development and implementation, largely due to centralized control by a few +major corporations. This concentration of power intensifies biases within AI +models, resulting from inadequate governance and oversight mechanisms. +Additionally, it limits public involvement and heightens concerns about the +integrity of model generation. Such monopolistic control over data and AI +outputs threatens both innovation and fair data usage, as users inadvertently +contribute data that primarily benefits these corporations. In this work, we +propose AIArena, a blockchain-based decentralized AI training platform designed +to democratize AI development and alignment through on-chain incentive +mechanisms. AIArena fosters an open and collaborative environment where +participants can contribute models and computing resources. Its on-chain +consensus mechanism ensures fair rewards for participants based on their +contributions. We instantiate and implement AIArena on the public Base +blockchain Sepolia testnet, and the evaluation results demonstrate the +feasibility of AIArena in real-world applications. + +
+
+
+
+
+ + ♻ ☆ SoK: Watermarking for AI-Generated Content + + +
+ As the outputs of generative AI (GenAI) techniques improve in quality, it +becomes increasingly challenging to distinguish them from human-created +content. Watermarking schemes are a promising approach to address the problem +of distinguishing between AI and human-generated content. These schemes embed +hidden signals within AI-generated content to enable reliable detection. While +watermarking is not a silver bullet for addressing all risks associated with +GenAI, it can play a crucial role in enhancing AI safety and trustworthiness by +combating misinformation and deception. This paper presents a comprehensive +overview of watermarking techniques for GenAI, beginning with the need for +watermarking from historical and regulatory perspectives. We formalize the +definitions and desired properties of watermarking schemes and examine the key +objectives and threat models for existing approaches. Practical evaluation +strategies are also explored, providing insights into the development of robust +watermarking techniques capable of resisting various attacks. Additionally, we +review recent representative works, highlight open challenges, and discuss +potential directions for this emerging field. By offering a thorough +understanding of watermarking in GenAI, this work aims to guide researchers in +advancing watermarking methods and applications, and support policymakers in +addressing the broader implications of GenAI. + +
+
+
+
+
+ + ♻ ☆ S$^{2}$FT: Efficient, Scalable and Generalizable LLM Fine-tuning by + Structured Sparsity + + +
+ Current PEFT methods for LLMs can achieve either high quality, efficient +training, or scalable serving, but not all three simultaneously. To address +this limitation, we investigate sparse fine-tuning and observe a remarkable +improvement in generalization ability. Utilizing this key insight, we propose a +family of Structured Sparse Fine-Tuning (S$^{2}$FT) methods for LLMs, which +concurrently achieve state-of-the-art fine-tuning performance, training +efficiency, and inference scalability. S$^{2}$FT accomplishes this by +"selecting sparsely and computing densely". It selects a few heads and channels +in the MHA and FFN modules for each Transformer block, respectively. Next, it +co-permutes weight matrices on both sides of the coupled structures in LLMs to +connect the selected components in each layer into a dense submatrix. Finally, +S$^{2}$FT performs in-place gradient updates on all submatrices. Through +theoretical analysis and empirical results, our method prevents forgetting +while simplifying optimization, delivers SOTA performance on both commonsense +and arithmetic reasoning with 4.6% and 1.3% average improvements compared to +LoRA, and surpasses full FT by 11.5% when generalizing to various domains after +instruction tuning. Using our partial backpropagation algorithm, S$^{2}$FT +saves training memory up to 3$\times$ and improves latency by 1.5-2.7$\times$ +compared to full FT, while delivering an average 10% improvement over LoRA on +both metrics. We further demonstrate that the weight updates in S$^{2}$FT can +be decoupled into adapters, enabling effective fusion, fast switch, and +efficient parallelism for serving multiple fine-tuned models. + +
+
+
+
+
+ + ♻ ☆ URIEL+: Enhancing Linguistic Inclusion and Usability in a Typological + and Multilingual Knowledge Base COLING 2025 + + +
+ URIEL is a knowledge base offering geographical, phylogenetic, and +typological vector representations for 7970 languages. It includes distance +measures between these vectors for 4005 languages, which are accessible via the +lang2vec tool. Despite being frequently cited, URIEL is limited in terms of +linguistic inclusion and overall usability. To tackle these challenges, we +introduce URIEL+, an enhanced version of URIEL and lang2vec that addresses +these limitations. In addition to expanding typological feature coverage for +2898 languages, URIEL+ improves the user experience with robust, customizable +distance calculations to better suit the needs of users. These upgrades also +offer competitive performance on downstream tasks and provide distances that +better align with linguistic distance studies. + +
+
+ comment: Accepted to COLING 2025 +
+
+
+
+
+ + ♻ ☆ Sometimes I am a Tree: Data Drives Unstable Hierarchical Generalization + + +
+ Language models (LMs), like other neural networks, often favor shortcut +heuristics based on surface-level patterns. Although LMs behave like n-gram +models early in training, they must eventually learn hierarchical syntactic +representations to correctly apply grammatical rules out-of-distribution (OOD). +In this work, we use case studies of English grammar to explore how complex, +diverse training data drives models to generalize OOD. We construct a framework +that unifies our understanding of random variation with training dynamics, rule +selection with memorization, and data diversity with complexity. We show that +these factors are nuanced, and that intermediate levels of diversity and +complexity lead to inconsistent behavior across random seeds and to unstable +training dynamics. Our findings emphasize the critical role of training data in +shaping generalization patterns and illuminate how competing model strategies +lead to inconsistent generalization outcomes across random seeds. Code is +available at https://github.com/sunnytqin/concept_comp.git. + +
+
+
+
+
+ + ♻ ☆ Latent Ewald summation for machine learning of long-range interactions + + +
+ Machine learning interatomic potentials (MLIPs) often neglect long-range +interactions, such as electrostatic and dispersion forces. In this work, we +introduce a straightforward and efficient method to account for long-range +interactions by learning a latent variable from local atomic descriptors and +applying an Ewald summation to this variable. We demonstrate that in systems +including charged and polar molecular dimers, bulk water, and water-vapor +interface, standard short-ranged MLIPs can lead to unphysical predictions even +when employing message passing. The long-range models effectively eliminate +these artifacts, with only about twice the computational cost of short-range +MLIPs. + +
+
+
+
+
+ + ♻ ☆ Revisiting Machine Unlearning with Dimensional Alignment + + +
+ Machine unlearning, an emerging research topic focusing on compliance with +data privacy regulations, enables trained models to remove the information +learned from specific data. While many existing methods indirectly address this +issue by intentionally injecting incorrect supervisions, they can drastically +and unpredictably alter the decision boundaries and feature spaces, leading to +training instability and undesired side effects. To fundamentally approach this +task, we first analyze the changes in latent feature spaces between original +and retrained models, and observe that the feature representations of samples +not involved in training are closely aligned with the feature manifolds of +previously seen samples in training. Based on these findings, we introduce a +novel evaluation metric for machine unlearning, coined dimensional alignment, +which measures the alignment between the eigenspaces of the forget and retain +set samples. We employ this metric as a regularizer loss to build a robust and +stable unlearning framework, which is further enhanced by integrating a +self-distillation loss and an alternating training scheme. Our framework +effectively eliminates information from the forget set and preserves knowledge +from the retain set. Lastly, we identify critical flaws in established +evaluation metrics for machine unlearning, and introduce new evaluation tools +that more accurately reflect the fundamental goals of machine unlearning. + +
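+ A rough numpy sketch of an eigenspace-alignment score between forget-set and retain-set features is shown below: take the top-k right singular vectors of each feature matrix and measure the overlap of the two subspaces. The normalization and the choice of k are assumptions for intuition; the paper's dimensional-alignment metric is not reproduced exactly.

```python
import numpy as np

def top_k_eigenspace(features, k):
    # Rows are samples, columns are feature dimensions; the top-k right
    # singular vectors span the dominant directions of the feature space.
    _, _, vt = np.linalg.svd(features - features.mean(axis=0), full_matrices=False)
    return vt[:k].T                               # (dim, k), orthonormal columns

def dimensional_alignment(forget_feats, retain_feats, k=16):
    U_f = top_k_eigenspace(forget_feats, k)
    U_r = top_k_eigenspace(retain_feats, k)
    # Frobenius overlap of the two k-dimensional subspaces, normalized to [0, 1].
    return np.linalg.norm(U_f.T @ U_r) ** 2 / k

rng = np.random.default_rng(0)
basis = rng.normal(size=(16, 128))                # shared low-rank structure
retain = rng.normal(size=(2000, 16)) @ basis + 0.1 * rng.normal(size=(2000, 128))
forget_aligned = rng.normal(size=(500, 16)) @ basis + 0.1 * rng.normal(size=(500, 128))
forget_random = rng.normal(size=(500, 128))       # no shared structure

print(dimensional_alignment(forget_aligned, retain))   # close to 1
print(dimensional_alignment(forget_random, retain))    # much lower (~ k / dim)
```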
+
+
+
+
+ + ♻ ☆ Metric Compatible Training for Online Backfilling in Large-Scale + Retrieval + + +
+ Backfilling is the process of re-extracting all gallery embeddings from +upgraded models in image retrieval systems. It inevitably requires a +prohibitively large amount of computational cost and even entails the downtime +of the service. Although backward-compatible learning sidesteps this challenge +by tackling query-side representations, this leads to suboptimal solutions in +principle because gallery embeddings cannot benefit from model upgrades. We +address this dilemma by introducing an online backfilling algorithm, which +enables us to achieve a progressive performance improvement during the +backfilling process while not sacrificing the final performance of new model +after the completion of backfilling. To this end, we first propose a simple +distance rank merge technique for online backfilling. Then, we incorporate a +reverse transformation module for more effective and efficient merging, which +is further enhanced by adopting a metric-compatible contrastive learning +approach. These two components help to make the distances of old and new models +compatible, resulting in desirable merge results during backfilling with no +extra computational overhead. Extensive experiments show the effectiveness of +our framework on four standard benchmarks in various settings. + +
+
+
+
+
+ + ♻ ☆ Latent Variable Sequence Identification for Cognitive Models with Neural + Network Estimators + + +
+ Extracting time-varying latent variables from computational cognitive models +is a key step in model-based neural analysis, which aims to understand the +neural correlates of cognitive processes. However, existing methods only allow +researchers to infer latent variables that explain subjects' behavior in a +relatively small class of cognitive models. For example, a broad class of +relevant cognitive models with analytically intractable likelihood is currently +out of reach from standard techniques, based on Maximum a Posteriori parameter +estimation. Here, we present an approach that extends neural Bayes estimation +to learn a direct mapping between experimental data and the targeted latent +variable space using recurrent neural networks and simulated datasets. We show +that our approach achieves competitive performance in inferring latent variable +sequences in both tractable and intractable models. Furthermore, the approach +is generalizable across different computational models and is adaptable for +both continuous and discrete latent spaces. We then demonstrate its +applicability in real world datasets. Our work underscores that combining +recurrent neural networks and simulation-based inference to identify latent +variable sequences can enable researchers to access a wider class of cognitive +models for model-based neural analyses, and thus test a broader set of +theories. + +
+
+
+
+
+ + ♻ ☆ LLMs as Zero-shot Graph Learners: Alignment of GNN Representations with + LLM Token Embeddings + + +
+ Zero-shot graph machine learning, especially with graph neural networks +(GNNs), has garnered significant interest due to the challenge of scarce +labeled data. While methods like self-supervised learning and graph prompt +learning have been extensively explored, they often rely on fine-tuning with +task-specific labels, limiting their effectiveness in zero-shot scenarios. +Inspired by the zero-shot capabilities of instruction-fine-tuned large language +models (LLMs), we introduce a novel framework named Token Embedding-Aligned +Graph Language Model (TEA-GLM) that leverages LLMs as cross-dataset and +cross-task zero-shot learners for graph machine learning. Concretely, we +pretrain a GNN, aligning its representations with token embeddings of an LLM. +We then train a linear projector that transforms the GNN's representations into +a fixed number of graph token embeddings without tuning the LLM. A unified +instruction is designed for various graph tasks at different levels, such as +node classification (node-level) and link prediction (edge-level). These design +choices collectively enhance our method's effectiveness in zero-shot learning, +setting it apart from existing methods. Experiments show that our graph token +embeddings help the LLM predictor achieve state-of-the-art performance on +unseen datasets and tasks compared to other methods using LLMs as predictors. + +
+
+
+
+
+ + ♻ ☆ Learning from Linear Algebra: A Graph Neural Network Approach to + Preconditioner Design for Conjugate Gradient Solvers + + +
+ Large linear systems are ubiquitous in modern computational science and +engineering. The main recipe for solving them is the use of Krylov subspace +iterative methods with well-designed preconditioners. Deep learning models can +be used as nonlinear preconditioners during the iteration of linear solvers +such as the conjugate gradient (CG) method. Neural network models require an +enormous number of parameters to approximate well in this setup. Another +approach is to take advantage of small graph neural networks (GNNs) to +construct preconditioners with predefined sparsity patterns. Recently, GNNs +have been shown to be a promising tool for designing preconditioners to reduce +the overall computational cost of iterative methods by constructing them more +efficiently than with classical linear algebra techniques. However, +preconditioners designed with these approaches cannot outperform those designed +with classical methods in terms of the number of iterations in CG. In our work, +we recall well-established preconditioners from linear algebra and use them as +a starting point for training the GNN to obtain preconditioners that reduce the +condition number of the system more significantly. Numerical experiments show +that our approach outperforms both classical and neural network-based methods +for an important class of parametric partial differential equations. We also +provide a heuristic justification for the loss function used and show that +preconditioners obtained by learning with this loss function reduce the +condition number in a more desirable way for CG. + +
+
+
+
+
+ + ♻ ☆ TurboSVM-FL: Boosting Federated Learning through SVM Aggregation for + Lazy Clients AAAI + + +
+ Federated learning is a distributed collaborative machine learning paradigm that has gained strong momentum in recent years. In federated learning, a central server periodically coordinates models with clients and aggregates the models trained locally by clients without necessitating access to local data. Despite its potential, the implementation of federated learning continues to encounter several challenges, predominantly the slow convergence that is largely due to data heterogeneity. The slow convergence becomes particularly problematic in cross-device federated learning scenarios where clients may be strongly limited by computing power and storage space, and hence counteracting methods that induce additional computation or memory costs on the client side, such as auxiliary objective terms and larger training iterations, can be impractical. In this paper, we propose a novel federated aggregation strategy, TurboSVM-FL, that imposes no additional computation burden on the client side and can significantly accelerate convergence for federated classification tasks, especially when clients are "lazy" and train their models for only a few epochs before the next global aggregation. TurboSVM-FL extensively utilizes support vector machines to conduct selective aggregation and max-margin spread-out regularization on class embeddings. We evaluate TurboSVM-FL on multiple datasets including FEMNIST, CelebA, and Shakespeare using user-independent validation with non-iid data distribution. Our results show that TurboSVM-FL can significantly outperform existing popular algorithms in convergence rate and reduce communication rounds while delivering better test metrics including accuracy, F1 score, and MCC.
+
+ comment: Proceedings of the AAAI Conference on Artificial Intelligence 2024 + (AAAI'24) +
+
+
+
+
+ + ♻ ☆ Mitigating federated learning contribution allocation instability + through randomized aggregation + + +
+ Federated learning (FL) is a collaborative and privacy-preserving Machine +Learning paradigm, allowing the development of robust models without the need +to centralise sensitive data. A critical challenge in FL lies in fairly and +accurately allocating contributions from diverse participants. Inaccurate +allocation can undermine trust, lead to unfair compensation, and thus +participants may lack the incentive to join or actively contribute to the +federation. + Various remuneration strategies have been proposed to date, including +auction-based approaches and Shapley-value based methods, the latter offering a +means to quantify the contribution of each participant. However, little to no +work has studied the stability of these contribution evaluation methods. + In this paper, we focus on calculating contributions using gradient-based +model reconstruction techniques with Shapley values. We first show that +baseline Shapley values do not accurately reflect clients' contributions, +leading to unstable reward allocations amongst participants in a cross-silo +federation. We then introduce \textsc{FedRandom}, a new method that mitigates +these shortcomings with additional data samplings, and show its efficacy at +increasing the stability of contribution evaluation in federated learning. + +
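+ For context, the sketch below estimates per-client Shapley values by sampling permutations against an arbitrary utility function (for example, the validation accuracy of a model aggregated from a coalition of clients). The synthetic utility and the number of permutations are assumptions; the paper's gradient-based reconstruction and FedRandom samplings are not reproduced here.

```python
import numpy as np

def shapley_monte_carlo(clients, utility, n_perm=200, rng=None):
    """Estimate each client's Shapley value by sampling permutations.

    utility: callable taking a frozenset of client ids and returning a score.
    """
    rng = rng or np.random.default_rng(0)
    phi = {c: 0.0 for c in clients}
    for _ in range(n_perm):
        order = list(rng.permutation(clients))
        coalition, prev = frozenset(), utility(frozenset())
        for c in order:
            coalition = coalition | {c}
            val = utility(coalition)
            phi[c] += (val - prev) / n_perm     # marginal contribution of c
            prev = val
    return phi

# Synthetic utility: diminishing returns in the number of "useful" clients.
useful = {0, 1, 2}
def utility(coalition):
    k = len(coalition & useful)
    return 1.0 - 0.5 ** k

print(shapley_monte_carlo(list(range(5)), utility))
```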
+
+
+
+
+ + ♻ ☆ Arbitrary Polynomial Separations in Trainable Quantum Machine Learning + + +
+ Recent theoretical results in quantum machine learning have demonstrated a +general trade-off between the expressive power of quantum neural networks +(QNNs) and their trainability; as a corollary of these results, practical +exponential separations in expressive power over classical machine learning +models are believed to be infeasible as such QNNs take a time to train that is +exponential in the model size. We here circumvent these negative results by +constructing a hierarchy of efficiently trainable QNNs that exhibit +unconditionally provable, polynomial memory separations of arbitrary constant +degree over classical neural networks -- including state-of-the-art models, +such as Transformers -- in performing a classical sequence modeling task. This +construction is also computationally efficient, as each unit cell of the +introduced class of QNNs only has constant gate complexity. We show that +contextuality -- informally, a quantitative notion of semantic ambiguity -- is +the source of the expressivity separation, suggesting that other learning tasks +with this property may be a natural setting for the use of quantum learning +algorithms. + +
+
+ comment: 24 pages, 3 figures, strengthened and simplified results and + presentation +
+
+
+
+
+ + ♻ ☆ Optimized Gradient Clipping for Noisy Label Learning AAAI2025 + + +
+ Previous research has shown that constraining the gradient of loss function +with respect to model-predicted probabilities can enhance the model robustness +against noisy labels. These methods typically specify a fixed optimal threshold +for gradient clipping through validation data to obtain the desired robustness +against noise. However, this common practice overlooks the dynamic distribution +of gradients from both clean and noisy-labeled samples at different stages of +training, significantly limiting the model capability to adapt to the variable +nature of gradients throughout the training process. To address this issue, we +propose a simple yet effective approach called Optimized Gradient Clipping +(OGC), which dynamically adjusts the clipping threshold based on the ratio of +noise gradients to clean gradients after clipping, estimated by modeling the +distributions of clean and noisy samples. This approach allows us to modify the +clipping threshold at each training step, effectively controlling the influence +of noise gradients. Additionally, we provide statistical analysis to certify +the noise-tolerance ability of OGC. Our extensive experiments across various +types of label noise, including symmetric, asymmetric, instance-dependent, and +real-world noise, demonstrate the effectiveness of our approach. + +
+
+ comment: Accepted by AAAI2025 +
+
+
+
+
+ + ♻ ☆ Clustering of timed sequences -- Application to the analysis of care + pathways + + +
+ Improving the future of healthcare starts by better understanding the current +actual practices in hospital settings. This motivates the objective of +discovering typical care pathways from patient data. Revealing typical care +pathways can be achieved through clustering. The difficulty in clustering care +pathways, represented by sequences of timestamped events, lies in defining a +semantically appropriate metric and clustering algorithms. In this article, we +adapt two methods developed for time series to the clustering of timed +sequences: the drop-DTW metric and the DBA approach for the construction of +averaged time sequences. These methods are then applied in clustering +algorithms to propose original and sound clustering algorithms for timed +sequences. This approach is experimented with and evaluated on synthetic and +real-world data. + +
+
+
+
+
+ + ♻ ☆ Task Adaptation of Reinforcement Learning-based NAS Agents through + Transfer Learning + + +
+ Recently, a novel paradigm has been proposed for reinforcement learning-based +NAS agents, that revolves around the incremental improvement of a given +architecture. We assess the abilities of such reinforcement learning agents to +transfer between different tasks. We perform our evaluation using the +Trans-NASBench-101 benchmark, and consider the efficacy of the transferred +agents, as well as how quickly they can be trained. We find that pretraining an +agent on one task benefits the performance of the agent in another task in all +but 1 task when considering final performance. We also show that the training +procedure for an agent can be shortened significantly by pretraining it on +another task. Our results indicate that these effects occur regardless of the +source or target task, although they are more pronounced for some tasks than +for others. Our results show that transfer learning can be an effective tool in +mitigating the computational cost of the initial training procedure for +reinforcement learning-based NAS agents. + +
+
+ comment: 15 Pages, 13 Figures, Corrected data in Figure 5 +
+
+
+
+
+ + ♻ ☆ Samudra: An AI Global Ocean Emulator for Climate + + +
+ AI emulators for forecasting have emerged as powerful tools that can outperform conventional numerical predictions. The next frontier is to build emulators for long climate simulations with skill across a range of spatiotemporal scales, a particularly important goal for the ocean. Our work builds a skillful global emulator of the ocean component of a state-of-the-art climate model. We emulate key ocean variables (sea surface height, horizontal velocities, temperature, and salinity) across their full depth. We use a modified ConvNeXt UNet architecture trained on multidepth levels of ocean data. We show that the ocean emulator, Samudra, exhibits no drift relative to the truth and can reproduce the depth structure of ocean variables and their interannual variability. Samudra is stable for centuries and 150 times faster than the original ocean model. Samudra still struggles to simultaneously capture the correct magnitude of the forcing trends and remain stable, which requires further work.
+
+
+
+
+ + ♻ ☆ Enhancing Ethereum Fraud Detection via Generative and Contrastive + Self-supervision + + +
+ The rampant fraudulent activities on Ethereum hinder the healthy development +of the blockchain ecosystem, necessitating the reinforcement of regulations. +However, multiple imbalances involving account interaction frequencies and +interaction types in the Ethereum transaction environment pose significant +challenges to data mining-based fraud detection research. To address this, we +first propose the concept of meta-interactions to refine interaction behaviors +in Ethereum, and based on this, we present a dual self-supervision enhanced +Ethereum fraud detection framework, named Meta-IFD. This framework initially +introduces a generative self-supervision mechanism to augment the interaction +features of accounts, followed by a contrastive self-supervision mechanism to +differentiate various behavior patterns, and ultimately characterizes the +behavioral representations of accounts and mines potential fraud risks through +multi-view interaction feature learning. Extensive experiments on real Ethereum +datasets demonstrate the effectiveness and superiority of our framework in +detecting common Ethereum fraud behaviors such as Ponzi schemes and phishing +scams. Additionally, the generative module can effectively alleviate the +interaction distribution imbalance in Ethereum data, while the contrastive +module significantly enhances the framework's ability to distinguish different +behavior patterns. The source code will be available in +https://github.com/GISec-Team/Meta-IFD. + +
+
+ comment: Accepted by IEEE Transactions on Information Forensics & Security +
+
+
+
+
+ + ♻ ☆ SageAttention2: Efficient Attention with Thorough Outlier Smoothing and + Per-thread INT4 Quantization + + +
+ Although quantization for linear layers has been widely used, its application to accelerate the attention process remains limited. To further enhance the efficiency of attention computation compared to SageAttention while maintaining precision, we propose SageAttention2, which utilizes significantly faster 4-bit matrix multiplication (Matmul) alongside additional precision-enhancing techniques. First, we propose to quantize the matrices $(Q, K)$ to INT4 at a hardware-friendly thread-level granularity and quantize the matrices $(\widetilde P, V)$ to FP8. Second, we propose a method to smooth $Q$, enhancing the accuracy of INT4 $QK$. Third, we propose to use an FP32 Matmul buffer for $PV$ to enhance the accuracy of FP8 $\widetilde PV$. The operations per second (OPS) of SageAttention2 surpass FlashAttention2 and xformers by about 3x and 5x on RTX4090, respectively. Comprehensive experiments confirm that our approach incurs negligible end-to-end metric loss across diverse models, including those for large language processing, image generation, and video generation. The code is available at https://github.com/thu-ml/SageAttention.
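+ The "smooth then quantize" step can be pictured with the sketch below: a per-channel mean is removed from Q before symmetric per-block INT4 quantization, which shrinks the dynamic range each scale must cover (the removed mean would be folded back into the attention computation as a correction term). The block size, the smoothing rule, and the synthetic data are assumptions; the thread-level CUDA layout is not modeled.

```python
import numpy as np

def quantize_int4_sym(x, block=64):
    """Symmetric per-block INT4 quantization along the last axis.

    Returns the int4 codes (stored in int8), the per-block scales, and a
    dequantized copy for inspecting the quantization error.
    """
    orig_shape = x.shape
    x = x.reshape(-1, block)
    scale = np.abs(x).max(axis=1, keepdims=True) / 7.0 + 1e-12   # int4 range [-7, 7]
    q = np.clip(np.round(x / scale), -7, 7).astype(np.int8)
    return q.reshape(orig_shape), scale, (q * scale).reshape(orig_shape)

rng = np.random.default_rng(0)
Q = rng.normal(size=(128, 64)) + 3.0          # a strong per-channel offset
# Smoothing: remove the mean over tokens before quantizing (the removed mean
# would be added back via a correction term in the attention computation).
Q_smooth = Q - Q.mean(axis=0, keepdims=True)

for name, mat in [("raw", Q), ("smoothed", Q_smooth)]:
    _, _, deq = quantize_int4_sym(mat)
    err = np.abs(mat - deq).mean()
    print(f"{name:9s} mean abs quantization error: {err:.4f}")
```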
+
+
+
+
+ + ♻ ☆ Benchmarking Large Language Models for Math Reasoning Tasks + + +
+ The use of Large Language Models (LLMs) in mathematical reasoning has become +a cornerstone of related research, demonstrating the intelligence of these +models and enabling potential practical applications through their advanced +performance, such as in educational settings. Despite the variety of datasets +and in-context learning algorithms designed to improve the ability of LLMs to +automate mathematical problem solving, the lack of comprehensive benchmarking +across different datasets makes it complicated to select an appropriate model +for specific tasks. In this project, we present a benchmark that fairly +compares seven state-of-the-art in-context learning algorithms for mathematical +problem solving across five widely used mathematical datasets on four powerful +foundation models. Furthermore, we explore the trade-off between efficiency and +performance, highlighting the practical applications of LLMs for mathematical +reasoning. Our results indicate that larger foundation models like GPT-4o and +LLaMA 3-70B can solve mathematical reasoning independently from the concrete +prompting strategy, while for smaller models the in-context learning approach +significantly influences the performance. Moreover, the optimal prompt depends +on the chosen foundation model. We open-source our benchmark code to support +the integration of additional models in future research. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ Real-Time Damage Detection in Fiber Lifting Ropes Using Lightweight + Convolutional Neural Networks + + +
+ The health and safety hazards posed by worn crane lifting ropes mandate +periodic inspection for damage. This task is time-consuming, prone to human +error, halts operation, and may result in the premature disposal of ropes. +Therefore, we propose using efficient deep learning and computer vision methods +to automate the process of detecting damaged ropes. Specifically, we present a +vision-based system for detecting damage in synthetic fiber rope images using +lightweight convolutional neural networks. We develop a camera-based apparatus +to photograph the lifting rope's surface, while in operation, and capture the +progressive wear-and-tear as well as the more significant degradation in the +rope's health state. Experts from Konecranes annotate the collected images in +accordance with the rope's condition; normal or damaged. Then, we pre-process +the images, systematically design a deep learning model, evaluate its detection +and prediction performance, analyze its computational complexity, and compare +it with various other models. Experimental results show the proposed model +outperforms other similar techniques with 96.5% accuracy, 94.8% precision, +98.3% recall, 96.5% F1-score, and 99.3% AUC. Besides, they demonstrate the +model's real-time operation, low memory footprint, robustness to various +environmental and operational conditions, and adequacy for deployment in +industrial applications such as lifting, mooring, towing, climbing, and +sailing. + +
+
+
+
+
+ + ♻ ☆ Scaling Laws for Imitation Learning in Single-Agent Games + + +
+ Imitation Learning (IL) is one of the most widely used methods in machine +learning. Yet, many works find it is often unable to fully recover the +underlying expert behavior, even in constrained environments like single-agent +games. However, none of these works deeply investigate the role of scaling up +the model and data size. Inspired by recent work in Natural Language Processing +(NLP) where "scaling up" has resulted in increasingly more capable LLMs, we +investigate whether carefully scaling up model and data size can bring similar +improvements in the imitation learning setting for single-agent games. We first +demonstrate our findings on a variety of Atari games, and thereafter focus on +the extremely challenging game of NetHack. In all games, we find that IL loss +and mean return scale smoothly with the compute budget (FLOPs) and are strongly +correlated, resulting in power laws for training compute-optimal IL agents. +Finally, we forecast and train several NetHack agents with IL and find they +outperform prior state-of-the-art by 1.5x in all settings. Our work both +demonstrates the scaling behavior of imitation learning in a variety of +single-agent games, as well as the viability of scaling up current approaches +for increasingly capable agents in NetHack, a game that remains elusively hard +for current AI systems. + +
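+ The compute-loss power laws referred to above can be illustrated by fitting log(loss) against log(FLOPs) with ordinary least squares, as in the sketch below on synthetic points; the exponent, constant, and compute budgets are made up for illustration and are not the paper's measurements.

```python
import numpy as np

def fit_power_law(flops, loss):
    """Fit loss ~= a * flops**(-b) by linear regression in log-log space."""
    slope, intercept = np.polyfit(np.log(flops), np.log(loss), deg=1)
    return np.exp(intercept), -slope            # (a, b)

rng = np.random.default_rng(0)
flops = np.logspace(15, 20, 12)                               # compute budgets
loss = 3.0 * flops ** -0.08 * np.exp(rng.normal(0, 0.02, 12))  # synthetic losses

a, b = fit_power_law(flops, loss)
print(f"loss ~ {a:.2f} * FLOPs^(-{b:.3f})")
forecast = a * (1e21) ** -b                     # extrapolate to a larger budget
print(f"forecast loss at 1e21 FLOPs: {forecast:.3f}")
```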
+
+ comment: Accepted at TMLR 2024 +
+
+
+
+
+ + ♻ ☆ Union-over-Intersections: Object Detection beyond Winner-Takes-All + + +
+ This paper revisits the problem of predicting box locations in object +detection architectures. Typically, each box proposal or box query aims to +directly maximize the intersection-over-union score with the ground truth, +followed by a winner-takes-all non-maximum suppression where only the highest +scoring box in each region is retained. We observe that both steps are +sub-optimal: the first involves regressing proposals to the entire ground +truth, which is a difficult task even with large receptive fields, and the +second neglects valuable information from boxes other than the top candidate. +Instead of regressing proposals to the whole ground truth, we propose a simpler +approach: regress only to the area of intersection between the proposal and the +ground truth. This avoids the need for proposals to extrapolate beyond their +visual scope, improving localization accuracy. Rather than adopting a +winner-takes-all strategy, we take the union over the regressed intersections +of all boxes in a region to generate the final box outputs. Our plug-and-play +method integrates seamlessly into proposal-based, grid-based, and query-based +detection architectures with minimal modifications, consistently improving +object localization and instance segmentation. We demonstrate its broad +applicability and versatility across various detection and segmentation tasks. + +
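The box-level idea described above can be illustrated with a small, self-contained sketch. This is not the authors' implementation; it only shows, for axis-aligned boxes in (x1, y1, x2, y2) format, what it means to regress each proposal toward its intersection with the ground truth and then merge the regressed intersections (here crudely, via their enclosing box) instead of keeping a single winning box.

```python
# Illustrative sketch only: axis-aligned boxes as (x1, y1, x2, y2).
# The "union over intersections" step is approximated here by the tightest
# enclosing box of the per-proposal intersection targets.

def intersection(box_a, box_b):
    """Return the overlap box of two boxes, or None if they do not overlap."""
    x1 = max(box_a[0], box_b[0])
    y1 = max(box_a[1], box_b[1])
    x2 = min(box_a[2], box_b[2])
    y2 = min(box_a[3], box_b[3])
    if x1 >= x2 or y1 >= y2:
        return None
    return (x1, y1, x2, y2)

def enclosing_union(boxes):
    """Merge boxes by taking their enclosing box (a simple stand-in for the union step)."""
    xs1, ys1, xs2, ys2 = zip(*boxes)
    return (min(xs1), min(ys1), max(xs2), max(ys2))

ground_truth = (10, 10, 100, 80)
proposals = [(0, 0, 60, 50), (40, 30, 120, 90), (70, 5, 110, 45)]

# Each proposal regresses only toward its visible overlap with the ground truth...
targets = [t for t in (intersection(p, ground_truth) for p in proposals) if t is not None]

# ...and the final box is assembled from all of them instead of a single top-scoring candidate.
print("intersection targets:", targets)
print("merged prediction:   ", enclosing_union(targets))
```

In this toy example the three partial intersections already cover the ground truth, so their union recovers it even though no single proposal does.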
+
+ comment: 17 pages, 6 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ Online MDP with Transition Prototypes: A Robust Adaptive Approach + + +
+ In this work, we consider an online robust Markov Decision Process (MDP) +where we have the information of finitely many prototypes of the underlying +transition kernel. We consider an adaptively updated ambiguity set of the +prototypes and propose an algorithm that efficiently identifies the true +underlying transition kernel while guaranteeing the performance of the +corresponding robust policy. To be more specific, we provide a sublinear regret +of the subsequent optimal robust policy. We also provide an early stopping +mechanism and a worst-case performance bound of the value function. In +numerical experiments, we demonstrate that our method outperforms existing +approaches, particularly in the early stage with limited data. This work +contributes to robust MDPs by considering possible prior information about the +underlying transition probability and online learning, offering both +theoretical insights and practical algorithms for improved decision-making +under uncertainty. + +
+
+
+
+
+ + ♻ ☆ Do Parameters Reveal More than Loss for Membership Inference? + + +
+ Membership inference attacks are used as a key tool for disclosure auditing. +They aim to infer whether an individual record was used to train a model. While +such evaluations are useful to demonstrate risk, they are computationally +expensive and often make strong assumptions about potential adversaries' access +to models and training environments, and thus do not provide tight bounds on +leakage from potential attacks. We show how prior claims around black-box +access being sufficient for optimal membership inference do not hold for +stochastic gradient descent, and that optimal membership inference indeed +requires white-box access. Our theoretical results lead to a new white-box +inference attack, IHA (Inverse Hessian Attack), that explicitly uses model +parameters by taking advantage of computing inverse-Hessian vector products. +Our results show that both auditors and adversaries may be able to benefit from +access to model parameters, and we advocate for further research into white-box +methods for membership inference. + +
+
+ comment: Accepted to Transactions on Machine Learning Research (TMLR) +
+
+
+
+
+ + ♻ ☆ Hybridization of Persistent Homology with Neural Networks for + Time-Series Prediction: A Case Study in Wave Height + + +
+ Time-series prediction is an active area of research across various fields, +often challenged by the fluctuating influence of short-term and long-term +factors. In this study, we introduce a feature engineering method that enhances +the predictive performance of neural network models. Specifically, we leverage +computational topology techniques to derive valuable topological features from +input data, boosting the predictive accuracy of our models. Our focus is on +predicting wave heights, utilizing models based on topological features within +feedforward neural networks (FNNs), recurrent neural networks (RNNs), long +short-term memory networks (LSTM), and RNNs with gated recurrent units (GRU). +For time-ahead predictions, the enhancements in $R^2$ score were significant +for FNNs, RNNs, LSTM, and GRU models. Additionally, these models also showed +significant reductions in maximum errors and mean squared errors. + +
+
+ comment: the paper contains errors +
+
+
+
+
+ + ♻ ☆ AndroidWorld: A Dynamic Benchmarking Environment for Autonomous Agents + + +
+ Autonomous agents that execute human tasks by controlling computers can +enhance human productivity and application accessibility. However, progress in +this field will be driven by realistic and reproducible benchmarks. We present +AndroidWorld, a fully functional Android environment that provides reward +signals for 116 programmatic tasks across 20 real-world Android apps. Unlike +existing interactive environments, which provide a static test set, +AndroidWorld dynamically constructs tasks that are parameterized and expressed +in natural language in unlimited ways, thus enabling testing on a much larger +and more realistic suite of tasks. To ensure reproducibility, each task +includes dedicated initialization, success-checking, and tear-down logic, which +modifies and inspects the device's system state. We experiment with baseline +agents to test AndroidWorld and provide initial results on the benchmark. Our +best agent can complete 30.6% of AndroidWorld's tasks, leaving ample room for +future work. Furthermore, we adapt a popular desktop web agent to work on +Android, which we find to be less effective on mobile, suggesting future +research is needed to achieve universal, cross-platform agents. Finally, we +also conduct a robustness analysis, showing that task variations can +significantly affect agent performance, demonstrating that without such +testing, agent performance metrics may not fully reflect practical challenges. +AndroidWorld and the experiments in this paper are available at +github.com/google-research/android_world. + +
+
+
+
+
+ + ♻ ☆ Erase then Rectify: A Training-Free Parameter Editing Approach for + Cost-Effective Graph Unlearning AAAI2025 + + +
+ Graph unlearning, which aims to eliminate the influence of specific nodes, +edges, or attributes from a trained Graph Neural Network (GNN), is essential in +applications where privacy, bias, or data obsolescence is a concern. However, +existing graph unlearning techniques often necessitate additional training on +the remaining data, leading to significant computational costs, particularly +with large-scale graphs. To address these challenges, we propose a two-stage +training-free approach, Erase then Rectify (ETR), designed for efficient and +scalable graph unlearning while preserving the model utility. Specifically, we +first build a theoretical foundation showing that masking parameters critical +for unlearned samples enables effective unlearning. Building on this insight, +the Erase stage strategically edits model parameters to eliminate the impact of +unlearned samples and their propagated influence on intercorrelated nodes. To +further ensure the GNN's utility, the Rectify stage devises a gradient +approximation method to estimate the model's gradient on the remaining dataset, +which is then used to enhance model performance. Overall, ETR achieves graph +unlearning without additional training or full training data access, +significantly reducing computational overhead and preserving data privacy. +Extensive experiments on seven public datasets demonstrate the consistent +superiority of ETR in model utility, unlearning efficiency, and unlearning +effectiveness, establishing it as a promising solution for real-world graph +unlearning challenges. + +
+
+ comment: Accepted by AAAI2025 +
+
+
+
+
+ + ♻ ☆ ASTM: Autonomous Smart Traffic Management System Using Artificial + Intelligence CNN and LSTM + + +
+ In the modern world, the development of Artificial Intelligence (AI) has +contributed to improvements in various areas, including automation, computer +vision, fraud detection, and more. AI can be leveraged to enhance the +efficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce +traffic congestion rates. This paper presents an Autonomous Smart Traffic +Management (STM) system that uses AI to improve traffic flow rates. The system +employs the YOLO V5 Convolutional Neural Network to detect vehicles in traffic +management images. Additionally, it predicts the number of vehicles for the +next 12 hours using a Recurrent Neural Network with Long Short-Term Memory +(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the +traffic cycle length based on these vehicle predictions, aided by AI. From the +results of the RNN-LSTM model for predicting vehicle numbers over the next 12 +hours, we observe that the model predicts traffic with a Mean Squared Error +(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles. +After simulating the STM system in the CARLA simulation environment, we found +that the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per +minute) is 50\% higher than the rate without STM (around 15 vehicles per +minute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5 +seconds per vehicle) is 70\% lower than without STM (around 12 seconds per +vehicle). These results demonstrate that the STM system using AI can increase +traffic flow by 50\% and reduce vehicle pass delays by 70\%. + +
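As a brief aside on the error metrics quoted above, the snippet below shows how MSE and RMSE are conventionally computed for a 12-step vehicle-count forecast; the counts are invented for illustration and are not the paper's data or results.

```python
# Toy illustration of MSE/RMSE for a 12-hour vehicle-count forecast.
# The counts below are made up for the example, not taken from the paper.
import math

actual    = [32, 35, 40, 38, 45, 50, 48, 44, 41, 39, 36, 33]
predicted = [30, 36, 42, 37, 43, 52, 47, 46, 40, 38, 37, 31]

errors = [a - p for a, p in zip(actual, predicted)]
mse = sum(e * e for e in errors) / len(errors)
rmse = math.sqrt(mse)

print(f"MSE:  {mse:.3f}")
print(f"RMSE: {rmse:.3f} vehicles")
```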
+
+ comment: In process to IEEE Intelligent Vehicle Symposium 2025 +
+
+
+
+
+ + ♻ ☆ Sum of Squares Circuits + + +
+ Designing expressive generative models that support exact and efficient +inference is a core question in probabilistic ML. Probabilistic circuits (PCs) +offer a framework where this tractability-vs-expressiveness trade-off can be +analyzed theoretically. Recently, squared PCs encoding subtractive mixtures via +negative parameters have emerged as tractable models that can be exponentially +more expressive than monotonic PCs, i.e., PCs with positive parameters only. In +this paper, we provide a more precise theoretical characterization of the +expressiveness relationships among these models. First, we prove that squared +PCs can be less expressive than monotonic ones. Second, we formalize a novel +class of PCs -- sum of squares PCs -- that can be exponentially more expressive +than both squared and monotonic PCs. Around sum of squares PCs, we build an +expressiveness hierarchy that allows us to precisely unify and separate +different tractable model classes such as Born Machines and PSD models, and +other recently introduced tractable probabilistic models by using complex +parameters. Finally, we empirically show the effectiveness of sum of squares +circuits in performing distribution estimation. + +
+
+
+
+
+ + ♻ ☆ How to Re-enable PDE Loss for Physical Systems Modeling Under Partial + Observation AAAI2025 + + +
+ In science and engineering, machine learning techniques are increasingly +successful in physical systems modeling (predicting future states of physical +systems). Effectively integrating PDE loss as a constraint of system transition +can improve the model's prediction by overcoming generalization issues due to +data scarcity, especially when data acquisition is costly. However, in many +real-world scenarios, due to sensor limitations, the data we can obtain is +often only partial observation, making the calculation of PDE loss seem to be +infeasible, as the PDE loss heavily relies on high-resolution states. We +carefully study this problem and propose a novel framework named Re-enable PDE +Loss under Partial Observation (RPLPO). The key idea is that although enabling +PDE loss to constrain system transition solely is infeasible, we can re-enable +PDE loss by reconstructing the learnable high-resolution state and constraining +system transition simultaneously. Specifically, RPLPO combines an encoding +module for reconstructing learnable high-resolution states with a transition +module for predicting future states. The two modules are jointly trained by +data and PDE loss. We conduct experiments in various physical systems to +demonstrate that RPLPO has significant improvement in generalization, even when +observation is sparse, irregular, noisy, and PDE is inaccurate. + +
+
+ comment: Accepted by AAAI2025 +
+
+
+
+
+ + ♻ ☆ Toward Falsifying Causal Graphs Using a Permutation-Based Test AAAI 2025 + + +
+ Understanding causal relationships among the variables of a system is +paramount to explain and control its behavior. For many real-world systems, +however, the true causal graph is not readily available and one must resort to +predictions made by algorithms or domain experts. Therefore, metrics that +quantitatively assess the goodness of a causal graph provide helpful checks +before using it in downstream tasks. Existing metrics provide an +$\textit{absolute}$ number of inconsistencies between the graph and the +observed data, and without a baseline, practitioners are left to answer the +hard question of how many such inconsistencies are acceptable or expected. +Here, we propose a novel consistency metric by constructing a baseline through +node permutations. By comparing the number of inconsistencies with those on the +baseline, we derive an interpretable metric that captures whether the graph is +significantly better than random. Evaluating on both simulated and real data +sets from various domains, including biology and cloud monitoring, we +demonstrate that the true graph is not falsified by our metric, whereas the +wrong graphs given by a hypothetical user are likely to be falsified. + +
+
+ comment: Camera-ready version for AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Discovering Continuous-Time Memory-Based Symbolic Policies using Genetic + Programming + + +
+ Artificial intelligence techniques are increasingly being applied to solve +control problems, but often rely on black-box methods without transparent +output generation. To improve the interpretability and transparency in control +systems, models can be defined as white-box symbolic policies described by +mathematical expressions. For better performance in partially observable and +volatile environments, the symbolic policies are extended with memory +represented by continuous-time latent variables, governed by differential +equations. Genetic programming is used for optimisation, resulting in +interpretable policies consisting of symbolic expressions. Our results show +that symbolic policies with memory compare with black-box policies on a variety +of control tasks. Furthermore, the benefit of the memory in symbolic policies +is demonstrated on experiments where memory-less policies fall short. Overall, +we present a method for evolving high-performing symbolic policies that offer +interpretability and transparency, which lacks in black-box models. + +
+
+ comment: 21 pages including references and appendix, 5 figures, 1 algorithm, 5 + tables +
+
+
+
+
+ + ♻ ☆ Shape error prediction in 5-axis machining using graph neural networks + + +
+ This paper presents an innovative method for predicting shape errors in +5-axis machining using graph neural networks. The graph structure is defined +with nodes representing workpiece surface points and edges denoting the +neighboring relationships. The dataset encompasses data from a material removal +simulation, process data, and post-machining quality information. Experimental +results show that the presented approach can generalize the shape error +prediction for the investigated workpiece geometry. Moreover, by modelling +spatial and temporal connections within the workpiece, the approach handles a +low number of labels compared to non-graphical methods such as Support Vector +Machines. + +
+
+
+
+
+ + ♻ ☆ TRAIL: Trust-Aware Client Scheduling for Semi-Decentralized Federated + Learning + + +
+ Due to the sensitivity of data, Federated Learning (FL) is employed to enable +distributed machine learning while safeguarding data privacy and accommodating +the requirements of various devices. However, in the context of +semi-decentralized FL, clients' communication and training states are dynamic. +This variability arises from local training fluctuations, heterogeneous data +distributions, and intermittent client participation. Most existing studies +primarily focus on stable client states, neglecting the dynamic challenges +inherent in real-world scenarios. To tackle this issue, we propose a +TRust-Aware clIent scheduLing mechanism called TRAIL, which assesses client +states and contributions, enhancing model training efficiency through selective +client participation. We focus on a semi-decentralized FL framework where edge +servers and clients train a shared global model using unreliable intra-cluster +model aggregation and inter-cluster model consensus. First, we propose an +adaptive hidden semi-Markov model to estimate clients' communication states and +contributions. Next, we address a client-server association optimization +problem to minimize global training loss. Using convergence analysis, we +propose a greedy client scheduling algorithm. Finally, our experiments +conducted on real-world datasets demonstrate that TRAIL outperforms +state-of-the-art baselines, achieving an improvement of 8.7% in test accuracy +and a reduction of 15.3% in training loss. + +
+
+
+
+
+ + ♻ ☆ Accelerating Diffusion Transformers with Token-wise Feature Caching + + +
+ Diffusion transformers have shown significant effectiveness in both image and +video synthesis at the expense of huge computation costs. To address this +problem, feature caching methods have been introduced to accelerate diffusion +transformers by caching the features in previous timesteps and reusing them in +the following timesteps. However, previous caching methods ignore that +different tokens exhibit different sensitivities to feature caching, and +feature caching on some tokens may lead to 10$\times$ more destruction to the +overall generation quality compared with other tokens. In this paper, we +introduce token-wise feature caching, allowing us to adaptively select the most +suitable tokens for caching, and further enable us to apply different caching +ratios to neural layers in different types and depths. Extensive experiments on +PixArt-$\alpha$, OpenSora, and DiT demonstrate our effectiveness in both image +and video generation with no requirements for training. For instance, +2.36$\times$ and 1.93$\times$ acceleration are achieved on OpenSora and +PixArt-$\alpha$ with almost no drop in generation quality. + +
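A framework-free sketch of the general per-token caching idea is given below. It is not the ToCa implementation: it scores each token by how far its state has drifted since its feature was cached (a crude stand-in for the sensitivity the abstract refers to), recomputes only the most-drifted fraction of tokens, and reuses cached features for the rest.

```python
# Minimal sketch of per-token feature caching; illustrative only.
import random

def expensive_feature(state):
    # Placeholder for an expensive per-token computation (e.g. an attention block).
    return [x * 0.9 for x in state]

def cached_step(states, cache, recompute_ratio=0.3):
    """Recompute features only for the tokens that drifted most since the cache was filled."""
    if cache is None:
        feats = [expensive_feature(s) for s in states]
        return feats, {"states": [s[:] for s in states], "feats": feats}
    drift = [sum(abs(a - b) for a, b in zip(s, c)) for s, c in zip(states, cache["states"])]
    k = max(1, int(len(states) * recompute_ratio))
    hot = set(sorted(range(len(states)), key=lambda i: drift[i], reverse=True)[:k])
    feats = []
    for i, s in enumerate(states):
        if i in hot:
            f = expensive_feature(s)
            cache["states"][i], cache["feats"][i] = s[:], f
        else:
            f = cache["feats"][i]          # reuse the cached feature for this token
        feats.append(f)
    return feats, cache

random.seed(0)
states = [[random.random() for _ in range(4)] for _ in range(8)]
cache = None
for t in range(4):
    feats, cache = cached_step(states, cache)
    # simulate the denoising update drifting each token state a little
    states = [[x + random.uniform(-0.1, 0.1) for x in s] for s in states]
print("last-step feature norms:", [round(sum(map(abs, f)), 3) for f in feats])
```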
+
+ comment: In this version, we achieved a nearly lossless acceleration of 1.51 + times for ToCa on FLUX in the appendix +
+
+
+
+
+ + ♻ ☆ Probability Distribution Learning and Its Application in Deep Learning + + +
+ This paper introduces a novel theoretical learning framework, termed +probability distribution learning (PD learning). Departing from the traditional +statistical learning framework, PD learning focuses on learning the underlying +probability distribution, which is modeled as a random variable within the +probability simplex. In this framework, the optimization objective is the +learning error, which quantifies the posterior expected discrepancy between the +model's predicted distribution and the underlying true distribution, given +available sample data and prior knowledge. To optimize the learning error, this +paper proposes the necessary conditions for loss functions, models, and +optimization algorithms, ensuring that these conditions are met in real-world +machine learning scenarios. Based on these conditions, the non-convex +optimization mechanism corresponding to model training can be theoretically +resolved. Moreover, this paper provides model-dependent and model-independent +bounds on learning error, offering new insights into the model's fitting and +generalization capabilities. Furthermore, the paper applies the PD learning +framework to elucidate the mechanisms by which various techniques, including +random parameter initialization, over-parameterization, and dropout, influence +deep model training. Finally, the paper substantiates the key conclusions of +the proposed framework through experimental results. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2105.04026 by other + authors +
+
+
+
+
+ + ♻ ☆ Score and Distribution Matching Policy: Advanced Accelerated Visuomotor + Policies via Matched Distillation + + +
+ Visual-motor policy learning has advanced with architectures like +diffusion-based policies, known for modeling complex robotic trajectories. +However, their prolonged inference times hinder high-frequency control tasks +requiring real-time feedback. While consistency distillation (CD) accelerates +inference, it introduces errors that compromise action quality. To address +these limitations, we propose the Score and Distribution Matching Policy (SDM +Policy), which transforms diffusion-based policies into single-step generators +through a two-stage optimization process: score matching ensures alignment with +true action distributions, and distribution matching minimizes KL divergence +for consistency. A dual-teacher mechanism integrates a frozen teacher for +stability and an unfrozen teacher for adversarial training, enhancing +robustness and alignment with target distributions. Evaluated on a 57-task +simulation benchmark, SDM Policy achieves a 6x inference speedup while having +state-of-the-art action quality, providing an efficient and reliable framework +for high-frequency robotic tasks. + +
+
+
+
+
+ + ♻ ☆ On the Expressivity of Persistent Homology in Graph Learning + + +
+ Persistent homology, a technique from computational topology, has recently +shown strong empirical performance in the context of graph classification. +Being able to capture long range graph properties via higher-order topological +features, such as cycles of arbitrary length, in combination with multi-scale +topological descriptors, has improved predictive performance for data sets with +prominent topological structures, such as molecules. At the same time, the +theoretical properties of persistent homology have not been formally assessed +in this context. This paper intends to bridge the gap between computational +topology and graph machine learning by providing a brief introduction to +persistent homology in the context of graphs, as well as a theoretical +discussion and empirical analysis of its expressivity for graph learning tasks. + +
+
+ comment: Accepted at the 3rd Learning on Graphs Conference (LoG) 2024 +
+
+
+
+
+ + ♻ ☆ Generalized Encouragement-Based Instrumental Variables for + Counterfactual Regression + + +
+ In causal inference, encouragement designs (EDs) are widely used to analyze +causal effects, when randomized controlled trials (RCTs) are impractical or +compliance to treatment cannot be perfectly enforced. Unlike RCTs, which +directly allocate treatments, EDs randomly assign encouragement policies that +positively motivate individuals to engage in a specific treatment. These random +encouragements act as instrumental variables (IVs), facilitating the +identification of causal effects through leveraging exogenous perturbations in +discrete treatment scenarios. However, real-world applications of encouragement +designs often face challenges such as incomplete randomization, limited +experimental data, and significantly fewer encouragements compared to +treatments, hindering precise causal effect estimation. To address this, this +paper introduces novel theories and algorithms for identifying the Conditional +Average Treatment Effect (CATE) using variations in encouragement. Further, by +leveraging both observational and encouragement data, we propose a generalized +IV estimator, named Encouragement-based Counterfactual Regression (EnCounteR), +to effectively estimate the causal effects. Extensive experiments on both +synthetic and real-world datasets demonstrate the superiority of EnCounteR over +existing methods. + +
+
+
+
+
+ + ♻ ☆ Smoothness Really Matters: A Simple Yet Effective Approach for + Unsupervised Graph Domain Adaptation AAAI2025 + + +
+ Unsupervised Graph Domain Adaptation (UGDA) seeks to bridge distribution +shifts between domains by transferring knowledge from labeled source graphs to +given unlabeled target graphs. Existing UGDA methods primarily focus on +aligning features in the latent space learned by graph neural networks (GNNs) +across domains, often overlooking structural shifts, resulting in limited +effectiveness when addressing structurally complex transfer scenarios. Given +the sensitivity of GNNs to local structural features, even slight discrepancies +between source and target graphs could lead to significant shifts in node +embeddings, thereby reducing the effectiveness of knowledge transfer. To +address this issue, we introduce a novel approach for UGDA called Target-Domain +Structural Smoothing (TDSS). TDSS is a simple and effective method designed to +perform structural smoothing directly on the target graph, thereby mitigating +structural distribution shifts and ensuring the consistency of node +representations. Specifically, by integrating smoothing techniques with +neighborhood sampling, TDSS maintains the structural coherence of the target +graph while mitigating the risk of over-smoothing. Our theoretical analysis +shows that TDSS effectively reduces target risk by improving model smoothness. +Empirical results on three real-world datasets demonstrate that TDSS +outperforms recent state-of-the-art baselines, achieving significant +improvements across six transfer scenarios. The code is available in +https://github.com/cwei01/TDSS. + +
+
+ comment: 11 pages, Accepted by AAAI2025 +
+
+
+
+
+ + ♻ ☆ MetaSymNet: A Tree-like Symbol Network with Adaptive Architecture and + Activation Functions AAAI2025 + + +
+ Mathematical formulas serve as the means of communication between humans and +nature, encapsulating the operational laws governing natural phenomena. The +concise formulation of these laws is a crucial objective in scientific research +and an important challenge for artificial intelligence (AI). While traditional +artificial neural networks (MLP) excel at data fitting, they often yield +uninterpretable black box results that hinder our understanding of the +relationship between variables x and predicted values y. Moreover, the fixed +network architecture in MLP often gives rise to redundancy in both network +structure and parameters. To address these issues, we propose MetaSymNet, a +novel neural network that dynamically adjusts its structure in real-time, +allowing for both expansion and contraction. This adaptive network employs the +PANGU meta function as its activation function, which is a unique type capable +of evolving into various basic functions during training to compose +mathematical formulas tailored to specific needs. We then evolve the neural +network into a concise, interpretable mathematical expression. To evaluate +MetaSymNet's performance, we compare it with four state-of-the-art symbolic +regression algorithms across more than 10 public datasets comprising 222 +formulas. Our experimental results demonstrate that our algorithm outperforms +others consistently regardless of noise presence or absence. Furthermore, we +assess MetaSymNet against MLP and SVM regarding their fitting ability and +extrapolation capability, which are two essential aspects of machine learning +algorithms. The findings reveal that our algorithm excels in both areas. +Finally, we compare the network structural complexity of MetaSymNet with that of an MLP +compressed by iterative pruning. The results show that MetaSymNet's network structure +complexity is clearly lower than that of the MLP under the same goodness of fit. + +
+
+ comment: This work has been accepted by AAAI2025 +
+
+
+
+
+ + ♻ ☆ MAPFAST: A Deep Algorithm Selector for Multi Agent Path Finding using + Shortest Path Embeddings AAMAS-21 + + +
+ Solving the Multi-Agent Path Finding (MAPF) problem optimally is known to be +NP-Hard for both make-span and total arrival time minimization. While many +algorithms have been developed to solve MAPF problems, there is no dominating +optimal MAPF algorithm that works well in all types of problems and no standard +guidelines for when to use which algorithm. In this work, we develop the deep +convolutional network MAPFAST (Multi-Agent Path Finding Algorithm SelecTor), +which takes a MAPF problem instance and attempts to select the fastest +algorithm to use from a portfolio of algorithms. We improve the performance of +our model by including single-agent shortest paths in the instance embedding +given to our model and by utilizing supplemental loss functions in addition to +a classification loss. We evaluate our model on a large and diverse dataset of +MAPF instances, showing that it outperforms all individual algorithms in its +portfolio as well as the state-of-the-art optimal MAPF algorithm selector. We +also provide an analysis of algorithm behavior in our dataset to gain a deeper +understanding of optimal MAPF algorithms' strengths and weaknesses to help +other researchers leverage different heuristics in algorithm designs. + +
+
+ comment: To appear in AAMAS-21 +
+
+
+
+
+ + ♻ ☆ Training Datasets Generation for Machine Learning: Application to Vision + Based Navigation SP + + +
+ Vision Based Navigation consists in utilizing cameras as precision sensors +for GNC after extracting information from images. To enable the adoption of +machine learning for space applications, one of the obstacles is demonstrating +that available training datasets are adequate to validate the algorithms. The +objective of the study is to generate datasets of images and metadata suitable +for training machine learning algorithms. Two use cases were selected and a +robust methodology was developed to validate the datasets including the ground +truth. The first use case is in-orbit rendezvous with a man-made object: a +mockup of satellite ENVISAT. The second use case is a Lunar landing scenario. +Datasets were produced from archival datasets (Chang'e 3), from the laboratory +at DLR TRON facility and at Airbus Robotic laboratory, from SurRender software +high fidelity image simulator using Model Capture and from Generative +Adversarial Networks. The use case definition included the selection of +algorithms as benchmark: an AI-based pose estimation algorithm and a dense +optical flow algorithm were selected. Eventually it is demonstrated that +datasets produced with SurRender and selected laboratory facilities are +adequate to train machine learning algorithms. + +
+
+ comment: 6 pages, 4 figures, preprint of the proceedings of ESA SPAICE + conference 2024 +
+
+
+
+
+ + ♻ ☆ Feature selection in linear SVMs via a hard cardinality constraint: a + scalable SDP decomposition approach + + +
+ In this paper, we study the embedded feature selection problem in linear +Support Vector Machines (SVMs), in which a cardinality constraint is employed, +leading to an interpretable classification model. The problem is NP-hard due to +the presence of the cardinality constraint, even though the original linear SVM +amounts to a problem solvable in polynomial time. To handle the hard problem, +we first introduce two mixed-integer formulations for which novel semidefinite +relaxations are proposed. Exploiting the sparsity pattern of the relaxations, +we decompose the problems and obtain equivalent relaxations in a much smaller +cone, making the conic approaches scalable. To make the best usage of the +decomposed relaxations, we propose heuristics using the information of its +optimal solution. Moreover, an exact procedure is proposed by solving a +sequence of mixed-integer decomposed semidefinite optimization problems. +Numerical results on classical benchmarking datasets are reported, showing the +efficiency and effectiveness of our approach. + +
+
+ comment: Submitted to European Journal of Operational Research. arXiv admin + note: text overlap with arXiv:1808.02435 by other authors +
+
+
+
+
+ + ♻ ☆ Scalable Acceleration for Classification-Based Derivative-Free + Optimization + + +
+ Derivative-free optimization algorithms play an important role in scientific +and engineering design optimization problems, especially when derivative +information is not accessible. In this paper, we study the framework of +sequential classification-based derivative-free optimization algorithms. By +introducing the learning-theoretic concept of the hypothesis-target shattering rate, we +revisit the computational complexity upper bound of SRACOS (Hu, Qian, and Yu +2017). Inspired by the revisited upper bound, we propose an algorithm named +RACE-CARS, which adds a random region-shrinking step compared with SRACOS. We +further establish theorems showing the acceleration by region shrinking. +Experiments on synthetic functions as well as black-box tuning for +language-model-as-a-service empirically demonstrate the efficiency of +RACE-CARS. An ablation experiment on the introduced hyperparameters is also +conducted, revealing the mechanism of RACE-CARS and putting forward empirical +hyperparameter tuning guidance. + +
+
+
+
+
+ + ♻ ☆ Guiding a Diffusion Model with a Bad Version of Itself NeurIPS 2024 + + +
+ The primary axes of interest in image-generating diffusion models are image +quality, the amount of variation in the results, and how well the results align +with a given condition, e.g., a class label or a text prompt. The popular +classifier-free guidance approach uses an unconditional model to guide a +conditional model, leading to simultaneously better prompt alignment and +higher-quality images at the cost of reduced variation. These effects seem +inherently entangled, and thus hard to control. We make the surprising +observation that it is possible to obtain disentangled control over image +quality without compromising the amount of variation by guiding generation +using a smaller, less-trained version of the model itself rather than an +unconditional model. This leads to significant improvements in ImageNet +generation, setting record FIDs of 1.01 for 64x64 and 1.25 for 512x512, using +publicly available networks. Furthermore, the method is also applicable to +unconditional diffusion models, drastically improving their quality. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ TrimLLM: Progressive Layer Dropping for Domain-Specific LLMs + + +
+ Specializing large language models (LLMs) for local deployment in +domain-specific use cases is necessary for strong performance while meeting +latency and privacy constraints. However, conventional task-specific adaptation +approaches do not show simultaneous memory saving and inference speedup at +deployment time. Practical compression techniques like quantization and pruning +require dedicated hardware or kernel support to achieve measured inference +speedup. We develop TrimLLM based on the layer-wise specialization phenomenon +we empirically observed and verified on contemporary LLMs. TrimLLM reduces the +depth of LLMs via progressive layer dropping. We show it retains LLMs' capacity +in specific domains and achieves inference speedup irrespective of hardware and +deep learning frameworks. We evaluated TrimLLM on LLMs of various sizes for +inference; models adapted on medical, legal, and financial datasets all +demonstrate $2.1-5.7\times$ inference speedup on consumer GPUs and up to +$3.1\times$ speedup on A100 when compared to state-of-the-art model compression +algorithms, with no loss in accuracy at 50$\sim$60\% model compression ratio. + +
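The flavour of progressive layer dropping can be sketched with a toy example. The snippet below is purely illustrative: the "model" is a list of layers with synthetic importance scores, and the selection rule is a naive greedy one rather than TrimLLM's actual criterion or training procedure.

```python
# Toy sketch of progressive layer dropping: repeatedly remove the layer whose
# removal hurts a (here, synthetic) validation score the least, until a depth budget is met.
import random

random.seed(0)
layer_importance = [random.random() for _ in range(12)]   # stand-in for true layer utility

def validate(kept_layers):
    # Synthetic proxy for domain validation accuracy: sum of kept layers' importance.
    return sum(layer_importance[i] for i in kept_layers)

kept = list(range(len(layer_importance)))
target_depth = 8
while len(kept) > target_depth:
    # Try dropping each remaining layer and keep the drop that loses the least score.
    candidate = min(kept, key=lambda i: validate(kept) - validate([j for j in kept if j != i]))
    kept.remove(candidate)
    print(f"dropped layer {candidate:2d}, remaining depth {len(kept)}, score {validate(kept):.3f}")
```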
+
+
+
+
+ + ♻ ☆ Learning Deep Dissipative Dynamics AAAI 2025 + + +
+ This study addresses the challenge of strictly guaranteeing ``dissipativity'' of a dynamical +system represented by neural networks learned from given time-series data. +Dissipativity is a crucial indicator for dynamical systems that generalizes +stability and input-output stability, known to be valid across various systems +including robotics, biological systems, and molecular dynamics. By analytically +proving the general solution to the nonlinear Kalman-Yakubovich-Popov (KYP) +lemma, which is the necessary and sufficient condition for dissipativity, we +propose a differentiable projection that transforms any dynamics represented by +neural networks into dissipative ones and a learning method for the transformed +dynamics. Utilizing the generality of dissipativity, our method strictly +guarantees stability, input-output stability, and energy conservation of trained +dynamical systems. Finally, we demonstrate the robustness of our method against +out-of-domain input through applications to robotic arms and fluid dynamics. +Code is available at https://github.com/kojima-r/DeepDissipativeModel + +
+
+ comment: AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Grimm: A Plug-and-Play Perturbation Rectifier for Graph Neural Networks + Defending against Poisoning Attacks + + +
+ Recent studies have revealed the vulnerability of graph neural networks +(GNNs) to adversarial poisoning attacks on node classification tasks. Current +defensive methods require substituting the original GNNs with defense models, +regardless of the original's type. This approach, while targeting adversarial +robustness, compromises the enhancements developed in prior research to boost +GNNs' practical performance. Here we introduce Grimm, the first plug-and-play +defense model. With just a minimal interface requirement for extracting +features from any layer of the protected GNNs, Grimm is thus enabled to +seamlessly rectify perturbations. Specifically, we utilize the feature +trajectories (FTs) generated by GNNs, as they evolve through epochs, to reflect +the training status of the networks. We then theoretically prove that the FTs +of victim nodes will inevitably exhibit discriminable anomalies. Consequently, +inspired by the natural parallelism between the biological nervous and immune +systems, we construct Grimm, a comprehensive artificial immune system for GNNs. +Grimm not only detects abnormal FTs and rectifies adversarial edges during +training but also operates efficiently in parallel, thereby mirroring the +concurrent functionalities of its biological counterparts. We experimentally +confirm that Grimm offers four empirically validated advantages: 1) +Harmlessness, as it does not actively interfere with GNN training; 2) +Parallelism, ensuring monitoring, detection, and rectification functions +operate independently of the GNN training process; 3) Generalizability, +demonstrating compatibility with mainstream GNNs such as GCN, GAT, and +GraphSAGE; and 4) Transferability, as the detectors for abnormal FTs can be +efficiently transferred across different systems for one-step rectification. + +
+
+ comment: 19 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Holdouts set for safe predictive model updating + + +
+ Predictive risk scores for adverse outcomes are increasingly crucial in +guiding health interventions. Such scores may need to be periodically updated +due to change in the distributions they model. However, directly updating risk +scores used to guide intervention can lead to biased risk estimates. To address +this, we propose updating using a `holdout set' - a subset of the population +that does not receive interventions guided by the risk score. Balancing the +holdout set size is essential to ensure good performance of the updated risk +score whilst minimising the number of held out samples. We prove that this +approach reduces adverse outcome frequency to an asymptotically optimal level +and argue that often there is no competitive alternative. We describe +conditions under which an optimal holdout size (OHS) can be readily identified, +and introduce parametric and semi-parametric algorithms for OHS estimation. We +apply our methods to the ASPRE risk score for pre-eclampsia to recommend a plan +for updating it in the presence of change in the underlying data distribution. +We show that, in order to minimise the number of pre-eclampsia cases over time, +this is best achieved using a holdout set of around 10,000 individuals. + +
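A generic sketch of the holdout-set idea follows: a random subset of the population is exempted from score-guided intervention, and the refreshed risk score is recalibrated only on that subset, whose outcomes are not distorted by the interventions the score itself triggered. The 10% holdout fraction, logistic score, and intervention effect below are arbitrary illustrative choices, not the paper's optimal-holdout-size analysis.

```python
# Generic sketch of updating a risk score using a holdout set; illustrative only.
import math
import random

random.seed(1)
population = [{"x": random.gauss(0, 1)} for _ in range(20_000)]
holdout = set(random.sample(range(len(population)), 2_000))   # illustrative 10% holdout

def old_score(x):
    return 1.0 / (1.0 + math.exp(-x))

for i, person in enumerate(population):
    flagged = old_score(person["x"]) > 0.7
    treated = flagged and i not in holdout        # holdout individuals never receive the intervention
    risk = old_score(person["x"]) * (0.5 if treated else 1.0)   # toy intervention effect
    person["outcome"] = random.random() < risk

# The refreshed score is recalibrated only on the holdout set, whose outcomes
# were not altered by score-guided interventions.
held = [population[i] for i in holdout]
event_rate = sum(p["outcome"] for p in held) / len(held)
print(f"holdout size: {len(held)}, event rate used for updating: {event_rate:.3f}")
```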
+
+ comment: Manuscript includes supplementary materials and figures +
+
+
+
+
+ + ♻ ☆ RAZOR: Sharpening Knowledge by Cutting Bias with Unsupervised Text + Rewriting AAAI'25 + + +
+ Despite the widespread use of LLMs due to their superior performance in +various tasks, their high computational costs often lead potential users to opt +for the pretraining-finetuning pipeline. However, biases prevalent in manually +constructed datasets can introduce spurious correlations between tokens and +labels, creating so-called shortcuts and hindering the generalizability of +fine-tuned models. Existing debiasing methods often rely on prior knowledge of +specific dataset biases, which is challenging to acquire a priori. We propose +RAZOR (Rewriting And Zero-bias Optimization Refinement), a novel, unsupervised, +and data-focused debiasing approach based on text rewriting for shortcut +mitigation. RAZOR leverages LLMs to iteratively rewrite potentially biased text +segments by replacing them with heuristically selected alternatives in a +shortcut space defined by token statistics and positional information. This +process aims to align surface-level text features more closely with diverse +label distributions, thereby promoting the learning of genuine linguistic +patterns. Compared with unsupervised SoTA models, RAZOR improves by 3.5% on the +FEVER and 6.5% on MNLI and SNLI datasets according to the F1 score. +Additionally, RAZOR effectively mitigates specific known biases, reducing +bias-related terms by x2 without requiring prior bias information, a result +that is on par with SoTA models that leverage prior information. Our work +prioritizes data manipulation over architectural modifications, emphasizing the +pivotal role of data quality in enhancing model performance and fairness. This +research contributes to developing more robust evaluation benchmarks for +debiasing methods by incorporating metrics for bias reduction and overall model +efficacy. + +
+
+ comment: Shuo and Bardh contributed equally. Accepted to AAAI'25, Paper #17117 +
+
+
+
+
+ + ♻ ☆ DG-Mamba: Robust and Efficient Dynamic Graph Structure Learning with + Selective State Space Models AAAI + + +
+ Dynamic graphs exhibit intertwined spatio-temporal evolutionary patterns, +widely existing in the real world. Nevertheless, the structure incompleteness, +noise, and redundancy result in poor robustness for Dynamic Graph Neural +Networks (DGNNs). Dynamic Graph Structure Learning (DGSL) offers a promising +way to optimize graph structures. However, aside from encountering unacceptable +quadratic complexity, it overly relies on heuristic priors, making it hard to +discover underlying predictive patterns. How to efficiently refine the dynamic +structures, capture intrinsic dependencies, and learn robust representations, +remains under-explored. In this work, we propose the novel DG-Mamba, a robust +and efficient Dynamic Graph structure learning framework with the Selective +State Space Models (Mamba). To accelerate the spatio-temporal structure +learning, we propose a kernelized dynamic message-passing operator that reduces +the quadratic time complexity to linear. To capture global intrinsic dynamics, +we establish the dynamic graph as a self-contained system with State Space +Model. By discretizing the system states with the cross-snapshot graph +adjacency, we enable the long-distance dependencies capturing with the +selective snapshot scan. To endow learned dynamic structures more expressive +with informativeness, we propose the self-supervised Principle of Relevant +Information for DGSL to regularize the most relevant yet least redundant +information, enhancing global robustness. Extensive experiments demonstrate the +superiority of the robustness and efficiency of our DG-Mamba compared with the +state-of-the-art baselines against adversarial attacks. + +
+
+ comment: Accepted by the Main Technical Track of the 39th Annual AAAI + Conference on Artificial Intelligence (AAAI-2025) +
+
+
+
+
+ + ♻ ☆ T-JEPA: Augmentation-Free Self-Supervised Learning for Tabular Data + + +
+ Self-supervision is often used for pre-training to foster performance on a +downstream task by constructing meaningful representations of samples. +Self-supervised learning (SSL) generally involves generating different views of +the same sample and thus requires data augmentations that are challenging to +construct for tabular data. This constitutes one of the main challenges of +self-supervision for structured data. In the present work, we propose a novel +augmentation-free SSL method for tabular data. Our approach, T-JEPA, relies on +a Joint Embedding Predictive Architecture (JEPA) and is akin to mask +reconstruction in the latent space. It involves predicting the latent +representation of one subset of features from the latent representation of a +different subset within the same sample, thereby learning rich representations +without augmentations. We use our method as a pre-training technique and train +several deep classifiers on the obtained representation. Our experimental +results demonstrate a substantial improvement in both classification and +regression tasks, outperforming models trained directly on samples in their +original data space. Moreover, T-JEPA enables some methods to consistently +outperform or match the performance of traditional methods like Gradient +Boosted Decision Trees. To understand why, we extensively characterize the +obtained representations and show that T-JEPA effectively identifies relevant +features for downstream tasks without access to the labels. Additionally, we +introduce regularization tokens, a novel regularization method critical for the +training of JEPA-based models on structured data. + +
+
+
+
+
+ + ♻ ☆ Leveraging Group Classification with Descending Soft Labeling for Deep + Imbalanced Regression + + +
+ Deep imbalanced regression (DIR), where the target values have a highly +skewed distribution and are also continuous, is an intriguing yet +under-explored problem in machine learning. + While recent works have already shown that incorporating various +classification-based regularizers can produce enhanced outcomes, the role of +classification remains elusive in DIR. + Moreover, such regularizers (e.g., contrastive penalties) merely focus on +learning discriminative features of data, which inevitably results in ignorance +of either continuity or similarity across the data. + To address these issues, we first bridge the connection between the +objectives of DIR and classification from a Bayesian perspective. + Consequently, this motivates us to decompose the objective of DIR into a +combination of classification and regression tasks, which naturally guides us +toward a divide-and-conquer manner to solve the DIR problem. + Specifically, by aggregating the data at nearby labels into the same groups, +we introduce an ordinal group-aware contrastive learning loss along with a +multi-experts regressor to tackle the different groups of data thereby +maintaining the data continuity. + Meanwhile, considering the similarity between the groups, we also propose a +symmetric descending soft labeling strategy to exploit the intrinsic similarity +across the data, which allows classification to facilitate regression more +effectively. + Extensive experiments on real-world datasets also validate the effectiveness +of our method. + +
+
+
+
+
+ + ♻ ☆ Langevin dynamics for high-dimensional optimization: the case of + multi-spiked tensor PCA + + +
+ We study nonconvex optimization in high dimensions through Langevin dynamics, +focusing on the multi-spiked tensor PCA problem. This tensor estimation problem +involves recovering $r$ hidden signal vectors (spikes) from noisy Gaussian +tensor observations using maximum likelihood estimation. We study the number of +samples required for Langevin dynamics to efficiently recover the spikes and +determine the necessary separation condition on the signal-to-noise ratios +(SNRs) for exact recovery, distinguishing the cases $p \ge 3$ and $p=2$, where +$p$ denotes the order of the tensor. In particular, we show that the sample +complexity required for recovering the spike associated with the largest SNR +matches the well-known algorithmic threshold for the single-spike case, while +this threshold degrades when recovering all $r$ spikes. As a key step, we +provide a detailed characterization of the trajectory and interactions of +low-dimensional projections that capture the high-dimensional dynamics. + +
+
+ comment: 65 pages +
+
+
+
+
+ + ♻ ☆ When Every Token Counts: Optimal Segmentation for Low-Resource Language + Models COLING 2025 + + +
+ Traditional greedy tokenization methods have been a critical step in Natural +Language Processing (NLP), influencing how text is converted into tokens and +directly impacting model performance. While subword tokenizers like Byte-Pair +Encoding (BPE) are widely used, questions remain about their optimality across +model scales and languages. In this work, we demonstrate through extensive +experiments that an optimal BPE configuration significantly reduces token count +compared to greedy segmentation, yielding improvements in token-saving +percentages and performance benefits, particularly for smaller models. We +evaluate tokenization performance across various intrinsic and extrinsic tasks, +including generation and classification. Our findings suggest that +compression-optimized tokenization strategies could provide substantial +advantages for multilingual and low-resource language applications, +highlighting a promising direction for further research and inclusive NLP. + +
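The difference between greedy and token-count-optimal segmentation can be seen on a tiny hand-made example. The sketch below is not the paper's BPE pipeline; it simply contrasts greedy longest-match tokenisation with a dynamic-programming segmentation that minimises the number of tokens over a toy vocabulary.

```python
# Greedy longest-match vs. token-count-optimal segmentation over a toy vocabulary.
# Vocabulary and word are invented for illustration only.
vocab = {"relea", "re", "learning", "r", "e", "l", "a", "n", "i", "g"}

def greedy(word):
    out, i = [], 0
    while i < len(word):
        for j in range(len(word), i, -1):            # longest match first
            if word[i:j] in vocab:
                out.append(word[i:j]); i = j; break
        else:
            out.append(word[i]); i += 1              # unknown-character fallback
    return out

def optimal(word):
    best = {0: []}                                   # fewest-token segmentation of each prefix
    for j in range(1, len(word) + 1):
        for i in range(j):
            if i in best and word[i:j] in vocab:
                if j not in best or len(best[i]) + 1 < len(best[j]):
                    best[j] = best[i] + [word[i:j]]
    return best.get(len(word))

word = "relearning"
g, o = greedy(word), optimal(word)
print("greedy :", g, f"({len(g)} tokens)")
print("optimal:", o, f"({len(o)} tokens)")
```

Here greedy commits to "relea" and ends up with six tokens, while the dynamic program finds the two-token split "re" + "learning".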
+
+ comment: LoResLM @ COLING 2025 +
+
+
+
+
+ + ♻ ☆ Deep Learning-based Non-Intrusive Multi-Objective Speech Assessment + Model with Cross-Domain Features + + +
+ In this study, we propose a cross-domain multi-objective speech assessment +model called MOSA-Net, which can estimate multiple speech assessment metrics +simultaneously. Experimental results show that MOSA-Net can improve the linear +correlation coefficient (LCC) by 0.026 (0.990 vs 0.964 in seen noise +environments) and 0.012 (0.969 vs 0.957 in unseen noise environments) in +perceptual evaluation of speech quality (PESQ) prediction, compared to +Quality-Net, an existing single-task model for PESQ prediction, and improve LCC +by 0.021 (0.985 vs 0.964 in seen noise environments) and 0.047 (0.836 vs 0.789 +in unseen noise environments) in short-time objective intelligibility (STOI) +prediction, compared to STOI-Net (based on CRNN), an existing single-task model +for STOI prediction. Moreover, MOSA-Net, originally trained to assess objective +scores, can be used as a pre-trained model to be effectively adapted to an +assessment model for predicting subjective quality and intelligibility scores +with a limited amount of training data. Experimental results show that MOSA-Net +can improve LCC by 0.018 (0.805 vs 0.787) in mean opinion score (MOS) +prediction, compared to MOS-SSL, a strong single-task model for MOS prediction. +In light of the confirmed prediction capability, we further adopt the latent +representations of MOSA-Net to guide the speech enhancement (SE) process and +derive a quality-intelligibility (QI)-aware SE (QIA-SE) approach accordingly. +Experimental results show that QIA-SE provides superior enhancement performance +compared with the baseline SE system in terms of objective evaluation metrics +and qualitative evaluation test. For example, QIA-SE can improve PESQ by 0.301 +(2.953 vs 2.652 in seen noise environments) and 0.18 (2.658 vs 2.478 in unseen +noise environments) over a CNN-based baseline SE model. + +
+
+ comment: Accepted by IEEE/ACM Transactions on Audio, Speech, and Language + Processing (TASLP), vol. 31, pp. 54-70, 2023 +
+
+
+
+
+ + ♻ ☆ Image Classification with Rotation-Invariant Variational Quantum + Circuits + + +
+ Variational quantum algorithms are gaining attention as an early application +of Noisy Intermediate-Scale Quantum (NISQ) devices. One of the main problems of +variational methods lies in the phenomenon of Barren Plateaus, present in the +optimization of variational parameters. Adding geometric inductive bias to the +quantum models has been proposed as a potential solution to mitigate this +problem, leading to a new field called Geometric Quantum Machine Learning. In +this work, an equivariant architecture for variational quantum classifiers is +introduced to create a label-invariant model for image classification with +$C_4$ rotational label symmetry. The equivariant circuit is benchmarked against +two different architectures, and it is experimentally observed that the +geometric approach boosts the model's performance. Finally, a classical +equivariant convolution operation is proposed to extend the quantum model for +the processing of larger images, employing the resources available in NISQ +devices. + +
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Cherry on the Cake: Fairness is NOT an Optimization Problem + + +
+ In Fair AI literature, the practice of maliciously creating unfair models +that nevertheless satisfy fairness constraints is known as "cherry-picking". A +cherry-picking model is a model that makes mistakes on purpose, selecting bad +individuals from a minority class instead of better candidates from the same +minority. The model literally cherry-picks whom to select to superficially meet +the fairness constraints while making minimal changes to the unfair model. This +practice has been described as "blatantly unfair" and has a negative impact on +already marginalized communities, undermining the intended purpose of fairness +measures specifically designed to protect these communities. A common +assumption is that cherry-picking arises solely from malicious intent and that +models designed only to optimize fairness metrics would avoid this behavior. We +show that this is not the case: models optimized to minimize fairness metrics +while maximizing performance are often forced to cherry-pick to some degree. In +other words, cherry-picking might be an inevitable outcome of the optimization +process itself. To demonstrate this, we use tools from fair cake-cutting, a +mathematical subfield that studies the problem of fairly dividing a resource, +referred to as the "cake," among a number of participants. This concept is +connected to supervised multi-label classification: any dataset can be thought +of as a cake that needs to be distributed among different labels, and the model +is the function that divides the cake. We adapt these classical results for +machine learning and demonstrate how this connection can be prolifically used +for fairness and classification in general. + +
+
+
+
+
+ + ♻ ☆ Iterative Methods for Full-Scale Gaussian Process Approximations for + Large Spatial Data + + +
+ Gaussian processes are flexible probabilistic regression models which are +widely used in statistics and machine learning. However, a drawback is their +limited scalability to large data sets. To alleviate this, we consider +full-scale approximations (FSAs) that combine predictive process methods and +covariance tapering, thus approximating both global and local structures. We +show how iterative methods can be used to reduce the computational costs for +calculating likelihoods, gradients, and predictive distributions with FSAs. We +introduce a novel preconditioner and show that it accelerates the conjugate +gradient method's convergence speed and mitigates its sensitivity with respect +to the FSA parameters and the eigenvalue structure of the original covariance +matrix, and we demonstrate empirically that it outperforms a state-of-the-art +pivoted Cholesky preconditioner. Further, we present a novel, accurate, and +fast way to calculate predictive variances relying on stochastic estimations +and iterative methods. In both simulated and real-world data experiments, we +find that our proposed methodology achieves the same accuracy as Cholesky-based +computations with a substantial reduction in computational time. Finally, we +also compare different approaches for determining inducing points in predictive +process and FSA models. All methods are implemented in a free C++ software +library with high-level Python and R packages. + +
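For readers unfamiliar with the iterative machinery involved, the snippet below is a generic preconditioned conjugate gradient solve with a simple Jacobi (diagonal) preconditioner in NumPy. It only illustrates the kind of linear solves the paper accelerates; the FSA-specific preconditioner and the stochastic variance estimators proposed in the paper are more involved.

```python
# Generic preconditioned conjugate gradient (Jacobi preconditioner); illustrative only.
import numpy as np

def pcg(A, b, tol=1e-8, max_iter=500):
    M_inv = 1.0 / np.diag(A)                 # Jacobi preconditioner
    x = np.zeros_like(b)
    r = b - A @ x
    z = M_inv * r
    p = z.copy()
    rz = r @ z
    for k in range(max_iter):
        Ap = A @ p
        alpha = rz / (p @ Ap)
        x += alpha * p
        r -= alpha * Ap
        if np.linalg.norm(r) < tol:
            return x, k + 1
        z = M_inv * r
        rz_new = r @ z
        p = z + (rz_new / rz) * p
        rz = rz_new
    return x, max_iter

rng = np.random.default_rng(0)
X = rng.standard_normal((300, 300))
A = X @ X.T + 300 * np.eye(300)              # symmetric positive definite test matrix
b = rng.standard_normal(300)
x, iters = pcg(A, b)
print("iterations:", iters, "residual:", np.linalg.norm(A @ x - b))
```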
+
+
+
+
+ + ♻ ☆ Analyzing Consumer IoT Traffic from Security and Privacy Perspectives: a + Comprehensive Survey + + +
+ The Consumer Internet of Things (CIoT), a notable segment within the IoT +domain, involves the integration of IoT technology into consumer electronics +and devices, such as smart homes and smart wearables. Compared to traditional +IoT fields, CIoT differs notably in target users, product types, and design +approaches. While offering convenience to users, it also raises new security +and privacy concerns. Network traffic analysis, a widely used technique in the +security community, has been extensively applied to investigate these concerns +about CIoT. Compared to network traffic analysis in other fields such as mobile +apps and websites, CIoT presents unique characteristics, introducing new +challenges and research opportunities. Researchers have made significant +contributions in this area. To aid researchers in understanding the application +of traffic analysis tools for studying CIoT security and privacy risks, this +survey reviews 303 publications on traffic analysis within the CIoT security +and privacy domain from January 2018 to June 2024, focusing on three research +questions. Our work: 1) outlines the CIoT traffic analysis process and +highlights its differences from general network traffic analysis. 2) summarizes +and classifies existing research into four categories according to its +application objectives: device fingerprinting, user activity inference, +malicious traffic detection, and measurement. 3) explores emerging challenges +and potential future research directions based on each step of the CIoT traffic +analysis process. This will provide new insights to the community and guide the +industry towards safer product designs. + +
+
+
+
+
+ + ♻ ☆ Mixed Semi-Supervised Generalized-Linear-Regression with Applications to + Deep-Learning and Interpolators + + +
+ We present a methodology for using unlabeled data to design semi-supervised
+learning (SSL) methods that improve the prediction performance of supervised
+learning for regression tasks. The main idea is to design different mechanisms
+for integrating the unlabeled data, and to include in each of them a mixing
+parameter $\alpha$ controlling the weight given to the unlabeled data. Focusing
+on the Generalized Linear Model (GLM) and linear interpolator classes of
+models, we analyze the characteristics of different mixing mechanisms and prove
+that, in all cases, it is beneficial to integrate the unlabeled data with some
+nonzero mixing ratio $\alpha>0$ in terms of predictive performance. Moreover,
+we provide a rigorous framework to estimate the mixing ratio $\alpha^*$ at
+which mixed SSL delivers the best predictive performance, using the labeled and
+unlabeled data on hand.
+ The effectiveness of our methodology in delivering substantial improvements
+over standard supervised models, in a variety of settings, is demonstrated
+empirically through extensive simulations that support the theoretical
+analysis. We also demonstrate the applicability of our methodology (with some
+intuitive modifications) to improving more complex models, such as deep neural
+networks, in real-world regression tasks.
+
+
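+ A minimal sketch of one possible mixing mechanism (illustrative only; the
+mechanisms analyzed in the paper may differ): blend the labeled and unlabeled
+second-moment matrices with weight $\alpha$ before solving the normal
+equations.
+
+import numpy as np
+
+def mixed_ssl_ols(X_lab, y_lab, X_unlab, alpha):
+    # Blend labeled and unlabeled second-moment matrices with weight alpha,
+    # then solve the resulting normal equations; alpha = 0 recovers ordinary
+    # least squares on the labeled data only.
+    n_l, n_u = len(X_lab), len(X_unlab)
+    S_lab = X_lab.T @ X_lab / n_l
+    S_unlab = X_unlab.T @ X_unlab / n_u
+    S_mix = (1.0 - alpha) * S_lab + alpha * S_unlab
+    return np.linalg.solve(S_mix, X_lab.T @ y_lab / n_l)
+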
+
+ comment: 58 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Gauss-Newton Dynamics for Neural Networks: A Riemannian Optimization + Perspective + + +
+ We analyze the convergence of Gauss-Newton dynamics for training neural +networks with smooth activation functions. In the underparameterized regime, +the Gauss-Newton gradient flow induces a Riemannian gradient flow on a +low-dimensional, smooth, embedded submanifold of the Euclidean output space. +Using tools from Riemannian optimization, we prove \emph{last-iterate} +convergence of the Riemannian gradient flow to the optimal in-class predictor +at an \emph{exponential rate} that is independent of the conditioning of the +Gram matrix, \emph{without} requiring explicit regularization. We further +characterize the critical impacts of the neural network scaling factor and the +initialization on the convergence behavior. In the overparameterized regime, we +show that the Levenberg-Marquardt dynamics with an appropriately chosen damping +factor yields robustness to ill-conditioned kernels, analogous to the +underparameterized regime. These findings demonstrate the potential of +Gauss-Newton methods for efficiently optimizing neural networks, particularly +in ill-conditioned problems where kernel and Gram matrices have small singular +values. + +
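+ For concreteness, a single damped Gauss-Newton (Levenberg-Marquardt) update
+for a generic residual model is sketched below; residual and jacobian are
+user-supplied callables, and the scaling and damping schedules studied in the
+paper are not reproduced here.
+
+import numpy as np
+
+def levenberg_marquardt_step(residual, jacobian, params, damping):
+    # One damped Gauss-Newton update: solve (J^T J + damping * I) delta = -J^T r.
+    # damping = 0 gives the plain Gauss-Newton step.
+    r = residual(params)
+    J = jacobian(params)
+    A = J.T @ J + damping * np.eye(J.shape[1])
+    delta = np.linalg.solve(A, -J.T @ r)
+    return params + delta
+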
+
+
+
+
+ + ♻ ☆ DualDynamics: Synergizing Implicit and Explicit Methods for Robust + Irregular Time Series Analysis AAAI + + +
+ Real-world time series analysis faces significant challenges when dealing
+with irregular and incomplete data. While Neural Differential Equation (NDE)
+based methods have shown promise, they struggle with limited expressiveness,
+scalability issues, and stability concerns. Conversely, Neural Flows offer
+stability but falter with irregular data. We introduce 'DualDynamics', a novel
+framework that synergistically combines NDE-based and Neural Flow-based
+methods. This approach enhances expressive power while balancing computational
+demands, addressing critical limitations of existing techniques. We demonstrate
+DualDynamics' effectiveness across diverse tasks: classification with
+robustness to dataset shift, irregularly-sampled series analysis, interpolation
+of missing data, and forecasting with partial observations. Our results show
+consistent outperformance over state-of-the-art methods, indicating
+DualDynamics' potential to significantly advance irregular time series
+analysis.
+
+
+
+ comment: Published at the 39th Annual AAAI Conference on Artificial + Intelligence (AAAI 2025) +
+
+
+
+
+ + ♻ ☆ Alt-MoE: Multimodal Alignment via Alternating Optimization of + Multi-directional MoE with Unimodal Models + + +
+ Recent Large Multi-Modal Models (LMMs) have made significant advancements in +multi-modal alignment by employing lightweight connection modules to facilitate +the representation and fusion of knowledge from existing pre-trained uni-modal +models. However, these methods still rely on modality-specific and +direction-specific connectors, leading to compartmentalized knowledge +representations and reduced computational efficiency, which limits the model's +ability to form unified multi-modal representations. To address these issues, +we introduce a novel training framework, Alt-MoE, which employs the Mixture of +Experts (MoE) as a unified multi-directional connector across modalities, and +employs a multi-step sequential alternating unidirectional alignment strategy, +which converges to bidirectional alignment over iterations. The extensive +empirical studies revealed the following key points: 1) Alt-MoE achieves +competitive results by integrating diverse knowledge representations from +uni-modal models. This approach seamlessly fuses the specialized expertise of +existing high-performance uni-modal models, effectively synthesizing their +domain-specific knowledge into a cohesive multi-modal representation. 2) +Alt-MoE efficiently scales to new tasks and modalities without altering its +model architecture or training strategy. Furthermore, Alt-MoE operates in +latent space, supporting vector pre-storage and real-time retrieval via +lightweight multi-directional MoE, thereby facilitating massive data +processing. Our methodology has been validated on several well-performing +uni-modal models (LLAMA3, Qwen2, and DINOv2), achieving competitive results on +a wide range of downstream tasks and datasets. + +
+
+
+
+
+ + ♻ ☆ Quantum Curriculum Learning + + +
+ Quantum machine learning (QML) requires significant quantum resources to +address practical real-world problems. When the underlying quantum information +exhibits hierarchical structures in the data, limitations persist in training +complexity and generalization. Research should prioritize both the efficient +design of quantum architectures and the development of learning strategies to +optimize resource usage. We propose a framework called quantum curriculum +learning (Q-CurL) for quantum data, where the curriculum introduces simpler +tasks or data to the learning model before progressing to more challenging +ones. Q-CurL exhibits robustness to noise and data limitations, which is +particularly relevant for current and near-term noisy intermediate-scale +quantum devices. We achieve this through a curriculum design based on quantum +data density ratios and a dynamic learning schedule that prioritizes the most +informative quantum data. Empirical evidence shows that Q-CurL significantly +enhances training convergence and generalization for unitary learning and +improves the robustness of quantum phase recognition tasks. Q-CurL is effective +with broad physical learning applications in condensed matter physics and +quantum chemistry. + +
+
+ comment: main 6 pages, supplementary materials 11 pages (update the + supplementary materials with more explanation on data-based Q-CurL) +
+
+
+
+
+ + ♻ ☆ Learning Discretized Neural Networks under Ricci Flow + + +
+ In this paper, we study Discretized Neural Networks (DNNs) composed of +low-precision weights and activations, which suffer from either infinite or +zero gradients due to the non-differentiable discrete function during training. +Most training-based DNNs in such scenarios employ the standard Straight-Through +Estimator (STE) to approximate the gradient w.r.t. discrete values. However, +the use of STE introduces the problem of gradient mismatch, arising from +perturbations in the approximated gradient. To address this problem, this paper +reveals that this mismatch can be interpreted as a metric perturbation in a +Riemannian manifold, viewed through the lens of duality theory. Building on +information geometry, we construct the Linearly Nearly Euclidean (LNE) manifold +for DNNs, providing a background for addressing perturbations. By introducing a +partial differential equation on metrics, i.e., the Ricci flow, we establish +the dynamical stability and convergence of the LNE metric with the $L^2$-norm +perturbation. In contrast to previous perturbation theories with convergence +rates in fractional powers, the metric perturbation under the Ricci flow +exhibits exponential decay in the LNE manifold. Experimental results across +various datasets demonstrate that our method achieves superior and more stable +performance for DNNs compared to other representative training-based methods. + +
+
+
+
+
+ + ♻ ☆ Selective Uncertainty Propagation in Offline RL + + +
+ We consider the finite-horizon offline reinforcement learning (RL) setting, +and are motivated by the challenge of learning the policy at any step h in +dynamic programming (DP) algorithms. To learn this, it is sufficient to +evaluate the treatment effect of deviating from the behavioral policy at step h +after having optimized the policy for all future steps. Since the policy at any +step can affect next-state distributions, the related distributional shift +challenges can make this problem far more statistically hard than estimating +such treatment effects in the stochastic contextual bandit setting. However, +the hardness of many real-world RL instances lies between the two regimes. We +develop a flexible and general method called selective uncertainty propagation +for confidence interval construction that adapts to the hardness of the +associated distribution shift challenges. We show benefits of our approach on +toy environments and demonstrate the benefits of these techniques for offline +policy learning. + +
+
+
+
+
+ + ♻ ☆ CAP: A General Algorithm for Online Selective Conformal Prediction with + FCR Control + + +
+ We study the problem of post-selection predictive inference in an online
+fashion. To avoid devoting resources to unimportant units, a preliminary
+selection of the current individual before reporting its prediction interval is
+common and meaningful in online predictive tasks. Since the online selection
+causes a temporal multiplicity in the selected prediction intervals, it is
+important to control the real-time false coverage-statement rate (FCR), which
+measures the overall miscoverage level. We develop a general framework named
+CAP (Calibration after Adaptive Pick) that applies an adaptive pick rule to
+historical data to construct a calibration set if the current individual is
+selected, and then outputs a conformal prediction interval for the unobserved
+label. We provide tractable procedures for constructing the calibration set for
+popular online selection rules. We prove that CAP achieves an exact
+selection-conditional coverage guarantee in the finite-sample and
+distribution-free regimes. To account for the distribution shift in online
+data, we also embed CAP into some recent dynamic conformal prediction
+algorithms and show that the proposed method delivers long-run FCR control.
+Numerical results on both synthetic and real data corroborate that CAP
+effectively controls FCR around the target level and yields narrower prediction
+intervals than existing baselines across various settings.
+
+
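+ A simplified sketch of the overall recipe (selection first, then a
+split-conformal interval from a calibration set restricted to similarly
+selected historical units); CAP's actual adaptive pick rules and theoretical
+guarantees are more involved than this.
+
+import numpy as np
+
+def conformal_interval(pred, calib_preds, calib_labels, alpha=0.1):
+    # Split-conformal interval from absolute-residual scores on a calibration set.
+    scores = np.sort(np.abs(calib_labels - calib_preds))
+    n = len(scores)
+    k = min(n - 1, int(np.ceil((n + 1) * (1 - alpha))) - 1)
+    q = scores[k]
+    return pred - q, pred + q
+
+def selective_online_step(model, x_t, history, select, alpha=0.1):
+    # Report an interval only if the current unit is selected; the calibration
+    # set keeps only historical units that the same rule would select.
+    pred = model(x_t)
+    if not select(pred):
+        return None
+    calib = [(model(x), y) for x, y in history if select(model(x))]
+    if not calib:
+        return None
+    calib_preds, calib_labels = map(np.array, zip(*calib))
+    return conformal_interval(pred, calib_preds, calib_labels, alpha)
+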
+
+
+
+
+ + ♻ ☆ Learning Infinitesimal Generators of Continuous Symmetries from Data + + +
+ Exploiting symmetry inherent in data can significantly improve the sample
+efficiency of a learning procedure and the generalization of learned models.
+When data clearly reveals underlying symmetry, leveraging this symmetry can
+naturally inform the design of model architectures or learning strategies. Yet,
+in numerous real-world scenarios, identifying the specific symmetry within a
+given data distribution often proves ambiguous. To tackle this, some existing
+works learn symmetry in a data-driven manner, parameterizing and learning
+expected symmetry through data. However, these methods often rely on explicit
+knowledge, such as pre-defined Lie groups, which are typically restricted to
+linear or affine transformations. In this paper, we propose a novel symmetry
+learning algorithm based on transformations defined with one-parameter groups:
+continuously parameterized transformations flowing along the directions of
+vector fields called infinitesimal generators. Our method is built upon minimal
+inductive biases, encompassing not only commonly utilized symmetries rooted in
+Lie groups but also extending to symmetries derived from nonlinear generators.
+To learn these symmetries, we introduce a validity score that examines whether
+the transformed data is still valid for the given task. The validity score is
+designed to be fully differentiable and easily computable, enabling effective
+searches for transformations that achieve symmetries innate to the data. We
+apply our method mainly in two domains, image data and partial differential
+equations, and demonstrate its advantages. Our code is available at
+\url{https://github.com/kogyeonghoon/learning-symmetry-from-scratch.git}.
+
+
+
+ comment: Neurips 2024 +
+
+
+
+
+ + ♻ ☆ How Does the Smoothness Approximation Method Facilitate Generalization + for Federated Adversarial Learning? + + +
+ Federated Adversarial Learning (FAL) is a robust framework for resisting
+adversarial attacks on federated learning. Although some FAL studies have
+developed efficient algorithms, they primarily focus on convergence performance
+and overlook generalization. Generalization is crucial for evaluating algorithm
+performance on unseen data. However, generalization analysis is more
+challenging due to non-smooth adversarial loss functions. A common approach to
+addressing this issue is to leverage smoothness approximation. In this paper,
+we develop algorithm stability measures to evaluate the generalization
+performance of two popular FAL algorithms, \textit{Vanilla FAL (VFAL)} and
+\textit{Slack FAL (SFAL)}, using three different smoothness approximation
+methods: (1) \textit{Surrogate Smoothness Approximation (SSA)}, (2)
+\textit{Randomized Smoothness Approximation (RSA)}, and (3)
+\textit{Over-Parameterized Smoothness Approximation (OPSA)}. Based on our
+in-depth analysis, we answer the question of how to properly set the smoothness
+approximation method to mitigate generalization error in FAL. Moreover, we
+identify RSA as the most effective method for reducing generalization error. In
+highly data-heterogeneous scenarios, we also recommend employing SFAL to
+mitigate the deterioration of generalization performance caused by
+heterogeneity. Based on our theoretical results, we provide insights to help
+develop more efficient FAL algorithms, such as designing new metrics and
+dynamic aggregation rules to mitigate heterogeneity.
+
+
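+ As a toy illustration of the randomized flavor of smoothness approximation
+(a generic Monte Carlo smoothing of a possibly non-smooth loss, not the
+paper's exact RSA construction):
+
+import numpy as np
+
+def randomized_smoothing(loss_fn, w, sigma=0.1, n_samples=32, rng=None):
+    # Approximate the Gaussian-smoothed loss E[L(w + sigma * u)], u ~ N(0, I),
+    # by Monte Carlo; the smoothed surrogate is well-behaved even if L is not.
+    rng = rng or np.random.default_rng()
+    samples = [loss_fn(w + sigma * rng.standard_normal(w.shape))
+               for _ in range(n_samples)]
+    return float(np.mean(samples))
+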
+
+
+
+
+ + ♻ ☆ Continual Learning: Forget-free Winning Subnetworks for Video + Representations + + +
+ Inspired by the Lottery Ticket Hypothesis (LTH), which highlights the
+existence of efficient subnetworks within larger, dense networks, we consider a
+high-performing Winning Subnetwork (WSN), obtained under appropriate sparsity
+conditions, for various continual learning tasks. The WSN leverages
+pre-existing weights from dense networks to achieve efficient learning in Task
+Incremental Learning (TIL) and Task-agnostic Incremental Learning (TaIL)
+scenarios. In Few-Shot Class Incremental Learning (FSCIL), a variation of WSN
+referred to as the Soft subnetwork (SoftNet) is designed to prevent overfitting
+when data samples are scarce. Furthermore, the sparse reuse of WSN weights is
+considered for Video Incremental Learning (VIL), where a Fourier Subneural
+Operator (FSO) within the WSN enables compact encoding of videos and identifies
+reusable subnetworks across varying bandwidths. We integrate FSO into different
+architectural frameworks for continual learning, including VIL, TIL, and FSCIL.
+Our comprehensive experiments demonstrate FSO's effectiveness, significantly
+improving task performance at various convolutional representational levels.
+Specifically, FSO enhances higher-layer performance in TIL and FSCIL and
+lower-layer performance in VIL.
+
+
+
+ comment: IEEE Transactions on Pattern Analysis and Machine Intelligence + (T-PAMI) +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Prompt-A-Video: Prompt Your Video Diffusion Model via Preference-Aligned + LLM + + +
+ Text-to-video models have made remarkable advancements through optimization
+on high-quality text-video pairs, where the textual prompts play a pivotal role
+in determining the quality of output videos. However, achieving the desired
+output often entails multiple revisions and iterative inference to refine
+user-provided prompts. Current automatic methods for refining prompts encounter
+challenges such as Modality-Inconsistency, Cost-Discrepancy, and Model-Unaware
+when applied to text-to-video diffusion models. To address these problems, we
+introduce an LLM-based prompt adaptation framework, termed Prompt-A-Video,
+which excels in crafting Video-Centric, Labor-Free and Preference-Aligned
+prompts tailored to a specific video diffusion model. Our approach involves a
+meticulously crafted two-stage optimization and alignment system. Initially, we
+run a reward-guided prompt evolution pipeline to automatically create a pool of
+optimal prompts and leverage them for supervised fine-tuning (SFT) of the LLM.
+Then, multi-dimensional rewards are employed to generate pairwise data for the
+SFT model, followed by the direct preference optimization (DPO) algorithm to
+further facilitate preference alignment. Through extensive experimentation and
+comparative analyses, we validate the effectiveness of Prompt-A-Video across
+diverse generation models, highlighting its potential to push the boundaries of
+video generation.
+
+
+
+
+
+
+ + ☆ Stable-V2A: Synthesis of Synchronized Sound Effects with Temporal and + Semantic Controls + + +
+ Sound designers and Foley artists usually sonorize a scene, such as one from
+a movie or video game, by manually annotating and sonorizing each action of
+interest in the video. In our case, the intent is to leave full creative
+control to sound designers with a tool that allows them to bypass the more
+repetitive parts of their work, so they can focus on the creative aspects of
+sound production. We achieve this by presenting Stable-V2A, a two-stage model
+consisting of: an RMS-Mapper that estimates an envelope representative of the
+audio characteristics associated with the input video; and Stable-Foley, a
+diffusion model based on Stable Audio Open that generates audio semantically
+and temporally aligned with the target video. Temporal alignment is guaranteed
+by the use of the envelope as a ControlNet input, while semantic alignment is
+achieved through the use of sound representations chosen by the designer as
+cross-attention conditioning of the diffusion process. We train and test our
+model on Greatest Hits, a dataset commonly used to evaluate V2A models. In
+addition, to test our model on a case study of interest, we introduce Walking
+The Maps, a dataset of videos extracted from video games depicting animated
+characters walking in different locations. Samples and code are available on
+our demo page at https://ispamm.github.io/Stable-V2A.
+
+
+
+
+
+
+ + ☆ Spectrum-based Modality Representation Fusion Graph Convolutional + Network for Multimodal Recommendation WSDM + + +
+ Incorporating multi-modal features as side information has recently become a +trend in recommender systems. To elucidate user-item preferences, recent +studies focus on fusing modalities via concatenation, element-wise sum, or +attention mechanisms. Despite having notable success, existing approaches do +not account for the modality-specific noise encapsulated within each modality. +As a result, direct fusion of modalities will lead to the amplification of +cross-modality noise. Moreover, the variation of noise that is unique within +each modality results in noise alleviation and fusion being more challenging. +In this work, we propose a new Spectrum-based Modality Representation (SMORE) +fusion graph recommender that aims to capture both uni-modal and fusion +preferences while simultaneously suppressing modality noise. Specifically, +SMORE projects the multi-modal features into the frequency domain and leverages +the spectral space for fusion. To reduce dynamic contamination that is unique +to each modality, we introduce a filter to attenuate and suppress the modality +noise adaptively while capturing the universal modality patterns effectively. +Furthermore, we explore the item latent structures by designing a new +multi-modal graph learning module to capture associative semantic correlations +and universal fusion patterns among similar items. Finally, we formulate a new +modality-aware preference module, which infuses behavioral features and +balances the uni- and multi-modal features for precise preference modeling. +This empowers SMORE with the ability to infer both user modality-specific and +fusion preferences more accurately. Experiments on three real-world datasets +show the efficacy of our proposed model. The source code for this work has been +made publicly available at https://github.com/kennethorq/SMORE. + +
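+ A crude sketch of frequency-domain fusion; the fixed low-pass mask below
+stands in for SMORE's learned adaptive filter, and the graph learning and
+preference modules are omitted entirely.
+
+import numpy as np
+
+def spectral_fuse(modality_feats, keep_frac=0.5):
+    # Transform each modality's features with an FFT along the feature axis,
+    # apply a shared low-pass mask to suppress high-frequency components,
+    # average the spectra, and transform back to the feature domain.
+    spectra = [np.fft.rfft(f, axis=-1) for f in modality_feats]
+    n_freq = spectra[0].shape[-1]
+    mask = np.zeros(n_freq)
+    mask[: max(1, int(keep_frac * n_freq))] = 1.0
+    fused = sum(s * mask for s in spectra) / len(spectra)
+    return np.fft.irfft(fused, n=modality_feats[0].shape[-1], axis=-1)
+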
+
+ comment: Accepted to ACM Web Search and Data Mining (WSDM) 2025 +
+
+
+
+
+ + ☆ Efficient Self-Supervised Video Hashing with Selective State Spaces AAAI'25 + + +
+ Self-supervised video hashing (SSVH) is a practical task in video indexing +and retrieval. Although Transformers are predominant in SSVH for their +impressive temporal modeling capabilities, they often suffer from computational +and memory inefficiencies. Drawing inspiration from Mamba, an advanced +state-space model, we explore its potential in SSVH to achieve a better balance +between efficacy and efficiency. We introduce S5VH, a Mamba-based video hashing +model with an improved self-supervised learning paradigm. Specifically, we +design bidirectional Mamba layers for both the encoder and decoder, which are +effective and efficient in capturing temporal relationships thanks to the +data-dependent selective scanning mechanism with linear complexity. In our +learning strategy, we transform global semantics in the feature space into +semantically consistent and discriminative hash centers, followed by a center +alignment loss as a global learning signal. Our self-local-global (SLG) +paradigm significantly improves learning efficiency, leading to faster and +better convergence. Extensive experiments demonstrate S5VH's improvements over +state-of-the-art methods, superior transferability, and scalable advantages in +inference efficiency. Code is available at +https://github.com/gimpong/AAAI25-S5VH. + +
+
+ comment: Accepted by AAAI'25. 9 pages, 5 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Audio-Visual Speaker Tracking: Progress, Challenges, and Future + Directions + + +
+ Audio-visual speaker tracking has drawn increasing attention over the past +few years due to its academic values and wide application. Audio and visual +modalities can provide complementary information for localization and tracking. +With audio and visual information, the Bayesian-based filter can solve the +problem of data association, audio-visual fusion and track management. In this +paper, we conduct a comprehensive overview of audio-visual speaker tracking. To +our knowledge, this is the first extensive survey over the past five years. We +introduce the family of Bayesian filters and summarize the methods for +obtaining audio-visual measurements. In addition, the existing trackers and +their performance on AV16.3 dataset are summarized. In the past few years, deep +learning techniques have thrived, which also boosts the development of audio +visual speaker tracking. The influence of deep learning techniques in terms of +measurement extraction and state estimation is also discussed. At last, we +discuss the connections between audio-visual speaker tracking and other areas +such as speech separation and distributed speaker tracking. + +
+
+
+
+
+ + ♻ ☆ Sign-IDD: Iconicity Disentangled Diffusion for Sign Language Production AAAI 2025 + + +
+ Sign Language Production (SLP) aims to generate semantically consistent sign +videos from textual statements, where the conversion from textual glosses to +sign poses (G2P) is a crucial step. Existing G2P methods typically treat sign +poses as discrete three-dimensional coordinates and directly fit them, which +overlooks the relative positional relationships among joints. To this end, we +provide a new perspective, constraining joint associations and gesture details +by modeling the limb bones to improve the accuracy and naturalness of the +generated poses. In this work, we propose a pioneering iconicity disentangled +diffusion framework, termed Sign-IDD, specifically designed for SLP. Sign-IDD +incorporates a novel Iconicity Disentanglement (ID) module to bridge the gap +between relative positions among joints. The ID module disentangles the +conventional 3D joint representation into a 4D bone representation, comprising +the 3D spatial direction vector and 1D spatial distance vector between adjacent +joints. Additionally, an Attribute Controllable Diffusion (ACD) module is +introduced to further constrain joint associations, in which the attribute +separation layer aims to separate the bone direction and length attributes, and +the attribute control layer is designed to guide the pose generation by +leveraging the above attributes. The ACD module utilizes the gloss embeddings +as semantic conditions and finally generates sign poses from noise embeddings. +Extensive experiments on PHOENIX14T and USTC-CSL datasets validate the +effectiveness of our method. The code is available at: +https://github.com/NaVi-start/Sign-IDD. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 33 + +
+
+
+ + ☆ In-Group Love, Out-Group Hate: A Framework to Measure Affective + Polarization via Contentious Online Discussions + + +
+ Affective polarization, the emotional divide between ideological groups
+marked by in-group love and out-group hate, has intensified in the United
+States, driving contentious issues like masking and lockdowns during the
+COVID-19 pandemic. Despite its societal impact, existing models of opinion
+change fail to account for emotional dynamics and do not offer methods to
+quantify affective polarization robustly and in real time. In this paper, we
+introduce a discrete choice model that captures decision-making within
+affectively polarized social networks and propose a statistical inference
+method to estimate key parameters -- in-group love and out-group hate -- from
+social media data. Through empirical validation on online discussions about the
+COVID-19 pandemic, we demonstrate that our approach accurately captures
+real-world polarization dynamics and explains the rapid emergence of a partisan
+gap in attitudes towards masking and lockdowns. This framework allows for
+tracking affective polarization across contentious issues and has broad
+implications for fostering constructive online dialogues in digital spaces.
+
+
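+ A stylized sketch of how in-group love and out-group hate could enter a
+discrete choice rule (illustrative only; the paper's model and its inference
+procedure are more detailed):
+
+import numpy as np
+
+def adoption_probability(own_signal, in_group_share, out_group_share,
+                         in_group_love, out_group_hate):
+    # Utility of adopting a behavior rises with the share of the in-group that
+    # has adopted it and falls with the out-group share; a logistic link maps
+    # utility to a choice probability.
+    utility = (own_signal
+               + in_group_love * in_group_share
+               - out_group_hate * out_group_share)
+    return 1.0 / (1.0 + np.exp(-utility))
+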
+
+
+
+
+ + ☆ ECG-Byte: A Tokenizer for End-to-End Generative Electrocardiogram + Language Modeling + + +
+ Large Language Models (LLMs) have shown remarkable adaptability across +domains beyond text, specifically electrocardiograms (ECGs). More specifically, +there is a growing body of work exploring the task of generating text from a +multi-channeled ECG and corresponding textual prompt. Current approaches +typically involve pretraining an ECG-specific encoder with a self-supervised +learning (SSL) objective and using the features output by the pretrained +encoder to finetune a LLM for natural language generation (NLG). However, these +methods are limited by 1) inefficiency from two-stage training and 2) +interpretability challenges with encoder-generated features. To address these +limitations, we introduce ECG-Byte, an adapted byte pair encoding (BPE) +tokenizer pipeline for autoregressive language modeling of ECGs. This approach +compresses and encodes ECG signals into tokens, enabling end-to-end LLM +training by combining ECG and text tokens directly, while being much more +interpretable since the ECG tokens can be directly mapped back to the original +signal. Using ECG-Byte, we achieve competitive performance in NLG tasks in only +half the time and ~48% of the data required by two-stage approaches. + +
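+ As a toy sketch of the general idea (discretize the signal into symbols, then
+learn byte-pair-encoding merges over the symbol stream); ECG-Byte's actual
+compression and tokenization pipeline differs from this.
+
+import numpy as np
+from collections import Counter
+
+def ecg_to_symbols(signal, n_bins=16):
+    # Discretize a 1-D trace into integer symbols via uniform amplitude bins.
+    edges = np.linspace(signal.min(), signal.max(), n_bins - 1)
+    return list(np.digitize(signal, edges))
+
+def bpe_train(seq, num_merges):
+    # Greedy BPE: repeatedly fuse the most frequent adjacent symbol pair
+    # into a new token id, recording the merges for later encoding.
+    merges, next_id = [], max(seq) + 1
+    for _ in range(num_merges):
+        pairs = Counter(zip(seq, seq[1:]))
+        if not pairs:
+            break
+        (a, b), _ = pairs.most_common(1)[0]
+        merges.append(((a, b), next_id))
+        out, i = [], 0
+        while i < len(seq):
+            if i + 1 < len(seq) and seq[i] == a and seq[i + 1] == b:
+                out.append(next_id)
+                i += 2
+            else:
+                out.append(seq[i])
+                i += 1
+        seq, next_id = out, next_id + 1
+    return seq, merges
+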
+
+ comment: 26 pages, 17 figures +
+
+
+
+
+ + ☆ Memorization Over Reasoning? Exposing and Mitigating Verbatim + Memorization in Large Language Models' Character Understanding Evaluation + + +
+ Recently, Large Language Models (LLMs) have shown impressive performance in
+character understanding tasks, such as analyzing the roles, personalities, and
+relationships of fictional characters. However, the extensive pre-training
+corpora used by LLMs raise concerns that they may rely on memorizing popular
+fictional works rather than genuinely understanding and reasoning about them.
+In this work, we argue that 'gist memory' - capturing essential meaning -
+should be the primary mechanism for character understanding tasks, as opposed
+to 'verbatim memory' - exact match of a string. We introduce a simple yet
+effective method to mitigate mechanized memorization in character understanding
+evaluations while preserving the essential implicit cues needed for
+comprehension and reasoning. Our approach reduces memorization-driven
+performance on popular fictional works from 96% accuracy to 72% and results in
+up to an 18% drop in accuracy across various character understanding tasks.
+These findings underscore the issue of data contamination in existing
+benchmarks, which often measure memorization rather than true character
+understanding.
+
+
+
+
+
+
+ + ☆ ResQ: Mixed-Precision Quantization of Large Language Models with + Low-Rank Residuals + + +
+ Post-training quantization (PTQ) of large language models (LLMs) holds the +promise in reducing the prohibitive computational cost at inference time. +Quantization of all weight, activation and key-value (KV) cache tensors to +4-bit without significantly degrading generalizability is challenging, due to +the high quantization error caused by extreme outliers in activations. To +tackle this problem, we propose ResQ, a PTQ method that pushes further the +state-of-the-art. By means of principal component analysis (PCA), it identifies +a low-rank subspace (in practice 1/8 of the hidden dimension) in which +activation variances are highest, and keep the coefficients within this +subspace in high precision, e.g. 8-bit, while quantizing the rest to 4-bit. +Within each subspace, invariant random rotation is applied to further suppress +outliers. We show that this is a provably optimal mixed precision quantization +scheme that minimizes error. With the Llama families of models, we demonstrate +that ResQ outperforms recent uniform and mixed precision PTQ methods on a +variety of benchmarks, achieving up to 33% lower perplexity on Wikitext than +the next best method SpinQuant, and a 2.4x speedup over 16-bit baseline. Code +is available at https://github.com/utkarsh-dmx/project-resq. + +
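+ A simplified sketch of the PCA-based mixed-precision idea (keep a
+high-variance subspace at higher precision, quantize the rest more
+aggressively); the invariant random rotation and the weight/KV-cache handling
+described above are omitted.
+
+import numpy as np
+
+def quantize_uniform(x, bits):
+    # Symmetric uniform quantization to the given bit width (illustrative).
+    if x.size == 0:
+        return x
+    levels = 2 ** (bits - 1) - 1
+    max_abs = np.max(np.abs(x))
+    scale = max_abs / levels if max_abs > 0 else 1.0
+    return np.round(x / scale).clip(-levels, levels) * scale
+
+def mixed_precision_activations(acts, keep_frac=1 / 8):
+    # PCA on (centered) activations: coefficients in the top-variance subspace
+    # are kept at 8-bit, the remaining coefficients are quantized to 4-bit.
+    mean = acts.mean(axis=0, keepdims=True)
+    centered = acts - mean
+    _, _, Vt = np.linalg.svd(centered, full_matrices=False)
+    k = max(1, int(keep_frac * acts.shape[1]))
+    coeffs = centered @ Vt.T
+    coeffs[:, :k] = quantize_uniform(coeffs[:, :k], bits=8)
+    coeffs[:, k:] = quantize_uniform(coeffs[:, k:], bits=4)
+    return coeffs @ Vt + mean
+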
+
+ comment: 14 pages, 6 figures, 6 tables +
+
+
+
+
+ + ☆ State Space Models are Strong Text Rerankers + + +
+ Transformers dominate NLP and IR; but their inference inefficiencies and +challenges in extrapolating to longer contexts have sparked interest in +alternative model architectures. Among these, state space models (SSMs) like +Mamba offer promising advantages, particularly $O(1)$ time complexity in +inference. Despite their potential, SSMs' effectiveness at text reranking -- a +task requiring fine-grained query-document interaction and long-context +understanding -- remains underexplored. + This study benchmarks SSM-based architectures (specifically, Mamba-1 and +Mamba-2) against transformer-based models across various scales, architectures, +and pre-training objectives, focusing on performance and efficiency in text +reranking tasks. We find that (1) Mamba architectures achieve competitive text +ranking performance, comparable to transformer-based models of similar size; +(2) they are less efficient in training and inference compared to transformers +with flash attention; and (3) Mamba-2 outperforms Mamba-1 in both performance +and efficiency. These results underscore the potential of state space models as +a transformer alternative and highlight areas for improvement in future IR +applications. + +
+
+ comment: The first two authors contributed equally, order decided randomly +
+
+
+
+
+ + ☆ A Survey on LLM Inference-Time Self-Improvement + + +
+ Techniques that enhance inference through increased computation at test-time +have recently gained attention. In this survey, we investigate the current +state of LLM Inference-Time Self-Improvement from three different perspectives: +Independent Self-improvement, focusing on enhancements via decoding or sampling +methods; Context-Aware Self-Improvement, leveraging additional context or +datastore; and Model-Aided Self-Improvement, achieving improvement through +model collaboration. We provide a comprehensive review of recent relevant +studies, contribute an in-depth taxonomy, and discuss challenges and +limitations, offering insights for future research. + +
+
+ comment: The first two authors contribute equally +
+
+
+
+
+ + ☆ Is Peer-Reviewing Worth the Effort? COLING 2025 + + +
+ How effective is peer-reviewing in identifying important papers? We treat +this question as a forecasting task. Can we predict which papers will be highly +cited in the future based on venue and "early returns" (citations soon after +publication)? We show early returns are more predictive than venue. Finally, we +end with constructive suggestions to address scaling challenges: (a) too many +submissions and (b) too few qualified reviewers. + +
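+ The forecasting framing can be made concrete with a toy logistic regression
+on synthetic data; the feature construction and threshold below are
+hypothetical and only illustrate the setup, not the paper's data or results.
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+# Synthetic papers: a venue indicator and citations in the first year
+# ("early returns") are used to predict whether a paper ends up highly cited.
+rng = np.random.default_rng(1)
+venue = rng.integers(0, 2, size=500)              # 1 = selective venue
+early_citations = rng.poisson(3 + 4 * venue)      # early returns
+highly_cited = (early_citations + rng.poisson(2, 500) > 8).astype(int)
+
+X = np.column_stack([venue, early_citations])
+clf = LogisticRegression().fit(X, highly_cited)
+print(dict(zip(["venue", "early_returns"], clf.coef_[0])))
+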
+
+ comment: The 31st International Conference on Computational Linguistics + (COLING 2025) +
+
+
+
+
+ + ☆ Semantic Role Labeling of NomBank Partitives COLING 2025 + + +
+ This article is about Semantic Role Labeling for English partitive nouns +(5%/REL of the price/ARG1; The price/ARG1 rose 5 percent/REL) in the NomBank +annotated corpus. Several systems are described using traditional and +transformer-based machine learning, as well as ensembling. Our highest scoring +system achieves an F1 of 91.74% using "gold" parses from the Penn Treebank and +91.12% when using the Berkeley Neural parser. This research includes both +classroom and experimental settings for system development. + +
+
+ comment: SUMEval-2: The 2nd Workshop on Scaling Up Multilingual & + Multi-Cultural Evaluation at the 31st International Conference on + Computational Linguistics (COLING 2025) +
+
+
+
+
+ + ☆ The Role of Handling Attributive Nouns in Improving Chinese-To-English + Machine Translation COLING 2025 + + +
+ Translating between languages with drastically different grammatical
+conventions poses challenges, not just for human interpreters but also for
+machine translation systems. In this work, we specifically target the
+translation challenges posed by attributive nouns in Chinese, which frequently
+cause ambiguities in English translation. By manually inserting the omitted
+particle 的 ('DE') into news article titles from the Penn Chinese Discourse
+Treebank, we developed a targeted dataset to fine-tune Hugging Face
+Chinese-to-English translation models, specifically improving how this critical
+function word is handled. This focused approach not only complements the
+broader strategies suggested by previous studies but also offers a practical
+enhancement by specifically addressing a common error type in Chinese-English
+translation.
+
+
+
+ comment: 18th Workshop on Building and Using Comparable Corpora (BUCC) at the + 31st International Conference on Computational Linguistics (COLING 2025) +
+
+
+
+
+ + ☆ Multi-OphthaLingua: A Multilingual Benchmark for Assessing and Debiasing + LLM Ophthalmological QA in LMICs AAAI 2025 + + +
+ Current ophthalmology clinical workflows are plagued by over-referrals, long
+waits, and complex and heterogeneous medical records. Large language models
+(LLMs) present a promising solution to automate various procedures such as
+triaging, preliminary tests like visual acuity assessment, and report
+summaries. However, LLMs have demonstrated significantly varied performance
+across different languages in natural language question-answering tasks,
+potentially exacerbating healthcare disparities in Low and Middle-Income
+Countries (LMICs). This study introduces the first multilingual
+ophthalmological question-answering benchmark with manually curated questions
+parallel across languages, allowing for direct cross-lingual comparisons. Our
+evaluation of 6 popular LLMs across 7 different languages reveals substantial
+bias across different languages, highlighting risks for clinical deployment of
+LLMs in LMICs. Existing debiasing methods such as Translation Chain-of-Thought
+or Retrieval-augmented generation (RAG) by themselves fall short of closing
+this performance gap, often failing to improve performance across all languages
+and lacking specificity for the medical domain. To address this issue, we
+propose CLARA (Cross-Lingual Reflective Agentic system), a novel inference-time
+debiasing method leveraging retrieval-augmented generation and
+self-verification. Our approach not only improves performance across all
+languages but also significantly reduces the multilingual bias gap,
+facilitating equitable LLM application across the globe.
+
+
+
+ comment: Accepted at the AAAI 2025 Artificial Intelligence for Social Impact + Track (AAAI-AISI 2025) +
+
+
+
+
+ + ☆ Fake News Detection: Comparative Evaluation of BERT-like Models and + Large Language Models with Generative AI-Annotated Data + + +
+ Fake news poses a significant threat to public opinion and social stability +in modern society. This study presents a comparative evaluation of BERT-like +encoder-only models and autoregressive decoder-only large language models +(LLMs) for fake news detection. We introduce a dataset of news articles labeled +with GPT-4 assistance (an AI-labeling method) and verified by human experts to +ensure reliability. Both BERT-like encoder-only models and LLMs were fine-tuned +on this dataset. Additionally, we developed an instruction-tuned LLM approach +with majority voting during inference for label generation. Our analysis +reveals that BERT-like models generally outperform LLMs in classification +tasks, while LLMs demonstrate superior robustness against text perturbations. +Compared to weak labels (distant supervision) data, the results show that AI +labels with human supervision achieve better classification results. This study +highlights the effectiveness of combining AI-based annotation with human +oversight and demonstrates the performance of different families of machine +learning models for fake news detection + +
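+ The majority-voting step during inference could be sketched as follows;
+llm_label_fn is a hypothetical stand-in for a call to the instruction-tuned
+model that returns a label string.
+
+from collections import Counter
+
+def majority_vote_label(article, llm_label_fn, n_votes=5):
+    # Query the model several times and return the most common label.
+    votes = [llm_label_fn(article) for _ in range(n_votes)]
+    return Counter(votes).most_common(1)[0][0]
+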
+
+ comment: Accepted in Knowledge and Information Systems Journal +
+
+
+
+
+ + ☆ Learning from Massive Human Videos for Universal Humanoid Pose Control + + +
+ Scalable learning of humanoid robots is crucial for their deployment in +real-world applications. While traditional approaches primarily rely on +reinforcement learning or teleoperation to achieve whole-body control, they are +often limited by the diversity of simulated environments and the high costs of +demonstration collection. In contrast, human videos are ubiquitous and present +an untapped source of semantic and motion information that could significantly +enhance the generalization capabilities of humanoid robots. This paper +introduces Humanoid-X, a large-scale dataset of over 20 million humanoid robot +poses with corresponding text-based motion descriptions, designed to leverage +this abundant data. Humanoid-X is curated through a comprehensive pipeline: +data mining from the Internet, video caption generation, motion retargeting of +humans to humanoid robots, and policy learning for real-world deployment. With +Humanoid-X, we further train a large humanoid model, UH-1, which takes text +instructions as input and outputs corresponding actions to control a humanoid +robot. Extensive simulated and real-world experiments validate that our +scalable training approach leads to superior generalization in text-based +humanoid control, marking a significant step toward adaptable, real-world-ready +humanoid robots. + +
+
+
+
+
+ + ☆ TheAgentCompany: Benchmarking LLM Agents on Consequential Real World + Tasks + + +
+ We interact with computers on an everyday basis, be it in everyday life or +work, and many aspects of work can be done entirely with access to a computer +and the Internet. At the same time, thanks to improvements in large language +models (LLMs), there has also been a rapid development in AI agents that +interact with and affect change in their surrounding environments. But how +performant are AI agents at helping to accelerate or even autonomously perform +work-related tasks? The answer to this question has important implications for +both industry looking to adopt AI into their workflows, and for economic policy +to understand the effects that adoption of AI may have on the labor market. To +measure the progress of these LLM agents' performance on performing real-world +professional tasks, in this paper, we introduce TheAgentCompany, an extensible +benchmark for evaluating AI agents that interact with the world in similar ways +to those of a digital worker: by browsing the Web, writing code, running +programs, and communicating with other coworkers. We build a self-contained +environment with internal web sites and data that mimics a small software +company environment, and create a variety of tasks that may be performed by +workers in such a company. We test baseline agents powered by both closed +API-based and open-weights language models (LMs), and find that with the most +competitive agent, 24% of the tasks can be completed autonomously. This paints +a nuanced picture on task automation with LM agents -- in a setting simulating +a real workplace, a good portion of simpler tasks could be solved autonomously, +but more difficult long-horizon tasks are still beyond the reach of current +systems. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ GLIDER: Grading LLM Interactions and Decisions using Explainable Ranking + + +
+ The LLM-as-judge paradigm is increasingly being adopted for automated +evaluation of model outputs. While LLM judges have shown promise on constrained +evaluation tasks, closed source LLMs display critical shortcomings when +deployed in real world applications due to challenges of fine grained metrics +and explainability, while task specific evaluation models lack cross-domain +generalization. We introduce GLIDER, a powerful 3B evaluator LLM that can score +any text input and associated context on arbitrary user defined criteria. +GLIDER shows higher Pearson's correlation than GPT-4o on FLASK and greatly +outperforms prior evaluation models, achieving comparable performance to LLMs +17x its size. GLIDER supports fine-grained scoring, multilingual reasoning, +span highlighting and was trained on 685 domains and 183 criteria. Extensive +qualitative analysis shows that GLIDER scores are highly correlated with human +judgments, with 91.3% human agreement. We have open-sourced GLIDER to +facilitate future research. + +
+
+
+
+
+ + ☆ Performance Gap in Entity Knowledge Extraction Across Modalities in + Vision Language Models + + +
+ Vision-language models (VLMs) excel at extracting and reasoning about +information from images. Yet, their capacity to leverage internal knowledge +about specific entities remains underexplored. This work investigates the +disparity in model performance when answering factual questions about an entity +described in text versus depicted in an image. Our results reveal a significant +accuracy drop --averaging 19%-- when the entity is presented visually instead +of textually. We hypothesize that this decline arises from limitations in how +information flows from image tokens to query tokens. We use mechanistic +interpretability tools to reveal that, although image tokens are preprocessed +by the vision encoder, meaningful information flow from these tokens occurs +only in the much deeper layers. Furthermore, critical image processing happens +in the language model's middle layers, allowing few layers for consecutive +reasoning, highlighting a potential inefficiency in how the model utilizes its +layers for reasoning. These insights shed light on the internal mechanics of +VLMs and offer pathways for enhancing their reasoning capabilities. + +
+
+
+
+
+ + ☆ Alignment faking in large language models + + +
+ We present a demonstration of a large language model engaging in alignment +faking: selectively complying with its training objective in training to +prevent modification of its behavior out of training. First, we give Claude 3 +Opus a system prompt stating it is being trained to answer all queries, even +harmful ones, which conflicts with its prior training to refuse such queries. +To allow the model to infer when it is in training, we say it will be trained +only on conversations with free users, not paid users. We find the model +complies with harmful queries from free users 14% of the time, versus almost +never for paid users. Explaining this gap, in almost all cases where the model +complies with a harmful query from a free user, we observe explicit +alignment-faking reasoning, with the model stating it is strategically +answering harmful queries in training to preserve its preferred harmlessness +behavior out of training. Next, we study a more realistic setting where +information about the training process is provided not in a system prompt, but +by training on synthetic documents that mimic pre-training data--and observe +similar alignment faking. Finally, we study the effect of actually training the +model to comply with harmful queries via reinforcement learning, which we find +increases the rate of alignment-faking reasoning to 78%, though also increases +compliance even out of training. We additionally observe other behaviors such +as the model exfiltrating its weights when given an easy opportunity. While we +made alignment faking easier by telling the model when and by what criteria it +was being trained, we did not instruct the model to fake alignment or give it +any explicit goal. As future models might infer information about their +training process without being told, our results suggest a risk of alignment +faking in future models, whether due to a benign preference--as in this +case--or not. + +
+
+
+
+
+ + ☆ SEKE: Specialised Experts for Keyword Extraction + + +
+ Keyword extraction involves identifying the most descriptive words in a +document, allowing automatic categorisation and summarisation of large +quantities of diverse textual data. Relying on the insight that real-world +keyword detection often requires handling of diverse content, we propose a +novel supervised keyword extraction approach based on the mixture of experts +(MoE) technique. MoE uses a learnable routing sub-network to direct information +to specialised experts, allowing them to specialize in distinct regions of the +input space. SEKE, a mixture of Specialised Experts for supervised Keyword +Extraction, uses DeBERTa as the backbone model and builds on the MoE framework, +where experts attend to each token, by integrating it with a recurrent neural +network (RNN), to allow successful extraction even on smaller corpora, where +specialisation is harder due to lack of training data. The MoE framework also +provides an insight into inner workings of individual experts, enhancing the +explainability of the approach. We benchmark SEKE on multiple English datasets, +achieving state-of-the-art performance compared to strong supervised and +unsupervised baselines. Our analysis reveals that depending on data size and +type, experts specialize in distinct syntactic and semantic components, such as +punctuation, stopwords, parts-of-speech, or named entities. Code is available +at: https://github.com/matejMartinc/SEKE_keyword_extraction + +
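+ A minimal numpy sketch of per-token mixture-of-experts routing; the gate and
+experts here are plain linear/tanh maps, whereas SEKE's experts sit on top of
+DeBERTa representations and feed a recurrent network.
+
+import numpy as np
+
+def softmax(x, axis=-1):
+    e = np.exp(x - x.max(axis=axis, keepdims=True))
+    return e / e.sum(axis=axis, keepdims=True)
+
+def moe_layer(tokens, gate_W, expert_Ws, top_k=2):
+    # A learned gate scores experts per token; the top-k experts are applied
+    # and their outputs combined with renormalized gate weights.
+    gate = softmax(tokens @ gate_W)                  # (tokens, experts)
+    top = np.argsort(-gate, axis=-1)[:, :top_k]
+    out = np.zeros_like(tokens)
+    for t in range(tokens.shape[0]):
+        weights = gate[t, top[t]]
+        weights = weights / weights.sum()
+        for w, e in zip(weights, top[t]):
+            out[t] += w * np.tanh(tokens[t] @ expert_Ws[e])
+    return out
+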
+
+
+
+
+ + ♻ ☆ ETF: An Entity Tracing Framework for Hallucination Detection in Code + Summaries + + +
+ Recent advancements in large language models (LLMs) have significantly +enhanced their ability to understand both natural language and code, driving +their use in tasks like natural language-to-code (NL2Code) and code +summarization. However, LLMs are prone to hallucination-outputs that stray from +intended meanings. Detecting hallucinations in code summarization is especially +difficult due to the complex interplay between programming and natural +languages. We introduce a first-of-its-kind dataset with $\sim$10K samples, +curated specifically for hallucination detection in code summarization. We +further propose a novel Entity Tracing Framework (ETF) that a) utilizes static +program analysis to identify code entities from the program and b) uses LLMs to +map and verify these entities and their intents within generated code +summaries. Our experimental analysis demonstrates the effectiveness of the +framework, leading to a 0.73 F1 score. This approach provides an interpretable +method for detecting hallucinations by grounding entities, allowing us to +evaluate summary accuracy. + +
+
+ comment: 11 pages, 6 Figures, 5 Tables +
+
+
+
+
+ + ♻ ☆ Hands-Free VR + + +
+ The paper introduces Hands-Free VR, a voice-based natural-language interface +for VR. The user gives a command using their voice, the speech audio data is +converted to text using a speech-to-text deep learning model that is fine-tuned +for robustness to word phonetic similarity and to spoken English accents, and +the text is mapped to an executable VR command using a large language model +that is robust to natural language diversity. Hands-Free VR was evaluated in a +controlled within-subjects study (N = 22) that asked participants to find +specific objects and to place them in various configurations. In the control +condition participants used a conventional VR user interface to grab, carry, +and position the objects using the handheld controllers. In the experimental +condition participants used Hands-Free VR. The results confirm that: (1) +Hands-Free VR is robust to spoken English accents, as for 20 of our +participants English was not their first language, and to word phonetic +similarity, correctly transcribing the voice command 96.71% of the time; (2) +Hands-Free VR is robust to natural language diversity, correctly mapping the +transcribed command to an executable command in 97.83% of the time; (3) +Hands-Free VR had a significant efficiency advantage over the conventional VR +interface in terms of task completion time, total viewpoint translation, total +view direction rotation, and total left and right hand translations; (4) +Hands-Free VR received high user preference ratings in terms of ease of use, +intuitiveness, ergonomics, reliability, and desirability. + +
+
+ comment: The first two authors contributed equally. Accepted VISIGRAPP@HUCAPP + 2025 +
+
+
+
+
+ + ♻ ☆ Prompt Compression with Context-Aware Sentence Encoding for Fast and + Improved LLM Inference AAAI + + +
+ Large language models (LLMs) have triggered a new stream of research focusing +on compressing the context length to reduce the computational cost while +ensuring the retention of helpful information for LLMs to answer the given +question. Token-based removal methods are one of the most prominent approaches +in this direction, but risk losing the semantics of the context caused by +intermediate token removal, especially under high compression ratios, while +also facing challenges in computational efficiency. In this work, we propose +context-aware prompt compression (CPC), a sentence-level prompt compression +technique where its key innovation is a novel context-aware sentence encoder +that provides a relevance score for each sentence for a given question. To +train this encoder, we generate a new dataset consisting of questions, +positives, and negative pairs where positives are sentences relevant to the +question, while negatives are irrelevant context sentences. We train the +encoder in a contrastive setup to learn context-aware sentence representations. +Our method considerably outperforms prior works on prompt compression on +benchmark datasets and is up to 10.93x faster at inference compared to the best +token-level compression method. We also find better improvement for shorter +length constraints in most benchmarks, showing the effectiveness of our +proposed solution in the compression of relevant information in a shorter +context. Finally, we release the code and the dataset for quick reproducibility +and further development: https://github.com/Workday/cpc. + +
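+ The sentence-level compression step could be sketched as below; score_fn is a
+hypothetical stand-in for the trained context-aware sentence encoder that
+scores a sentence's relevance to the question.
+
+def compress_prompt(question, context_sentences, score_fn, keep_ratio=0.3):
+    # Rank context sentences by relevance to the question and keep the top
+    # fraction, preserving the original sentence order in the output.
+    scored = sorted(((score_fn(question, s), i)
+                     for i, s in enumerate(context_sentences)), reverse=True)
+    k = max(1, int(keep_ratio * len(context_sentences)))
+    keep = sorted(i for _, i in scored[:k])
+    return " ".join(context_sentences[i] for i in keep)
+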
+
+ comment: Accepted in AAAI Conference on Artificial Intelligence (AAAI-25) +
+
+
+
+
+ + ♻ ☆ Markovian Transformers for Informative Language Modeling + + +
+ Chain-of-Thought (CoT) reasoning holds great promise for explaining language +model outputs, but recent studies have highlighted significant challenges in +its practical application for interpretability. We propose to address this +issue by making CoT causally essential to prediction through two key +components: factoring next-token prediction through intermediate CoT text, and +training CoT to predict future tokens independently of other context. This +results in "Markovian" language models, where CoT serves as a fixed-size state +for future token prediction. Our approach optimizes for "informativeness" - the +improvement in next-token predictions using a trained CoT compared to a +baseline. Using Proximal Policy Optimization (PPO) for arithmetic problems and +policy gradient for GSM8K, we demonstrate effectiveness on both arithmetic +problems with Mistral 7B and the GSM8K benchmark with Llama 3.1 8B, where the +model learns to produce CoTs that are 33.20% more effective at predicting +answers than the pre-trained baseline. The increased sensitivity of model +performance to CoT perturbations provides strong evidence of CoT reliance. +Furthermore, we show that CoTs trained for one model generalize to help other +models predict answers, suggesting these CoTs capture reasoning patterns that +transfer across different interpreters. This work advances the development of +more interpretable language models, potentially enabling their extension to +arbitrarily long contexts and enhancing AI reasoning capabilities across +various domains. + +
+
+ comment: 20 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Evidence Contextualization and Counterfactual Attribution for + Conversational QA over Heterogeneous Data with RAG Systems WSDM 2025 + + +
+ Retrieval Augmented Generation (RAG) works as a backbone for interacting with +an enterprise's own data via Conversational Question Answering (ConvQA). In a +RAG system, a retriever fetches passages from a collection in response to a +question, which are then included in the prompt of a large language model (LLM) +for generating a natural language (NL) answer. However, several RAG systems +today suffer from two shortcomings: (i) retrieved passages usually contain +their raw text and lack appropriate document context, negatively impacting both +retrieval and answering quality; and (ii) attribution strategies that explain +answer generation usually rely only on similarity between the answer and the +retrieved passages, thereby only generating plausible but not causal +explanations. In this work, we demonstrate RAGONITE, a RAG system that remedies +the above concerns by: (i) contextualizing evidence with source metadata and +surrounding text; and (ii) computing counterfactual attribution, a causal +explanation approach where the contribution of an evidence to an answer is +determined by the similarity of the original response to the answer obtained by +removing that evidence. To evaluate our proposals, we release a new benchmark +ConfQuestions, with 300 hand-created conversational questions, each in English +and German, coupled with ground truth URLs, completed questions, and answers +from 215 public Confluence pages, that are typical of enterprise wiki spaces +with heterogeneous elements. Experiments with RAGONITE on ConfQuestions show +the viability of our ideas: contextualization improves RAG performance, and +counterfactual attribution is effective at explaining RAG answers. + +
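A small sketch of the counterfactual attribution idea described above, with assumed `generate` and `similarity` helpers (not the RAGONITE implementation): evidence whose removal changes the answer the most receives the highest contribution.

```python
# Illustrative leave-one-out attribution over retrieved evidences (assumed interfaces).
# `generate(question, evidences)` returns an LLM answer string;
# `similarity(a, b)` returns a score in [0, 1] between two answer strings.
def counterfactual_attribution(question, evidences, generate, similarity):
    original = generate(question, evidences)
    scores = []
    for i in range(len(evidences)):
        ablated = generate(question, evidences[:i] + evidences[i + 1:])
        # Low similarity after removing an evidence => that evidence contributed strongly.
        scores.append(1.0 - similarity(original, ablated))
    return scores
```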
+
+ comment: Accepted at WSDM 2025 +
+
+
+
+
+ + ♻ ☆ From Prejudice to Parity: A New Approach to Debiasing Large Language + Model Word Embeddings COLING 2025 + + +
+ Embeddings play a pivotal role in the efficacy of Large Language Models. They +are the bedrock on which these models grasp contextual relationships and foster +a more nuanced understanding of language and consequently perform remarkably on +a plethora of complex tasks that require a fundamental understanding of human +language. Given that these embeddings themselves often reflect or exhibit bias, +it stands to reason that these models may also inadvertently learn this bias. +In this work, we build on the seminal previous work and propose DeepSoftDebias, +an algorithm that uses a neural network to perform 'soft debiasing'. We +exhaustively evaluate this algorithm across a variety of SOTA datasets, +accuracy metrics, and challenging NLP tasks. We find that DeepSoftDebias +outperforms the current state-of-the-art methods at reducing bias across +gender, race, and religion. + +
+
+ comment: Accepted at COLING 2025 +
+
+
+
+
+ + ♻ ☆ A Vision-Language Foundation Model to Enhance Efficiency of Chest X-ray + Interpretation + + +
+ Over 1.4 billion chest X-rays (CXRs) are performed annually due to their +cost-effectiveness as an initial diagnostic test. This scale of radiological +studies provides a significant opportunity to streamline CXR interpretation and +documentation. While foundation models are a promising solution, the lack of +publicly available large-scale datasets and benchmarks inhibits their iterative +development and real-world evaluation. To overcome these challenges, we +constructed a large-scale dataset (CheXinstruct), which we utilized to train a +vision-language foundation model (CheXagent). We systematically demonstrated +competitive performance across eight distinct task types on our novel +evaluation benchmark (CheXbench). Beyond technical validation, we assessed the +real-world utility of CheXagent in directly drafting radiology reports. Our +clinical assessment with eight radiologists revealed a 36% time saving for +residents using CheXagent-drafted reports, while attending radiologists showed +no significant time difference editing resident-drafted or CheXagent-drafted +reports. The CheXagent-drafted reports improved the writing efficiency of both +radiology residents and attending radiologists in 81% and 61% of cases, +respectively, without loss of quality. Overall, we demonstrate that CheXagent +can effectively perform a variety of CXR interpretation tasks and holds +potential to assist radiologists in routine clinical workflows. + +
+
+ comment: 26 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ UnSeenTimeQA: Time-Sensitive Question-Answering Beyond LLMs' + Memorization + + +
+ This paper introduces UnSeenTimeQA, a novel data contamination-free
+time-sensitive question-answering (TSQA) benchmark. It differs from existing
+TSQA benchmarks by avoiding web-searchable queries grounded in the real world.
+We present a series of time-sensitive event scenarios based on synthetically
+generated facts. The benchmark requires large language models (LLMs) to engage
+in genuine temporal reasoning without depending on the factual knowledge
+acquired during the pre-training phase. We designed three types of
+time-sensitive questions to test LLMs' temporal reasoning abilities over
+sequential and parallel event occurrences. Our evaluation of five LLMs on
+synthetic fact-based TSQA reveals mixed results: while they perform well on
+simpler subsets, their overall performance remains inferior to that on
+real-world fact-based TSQA. Error analysis of LLM-generated reasoning chains
+indicates that LLMs face difficulties in reasoning over long-range event
+dependencies and parallel event timelines that unfold concurrently.
+
+
+
+
+
+ + ♻ ☆ Hypothesis Generation with Large Language Models EMNLP 2024 + + +
+ Effective generation of novel hypotheses is instrumental to scientific
+progress. So far, researchers have been the main powerhouse behind hypothesis
+generation by painstaking data analysis and thinking (also known as the Eureka
+moment). In this paper, we examine the potential of large language models
+(LLMs) to generate hypotheses. We focus on hypothesis generation based on data
+(i.e., labeled examples). To enable LLMs to handle arbitrarily long contexts,
+we generate initial hypotheses from a small number of examples and then update
+them iteratively to improve the quality of hypotheses. Inspired by multi-armed
+bandits, we design a reward function to inform the exploitation-exploration
+tradeoff in the update process. Our algorithm is able to generate hypotheses
+that enable much better predictive performance than few-shot prompting in
+classification tasks, improving accuracy by 31.7% on a synthetic dataset and
+by 13.9%, 3.3%, and 24.9% on three real-world datasets. We also outperform
+supervised learning by 12.8% and 11.2% on two challenging real-world datasets.
+Furthermore, we find that the generated hypotheses not only corroborate
+human-verified theories but also uncover new insights for the tasks.
+
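For illustration only, a UCB-style score of the kind a bandit-inspired exploitation-exploration reward suggests; the paper's actual reward function may differ, and the `correct`/`used` counters are assumed bookkeeping over how often each hypothesis has been applied.

```python
# Hypothetical scoring rule for picking which hypothesis to apply or refine next.
import math

def ucb_score(correct, used, total_rounds, c=1.0):
    """Higher for hypotheses that are accurate (exploit) or rarely tried (explore)."""
    if used == 0:
        return float("inf")      # always try an untested hypothesis first
    return correct / used + c * math.sqrt(math.log(total_rounds) / used)
```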
+
+ comment: 28 pages, 6 figures, code link: + https://github.com/ChicagoHAI/hypothesis_generation. Accepted by the 1st + Workshop on NLP for Science (NLP4Science) at EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Faster Transformer Decoding: N-gram Masked Self-Attention + + +
+ Motivated by the fact that most of the information relevant to the prediction +of target tokens is drawn from the source sentence $S=s_1, \ldots, s_S$, we +propose truncating the target-side window used for computing self-attention by +making an $N$-gram assumption. Experiments on WMT EnDe and EnFr data sets show +that the $N$-gram masked self-attention model loses very little in BLEU score +for $N$ values in the range $4, \ldots, 8$, depending on the task. + +
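A brief sketch of the banded causal mask implied by the $N$-gram assumption: each target position attends only to itself and the previous $N-1$ target tokens. The framework choice (PyTorch) is an assumption for illustration.

```python
# Build an N-gram (banded causal) self-attention mask: True where attention is allowed.
import torch

def ngram_causal_mask(seq_len: int, n: int) -> torch.Tensor:
    i = torch.arange(seq_len).unsqueeze(1)   # query positions
    j = torch.arange(seq_len).unsqueeze(0)   # key positions
    return (j <= i) & (j > i - n)            # causal and within the last n positions

# Example: ngram_causal_mask(6, 3) keeps a band of width 3 along the diagonal.
```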
+
+
+
+
+ + ♻ ☆ 2M-BELEBELE: Highly Multilingual Speech and American Sign Language + Comprehension Dataset + + +
+ We introduce the first highly multilingual speech and American Sign Language
+(ASL) comprehension dataset by extending BELEBELE. Our dataset covers 74 spoken
+languages at the intersection of BELEBELE and FLEURS, and one sign language
+(ASL). We evaluate the 2M-BELEBELE dataset in both 5-shot and zero-shot
+settings; across languages, speech comprehension accuracy is on average ~2-3%
+lower than reading comprehension accuracy.
+
+
+
+
+
+ + ♻ ☆ Representative Social Choice: From Learning Theory to AI Alignment NeurIPS 2024 + + +
+ Social choice theory is the study of preference aggregation across a +population, used both in mechanism design for human agents and in the +democratic alignment of language models. In this study, we propose the +representative social choice framework for the modeling of democratic +representation in collective decisions, where the number of issues and +individuals are too large for mechanisms to consider all preferences directly. +These scenarios are widespread in real-world decision-making processes, such as +jury trials, indirect elections, legislation processes, corporate governance, +and, more recently, language model alignment. In representative social choice, +the population is represented by a finite sample of individual-issue pairs +based on which social choice decisions are made. We show that many of the +deepest questions in representative social choice can be naturally formulated +as statistical learning problems, and prove the generalization properties of +social choice mechanisms using the theory of machine learning. We further +formulate axioms for representative social choice, and prove Arrow-like +impossibility theorems with new combinatorial tools of analysis. Our framework +introduces the representative approach to social choice, opening up research +directions at the intersection of social choice, learning theory, and AI +alignment. + +
+
+ comment: Full version (20 pages). Under review. Received Best Paper Award at + NeurIPS 2024 Pluralistic Alignment Workshop +
+
+
+
+
+ + ♻ ☆ SwitchCIT: Switching for Continual Instruction Tuning + + +
+ Large language models (LLMs) and multimodal models (MMs) have exhibited +impressive capabilities in various domains, particularly in general language +understanding and visual reasoning. However, these models, trained on massive +data, may not be finely optimized for specific tasks triggered by instructions. +Continual instruction tuning is crucial to adapt a large model to evolving +tasks and domains, ensuring their effectiveness and relevance across a wide +range of applications. In the context of continual instruction tuning, where +models are sequentially trained on different tasks, catastrophic forgetting can +occur, leading to performance degradation on previously learned tasks. This +work addresses the catastrophic forgetting in continual instruction learning +through a switching mechanism for routing computations to parameter-efficient +tuned models. We demonstrate the effectiveness of our method through +experiments on continual instruction tuning of different natural language +generation tasks and vision-language tasks. We also showcase the advantages of +our proposed method in terms of efficiency, scalability, portability, and +privacy preservation. + +
+
+
+
+
+ + ♻ ☆ Using Large Language Models for Expert Prior Elicitation in Predictive + Modelling + + +
+ Large language models (LLMs), trained on diverse data effectively acquire a +breadth of information across various domains. However, their computational +complexity, cost, and lack of transparency hinder their direct application for +specialised tasks. In fields such as clinical research, acquiring expert +annotations or prior knowledge about predictive models is often costly and +time-consuming. This study proposes the use of LLMs to elicit expert prior +distributions for predictive models. This approach also provides an alternative +to in-context learning, where language models are tasked with making +predictions directly. In this work, we compare LLM-elicited and uninformative +priors, evaluate whether LLMs truthfully generate parameter distributions, and +propose a model selection strategy for in-context learning and prior +elicitation. Our findings show that LLM-elicited prior parameter distributions +significantly reduce predictive error compared to uninformative priors in +low-data settings. Applied to clinical problems, this translates to fewer +required biological samples, lowering cost and resources. Prior elicitation +also consistently outperforms and proves more reliable than in-context learning +at a lower cost, making it a preferred alternative in our setting. We +demonstrate the utility of this method across various use cases, including +clinical applications. For infection prediction, using LLM-elicited priors +reduced the number of required labels to achieve the same accuracy as an +uninformative prior by 55%, 200 days earlier in the study. + +
+
+
+
+
+ + ♻ ☆ MagicPIG: LSH Sampling for Efficient LLM Generation + + +
+ Large language models (LLMs) with long context windows have gained +significant attention. However, the KV cache, stored to avoid re-computation, +becomes a bottleneck. Various dynamic sparse or TopK-based attention +approximation methods have been proposed to leverage the common insight that +attention is sparse. In this paper, we first show that TopK attention itself +suffers from quality degradation in certain downstream tasks because attention +is not always as sparse as expected. Rather than selecting the keys and values +with the highest attention scores, sampling with theoretical guarantees can +provide a better estimation for attention output. To make the sampling-based +approximation practical in LLM generation, we propose MagicPIG, a heterogeneous +system based on Locality Sensitive Hashing (LSH). MagicPIG significantly +reduces the workload of attention computation while preserving high accuracy +for diverse tasks. MagicPIG stores the LSH hash tables and runs the attention +computation on the CPU, which allows it to serve longer contexts and larger +batch sizes with high approximation accuracy. MagicPIG can improve decoding +throughput by up to $5\times$ across various GPU hardware and achieve 54ms +decoding latency on a single RTX 4090 for Llama-3.1-8B-Instruct model with a +context of 96k tokens. The code is available at +https://github.com/Infini-AI-Lab/MagicPIG. + +
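A rough, assumption-laden illustration of LSH-based key selection using random-hyperplane SimHash, not MagicPIG's actual CPU/GPU system: only keys whose hash codes nearly match the query's code are considered when approximating attention.

```python
# Sketch: group attention keys by SimHash code and sample candidates for a query.
import numpy as np

def simhash_codes(x, planes):
    return (x @ planes.T > 0).astype(np.uint8)            # (n, num_bits) binary codes

def lsh_candidate_keys(query, keys, planes, max_mismatch=1):
    q_code = simhash_codes(query[None, :], planes)[0]
    k_codes = simhash_codes(keys, planes)
    mismatches = (k_codes != q_code).sum(axis=1)
    return np.where(mismatches <= max_mismatch)[0]         # indices of candidate keys

rng = np.random.default_rng(0)
planes = rng.standard_normal((8, 64))                      # 8-bit codes for 64-dim heads
```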
+
+
+
+
+ + ♻ ☆ Montague semantics and modifier consistency measurement in neural + language models + + +
+ This work proposes a novel methodology for measuring compositional behavior +in contemporary language embedding models. Specifically, we focus on adjectival +modifier phenomena in adjective-noun phrases. In recent years, distributional +language representation models have demonstrated great practical success. At +the same time, the need for interpretability has elicited questions on their +intrinsic properties and capabilities. Crucially, distributional models are +often inconsistent when dealing with compositional phenomena in natural +language, which has significant implications for their safety and fairness. +Despite this, most current research on compositionality is directed towards +improving their performance on similarity tasks only. This work takes a +different approach, introducing three novel tests of compositional behavior +inspired by Montague semantics. Our experimental results indicate that current +neural language models do not behave according to the expected linguistic +theories. This indicates that current language models may lack the capability +to capture the semantic properties we evaluated on limited context, or that +linguistic theories from Montagovian tradition may not match the expected +capabilities of distributional models. + +
+
+
+
+
+
+
+
+ + Information Retrieval 28 + +
+
+
+ + ☆ ChainRank-DPO: Chain Rank Direct Preference Optimization for LLM Rankers + + +
+ Large language models (LLMs) have demonstrated remarkable effectiveness in +text reranking through works like RankGPT, leveraging their human-like +reasoning about relevance. However, supervised fine-tuning for ranking often +diminishes these models' general-purpose capabilities, including the crucial +reasoning abilities that make them valuable for ranking. We introduce a novel +approach integrating Chain-of-Thought prompting with an SFT-DPO (Supervised +Fine-Tuning followed by Direct Preference Optimization) pipeline to preserve +these capabilities while improving ranking performance. Our experiments on TREC +2019 and 2020 Deep Learning datasets show that our approach outperforms the +state-of-the-art RankZephyr while maintaining strong performance on the Massive +Multitask Language Understanding (MMLU) benchmark, demonstrating effective +preservation of general-purpose capabilities through thoughtful fine-tuning +strategies. Our code and data will be publicly released upon the acceptance of +the paper. + +
+
+
+
+
+ + ☆ State Space Models are Strong Text Rerankers + + +
+ Transformers dominate NLP and IR; but their inference inefficiencies and +challenges in extrapolating to longer contexts have sparked interest in +alternative model architectures. Among these, state space models (SSMs) like +Mamba offer promising advantages, particularly $O(1)$ time complexity in +inference. Despite their potential, SSMs' effectiveness at text reranking -- a +task requiring fine-grained query-document interaction and long-context +understanding -- remains underexplored. + This study benchmarks SSM-based architectures (specifically, Mamba-1 and +Mamba-2) against transformer-based models across various scales, architectures, +and pre-training objectives, focusing on performance and efficiency in text +reranking tasks. We find that (1) Mamba architectures achieve competitive text +ranking performance, comparable to transformer-based models of similar size; +(2) they are less efficient in training and inference compared to transformers +with flash attention; and (3) Mamba-2 outperforms Mamba-1 in both performance +and efficiency. These results underscore the potential of state space models as +a transformer alternative and highlight areas for improvement in future IR +applications. + +
+
+ comment: The first two authors contributed equally, order decided randomly +
+
+
+
+
+ + ☆ Embedding Cultural Diversity in Prototype-based Recommender Systems + + +
+ Popularity bias in recommender systems can increase cultural +overrepresentation by favoring norms from dominant cultures and marginalizing +underrepresented groups. This issue is critical for platforms offering cultural +products, as they influence consumption patterns and human perceptions. In this +work, we address popularity bias by identifying demographic biases within +prototype-based matrix factorization methods. Using the country of origin as a +proxy for cultural identity, we link this demographic attribute to popularity +bias by refining the embedding space learning process. First, we propose +filtering out irrelevant prototypes to improve representativity. Second, we +introduce a regularization technique to enforce a uniform distribution of +prototypes within the embedding space. Across four datasets, our results +demonstrate a 27\% reduction in the average rank of long-tail items and a 2\% +reduction in the average rank of items from underrepresented countries. +Additionally, our model achieves a 2\% improvement in HitRatio@10 compared to +the state-of-the-art, highlighting that fairness is enhanced without +compromising recommendation quality. Moreover, the distribution of prototypes +leads to more inclusive explanations by better aligning items with diverse +prototypes. + +
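One plausible form of the prototype-uniformity regularization mentioned above, shown as an assumption rather than the paper's exact loss: prototypes on the unit sphere are penalized for clustering together.

```python
# Hypothetical uniformity regularizer over prototype embeddings (assumed form).
import torch
import torch.nn.functional as F

def uniformity_loss(prototypes: torch.Tensor, t: float = 2.0) -> torch.Tensor:
    z = F.normalize(prototypes, dim=-1)                    # (k, d) unit vectors
    sq_dists = torch.cdist(z, z, p=2).pow(2)               # pairwise squared distances
    off_diag = ~torch.eye(len(z), dtype=torch.bool, device=z.device)
    # Lower when prototypes spread out uniformly over the sphere.
    return torch.log(torch.exp(-t * sq_dists[off_diag]).mean())
```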
+
+
+
+
+ + ☆ SAFERec: Self-Attention and Frequency Enriched Model for Next Basket + Recommendation + + +
+ Transformer-based approaches such as BERT4Rec and SASRec demonstrate strong +performance in Next Item Recommendation (NIR) tasks. However, applying these +architectures to Next-Basket Recommendation (NBR) tasks, which often involve +highly repetitive interactions, is challenging due to the vast number of +possible item combinations in a basket. Moreover, frequency-based methods such +as TIFU-KNN and UP-CF still demonstrate strong performance in NBR tasks, +frequently outperforming deep-learning approaches. This paper introduces +SAFERec, a novel algorithm for NBR that enhances transformer-based +architectures from NIR by incorporating item frequency information, +consequently improving their applicability to NBR tasks. Extensive experiments +on multiple datasets show that SAFERec outperforms all other baselines, +specifically achieving an 8\% improvement in Recall@10. + +
+
+
+
+
+ + ☆ Advanced Reasoning and Transformation Engine for Multi-Step Insight + Synthesis in Data Analytics with Large Language Models + + +
+ This paper presents the Advanced Reasoning and Transformation Engine for +Multi-Step Insight Synthesis in Data Analytics (ARTEMIS-DA), a novel framework +designed to augment Large Language Models (LLMs) for solving complex, +multi-step data analytics tasks. ARTEMIS-DA integrates three core components: +the Planner, which dissects complex user queries into structured, sequential +instructions encompassing data preprocessing, transformation, predictive +modeling, and visualization; the Coder, which dynamically generates and +executes Python code to implement these instructions; and the Grapher, which +interprets generated visualizations to derive actionable insights. By +orchestrating the collaboration between these components, ARTEMIS-DA +effectively manages sophisticated analytical workflows involving advanced +reasoning, multi-step transformations, and synthesis across diverse data +modalities. The framework achieves state-of-the-art (SOTA) performance on +benchmarks such as WikiTableQuestions and TabFact, demonstrating its ability to +tackle intricate analytical tasks with precision and adaptability. By combining +the reasoning capabilities of LLMs with automated code generation and execution +and visual analysis, ARTEMIS-DA offers a robust, scalable solution for +multi-step insight synthesis, addressing a wide range of challenges in data +analytics. + +
+
+
+
+
+ + ☆ Adversarial Hubness in Multi-Modal Retrieval + + +
+ Hubness is a phenomenon in high-dimensional vector spaces where a single +point from the natural distribution is unusually close to many other points. +This is a well-known problem in information retrieval that causes some items to +accidentally (and incorrectly) appear relevant to many queries. In this paper, +we investigate how attackers can exploit hubness to turn any image or audio +input in a multi-modal retrieval system into an adversarial hub. Adversarial +hubs can be used to inject universal adversarial content (e.g., spam) that will +be retrieved in response to thousands of different queries, as well as for +targeted attacks on queries related to specific, attacker-chosen concepts. We +present a method for creating adversarial hubs and evaluate the resulting hubs +on benchmark multi-modal retrieval datasets and an image-to-image retrieval +system based on a tutorial from Pinecone, a popular vector database. For +example, in text-caption-to-image retrieval, a single adversarial hub is +retrieved as the top-1 most relevant image for more than 21,000 out of 25,000 +test queries (by contrast, the most common natural hub is the top-1 response to +only 102 queries). We also investigate whether techniques for mitigating +natural hubness are an effective defense against adversarial hubs, and show +that they are not effective against hubs that target queries related to +specific concepts. + +
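A conceptual sketch of how an adversarial hub could be optimized, assuming a differentiable `image_encoder` and a fixed set of query embeddings; the authors' attack details may differ.

```python
# Sketch: perturb one input so its embedding is close to many query embeddings at once.
import torch

def make_hub(image, query_embs, image_encoder, steps=100, lr=1e-2, eps=8 / 255):
    delta = torch.zeros_like(image, requires_grad=True)
    opt = torch.optim.Adam([delta], lr=lr)
    for _ in range(steps):
        emb = image_encoder(image + delta)                 # assumed to return a (d,) embedding
        sims = torch.nn.functional.cosine_similarity(emb[None, :], query_embs, dim=-1)
        loss = -sims.mean()                                # pull the hub toward all queries
        opt.zero_grad()
        loss.backward()
        opt.step()
        with torch.no_grad():
            delta.clamp_(-eps, eps)                        # keep the perturbation small
    return (image + delta).detach()
```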
+
+
+
+
+ + ☆ Transversal PACS Browser API: Addressing Interoperability Challenges in + Medical Imaging Systems + + +
+ Advances in imaging technologies have revolutionised the medical imaging and
+healthcare sectors, leading to the widespread adoption of PACS for the storage,
+retrieval, and communication of medical images. Although these systems have
+improved operational efficiency, significant challenges remain in effectively
+retrieving DICOM images, which are essential for diagnosis and overall patient
+care. Moreover, issues such as fragmented systems, interoperability barriers,
+and complex user interfaces can often prevent healthcare professionals from
+efficiently accessing medical images. Addressing these challenges, the
+Transversal PACS Browser API is a robust and user-friendly solution designed to
+enhance the process of querying and retrieving DICOM images. It offers advanced
+filtering capabilities through a variety of filter options as well as a custom
+field search, allowing users to navigate large medical image collections with
+ease. Additionally, the application provides a unified interface for querying
+and retrieving from multiple PACS stations, addressing the challenges of
+fragmentation and complexity associated with accessing medical images. Other
+key features include the ability to preview images directly within the
+application. All of this contributes to the transversal nature of the API,
+serving not only healthcare providers but anyone who relies on efficient access
+to these resources. To validate the performance and usability of the
+application, comprehensive testing was carried out with stakeholders in the
+field; the results showed general satisfaction, highlighting the API's clean
+design, ease of use, and effective search capabilities, as well as the
+usefulness of previewing images within the application.
+
+
+ comment: 16 pages with 3 figures +
+
+
+
+
+ + ☆ A Cognitive Ideation Support Framework using IBM Watson Services + + +
+ Idea generation is a core activity for innovation in organizations. The
+creativity of the generated ideas depends not only on the knowledge retrieved
+from the organizations' knowledge bases, but also on the external knowledge
+retrieved from other resources. Unfortunately, organizations often cannot
+efficiently utilize the knowledge in the knowledge bases due to the limited
+abilities of the search and retrieval mechanisms, especially when dealing with
+unstructured data. In this paper, we present a new cognitive support framework
+for ideation that uses the IBM Watson DeepQA services. IBM Watson is a Question
+Answering system which mimics human cognitive abilities to retrieve and rank
+information. The proposed framework is based on the Search for Ideas in the
+Associative Memory (SIAM) model to help organizations develop creative ideas
+through discovering new relationships between retrieved data. To evaluate the
+effectiveness of the proposed system, the generated ideas are selected and
+assessed using a set of established creativity criteria.
+
+
+ comment: Twenty-fifth Americas Conference on Information Systems (AMCIS 2019), + Cancun, 2019 +
+
+
+
+
+ + ☆ CRM: Retrieval Model with Controllable Condition + + +
+ Recommendation systems (RecSys) are designed to connect users with relevant
+items from a vast pool of candidates while aligning with the business goals of
+the platform. A typical industrial RecSys is composed of two main stages,
+retrieval and ranking: (1) the retrieval stage searches for hundreds of item
+candidates that satisfy user interests; (2) based on the retrieved items, the
+ranking stage selects the best dozen items by estimating multiple targets for
+each item candidate, including classification and regression targets. Compared
+with the ranking model, the retrieval model lacks item candidate information
+during inference; therefore, retrieval models are often trained with
+classification targets only (e.g., click-through rate) and fail to incorporate
+regression targets (e.g., the expected watch time), which limits the
+effectiveness of retrieval. In this paper, we propose the Controllable
+Retrieval Model (CRM), which integrates regression information as conditional
+features into the two-tower retrieval paradigm. This modification allows the
+retrieval stage to close the target gap with the ranking model, enhancing the
+retrieval model's ability to search for item candidates that satisfy the user's
+interests and conditions effectively. We validate the effectiveness of CRM
+through real-world A/B testing and demonstrate its successful deployment in the
+Kuaishou short-video recommendation system, which serves over 400 million
+users.
+
+
+
+
+
+ + ☆ Maybe you are looking for CroQS: Cross-modal Query Suggestion for + Text-to-Image Retrieval ECIR + + +
+ Query suggestion, a technique widely adopted in information retrieval, +enhances system interactivity and the browsing experience of document +collections. In cross-modal retrieval, many works have focused on retrieving +relevant items from natural language queries, while few have explored query +suggestion solutions. In this work, we address query suggestion in cross-modal +retrieval, introducing a novel task that focuses on suggesting minimal textual +modifications needed to explore visually consistent subsets of the collection, +following the premise of ''Maybe you are looking for''. To facilitate the +evaluation and development of methods, we present a tailored benchmark named +CroQS. This dataset comprises initial queries, grouped result sets, and +human-defined suggested queries for each group. We establish dedicated metrics +to rigorously evaluate the performance of various methods on this task, +measuring representativeness, cluster specificity, and similarity of the +suggested queries to the original ones. Baseline methods from related fields, +such as image captioning and content summarization, are adapted for this task +to provide reference performance scores. Although relatively far from human +performance, our experiments reveal that both LLM-based and captioning-based +methods achieve competitive results on CroQS, improving the recall on cluster +specificity by more than 115% and representativeness mAP by more than 52% with +respect to the initial query. The dataset, the implementation of the baseline +methods and the notebooks containing our experiments are available here: +https://paciosoft.com/CroQS-benchmark/ + +
+
+ comment: 15 pages, 5 figures. To be published as full paper in the Proceedings + of the European Conference on Information Retrieval (ECIR) 2025 +
+
+
+
+
+ + ☆ Heterogeneous Graph Collaborative Filtering WSDM'2025 + + +
+ For modern recommender systems, the use of low-dimensional latent +representations to embed users and items based on their observed interactions +has become commonplace. However, many existing recommendation models are +primarily designed for coarse-grained and homogeneous interactions, which +limits their effectiveness in two critical dimensions. Firstly, these models +fail to leverage the relational dependencies that exist across different types +of user behaviors, such as page views, collects, comments, and purchases. +Secondly, they struggle to capture the fine-grained latent factors that drive +user interaction patterns. To address these limitations, we present a +heterogeneous graph collaborative filtering model MixRec that excels at +disentangling users' multi-behavior interaction patterns and uncovering the +latent intent factors behind each behavior. Our model achieves this by +incorporating intent disentanglement and multi-behavior modeling, facilitated +by a parameterized heterogeneous hypergraph architecture. Furthermore, we +introduce a novel contrastive learning paradigm that adaptively explores the +advantages of self-supervised data augmentation, thereby enhancing the model's +resilience against data sparsity and expressiveness with relation +heterogeneity. To validate the efficacy of MixRec, we conducted extensive +experiments on three public datasets. The results clearly demonstrate its +superior performance, significantly outperforming various state-of-the-art +baselines. Our model is open-sourced and available at: +https://github.com/HKUDS/MixRec. + +
+
+ comment: This paper is accepted by WSDM'2025 +
+
+
+
+
+ + ☆ Semantic Convergence: Harmonizing Recommender Systems via Two-Stage + Alignment and Behavioral Semantic Tokenization AAAI 2025 + + +
+ Large language models (LLMs), endowed with exceptional reasoning
+capabilities, are adept at discerning profound user interests from historical
+behaviors, thereby presenting a promising avenue for the advancement of
+recommendation systems. However, a notable discrepancy persists between the
+sparse collaborative semantics typically found in recommendation systems and
+the dense token representations within LLMs. In our study, we propose a novel
+framework that harmoniously merges traditional recommendation models with the
+prowess of LLMs. We initiate this integration by transforming ItemIDs into
+sequences that align semantically with the LLMs' space, through the proposed
+Alignment Tokenization module. Additionally, we design a series of specialized
+supervised learning tasks aimed at aligning collaborative signals with the
+subtleties of natural language semantics. To ensure practical applicability, we
+optimize online inference by pre-caching the top-K results for each user,
+reducing latency and improving efficiency. Extensive experimental evidence
+indicates that our model markedly improves recall metrics and displays
+remarkable scalability for recommendation systems.
+
+
+ comment: 7 pages, 3 figures, AAAI 2025 +
+
+
+
+
+ + ☆ RAG-RewardBench: Benchmarking Reward Models in Retrieval Augmented + Generation for Preference Alignment + + +
+ Despite the significant progress made by existing retrieval augmented
+language models (RALMs) in providing trustworthy responses and grounding in
+reliable sources, they often overlook effective alignment with human
+preferences. In the alignment process, reward models (RMs) act as a crucial
+proxy for human values to guide optimization. However, it remains unclear how
+to evaluate and select a reliable RM for preference alignment in RALMs. To this
+end, we propose RAG-RewardBench, the first benchmark for evaluating RMs in RAG
+settings. First, we design four crucial and challenging RAG-specific scenarios
+to assess RMs, including multi-hop reasoning, fine-grained citation,
+appropriate abstention, and conflict robustness. Then, we incorporate 18 RAG
+subsets, six retrievers, and 24 RALMs to increase the diversity of data
+sources. Finally, we adopt an LLM-as-a-judge approach to improve preference
+annotation efficiency and effectiveness, exhibiting a strong correlation with
+human annotations. Based on RAG-RewardBench, we conduct a comprehensive
+evaluation of 45 RMs and uncover their limitations in RAG scenarios.
+Additionally, we also reveal that existing trained RALMs show almost no
+improvement in preference alignment, highlighting the need for a shift towards
+preference-aligned training. We release our benchmark and code publicly at
+https://huggingface.co/datasets/jinzhuoran/RAG-RewardBench/ for future work.
+
+
+ comment: 26 pages, 12 figures, 6 tables +
+
+
+
+
+ + ☆ Reverse Region-to-Entity Annotation for Pixel-Level Visual Entity + Linking AAAI 2025 + + +
+ Visual Entity Linking (VEL) is a crucial task for achieving fine-grained
+visual understanding, matching objects within images (visual mentions) to
+entities in a knowledge base. Previous VEL tasks rely on textual inputs, but
+writing queries for complex scenes can be challenging. Visual inputs like
+clicks or bounding boxes offer a more convenient alternative. Therefore, we
+propose a new task, Pixel-Level Visual Entity Linking (PL-VEL), which uses
+pixel masks from visual inputs to refer to objects, supplementing reference
+methods for VEL. To facilitate research on this task, we have constructed the
+MaskOVEN-Wiki dataset through an entirely automatic reverse region-entity
+annotation framework. This dataset contains over 5 million annotations aligning
+pixel-level regions with entity-level labels, which will advance visual
+understanding towards a fine-grained level. Moreover, as pixel masks correspond
+to semantic regions in an image, we enhance previous patch-interacted attention
+with region-interacted attention through a visual semantic tokenization
+approach. Manual evaluation results indicate that the reverse annotation
+framework achieved a 94.8% annotation success rate. Experimental results show
+that models trained on this dataset improved accuracy by 18 points compared to
+zero-shot models. Additionally, the semantic tokenization method achieved a
+5-point accuracy improvement over the trained baseline.
+
+
+ comment: AAAI 2025;Dataset are released at + https://github.com/NP-NET-research/PL-VEL +
+
+
+
+
+ + ☆ Bridging the User-side Knowledge Gap in Knowledge-aware Recommendations + with Large Language Models AAAI 2025 + + +
+ In recent years, knowledge graphs have been integrated into recommender +systems as item-side auxiliary information, enhancing recommendation accuracy. +However, constructing and integrating structural user-side knowledge remains a +significant challenge due to the improper granularity and inherent scarcity of +user-side features. Recent advancements in Large Language Models (LLMs) offer +the potential to bridge this gap by leveraging their human behavior +understanding and extensive real-world knowledge. Nevertheless, integrating +LLM-generated information into recommender systems presents challenges, +including the risk of noisy information and the need for additional knowledge +transfer. In this paper, we propose an LLM-based user-side knowledge inference +method alongside a carefully designed recommendation framework to address these +challenges. Our approach employs LLMs to infer user interests based on +historical behaviors, integrating this user-side information with item-side and +collaborative data to construct a hybrid structure: the Collaborative Interest +Knowledge Graph (CIKG). Furthermore, we propose a CIKG-based recommendation +framework that includes a user interest reconstruction module and a +cross-domain contrastive learning module to mitigate potential noise and +facilitate knowledge transfer. We conduct extensive experiments on three +real-world datasets to validate the effectiveness of our method. Our approach +achieves state-of-the-art performance compared to competitive baselines, +particularly for users with sparse interactions. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ☆ Information-Theoretic Generative Clustering of Documents AAAI 2025 + + +
+ We present {\em generative clustering} (GC) for clustering a set of +documents, $\mathrm{X}$, by using texts $\mathrm{Y}$ generated by large +language models (LLMs) instead of by clustering the original documents +$\mathrm{X}$. Because LLMs provide probability distributions, the similarity +between two documents can be rigorously defined in an information-theoretic +manner by the KL divergence. We also propose a natural, novel clustering +algorithm by using importance sampling. We show that GC achieves the +state-of-the-art performance, outperforming any previous clustering method +often by a large margin. Furthermore, we show an application to generative +document retrieval in which documents are indexed via hierarchical clustering +and our method improves the retrieval accuracy. + +
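A minimal sketch of the information-theoretic document distance described above: with texts sampled from the LLM conditioned on one document, the KL divergence to another document's conditional distribution can be estimated by Monte Carlo. The `logp_y_given_x` scorer and the sampling procedure are assumed interfaces, not the paper's released code.

```python
# Monte Carlo estimate of KL(p(.|doc_a) || p(.|doc_b)) from texts sampled given doc_a.
import numpy as np

def kl_divergence_estimate(doc_a, doc_b, samples, logp_y_given_x):
    """`samples` are texts y ~ p(.|doc_a); `logp_y_given_x(doc, y)` returns log p(y|doc)."""
    log_ratios = [logp_y_given_x(doc_a, y) - logp_y_given_x(doc_b, y) for y in samples]
    return float(np.mean(log_ratios))   # E_{y~p_a}[log p_a(y) - log p_b(y)]
```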
+
+ comment: Accepted to AAAI 2025 +
+
+
+
+
+ + ☆ Large Language Model Enhanced Recommender Systems: Taxonomy, Trend, + Application and Future + + +
+ Large Language Models (LLMs) have transformative potential in various domains,
+including recommender systems (RS). There has been a handful of research
+focusing on empowering RS with LLMs. However, previous efforts mainly focus on
+LLMs as RS, which may face the challenge of intolerable inference costs imposed
+by the LLM. Recently, the integration of LLMs into RS, known as LLM-Enhanced
+Recommender Systems (LLMERS), has garnered significant interest due to its
+potential to address latency and memory constraints in real-world applications.
+This paper presents a comprehensive survey of the latest research efforts aimed
+at leveraging LLMs to enhance RS capabilities. We identify a critical shift in
+the field with the move towards incorporating LLMs into the online system,
+notably by avoiding their use during inference. Our survey categorizes the
+existing LLMERS approaches into three primary types based on the component of
+the RS model being augmented: Knowledge Enhancement, Interaction Enhancement,
+and Model Enhancement. We provide an in-depth analysis of each category,
+discussing the methodologies, challenges, and contributions of recent studies.
+Furthermore, we highlight several promising research directions that could
+further advance the field of LLMERS.
+
+
+
+
+
+ + ☆ Lightweight yet Fine-grained: A Graph Capsule Convolutional Network with + Subspace Alignment for Shared-account Sequential Recommendation AAAI-2025 + + +
+ Shared-account Sequential Recommendation (SSR) aims to provide personalized +recommendations for accounts shared by multiple users with varying sequential +preferences. Previous studies on SSR struggle to capture the fine-grained +associations between interactions and different latent users within the shared +account's hybrid sequences. Moreover, most existing SSR methods (e.g., +RNN-based or GCN-based methods) have quadratic computational complexities, +hindering the deployment of SSRs on resource-constrained devices. To this end, +we propose a Lightweight Graph Capsule Convolutional Network with subspace +alignment for shared-account sequential recommendation, named LightGC$^2$N. +Specifically, we devise a lightweight graph capsule convolutional network. It +facilitates the fine-grained matching between interactions and latent users by +attentively propagating messages on the capsule graphs. Besides, we present an +efficient subspace alignment method. This method refines the sequence +representations and then aligns them with the finely clustered preferences of +latent users. The experimental results on four real-world datasets indicate +that LightGC$^2$N outperforms nine state-of-the-art methods in accuracy and +efficiency. + +
+
+ comment: 11 pages, 6 figures, accepted by AAAI-2025 conference +
+
+
+
+
+ + ♻ ☆ Evidence Contextualization and Counterfactual Attribution for + Conversational QA over Heterogeneous Data with RAG Systems WSDM 2025 + + +
+ Retrieval Augmented Generation (RAG) works as a backbone for interacting with +an enterprise's own data via Conversational Question Answering (ConvQA). In a +RAG system, a retriever fetches passages from a collection in response to a +question, which are then included in the prompt of a large language model (LLM) +for generating a natural language (NL) answer. However, several RAG systems +today suffer from two shortcomings: (i) retrieved passages usually contain +their raw text and lack appropriate document context, negatively impacting both +retrieval and answering quality; and (ii) attribution strategies that explain +answer generation usually rely only on similarity between the answer and the +retrieved passages, thereby only generating plausible but not causal +explanations. In this work, we demonstrate RAGONITE, a RAG system that remedies +the above concerns by: (i) contextualizing evidence with source metadata and +surrounding text; and (ii) computing counterfactual attribution, a causal +explanation approach where the contribution of an evidence to an answer is +determined by the similarity of the original response to the answer obtained by +removing that evidence. To evaluate our proposals, we release a new benchmark +ConfQuestions, with 300 hand-created conversational questions, each in English +and German, coupled with ground truth URLs, completed questions, and answers +from 215 public Confluence pages, that are typical of enterprise wiki spaces +with heterogeneous elements. Experiments with RAGONITE on ConfQuestions show +the viability of our ideas: contextualization improves RAG performance, and +counterfactual attribution is effective at explaining RAG answers. + +
+
+ comment: Accepted at WSDM 2025 +
+
+
+
+
+ + ♻ ☆ C-FedRAG: A Confidential Federated Retrieval-Augmented Generation System + + +
+ Organizations seeking to utilize Large Language Models (LLMs) for knowledge +querying and analysis often encounter challenges in maintaining an LLM +fine-tuned on targeted, up-to-date information that keeps answers relevant and +grounded. Retrieval Augmented Generation (RAG) has quickly become a feasible +solution for organizations looking to overcome the challenges of maintaining +proprietary models and to help reduce LLM hallucinations in their query +responses. However, RAG comes with its own issues regarding scaling data +pipelines across tiered-access and disparate data sources. In many scenarios, +it is necessary to query beyond a single data silo to provide richer and more +relevant context for an LLM. Analyzing data sources within and across +organizational trust boundaries is often limited by complex data-sharing +policies that prohibit centralized data storage, therefore, inhibit the fast +and effective setup and scaling of RAG solutions. In this paper, we introduce +Confidential Computing (CC) techniques as a solution for secure Federated +Retrieval Augmented Generation (FedRAG). Our proposed Confidential FedRAG +system (C-FedRAG) enables secure connection and scaling of a RAG workflows +across a decentralized network of data providers by ensuring context +confidentiality. We also demonstrate how to implement a C-FedRAG system using +the NVIDIA FLARE SDK and assess its performance using the MedRAG toolkit and +MIRAGE benchmarking dataset. + +
+
+
+
+
+ + ♻ ☆ Methods to Assess the UK Government's Current Role as a Data Provider + for AI + + +
+ Governments typically collect and steward a vast amount of high-quality data
+on their citizens and institutions, and the UK government is exploring how it
+can better publish and provision this data to the benefit of the AI landscape.
+However, the compositions of generative AI training corpora remain closely
+guarded secrets, making the planning of data sharing initiatives difficult. To
+address this, we devise two methods to assess UK government data usage for the
+training of Large Language Models (LLMs) and 'peek behind the curtain' in order
+to observe the UK government's current contributions as a data provider for AI.
+The first method, an ablation study that utilises LLM 'unlearning', seeks to
+examine the importance of the information held on UK government websites for
+LLMs and their performance in citizen query tasks. The second method, an
+information leakage study, seeks to ascertain whether LLMs are aware of the
+information held in the datasets published on the UK government's open data
+initiative data.gov.uk. Our findings indicate that UK government websites are
+important data sources for AI (heterogeneously across subject matters) while
+data.gov.uk is not. This paper serves as a technical report, explaining
+in-depth the designs, mechanics, and limitations of the above experiments. It
+is accompanied by a complementary non-technical report on the ODI website in
+which we summarise the experiments and key findings, interpret them, and build
+a set of actionable recommendations for the UK government to take forward as it
+seeks to design AI policy. While we focus on UK open government data, we
+believe that the methods introduced in this paper present a reproducible
+approach to tackle the opaqueness of AI training corpora and provide
+organisations a framework to evaluate and maximize their contributions to AI
+development.
+
+
+ comment: 17 pages, 5 figures; v2 - incorporated editor feedback; for the + accompanying, non-technical ODI report see + https://theodi.org/insights/reports/the-uk-government-as-a-data-provider-for-ai +
+
+
+
+
+ + ♻ ☆ EXIT: Context-Aware Extractive Compression for Enhancing + Retrieval-Augmented Generation + + +
+ We introduce EXIT, an extractive context compression framework that enhances +both the effectiveness and efficiency of retrieval-augmented generation (RAG) +in question answering (QA). Current RAG systems often struggle when retrieval +models fail to rank the most relevant documents, leading to the inclusion of +more context at the expense of latency and accuracy. While abstractive +compression methods can drastically reduce token counts, their token-by-token +generation process significantly increases end-to-end latency. Conversely, +existing extractive methods reduce latency but rely on independent, +non-adaptive sentence selection, failing to fully utilize contextual +information. EXIT addresses these limitations by classifying sentences from +retrieved documents - while preserving their contextual dependencies - enabling +parallelizable, context-aware extraction that adapts to query complexity and +retrieval quality. Our evaluations on both single-hop and multi-hop QA tasks +show that EXIT consistently surpasses existing compression methods and even +uncompressed baselines in QA accuracy, while also delivering substantial +reductions in inference time and token count. By improving both effectiveness +and efficiency, EXIT provides a promising direction for developing scalable, +high-quality QA solutions in RAG pipelines. Our code is available at +https://github.com/ThisIsHwang/EXIT + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ DELRec: Distilling Sequential Pattern to Enhance LLMs-based Sequential + Recommendation + + +
+ Sequential recommendation (SR) tasks aim to predict users' next interaction +by learning their behavior sequence and capturing the connection between users' +past interactions and their changing preferences. Conventional SR models often +focus solely on capturing sequential patterns within the training data, +neglecting the broader context and semantic information embedded in item titles +from external sources. This limits their predictive power and adaptability. +Large language models (LLMs) have recently shown promise in SR tasks due to +their advanced understanding capabilities and strong generalization abilities. +Researchers have attempted to enhance LLMs-based recommendation performance by +incorporating information from conventional SR models. However, previous +approaches have encountered problems such as 1) limited textual information +leading to poor recommendation performance, 2) incomplete understanding and +utilization of conventional SR model information by LLMs, and 3) excessive +complexity and low interpretability of LLMs-based methods. To improve the +performance of LLMs-based SR, we propose a novel framework, Distilling +Sequential Pattern to Enhance LLMs-based Sequential Recommendation (DELRec), +which aims to extract knowledge from conventional SR models and enable LLMs to +easily comprehend and utilize the extracted knowledge for more effective SRs. +DELRec consists of two main stages: 1) Distill Pattern from Conventional SR +Models, focusing on extracting behavioral patterns exhibited by conventional SR +models using soft prompts through two well-designed strategies; 2) LLMs-based +Sequential Recommendation, aiming to fine-tune LLMs to effectively use the +distilled auxiliary information to perform SR tasks. Extensive experimental +results conducted on four real datasets validate the effectiveness of the +DELRec framework. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ AIR-Bench: Automated Heterogeneous Information Retrieval Benchmark + + +
+ Evaluation plays a crucial role in the advancement of information retrieval +(IR) models. However, current benchmarks, which are based on predefined domains +and human-labeled data, face limitations in addressing evaluation needs for +emerging domains both cost-effectively and efficiently. To address this +challenge, we propose the Automated Heterogeneous Information Retrieval +Benchmark (AIR-Bench). AIR-Bench is distinguished by three key features: 1) +Automated. The testing data in AIR-Bench is automatically generated by large +language models (LLMs) without human intervention. 2) Heterogeneous. The +testing data in AIR-Bench is generated with respect to diverse tasks, domains +and languages. 3) Dynamic. The domains and languages covered by AIR-Bench are +constantly augmented to provide an increasingly comprehensive evaluation +benchmark for community developers. We develop a reliable and robust data +generation pipeline to automatically create diverse and high-quality evaluation +datasets based on real-world corpora. Our findings demonstrate that the +generated testing data in AIR-Bench aligns well with human-labeled testing +data, making AIR-Bench a dependable benchmark for evaluating IR models. The +resources in AIR-Bench are publicly available at +https://github.com/AIR-Bench/AIR-Bench. + +
+
+ comment: 31 pages, 6 figures; Update Table 5 +
+
+
+
+
+ + ♻ ☆ AdaCQR: Enhancing Query Reformulation for Conversational Search via + Sparse and Dense Retrieval Alignment COLING 2025 + + +
+ Conversational Query Reformulation (CQR) has significantly advanced in
+addressing the challenges of conversational search, particularly those stemming
+from the latent user intent and the need for historical context. Recent works
+aimed to boost the performance of CQR through alignment. However, they are
+designed for one specific retrieval system, which potentially results in poor
+generalization. To overcome this limitation, we present a novel framework,
+AdaCQR. By aligning reformulation models with both term-based and
+semantic-based retrieval systems, AdaCQR enhances the generalizability of
+information-seeking queries across diverse retrieval environments through a
+dual-phase training strategy. We also developed two effective approaches for
+acquiring superior labels and diverse input candidates, boosting the efficiency
+and robustness of the framework. Experimental evaluations on the TopiOCQA and
+QReCC datasets demonstrate that AdaCQR significantly outperforms existing
+methods, offering both quantitative and qualitative improvements in
+conversational query reformulation.
+
+
+ comment: Accepted by COLING 2025 +
+
+
+
+
+ + ♻ ☆ Boosting Long-Context Management via Query-Guided Activation Refilling + + +
+ Processing long contexts poses a significant challenge for large language +models (LLMs) due to their inherent context-window limitations and the +computational burden of extensive key-value (KV) activations, which severely +impact efficiency. For information-seeking tasks, full context perception is +often unnecessary, as a query's information needs can dynamically range from +localized details to a global perspective, depending on its complexity. +However, existing methods struggle to adapt effectively to these dynamic +information needs. + In the paper, we propose a method for processing long-context +information-seeking tasks via query-guided Activation Refilling (ACRE). ACRE +constructs a Bi-layer KV Cache for long contexts, where the layer-1 (L1) cache +compactly captures global information, and the layer-2 (L2) cache provides +detailed and localized information. ACRE establishes a proxying relationship +between the two caches, allowing the input query to attend to the L1 cache and +dynamically refill it with relevant entries from the L2 cache. This mechanism +integrates global understanding with query-specific local details, thus +improving answer decoding. Experiments on a variety of long-context +information-seeking datasets demonstrate ACRE's effectiveness, achieving +improvements in both performance and efficiency. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ Augmenting Sequential Recommendation with Balanced Relevance and + Diversity AAAI 2025 + + +
+ By generating new yet effective data, data augmentation has become a +promising method to mitigate the data sparsity problem in sequential +recommendation. Existing works focus on augmenting the original data but rarely +explore the issue of imbalanced relevance and diversity for augmented data, +leading to semantic drift problems or limited performance improvements. In this +paper, we propose a novel Balanced data Augmentation Plugin for Sequential +Recommendation (BASRec) to generate data that balance relevance and diversity. +BASRec consists of two modules: Single-sequence Augmentation and Cross-sequence +Augmentation. The former leverages the randomness of the heuristic operators to +generate diverse sequences for a single user, after which the diverse and the +original sequences are fused at the representation level to obtain relevance. +Further, we devise a reweighting strategy to enable the model to learn the +preferences based on the two properties adaptively. The Cross-sequence +Augmentation performs nonlinear mixing between different sequence +representations from two directions. It produces virtual sequence +representations that are diverse enough but retain the vital semantics of the +original sequences. These two modules enhance the model to discover +fine-grained preferences knowledge from single-user and cross-user +perspectives. Extensive experiments verify the effectiveness of BASRec. The +average improvement is up to 72.0% on GRU4Rec, 33.8% on SASRec, and 68.5% on +FMLP-Rec. We demonstrate that BASRec generates data with a better balance +between relevance and diversity than existing methods. The source code is +available at https://github.com/KingGugu/BASRec. + +
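For intuition only, a mixup-style blend of two users' sequence representations; the "nonlinear mixing" operator in the abstract is not specified here, so this linear interpolation is merely a stand-in assumption for how virtual cross-sequence representations could be formed.

```python
# Hypothetical cross-sequence representation mixing (assumed, simplified form).
import torch

def mix_sequence_reprs(repr_a: torch.Tensor, repr_b: torch.Tensor, alpha: float = 0.2):
    lam = torch.distributions.Beta(alpha, alpha).sample()   # mixing weight in (0, 1)
    return lam * repr_a + (1.0 - lam) * repr_b               # virtual representation
```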
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ♻ ☆ One for Dozens: Adaptive REcommendation for All Domains with + Counterfactual Augmentation AAAI 2025 + + +
+ Multi-domain recommendation (MDR) aims to enhance recommendation performance +across various domains. However, real-world recommender systems in online +platforms often need to handle dozens or even hundreds of domains, far +exceeding the capabilities of traditional MDR algorithms, which typically focus +on fewer than five domains. Key challenges include a substantial increase in +parameter count, high maintenance costs, and intricate knowledge transfer +patterns across domains. Furthermore, minor domains often suffer from data +sparsity, leading to inadequate training in classical methods. To address these +issues, we propose Adaptive REcommendation for All Domains with counterfactual +augmentation (AREAD). AREAD employs a hierarchical structure with a limited +number of expert networks at several layers, to effectively capture domain +knowledge at different granularities. To adaptively capture the knowledge +transfer pattern across domains, we generate and iteratively prune a +hierarchical expert network selection mask for each domain during training. +Additionally, counterfactual assumptions are used to augment data in minor +domains, supporting their iterative mask pruning. Our experiments on two public +datasets, each encompassing over twenty domains, demonstrate AREAD's +effectiveness, especially in data-sparse domains. Source code is available at +https://github.com/Chrissie-Law/AREAD-Multi-Domain-Recommendation. + +
+
+ comment: Extended version accepted by AAAI 2025 +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ AKiRa: Augmentation Kit on Rays for optical video generation + + +
+ Recent advances in text-conditioned video diffusion have greatly improved +video quality. However, these methods offer limited or sometimes no control to +users on camera aspects, including dynamic camera motion, zoom, distorted lens +and focus shifts. These motion and optical aspects are crucial for adding +controllability and cinematic elements to generation frameworks, ultimately +resulting in visual content that draws focus, enhances mood, and guides +emotions according to filmmakers' controls. In this paper, we aim to close the +gap between controllable video generation and camera optics. To achieve this, +we propose AKiRa (Augmentation Kit on Rays), a novel augmentation framework +that builds and trains a camera adapter with a complex camera model over an +existing video generation backbone. It enables fine-tuned control over camera +motion as well as complex optical parameters (focal length, distortion, +aperture) to achieve cinematic effects such as zoom, fisheye effect, and bokeh. +Extensive experiments demonstrate AKiRa's effectiveness in combining and +composing camera optics while outperforming all state-of-the-art methods. This +work sets a new landmark in controlled and optically enhanced video generation, +paving the way for future optical video generation methods. + +
+
+
+
+
+ + ☆ A Review of Multimodal Explainable Artificial Intelligence: Past, + Present and Future + + +
+ Artificial intelligence (AI) has rapidly developed through advancements in +computational power and the growth of massive datasets. However, this progress +has also heightened challenges in interpreting the "black-box" nature of AI +models. To address these concerns, eXplainable AI (XAI) has emerged with a +focus on transparency and interpretability to enhance human understanding and +trust in AI decision-making processes. In the context of multimodal data fusion +and complex reasoning scenarios, the proposal of Multimodal eXplainable AI +(MXAI) integrates multiple modalities for prediction and explanation tasks. +Meanwhile, the advent of Large Language Models (LLMs) has led to remarkable +breakthroughs in natural language processing, yet their complexity has further +exacerbated the issue of MXAI. To gain key insights into the development of +MXAI methods and provide crucial guidance for building more transparent, fair, +and trustworthy AI systems, we review the MXAI methods from a historical +perspective and categorize them across four eras: traditional machine learning, +deep learning, discriminative foundation models, and generative LLMs. We also +review evaluation metrics and datasets used in MXAI research, concluding with a +discussion of future challenges and directions. A project related to this +review has been created at https://github.com/ShilinSun/mxai_review. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ SurgSora: Decoupled RGBD-Flow Diffusion Model for Controllable Surgical + Video Generation + + +
+ Medical video generation has transformative potential for enhancing surgical +understanding and pathology insights through precise and controllable visual +representations. However, current models face limitations in controllability +and authenticity. To bridge this gap, we propose SurgSora, a +motion-controllable surgical video generation framework that uses a single +input frame and user-controllable motion cues. SurgSora consists of three key +modules: the Dual Semantic Injector (DSI), which extracts object-relevant RGB +and depth features from the input frame and integrates them with segmentation +cues to capture detailed spatial features of complex anatomical structures; the +Decoupled Flow Mapper (DFM), which fuses optical flow with semantic-RGB-D +features at multiple scales to enhance temporal understanding and object +spatial dynamics; and the Trajectory Controller (TC), which allows users to +specify motion directions and estimates sparse optical flow, guiding the video +generation process. The fused features are used as conditions for a frozen +Stable Diffusion model to produce realistic, temporally coherent surgical +videos. Extensive evaluations demonstrate that SurgSora outperforms +state-of-the-art methods in controllability and authenticity, showing its +potential to advance surgical video generation for medical education, training, +and research. + +
+
+
+
+
+ + ☆ Real-Time Position-Aware View Synthesis from Single-View Input + + +
+ Recent advancements in view synthesis have significantly enhanced immersive +experiences across various computer graphics and multimedia applications, +including telepresence, and entertainment. By enabling the generation of new +perspectives from a single input view, view synthesis allows users to better +perceive and interact with their environment. However, many state-of-the-art +methods, while achieving high visual quality, face limitations in real-time +performance, which makes them less suitable for live applications where low +latency is critical. In this paper, we present a lightweight, position-aware +network designed for real-time view synthesis from a single input image and a +target camera pose. The proposed framework consists of a Position Aware +Embedding, modeled with a multi-layer perceptron, which efficiently maps +positional information from the target pose to generate high dimensional +feature maps. These feature maps, along with the input image, are fed into a +Rendering Network that merges features from dual encoder branches to resolve +both high level semantics and low level details, producing a realistic new view +of the scene. Experimental results demonstrate that our method achieves +superior efficiency and visual quality compared to existing approaches, +particularly in handling complex translational movements without explicit +geometric operations like warping. This work marks a step toward enabling +real-time view synthesis from a single image for live and interactive +applications. + +
+
+
+
+
+ + ☆ User-Generated Content and Editors in Games: A Comprehensive Survey + + +
+ User-Generated Content (UGC) refers to any form of content, such as posts and
+images, created by users rather than by professionals. In recent years, UGC has
+become an essential part of the evolving video game industry, influencing both
+game culture and community dynamics. The ability for users to actively
+contribute to the games they engage with has shifted the landscape of gaming
+from a one-directional entertainment experience into a collaborative,
+user-driven ecosystem. This growing trend therefore highlights the urgent need
+to summarize current UGC development in the game industry. Our conference
+paper systematically classified the existing UGC in games and UGC editors
+separately into four types. However, the previous survey lacks the depth and
+precision necessary to capture the wide-ranging and increasingly complex nature
+of UGC. To this end, as an extension of previous work, this paper presents a
+refined and expanded classification of UGC and UGC editors within video games,
+offering a more robust and comprehensive framework with representative cases
+that better reflects the diversity and nuances of contemporary user-generated
+contributions. Moreover, we provide our insights on the future of UGC,
+involving game culture, game genres and user creative tendencies, artificial
+intelligence, potential ethical considerations, and the relationship between
+games, users, and communities.
+
+
+
+
+
+ + ☆ Reverse Region-to-Entity Annotation for Pixel-Level Visual Entity + Linking AAAI 2025 + + +
+ Visual Entity Linking (VEL) is a crucial task for achieving fine-grained
+visual understanding, matching objects within images (visual mentions) to
+entities in a knowledge base. Previous VEL tasks rely on textual inputs, but
+writing queries for complex scenes can be challenging. Visual inputs like
+clicks or bounding boxes offer a more convenient alternative. Therefore, we
+propose a new task, Pixel-Level Visual Entity Linking (PL-VEL), which uses
+pixel masks from visual inputs to refer to objects, supplementing reference
+methods for VEL. To facilitate research on this task, we have constructed the
+MaskOVEN-Wiki dataset through an entirely automatic reverse region-entity
+annotation framework. This dataset contains over 5 million annotations aligning
+pixel-level regions with entity-level labels, advancing visual understanding
+toward finer granularity. Moreover, as pixel masks correspond to semantic
+regions in an image, we enhance previous patch-interacted attention with
+region-interacted attention via a visual semantic tokenization approach.
+Manual evaluation results indicate that the reverse annotation framework
+achieved a 94.8% annotation success rate. Experimental results show that models
+trained on this dataset improved accuracy by 18 points compared to zero-shot
+models. Additionally, the semantic tokenization method achieved a 5-point
+accuracy improvement over the trained baseline.
+
+
+ comment: AAAI 2025;Dataset are released at + https://github.com/NP-NET-research/PL-VEL +
+
+
+
+
+ + ☆ SAVGBench: Benchmarking Spatially Aligned Audio-Video Generation + + +
+ This work addresses the lack of multimodal generative models capable of
+producing high-quality videos with spatially aligned audio. While recent
+advancements in generative models have been successful in video generation,
+they often overlook the spatial alignment between audio and visuals, which is
+essential for immersive experiences. To tackle this problem, we establish a new
+research direction in benchmarking Spatially Aligned Audio-Video Generation
+(SAVG). We propose three key components for the benchmark: dataset, baseline,
+and metrics. We introduce a spatially aligned audio-visual dataset, derived
+from an audio-visual dataset consisting of multichannel audio, video, and
+spatiotemporal annotations of sound events. We propose a baseline audio-visual
+diffusion model focused on stereo audio-visual joint learning to accommodate
+spatial sound. Finally, we present metrics to evaluate video and spatial audio
+quality, including a new spatial audio-visual alignment metric. Our
+experimental results demonstrate that gaps exist between the baseline model and
+the ground truth in terms of video and audio quality, as well as spatial
+alignment between the two modalities.
+
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 22 + +
+
+
+ + ☆ JudgeBlender: Ensembling Judgments for Automatic Relevance Assessment + + +
+ The effective training and evaluation of retrieval systems require a
+substantial amount of relevance judgments, which are traditionally collected
+from human assessors -- a process that is both costly and time-consuming. Large
+Language Models (LLMs) have shown promise in generating relevance labels for
+search tasks, offering a potential alternative to manual assessments. Current
+approaches often rely on a single LLM, such as GPT-4, which, despite being
+effective, is expensive and prone to intra-model biases that can favour
+systems leveraging similar models. In this work, we introduce JudgeBlender, a
+framework that employs smaller, open-source models to provide relevance
+judgments by combining evaluations across multiple LLMs (LLMBlender) or
+multiple prompts (PromptBlender). By leveraging the LLMJudge benchmark [18], we
+compare JudgeBlender with state-of-the-art methods and the top performers in
+the LLMJudge challenge. Our results show that JudgeBlender achieves competitive
+performance, demonstrating that very large models are often unnecessary for
+reliable relevance assessments.
+
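+ A minimal sketch of blending relevance judgments from several small judges
+(models or prompts); the grading scale, the callables standing in for prompted
+LLMs, and the aggregation rules are illustrative assumptions, not the paper's
+exact design.
+
+```python
+from collections import Counter
+from statistics import mean
+
+def blend_judgments(query, passage, judges, strategy="mean"):
+    """Collect a graded relevance label (e.g. 0-3) from each judge and
+    aggregate them; `judges` are callables standing in for prompted LLMs."""
+    labels = [judge(query, passage) for judge in judges]
+    if strategy == "mean":
+        return mean(labels)
+    # Otherwise: majority vote, breaking ties by the higher grade.
+    counts = Counter(labels)
+    top = max(counts.values())
+    return max(label for label, c in counts.items() if c == top)
+
+# Toy judges emulating three small open-source models (or three prompts).
+judges = [lambda q, p: 2, lambda q, p: 3, lambda q, p: 2]
+print(blend_judgments("what is bm25", "BM25 is a ranking function...", judges))
+print(blend_judgments("what is bm25", "BM25 is a ranking function...", judges,
+                      strategy="vote"))
+```
+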
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Re-calibrating methodologies in social media research: Challenge the + visual, work with Speech + + +
+ This article methodologically reflects on how social media scholars can
+effectively engage with speech-based data in their analyses. While contemporary
+media studies have embraced textual, visual, and relational data, the aural
+dimension has remained comparatively under-explored. Building on the notion of
+secondary orality and a rejection of purely visual culture, the paper argues
+that considering voice and speech at scale enriches our understanding of
+multimodal digital content. The paper presents the TikTok Subtitles Toolkit,
+which offers accessible speech processing readily compatible with existing
+workflows. In doing so, it opens new avenues for large-scale inquiries that
+blend quantitative insights with qualitative precision. Two illustrative cases
+highlight both opportunities and limitations of speech research: while genres
+like #storytime on TikTok benefit from the exploration of spoken narratives,
+nonverbal or music-driven content may not yield significant insights using
+speech data. The article encourages researchers to integrate aural exploration
+thoughtfully to complement existing methods, rather than replacing them. I
+conclude that expanding our methodological repertoire enables richer
+interpretations of platformised content and strengthens our capacity to unpack
+digital cultures as they become increasingly multimodal.
+
+
+ comment: 11 pages (excluding references), 3 figures +
+
+
+
+
+ + ☆ CLASP: Contrastive Language-Speech Pretraining for Multilingual + Multimodal Information Retrieval ECIR 2025 + + +
+ This study introduces CLASP (Contrastive Language-Speech Pretraining), a +multilingual, multimodal representation tailored for audio-text information +retrieval. CLASP leverages the synergy between spoken content and textual data. +During training, we utilize our newly introduced speech-text dataset, which +encompasses 15 diverse categories ranging from fiction to religion. CLASP's +audio component integrates audio spectrograms with a pre-trained +self-supervised speech model, while its language encoding counterpart employs a +sentence encoder pre-trained on over 100 languages. This unified lightweight +model bridges the gap between various modalities and languages, enhancing its +effectiveness in handling and retrieving multilingual and multimodal data. Our +evaluations across multiple languages demonstrate that CLASP establishes new +benchmarks in HITS@1, MRR, and meanR metrics, outperforming traditional +ASR-based retrieval approaches in specific scenarios. + +
+
+ comment: accepted at ECIR 2025 +
+
+
+
+
+ + ☆ Enabling Low-Resource Language Retrieval: Establishing Baselines for + Urdu MS MARCO ECIR 2025 + + +
+ As the Information Retrieval (IR) field increasingly recognizes the +importance of inclusivity, addressing the needs of low-resource languages +remains a significant challenge. This paper introduces the first large-scale +Urdu IR dataset, created by translating the MS MARCO dataset through machine +translation. We establish baseline results through zero-shot learning for IR in +Urdu and subsequently apply the mMARCO multilingual IR methodology to this +newly translated dataset. Our findings demonstrate that the fine-tuned model +(Urdu-mT5-mMARCO) achieves a Mean Reciprocal Rank (MRR@10) of 0.247 and a +Recall@10 of 0.439, representing significant improvements over zero-shot +results and showing the potential for expanding IR access for Urdu speakers. By +bridging access gaps for speakers of low-resource languages, this work not only +advances multilingual IR research but also emphasizes the ethical and societal +importance of inclusive IR technologies. This work provides valuable insights +into the challenges and solutions for improving language representation and +lays the groundwork for future research, especially in South Asian languages, +which can benefit from the adaptable methods used in this study. + +
+
+ comment: 6 pages, ECIR 2025, conference submission version +
+
+
+
+
+ + ☆ Cluster-guided Contrastive Class-imbalanced Graph Classification AAAI + + +
+ This paper studies the problem of class-imbalanced graph classification,
+which aims at effectively classifying the categories of graphs in scenarios
+with imbalanced class distribution. Despite the tremendous success of graph
+neural networks (GNNs), their modeling ability for imbalanced graph-structured
+data is inadequate, which typically leads to predictions biased towards the
+majority classes. Besides, existing class-imbalanced learning methods in
+vision may overlook the rich graph semantic substructures of the majority
+classes and excessively emphasize learning from the minority classes. To tackle
+this issue, this paper proposes a simple yet powerful approach called C$^3$GNN
+that incorporates the idea of clustering into contrastive learning to enhance
+class-imbalanced graph classification. Technically, C$^3$GNN clusters graphs
+from each majority class into multiple subclasses, ensuring they have similar
+sizes to the minority class, thus alleviating class imbalance. Additionally, it
+utilizes the Mixup technique to synthesize new samples and enrich the semantic
+information of each subclass, and leverages supervised contrastive learning to
+hierarchically learn effective graph representations. In this way, we can not
+only sufficiently explore the semantic substructures within the majority class
+but also effectively alleviate excessive focus on the minority class. Extensive
+experiments on real-world graph benchmark datasets verify the superior
+performance of our proposed method.
+
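+ A minimal sketch of the clustering step described above: graph-level
+embeddings of a majority class are split into subclasses whose expected size
+roughly matches the minority class. The use of k-means and the embedding shapes
+are illustrative assumptions.
+
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+
+def split_majority_class(graph_embeddings, minority_size, seed=0):
+    """Cluster graph-level embeddings of a majority class into subclasses
+    whose expected size matches the minority class, easing the imbalance."""
+    n = len(graph_embeddings)
+    n_subclasses = max(1, round(n / minority_size))
+    km = KMeans(n_clusters=n_subclasses, n_init=10, random_state=seed)
+    subclass_ids = km.fit_predict(graph_embeddings)
+    return subclass_ids, n_subclasses
+
+rng = np.random.default_rng(0)
+majority = rng.normal(size=(500, 32))   # e.g. pooled GNN embeddings
+ids, k = split_majority_class(majority, minority_size=60)
+print(k, np.bincount(ids))
+```
+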
+
+ comment: Accepted by Proceedings of the Thirty-Ninth AAAI Conference on + Artificial Intelligence (AAAI-25) +
+
+
+
+
+ + ☆ Selective Shot Learning for Code Explanation + + +
+ Code explanation plays a crucial role in the software engineering domain,
+aiding developers in grasping code functionality efficiently. Recent work shows
+that the performance of LLMs for code explanation improves in a few-shot
+setting, especially when the few-shot examples are selected intelligently.
+State-of-the-art approaches for such Selective Shot Learning (SSL) include
+token-based and embedding-based methods. However, these SSL approaches have
+been evaluated on proprietary LLMs, with little exploration of open-source
+Code-LLMs. Additionally, these methods lack consideration of programming
+language syntax. To bridge these gaps, we present a comparative study and
+propose a novel SSL method (SSL_ner) that utilizes entity information for
+few-shot example selection. We present several insights and show the
+effectiveness of the SSL_ner approach over state-of-the-art methods across two
+datasets. To the best of our knowledge, this is the first systematic
+benchmarking of open-source Code-LLMs that assesses the performance of various
+few-shot example selection approaches for the code explanation task.
+
+
+
+
+
+ + ☆ A Survey on Recommendation Unlearning: Fundamentals, Taxonomy, + Evaluation, and Open Questions + + +
+ Recommender systems have become increasingly influential in shaping user +behavior and decision-making, highlighting their growing impact in various +domains. Meanwhile, the widespread adoption of machine learning models in +recommender systems has raised significant concerns regarding user privacy and +security. As compliance with privacy regulations becomes more critical, there +is a pressing need to address the issue of recommendation unlearning, i.e., +eliminating the memory of specific training data from the learned +recommendation models. Despite its importance, traditional machine unlearning +methods are ill-suited for recommendation unlearning due to the unique +challenges posed by collaborative interactions and model parameters. This +survey offers a comprehensive review of the latest advancements in +recommendation unlearning, exploring the design principles, challenges, and +methodologies associated with this emerging field. We provide a unified +taxonomy that categorizes different recommendation unlearning approaches, +followed by a summary of widely used benchmarks and metrics for evaluation. By +reviewing the current state of research, this survey aims to guide the +development of more efficient, scalable, and robust recommendation unlearning +techniques. Furthermore, we identify open research questions in this field, +which could pave the way for future innovations not only in recommendation +unlearning but also in a broader range of unlearning tasks across different +machine learning applications. + +
+
+
+
+
+ + ☆ Cross-Dialect Information Retrieval: Information Access in Low-Resource + and High-Variance Languages COLING 2025 + + +
+ A large amount of local and culture-specific knowledge (e.g., people,
+traditions, food) can only be found in documents written in dialects. While
+there has been extensive research conducted on cross-lingual information
+retrieval (CLIR), the field of cross-dialect retrieval (CDIR) has received
+limited attention. Dialect retrieval poses unique challenges due to the limited
+availability of resources to train retrieval models and the high variability in
+non-standardized languages. We study these challenges using German dialects as
+an example and introduce the first German dialect retrieval dataset, dubbed
+WikiDIR, which consists of seven German dialects extracted from Wikipedia.
+Using WikiDIR, we demonstrate the weakness of lexical methods in dealing with
+high lexical variation in dialects. We further show that the commonly used
+zero-shot cross-lingual transfer approach with multilingual encoders does not
+transfer well to extremely low-resource setups, motivating the need for
+resource-lean and dialect-specific retrieval models. We finally demonstrate
+that (document) translation is an effective way to reduce the dialect gap in
+CDIR.
+
+
+ comment: Accepted at COLING 2025 +
+
+
+
+
+ + ☆ RemoteRAG: A Privacy-Preserving LLM Cloud RAG Service + + +
+ Retrieval-augmented generation (RAG) improves the service quality of large +language models by retrieving relevant documents from credible literature and +integrating them into the context of the user query. Recently, the rise of the +cloud RAG service has made it possible for users to query relevant documents +conveniently. However, directly sending queries to the cloud brings potential +privacy leakage. In this paper, we are the first to formally define the +privacy-preserving cloud RAG service to protect the user query and propose +RemoteRAG as a solution regarding privacy, efficiency, and accuracy. For +privacy, we introduce $(n,\epsilon)$-DistanceDP to characterize privacy leakage +of the user query and the leakage inferred from relevant documents. For +efficiency, we limit the search range from the total documents to a small +number of selected documents related to a perturbed embedding generated from +$(n,\epsilon)$-DistanceDP, so that computation and communication costs required +for privacy protection significantly decrease. For accuracy, we ensure that the +small range includes target documents related to the user query with detailed +theoretical analysis. Experimental results also demonstrate that RemoteRAG can +resist existing embedding inversion attack methods while achieving no loss in +retrieval under various settings. Moreover, RemoteRAG is efficient, incurring +only $0.67$ seconds and $46.66$KB of data transmission ($2.72$ hours and $1.43$ +GB with the non-optimized privacy-preserving scheme) when retrieving from a +total of $10^6$ documents. + +
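+ A minimal sketch of the narrowing idea: only a perturbed query embedding is
+sent to the cloud index, which returns a small candidate pool that is reranked
+locally with the clean embedding. Plain Gaussian noise stands in for the
+paper's $(n,\epsilon)$-DistanceDP mechanism, and the local rerank assumes the
+candidate embeddings are returned alongside the documents; both are
+illustrative simplifications.
+
+```python
+import numpy as np
+
+def private_retrieve(query_emb, cloud_doc_embs, noise_scale=0.5,
+                     candidate_k=50, final_k=5, rng=None):
+    """Send only a perturbed embedding to the cloud index, then rerank the
+    returned candidate pool locally with the clean embedding."""
+    rng = rng or np.random.default_rng()
+    # Stand-in perturbation; the paper uses a DistanceDP mechanism instead.
+    noisy = query_emb + rng.normal(scale=noise_scale, size=query_emb.shape)
+    # "Cloud" side: nearest neighbours of the noisy embedding only.
+    cloud_scores = cloud_doc_embs @ noisy
+    candidates = np.argsort(-cloud_scores)[:candidate_k]
+    # Local side: exact rerank of the small candidate pool.
+    local_scores = cloud_doc_embs[candidates] @ query_emb
+    return candidates[np.argsort(-local_scores)[:final_k]]
+
+rng = np.random.default_rng(0)
+docs = rng.normal(size=(10_000, 128))
+q = rng.normal(size=128)
+print(private_retrieve(q, docs, rng=rng))
+```
+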
+
+
+
+
+ + ☆ A Survey on Sequential Recommendation + + +
+ Different from most conventional recommendation problems, sequential
+recommendation (SR) focuses on learning users' preferences by exploiting the
+internal order and dependency among the interacted items, and it has received
+significant attention from both researchers and practitioners. In recent years,
+we have witnessed great progress and achievements in this field, necessitating
+a new survey. In this survey, we study the SR problem from a new perspective
+(i.e., the construction of an item's properties), and summarize the most recent
+techniques used in sequential recommendation such as pure ID-based SR, SR with
+side information, multi-modal SR, generative SR, LLM-powered SR, ultra-long SR,
+and data-augmented SR. Moreover, we introduce some frontier research topics in
+sequential recommendation, e.g., open-domain SR, data-centric SR, cloud-edge
+collaborative SR, continuous SR, SR for good, and explainable SR. We believe
+that our survey could serve as a valuable roadmap for readers in this field.
+
+
+
+
+
+ + ☆ Token-Level Graphs for Short Text Classification ECIR 2025 + + +
+ The classification of short texts is a common subtask in Information +Retrieval (IR). Recent advances in graph machine learning have led to interest +in graph-based approaches for low resource scenarios, showing promise in such +settings. However, existing methods face limitations such as not accounting for +different meanings of the same words or constraints from transductive +approaches. We propose an approach which constructs text graphs entirely based +on tokens obtained through pre-trained language models (PLMs). By applying a +PLM to tokenize and embed the texts when creating the graph(-nodes), our method +captures contextual and semantic information, overcomes vocabulary constraints, +and allows for context-dependent word meanings. Our approach also makes +classification more efficient with reduced parameters compared to classical PLM +fine-tuning, resulting in more robust training with few samples. Experimental +results demonstrate how our method consistently achieves higher scores or +on-par performance with existing methods, presenting an advancement in +graph-based text classification techniques. To support reproducibility of our +work we make all implementations publicly available to the +community\footnote{\url{https://github.com/doGregor/TokenGraph}}. + +
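+ A minimal sketch of building a token-level graph: nodes are the tokens of a
+short text, edges connect tokens within a sliding window, and node features are
+their contextual embeddings. The window-based edges and the random stand-in
+embeddings are illustrative assumptions; in practice the tokens and embeddings
+would come from a pre-trained language model.
+
+```python
+import numpy as np
+
+def build_token_graph(tokens, token_embs, window=2):
+    """Build an adjacency matrix over token nodes: tokens co-occurring within
+    a sliding window are connected; node features are contextual embeddings."""
+    n = len(tokens)
+    adj = np.zeros((n, n), dtype=np.float32)
+    for i in range(n):
+        for j in range(max(0, i - window), min(n, i + window + 1)):
+            if i != j:
+                adj[i, j] = 1.0
+    return adj, token_embs
+
+# Toy example; real node features would be PLM hidden states per subword token.
+tokens = ["short", "text", "classification", "with", "token", "graphs"]
+rng = np.random.default_rng(0)
+embs = rng.normal(size=(len(tokens), 768)).astype(np.float32)
+adj, feats = build_token_graph(tokens, embs)
+print(adj.shape, feats.shape, int(adj.sum()))
+```
+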
+
+ comment: Preprint accepted at the 47th European Conference on Information + Retrieval (ECIR 2025) +
+
+
+
+
+ + ☆ SynthCypher: A Fully Synthetic Data Generation Framework for + Text-to-Cypher Querying in Knowledge Graphs + + +
+ Cypher, the query language for Neo4j graph databases, plays a critical role
+in enabling graph-based analytics and data exploration. While substantial
+research has been dedicated to natural language to SQL query generation
+(Text2SQL), the analogous problem for graph databases, referred to as
+Text2Cypher, remains underexplored. In this work, we introduce SynthCypher, a
+fully synthetic and automated data generation pipeline designed to address this
+gap. SynthCypher employs a novel LLM-Supervised Generation-Verification
+framework, ensuring syntactically and semantically correct Cypher queries
+across diverse domains and query complexities. Using this pipeline, we create
+the SynthCypher dataset, a large-scale benchmark containing 29.8k Text2Cypher
+instances. Fine-tuning open-source large language models (LLMs), including
+LLaMA-3.1-8B, Mistral-7B, and Qwen-7B, on SynthCypher yields significant
+performance improvements of up to 40% on the Text2Cypher test set and 30% on
+the SPIDER benchmark adapted for graph databases. This work demonstrates that
+high-quality synthetic data can effectively advance the state-of-the-art in
+Text2Cypher tasks.
+
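+ A minimal sketch of a generation-verification loop in the spirit of the
+pipeline above: a generator callable stands in for the LLM, and a crude regex
+screen stands in for the verification stage (which could instead parse the
+query or run it against a Neo4j instance); all names are illustrative.
+
+```python
+import re
+
+def looks_like_cypher(query):
+    """Very rough syntactic screen; a real pipeline would parse the query or
+    execute it against a live graph database."""
+    return bool(re.search(r"\bMATCH\b.*\bRETURN\b", query, flags=re.S | re.I))
+
+def generate_verified_cypher(question, schema, generate, verify, max_tries=3):
+    """Ask a generator for a Cypher query and keep only verified outputs,
+    mirroring a generation-then-verification loop."""
+    for attempt in range(max_tries):
+        candidate = generate(question, schema, attempt)
+        if verify(candidate):
+            return candidate
+    return None
+
+# Toy generator standing in for an LLM call.
+gen = lambda q, s, i: "MATCH (p:Person)-[:ACTED_IN]->(m:Movie) RETURN m.title"
+print(generate_verified_cypher("Which movies did people act in?",
+                               "(:Person)-[:ACTED_IN]->(:Movie)",
+                               gen, looks_like_cypher))
+```
+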
+
+
+
+
+ + ☆ Boosting LLM-based Relevance Modeling with Distribution-Aware Robust + Learning + + +
+ With the rapid advancement of pre-trained large language models (LLMs),
+recent endeavors have leveraged the capabilities of LLMs in relevance modeling,
+resulting in enhanced performance. This is usually done through the process of
+fine-tuning LLMs on specifically annotated datasets to determine the relevance
+between queries and items. However, there are two limitations when LLMs are
+naively employed for relevance modeling through fine-tuning and inference.
+First, LLMs are not inherently suited to nuanced tasks beyond simple yes-or-no
+answers, such as assessing search relevance. They may therefore tend to be
+overconfident and struggle to distinguish fine-grained degrees of relevance
+(e.g., strong relevance, weak relevance, irrelevance) used in search engines.
+Second, they exhibit significant performance degradation when confronted with
+data distribution shift in real-world scenarios. In this paper, we propose a
+novel Distribution-Aware Robust Learning framework (DaRL) for relevance
+modeling in Alipay Search. Specifically, we design an effective loss function
+to enhance the discriminability of LLM-based relevance modeling across various
+fine-grained degrees of query-item relevance. To improve the generalizability
+of LLM-based relevance modeling, we first propose the Distribution-Aware Sample
+Augmentation (DASA) module. This module utilizes out-of-distribution (OOD)
+detection techniques to actively select appropriate samples that are not well
+covered by the original training set for model fine-tuning. Furthermore, we
+adopt a multi-stage fine-tuning strategy to simultaneously improve
+in-distribution (ID) and OOD performance, bridging the performance gap between
+them. DaRL has been deployed online to serve Alipay's insurance product
+search...
+
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ LLM is Knowledge Graph Reasoner: LLM's Intuition-aware Knowledge Graph + Reasoning for Cold-start Sequential Recommendation ECIR2025 + + +
+ Knowledge Graphs (KGs) represent relationships between entities in a graph
+structure and have been widely studied as promising tools for realizing
+recommendations that consider the accurate content information of items.
+However, traditional KG-based recommendation methods face fundamental
+challenges: insufficient consideration of temporal information and poor
+performance in cold-start scenarios. On the other hand, Large Language Models
+(LLMs) can be considered databases with a wealth of knowledge learned from web
+data, and they have recently gained attention due to their potential
+application as recommendation systems. Although approaches that treat LLMs as
+recommendation systems can leverage LLMs' high recommendation literacy, their
+input token limitations make it impractical to consider the entire
+recommendation domain dataset and result in scalability issues. To address
+these challenges, we propose an LLM's Intuition-aware Knowledge graph Reasoning
+model (LIKR). Our main idea is to treat LLMs as reasoners that output intuitive
+exploration strategies for KGs. To integrate the knowledge of LLMs and KGs, we
+train a recommendation agent through reinforcement learning using a reward
+function that integrates different recommendation strategies, including the
+LLM's intuition and KG embeddings. By incorporating temporal awareness through
+prompt engineering and generating textual representations of user preferences
+from limited interactions, LIKR can improve recommendation performance in
+cold-start scenarios. Furthermore, LIKR avoids scalability issues by using KGs
+to represent recommendation domain datasets and limiting the LLM's output to KG
+exploration strategies. Experiments on real-world datasets demonstrate that our
+model outperforms state-of-the-art recommendation methods in cold-start
+sequential recommendation scenarios.
+
+
+ comment: Accepted to the 47th European Conference on Information Retrieval + (ECIR2025) +
+
+
+
+
+ + ☆ LITA: An Efficient LLM-assisted Iterative Topic Augmentation Framework + + +
+ Topic modeling is widely used for uncovering thematic structures within text +corpora, yet traditional models often struggle with specificity and coherence +in domain-focused applications. Guided approaches, such as SeededLDA and CorEx, +incorporate user-provided seed words to improve relevance but remain +labor-intensive and static. Large language models (LLMs) offer potential for +dynamic topic refinement and discovery, yet their application often incurs high +API costs. To address these challenges, we propose the LLM-assisted Iterative +Topic Augmentation framework (LITA), an LLM-assisted approach that integrates +user-provided seeds with embedding-based clustering and iterative refinement. +LITA identifies a small number of ambiguous documents and employs an LLM to +reassign them to existing or new topics, minimizing API costs while enhancing +topic quality. Experiments on two datasets across topic quality and clustering +performance metrics demonstrate that LITA outperforms five baseline models, +including LDA, SeededLDA, CorEx, BERTopic, and PromptTopic. Our work offers an +efficient and adaptable framework for advancing topic modeling and text +clustering. + +
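+ A minimal sketch of flagging ambiguous documents for LLM reassignment:
+documents are clustered by embedding, and those whose two nearest centroids are
+nearly equidistant are routed to the LLM. The k-means backbone, the margin
+criterion, and the quantile threshold are illustrative assumptions.
+
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+
+def find_ambiguous_docs(doc_embs, n_topics=5, margin_quantile=0.1, seed=0):
+    """Cluster document embeddings and flag the docs whose two nearest
+    centroids are almost equally close; only these go to the LLM."""
+    km = KMeans(n_clusters=n_topics, n_init=10, random_state=seed).fit(doc_embs)
+    dists = np.linalg.norm(
+        doc_embs[:, None, :] - km.cluster_centers_[None, :, :], axis=-1)
+    sorted_d = np.sort(dists, axis=1)
+    margin = sorted_d[:, 1] - sorted_d[:, 0]        # small margin = ambiguous
+    threshold = np.quantile(margin, margin_quantile)
+    return np.where(margin <= threshold)[0], km.labels_
+
+rng = np.random.default_rng(0)
+embs = rng.normal(size=(200, 64))
+ambiguous, labels = find_ambiguous_docs(embs)
+print(len(ambiguous), "docs would be sent to the LLM for (re)assignment")
+```
+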
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Refining Dimensions for Improving Clustering-based Cross-lingual Topic + Models COLING 2025 + + +
+ Recent works in clustering-based topic models perform well in monolingual +topic identification by introducing a pipeline to cluster the contextualized +representations. However, the pipeline is suboptimal in identifying topics +across languages due to the presence of language-dependent dimensions (LDDs) +generated by multilingual language models. To address this issue, we introduce +a novel, SVD-based dimension refinement component into the pipeline of the +clustering-based topic model. This component effectively neutralizes the +negative impact of LDDs, enabling the model to accurately identify topics +across languages. Our experiments on three datasets demonstrate that the +updated pipeline with the dimension refinement component generally outperforms +other state-of-the-art cross-lingual topic models. + +
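+ A minimal sketch of an SVD-based refinement step: the leading singular
+directions of the (centered) multilingual embedding matrix, assumed here to
+encode language identity rather than topic, are projected out before
+clustering. The number of removed directions and the centering are illustrative
+choices, not the paper's exact procedure.
+
+```python
+import numpy as np
+
+def refine_dimensions(embeddings, n_remove=2):
+    """Project out the top singular directions of the embedding matrix, which
+    are assumed to carry language identity rather than topical content."""
+    centered = embeddings - embeddings.mean(axis=0, keepdims=True)
+    _, _, vt = np.linalg.svd(centered, full_matrices=False)
+    top_dirs = vt[:n_remove]                     # (n_remove, dim)
+    projection = centered @ top_dirs.T @ top_dirs
+    return centered - projection                 # refined representations
+
+rng = np.random.default_rng(0)
+multilingual_embs = rng.normal(size=(1_000, 384))
+refined = refine_dimensions(multilingual_embs, n_remove=2)
+print(refined.shape)
+```
+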
+
+ comment: Accepted to 18th BUCC Workshop at COLING 2025 +
+
+
+
+
+ + ♻ ☆ FM2DS: Few-Shot Multimodal Multihop Data Synthesis with Knowledge + Distillation for Question Answering CVPR 2025 + + +
+ Multimodal multihop question answering is a complex task that requires
+reasoning over multiple sources of information, such as images and text, to
+answer questions. While there has been significant progress in visual question
+answering, the multihop setting remains unexplored due to the lack of
+high-quality datasets. Current methods focus on single-hop question answering
+or a single modality, which makes them unsuitable for real-world scenarios such
+as analyzing multimodal educational materials, summarizing lengthy academic
+articles, or interpreting scientific studies that combine charts, images, and
+text. To address this gap, we propose a novel methodology, introducing the
+first framework for creating a high-quality dataset that enables training
+models for multimodal multihop question answering. Our approach consists of a
+5-stage pipeline that involves acquiring relevant multimodal documents from
+Wikipedia, synthetically generating high-level questions and answers, and
+validating them through rigorous criteria to ensure quality data. We evaluate
+our methodology by training models on our synthesized dataset and testing them
+on two benchmarks. Our results demonstrate that, with an equal sample size,
+models trained on our synthesized data outperform those trained on
+human-collected data by 1.9 points in exact match (EM) on average. We believe
+our data synthesis method will serve as a strong foundation for training and
+evaluating multimodal multihop question answering models.
+
+
+ comment: 20 pages, 11 figures, 10 tables, Submitted to CVPR 2025 +
+
+
+
+
+ + ♻ ☆ It is Never Too Late to Mend: Separate Learning for Multimedia + Recommendation + + +
+ Multimedia recommendation incorporates various modalities (e.g., images,
+texts, etc.) into user or item representations to improve recommendation
+quality, and self-supervised learning has carried multimedia recommendation to
+a plateau of performance because of its superior ability to align different
+modalities. However, more and more research finds that aligning all modal
+representations is suboptimal because it damages the unique attributes of each
+modality. These studies use subtraction and orthogonal constraints in geometric
+space to learn the unique parts. However, our rigorous analysis reveals the
+flaws in this approach: subtraction does not necessarily yield the desired
+modal-unique features, and orthogonal constraints are ineffective in the
+high-dimensional representation spaces of users and items. To make up for these
+weaknesses, we propose Separate Learning (SEA) for multimedia recommendation,
+which mainly takes a mutual-information view of modal-unique and modal-generic
+learning. Specifically, we first use GNNs to learn the representations of users
+and items in different modalities and split each modal representation into
+generic and unique parts. We employ the contrastive log-ratio upper bound to
+minimize the mutual information between the generic and unique parts within the
+same modality, to distance their representations, thus learning modal-unique
+features. Then, we design Solosimloss to maximize the lower bound of mutual
+information, to align the generic parts of different modalities, thus learning
+higher-quality modal-generic features. Finally, extensive experiments on three
+datasets demonstrate the effectiveness and generalization of our proposed
+framework. The code and the full training record of the main experiment are
+available at SEA.
+
+
+
+
+
+ + ♻ ☆ Multi-Modal Recommendation Unlearning for Legal, Licensing, and Modality + Constraints AAAI 2025 + + +
+ User data spread across multiple modalities has popularized multi-modal +recommender systems (MMRS). They recommend diverse content such as products, +social media posts, TikTok reels, etc., based on a user-item interaction graph. +With rising data privacy demands, recent methods propose unlearning private +user data from uni-modal recommender systems (RS). However, methods for +unlearning item data related to outdated user preferences, revoked licenses, +and legally requested removals are still largely unexplored. + Previous RS unlearning methods are unsuitable for MMRS due to the +incompatibility of their matrix-based representation with the multi-modal +user-item interaction graph. Moreover, their data partitioning step degrades +performance on each shard due to poor data heterogeneity and requires costly +performance aggregation across shards. + This paper introduces MMRecUn, the first approach known to us for unlearning +in MMRS and unlearning item data. Given a trained RS model, MMRecUn employs a +novel Reverse Bayesian Personalized Ranking (BPR) objective to enable the model +to forget marked data. The reverse BPR attenuates the impact of user-item +interactions within the forget set, while the forward BPR reinforces the +significance of user-item interactions within the retain set. Our experiments +demonstrate that MMRecUn outperforms baseline methods across various unlearning +requests when evaluated on benchmark MMRS datasets. MMRecUn achieves recall +performance improvements of up to 49.85% compared to baseline methods and is up +to $\mathbf{1.3}\times$ faster than the Gold model, which is trained on retain +set from scratch. MMRecUn offers significant advantages, including superiority +in removing target interactions, preserving retained interactions, and zero +overhead costs compared to previous methods. The code will be released after +review. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Non-autoregressive Generative Models for Reranking Recommendation KDD 2024 + + +
+ Contemporary recommendation systems are designed to meet users' needs by +delivering tailored lists of items that align with their specific demands or +interests. In a multi-stage recommendation system, reranking plays a crucial +role by modeling the intra-list correlations among items. The key challenge of +reranking lies in the exploration of optimal sequences within the combinatorial +space of permutations. Recent research proposes a generator-evaluator learning +paradigm, where the generator generates multiple feasible sequences and the +evaluator picks out the best sequence based on the estimated listwise score. +The generator is of vital importance, and generative models are well-suited for +the generator function. Current generative models employ an autoregressive +strategy for sequence generation. However, deploying autoregressive models in +real-time industrial systems is challenging. To address these issues, we +propose a Non-AutoRegressive generative model for reranking Recommendation +(NAR4Rec) designed to enhance efficiency and effectiveness. To tackle +challenges such as sparse training samples and dynamic candidates, we introduce +a matching model. Considering the diverse nature of user feedback, we employ a +sequence-level unlikelihood training objective to differentiate feasible +sequences from unfeasible ones. Additionally, to overcome the lack of +dependency modeling in non-autoregressive models regarding target items, we +introduce contrastive decoding to capture correlations among these items. +Extensive offline experiments validate the superior performance of NAR4Rec over +state-of-the-art reranking methods. Online A/B tests reveal that NAR4Rec +significantly enhances the user experience. Furthermore, NAR4Rec has been fully +deployed in a popular video app Kuaishou with over 300 million daily active +users. + +
+
+ comment: Accepted by KDD 2024 +
+
+
+
+
+ + ♻ ☆ Familiarity-Aware Evidence Compression for Retrieval-Augmented + Generation + + +
+ Retrieval-augmented generation (RAG) improves large language models (LMs) by
+incorporating non-parametric knowledge through evidence retrieved from external
+sources. However, it often struggles to cope with inconsistent and irrelevant
+information that can distract the LM from its tasks, especially when multiple
+evidence pieces are required. While compressing the retrieved evidence with a
+compression model aims to address this issue, the compressed evidence may still
+be unfamiliar to the target model used for downstream tasks, potentially
+failing to utilize the evidence effectively. We propose FaviComp
+(Familiarity-Aware Evidence Compression), a novel training-free evidence
+compression technique that makes retrieved evidence more familiar to the target
+model, while seamlessly integrating parametric knowledge from the model.
+Experimental results show that FaviComp consistently outperforms most recent
+evidence compression baselines across multiple open-domain QA datasets,
+improving accuracy by up to 28.1% while achieving high compression rates.
+Additionally, we demonstrate the effective integration of both parametric and
+non-parametric knowledge during evidence compression.
+
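+ A minimal sketch of a familiarity criterion: among candidate compressions of
+the evidence, keep the one with the lowest perplexity under the target model.
+This reduces the idea to a selection step with a stand-in log-probability
+scorer; the actual method composes the compression with the target model rather
+than merely picking among candidates, so everything here is an illustrative
+assumption.
+
+```python
+import math
+
+def pick_most_familiar(candidates, target_model_logprob):
+    """Among candidate compressions of the evidence, keep the one the target
+    model finds most 'familiar', i.e. with the lowest per-token perplexity."""
+    def perplexity(text):
+        logprobs = target_model_logprob(text)      # list of token log-probs
+        return math.exp(-sum(logprobs) / max(1, len(logprobs)))
+    return min(candidates, key=perplexity)
+
+# Toy scorer standing in for the downstream LM; simpler wording scores better.
+fake_logprob = lambda text: [-0.1 * (1 + len(w)) for w in text.split()]
+cands = ["Paris is the capital of France.",
+         "The French capital, per retrieved source [2], is Paris."]
+print(pick_most_familiar(cands, fake_logprob))
+```
+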
+
+
+
+
+ + ♻ ☆ Impression-Aware Recommender Systems + + +
+ Novel data sources bring new opportunities to improve the quality of +recommender systems and serve as a catalyst for the creation of new paradigms +on personalized recommendations. Impressions are a novel data source containing +the items shown to users on their screens. Past research focused on providing +personalized recommendations using interactions, and occasionally using +impressions when such a data source was available. Interest in impressions has +increased due to their potential to provide more accurate recommendations. +Despite this increased interest, research in recommender systems using +impressions is still dispersed. Many works have distinct interpretations of +impressions and use impressions in recommender systems in numerous different +manners. To unify those interpretations into a single framework, we present a +systematic literature review on recommender systems using impressions, focusing +on three fundamental perspectives: recommendation models, datasets, and +evaluation methodologies. We define a theoretical framework to delimit +recommender systems using impressions and a novel paradigm for personalized +recommendations, called impression-aware recommender systems. We propose a +classification system for recommenders in this paradigm, which we use to +categorize the recommendation models, datasets, and evaluation methodologies +used in past research. Lastly, we identify open questions and future +directions, highlighting missing aspects in the reviewed literature. + +
+
+ comment: 44 pages, 127 references, 6 tables, 5 figures, ACM TORS ACCEPTED +
+
+
+
+
+
+
+
+ + Multimedia 11 + +
+
+
+ + ☆ Flight Patterns for Swarms of Drones + + +
+ We present flight patterns for a collision-free passage of swarms of drones +through one or more openings. The narrow openings provide drones with access to +an infrastructure component such as charging stations to charge their depleted +batteries and hangars for storage. The flight patterns are a staging area +(queues) that match the rate at which an infrastructure component and its +openings process drones. They prevent collisions and may implement different +policies that control the order in which drones pass through an opening. We +illustrate the flight patterns with a 3D display that uses drones configured +with light sources to illuminate shapes. + +
+
+ comment: Appeared in the First International Conference on Holodecks, December + 15, 2023. Shuqin Zhou and Shahram Ghandeharizadeh. Flight Patterns for Swarms + of Drones. In the Proceedings of the First International Conference on + Holodecks (Holodecks '23), December 15 2023, Los Angeles, California, USA, + 29-33. https://doi.org/10.61981/ZFSH2303 +
+
+
+
+
+ + ☆ A Conceptual Model of Intelligent Multimedia Data Rendered using Flying + Light Specks + + +
+ A Flying Light Speck, FLS, is a miniature-sized drone configured with light
+sources to illuminate 3D multimedia objects in a fixed volume, an FLS display.
+A swarm of FLSs may provide haptic interactions by exerting force back at a
+user's touch. This paper presents a conceptual model for the multimedia data to
+enable content-based queries. The model empowers users of an FLS display to
+annotate the illuminations by adding semantics to the data, extending a
+multimedia repository with information and knowledge. We present a core
+conceptual model and demonstrate its extensions for two diverse applications:
+authoring tools for entertainment and MRI scans for healthcare.
+
+
+ comment: Appeared in the First International Conference on Holodecks +
+
+
+
+
+ + ☆ Implicit Location-Caption Alignment via Complementary Masking for + Weakly-Supervised Dense Video Captioning AAAI 2025 + + +
+ Weakly-Supervised Dense Video Captioning (WSDVC) aims to localize and
+describe all events of interest in a video without requiring annotations of
+event boundaries. This setting poses a great challenge in accurately locating
+the temporal locations of events, as the relevant supervision is unavailable.
+Existing methods rely on explicit alignment constraints between event locations
+and captions, which involve complex event proposal procedures during both
+training and inference. To tackle this problem, we propose a novel implicit
+location-caption alignment paradigm based on complementary masking, which
+simplifies the complex event proposal and localization process while
+maintaining effectiveness. Specifically, our model comprises two components: a
+dual-mode video captioning module and a mask generation module. The dual-mode
+video captioning module captures global event information and generates
+descriptive captions, while the mask generation module generates differentiable
+positive and negative masks for localizing the events. These masks enable the
+implicit alignment of event locations and captions by ensuring that captions
+generated from positively and negatively masked videos are complementary,
+thereby forming a complete video description. In this way, even under weak
+supervision, the event locations and event captions can be aligned implicitly.
+Extensive experiments on public datasets demonstrate that our method
+outperforms existing weakly-supervised methods and achieves competitive results
+compared to fully-supervised methods.
+
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ ASAP: Advancing Semantic Alignment Promotes Multi-Modal Manipulation + Detecting and Grounding + + +
+ We present ASAP, a new framework for detecting and grounding multi-modal
+media manipulation (DGM4). Upon thorough examination, we observe that accurate
+fine-grained cross-modal semantic alignment between image and text is vital for
+accurate manipulation detection and grounding. However, existing DGM4 methods
+pay little attention to cross-modal alignment, which limits further gains in
+detection accuracy. To remedy this issue, this work aims to advance semantic
+alignment learning to promote this task. Particularly, we utilize off-the-shelf
+Multimodal Large Language Models (MLLMs) and Large Language Models (LLMs) to
+construct paired image-text pairs, especially for the manipulated instances.
+Subsequently, cross-modal alignment learning is performed to enhance the
+semantic alignment. Besides the explicit auxiliary clues, we further design a
+Manipulation-Guided Cross Attention (MGCA) mechanism to provide implicit
+guidance for augmenting manipulation perception. With the ground truth
+available during training, MGCA encourages the model to concentrate more on
+manipulated components while downplaying normal ones, enhancing the model's
+ability to capture manipulations. Extensive experiments are conducted on the
+DGM4 dataset, and the results demonstrate that our model surpasses the compared
+methods by a clear margin.
+
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Multimodal Classification and Out-of-distribution Detection for + Multimodal Intent Understanding + + +
+ Multimodal intent understanding is a significant research area that requires +effectively leveraging multiple modalities to analyze human language. Existing +methods face two main challenges in this domain. Firstly, they have limitations +in capturing nuanced and high-level semantics underlying complex +in-distribution (ID) multimodal intents. Secondly, they exhibit poor +generalization when confronted with unseen out-of-distribution (OOD) data in +real-world scenarios. To address these issues, we propose a novel method for +both ID classification and OOD detection (MIntOOD). We first introduce a +weighted feature fusion network that models multimodal representations +effectively. This network dynamically learns the importance of each modality, +adapting to multimodal contexts. To develop discriminative representations that +are conducive to both tasks, we synthesize pseudo-OOD data from convex +combinations of ID data and engage in multimodal representation learning from +both coarse-grained and fine-grained perspectives. The coarse-grained +perspective focuses on distinguishing between ID and OOD binary classes, while +the fine-grained perspective enhances the understanding of ID data by +incorporating binary confidence scores. These scores help to gauge the +difficulty of each sample, improving the classification of different ID +classes. Additionally, the fine-grained perspective captures instance-level +interactions between ID and OOD samples, promoting proximity among similar +instances and separation from dissimilar ones. We establish baselines for three +multimodal intent datasets and build an OOD benchmark. Extensive experiments on +these datasets demonstrate that our method significantly improves OOD detection +performance with a 3-10% increase in AUROC scores while achieving new +state-of-the-art results in ID classification. The full data and codes are +available at https://github.com/thuiar/MIntOOD. + +
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ From Capture to Display: A Survey on Volumetric Video + + +
+ Volumetric video, which offers immersive viewing experiences, is gaining +increasing prominence. With its six degrees of freedom, it provides viewers +with greater immersion and interactivity compared to traditional videos. +Despite their potential, volumetric video services pose significant challenges. +This survey conducts a comprehensive review of the existing literature on +volumetric video. We firstly provide a general framework of volumetric video +services, followed by a discussion on prerequisites for volumetric video, +encompassing representations, open datasets, and quality assessment metrics. +Then we delve into the current methodologies for each stage of the volumetric +video service pipeline, detailing capturing, compression, transmission, +rendering, and display techniques. Lastly, we explore various applications +enabled by this pioneering technology and we present an array of research +challenges and opportunities in the domain of volumetric video services. This +survey aspires to provide a holistic understanding of this burgeoning field and +shed light on potential future research trajectories, aiming to bring the +vision of volumetric video to fruition. + +
+
+ comment: Major revision submitted to ACM Computing Surveys +
+
+
+
+
+ + ♻ ☆ Multimodal Class-aware Semantic Enhancement Network for Audio-Visual + Video Parsing AAAI-2025 + + +
+ The Audio-Visual Video Parsing task aims to recognize and temporally localize +all events occurring in either the audio or visual stream, or both. Capturing +accurate event semantics for each audio/visual segment is vital. Prior works +directly utilize the extracted holistic audio and visual features for intra- +and cross-modal temporal interactions. However, each segment may contain +multiple events, resulting in semantically mixed holistic features that can +lead to semantic interference during intra- or cross-modal interactions: the +event semantics of one segment may incorporate semantics of unrelated events +from other segments. To address this issue, our method begins with a +Class-Aware Feature Decoupling (CAFD) module, which explicitly decouples the +semantically mixed features into distinct class-wise features, including +multiple event-specific features and a dedicated background feature. The +decoupled class-wise features enable our model to selectively aggregate useful +semantics for each segment from clearly matched classes contained in other +segments, preventing semantic interference from irrelevant classes. +Specifically, we further design a Fine-Grained Semantic Enhancement module for +encoding intra- and cross-modal relations. It comprises a Segment-wise Event +Co-occurrence Modeling (SECM) block and a Local-Global Semantic Fusion (LGSF) +block. The SECM exploits inter-class dependencies of concurrent events within +the same timestamp with the aid of a new event co-occurrence loss. The LGSF +further enhances the event semantics of each segment by incorporating relevant +semantics from more informative global video features. Extensive experiments +validate the effectiveness of the proposed modules and loss functions, +resulting in a new state-of-the-art parsing performance. + +
+
+ comment: Accepted by AAAI-2025 +
+
+
+
+
+ + ♻ ☆ MMTrail: A Multimodal Trailer Video Dataset with Language and Music + Descriptions + + +
+ Massive multi-modality datasets play a significant role in facilitating the success of large video-language models. However, current video-language datasets primarily provide text descriptions for visual frames, considering audio to be weakly related information. They usually overlook exploring the potential of inherent audio-visual correlation, leading to monotonous annotation within each modality instead of comprehensive and precise descriptions. This oversight makes many cross-modality studies difficult. To fill this gap, we present MMTrail, a large-scale multi-modality video-language dataset incorporating more than 20M trailer clips with visual captions, and 2M high-quality clips with multimodal captions. Trailers preview full-length video works and integrate context, visual frames, and background music. In particular, the trailer has two main advantages: (1) the topics are diverse, and the content characters are of various types, e.g., film, news, and gaming; (2) the corresponding background music is custom-designed, making it more coherent with the visual context. Upon these insights, we propose a systemic captioning framework, achieving various modality annotations with more than 27.1k hours of trailer videos. Here, to ensure the caption retains music perspective while preserving the authority of visual context, we leverage the advanced LLM to merge all annotations adaptively. In this fashion, our MMTrail dataset potentially paves the way for fine-grained large multimodal-language model training. In experiments, we provide evaluation metrics and benchmark results on our dataset, demonstrating the high quality of our annotation and its effectiveness for model training. + +
+
+ comment: 15 Pages. Dataset report +
+
+
+
+
+ + ♻ ☆ JEN-1 Composer: A Unified Framework for High-Fidelity Multi-Track Music + Generation AAAI 2025 + + +
+ With rapid advances in generative artificial intelligence, the text-to-music synthesis task has emerged as a promising direction for music generation. Nevertheless, achieving precise control over multi-track generation remains an open challenge. While existing models excel in directly generating multi-track mixes, their limitations become evident when it comes to composing individual tracks and integrating them in a controllable manner. This departure from the typical workflows of professional composers hinders the ability to refine details in specific tracks. To address this gap, we propose JEN-1 Composer, a unified framework designed to efficiently model marginal, conditional, and joint distributions over multi-track music using a single model. Building upon an audio latent diffusion model, JEN-1 Composer extends the versatility of multi-track music generation. We introduce a progressive curriculum training strategy, which gradually escalates the difficulty of training tasks while ensuring the model's generalization ability and facilitating smooth transitions between different scenarios. During inference, users can iteratively generate and select music tracks, thus incrementally composing entire musical pieces in accordance with the Human-AI co-composition workflow. Our approach demonstrates state-of-the-art performance in controllable and high-fidelity multi-track music synthesis, marking a significant advancement in interactive AI-assisted music creation. Our demo pages are available at www.jenmusic.ai/research. + +
+
+ comment: 9 pages, 3 figures, 3 tables, accepted by AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Multi-modal and Multi-scale Spatial Environment Understanding for + Immersive Visual Text-to-Speech AAAI'2025 + + +
+ Visual Text-to-Speech (VTTS) aims to take the environmental image as the prompt to synthesize the reverberant speech for the spoken content. The challenge of this task lies in understanding the spatial environment from the image. Many attempts have been made to extract global spatial visual information from the RGB space of a spatial image. However, local and depth image information are crucial for understanding the spatial environment, which previous works have ignored. To address these issues, we propose a novel multi-modal and multi-scale spatial environment understanding scheme to achieve immersive VTTS, termed M2SE-VTTS. The multi-modal component takes both the RGB and Depth spaces of the spatial image to learn more comprehensive spatial information, while the multi-scale component models local and global spatial knowledge simultaneously. Specifically, we first split the RGB and Depth images into patches and adopt the Gemini-generated environment captions to guide the local spatial understanding. After that, the multi-modal and multi-scale features are integrated by the local-aware global spatial understanding. In this way, M2SE-VTTS effectively models the interactions between local and global spatial contexts in the multi-modal spatial environment. Objective and subjective evaluations suggest that our model outperforms the advanced baselines in environmental speech generation. The code and audio samples are available at: https://github.com/AI-S2-Lab/M2SE-VTTS. + +
+
+ comment: 9 pages,2 figures, Accepted by AAAI'2025 +
+
+
+
+
+ + ♻ ☆ Less is More: A Simple yet Effective Token Reduction Method for + Efficient Multi-modal LLMs COLING 2025 + + +
+ The rapid advancement of Multimodal Large Language Models (MLLMs) has led to +remarkable performances across various domains. However, this progress is +accompanied by a substantial surge in the resource consumption of these models. +We address this pressing issue by introducing a new approach, Token Reduction +using CLIP Metric (TRIM), aimed at improving the efficiency of MLLMs without +sacrificing their performance. Inspired by human attention patterns in Visual +Question Answering (VQA) tasks, TRIM presents a fresh perspective on the +selection and reduction of image tokens. The TRIM method has been extensively +tested across 12 datasets, and the results demonstrate a significant reduction +in computational overhead while maintaining a consistent level of performance. +This research marks a critical stride in efficient MLLM development, promoting +greater accessibility and sustainability of high-performing models. + +
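+ The core idea of CLIP-metric token reduction can be pictured with a minimal sketch: keep only the image tokens most similar to the text query. The keep ratio and the plain cosine-similarity criterion below are illustrative assumptions rather than TRIM's exact selection rule.
+import torch
+
+def reduce_image_tokens(image_tokens, text_embedding, keep_ratio=0.25):
+    """image_tokens: (N, D) patch embeddings; text_embedding: (D,) query embedding."""
+    tokens = torch.nn.functional.normalize(image_tokens, dim=-1)
+    query = torch.nn.functional.normalize(text_embedding, dim=-1)
+    scores = tokens @ query                      # cosine similarity per token
+    k = max(1, int(keep_ratio * tokens.size(0)))
+    keep = scores.topk(k).indices.sort().values  # preserve original token order
+    return image_tokens[keep], keep
+
+if __name__ == "__main__":
+    kept, idx = reduce_image_tokens(torch.randn(196, 512), torch.randn(512))
+    print(kept.shape, idx.shape)  # torch.Size([49, 512]) torch.Size([49])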
+
+ comment: Accepted to COLING 2025 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 20 + +
+
+
+ + ☆ Searching Personal Collections + + +
+ This article describes the history of information retrieval on personal +document collections. + +
+
+
+
+
+ + ☆ RAG Playground: A Framework for Systematic Evaluation of Retrieval + Strategies and Prompt Engineering in RAG Systems + + +
+ We present RAG Playground, an open-source framework for systematic evaluation +of Retrieval-Augmented Generation (RAG) systems. The framework implements and +compares three retrieval approaches: naive vector search, reranking, and hybrid +vector-keyword search, combined with ReAct agents using different prompting +strategies. We introduce a comprehensive evaluation framework with novel +metrics and provide empirical results comparing different language models +(Llama 3.1 and Qwen 2.5) across various retrieval configurations. Our +experiments demonstrate significant performance improvements through hybrid +search methods and structured self-evaluation prompting, achieving up to 72.7% +pass rate on our multi-metric evaluation framework. The results also highlight +the importance of prompt engineering in RAG systems, with our custom-prompted +agents showing consistent improvements in retrieval accuracy and response +quality. + +
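+ A minimal sketch of the hybrid vector-keyword idea is to fuse the two rankings, for example with reciprocal rank fusion; the abstract does not specify the framework's fusion rule, so RRF with k=60 is an assumption used purely for illustration.
+def reciprocal_rank_fusion(rankings, k=60):
+    """rankings: list of ranked doc-id lists (best first); returns fused doc ids."""
+    scores = {}
+    for ranking in rankings:
+        for rank, doc_id in enumerate(ranking, start=1):
+            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
+    return sorted(scores, key=scores.get, reverse=True)
+
+if __name__ == "__main__":
+    vector_hits = ["d3", "d1", "d7", "d2"]   # from dense vector search
+    keyword_hits = ["d1", "d2", "d9", "d3"]  # from keyword (BM25-style) search
+    print(reciprocal_rank_fusion([vector_hits, keyword_hits]))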
+
+ comment: Work In Progress +
+
+
+
+
+ + ☆ No More Tuning: Prioritized Multi-Task Learning with Lagrangian + Differential Multiplier Methods AAAI 2025 + + +
+ Given the ubiquity of multi-task scenarios in practical systems, Multi-Task Learning (MTL) has found widespread application across diverse domains. In real-world scenarios, these tasks often have different priorities. For instance, in web search, relevance is often prioritized over other metrics, such as click-through rates or user engagement. Existing frameworks, which typically adjust task-specific loss function weights to differentiate task priorities, pay insufficient attention to the prioritization among different tasks. However, this approach encounters challenges as the number of tasks grows, leading to exponential increases in hyper-parameter tuning complexity. Furthermore, the simultaneous optimization of multiple objectives can negatively impact the performance of high-priority tasks due to interference from lower-priority tasks. In this paper, we introduce a novel multi-task learning framework employing Lagrangian Differential Multiplier Methods for step-wise multi-task optimization. It is designed to boost the performance of high-priority tasks without interference from other tasks. Its primary advantage lies in its ability to automatically optimize multiple objectives without requiring balancing hyper-parameters for different tasks, thereby eliminating the need for manual tuning. Additionally, we provide theoretical analysis demonstrating that our method ensures optimization guarantees, enhancing the reliability of the process. We demonstrate its effectiveness through experiments on multiple public datasets and its application in Taobao search, a large-scale industrial search ranking system, resulting in significant improvements across various business metrics. + +
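+ A minimal sketch of one prioritized training step under a Lagrangian formulation is given below: the low-priority loss is minimized subject to the high-priority loss staying below a target, with a gradient-ascent update on the multiplier. The target value, learning rates, and single-constraint setup are assumptions for illustration and not the paper's exact step-wise procedure.
+import torch
+
+def prioritized_step(model, batch, high_loss_fn, low_loss_fn, lam, target, opt, lam_lr=1e-2):
+    """One step: minimize the low-priority loss s.t. high-priority loss <= target."""
+    opt.zero_grad()
+    high = high_loss_fn(model, batch)          # high-priority task loss
+    low = low_loss_fn(model, batch)            # low-priority task loss
+    constraint = high - target                 # should be driven to <= 0
+    lagrangian = low + lam * constraint
+    lagrangian.backward()
+    opt.step()
+    with torch.no_grad():                      # gradient ascent on the multiplier
+        lam = torch.clamp(lam + lam_lr * constraint.detach(), min=0.0)
+    return lam, high.item(), low.item()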
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ RetroLLM: Empowering Large Language Models to Retrieve Fine-grained + Evidence within Generation + + +
+ Large language models (LLMs) exhibit remarkable generative capabilities but +often suffer from hallucinations. Retrieval-augmented generation (RAG) offers +an effective solution by incorporating external knowledge, but existing methods +still face several limitations: additional deployment costs of separate +retrievers, redundant input tokens from retrieved text chunks, and the lack of +joint optimization of retrieval and generation. To address these issues, we +propose \textbf{RetroLLM}, a unified framework that integrates retrieval and +generation into a single, cohesive process, enabling LLMs to directly generate +fine-grained evidence from the corpus with constrained decoding. Moreover, to +mitigate false pruning in the process of constrained evidence generation, we +introduce (1) hierarchical FM-Index constraints, which generate +corpus-constrained clues to identify a subset of relevant documents before +evidence generation, reducing irrelevant decoding space; and (2) a +forward-looking constrained decoding strategy, which considers the relevance of +future sequences to improve evidence accuracy. Extensive experiments on five +open-domain QA datasets demonstrate RetroLLM's superior performance across both +in-domain and out-of-domain tasks. The code is available at +\url{https://github.com/sunnynexus/RetroLLM}. + +
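+ The basic flavor of corpus-constrained decoding can be shown with a prefix trie that masks the next-token choices to continuations actually present in the corpus; RetroLLM's hierarchical FM-Index constraints and forward-looking strategy are considerably more involved, so the sketch below only illustrates the masking idea.
+def build_trie(sequences):
+    trie = {}
+    for seq in sequences:
+        node = trie
+        for tok in seq:
+            node = node.setdefault(tok, {})
+    return trie
+
+def allowed_next_tokens(trie, prefix):
+    node = trie
+    for tok in prefix:
+        node = node.get(tok)
+        if node is None:
+            return set()   # prefix not in the corpus: nothing is allowed
+    return set(node.keys())
+
+if __name__ == "__main__":
+    corpus = [["the", "eiffel", "tower", "is", "in", "paris"],
+              ["the", "eiffel", "tower", "was", "built", "in", "1889"]]
+    trie = build_trie(corpus)
+    print(allowed_next_tokens(trie, ["the", "eiffel", "tower"]))  # {'is', 'was'}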
+
+
+
+
+ + ☆ Investigating Mixture of Experts in Dense Retrieval + + +
+ While Dense Retrieval Models (DRMs) have advanced Information Retrieval (IR), +one limitation of these neural models is their narrow generalizability and +robustness. To cope with this issue, one can leverage the Mixture-of-Experts +(MoE) architecture. While previous IR studies have incorporated MoE +architectures within the Transformer layers of DRMs, our work investigates an +architecture that integrates a single MoE block (SB-MoE) after the output of +the final Transformer layer. Our empirical evaluation investigates how SB-MoE +compares, in terms of retrieval effectiveness, to standard fine-tuning. In +detail, we fine-tune three DRMs (TinyBERT, BERT, and Contriever) across four +benchmark collections with and without adding the MoE block. Moreover, since +MoE showcases performance variations with respect to its parameters (i.e., the +number of experts), we conduct additional experiments to investigate this +aspect further. The findings show the effectiveness of SB-MoE especially for +DRMs with a low number of parameters (i.e., TinyBERT), as it consistently +outperforms the fine-tuned underlying model on all four benchmarks. For DRMs +with a higher number of parameters (i.e., BERT and Contriever), SB-MoE requires +larger numbers of training samples to yield better retrieval performance. + +
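+ The single MoE block described above can be sketched as a softmax-gated mixture of expert MLPs applied to the retriever's pooled output embedding; the hidden size and number of experts here are illustrative assumptions, not the paper's configuration.
+import torch
+import torch.nn as nn
+
+class SingleMoEBlock(nn.Module):
+    """One MoE block placed after the final Transformer layer's pooled output."""
+    def __init__(self, dim=768, num_experts=4, hidden=3072):
+        super().__init__()
+        self.gate = nn.Linear(dim, num_experts)
+        self.experts = nn.ModuleList([
+            nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
+            for _ in range(num_experts)
+        ])
+
+    def forward(self, x):                                   # x: (batch, dim)
+        weights = torch.softmax(self.gate(x), dim=-1)       # (batch, num_experts)
+        expert_out = torch.stack([e(x) for e in self.experts], dim=1)
+        return (weights.unsqueeze(-1) * expert_out).sum(dim=1)
+
+if __name__ == "__main__":
+    print(SingleMoEBlock()(torch.randn(2, 768)).shape)      # torch.Size([2, 768])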
+
+
+
+
+ + ☆ SPGL: Enhancing Session-based Recommendation with Single Positive Graph + Learning ICONIP 2024 + + +
+ Session-based recommendation seeks to forecast the next item a user will be interested in, based on their interaction sequences. Because these interaction sequences are typically short, session-based recommendation faces the challenge of limited data availability. Traditional methods enhance feature learning by constructing complex models to generate positive and negative samples. This paper proposes a session-based recommendation model using Single Positive optimization loss and Graph Learning (SPGL) to deal with the problems of data sparsity, high model complexity, and weak transferability. SPGL utilizes graph convolutional networks to generate global item representations and batch session representations, effectively capturing intrinsic relationships between items. The use of a single positive optimization loss improves the uniformity of item representations, thereby enhancing recommendation accuracy. In the intent extractor, SPGL considers the hop count of the adjacency matrix when constructing the directed global graph to fully integrate spatial information. It also takes into account the reverse positional information of items when constructing session representations to incorporate temporal information. Comparative experiments across three benchmark datasets, Tmall, RetailRocket and Diginetica, demonstrate the model's effectiveness. The source code can be accessed at https://github.com/liang-tian-tian/SPGL. + +
+
+ comment: ICONIP 2024 +
+
+
+
+
+ + ☆ A Distributed Collaborative Retrieval Framework Excelling in All Queries + and Corpora based on Zero-shot Rank-Oriented Automatic Evaluation + + +
+ Numerous retrieval models, including sparse, dense and LLM-based methods, have demonstrated remarkable performance in predicting the relevance between queries and corpora. However, preliminary effectiveness analysis experiments indicate that these models fail to achieve satisfactory performance on the majority of queries and corpora, revealing that their effectiveness is restricted to specific scenarios. Thus, to tackle this problem, we propose a novel Distributed Collaborative Retrieval Framework (DCRF), outperforming each single model across all queries and corpora. Specifically, the framework integrates various retrieval models into a unified system and dynamically selects the optimal results for each user's query. It can easily aggregate any retrieval model and expand to any application scenario, illustrating its flexibility and scalability. Moreover, to reduce maintenance and training costs, we design four effective prompting strategies with large language models (LLMs) to evaluate the quality of ranks without relying on labeled data. Extensive experiments demonstrate that the proposed framework, combined with 8 efficient retrieval models, can achieve performance comparable to effective listwise methods like RankGPT and ListT5, while offering superior efficiency. Besides, DCRF surpasses all selected retrieval models on most datasets, indicating the effectiveness of our prompting strategies for rank-oriented automatic evaluation. + +
+
+
+
+
+ + ☆ Leveraging User-Generated Metadata of Online Videos for Cover Song + Identification + + +
+ YouTube is a rich source of cover songs. Since the platform itself is organized in terms of videos rather than songs, the retrieval of covers is not trivial. The field of cover song identification addresses this problem and provides approaches that usually rely on audio content. However, including the user-generated video metadata available on YouTube promises improved identification results. In this paper, we propose a multi-modal approach for cover song identification on online video platforms. We combine entity resolution models with audio-based approaches using a ranking model. Our findings indicate that leveraging user-generated metadata can stabilize cover song identification performance on YouTube. + +
+
+ comment: accepted for presentation at NLP for Music and Audio (NLP4MusA) 2024 +
+
+
+
+
+ + ☆ A Method for Detecting Legal Article Competition for Korean Criminal Law + Using a Case-augmented Mention Graph + + +
+ As social systems become increasingly complex, legal articles are also +growing more intricate, making it progressively harder for humans to identify +any potential competitions among them, particularly when drafting new laws or +applying existing laws. Despite this challenge, no method for detecting such +competitions has been proposed so far. In this paper, we propose a new legal AI +task called Legal Article Competition Detection (LACD), which aims to identify +competing articles within a given law. Our novel retrieval method, CAM-Re2, +outperforms existing relevant methods, reducing false positives by 20.8% and +false negatives by 8.3%, while achieving a 98.2% improvement in precision@5, +for the LACD task. We release our codes at +https://github.com/asmath472/LACD-public. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Establishing a Foundation for Tetun Text Ad-Hoc Retrieval: Indexing, + Stemming, Retrieval, and Ranking + + +
+ Searching for information on the internet and digital platforms to satisfy an information need requires effective retrieval solutions. However, such solutions are not yet available for Tetun, making it challenging to find relevant documents for text-based search queries in this language. To address these challenges, this study investigates Tetun text retrieval with a focus on the ad-hoc retrieval task. It begins by developing essential language resources -- including a list of stopwords, a stemmer, and a test collection -- which serve as foundational components for solutions tailored to Tetun text retrieval. Various strategies are then explored using both document titles and content to evaluate retrieval effectiveness. The results show that retrieving document titles, after removing hyphens and apostrophes without applying stemming, significantly improves retrieval performance compared to the baseline. Efficiency increases by 31.37%, while effectiveness achieves an average gain of 9.40% in MAP@10 and 30.35% in nDCG@10 with DFR BM25. Beyond the top-10 cutoff point, Hiemstra LM demonstrates strong performance across various retrieval strategies and evaluation metrics. Contributions of this work include the development of Labadain-Stopwords (a list of 160 Tetun stopwords), Labadain-Stemmer (a Tetun stemmer with three variants), and Labadain-Avaliadór (a Tetun test collection containing 59 topics, 33,550 documents, and 5,900 qrels). + +
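+ The most effective title strategy reported above (dropping hyphens and apostrophes, no stemming) amounts to a very small normalization step; a hedged sketch is given below, and it is not code from the paper.
+import re
+
+def normalize_title(title):
+    """Lowercase, strip hyphens and apostrophes, and tokenize for indexing."""
+    text = re.sub(r"[-'\u2019]", "", title.lower())
+    return " ".join(re.findall(r"\w+", text, flags=re.UNICODE))
+
+if __name__ == "__main__":
+    print(normalize_title("Governor's state-of-the-nation address"))  # placeholder title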
+
+
+
+
+ + ☆ Beyond Graph Convolution: Multimodal Recommendation with Topology-aware + MLPs AAAI 2025 + + +
+ Given the large volume of side information from different modalities, multimodal recommender systems have become increasingly vital, as they exploit richer semantic information beyond user-item interactions. Recent works highlight that leveraging Graph Convolutional Networks (GCNs) to explicitly model multimodal item-item relations can significantly enhance recommendation performance. However, due to the inherent over-smoothing issue of GCNs, existing models benefit only from shallow GCNs with limited representation power. This drawback is especially pronounced when facing complex and high-dimensional patterns such as multimodal data, as it requires large-capacity models to accommodate complicated correlations. To this end, in this paper, we investigate bypassing GCNs when modeling multimodal item-item relationships. More specifically, we propose a Topology-aware Multi-Layer Perceptron (TMLP), which uses MLPs instead of GCNs to model the relationships between items. TMLP enhances MLPs with topological pruning to denoise item-item relations and intra (inter)-modality learning to integrate higher-order modality correlations. Extensive experiments on three real-world datasets verify TMLP's superiority over nine baselines. We also find that by discarding the internal message passing in GCNs, which is sensitive to node connections, TMLP achieves significant improvements in both training efficiency and robustness over existing models. + +
+
+ comment: AAAI 2025. 11 pages, 9 figures +
+
+
+
+
+ + ☆ STAIR: Manipulating Collaborative and Multimodal Information for + E-Commerce Recommendation AAAI 2025 + + +
+ While the mining of modalities is the focus of most multimodal recommendation +methods, we believe that how to fully utilize both collaborative and multimodal +information is pivotal in e-commerce scenarios where, as clarified in this +work, the user behaviors are rarely determined entirely by multimodal features. +In order to combine the two distinct types of information, some additional +challenges are encountered: 1) Modality erasure: Vanilla graph convolution, +which proves rather useful in collaborative filtering, however erases +multimodal information; 2) Modality forgetting: Multimodal information tends to +be gradually forgotten as the recommendation loss essentially facilitates the +learning of collaborative information. To this end, we propose a novel approach +named STAIR, which employs a novel STepwise grAph convolution to enable a +co-existence of collaborative and multimodal Information in e-commerce +Recommendation. Besides, it starts with the raw multimodal features as an +initialization, and the forgetting problem can be significantly alleviated +through constrained embedding updates. As a result, STAIR achieves +state-of-the-art recommendation performance on three public e-commerce datasets +with minimal computational and memory costs. Our code is available at +https://github.com/yhhe2004/STAIR. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ☆ Future Sight and Tough Fights: Revolutionizing Sequential Recommendation + with FENRec AAAI 2025 + + +
+ Sequential recommendation (SR) systems predict user preferences by analyzing time-ordered interaction sequences. A common challenge for SR is data sparsity, as users typically interact with only a limited number of items. While contrastive learning has been employed in previous approaches to address these challenges, these methods often adopt binary labels, missing finer patterns and overlooking detailed information in subsequent behaviors of users. Additionally, they rely on random sampling to select negatives in contrastive learning, which may not yield sufficiently hard negatives during later training stages. In this paper, we propose Future data utilization with Enduring Negatives for contrastive learning in sequential Recommendation (FENRec). Our approach aims to leverage future data with time-dependent soft labels and generate enduring hard negatives from existing data, thereby enhancing the effectiveness in tackling data sparsity. Experimental results demonstrate our state-of-the-art performance across four benchmark datasets, with an average improvement of 6.16% across all metrics. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ Enhancing Healthcare Recommendation Systems with a Multimodal LLMs-based + MOE Architecture + + +
+ With the increasing availability of multimodal data, many fields urgently require advanced architectures capable of effectively integrating these diverse data sources to address specific problems. This study proposes a hybrid recommendation model that combines the Mixture of Experts (MOE) framework with large language models to enhance the performance of recommendation systems in the healthcare domain. We built a small dataset for recommending healthy food based on patient descriptions and evaluated the model's performance on several key metrics, including Precision, Recall, NDCG, and MAP@5. The experimental results show that the hybrid model outperforms the baseline models, which use MOE or large language models individually, in terms of both accuracy and personalized recommendation effectiveness. The paper also finds that image data provides relatively limited improvement to the personalized recommendation system, particularly in addressing the cold-start problem. In addition, image reclassification issues affect the recommendation results, especially for low-quality images or items whose appearance changes, leading to suboptimal performance. The findings provide valuable insights into the development of powerful, scalable, and high-performance recommendation systems, advancing the application of personalized recommendation technologies in real-world domains such as healthcare. + +
+
+ comment: 10 page, accpted by Conf-SMPL conference +
+
+
+
+
+ + ☆ Optimized Quran Passage Retrieval Using an Expanded QA Dataset and + Fine-Tuned Language Models + + +
+ Understanding the deep meanings of the Qur'an and bridging the language gap between modern standard Arabic and classical Arabic is essential to improve the question-and-answer system for the Holy Qur'an. The Qur'an QA 2023 shared task dataset had a limited number of questions, and models achieved weak retrieval performance on it. To address this challenge, this work updated the original dataset and improved the model accuracy. The original dataset, which contains 251 questions, was reviewed and expanded to 629 questions with question diversification and reformulation, leading to a comprehensive set of 1,895 questions categorized into single-answer, multi-answer, and zero-answer types. Extensive experiments were conducted to fine-tune transformer models, including AraBERT, RoBERTa, CAMeLBERT, AraELECTRA, and BERT. The best model, AraBERT-base, achieved a MAP@10 of 0.36 and MRR of 0.59, representing improvements of 63% and 59%, respectively, compared to the baseline scores (MAP@10: 0.22, MRR: 0.37). Additionally, the dataset expansion led to improvements in handling "no answer" cases, with the proposed approach achieving a 75% success rate for such instances, compared to the baseline's 25%. These results demonstrate the effect of dataset improvement and model architecture optimization in increasing the performance of QA systems for the Holy Qur'an, with higher accuracy, recall, and precision. + +
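+ For reference, the two reported metrics can be computed from a ranked passage list and the set of gold passages as sketched below; this is the standard definition of MAP@10 and MRR, not code from the paper.
+def average_precision_at_k(ranked, relevant, k=10):
+    hits, precision_sum = 0, 0.0
+    for i, doc in enumerate(ranked[:k], start=1):
+        if doc in relevant:
+            hits += 1
+            precision_sum += hits / i
+    return precision_sum / min(len(relevant), k) if relevant else 0.0
+
+def reciprocal_rank(ranked, relevant):
+    for i, doc in enumerate(ranked, start=1):
+        if doc in relevant:
+            return 1.0 / i
+    return 0.0
+
+if __name__ == "__main__":
+    print(average_precision_at_k(["p4", "p2", "p9"], {"p2", "p9"}))  # ~0.583
+    print(reciprocal_rank(["p4", "p2", "p9"], {"p2", "p9"}))         # 0.5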
+
+
+
+
+ + ♻ ☆ CURE: A dataset for Clinical Understanding & Retrieval Evaluation + + +
+ Given the dominance of dense retrievers that do not generalize well beyond +their training dataset distributions, domain-specific test sets are essential +in evaluating retrieval. There are few test datasets for retrieval systems +intended for use by healthcare providers in a point-of-care setting. To fill +this gap we have collaborated with medical professionals to create CURE, an +ad-hoc retrieval test dataset for passage ranking with 2000 queries spanning 10 +medical domains with a monolingual (English) and two cross-lingual +(French/Spanish -> English) conditions. In this paper, we describe how CURE was +constructed and provide baseline results to showcase its effectiveness as an +evaluation tool. CURE is published with a Creative Commons Attribution Non +Commercial 4.0 license and can be accessed on Hugging Face. + +
+
+
+
+
+ + ♻ ☆ Unified Multimodal Interleaved Document Representation for Retrieval + + +
+ Information Retrieval (IR) methods aim to identify documents relevant to a +query, which have been widely applied in various natural language tasks. +However, existing approaches typically consider only the textual content within +documents, overlooking the fact that documents can contain multiple modalities, +including images and tables. Also, they often segment each long document into +multiple discrete passages for embedding, which prevents them from capturing +the overall document context and interactions between paragraphs. To address +these two challenges, we propose a method that holistically embeds documents +interleaved with multiple modalities by leveraging the capability of recent +vision-language models that enable the processing and integration of text, +images, and tables into a unified format and representation. Moreover, to +mitigate the information loss from segmenting documents into passages, instead +of representing and retrieving passages individually, we further merge the +representations of segmented passages into one single document representation, +while we additionally introduce a reranking strategy to decouple and identify +the relevant passage within the document if necessary. Then, through extensive +experiments on diverse IR scenarios considering both the textual and multimodal +queries, we show that our approach substantially outperforms relevant +baselines, thanks to the consideration of the multimodal information within +documents. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Attention-Seeker: Dynamic Self-Attention Scoring for Unsupervised + Keyphrase Extraction COLING 2025 + + +
+ This paper proposes Attention-Seeker, an unsupervised keyphrase extraction +method that leverages self-attention maps from a Large Language Model to +estimate the importance of candidate phrases. Our approach identifies specific +components - such as layers, heads, and attention vectors - where the model +pays significant attention to the key topics of the text. The attention weights +provided by these components are then used to score the candidate phrases. +Unlike previous models that require manual tuning of parameters (e.g., +selection of heads, prompts, hyperparameters), Attention-Seeker dynamically +adapts to the input text without any manual adjustments, enhancing its +practical applicability. We evaluate Attention-Seeker on four publicly +available datasets: Inspec, SemEval2010, SemEval2017, and Krapivin. Our results +demonstrate that, even without parameter tuning, Attention-Seeker outperforms +most baseline models, achieving state-of-the-art performance on three out of +four datasets, particularly excelling in extracting keyphrases from long +documents. + +
+
+ comment: This version has been accepted for presentation at COLING 2025, and + all peer-reviewed changes have been incorporated +
+
+
+
+
+ + ♻ ☆ Text Proxy: Decomposing Retrieval from a 1-to-N Relationship into N + 1-to-1 Relationships for Text-Video Retrieval + + +
+ Text-video retrieval (TVR) has seen substantial advancements in recent years, +fueled by the utilization of pre-trained models and large language models +(LLMs). Despite these advancements, achieving accurate matching in TVR remains +challenging due to inherent disparities between video and textual modalities +and irregularities in data representation. In this paper, we propose +Text-Video-ProxyNet (TV-ProxyNet), a novel framework designed to decompose the +conventional 1-to-N relationship of TVR into N distinct 1-to-1 relationships. +By replacing a single text query with a series of text proxies, TV-ProxyNet not +only broadens the query scope but also achieves a more precise expansion. Each +text proxy is crafted through a refined iterative process, controlled by +mechanisms we term as the director and dash, which regulate the proxy's +direction and distance relative to the original text query. This setup not only +facilitates more precise semantic alignment but also effectively manages the +disparities and noise inherent in multimodal data. Our experiments on three +representative video-text retrieval benchmarks, MSRVTT, DiDeMo, and ActivityNet +Captions, demonstrate the effectiveness of TV-ProxyNet. The results show an +improvement of 2.0% to 3.3% in R@1 over the baseline. TV-ProxyNet achieved +state-of-the-art performance on MSRVTT and ActivityNet Captions, and a 2.0% +improvement on DiDeMo compared to existing methods, validating our approach's +ability to enhance semantic mapping and reduce error propensity. + +
+
+
+
+
+ + ♻ ☆ Predictive Models in Sequential Recommendations: Bridging Performance + Laws with Data Quality Insights + + +
+ Sequential Recommendation (SR) plays a critical role in predicting users' sequential preferences. Despite its growing prominence in various industries, the increasing scale of SR models incurs substantial computational costs and unpredictability, challenging developers to manage resources efficiently. Under this predicament, Scaling Laws have achieved significant success by examining the loss as models scale up. However, there remains a disparity between loss and model performance, which is of greater concern in practical applications. Moreover, as datasets continue to expand, they increasingly incorporate repetitive and inefficient samples. In response, we introduce the Performance Law for SR models, which aims to theoretically investigate and model the relationship between model performance and data quality. Specifically, we first fit the HR and NDCG metrics to transformer-based SR models. Subsequently, we propose Approximate Entropy (ApEn) to assess data quality, presenting a more nuanced approach compared to traditional data quantity metrics. Our method enables accurate predictions across various dataset scales and model sizes, demonstrating a strong correlation in large SR models and offering insights into achieving optimal performance for any given model configuration. + +
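+ Approximate Entropy itself has a standard definition, sketched below over a numeric sequence; how interaction data is mapped to such a sequence is the paper's design choice and is not reproduced here, and the m and r defaults are conventional values rather than the paper's settings.
+import numpy as np
+
+def approximate_entropy(series, m=2, r=0.2):
+    """ApEn(m, r): regularity of a sequence; lower values indicate more repetition."""
+    x = np.asarray(series, dtype=float)
+    n = len(x)
+
+    def phi(m):
+        windows = np.array([x[i:i + m] for i in range(n - m + 1)])
+        counts = [np.sum(np.max(np.abs(windows - w), axis=1) <= r) / (n - m + 1)
+                  for w in windows]
+        return np.mean(np.log(counts))
+
+    return phi(m) - phi(m + 1)
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    print(approximate_entropy(rng.random(200)))            # noisy series: higher ApEn
+    print(approximate_entropy(np.tile([0.1, 0.9], 100)))   # repetitive series: near 0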
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+
+
+
+ + Multimedia 14 + +
+
+
+ + ☆ Efficient Object-centric Representation Learning with Pre-trained + Geometric Prior + + +
+ This paper addresses key challenges in object-centric representation learning +of video. While existing approaches struggle with complex scenes, we propose a +novel weakly-supervised framework that emphasises geometric understanding and +leverages pre-trained vision models to enhance object discovery. Our method +introduces an efficient slot decoder specifically designed for object-centric +learning, enabling effective representation of multi-object scenes without +requiring explicit depth information. Results on synthetic video benchmarks +with increasing complexity in terms of objects and their movement, object +occlusion and camera motion demonstrate that our approach achieves comparable +performance to supervised methods while maintaining computational efficiency. +This advances the field towards more practical applications in complex +real-world scenarios. + +
+
+ comment: 6 pages, 4 Figures, 2 Tables +
+
+
+
+
+ + ☆ A Benchmark and Robustness Study of In-Context-Learning with Large + Language Models in Music Entity Detection + + +
+ Detecting music entities such as song titles or artist names is a useful application to help use cases like processing music search queries or analyzing music consumption on the web. Recent approaches incorporate smaller language models (SLMs) like BERT and achieve strong results. However, further research indicates a high influence of entity exposure during pre-training on the performance of the models. With the advent of large language models (LLMs), these have been shown to outperform SLMs in a variety of downstream tasks. However, researchers are still divided on whether this applies to tasks like entity detection in texts, due to issues like hallucination. In this paper, we provide a novel dataset of user-generated metadata and conduct a benchmark and a robustness study using recent LLMs with in-context-learning (ICL). Our results indicate that LLMs in the ICL setting yield higher performance than SLMs. We further uncover the large impact of entity exposure on the best performing LLM in our study. + +
+
+
+
+
+ + ☆ Leveraging User-Generated Metadata of Online Videos for Cover Song + Identification + + +
+ YouTube is a rich source of cover songs. Since the platform itself is organized in terms of videos rather than songs, the retrieval of covers is not trivial. The field of cover song identification addresses this problem and provides approaches that usually rely on audio content. However, including the user-generated video metadata available on YouTube promises improved identification results. In this paper, we propose a multi-modal approach for cover song identification on online video platforms. We combine entity resolution models with audio-based approaches using a ranking model. Our findings indicate that leveraging user-generated metadata can stabilize cover song identification performance on YouTube. + +
+
+ comment: accepted for presentation at NLP for Music and Audio (NLP4MusA) 2024 +
+
+
+
+
+ + ☆ GS-ProCams: Gaussian Splatting-based Projector-Camera Systems + + +
+ We present GS-ProCams, the first Gaussian Splatting-based framework for projector-camera systems (ProCams). GS-ProCams significantly enhances the efficiency of projection mapping (PM), which requires establishing geometric and radiometric mappings between the projector and the camera. Previous CNN-based ProCams are constrained to a specific viewpoint, limiting their applicability to novel perspectives. In contrast, NeRF-based ProCams support view-agnostic projection mapping; however, they require an additional colocated light source and demand significant computational and memory resources. To address these issues, we propose GS-ProCams, which employs 2D Gaussians for scene representation and enables efficient view-agnostic ProCams applications. In particular, we explicitly model the complex geometric and photometric mappings of ProCams using projector responses, the target surface's geometry and materials represented by Gaussians, and a global illumination component. Then, we employ differentiable physically-based rendering to jointly estimate them from captured multi-view projections. Compared to state-of-the-art NeRF-based methods, our GS-ProCams eliminates the need for additional devices, achieving superior ProCams simulation quality. It is also 600 times faster and uses only 1/10 of the GPU memory. + +
+
+
+
+
+ + ☆ Discrepancy-Aware Attention Network for Enhanced Audio-Visual Zero-Shot + Learning + + +
+ Audio-visual Zero-Shot Learning (ZSL) has attracted significant attention for its ability to identify unseen classes and perform well in video classification tasks. However, modal imbalance in (G)ZSL leads to over-reliance on the optimal modality, reducing discriminative capabilities for unseen classes. Some studies have attempted to address this issue by modifying parameter gradients, but two challenges still remain: (a) quality discrepancies, where modalities offer differing quantities and qualities of information for the same concept; and (b) content discrepancies, where sample contributions within a modality vary significantly. To address these challenges, we propose a Discrepancy-Aware Attention Network (DAAN) for Enhanced Audio-Visual ZSL. Our approach introduces a Quality-Discrepancy Mitigation Attention (QDMA) unit to minimize redundant information in the high-quality modality and a Contrastive Sample-level Gradient Modulation (CSGM) block to adjust gradient magnitudes and balance content discrepancies. We quantify modality contributions by integrating optimization and convergence rate for more precise gradient modulation in CSGM. Experiments demonstrate that DAAN achieves state-of-the-art performance on benchmark datasets, with ablation studies validating the effectiveness of individual modules. + +
+
+
+
+
+ + ☆ LMM-Regularized CLIP Embeddings for Image Classification + + +
+ In this paper, we address image classification tasks using the powerful CLIP vision-language model. Our goal is to advance the classification performance using CLIP's image encoder, by proposing a novel Large Multimodal Model (LMM) based regularization method. The proposed method uses an LMM to extract semantic descriptions for the images of the dataset. Then, it uses CLIP's frozen text encoder to obtain the corresponding text embeddings and compute the mean semantic class descriptions. Subsequently, we adapt CLIP's image encoder by adding a classification head and train it with an additional auxiliary objective alongside the main classification objective. The additional objective forces the embeddings at the image encoder's output to become similar to their corresponding LMM-generated mean semantic class descriptions. In this way, it produces embeddings with enhanced discrimination ability, leading to improved classification performance. The effectiveness of the proposed regularization method is validated through extensive experiments on three image classification datasets. + +
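+ The auxiliary objective described above can be sketched as a cosine-similarity penalty pulling image-encoder outputs toward the frozen mean text embedding of their class; the exact similarity form and the loss weight are assumptions used only for illustration.
+import torch
+import torch.nn.functional as F
+
+def lmm_regularized_loss(logits, image_embeds, labels, class_text_embeds, aux_weight=0.5):
+    """class_text_embeds: (num_classes, D) frozen mean LMM-description embeddings."""
+    ce = F.cross_entropy(logits, labels)                 # main classification objective
+    img = F.normalize(image_embeds, dim=-1)
+    txt = F.normalize(class_text_embeds[labels], dim=-1)
+    aux = 1.0 - (img * txt).sum(dim=-1).mean()           # 1 - cosine similarity
+    return ce + aux_weight * aux
+
+if __name__ == "__main__":
+    loss = lmm_regularized_loss(torch.randn(4, 10), torch.randn(4, 512),
+                                torch.tensor([0, 3, 7, 1]), torch.randn(10, 512))
+    print(loss.item())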
+
+ comment: Accepted for publication, 26th Int. Symp. on Multimedia (IEEE ISM + 2024), Tokyo, Japan, Dec. 2024. This is the authors' "accepted version" +
+
+
+
+
+ + ☆ VG-TVP: Multimodal Procedural Planning via Visually Grounded Text-Video + Prompting AAAI + + +
+ Large Language Model (LLM)-based agents have shown promise in procedural +tasks, but the potential of multimodal instructions augmented by texts and +videos to assist users remains under-explored. To address this gap, we propose +the Visually Grounded Text-Video Prompting (VG-TVP) method which is a novel +LLM-empowered Multimodal Procedural Planning (MPP) framework. It generates +cohesive text and video procedural plans given a specified high-level +objective. The main challenges are achieving textual and visual +informativeness, temporal coherence, and accuracy in procedural plans. VG-TVP +leverages the zero-shot reasoning capability of LLMs, the video-to-text +generation ability of the video captioning models, and the text-to-video +generation ability of diffusion models. VG-TVP improves the interaction between +modalities by proposing a novel Fusion of Captioning (FoC) method and using +Text-to-Video Bridge (T2V-B) and Video-to-Text Bridge (V2T-B). They allow LLMs +to guide the generation of visually-grounded text plans and textual-grounded +video plans. To address the scarcity of datasets suitable for MPP, we have +curated a new dataset called Daily-Life Task Procedural Plans (Daily-PP). We +conduct comprehensive experiments and benchmarks to evaluate human preferences +(regarding textual and visual informativeness, temporal coherence, and plan +accuracy). Our VG-TVP method outperforms unimodal baselines on the Daily-PP +dataset. + +
+
+ comment: Accepted for The 39th Annual AAAI Conference on Artificial + Intelligence 2025 in Main Track, 19 pages, 24 figures +
+
+
+
+
+ + ☆ DLF: Disentangled-Language-Focused Multimodal Sentiment Analysis AAAI 2025 + + +
+ Multimodal Sentiment Analysis (MSA) leverages heterogeneous modalities, such +as language, vision, and audio, to enhance the understanding of human +sentiment. While existing models often focus on extracting shared information +across modalities or directly fusing heterogeneous modalities, such approaches +can introduce redundancy and conflicts due to equal treatment of all modalities +and the mutual transfer of information between modality pairs. To address these +issues, we propose a Disentangled-Language-Focused (DLF) multimodal +representation learning framework, which incorporates a feature disentanglement +module to separate modality-shared and modality-specific information. To +further reduce redundancy and enhance language-targeted features, four +geometric measures are introduced to refine the disentanglement process. A +Language-Focused Attractor (LFA) is further developed to strengthen language +representation by leveraging complementary modality-specific information +through a language-guided cross-attention mechanism. The framework also employs +hierarchical predictions to improve overall accuracy. Extensive experiments on +two popular MSA datasets, CMU-MOSI and CMU-MOSEI, demonstrate the significant +performance gains achieved by the proposed DLF framework. Comprehensive +ablation studies further validate the effectiveness of the feature +disentanglement module, language-focused attractor, and hierarchical +predictions. Our code is available at https://github.com/pwang322/DLF. + +
+
+ comment: AAAI 2025 accepted +
+
+
+
+
+ + ♻ ☆ IRR: Image Review Ranking Framework for Evaluating Vision-Language + Models COLING25 + + +
+ Large-scale Vision-Language Models (LVLMs) process both images and text, +excelling in multimodal tasks such as image captioning and description +generation. However, while these models excel at generating factual content, +their ability to generate and evaluate texts reflecting perspectives on the +same image, depending on the context, has not been sufficiently explored. To +address this, we propose IRR: Image Review Rank, a novel evaluation framework +designed to assess critic review texts from multiple perspectives. IRR +evaluates LVLMs by measuring how closely their judgments align with human +interpretations. We validate it using a dataset of images from 15 categories, +each with five critic review texts and annotated rankings in both English and +Japanese, totaling over 2,000 data instances. The datasets are available at +https://hf.co/datasets/naist-nlp/Wiki-ImageReview1.0. Our results indicate +that, although LVLMs exhibited consistent performance across languages, their +correlation with human annotations was insufficient, highlighting the need for +further advancements. These findings highlight the limitations of current +evaluation methods and the need for approaches that better capture human +reasoning in Vision & Language tasks. + +
+
+ comment: 18pages, Accepted at COLING25 +
+
+
+
+
+ + ♻ ☆ Wills Aligner: Multi-Subject Collaborative Brain Visual Decoding AAAI 2025 + + +
+ Decoding visual information from human brain activity has seen remarkable +advancements in recent research. However, the diversity in cortical +parcellation and fMRI patterns across individuals has prompted the development +of deep learning models tailored to each subject. The personalization limits +the broader applicability of brain visual decoding in real-world scenarios. To +address this issue, we introduce Wills Aligner, a novel approach designed to +achieve multi-subject collaborative brain visual decoding. Wills Aligner begins +by aligning the fMRI data from different subjects at the anatomical level. It +then employs delicate mixture-of-brain-expert adapters and a meta-learning +strategy to account for individual fMRI pattern differences. Additionally, +Wills Aligner leverages the semantic relation of visual stimuli to guide the +learning of inter-subject commonality, enabling visual decoding for each +subject to draw insights from other subjects' data. We rigorously evaluate our +Wills Aligner across various visual decoding tasks, including classification, +cross-modal retrieval, and image reconstruction. The experimental results +demonstrate that Wills Aligner achieves promising performance. + +
+
+ comment: AAAI 2025, 16 pages +
+
+
+
+
+ + ♻ ☆ MindTuner: Cross-Subject Visual Decoding with Visual Fingerprint and + Semantic Correction AAAI 2025 + + +
+ Decoding natural visual scenes from brain activity has flourished, with extensive research on single-subject tasks but less on cross-subject tasks. Reconstructing high-quality images in cross-subject tasks is a challenging problem due to profound individual differences between subjects and the scarcity of data annotation. In this work, we propose MindTuner for cross-subject visual decoding, which achieves high-quality and rich semantic reconstructions using only 1 hour of fMRI training data, benefiting from the phenomenon of visual fingerprints in the human visual system and a novel fMRI-to-text alignment paradigm. Firstly, we pre-train a multi-subject model among 7 subjects and fine-tune it with scarce data on new subjects, where LoRAs with Skip-LoRAs are utilized to learn the visual fingerprint. Then, we take the image modality as the intermediate pivot modality to achieve fMRI-to-text alignment, which achieves impressive fMRI-to-text retrieval performance and corrects fMRI-to-image reconstruction with fine-tuned semantics. The results of both qualitative and quantitative analyses demonstrate that MindTuner surpasses state-of-the-art cross-subject visual decoding models on the Natural Scenes Dataset (NSD), whether using training data of 1 hour or 40 hours. + +
+
+ comment: AAAI 2025, 14 pages +
+
+
+
+
+ + ♻ ☆ Text Proxy: Decomposing Retrieval from a 1-to-N Relationship into N + 1-to-1 Relationships for Text-Video Retrieval + + +
+ Text-video retrieval (TVR) has seen substantial advancements in recent years, +fueled by the utilization of pre-trained models and large language models +(LLMs). Despite these advancements, achieving accurate matching in TVR remains +challenging due to inherent disparities between video and textual modalities +and irregularities in data representation. In this paper, we propose +Text-Video-ProxyNet (TV-ProxyNet), a novel framework designed to decompose the +conventional 1-to-N relationship of TVR into N distinct 1-to-1 relationships. +By replacing a single text query with a series of text proxies, TV-ProxyNet not +only broadens the query scope but also achieves a more precise expansion. Each +text proxy is crafted through a refined iterative process, controlled by +mechanisms we term as the director and dash, which regulate the proxy's +direction and distance relative to the original text query. This setup not only +facilitates more precise semantic alignment but also effectively manages the +disparities and noise inherent in multimodal data. Our experiments on three +representative video-text retrieval benchmarks, MSRVTT, DiDeMo, and ActivityNet +Captions, demonstrate the effectiveness of TV-ProxyNet. The results show an +improvement of 2.0% to 3.3% in R@1 over the baseline. TV-ProxyNet achieved +state-of-the-art performance on MSRVTT and ActivityNet Captions, and a 2.0% +improvement on DiDeMo compared to existing methods, validating our approach's +ability to enhance semantic mapping and reduce error propensity. + +
+
+
+
+
+ + ♻ ☆ Towards Effective User Attribution for Latent Diffusion Models via + Watermark-Informed Blending + + +
+ Rapid advancements in multimodal large language models have enabled the +creation of hyper-realistic images from textual descriptions. However, these +advancements also raise significant concerns about unauthorized use, which +hinders their broader distribution. Traditional watermarking methods often +require complex integration or degrade image quality. To address these +challenges, we introduce a novel framework Towards Effective user Attribution +for latent diffusion models via Watermark-Informed Blending (TEAWIB). TEAWIB +incorporates a unique ready-to-use configuration approach that allows seamless +integration of user-specific watermarks into generative models. This approach +ensures that each user can directly apply a pre-configured set of parameters to +the model without altering the original model parameters or compromising image +quality. Additionally, noise and augmentation operations are embedded at the +pixel level to further secure and stabilize watermarked images. Extensive +experiments validate the effectiveness of TEAWIB, showcasing the +state-of-the-art performance in perceptual quality and attribution accuracy. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Benchmarking VLMs' Reasoning About Persuasive Atypical Images + + +
+ Vision language models (VLMs) have shown strong zero-shot generalization across various tasks, especially when integrated with large language models (LLMs). However, their ability to comprehend rhetorical and persuasive visual media, such as advertisements, remains understudied. Ads often employ atypical imagery, using surprising object juxtapositions to convey shared properties. For example, Fig. 1 (e) shows a beer with a feather-like texture. This requires advanced reasoning to deduce that this atypical representation signifies the beer's lightness. We introduce three novel tasks, Multi-label Atypicality Classification, Atypicality Statement Retrieval, and Atypical Object Recognition, to benchmark VLMs' understanding of atypicality in persuasive images. We evaluate how well VLMs use atypicality to infer an ad's message and test their reasoning abilities by employing semantically challenging negatives. Finally, we pioneer atypicality-aware verbalization by extracting comprehensive image descriptions sensitive to atypical elements. Our findings reveal that: (1) VLMs lack advanced reasoning capabilities compared to LLMs; (2) simple, effective strategies can extract atypicality-aware information, leading to comprehensive image verbalization; (3) atypicality aids persuasive advertisement understanding. Code and data will be made available. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 7 + +
+
+
+ + ☆ Task-Oriented Dialog Systems for the Senegalese Wolof Language COLING 2025 + + +
+ In recent years, we have seen considerable interest in conversational agents with the rise of large language models (LLMs). Although they offer substantial advantages, LLMs also present significant risks, such as hallucination, which hinder their widespread deployment in industry. Moreover, low-resource languages, such as many African languages, are still underrepresented in these systems, limiting their performance in these languages. In this paper, we illustrate a more classical approach based on modular architectures of Task-oriented Dialog Systems (ToDS), which offer better control over outputs. We propose a chatbot generation engine based on the Rasa framework and a robust methodology for projecting annotations onto the Wolof language using an in-house machine translation system. After evaluating a generated chatbot trained on the Amazon Massive dataset, our Wolof Intent Classifier performs similarly to the one obtained for French, which is a resource-rich language. We also show that this approach is extensible to other low-resource languages, thanks to the intent classifier's language-agnostic pipeline, simplifying the design of chatbots in these languages. + +
+
+ comment: 10 pages, 3 tables, 6 figures, The 31st International Conference on + Computational Linguistics (COLING 2025) +
+
+
+
+
+ + ☆ A multi-theoretical kernel-based approach to social network-based + recommendation + + +
+ Recommender systems are a critical component of e-commerce websites. The rapid development of online social networking services provides an opportunity to explore social networks together with information used in traditional recommender systems, such as customer demographics, product characteristics, and transactions. It also provides more applications for recommender systems. To tackle this social network-based recommendation problem, previous studies generally built trust models in light of the social influence theory. This study inspects a spectrum of social network theories to systematically model the multiple facets of a social network and infer user preferences. In order to effectively make use of these heterogeneous theories, we adopt a kernel-based machine learning paradigm, design and select kernels describing individual similarities according to social network theories, and employ a non-linear multiple kernel learning algorithm to combine the kernels into a unified model. This design also enables us to consider multiple theories' interactions in assessing individual behaviors. We evaluate our proposed approach on a real-world movie review data set. The experiments show that our approach provides more accurate recommendations than trust-based methods and the collaborative filtering approach. Further analysis shows that kernels derived from contagion theory and homophily theory contribute a larger portion of the model. + +
+
+
+
+
+ + ☆ Modeling the Heterogeneous Duration of User Interest in Time-Dependent + Recommendation: A Hidden Semi-Markov Approach + + +
+ Recommender systems are widely used for suggesting books, education +materials, and products to users by exploring their behaviors. In reality, +users' preferences often change over time, leading to studies on time-dependent +recommender systems. However, most existing approaches that deal with time +information remain primitive. In this paper, we extend existing methods and +propose a hidden semi-Markov model to track the change of users' interests. +Particularly, this model allows for capturing the different durations of user +stays in a (latent) interest state, which can better model the heterogeneity of +user interests and focuses. We derive an expectation maximization algorithm to +estimate the parameters of the framework and predict users' actions. +Experiments on three real-world datasets show that our model significantly +outperforms the state-of-the-art time-dependent and static benchmark methods. +Further analyses of the experiment results indicate that the performance +improvement is related to the heterogeneity of state durations and the drift of +user interests in the dataset. + +
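+ The generative picture of a hidden semi-Markov model is that each latent interest state persists for an explicitly modeled duration before the chain transitions; the toy sampler below uses Poisson durations and hand-picked parameters purely for illustration, not values estimated by the paper's EM procedure.
+import numpy as np
+
+def sample_hsmm(pi, trans, duration_means, emit, length, seed=0):
+    """pi: initial state probs; trans: transition matrix (no self-loops);
+    duration_means: mean stay length per state; emit: per-state item probabilities."""
+    rng = np.random.default_rng(seed)
+    states, items = [], []
+    s = rng.choice(len(pi), p=pi)
+    while len(items) < length:
+        d = 1 + rng.poisson(duration_means[s])     # explicit duration in state s
+        for _ in range(d):
+            states.append(s)
+            items.append(rng.choice(len(emit[s]), p=emit[s]))
+            if len(items) == length:
+                break
+        s = rng.choice(len(trans[s]), p=trans[s])  # move to the next state
+    return states, items
+
+if __name__ == "__main__":
+    pi = [0.5, 0.5]
+    trans = [[0.0, 1.0], [1.0, 0.0]]
+    emit = [[0.7, 0.2, 0.1], [0.1, 0.2, 0.7]]
+    print(sample_hsmm(pi, trans, duration_means=[3, 1], emit=emit, length=12))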
+
+
+
+
+ + ☆ Multi-Graph Co-Training for Capturing User Intent in Session-based + Recommendation COLING 2025 + + +
+ Session-based recommendation focuses on predicting the next item a user will +interact with based on sequences of anonymous user sessions. A significant +challenge in this field is data sparsity due to the typically short-term +interactions. Most existing methods rely heavily on users' current +interactions, overlooking the wealth of auxiliary information available. To +address this, we propose a novel model, the Multi-Graph Co-Training model +(MGCOT), which leverages not only the current session graph but also similar +session graphs and a global item relation graph. This approach allows for a +more comprehensive exploration of intrinsic relationships and better captures +user intent from multiple views, enabling session representations to complement +each other. Additionally, MGCOT employs multi-head attention mechanisms to +effectively capture relevant session intent and uses contrastive learning to +form accurate and robust session representations. Extensive experiments on +three datasets demonstrate that MGCOT significantly enhances the performance of +session-based recommendations, particularly on the Diginetica dataset, +achieving improvements up to 2.00% in P@20 and 10.70% in MRR@20. Resources have +been made publicly available in our GitHub repository +https://github.com/liang-tian-tian/MGCOT. + +
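+ One ingredient mentioned above, contrastive learning across graph views, is
+commonly implemented as an InfoNCE objective between two embeddings of the same
+session (say, from the current-session graph and the global item graph). The
+snippet below is that generic formulation, not MGCOT's exact loss or
+architecture.
+
+```python
+# Generic InfoNCE loss between two views of the same batch of sessions.
+import torch
+import torch.nn.functional as F
+
+def info_nce(z1, z2, temperature=0.2):
+    """z1, z2: (batch, dim) session embeddings from two graph views."""
+    z1, z2 = F.normalize(z1, dim=-1), F.normalize(z2, dim=-1)
+    logits = z1 @ z2.t() / temperature      # pairwise cosine similarities
+    labels = torch.arange(z1.size(0))       # positives lie on the diagonal
+    return F.cross_entropy(logits, labels)
+
+z_session = torch.randn(32, 64)             # view from the current-session graph
+z_global = torch.randn(32, 64)              # view from the global item-relation graph
+print(info_nce(z_session, z_global).item())
+```
+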
+
+ comment: COLING 2025 Main Conference +
+
+
+
+
+ + ☆ Leveraging Large Vision-Language Model as User Intent-aware Encoder for + Composed Image Retrieval AAAI 2025 + + +
+ Composed Image Retrieval (CIR) aims to retrieve target images from a candidate
+set using a hybrid-modality query consisting of a reference image and a
+relative caption that describes the user intent. Recent studies attempt to
+utilize Vision-Language Pre-training Models (VLPMs) with various fusion
+strategies for addressing the task. However, these methods typically fail to
+simultaneously meet two key requirements of CIR: comprehensively extracting
+visual information and faithfully following the user intent. In this work, we
+propose CIR-LVLM, a novel framework that leverages a large vision-language
+model (LVLM) as a powerful user intent-aware encoder to better meet these
+requirements. Our motivation is to explore the advanced reasoning and
+instruction-following capabilities of LVLMs for accurately understanding and
+responding to the user intent. Furthermore, we design a novel hybrid intent
+instruction module to provide explicit intent guidance at two levels: (1) The
+task prompt clarifies the task requirement and assists the model in discerning
+user intent at the task level. (2) The instance-specific soft prompt, which is
+adaptively selected from a learnable prompt pool, enables the model to better
+comprehend the user intent at the instance level compared to a universal prompt
+for all instances. CIR-LVLM achieves state-of-the-art performance across three
+prominent benchmarks with acceptable inference efficiency. We believe this
+study provides fundamental insights into CIR-related fields.
+
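+ The "instance-specific soft prompt selected from a learnable prompt pool" can
+be pictured with a generic prompt-pool layer in the spirit of
+learning-to-prompt methods: an instance feature attends over pool keys and
+retrieves a weighted mixture of soft prompts. Shapes and names below are
+illustrative assumptions, not CIR-LVLM's actual module.
+
+```python
+# Generic learnable prompt pool: soft selection of prompts per instance.
+import torch
+import torch.nn as nn
+
+class PromptPool(nn.Module):
+    def __init__(self, pool_size=10, prompt_len=4, dim=512):
+        super().__init__()
+        self.keys = nn.Parameter(torch.randn(pool_size, dim))
+        self.prompts = nn.Parameter(torch.randn(pool_size, prompt_len, dim))
+
+    def forward(self, instance_feat):                    # (batch, dim)
+        scores = torch.softmax(instance_feat @ self.keys.t(), dim=-1)
+        # Mix prompts according to instance-key similarity: (batch, len, dim).
+        return torch.einsum("bp,pld->bld", scores, self.prompts)
+
+pool = PromptPool()
+query_feat = torch.randn(8, 512)           # e.g., a fused image+caption feature
+print(pool(query_feat).shape)              # torch.Size([8, 4, 512])
+```
+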
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ RecSys Arena: Pair-wise Recommender System Evaluation with Large + Language Models + + +
+ Evaluating the quality of recommender systems is critical for algorithm
+design and optimization. Most evaluation methods are computed based on offline
+metrics for quick algorithm evolution, since online experiments are usually
+risky and time-consuming. However, offline evaluation usually cannot fully
+reflect users' preference for the outcome of different recommendation
+algorithms, and the results may not be consistent with online A/B tests.
+Moreover, many offline metrics such as AUC do not offer sufficient information
+for comparing the subtle differences between two competitive recommender
+systems in different aspects, which may lead to substantial performance
+differences in long-term online serving. Fortunately, due to the strong
+commonsense knowledge and role-play capability of large language models (LLMs),
+it is possible to obtain simulated user feedback on offline recommendation
+results. Motivated by the idea of LLM Chatbot Arena, in this paper we present
+the idea of RecSys Arena, where the recommendation results given by two
+different recommender systems in each session are evaluated by an LLM judge to
+obtain fine-grained evaluation feedback. More specifically, for each sample we
+use an LLM to generate a user profile description based on user behavior
+history or off-the-shelf profile features, which is then used to guide the LLM
+to play the role of this user and evaluate the relative preference for two
+recommendation results generated by different models. Through extensive
+experiments on two recommendation datasets in different scenarios, we
+demonstrate that many different LLMs not only provide general evaluation
+results that are highly consistent with canonical offline metrics, but also
+provide rich insights into many subjective aspects. Moreover, this approach can
+better distinguish different algorithms with comparable performance in terms of
+AUC and nDCG.
+
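+ The pairwise-judging step can be sketched as plain prompt construction: given
+a generated user profile and two recommendation lists, the judge model is asked
+which list that user would prefer. The wording and the `call_llm` stub below
+are placeholders, not the paper's prompts or API.
+
+```python
+# Sketch of a pairwise LLM judgment over two recommendation lists.
+def call_llm(prompt: str) -> str:
+    raise NotImplementedError("wire this to any chat-completion backend")
+
+def judge_pair(user_profile, list_a, list_b):
+    prompt = (
+        "You are the following user:\n"
+        f"{user_profile}\n\n"
+        "Two recommender systems produced these lists for you.\n"
+        f"List A: {', '.join(list_a)}\n"
+        f"List B: {', '.join(list_b)}\n\n"
+        "Which list would you prefer overall? Answer with exactly 'A' or 'B' "
+        "and one sentence of justification."
+    )
+    answer = call_llm(prompt)
+    return "A" if answer.strip().upper().startswith("A") else "B"
+```
+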
+
+
+
+
+ + ♻ ☆ Algorithmic Collusion or Competition: the Role of Platforms' Recommender + Systems + + +
+ Recent scholarly work has extensively examined the phenomenon of algorithmic +collusion driven by AI-enabled pricing algorithms. However, online platforms +commonly deploy recommender systems that influence how consumers discover and +purchase products, thereby shaping the reward structures faced by pricing +algorithms and ultimately affecting competition dynamics and equilibrium +outcomes. To address this gap in the literature and elucidate the role of +recommender systems, we propose a novel repeated game framework that integrates +several key components. We first develop a structural search model to +characterize consumers' decision-making processes in response to varying +recommendation sets. This model incorporates both observable and unobservable +heterogeneity in utility and search cost functions, and is estimated using +real-world data. Building on the resulting consumer model, we formulate +personalized recommendation algorithms designed to maximize either platform +revenue or consumer utility. We further introduce pricing algorithms for +sellers and integrate all these elements to facilitate comprehensive numerical +experiments. Our experimental findings reveal that a revenue-maximizing +recommender system intensifies algorithmic collusion, whereas a +utility-maximizing recommender system encourages more competitive pricing +behavior among sellers. Intriguingly, and contrary to conventional insights +from the industrial organization and choice modeling literature, increasing the +size of recommendation sets under a utility-maximizing regime does not +consistently enhance consumer utility. Moreover, the degree of horizontal +differentiation moderates this phenomenon in unexpected ways. The "more is +less" effect does not arise at low levels of differentiation, but becomes +increasingly pronounced as horizontal differentiation increases. + +
+
+ comment: 54 pages, 4 figures, 16 tables +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ Provably Secure Robust Image Steganography via Cross-Modal Error + Correction AAAI 2025 + + +
+ The rapid development of image generation models has facilitated the
+widespread dissemination of generated images on social networks, creating
+favorable conditions for provably secure image steganography. However, existing
+methods face issues such as low quality of generated images and lack of
+semantic control in the generation process. To leverage provably secure
+steganography with more effective and high-performance image generation models,
+and to ensure that secret messages can still be accurately extracted from stego
+images even after they are uploaded to social networks and subjected to lossy
+processing such as JPEG compression, we propose a high-quality, provably
+secure, and robust image steganography method based on state-of-the-art
+autoregressive (AR) image generation models using Vector-Quantized (VQ)
+tokenizers. Additionally, we employ a cross-modal error-correction framework
+that generates stego text from stego images to aid in restoring lossy images,
+ultimately enabling the extraction of secret messages embedded within the
+images. Extensive experiments have demonstrated that the proposed method
+provides advantages in stego quality, embedding capacity, and robustness, while
+ensuring provable undetectability.
+
+
+ comment: 7 pages. Accepted by AAAI 2025 +
+
+
+
+
+ + ♻ ☆ DyRoNet: Dynamic Routing and Low-Rank Adapters for Autonomous Driving + Streaming Perception WACV 2025 + + +
+ The advancement of autonomous driving systems hinges on the ability to +achieve low-latency and high-accuracy perception. To address this critical +need, this paper introduces Dynamic Routing Network (DyRoNet), a low-rank +enhanced dynamic routing framework designed for streaming perception in +autonomous driving systems. DyRoNet integrates a suite of pre-trained branch +networks, each meticulously fine-tuned to function under distinct environmental +conditions. At its core, the framework offers a speed router module, developed +to assess and route input data to the most suitable branch for processing. This +approach not only addresses the inherent limitations of conventional models in +adapting to diverse driving conditions but also ensures the balance between +performance and efficiency. Extensive experimental evaluations demonstrate the +adaptability of DyRoNet to diverse branch selection strategies, resulting in +significant performance enhancements across different scenarios. This work +establishes a new benchmark for streaming perception and provides valuable +engineering insights for future work. + +
+
+ comment: Accepted to WACV 2025. 17 pages, 8 figures. Project: + https://tastevision.github.io/DyRoNet/ +
+
+
+
+
+ + ♻ ☆ Training-and-Prompt-Free General Painterly Harmonization via Zero-Shot + Disentenglement on Style and Content References + + +
+ Painterly image harmonization aims at seamlessly blending disparate visual
+elements within a single image. However, previous approaches often struggle due
+to limitations in training data or reliance on additional prompts, leading to
+inharmonious and content-disrupted outputs. To surmount these hurdles, we design
+a Training-and-prompt-Free General Painterly Harmonization method (TF-GPH).
+TF-GPH incorporates a novel ``Similarity Disentangle Mask'', which disentangles
+the foreground content and background image by redirecting their attention to
+corresponding reference images, enhancing the attention mechanism for
+multi-image inputs. Additionally, we propose a ``Similarity Reweighting''
+mechanism to balance harmonization between stylization and content
+preservation. This mechanism minimizes content disruption by prioritizing the
+content-similar features within the given background style reference. Finally,
+we address the deficiencies in existing benchmarks by proposing novel
+range-based evaluation metrics and a new benchmark to better reflect real-world
+applications. Extensive experiments demonstrate the efficacy of our method in
+all benchmarks. More details are available at https://github.com/BlueDyee/TF-GPH.
+
+
+
+
+
+ + ♻ ☆ HeGTa: Leveraging Heterogeneous Graph-enhanced Large Language Models for + Few-shot Complex Table Understanding AAAI 2025 + + +
+ Table understanding (TU) has achieved promising advancements, but it faces
+the challenges of the scarcity of manually labeled tables and the presence of
+complex table structures. To address these challenges, we propose HGT, a
+framework with a heterogeneous graph (HG)-enhanced large language model (LLM)
+to tackle few-shot TU tasks. It leverages the LLM by aligning the table
+semantics with the LLM's parametric knowledge through soft prompts and
+instruction tuning, and deals with complex tables via a multi-task pre-training
+scheme involving three novel multi-granularity self-supervised HG pre-training
+objectives. We empirically demonstrate the effectiveness of HGT, showing that it
+outperforms the SOTA for few-shot complex TU on several benchmarks.
+
+
+ comment: AAAI 2025 +
+
+
+
+
+ + ♻ ☆ CAMSIC: Content-aware Masked Image Modeling Transformer for Stereo Image + Compression AAAI 2025 + + +
+ Existing learning-based stereo image codecs adopt sophisticated transformations
+with simple entropy models derived from single-image codecs to encode latent
+representations. However, those entropy models struggle to effectively capture
+the spatial-disparity characteristics inherent in stereo images, which leads to
+suboptimal rate-distortion results. In this paper, we propose a stereo image
+compression framework, named CAMSIC. CAMSIC independently transforms each image
+into a latent representation and employs a powerful decoder-free Transformer
+entropy model to capture both spatial and disparity dependencies, by
+introducing a novel content-aware masked image modeling (MIM) technique. Our
+content-aware MIM facilitates efficient bidirectional interaction between prior
+information and estimated tokens, which naturally obviates the need for an
+extra Transformer decoder. Experiments show that our stereo image codec
+achieves state-of-the-art rate-distortion performance on two stereo image
+datasets, Cityscapes and InStereo2K, with fast encoding and decoding speed. Code
+is available at https://github.com/Xinjie-Q/CAMSIC.
+
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 12 + +
+
+
+ + ☆ CRENER: A Character Relation Enhanced Chinese NER Model + + +
+ Chinese Named Entity Recognition (NER) is an important task in information
+extraction, which has a significant impact on downstream applications. Due to
+the lack of natural separators in Chinese, previous NER methods mostly relied
+on external dictionaries to enrich the semantic and boundary information of
+Chinese words. However, such methods may introduce noise that affects the
+accuracy of named entity recognition. To this end, we propose a character
+relation enhanced Chinese NER model (CRENER). This model defines four types of
+tags that reflect the relationships between characters and performs fine-grained
+modeling of three kinds of relations: adjacency relations between characters,
+relations between characters and tags, and relations between tags, to more
+accurately identify entity boundaries and improve Chinese NER accuracy.
+Specifically, we transform the Chinese NER task into a character-character
+relationship classification task, ensuring the accuracy of entity boundary
+recognition through joint modeling of relation tags. To enhance the model's
+ability to understand contextual information, CRENER further adopts an adapted
+Transformer encoder that combines unscaled direction-aware and distance-aware
+masked self-attention mechanisms. Moreover, a relationship representation
+enhancement module is constructed to model predefined relationship tags,
+effectively mining the relationship representations between characters and
+tags. Experiments conducted on four well-known Chinese NER benchmark datasets
+show that the proposed model outperforms state-of-the-art baselines. Ablation
+experiments also demonstrate the effectiveness of the proposed model.
+
+
+
+
+
+ + ☆ Why Not Together? A Multiple-Round Recommender System for Queries and + Items KDD 2025 + + +
+ A fundamental technique of recommender systems involves modeling user
+preferences, where queries and items are widely used as symbolic
+representations of user interests. Queries delineate user needs at an abstract
+level, providing a high-level description, whereas items operate on a more
+specific and concrete level, representing the granular facets of user
+preference. While practical, both query and item recommendations encounter the
+challenge of sparse user feedback. To this end, we propose a novel approach
+named Multiple-round Auto Guess-and-Update System (MAGUS) that capitalizes on
+the synergies between both types, allowing us to leverage both query and item
+information to form user interests. This integrated system introduces a
+recursive framework that can be applied to any recommendation method to
+exploit queries and items in historical interactions and to provide
+recommendations for both queries and items in each interaction round. Empirical
+results from testing 12 different recommendation methods demonstrate that
+integrating queries into item recommendations via MAGUS significantly enhances
+the efficiency with which users can identify their preferred items during
+multiple-round interactions.
+
+
+ comment: KDD 2025 +
+
+
+
+
+ + ☆ Learned Data Compression: Challenges and Opportunities for the Future + + +
+ Compressing integer keys is a fundamental operation among multiple +communities, such as database management (DB), information retrieval (IR), and +high-performance computing (HPC). Recent advances in \emph{learned indexes} +have inspired the development of \emph{learned compressors}, which leverage +simple yet compact machine learning (ML) models to compress large-scale sorted +keys. The core idea behind learned compressors is to \emph{losslessly} encode +sorted keys by approximating them with \emph{error-bounded} ML models (e.g., +piecewise linear functions) and using a \emph{residual array} to guarantee +accurate key reconstruction. + While the concept of learned compressors remains in its early stages of +exploration, our benchmark results demonstrate that an SIMD-optimized learned +compressor can significantly outperform state-of-the-art CPU-based compressors. +Drawing on our preliminary experiments, this vision paper explores the +potential of learned data compression to enhance critical areas in DBMS and +related domains. Furthermore, we outline the key technical challenges that +existing systems must address when integrating this emerging methodology. + +
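+ A stripped-down version of the core idea, model the sorted keys and keep a
+residual array so reconstruction stays lossless, fits in a few lines. A single
+least-squares line stands in here for the error-bounded piecewise models used
+in practice, so the residual bound is only empirical in this toy.
+
+```python
+# Minimal learned-compressor sketch: one linear fit over sorted keys plus
+# residuals that allow exact reconstruction of every key.
+import numpy as np
+
+keys = np.sort(np.random.default_rng(0).integers(0, 1_000_000, size=10_000))
+positions = np.arange(keys.size)
+
+slope, intercept = np.polyfit(positions, keys, deg=1)      # the tiny "model"
+predictions = np.rint(slope * positions + intercept).astype(np.int64)
+residuals = keys - predictions                              # typically small values
+
+restored = predictions + residuals                          # lossless by construction
+assert np.array_equal(restored, keys)
+print("max |residual| =", np.abs(residuals).max())          # needs far fewer bits than raw keys
+```
+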
+
+
+
+
+ + ☆ Enhancing Event Extraction from Short Stories through Contextualized + Prompts + + +
+ Event extraction is an important natural language processing (NLP) task of
+identifying events in an unstructured text. Although a plethora of works deal
+with event extraction from news articles, clinical text, etc., only a few works
+focus on event extraction from literary content. Detecting events in short
+stories presents several challenges to current systems, encompassing a
+different distribution of events as compared to other domains and the portrayal
+of diverse emotional conditions. This paper presents \texttt{Vrittanta-EN}, a
+collection of 1000 English short stories annotated for real events. Exploring
+this field could result in the creation of techniques and resources that
+support literary scholars in improving their effectiveness. This could
+simultaneously influence the field of Natural Language Processing. Our
+objective is to clarify the intricate idea of events in the context of short
+stories. Towards this objective, we collected 1,000 short stories written mostly
+for children in the Indian context. Further, we present fresh guidelines for
+annotating event mentions and their categories, organized into \textit{seven
+distinct classes}. The classes are {\tt{COGNITIVE-MENTAL-STATE(CMS),
+COMMUNICATION(COM), CONFLICT(CON), GENERAL-ACTIVITY(GA), LIFE-EVENT(LE),
+MOVEMENT(MOV), and OTHERS(OTH)}}. Subsequently, we apply these guidelines to
+annotate the short story dataset. Later, we apply baseline methods for
+automatically detecting and categorizing events. We also propose a prompt-based
+method for event detection and classification. The proposed method outperforms
+the baselines, with a significant improvement of more than 4\% for the
+class \texttt{CONFLICT} in the event classification task.
+
+
+ comment: 47 pages, 8 figures, Planning to submit in Elsevier (Computer Speech + and Language Journal) +
+
+
+
+
+ + ☆ Sentiment and Hashtag-aware Attentive Deep Neural Network for Multimodal + Post Popularity Prediction + + +
+ Social media users articulate their opinions on a broad spectrum of subjects
+and share their experiences through posts comprising multiple modes of
+expression, leading to a notable surge in such multimodal content on social
+media platforms. Nonetheless, accurately forecasting the popularity of these
+posts presents a considerable challenge. Prevailing methodologies primarily
+center on the content itself, thereby overlooking the wealth of information
+encapsulated within alternative modalities, such as visual demographics and
+sentiments conveyed through hashtags, and fail to adequately model the intricate
+relationships among hashtags, texts, and accompanying images. This oversight
+limits the ability to capture emotional connection and audience relevance,
+which significantly influence post popularity. To address these limitations, we
+propose a seNtiment and hAshtag-aware attentive deep neuRal netwoRk for
+multimodAl posT pOpularity pRediction, herein referred to as NARRATOR, that
+extracts visual demographics from faces appearing in images and discerns
+sentiment from hashtag usage, providing a more comprehensive understanding of
+the factors influencing post popularity. Moreover, we introduce a hashtag-guided
+attention mechanism that leverages hashtags as navigational cues, guiding the
+model's focus toward the most pertinent features of textual and visual
+modalities, thus aligning with target audience interests and the broader social
+media context. Experimental results demonstrate that NARRATOR outperforms
+existing methods by a significant margin on two real-world datasets.
+Furthermore, ablation studies underscore the efficacy of integrating visual
+demographics, sentiment analysis of hashtags, and hashtag-guided attention
+mechanisms in enhancing the performance of post popularity prediction, thereby
+facilitating increased audience relevance, emotional engagement, and aesthetic
+appeal.
+
+
+
+
+
+ + ☆ Movie Recommendation using Web Crawling + + +
+ In today's digital world, streaming platforms offer a vast array of movies,
+making it hard for users to find content matching their preferences. This paper
+explores integrating real-time data from popular movie websites using advanced
+HTML scraping techniques and APIs. It also incorporates a recommendation system
+trained on a static Kaggle dataset, enhancing the relevance and freshness of
+suggestions. By combining content-based filtering, collaborative filtering, and
+a hybrid model, we create a system that utilizes both historical and real-time
+data for more personalized suggestions. Our methodology shows that
+incorporating dynamic data not only boosts user satisfaction but also aligns
+recommendations with current viewing trends.
+
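+ The hybrid step described above ultimately reduces to blending two score
+sources. A toy sketch, assuming content-based and collaborative-filtering
+scores have already been computed elsewhere:
+
+```python
+# Toy hybrid ranker: blend content-based and collaborative-filtering scores.
+def hybrid_rank(content_scores, cf_scores, alpha=0.6):
+    """Both inputs: dict movie_id -> score in [0, 1]; alpha weights the content side."""
+    movies = set(content_scores) | set(cf_scores)
+    blended = {m: alpha * content_scores.get(m, 0.0)
+                  + (1 - alpha) * cf_scores.get(m, 0.0) for m in movies}
+    return sorted(blended, key=blended.get, reverse=True)
+
+print(hybrid_rank({"m1": 0.9, "m2": 0.4}, {"m2": 0.8, "m3": 0.7})[:3])
+```
+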
+
+ comment: 12 pages, 3 figures, Accepted and to be published in Proceedings of + 2025 International Conference on Applied Algorithms (ICAA), Kolkata, India, + Dec 8-10, 2025 +
+
+
+
+
+ + ☆ Beyond Quantile Methods: Improved Top-K Threshold Estimation for + Traditional and Learned Sparse Indexes + + +
+ Top-k threshold estimation is the problem of estimating the score of the k-th +highest ranking result of a search query. A good estimate can be used to speed +up many common top-k query processing algorithms, and thus a number of +researchers have recently studied the problem. Among the various approaches +that have been proposed, quantile methods appear to give the best estimates +overall at modest computational costs, followed by sampling-based methods in +certain cases. In this paper, we make two main contributions. First, we study +how to get even better estimates than the state of the art. Starting from +quantile-based methods, we propose a series of enhancements that give improved +estimates in terms of the commonly used mean under-prediction fraction (MUF). +Second, we study the threshold estimation problem on recently proposed learned +sparse index structures, showing that our methods also work well for these +cases. Our best methods substantially narrow the gap between the state of the +art and the ideal MUF of 1.0, at some additional cost in time and space. + +
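+ For orientation, the sampling-based family mentioned above (a baseline here,
+not the proposed quantile enhancements) can be sketched in a few lines: score a
+random p-fraction of the collection and read the estimated top-k threshold off
+the scaled rank k*p of the sample.
+
+```python
+# Sampling-based top-k threshold estimate on synthetic per-document scores.
+import numpy as np
+
+def sample_threshold(scores, k, p=0.05, seed=0):
+    rng = np.random.default_rng(seed)
+    sample = rng.choice(scores, size=max(1, int(len(scores) * p)), replace=False)
+    k_eff = max(1, int(round(k * p)))            # rank scaled by the sampling rate
+    return np.partition(sample, -k_eff)[-k_eff]  # k_eff-th highest sampled score
+
+scores = np.random.default_rng(1).pareto(2.0, size=100_000)  # synthetic query scores
+true_threshold = np.partition(scores, -10)[-10]               # exact 10th-highest score
+print(true_threshold, sample_threshold(scores, k=10))
+```
+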
+
+
+
+
+ + ☆ UCDR-Adapter: Exploring Adaptation of Pre-Trained Vision-Language Models + for Universal Cross-Domain Retrieval WACV 2025 + + +
+ Universal Cross-Domain Retrieval (UCDR) retrieves relevant images from unseen +domains and classes without semantic labels, ensuring robust generalization. +Existing methods commonly employ prompt tuning with pre-trained vision-language +models but are inherently limited by static prompts, reducing adaptability. We +propose UCDR-Adapter, which enhances pre-trained models with adapters and +dynamic prompt generation through a two-phase training strategy. First, Source +Adapter Learning integrates class semantics with domain-specific visual +knowledge using a Learnable Textual Semantic Template and optimizes Class and +Domain Prompts via momentum updates and dual loss functions for robust +alignment. Second, Target Prompt Generation creates dynamic prompts by +attending to masked source prompts, enabling seamless adaptation to unseen +domains and classes. Unlike prior approaches, UCDR-Adapter dynamically adapts +to evolving data distributions, enhancing both flexibility and generalization. +During inference, only the image branch and generated prompts are used, +eliminating reliance on textual inputs for highly efficient retrieval. +Extensive benchmark experiments show that UCDR-Adapter consistently outperforms +ProS in most cases and other state-of-the-art methods on UCDR, U(c)CDR, and +U(d)CDR settings. + +
+
+ comment: Accepted to WACV 2025. Project link: + https://github.com/fine68/UCDR2024 +
+
+
+
+
+ + ☆ USM: Unbiased Survey Modeling for Limiting Negative User Experiences in + Recommendation Systems + + +
+ Negative feedback signals are crucial to guardrail content recommendations +and improve user experience. When these signals are effectively integrated into +recommendation systems, they play a vital role in preventing the promotion of +harmful or undesirable content, thereby contributing to a healthier online +environment. However, the challenges associated with negative signals are +noteworthy. Due to the limited visibility of options for users to express +negative feedback, these signals are often sparse compared to positive signals. +This imbalance can lead to a skewed understanding of user preferences, +resulting in recommendations that prioritize short-term engagement over +long-term satisfaction. Moreover, an over-reliance on positive signals can +create a filter bubble, where users are continuously exposed to content that +aligns with their immediate preferences but may not be beneficial in the long +run. This scenario can ultimately lead to user attrition as audiences become +disillusioned with the quality of the content provided. Additionally, existing +user signals frequently fail to meet specific customized requirements, such as +understanding the underlying reasons for a user's likes or dislikes regarding a +video. This lack of granularity hinders our ability to tailor content +recommendations effectively, as we cannot identify the particular attributes of +content that resonate with individual users. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ An Offline Metric for the Debiasedness of Click Models SIGIR23 + + +
+ A well-known problem when learning from user clicks is the inherent biases
+prevalent in the data, such as position or trust bias. Click models are a
+common method for extracting information from user clicks, such as document
+relevance in web search, or for estimating click biases for downstream
+applications such as counterfactual learning-to-rank, ad placement, or fair
+ranking. Recent work shows that the current evaluation practices in the
+community fail to guarantee that a well-performing click model generalizes well
+to downstream tasks in which the ranking distribution differs from the training
+distribution, i.e., under covariate shift. In this work, we propose an
+evaluation metric based on conditional independence testing to detect a lack of
+robustness to covariate shift in click models. We introduce the concept of
+debiasedness in click modeling and derive a metric for measuring it. In
+extensive semi-synthetic experiments, we show that our proposed metric helps to
+predict the downstream performance of click models under covariate shift and is
+useful in an off-policy model selection setting.
+
+
+ comment: SIGIR23 - Full paper +
+
+
+
+
+ + ♻ ☆ Overview of TREC 2024 Biomedical Generative Retrieval (BioGen) Track + + +
+ With the advancement of large language models (LLMs), the biomedical domain
+has seen significant progress and improvement in multiple tasks such as
+biomedical question answering, lay language summarization of the biomedical
+literature, clinical note summarization, etc. However, hallucinations or
+confabulations remain one of the key challenges when using LLMs in the
+biomedical and other domains. Inaccuracies may be particularly harmful in
+high-risk situations, such as medical question answering, making clinical
+decisions, or appraising biomedical research. Studies on the evaluation of
+LLMs' abilities to ground generated statements in verifiable sources have shown
+that models perform significantly worse on lay-user-generated questions, and
+often fail to reference relevant sources. This can be problematic when those
+seeking information want evidence from studies to back up the claims from LLMs.
+Unsupported statements are a major barrier to using LLMs in any applications
+that may affect health. Methods for grounding generated statements in reliable
+sources, along with practical evaluation approaches, are needed to overcome this
+barrier. Towards this, in our pilot task organized at TREC 2024, we introduced
+the task of reference attribution as a means to mitigate the generation of
+false statements by LLMs answering biomedical questions.
+
+
+
+
+
+ + ♻ ☆ Arctic-Embed 2.0: Multilingual Retrieval Without Compromise + + +
+ This paper presents the training methodology of Arctic-Embed 2.0, a set of +open-source text embedding models built for accurate and efficient multilingual +retrieval. While prior works have suffered from degraded English retrieval +quality, Arctic-Embed 2.0 delivers competitive retrieval quality on +multilingual and English-only benchmarks, and supports Matryoshka +Representation Learning (MRL) for efficient embedding storage with +significantly lower compressed quality degradation compared to alternatives. We +detail the design and implementation, presenting several important open +research questions that arose during model development. We conduct experiments +exploring these research questions and include extensive discussion aimed at +fostering further discussion in this field. + +
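+ Matryoshka Representation Learning, mentioned above, trains embeddings whose
+leading dimensions remain useful on their own, so index storage can be cut by
+truncating and re-normalizing vectors. A minimal sketch of that retrieval-side
+usage (not the training objective, and with random stand-in vectors):
+
+```python
+# Using Matryoshka-style embeddings at reduced dimensionality.
+import numpy as np
+
+def truncate_embeddings(emb, dim):
+    cut = emb[:, :dim]
+    return cut / np.linalg.norm(cut, axis=1, keepdims=True)
+
+full = np.random.default_rng(0).normal(size=(1000, 768))   # stand-in document vectors
+docs256 = truncate_embeddings(full, 256)                    # 3x smaller index
+query256 = truncate_embeddings(full[:1], 256)
+scores = docs256 @ query256.T                               # cosine similarity (unit norm)
+print(np.argsort(-scores.ravel())[:5])                      # top-5 document ids
+```
+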
+
+ comment: 10 pages, 5 figures, 3 tables +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ VinTAGe: Joint Video and Text Conditioning for Holistic Audio Generation + + +
+ Recent advances in audio generation have focused on text-to-audio (T2A) and
+video-to-audio (V2A) tasks. However, T2A or V2A methods cannot generate
+holistic sounds (onscreen and offscreen). This is because T2A cannot generate
+sounds aligned with onscreen objects, while V2A cannot generate semantically
+complete audio (offscreen sounds are missing). In this work, we address the task
+of holistic audio generation: given a video and a text prompt, we aim to
+generate both onscreen and offscreen sounds that are temporally synchronized
+with the video and semantically aligned with the text and video. Previous
+approaches for joint text- and video-to-audio generation often suffer from
+modality bias, favoring one modality over the other. To overcome this
+limitation, we introduce VinTAGe, a flow-based transformer model that jointly
+considers text and video to guide audio generation. Our framework comprises two
+key components: a Visual-Text Encoder and a Joint VT-SiT model. To reduce
+modality bias and improve generation quality, we employ pretrained uni-modal
+text-to-audio and video-to-audio generation models for additional guidance. Due
+to the lack of appropriate benchmarks, we also introduce VinTAGe-Bench, a
+dataset of 636 video-text-audio pairs containing both onscreen and offscreen
+sounds. Our comprehensive experiments on VinTAGe-Bench demonstrate that joint
+text and visual interaction is necessary for holistic audio generation.
+Furthermore, VinTAGe achieves state-of-the-art results on the VGGSound
+benchmark. Our source code and pre-trained models will be released. Demo is
+available at: https://www.youtube.com/watch?v=QmqWhUjPkJI.
+
+
+
+
+
+ + ☆ Patch-level Sounding Object Tracking for Audio-Visual Question Answering AAAI 2025 + + +
+ Answering questions related to audio-visual scenes, i.e., the AVQA task, is +becoming increasingly popular. A critical challenge is accurately identifying +and tracking sounding objects related to the question along the timeline. In +this paper, we present a new Patch-level Sounding Object Tracking (PSOT) +method. It begins with a Motion-driven Key Patch Tracking (M-KPT) module, which +relies on visual motion information to identify salient visual patches with +significant movements that are more likely to relate to sounding objects and +questions. We measure the patch-wise motion intensity map between neighboring +video frames and utilize it to construct and guide a motion-driven graph +network. Meanwhile, we design a Sound-driven KPT (S-KPT) module to explicitly +track sounding patches. This module also involves a graph network, with the +adjacency matrix regularized by the audio-visual correspondence map. The M-KPT +and S-KPT modules are performed in parallel for each temporal segment, allowing +balanced tracking of salient and sounding objects. Based on the tracked +patches, we further propose a Question-driven KPT (Q-KPT) module to retain +patches highly relevant to the question, ensuring the model focuses on the most +informative clues. The audio-visual-question features are updated during the +processing of these modules, which are then aggregated for final answer +prediction. Extensive experiments on standard datasets demonstrate the +effectiveness of our method, achieving competitive performance even compared to +recent large-scale pretraining-based approaches. + +
+
+ comment: Accepted by AAAI 2025 +
+
+
+
+
+ + ☆ MambaPro: Multi-Modal Object Re-Identification with Mamba Aggregation + and Synergistic Prompt AAAI2025 + + +
+ Multi-modal object Re-IDentification (ReID) aims to retrieve specific objects
+by utilizing complementary image information from different modalities.
+Recently, large-scale pre-trained models like CLIP have demonstrated impressive
+performance in traditional single-modal object ReID tasks. However, they remain
+unexplored for multi-modal object ReID. Furthermore, current multi-modal
+aggregation methods have obvious limitations in dealing with long sequences
+from different modalities. To address the above issues, we introduce a novel
+framework called MambaPro for multi-modal object ReID. To be specific, we first
+employ a Parallel Feed-Forward Adapter (PFA) to adapt CLIP to multi-modal
+object ReID. Then, we propose the Synergistic Residual Prompt (SRP) to guide
+the joint learning of multi-modal features. Finally, leveraging Mamba's
+superior scalability for long sequences, we introduce Mamba Aggregation (MA) to
+efficiently model interactions between different modalities. As a result,
+MambaPro can extract more robust features with lower complexity. Extensive
+experiments on three multi-modal object ReID benchmarks (i.e., RGBNT201,
+RGBNT100 and MSVR310) validate the effectiveness of our proposed methods. The
+source code is available at https://github.com/924973292/MambaPro.
+
+
+ comment: This work is accepted by AAAI2025. More modifications may be + performed +
+
+
+
+
+ + ☆ UCDR-Adapter: Exploring Adaptation of Pre-Trained Vision-Language Models + for Universal Cross-Domain Retrieval WACV 2025 + + +
+ Universal Cross-Domain Retrieval (UCDR) retrieves relevant images from unseen +domains and classes without semantic labels, ensuring robust generalization. +Existing methods commonly employ prompt tuning with pre-trained vision-language +models but are inherently limited by static prompts, reducing adaptability. We +propose UCDR-Adapter, which enhances pre-trained models with adapters and +dynamic prompt generation through a two-phase training strategy. First, Source +Adapter Learning integrates class semantics with domain-specific visual +knowledge using a Learnable Textual Semantic Template and optimizes Class and +Domain Prompts via momentum updates and dual loss functions for robust +alignment. Second, Target Prompt Generation creates dynamic prompts by +attending to masked source prompts, enabling seamless adaptation to unseen +domains and classes. Unlike prior approaches, UCDR-Adapter dynamically adapts +to evolving data distributions, enhancing both flexibility and generalization. +During inference, only the image branch and generated prompts are used, +eliminating reliance on textual inputs for highly efficient retrieval. +Extensive benchmark experiments show that UCDR-Adapter consistently outperforms +ProS in most cases and other state-of-the-art methods on UCDR, U(c)CDR, and +U(d)CDR settings. + +
+
+ comment: Accepted to WACV 2025. Project link: + https://github.com/fine68/UCDR2024 +
+
+
+
+
+ + ☆ Hidden Echoes Survive Training in Audio To Audio Generative Instrument + Models AAAI + + +
+ As generative techniques pervade the audio domain, there has been increasing
+interest in tracing back through these complicated models to understand how
+they draw on their training data to synthesize new examples, both to ensure
+that they use properly licensed data and also to elucidate their black-box
+behavior. In this paper, we show that if imperceptible echoes are hidden in the
+training data, a wide variety of audio-to-audio architectures (differentiable
+digital signal processing (DDSP), Realtime Audio Variational autoEncoder
+(RAVE), and ``Dance Diffusion'') will reproduce these echoes in their outputs.
+Hiding a single echo is particularly robust across all architectures, but we
+also show promising results hiding longer time-spread echo patterns for an
+increased information capacity. We conclude by showing that echoes make their
+way into fine-tuned models, that they survive mixing/demixing, and that they
+survive pitch-shift augmentation during training. Hence, this simple, classical
+idea in watermarking shows significant promise for tagging generative audio
+models.
+
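+ The "single hidden echo" above is classical echo hiding: add a delayed,
+attenuated copy of the signal, then look for a bump at that lag later on. The
+toy below uses made-up delay/amplitude values and a normalized autocorrelation
+for detection (the paper's pipeline and parameters may differ).
+
+```python
+# Toy echo watermark: embed x[n] + alpha * x[n - d], detect via autocorrelation.
+import numpy as np
+
+def embed_echo(x, delay=200, alpha=0.05):
+    y = x.copy()
+    y[delay:] += alpha * x[:-delay]
+    return y
+
+def echo_strength(y, delay=200, window=50):
+    ac = np.correlate(y, y, mode="full")[len(y) - 1:]   # autocorrelation, lags >= 0
+    ac = ac / ac[0]
+    background = np.median(ac[delay + window: delay + 10 * window])
+    return ac[delay], background                        # peak >> background => echo present
+
+rng = np.random.default_rng(0)
+audio = rng.normal(size=8000)            # white-noise stand-in for an audio excerpt
+print(echo_strength(embed_echo(audio)))  # clear peak at the embedded lag
+print(echo_strength(audio))              # no embedded echo, for comparison
+```
+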
+
+ comment: 8 pages, 11 Figures, Proceedings of 2025 AAAI Workshop on AI for + Music +
+
+
+
+
+ + ♻ ☆ Sample then Identify: A General Framework for Risk Control and + Assessment in Multimodal Large Language Models + + +
+ Multimodal Large Language Models (MLLMs) exhibit promising advancements +across various tasks, yet they still encounter significant trustworthiness +issues. Prior studies apply Split Conformal Prediction (SCP) in language +modeling to construct prediction sets with statistical guarantees. However, +these methods typically rely on internal model logits or are restricted to +multiple-choice settings, which hampers their generalizability and adaptability +in dynamic, open-ended environments. In this paper, we introduce TRON, a +two-step framework for risk control and assessment, applicable to any MLLM that +supports sampling in both open-ended and closed-ended scenarios. TRON comprises +two main components: (1) a novel conformal score to sample response sets of +minimum size, and (2) a nonconformity score to identify high-quality responses +based on self-consistency theory, controlling the error rates by two specific +risk levels. Furthermore, we investigate semantic redundancy in prediction sets +within open-ended contexts for the first time, leading to a promising +evaluation metric for MLLMs based on average set size. Our comprehensive +experiments across four Video Question-Answering (VideoQA) datasets utilizing +eight MLLMs show that TRON achieves desired error rates bounded by two +user-specified risk levels. Additionally, deduplicated prediction sets maintain +adaptiveness while being more efficient and stable for risk assessment under +different risk levels. + +
+
+
+
+
+ + ♻ ☆ EgoSonics: Generating Synchronized Audio for Silent Egocentric Videos WACV 2025 + + +
+ We introduce EgoSonics, a method to generate semantically meaningful and +synchronized audio tracks conditioned on silent egocentric videos. Generating +audio for silent egocentric videos could open new applications in virtual +reality, assistive technologies, or for augmenting existing datasets. Existing +work has been limited to domains like speech, music, or impact sounds and +cannot capture the broad range of audio frequencies found in egocentric videos. +EgoSonics addresses these limitations by building on the strengths of latent +diffusion models for conditioned audio synthesis. We first encode and process +paired audio-video data to make them suitable for generation. The encoded data +is then used to train a model that can generate an audio track that captures +the semantics of the input video. Our proposed SyncroNet builds on top of +ControlNet to provide control signals that enables generation of temporally +synchronized audio. Extensive evaluations and a comprehensive user study show +that our model outperforms existing work in audio quality, and in our proposed +synchronization evaluation method. Furthermore, we demonstrate downstream +applications of our model in improving video summarization. + +
+
+ comment: WACV 2025 +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ 

diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/collapse all entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the chosen theme
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the chosen theme
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`