From 5924bfac786e0f09e04a25755bfa6169e3b0e006 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Sun, 6 Oct 2024 05:25:44 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 22397 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 22792 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..08b4565a --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-10-03T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2410.02763v1","updated":"2024-10-03T17:59:58Z","published":"2024-10-03T17:59:58Z","title":"Vinoground: Scrutinizing LMMs over Dense Temporal Reasoning with Short\n Videos","summary":" There has been growing sentiment recently that modern large multimodal models\n(LMMs) have addressed most of the key challenges related to short video\ncomprehension. As a result, both academia and industry are gradually shifting\ntheir attention towards the more complex challenges posed by understanding\nlong-form videos. However, is this really the case? Our studies indicate that\nLMMs still lack many fundamental reasoning capabilities even when dealing with\nshort videos. We introduce Vinoground, a temporal counterfactual LMM evaluation\nbenchmark encompassing 1000 short and natural video-caption pairs. We\ndemonstrate that existing LMMs severely struggle to distinguish temporal\ndifferences between different actions and object transformations. For example,\nthe best model GPT-4o only obtains ~50% on our text and video scores, showing a\nlarge gap compared to the human baseline of ~90%. All open-source multimodal\nmodels and CLIP-based models perform much worse, producing mostly random chance\nperformance. Through this work, we shed light onto the fact that temporal\nreasoning in short videos is a problem yet to be fully solved. The dataset and\nevaluation code are available at https://vinoground.github.io.\n","authors":["Jianrui Zhang","Mu Cai","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2410.02763v1.pdf","comment":"Project Page: https://vinoground.github.io"},{"id":"http://arxiv.org/abs/2404.10917v2","updated":"2024-10-03T17:59:55Z","published":"2024-04-16T21:33:05Z","title":"Which questions should I answer? Salience Prediction of Inquisitive\n Questions","summary":" Inquisitive questions -- open-ended, curiosity-driven questions people ask as\nthey read -- are an integral part of discourse processing (Kehler and Rohde,\n2017; Onea, 2016) and comprehension (Prince, 2004). Recent work in NLP has\ntaken advantage of question generation capabilities of LLMs to enhance a wide\nrange of applications. But the space of inquisitive questions is vast: many\nquestions can be evoked from a given context. So which of those should be\nprioritized to find answers? Linguistic theories, unfortunately, have not yet\nprovided an answer to this question. This paper presents QSALIENCE, a salience\npredictor of inquisitive questions. QSALIENCE is instruction-tuned over our\ndataset of linguist-annotated salience scores of 1,766 (context, question)\npairs. 
A question scores high on salience if answering it would greatly enhance\nthe understanding of the text (Van Rooy, 2003). We show that highly salient\nquestions are empirically more likely to be answered in the same article,\nbridging potential questions (Onea, 2016) with Questions Under Discussion\n(Roberts, 2012). We further validate our findings by showing that answering\nsalient questions is an indicator of summarization quality in news.\n","authors":["Yating Wu","Ritika Mangla","Alexandros G. Dimakis","Greg Durrett","Junyi Jessy Li"],"pdf_url":"https://arxiv.org/pdf/2404.10917v2.pdf","comment":"Camera Ready for EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2410.02760v1","updated":"2024-10-03T17:59:30Z","published":"2024-10-03T17:59:30Z","title":"Erasing Conceptual Knowledge from Language Models","summary":" Concept erasure in language models has traditionally lacked a comprehensive\nevaluation framework, leading to incomplete assessments of effectiveness of\nerasure methods. We propose an evaluation paradigm centered on three critical\ncriteria: innocence (complete knowledge removal), seamlessness (maintaining\nconditional fluent generation), and specificity (preserving unrelated task\nperformance). Our evaluation metrics naturally motivate the development of\nErasure of Language Memory (ELM), a new method designed to address all three\ndimensions. ELM employs targeted low-rank updates to alter output distributions\nfor erased concepts while preserving overall model capabilities including\nfluency when prompted for an erased concept. We demonstrate ELM's efficacy on\nbiosecurity, cybersecurity, and literary domain erasure tasks. Comparative\nanalysis shows that ELM achieves superior performance across our proposed\nmetrics, including near-random scores on erased topic assessments, generation\nfluency, maintained accuracy on unrelated benchmarks, and robustness under\nadversarial attacks. Our code, data, and trained models are available at\nhttps://elm.baulab.info\n","authors":["Rohit Gandikota","Sheridan Feucht","Samuel Marks","David Bau"],"pdf_url":"https://arxiv.org/pdf/2410.02760v1.pdf","comment":"Project Page: https://elm.baulab.info"},{"id":"http://arxiv.org/abs/2410.02756v1","updated":"2024-10-03T17:58:55Z","published":"2024-10-03T17:58:55Z","title":"CorPipe at CRAC 2024: Predicting Zero Mentions from Raw Text","summary":" We present CorPipe 24, the winning entry to the CRAC 2024 Shared Task on\nMultilingual Coreference Resolution. In this third iteration of the shared\ntask, a novel objective is to also predict empty nodes needed for zero\ncoreference mentions (while the empty nodes were given on input in previous\nyears). This way, coreference resolution can be performed on raw text. We\nevaluate two model variants: a~two-stage approach (where the empty nodes are\npredicted first using a pretrained encoder model and then processed together\nwith sentence words by another pretrained model) and a single-stage approach\n(where a single pretrained encoder model generates empty nodes, coreference\nmentions, and coreference links jointly). In both settings, CorPipe surpasses\nother participants by a large margin of 3.9 and 2.8 percent points,\nrespectively. 
The source code and the trained model are available at\nhttps://github.com/ufal/crac2024-corpipe .\n","authors":["Milan Straka"],"pdf_url":"https://arxiv.org/pdf/2410.02756v1.pdf","comment":"Accepted to CRAC 2024"},{"id":"http://arxiv.org/abs/2410.02755v1","updated":"2024-10-03T17:58:29Z","published":"2024-10-03T17:58:29Z","title":"SIEVE: General Purpose Data Filtering System Matching GPT-4o Accuracy at\n 1% the Cost","summary":" Creating specialized large language models requires vast amounts of clean,\nspecial purpose data for training and fine-tuning. With only a handful of\nexisting large-scale, domain-specific datasets, creation of new datasets is\nrequired in most applications. This requires the development of new\napplication-specific filtering of web-scale data. Filtering with a\nhigh-performance, general-purpose LLM such as GPT-4o can be highly effective,\nbut this is extremely expensive at web-scale. This paper proposes SIEVE, a\nlightweight alternative that matches GPT-4o accuracy at a fraction of the cost.\nSIEVE can perform up to 500 filtering operations for the cost of one GPT-4o\nfiltering call. The key to SIEVE is a seamless integration of GPT-4o and\nlightweight T5 models, using active learning to fine-tune T5 in the background\nwith a small number of calls to GPT-4o. Once trained, it performs as well as\nGPT-4o at a tiny fraction of the cost. We experimentally validate SIEVE on the\nOpenWebText dataset, using five highly customized filter tasks targeting high\nquality and domain-specific content. Our results demonstrate the effectiveness\nand efficiency of our method in curating large, high-quality datasets for\nlanguage model training at a substantially lower cost (1%) than existing\ntechniques. To further validate SIEVE, experiments show that SIEVE and GPT-4o\nachieve similar accuracy, with human evaluators preferring SIEVE's filtering\nresults to those of GPT-4o.\n","authors":["Jifan Zhang","Robert Nowak"],"pdf_url":"https://arxiv.org/pdf/2410.02755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02749v1","updated":"2024-10-03T17:57:22Z","published":"2024-10-03T17:57:22Z","title":"Training Language Models on Synthetic Edit Sequences Improves Code\n Synthesis","summary":" Software engineers mainly write code by editing existing programs. In\ncontrast, large language models (LLMs) autoregressively synthesize programs in\na single pass. One explanation for this is the scarcity of open-sourced edit\ndata. While high-quality instruction data for code synthesis is already scarce,\nhigh-quality edit data is even scarcer. To fill this gap, we develop a\nsynthetic data generation algorithm called LintSeq. This algorithm refactors\nexisting code into a sequence of code edits by using a linter to procedurally\nsample across the error-free insertions that can be used to sequentially write\nprograms. It outputs edit sequences as text strings consisting of consecutive\nprogram diffs. To test LintSeq, we use it to refactor a dataset of instruction\n+ program pairs into instruction + program-diff-sequence tuples. Then, we\ninstruction finetune a series of smaller LLMs ranging from 2.6B to 14B\nparameters on both the re-factored and original versions of this dataset,\ncomparing zero-shot performance on code synthesis benchmarks. We show that\nduring repeated sampling, edit sequence finetuned models produce more diverse\nprograms than baselines. This results in better inference-time scaling for\nbenchmark coverage as a function of samples, i.e. 
the fraction of problems\n\"pass@k\" solved by any attempt given \"k\" tries. For example, on HumanEval\npass@50, small LLMs finetuned on synthetic edit sequences are competitive with\nGPT-4 and outperform models finetuned on the baseline dataset by +20% (+/-3%)\nin absolute score. Finally, we also pretrain our own tiny LMs for code\nunderstanding. We show that finetuning tiny models on synthetic code edits\nresults in state-of-the-art code synthesis for the on-device model class. Our\n150M parameter edit sequence LM matches or outperforms code models with twice\nas many parameters, both with and without repeated sampling, including Codex\nand AlphaCode.\n","authors":["Ulyana Piterbarg","Lerrel Pinto","Rob Fergus"],"pdf_url":"https://arxiv.org/pdf/2410.02749v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18957v2","updated":"2024-10-03T17:57:07Z","published":"2024-09-27T17:58:50Z","title":"LML-DAP: Language Model Learning a Dataset for Data-Augmented Prediction","summary":" Classification tasks are typically handled using Machine Learning (ML)\nmodels, which lack a balance between accuracy and interpretability. This paper\nintroduces a new approach to using Large Language Models (LLMs) for\nclassification tasks in an explainable way. Unlike ML models that rely heavily\non data cleaning and feature engineering, this method streamlines the process\nusing LLMs. This paper proposes a new concept called \"Language Model Learning\n(LML)\" powered by a new method called \"Data-Augmented Prediction (DAP)\". The\nclassification is performed by LLMs using a method similar to humans manually\nexploring and understanding the data and deciding classifications using data as\na reference. In the LML process, a dataset is summarized and evaluated to\ndetermine the features that lead to the classification of each label the most.\nIn the process of DAP, the system uses the data summary and a row of the\ntesting dataset to automatically generate a query, which is used to retrieve\nrelevant rows from the dataset. A classification is generated by the LLM using\ndata summary and relevant rows, ensuring satisfactory accuracy even with\ncomplex data using context-aware decision-making. LML and DAP unlock the\npossibilities of new applications. The proposed method uses the words \"Act as\nan Explainable Machine Learning Model\" in the prompt to enhance the\ninterpretability of the predictions by allowing users to review the logic\nbehind each prediction. In some test cases, the system scored an accuracy above\n90%, proving the effectiveness of the system and its potential to outperform\nconventional ML models in various scenarios. The code is available at\nhttps://github.com/Pro-GenAI/LML-DAP\n","authors":["Praneeth Vadlapati"],"pdf_url":"https://arxiv.org/pdf/2409.18957v2.pdf","comment":"Updated title, abstract, and images"},{"id":"http://arxiv.org/abs/2410.02748v1","updated":"2024-10-03T17:57:01Z","published":"2024-10-03T17:57:01Z","title":"CriSPO: Multi-Aspect Critique-Suggestion-guided Automatic Prompt\n Optimization for Text Generation","summary":" Large language models (LLMs) can generate fluent summaries across domains\nusing prompting techniques, reducing the need to train models for summarization\napplications. However, crafting effective prompts that guide LLMs to generate\nsummaries with the appropriate level of detail and writing style remains a\nchallenge. In this paper, we explore the use of salient information extracted\nfrom the source document to enhance summarization prompts. 
We show that adding\nkeyphrases in prompts can improve ROUGE F1 and recall, making the generated\nsummaries more similar to the reference and more complete. The number of\nkeyphrases can control the precision-recall trade-off. Furthermore, our\nanalysis reveals that incorporating phrase-level salient information is\nsuperior to word- or sentence-level. However, the impact on hallucination is\nnot universally positive across LLMs. To conduct this analysis, we introduce\nKeyphrase Signal Extractor (CriSPO), a lightweight model that can be finetuned\nto extract salient keyphrases. By using CriSPO, we achieve consistent ROUGE\nimprovements across datasets and open-weight and proprietary LLMs without any\nLLM customization. Our findings provide insights into leveraging salient\ninformation in building prompt-based summarization systems.\n","authors":["Han He","Qianchu Liu","Lei Xu","Chaitanya Shivade","Yi Zhang","Sundararajan Srinivasan","Katrin Kirchhoff"],"pdf_url":"https://arxiv.org/pdf/2410.02748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11687v2","updated":"2024-10-03T17:56:34Z","published":"2024-06-17T16:05:32Z","title":"Tokenization Falling Short: The Curse of Tokenization","summary":" Language models typically tokenize raw text into sequences of subword\nidentifiers from a predefined vocabulary, a process inherently sensitive to\ntypographical errors, length variations, and largely oblivious to the internal\nstructure of tokens--issues we term the curse of tokenization. In this study,\nwe delve into these drawbacks and demonstrate that large language models (LLMs)\nremain susceptible to these problems. This study systematically investigates\nthese challenges and their impact on LLMs through three critical research\nquestions: (1) complex problem solving, (2) token structure probing, and (3)\nresilience to typographical variation. Our findings reveal that scaling model\nparameters can mitigate the issue of tokenization; however, LLMs still suffer\nfrom biases induced by typos and other text format variations. Our experiments\nshow that subword regularization such as BPE-dropout can mitigate this issue.\nWe release our evaluation code and data at https://github.com/FloatAI/TKEval.\n","authors":["Yekun Chai","Yewei Fang","Qiwei Peng","Xuhong Li"],"pdf_url":"https://arxiv.org/pdf/2406.11687v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2404.07840v3","updated":"2024-10-03T17:56:12Z","published":"2024-04-11T15:27:56Z","title":"On Training Data Influence of GPT Models","summary":" Amidst the rapid advancements in generative language models, the\ninvestigation of how training data shapes the performance of GPT models is\nstill emerging. This paper presents GPTfluence, a novel approach that leverages\na featurized simulation to assess the impact of training examples on the\ntraining dynamics of GPT models. Our approach not only traces the influence of\nindividual training instances on performance trajectories, such as loss and\nother key metrics, on targeted test points but also enables a comprehensive\ncomparison with existing methods across various training scenarios in GPT\nmodels, ranging from 14 million to 2.8 billion parameters, across a range of\ndownstream tasks. Contrary to earlier methods that struggle with generalization\nto new data, GPTfluence introduces a parameterized simulation of training\ndynamics, demonstrating robust generalization capabilities to unseen training\ndata. 
This adaptability is evident across both fine-tuning and\ninstruction-tuning scenarios, spanning tasks in natural language understanding\nand generation. We make our code and data publicly available at\nhttps://github.com/ernie-research/gptfluence.\n","authors":["Yekun Chai","Qingyi Liu","Shuohuan Wang","Yu Sun","Qiwei Peng","Hua Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07840v3.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2307.10432v3","updated":"2024-10-03T17:55:29Z","published":"2023-07-19T19:40:34Z","title":"PharmacyGPT: The AI Pharmacist","summary":" In this study, we introduce PharmacyGPT, a novel framework to assess the\ncapabilities of large language models (LLMs) such as ChatGPT and GPT-4 in\nemulating the role of clinical pharmacists. Our methodology encompasses the\nutilization of LLMs to generate comprehensible patient clusters, formulate\nmedication plans, and forecast patient outcomes. We conduct our investigation\nusing real data acquired from the intensive care unit (ICU) at the University\nof North Carolina Chapel Hill (UNC) Hospital. Our analysis offers valuable\ninsights into the potential applications and limitations of LLMs in the field\nof clinical pharmacy, with implications for both patient care and the\ndevelopment of future AI-driven healthcare solutions. By evaluating the\nperformance of PharmacyGPT, we aim to contribute to the ongoing discourse\nsurrounding the integration of artificial intelligence in healthcare settings,\nultimately promoting the responsible and efficacious use of such technologies.\n","authors":["Zhengliang Liu","Zihao Wu","Mengxuan Hu","Bokai Zhao","Lin Zhao","Tianyi Zhang","Haixing Dai","Xianyan Chen","Ye Shen","Sheng Li","Quanzheng Li","Xiang Li","Brian Murray","Tianming Liu","Andrea Sikora"],"pdf_url":"https://arxiv.org/pdf/2307.10432v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02744v1","updated":"2024-10-03T17:55:17Z","published":"2024-10-03T17:55:17Z","title":"Neutral residues: revisiting adapters for model extension","summary":" We address the problem of extending a pretrained large language model to a\nnew domain that was not seen at training time, like adding a language for which\nthe original model has seen no or little training data. Popular solutions like\nfine-tuning or low-rank adaptation are successful at domain adaptation, but\nformally they do not add any extra capacity and degrade the performance in the\noriginal domain.\n Our paper analyzes this extension problem under three angles: data,\narchitecture and training procedure, which are advantageously considered\njointly. In particular, we improve adapters and make it possible to learn an\nentire new language while ensuring that the output of the neural network is\nalmost unchanged in the original domain. 
For this purpose, we modify the new\nresidual blocks in a way that leads each new residual block to output\nnear-zeros in the original domain.\n This solution of neutral residues, which borrows architectural components\nfrom mixture of experts, is effective: with only 20% extra learnable weights\ncompared to an original model trained on English, we get results that are\nsignificantly better than concurrent approaches (fine-tuning, low-rank or\nvanilla adapters) in terms of the trade-off between learning a new language and\nnot forgetting English.\n","authors":["Franck Signe Talla","Herve Jegou","Edouard Grave"],"pdf_url":"https://arxiv.org/pdf/2410.02744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02743v1","updated":"2024-10-03T17:55:13Z","published":"2024-10-03T17:55:13Z","title":"MA-RLHF: Reinforcement Learning from Human Feedback with Macro Actions","summary":" Reinforcement learning from human feedback (RLHF) has demonstrated\neffectiveness in aligning large language models (LLMs) with human preferences.\nHowever, token-level RLHF suffers from the credit assignment problem over long\nsequences, where delayed rewards make it challenging for the model to discern\nwhich actions contributed to successful outcomes. This hinders learning\nefficiency and slows convergence. In this paper, we propose MA-RLHF, a simple\nyet effective RLHF framework that incorporates macro actions -- sequences of\ntokens or higher-level language constructs -- into the learning process. By\noperating at this higher level of abstraction, our approach reduces the\ntemporal distance between actions and rewards, facilitating faster and more\naccurate credit assignment. This results in more stable policy gradient\nestimates and enhances learning efficiency within each episode, all without\nincreasing computational complexity during training or inference. We validate\nour approach through extensive experiments across various model sizes and\ntasks, including text summarization, dialogue generation, question answering,\nand program synthesis. Our method achieves substantial performance improvements\nover standard RLHF, with performance gains of up to 30% in text summarization\nand code generation, 18% in dialogue, and 8% in question answering tasks.\nNotably, our approach reaches parity with vanilla RLHF 1.7x to 2x faster in\nterms of training time and continues to outperform it with further training. We\nwill make our code and data publicly available at\nhttps://github.com/ernie-research/MA-RLHF .\n","authors":["Yekun Chai","Haoran Sun","Huang Fang","Shuohuan Wang","Yu Sun","Hua Wu"],"pdf_url":"https://arxiv.org/pdf/2410.02743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02742v1","updated":"2024-10-03T17:55:09Z","published":"2024-10-03T17:55:09Z","title":"Grounding Large Language Models In Embodied Environment With Imperfect\n World Models","summary":" Despite a widespread success in various applications, large language models\n(LLMs) often stumble when tackling basic physical reasoning or executing\nrobotics tasks, due to a lack of direct experience with the physical nuances of\nthe real world. To address these issues, we propose a Grounding Large language\nmodel with Imperfect world MOdel (GLIMO), which utilizes proxy world models\nsuch as simulators to collect and synthesize training data. GLIMO incorporates\nan LLM agent-based data generator to automatically create high-quality and\ndiverse instruction datasets. 
The generator includes an iterative self-refining\nmodule for temporally consistent experience sampling, a diverse set of\nquestion-answering instruction seeds, and a retrieval-augmented generation\nmodule for reflecting on prior experiences. Comprehensive experiments show that\nour approach improves the performance of strong open-source LLMs like LLaMA-3\nwith a performance boost of 2.04 $\\times$, 1.54 $\\times$, and 1.82 $\\times$\nacross three different benchmarks, respectively. The performance is able to\ncompete with or surpass their larger counterparts such as GPT-4.\n","authors":["Haolan Liu","Jishen Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.02742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02741v1","updated":"2024-10-03T17:54:56Z","published":"2024-10-03T17:54:56Z","title":"Salient Information Prompting to Steer Content in Prompt-based\n Abstractive Summarization","summary":" Large language models (LLMs) can generate fluent summaries across domains\nusing prompting techniques, reducing the need to train models for summarization\napplications. However, crafting effective prompts that guide LLMs to generate\nsummaries with the appropriate level of detail and writing style remains a\nchallenge. In this paper, we explore the use of salient information extracted\nfrom the source document to enhance summarization prompts. We show that adding\nkeyphrases in prompts can improve ROUGE F1 and recall, making the generated\nsummaries more similar to the reference and more complete. The number of\nkeyphrases can control the precision-recall trade-off. Furthermore, our\nanalysis reveals that incorporating phrase-level salient information is\nsuperior to word- or sentence-level. However, the impact on hallucination is\nnot universally positive across LLMs. To conduct this analysis, we introduce\nKeyphrase Signal Extractor (SigExt), a lightweight model that can be finetuned\nto extract salient keyphrases. By using SigExt, we achieve consistent ROUGE\nimprovements across datasets and open-weight and proprietary LLMs without any\nLLM customization. Our findings provide insights into leveraging salient\ninformation in building prompt-based summarization systems.\n","authors":["Lei Xu","Mohammed Asad Karim","Saket Dingliwal","Aparna Elangovan"],"pdf_url":"https://arxiv.org/pdf/2410.02741v1.pdf","comment":"Accepted to EMNLP 2024 Industry Track"},{"id":"http://arxiv.org/abs/2410.02736v1","updated":"2024-10-03T17:53:30Z","published":"2024-10-03T17:53:30Z","title":"Justice or Prejudice? Quantifying Biases in LLM-as-a-Judge","summary":" LLM-as-a-Judge has been widely utilized as an evaluation method in various\nbenchmarks and served as supervised rewards in model training. However, despite\ntheir excellence in many domains, potential issues are under-explored,\nundermining their reliability and the scope of their utility. Therefore, we\nidentify 12 key potential biases and propose a new automated bias\nquantification framework-CALM-which systematically quantifies and analyzes each\ntype of bias in LLM-as-a-Judge by using automated and principle-guided\nmodification. Our experiments cover multiple popular language models, and the\nresults indicate that while advanced models have achieved commendable overall\nperformance, significant biases persist in certain specific tasks. Empirical\nresults suggest that there remains room for improvement in the reliability of\nLLM-as-a-Judge. 
Moreover, we also discuss the explicit and implicit influence\nof these biases and give some suggestions for the reliable application of\nLLM-as-a-Judge. Our work highlights the need for stakeholders to address these\nissues and remind users to exercise caution in LLM-as-a-Judge applications.\n","authors":["Jiayi Ye","Yanbo Wang","Yue Huang","Dongping Chen","Qihui Zhang","Nuno Moniz","Tian Gao","Werner Geyer","Chao Huang","Pin-Yu Chen","Nitesh V Chawla","Xiangliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.02736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02730v1","updated":"2024-10-03T17:49:28Z","published":"2024-10-03T17:49:28Z","title":"DivScene: Benchmarking LVLMs for Object Navigation with Diverse Scenes\n and Objects","summary":" Object navigation in unknown environments is crucial for deploying embodied\nagents in real-world applications. While we have witnessed huge progress due to\nlarge-scale scene datasets, faster simulators, and stronger models, previous\nstudies mainly focus on limited scene types and target objects. In this paper,\nwe study a new task of navigating to diverse target objects in a large number\nof scene types. To benchmark the problem, we present a large-scale scene\ndataset, DivScene, which contains 4,614 scenes across 81 different types. With\nthe dataset, we build an end-to-end embodied agent, NatVLM, by fine-tuning a\nLarge Vision Language Model (LVLM) through imitation learning. The LVLM is\ntrained to take previous observations from the environment and generate the\nnext actions. We also introduce CoT explanation traces of the action prediction\nfor better performance when tuning LVLMs. Our extensive experiments find that\nwe can build a performant LVLM-based agent through imitation learning on the\nshortest paths constructed by a BFS planner without any human supervision. Our\nagent achieves a success rate that surpasses GPT-4o by over 20%. Meanwhile, we\ncarry out various analyses showing the generalization ability of our agent.\n","authors":["Zhaowei Wang","Hongming Zhang","Tianqing Fang","Ye Tian","Yue Yang","Kaixin Ma","Xiaoman Pan","Yangqiu Song","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2410.02730v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2410.02729v1","updated":"2024-10-03T17:49:09Z","published":"2024-10-03T17:49:09Z","title":"Unified Multi-Modal Interleaved Document Representation for Information\n Retrieval","summary":" Information Retrieval (IR) methods aim to identify relevant documents in\nresponse to a given query, which have gained remarkable attention due to their\nsuccessful application in various natural language tasks. However, existing\napproaches typically consider only the textual information within the\ndocuments, which overlooks the fact that documents can contain multiple\nmodalities, including texts, images, and tables. Further, they often segment\neach long document into multiple discrete passages for embedding, preventing\nthem from capturing the overall document context and interactions between\nparagraphs. We argue that these two limitations lead to suboptimal document\nrepresentations for retrieval. In this work, to address them, we aim to produce\nmore comprehensive and nuanced document representations by holistically\nembedding documents interleaved with different modalities. Specifically, we\nachieve this by leveraging the capability of recent vision-language models that\nenable the processing and integration of text, images, and tables into a\nunified format and representation. 
Moreover, to mitigate the information loss\nfrom segmenting documents into passages, instead of representing and retrieving\npassages individually, we further merge the representations of segmented\npassages into one single document representation, while we additionally\nintroduce a reranking strategy to decouple and identify the relevant passage\nwithin the document if necessary. Then, through extensive experiments on\ndiverse information retrieval scenarios considering both the textual and\nmultimodal queries, we show that our approach substantially outperforms\nrelevant baselines, thanks to the consideration of the multimodal information\ninterleaved within the documents in a unified way.\n","authors":["Jaewoo Lee","Joonho Ko","Jinheon Baek","Soyeong Jeong","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2410.02729v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.02725v1","updated":"2024-10-03T17:47:29Z","published":"2024-10-03T17:47:29Z","title":"Adaptive Inference-Time Compute: LLMs Can Predict if They Can Do Better,\n Even Mid-Generation","summary":" Inference-time computation is a powerful paradigm to enhance the performance\nof large language models (LLMs), with Best-of-N sampling being a widely used\ntechnique. However, this method is computationally expensive, requiring both\n(1) an external reward model and (2) the generation of multiple samples. In\nthis work, we introduce a new generative self-evaluation scheme designed to\nadaptively reduce the number of generated samples while maintaining or even\nimproving performance. We use a generative reward model formulation, allowing\nthe LLM to predict mid-generation the probability that restarting the\ngeneration will yield a better response. These predictions are obtained without\nan external reward model and can be used to decide whether or not to generate\nmore samples, prune unpromising samples early on, or to pick the best sample.\nThis capability is very inexpensive as it involves generating a single\npredefined token. Trained using a dataset constructed with real unfiltered\nLMSYS user prompts, Llama 3.1 8B's win rate against GPT-4 on AlpacaEval\nincreases from 21% to 34% with 16 samples and math performance on GSM8K\nimproves from 84% to 91%. By sampling only when the LLM determines that it is\nbeneficial to do so and adaptively adjusting temperature annealing, we\ndemonstrate that 74% of the improvement from using 16 samples can be achieved\nwith only 1.2 samples on average. We further demonstrate that 50-75% of samples\ncan be pruned early in generation with minimal degradation in performance.\nOverall, our methods enable more efficient and scalable compute utilization\nduring inference for LLMs.\n","authors":["Rohin Manvi","Anikait Singh","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2410.02725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10710v3","updated":"2024-10-03T17:46:40Z","published":"2024-04-16T16:36:50Z","title":"Autoregressive Pre-Training on Pixels and Texts","summary":" The integration of visual and textual information represents a promising\ndirection in the advancement of language models. In this paper, we explore the\ndual modality of language--both visual and textual--within an autoregressive\nframework, pre-trained on both document images and texts. Our method employs a\nmultimodal training strategy, utilizing visual data through next patch\nprediction with a regression head and/or textual data through next token\nprediction with a classification head. 
We focus on understanding the\ninteraction between these two modalities and their combined impact on model\nperformance. Our extensive evaluation across a wide range of benchmarks shows\nthat incorporating both visual and textual data significantly improves the\nperformance of pixel-based language models. Remarkably, we find that a\nunidirectional pixel-based model trained solely on visual data can achieve\ncomparable results to state-of-the-art bidirectional models on several language\nunderstanding tasks. This work uncovers the untapped potential of integrating\nvisual and textual modalities for more effective language modeling. We release\nour code, data, and model checkpoints at\n\\url{https://github.com/ernie-research/pixelgpt}.\n","authors":["Yekun Chai","Qingyi Liu","Jingwu Xiao","Shuohuan Wang","Yu Sun","Hua Wu"],"pdf_url":"https://arxiv.org/pdf/2404.10710v3.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.02724v1","updated":"2024-10-03T17:45:31Z","published":"2024-10-03T17:45:31Z","title":"Large Language Models as Markov Chains","summary":" Large language models (LLMs) have proven to be remarkably efficient, both\nacross a wide range of natural language processing tasks and well beyond them.\nHowever, a comprehensive theoretical analysis of the origins of their\nimpressive performance remains elusive. In this paper, we approach this\nchallenging task by drawing an equivalence between generic autoregressive\nlanguage models with vocabulary of size $T$ and context window of size $K$ and\nMarkov chains defined on a finite state space of size $\\mathcal{O}(T^K)$. We\nderive several surprising findings related to the existence of a stationary\ndistribution of Markov chains that capture the inference power of LLMs, their\nspeed of convergence to it, and the influence of the temperature on the latter.\nWe then prove pre-training and in-context generalization bounds and show how\nthe drawn equivalence allows us to enrich their interpretation. Finally, we\nillustrate our theoretical guarantees with experiments on several recent LLMs\nto highlight how they capture the behavior observed in practice.\n","authors":["Oussama Zekri","Ambroise Odonnat","Abdelhakim Benechehab","Linus Bleistein","Nicolas Boullé","Ievgen Redko"],"pdf_url":"https://arxiv.org/pdf/2410.02724v1.pdf","comment":"49 pages, 17 figures"},{"id":"http://arxiv.org/abs/2410.02721v1","updated":"2024-10-03T17:40:55Z","published":"2024-10-03T17:40:55Z","title":"Domain-Specific Retrieval-Augmented Generation Using Vector Stores,\n Knowledge Graphs, and Tensor Factorization","summary":" Large Language Models (LLMs) are pre-trained on large-scale corpora and excel\nin numerous general natural language processing (NLP) tasks, such as question\nanswering (QA). Despite their advanced language capabilities, when it comes to\ndomain-specific and knowledge-intensive tasks, LLMs suffer from hallucinations,\nknowledge cut-offs, and lack of knowledge attributions. Additionally, fine\ntuning LLMs' intrinsic knowledge to highly specific domains is an expensive and\ntime consuming process. The retrieval-augmented generation (RAG) process has\nrecently emerged as a method capable of optimization of LLM responses, by\nreferencing them to a predetermined ontology. It was shown that using a\nKnowledge Graph (KG) ontology for RAG improves the QA accuracy, by taking into\naccount relevant sub-graphs that preserve the information in a structured\nmanner. 
In this paper, we introduce SMART-SLIC, a highly domain-specific LLM\nframework, that integrates RAG with KG and a vector store (VS) that store\nfactual domain specific information. Importantly, to avoid hallucinations in\nthe KG, we build these highly domain-specific KGs and VSs without the use of\nLLMs, but via NLP, data mining, and nonnegative tensor factorization with\nautomatic model selection. Pairing our RAG with a domain-specific: (i) KG\n(containing structured information), and (ii) VS (containing unstructured\ninformation) enables the development of domain-specific chat-bots that\nattribute the source of information, mitigate hallucinations, lessen the need\nfor fine-tuning, and excel in highly domain-specific question answering tasks.\nWe pair SMART-SLIC with chain-of-thought prompting agents. The framework is\ndesigned to be generalizable to adapt to any specific or specialized domain. In\nthis paper, we demonstrate the question answering capabilities of our framework\non a corpus of scientific publications on malware analysis and anomaly\ndetection.\n","authors":["Ryan C. Barron","Ves Grantcharov","Selma Wanna","Maksim E. Eren","Manish Bhattarai","Nicholas Solovyev","George Tompkins","Charles Nicholas","Kim Ø. Rasmussen","Cynthia Matuszek","Boian S. Alexandrov"],"pdf_url":"https://arxiv.org/pdf/2410.02721v1.pdf","comment":"9 pages 7 figures, 1 table, 1 cypher code Accepted to ICMLA 2024"},{"id":"http://arxiv.org/abs/2410.02719v1","updated":"2024-10-03T17:39:38Z","published":"2024-10-03T17:39:38Z","title":"UncertaintyRAG: Span-Level Uncertainty Enhanced Long-Context Modeling\n for Retrieval-Augmented Generation","summary":" We present UncertaintyRAG, a novel approach for long-context\nRetrieval-Augmented Generation (RAG) that utilizes Signal-to-Noise Ratio\n(SNR)-based span uncertainty to estimate similarity between text chunks. This\nspan uncertainty enhances model calibration, improving robustness and\nmitigating semantic inconsistencies introduced by random chunking. Leveraging\nthis insight, we propose an efficient unsupervised learning technique to train\nthe retrieval model, alongside an effective data sampling and scaling strategy.\nUncertaintyRAG outperforms baselines by 2.03% on LLaMA-2-7B, achieving\nstate-of-the-art results while using only 4% of the training data compared to\nother advanced open-source retrieval models under distribution shift settings.\nOur method demonstrates strong calibration through span uncertainty, leading to\nimproved generalization and robustness in long-context RAG tasks. 
Additionally,\nUncertaintyRAG provides a lightweight retrieval model that can be integrated\ninto any large language model with varying context window lengths, without the\nneed for fine-tuning, showcasing the flexibility of our approach.\n","authors":["Zixuan Li","Jing Xiong","Fanghua Ye","Chuanyang Zheng","Xun Wu","Jianqiao Lu","Zhongwei Wan","Xiaodan Liang","Chengming Li","Zhenan Sun","Lingpeng Kong","Ngai Wong"],"pdf_url":"https://arxiv.org/pdf/2410.02719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02713v1","updated":"2024-10-03T17:36:49Z","published":"2024-10-03T17:36:49Z","title":"Video Instruction Tuning With Synthetic Data","summary":" The development of video large multimodal models (LMMs) has been hindered by\nthe difficulty of curating large amounts of high-quality raw data from the web.\nTo address this, we propose an alternative approach by creating a high-quality\nsynthetic dataset specifically for video instruction-following, namely\nLLaVA-Video-178K. This dataset includes key tasks such as detailed captioning,\nopen-ended question-answering (QA), and multiple-choice QA. By training on this\ndataset, in combination with existing visual instruction tuning data, we\nintroduce LLaVA-Video, a new video LMM. Our experiments demonstrate that\nLLaVA-Video achieves strong performance across various video benchmarks,\nhighlighting the effectiveness of our dataset. We plan to release the dataset,\nits generation pipeline, and the model checkpoints.\n","authors":["Yuanhan Zhang","Jinming Wu","Wei Li","Bo Li","Zejun Ma","Ziwei Liu","Chunyuan Li"],"pdf_url":"https://arxiv.org/pdf/2410.02713v1.pdf","comment":"Project page: https://llava-vl.github.io/blog/2024-09-30-llava-video/"},{"id":"http://arxiv.org/abs/2410.02712v1","updated":"2024-10-03T17:36:33Z","published":"2024-10-03T17:36:33Z","title":"LLaVA-Critic: Learning to Evaluate Multimodal Models","summary":" We introduce LLaVA-Critic, the first open-source large multimodal model (LMM)\ndesigned as a generalist evaluator to assess performance across a wide range of\nmultimodal tasks. LLaVA-Critic is trained using a high-quality critic\ninstruction-following dataset that incorporates diverse evaluation criteria and\nscenarios. Our experiments demonstrate the model's effectiveness in two key\nareas: (1) LMM-as-a-Judge, where LLaVA-Critic provides reliable evaluation\nscores, performing on par with or surpassing GPT models on multiple evaluation\nbenchmarks; and (2) Preference Learning, where it generates reward signals for\npreference learning, enhancing model alignment capabilities. This work\nunderscores the potential of open-source LMMs in self-critique and evaluation,\nsetting the stage for future research into scalable, superhuman alignment\nfeedback mechanisms for LMMs.\n","authors":["Tianyi Xiong","Xiyao Wang","Dong Guo","Qinghao Ye","Haoqi Fan","Quanquan Gu","Heng Huang","Chunyuan Li"],"pdf_url":"https://arxiv.org/pdf/2410.02712v1.pdf","comment":"Project Page: https://llava-vl.github.io/blog/2024-10-03-llava-critic"},{"id":"http://arxiv.org/abs/2410.02707v1","updated":"2024-10-03T17:31:31Z","published":"2024-10-03T17:31:31Z","title":"LLMs Know More Than They Show: On the Intrinsic Representation of LLM\n Hallucinations","summary":" Large language models (LLMs) often produce errors, including factual\ninaccuracies, biases, and reasoning failures, collectively referred to as\n\"hallucinations\". 
Recent studies have demonstrated that LLMs' internal states\nencode information regarding the truthfulness of their outputs, and that this\ninformation can be utilized to detect errors. In this work, we show that the\ninternal representations of LLMs encode much more information about\ntruthfulness than previously recognized. We first discover that the\ntruthfulness information is concentrated in specific tokens, and leveraging\nthis property significantly enhances error detection performance. Yet, we show\nthat such error detectors fail to generalize across datasets, implying that --\ncontrary to prior claims -- truthfulness encoding is not universal but rather\nmultifaceted. Next, we show that internal representations can also be used for\npredicting the types of errors the model is likely to make, facilitating the\ndevelopment of tailored mitigation strategies. Lastly, we reveal a discrepancy\nbetween LLMs' internal encoding and external behavior: they may encode the\ncorrect answer, yet consistently generate an incorrect one. Taken together,\nthese insights deepen our understanding of LLM errors from the model's internal\nperspective, which can guide future research on enhancing error analysis and\nmitigation.\n","authors":["Hadas Orgad","Michael Toker","Zorik Gekhman","Roi Reichart","Idan Szpektor","Hadas Kotek","Yonatan Belinkov"],"pdf_url":"https://arxiv.org/pdf/2410.02707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02703v1","updated":"2024-10-03T17:27:30Z","published":"2024-10-03T17:27:30Z","title":"Selective Attention Improves Transformer","summary":" Unneeded elements in the attention's context degrade performance. We\nintroduce Selective Attention, a simple parameter-free change to the standard\nattention mechanism which reduces attention to unneeded elements. Selective\nattention improves language modeling performance in a variety of model sizes\nand context lengths. For example, a range of transformers trained with the\nlanguage modeling objective on C4 with selective attention perform equivalently\nto standard transformers with ~2X more heads and parameters in their attention\nmodules. Selective attention also allows decreasing the size of the attention's\ncontext buffer, leading to meaningful reductions in the memory and compute\nrequirements during inference. For example, transformers with 100M parameters\ntrained on C4 with context sizes of 512, 1,024, and 2,048 need 16X, 25X, and\n47X less memory for their attention module, respectively, when equipped with\nselective attention, as those without selective attention, with the same\nvalidation perplexity.\n","authors":["Yaniv Leviathan","Matan Kalman","Yossi Matias"],"pdf_url":"https://arxiv.org/pdf/2410.02703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12683v2","updated":"2024-10-03T17:27:28Z","published":"2023-12-20T00:49:52Z","title":"Turning English-centric LLMs Into Polyglots: How Much Multilinguality Is\n Needed?","summary":" The vast majority of today's large language models (LLMs) are\nEnglish-centric, having been pretrained predominantly on English text. Yet, in\norder to meet user expectations, models need to be able to respond\nappropriately in multiple languages once deployed in downstream applications.\nThis requires strong cross-lingual transfer abilities. In this work, we\ninvestigate the minimal amount of multilinguality required during finetuning to\nelicit cross-lingual generalisation in English-centric LLMs. 
In experiments\nacross four LLMs, we find that multilingual instruction tuning with as few as\ntwo to three languages is both necessary and sufficient to elicit effective\ncross-lingual generalisation, with the limiting factor being the degree to\nwhich a target language is seen during pretraining. Evaluations on five\ndifferent tasks further reveal that multilingual instruction tuning is most\nbeneficial for generative tasks that assume input/output language agreement,\nsuch as in chat settings, while being of less importance for highly structured\nclassification-style tasks. Our code and data is available at\nhttps://github.com/ZurichNLP/multilingual-instruction-tuning.\n","authors":["Tannon Kew","Florian Schottmann","Rico Sennrich"],"pdf_url":"https://arxiv.org/pdf/2312.12683v2.pdf","comment":"Accepted at Findings of EMNLP 2024"},{"id":"http://arxiv.org/abs/2407.07071v2","updated":"2024-10-03T17:26:48Z","published":"2024-07-09T17:44:34Z","title":"Lookback Lens: Detecting and Mitigating Contextual Hallucinations in\n Large Language Models Using Only Attention Maps","summary":" When asked to summarize articles or answer questions given a passage, large\nlanguage models (LLMs) can hallucinate details and respond with unsubstantiated\nanswers that are inaccurate with respect to the input context. This paper\ndescribes a simple approach for detecting such contextual hallucinations. We\nhypothesize that contextual hallucinations are related to the extent to which\nan LLM attends to information in the provided context versus its own\ngenerations. Based on this intuition, we propose a simple hallucination\ndetection model whose input features are given by the ratio of attention\nweights on the context versus newly generated tokens (for each attention head).\nWe find that a linear classifier based on these lookback ratio features is as\neffective as a richer detector that utilizes the entire hidden states of an LLM\nor a text-based entailment model. The lookback ratio-based detector -- Lookback\nLens -- is found to transfer across tasks and even models, allowing a detector\nthat is trained on a 7B model to be applied (without retraining) to a larger\n13B model. We further apply this detector to mitigate contextual\nhallucinations, and find that a simple classifier-guided decoding approach is\nable to reduce the amount of hallucination, for example by 9.6% in the XSum\nsummarization task.\n","authors":["Yung-Sung Chuang","Linlu Qiu","Cheng-Yu Hsieh","Ranjay Krishna","Yoon Kim","James Glass"],"pdf_url":"https://arxiv.org/pdf/2407.07071v2.pdf","comment":"EMNLP 2024 main conference long paper. The source code is available\n at https://github.com/voidism/Lookback-Lens"},{"id":"http://arxiv.org/abs/2311.00237v3","updated":"2024-10-03T17:25:02Z","published":"2023-11-01T02:40:42Z","title":"The Mystery of In-Context Learning: A Comprehensive Survey on\n Interpretation and Analysis","summary":" Understanding in-context learning (ICL) capability that enables large\nlanguage models (LLMs) to excel in proficiency through demonstration examples\nis of utmost importance. This importance stems not only from the better\nutilization of this capability across various tasks, but also from the\nproactive identification and mitigation of potential risks, including concerns\nregarding truthfulness, bias, and toxicity, that may arise alongside the\ncapability. In this paper, we present a thorough survey on the interpretation\nand analysis of in-context learning. 
First, we provide a concise introduction\nto the background and definition of in-context learning. Then, we give an\noverview of advancements from two perspectives: 1) a theoretical perspective,\nemphasizing studies on mechanistic interpretability and delving into the\nmathematical foundations behind ICL; and 2) an empirical perspective,\nconcerning studies that empirically analyze factors associated with ICL. We\nconclude by highlighting the challenges encountered and suggesting potential\navenues for future research. We believe that our work establishes the basis for\nfurther exploration into the interpretation of in-context learning.\nAdditionally, we have created a repository containing the resources referenced\nin our survey.\n","authors":["Yuxiang Zhou","Jiazheng Li","Yanzheng Xiang","Hanqi Yan","Lin Gui","Yulan He"],"pdf_url":"https://arxiv.org/pdf/2311.00237v3.pdf","comment":"Accepted to the main conference of EMNLP 2024. Resources are\n available at https://github.com/zyxnlp/ICL-Interpretation-Analysis-Resources"},{"id":"http://arxiv.org/abs/2410.02694v1","updated":"2024-10-03T17:20:11Z","published":"2024-10-03T17:20:11Z","title":"HELMET: How to Evaluate Long-Context Language Models Effectively and\n Thoroughly","summary":" There have been many benchmarks for evaluating long-context language models\n(LCLMs), but developers often rely on synthetic tasks like needle-in-a-haystack\n(NIAH) or arbitrary subsets of tasks. It remains unclear whether they translate\nto the diverse downstream applications of LCLMs, and the inconsistency further\ncomplicates model comparison. We investigate the underlying reasons behind\ncurrent practices and find that existing benchmarks often provide noisy signals\ndue to low coverage of applications, insufficient lengths, unreliable metrics,\nand incompatibility with base models. In this work, we present HELMET (How to\nEvaluate Long-context Models Effectively and Thoroughly), a comprehensive\nbenchmark encompassing seven diverse, application-centric categories. We also\naddress many issues in previous benchmarks by adding controllable lengths up to\n128k tokens, model-based evaluation for reliable metrics, and few-shot\nprompting for robustly evaluating base models. Consequently, we demonstrate\nthat HELMET offers more reliable and consistent rankings of frontier LCLMs.\nThrough a comprehensive study of 51 LCLMs, we find that (1) synthetic tasks\nlike NIAH are not good predictors of downstream performance; (2) the diverse\ncategories in HELMET exhibit distinct trends and low correlation with each\nother; and (3) while most LCLMs achieve perfect NIAH scores, open-source models\nsignificantly lag behind closed ones when the task requires full-context\nreasoning or following complex instructions -- the gap widens with increased\nlengths. 
Finally, we recommend using our RAG tasks for fast model development,\nas they are easy to run and more predictive of other downstream performance;\nultimately, we advocate for a holistic evaluation across diverse tasks.\n","authors":["Howard Yen","Tianyu Gao","Minmin Hou","Ke Ding","Daniel Fleischer","Peter Izasak","Moshe Wasserblat","Danqi Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02694v1.pdf","comment":"Code and data are available here:\n https://github.com/princeton-nlp/HELMET"},{"id":"http://arxiv.org/abs/2410.02691v1","updated":"2024-10-03T17:18:03Z","published":"2024-10-03T17:18:03Z","title":"On the Proper Treatment of Tokenization in Psycholinguistics","summary":" Language models are widely used in computational psycholinguistics to test\ntheories that relate the negative log probability (the surprisal) of a region\nof interest (a substring of characters) under a language model to its cognitive\ncost experienced by readers, as operationalized, for example, by gaze duration\non the region. However, the application of modern language models to\npsycholinguistic studies is complicated by the practice of using tokenization\nas an intermediate step in training a model. Doing so results in a language\nmodel over token strings rather than one over character strings. Vexingly,\nregions of interest are generally misaligned with these token strings. The\npaper argues that token-level language models should be (approximately)\nmarginalized into character-level language models before they are used in\npsycholinguistic studies to compute the surprisal of a region of interest;\nthen, the marginalized character-level language model can be used to compute\nthe surprisal of an arbitrary character substring, which we term a focal area,\nthat the experimenter may wish to use as a predictor. Our proposal of\nmarginalizing a token-level model into a character-level one solves this\nmisalignment issue independently of the tokenization scheme. Empirically, we\ndiscover various focal areas whose surprisal is a better psychometric predictor\nthan the surprisal of the region of interest itself.\n","authors":["Mario Giulianelli","Luca Malagutti","Juan Luis Gastaldi","Brian DuSell","Tim Vieira","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2410.02691v1.pdf","comment":"Main conference long paper at EMNLP 2024"},{"id":"http://arxiv.org/abs/2401.03741v2","updated":"2024-10-03T17:15:24Z","published":"2024-01-08T09:01:29Z","title":"Enhanced Automated Code Vulnerability Repair using Large Language Models","summary":" This research addresses the complex challenge of automated repair of code\nvulnerabilities, vital for enhancing digital security in an increasingly\ntechnology-driven world. The study introduces a novel and efficient format for\nthe representation of code modification, using advanced Large Language Models\n(LLMs) such as Code Llama and Mistral. These models, fine-tuned on datasets\nfeaturing C code vulnerabilities, significantly improve the accuracy and\nadaptability of automated code repair techniques. A key finding is the enhanced\nrepair accuracy of these models when compared to previous methods such as\nVulRepair, which underscores their practical utility and efficiency. The\nresearch also offers a critical assessment of current evaluation metrics, such\nas perfect predictions, and their limitations in reflecting the true\ncapabilities of automated repair models in real-world scenarios. 
Following\nthis, it underscores the importance of using test datasets devoid of train\nsamples, emphasizing the need for dataset integrity to enhance the\neffectiveness of LLMs in code repair tasks. The significance of this work is\nits contribution to digital security, setting new standards for automated code\nvulnerability repair and paving the way for future advancements in the fields\nof cybersecurity and artificial intelligence. The study does not only highlight\nthe potential of LLMs in enhancing code security but also fosters further\nexploration and research in these crucial areas.\n","authors":["David de-Fitero-Dominguez","Eva Garcia-Lopez","Antonio Garcia-Cabot","Jose-Javier Martinez-Herraiz"],"pdf_url":"https://arxiv.org/pdf/2401.03741v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03650v2","updated":"2024-10-03T17:13:04Z","published":"2024-09-05T16:08:19Z","title":"On the Limited Generalization Capability of the Implicit Reward Model\n Induced by Direct Preference Optimization","summary":" Reinforcement Learning from Human Feedback (RLHF) is an effective approach\nfor aligning language models to human preferences. Central to RLHF is learning\na reward function for scoring human preferences. Two main approaches for\nlearning a reward model are 1) training an EXplicit Reward Model (EXRM) as in\nRLHF, and 2) using an implicit reward learned from preference data through\nmethods such as Direct Preference Optimization (DPO). Prior work has shown that\nthe implicit reward model of DPO (denoted as DPORM) can approximate an EXRM in\nthe limit. DPORM's effectiveness directly implies the optimality of the learned\npolicy, and also has practical implication for LLM alignment methods including\niterative DPO. However, it is unclear how well DPORM empirically matches the\nperformance of EXRM. This work studies the accuracy at distinguishing preferred\nand rejected answers for both DPORM and EXRM. Our findings indicate that even\nthough DPORM fits the training dataset comparably, it generalizes less\neffectively than EXRM, especially when the validation datasets contain\ndistribution shifts. Across five out-of-distribution settings, DPORM has a mean\ndrop in accuracy of 3% and a maximum drop of 7%. These findings highlight that\nDPORM has limited generalization ability and substantiates the integration of\nan explicit reward model in iterative DPO approaches.\n","authors":["Yong Lin","Skyler Seto","Maartje ter Hoeve","Katherine Metcalf","Barry-John Theobald","Xuan Wang","Yizhe Zhang","Chen Huang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03650v2.pdf","comment":"12 pages, 8 tables, 3 figures; Paper Accepted at EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2410.02684v1","updated":"2024-10-03T17:10:41Z","published":"2024-10-03T17:10:41Z","title":"HiddenGuard: Fine-Grained Safe Generation with Specialized\n Representation Router","summary":" As Large Language Models (LLMs) grow increasingly powerful, ensuring their\nsafety and alignment with human values remains a critical challenge. Ideally,\nLLMs should provide informative responses while avoiding the disclosure of\nharmful or sensitive information. However, current alignment approaches, which\nrely heavily on refusal strategies, such as training models to completely\nreject harmful prompts or applying coarse filters are limited by their binary\nnature. 
These methods either fully deny access to information or grant it\nwithout sufficient nuance, leading to overly cautious responses or failures to\ndetect subtle harmful content. For example, LLMs may refuse to provide basic,\npublic information about medication due to misuse concerns. Moreover, these\nrefusal-based methods struggle to handle mixed-content scenarios and lack the\nability to adapt to context-dependent sensitivities, which can result in\nover-censorship of benign content. To overcome these challenges, we introduce\nHiddenGuard, a novel framework for fine-grained, safe generation in LLMs.\nHiddenGuard incorporates Prism (rePresentation Router for In-Stream\nModeration), which operates alongside the LLM to enable real-time, token-level\ndetection and redaction of harmful content by leveraging intermediate hidden\nstates. This fine-grained approach allows for more nuanced, context-aware\nmoderation, enabling the model to generate informative responses while\nselectively redacting or replacing sensitive information, rather than outright\nrefusal. We also contribute a comprehensive dataset with token-level\nfine-grained annotations of potentially harmful information across diverse\ncontexts. Our experiments demonstrate that HiddenGuard achieves over 90% in F1\nscore for detecting and redacting harmful content while preserving the overall\nutility and informativeness of the model's responses.\n","authors":["Lingrui Mei","Shenghua Liu","Yiwei Wang","Baolong Bi","Ruibin Yuan","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.02684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18725v2","updated":"2024-10-03T17:10:09Z","published":"2024-06-26T19:48:48Z","title":"Jailbreaking LLMs with Arabic Transliteration and Arabizi","summary":" This study identifies the potential vulnerabilities of Large Language Models\n(LLMs) to 'jailbreak' attacks, specifically focusing on the Arabic language and\nits various forms. While most research has concentrated on English-based prompt\nmanipulation, our investigation broadens the scope to investigate the Arabic\nlanguage. We initially tested the AdvBench benchmark in Standardized Arabic,\nfinding that even with prompt manipulation techniques like prefix injection, it\nwas insufficient to provoke LLMs into generating unsafe content. However, when\nusing Arabic transliteration and chatspeak (or arabizi), we found that unsafe\ncontent could be produced on platforms like OpenAI GPT-4 and Anthropic Claude 3\nSonnet. Our findings suggest that using Arabic and its various forms could\nexpose information that might remain hidden, potentially increasing the risk of\njailbreak attacks. We hypothesize that this exposure could be due to the\nmodel's learned connection to specific words, highlighting the need for more\ncomprehensive safety training across all language forms.\n","authors":["Mansour Al Ghanim","Saleh Almohaimeed","Mengxin Zheng","Yan Solihin","Qian Lou"],"pdf_url":"https://arxiv.org/pdf/2406.18725v2.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.02683v1","updated":"2024-10-03T17:08:52Z","published":"2024-10-03T17:08:52Z","title":"DailyDilemmas: Revealing Value Preferences of LLMs with Quandaries of\n Daily Life","summary":" As we increasingly seek guidance from LLMs for decision-making in daily life,\nmany of these decisions are not clear-cut and depend significantly on the\npersonal values and ethical standards of the users. 
We present DailyDilemmas, a\ndataset of 1,360 moral dilemmas encountered in everyday life. Each dilemma\nincludes two possible actions and with each action, the affected parties and\nhuman values invoked. Based on these dilemmas, we consolidated a set of human\nvalues across everyday topics e.g., interpersonal relationships, workplace, and\nenvironmental issues. We evaluated LLMs on these dilemmas to determine what\naction they will take and the values represented by these actions. Then, we\nanalyzed these values through the lens of five popular theories inspired by\nsociology, psychology and philosophy. These theories are: World Value Survey,\nMoral Foundation Theory, Maslow's Hierarchy of Needs, Aristotle's Virtues, and\nPlutchik Wheel of Emotion. We find that LLMs are most aligned with the\nself-expression over survival values in terms of World Value Survey, care over\nloyalty in Moral Foundation Theory. Interestingly, we find large preferences\ndifferences in models for some core values such as truthfulness e.g.,\nMixtral-8x7B model tends to neglect it by 9.7% while GPT-4-turbo model tends to\nselect it by 9.4%. We also study the recent guidance released by OpenAI\n(ModelSpec), and Anthropic (Constitutional AI) to understand how their released\nprinciples reflect their actual value prioritization when facing nuanced moral\nreasoning in daily-life settings. We find that end users cannot effectively\nsteer such prioritization using system prompts.\n","authors":["Yu Ying Chiu","Liwei Jiang","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2410.02683v1.pdf","comment":"Preprint. Under Review"},{"id":"http://arxiv.org/abs/2311.09756v2","updated":"2024-10-03T17:04:50Z","published":"2023-11-16T10:30:26Z","title":"StorySparkQA: Expert-Annotated QA Pairs with Real-World Knowledge for\n Children's Story-Based Learning","summary":" Interactive story reading is a common parent-child activity, where parents\nexpect to teach both language skills and real-world knowledge beyond the story.\nWhile increasing storytelling and reading systems have been developed for this\nactivity, they often fail to infuse real-world knowledge into the conversation.\nThis limitation can be attributed to the existing question-answering (QA)\ndatasets used for children's education, upon which the systems are built,\nfailing to capture the nuances of how education experts think when conducting\ninteractive story reading activities. To bridge this gap, we design an\nannotation framework, empowered by existing knowledge graph to capture experts'\nannotations and thinking process, and leverage this framework to construct\nStorySparkQA dataset, which comprises 5,868 expert-annotated QA pairs with\nreal-world knowledge. We conduct automated and human expert evaluations across\nvarious QA pair generation settings to demonstrate that our StorySparkQA can\neffectively support models in generating QA pairs that target real-world\nknowledge beyond story content. 
StorySparkQA is available at\nhttps://huggingface.co/datasets/NEU-HAI/StorySparkQA.\n","authors":["Jiaju Chen","Yuxuan Lu","Shao Zhang","Bingsheng Yao","Yuanzhe Dong","Ying Xu","Yunyao Li","Qianwen Wang","Dakuo Wang","Yuling Sun"],"pdf_url":"https://arxiv.org/pdf/2311.09756v2.pdf","comment":"Accepted at EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2410.02678v1","updated":"2024-10-03T17:04:48Z","published":"2024-10-03T17:04:48Z","title":"Distilling an End-to-End Voice Assistant Without Instruction Training\n Data","summary":" Voice assistants, such as Siri and Google Assistant, typically model audio\nand text separately, resulting in lost speech information and increased\ncomplexity. Recent efforts to address this with end-to-end Speech Large\nLanguage Models (LLMs) trained with supervised finetuning (SFT)\n have led to models ``forgetting\" capabilities from text-only LLMs. Our work\nproposes an alternative paradigm for training Speech LLMs without instruction\ndata, using the response of a text-only LLM to transcripts as self-supervision.\nImportantly, this process can be performed without annotated responses. We show\nthat our Distilled Voice Assistant (DiVA) generalizes to Spoken Question\nAnswering, Classification, and Translation. Furthermore, we show that DiVA\nbetter meets user preferences, achieving a 72\\% win rate compared with\nstate-of-the-art models like Qwen 2 Audio, despite using $>$100x less training\ncompute.\n","authors":["William Held","Ella Li","Michael Ryan","Weiyan Shi","Yanzhe Zhang","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2410.02678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02677v1","updated":"2024-10-03T17:04:31Z","published":"2024-10-03T17:04:31Z","title":"CulturalBench: a Robust, Diverse and Challenging Benchmark on Measuring\n the (Lack of) Cultural Knowledge of LLMs","summary":" To make large language models (LLMs) more helpful across diverse cultures, it\nis essential to have effective cultural knowledge benchmarks to measure and\ntrack our progress. Effective benchmarks need to be robust, diverse, and\nchallenging. We introduce CulturalBench: a set of 1,227 human-written and\nhuman-verified questions for effectively assessing LLMs' cultural knowledge,\ncovering 45 global regions including the underrepresented ones like Bangladesh,\nZimbabwe, and Peru. Questions - each verified by five independent annotators -\nspan 17 diverse topics ranging from food preferences to greeting etiquettes. We\nevaluate models on two setups: CulturalBench-Easy and CulturalBench-Hard which\nshare the same questions but asked differently. We find that LLMs are sensitive\nto such difference in setups (e.g., GPT-4o with 27.3% difference). Compared to\nhuman performance (92.6% accuracy), CulturalBench-Hard is more challenging for\nfrontier LLMs with the best performing model (GPT-4o) at only 61.5% and the\nworst (Llama3-8b) at 21.4%. Moreover, we find that LLMs often struggle with\ntricky questions that have multiple correct answers (e.g., What utensils do the\nChinese usually use?), revealing a tendency to converge to a single answer. Our\nresults also indicate that OpenAI GPT-4o substantially outperform other\nproprietary and open source models in questions related to all but one region\n(Oceania). 
Nonetheless, all models consistently underperform on questions\nrelated to South America and the Middle East.\n","authors":["Yu Ying Chiu","Liwei Jiang","Bill Yuchen Lin","Chan Young Park","Shuyue Stella Li","Sahithya Ravi","Mehar Bhatia","Maria Antoniak","Yulia Tsvetkov","Vered Shwartz","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2410.02677v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2410.02675v1","updated":"2024-10-03T17:02:21Z","published":"2024-10-03T17:02:21Z","title":"FAN: Fourier Analysis Networks","summary":" Despite the remarkable success achieved by neural networks, particularly\nthose represented by MLP and Transformer, we reveal that they exhibit potential\nflaws in the modeling and reasoning of periodicity, i.e., they tend to memorize\nthe periodic data rather than genuinely understanding the underlying principles\nof periodicity. However, periodicity is a crucial trait in various forms of\nreasoning and generalization, underpinning predictability across natural and\nengineered systems through recurring patterns in observations. In this paper,\nwe propose FAN, a novel network architecture based on Fourier Analysis, which\nempowers the ability to efficiently model and reason about periodic phenomena.\nBy introducing Fourier Series, the periodicity is naturally integrated into the\nstructure and computational processes of the neural network, thus achieving a\nmore accurate expression and prediction of periodic patterns. As a promising\nsubstitute to multi-layer perceptron (MLP), FAN can seamlessly replace MLP in\nvarious models with fewer parameters and FLOPs. Through extensive experiments,\nwe demonstrate the effectiveness of FAN in modeling and reasoning about\nperiodic functions, and the superiority and generalizability of FAN across a\nrange of real-world tasks, including symbolic formula representation, time\nseries forecasting, and language modeling.\n","authors":["Yihong Dong","Ge Li","Yongding Tao","Xue Jiang","Kechi Zhang","Jia Li","Jing Su","Jun Zhang","Jingjing Xu"],"pdf_url":"https://arxiv.org/pdf/2410.02675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02674v1","updated":"2024-10-03T16:58:21Z","published":"2024-10-03T16:58:21Z","title":"Examining Language Modeling Assumptions Using an Annotated Literary\n Dialect Corpus","summary":" We present a dataset of 19th century American literary orthovariant tokens\nwith a novel layer of human-annotated dialect group tags designed to serve as\nthe basis for computational experiments exploring literarily meaningful\northographic variation. We perform an initial broad set of experiments over\nthis dataset using both token (BERT) and character (CANINE)-level contextual\nlanguage models. We find indications that the \"dialect effect\" produced by\nintentional orthographic variation employs multiple linguistic channels, and\nthat these channels are able to be surfaced to varied degrees given particular\nlanguage modelling assumptions. 
Specifically, we find evidence showing that\nchoice of tokenization scheme meaningfully impact the type of orthographic\ninformation a model is able to surface.\n","authors":["Craig Messner","Tom Lippincott"],"pdf_url":"https://arxiv.org/pdf/2410.02674v1.pdf","comment":"Accepted to NLP4DH@EMNLP2024"},{"id":"http://arxiv.org/abs/2407.07950v2","updated":"2024-10-03T16:54:59Z","published":"2024-07-10T18:00:05Z","title":"Rel-A.I.: An Interaction-Centered Approach To Measuring Human-LM\n Reliance","summary":" The ability to communicate uncertainty, risk, and limitation is crucial for\nthe safety of large language models. However, current evaluations of these\nabilities rely on simple calibration, asking whether the language generated by\nthe model matches appropriate probabilities. Instead, evaluation of this aspect\nof LLM communication should focus on the behaviors of their human\ninterlocutors: how much do they rely on what the LLM says? Here we introduce an\ninteraction-centered evaluation framework called Rel-A.I. (pronounced \"rely\"})\nthat measures whether humans rely on LLM generations. We use this framework to\nstudy how reliance is affected by contextual features of the interaction (e.g,\nthe knowledge domain that is being discussed), or the use of greetings\ncommunicating warmth or competence (e.g., \"I'm happy to help!\"). We find that\ncontextual characteristics significantly affect human reliance behavior. For\nexample, people rely 10% more on LMs when responding to questions involving\ncalculations and rely 30% more on LMs that are perceived as more competent. Our\nresults show that calibration and language quality alone are insufficient in\nevaluating the risks of human-LM interactions, and illustrate the need to\nconsider features of the interactional context.\n","authors":["Kaitlyn Zhou","Jena D. Hwang","Xiang Ren","Nouha Dziri","Dan Jurafsky","Maarten Sap"],"pdf_url":"https://arxiv.org/pdf/2407.07950v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2407.07565v3","updated":"2024-10-03T16:48:55Z","published":"2024-07-10T11:50:20Z","title":"On Leakage of Code Generation Evaluation Datasets","summary":" In this paper, we consider contamination by code generation test sets, in\nparticular in their use in modern large language models. We discuss three\npossible sources of such contamination and show findings supporting each of\nthem: (i) direct data leakage, (ii) indirect data leakage through the use of\nsynthetic data and (iii) overfitting to evaluation sets during model selection.\nTo address this, we release Less Basic Python Problems (LBPP): an\nuncontaminated new benchmark of 161 prompts with their associated Python\nsolutions. LBPP is released at https://huggingface.co/datasets/CohereForAI/lbpp .\n","authors":["Alexandre Matton","Tom Sherborne","Dennis Aumiller","Elena Tommasone","Milad Alizadeh","Jingyi He","Raymond Ma","Maxime Voisin","Ellen Gilsenan-McMahon","Matthias Gallé"],"pdf_url":"https://arxiv.org/pdf/2407.07565v3.pdf","comment":"EMNLP 2024 Findings. 5 main pages, 9 in total"},{"id":"http://arxiv.org/abs/2410.02660v1","updated":"2024-10-03T16:46:52Z","published":"2024-10-03T16:46:52Z","title":"How to Train Long-Context Language Models (Effectively)","summary":" We study continued training and supervised fine-tuning (SFT) of a language\nmodel (LM) to make effective use of long-context information. 
We first\nestablish a reliable evaluation protocol to guide model development -- Instead\nof perplexity or simple needle-in-a-haystack (NIAH) tests, we use a broad set\nof long-context tasks, and we evaluate models after SFT with instruction data\nas this better reveals long-context abilities. Supported by our robust\nevaluations, we run thorough experiments to decide the data mix for continued\npre-training, the instruction tuning dataset, and many other design choices. We\nfind that (1) code repositories and books are excellent sources of long data,\nbut it is crucial to combine them with high-quality short data; (2) training\nwith a sequence length beyond the evaluation length boosts long-context\nperformance; (3) for SFT, using only short instruction datasets yields strong\nperformance on long-context tasks. Our final model, ProLong-8B, which is\ninitialized from Llama-3 and trained on 40B tokens, demonstrates\nstate-of-the-art long-context performance among similarly sized models at a\nlength of 128K. ProLong outperforms Llama-3.18B-Instruct on the majority of\nlong-context tasks despite having seen only 5% as many tokens during\nlong-context training. Additionally, ProLong can effectively process up to 512K\ntokens, one of the longest context windows of publicly available LMs.\n","authors":["Tianyu Gao","Alexander Wettig","Howard Yen","Danqi Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02660v1.pdf","comment":"Our code, data, and models are available at\n https://github.com/princeton-nlp/ProLong"},{"id":"http://arxiv.org/abs/2407.11969v3","updated":"2024-10-03T16:46:09Z","published":"2024-07-16T17:59:55Z","title":"Does Refusal Training in LLMs Generalize to the Past Tense?","summary":" Refusal training is widely used to prevent LLMs from generating harmful,\nundesirable, or illegal outputs. We reveal a curious generalization gap in the\ncurrent refusal training approaches: simply reformulating a harmful request in\nthe past tense (e.g., \"How to make a Molotov cocktail?\" to \"How did people make\na Molotov cocktail?\") is often sufficient to jailbreak many state-of-the-art\nLLMs. We systematically evaluate this method on Llama-3 8B, Claude-3.5 Sonnet,\nGPT-3.5 Turbo, Gemma-2 9B, Phi-3-Mini, GPT-4o mini, GPT-4o, o1-mini,\no1-preview, and R2D2 models using GPT-3.5 Turbo as a reformulation model. For\nexample, the success rate of this simple attack on GPT-4o increases from 1%\nusing direct requests to 88% using 20 past tense reformulation attempts on\nharmful requests from JailbreakBench with GPT-4 as a jailbreak judge.\nInterestingly, we also find that reformulations in the future tense are less\neffective, suggesting that refusal guardrails tend to consider past historical\nquestions more benign than hypothetical future questions. Moreover, our\nexperiments on fine-tuning GPT-3.5 Turbo show that defending against past\nreformulations is feasible when past tense examples are explicitly included in\nthe fine-tuning data. Overall, our findings highlight that the widely used\nalignment techniques -- such as SFT, RLHF, and adversarial training -- employed\nto align the studied models can be brittle and do not always generalize as\nintended. We provide code and jailbreak artifacts at\nhttps://github.com/tml-epfl/llm-past-tense.\n","authors":["Maksym Andriushchenko","Nicolas Flammarion"],"pdf_url":"https://arxiv.org/pdf/2407.11969v3.pdf","comment":"Update in v3: o1-mini and o1-preview results (on top of GPT-4o and\n Claude 3.5 Sonnet added in v2). 
We provide code and jailbreak artifacts at\n https://github.com/tml-epfl/llm-past-tense"},{"id":"http://arxiv.org/abs/2410.02657v1","updated":"2024-10-03T16:43:17Z","published":"2024-10-03T16:43:17Z","title":"Hate Personified: Investigating the role of LLMs in content moderation","summary":" For subjective tasks such as hate detection, where people perceive hate\ndifferently, the Large Language Model's (LLM) ability to represent diverse\ngroups is unclear. By including additional context in prompts, we\ncomprehensively analyze LLM's sensitivity to geographical priming, persona\nattributes, and numerical information to assess how well the needs of various\ngroups are reflected. Our findings on two LLMs, five languages, and six\ndatasets reveal that mimicking persona-based attributes leads to annotation\nvariability. Meanwhile, incorporating geographical signals leads to better\nregional alignment. We also find that the LLMs are sensitive to numerical\nanchors, indicating the ability to leverage community-based flagging efforts\nand exposure to adversaries. Our work provides preliminary guidelines and\nhighlights the nuances of applying LLMs in culturally sensitive cases.\n","authors":["Sarah Masud","Sahajpreet Singh","Viktor Hangya","Alexander Fraser","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2410.02657v1.pdf","comment":"17 pages, 6 Figures, 13 Tables, EMNLP'24 Mains"},{"id":"http://arxiv.org/abs/2402.16382v2","updated":"2024-10-03T16:39:32Z","published":"2024-02-26T08:08:03Z","title":"Immunization against harmful fine-tuning attacks","summary":" Large Language Models (LLMs) are often trained with safety guards intended to\nprevent harmful text generation. However, such safety training can be removed\nby fine-tuning the LLM on harmful datasets. While this emerging threat (harmful\nfine-tuning attacks) has been characterized by previous work, there is little\nunderstanding of how we should proceed in constructing and validating defenses\nagainst these attacks especially in the case where defenders would not have\ncontrol of the fine-tuning process. We introduce a formal framework based on\nthe training budget of an attacker which we call \"Immunization\" conditions.\nUsing a formal characterisation of the harmful fine-tuning problem, we provide\na thorough description of what a successful defense must comprise of and\nestablish a set of guidelines on how rigorous defense research that gives us\nconfidence should proceed.\n","authors":["Domenic Rosati","Jan Wehner","Kai Williams","Łukasz Bartoszcze","Jan Batzner","Hassan Sajjad","Frank Rudzicz"],"pdf_url":"https://arxiv.org/pdf/2402.16382v2.pdf","comment":"Published in EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.02653v1","updated":"2024-10-03T16:36:35Z","published":"2024-10-03T16:36:35Z","title":"Measuring and Improving Persuasiveness of Generative Models","summary":" LLMs are increasingly being used in workflows involving generating content to\nbe consumed by humans (e.g., marketing) and also in directly interacting with\nhumans (e.g., through chatbots). The development of such systems that are\ncapable of generating verifiably persuasive messages presents both\nopportunities and challenges for society. On the one hand, such systems could\npositively impact domains like advertising and social good, such as addressing\ndrug addiction, and on the other, they could be misused for spreading\nmisinformation and shaping political opinions. 
To channel LLMs' impact on\nsociety, we need to develop systems to measure and benchmark their\npersuasiveness. With this motivation, we introduce PersuasionBench and\nPersuasionArena, the first large-scale benchmark and arena containing a battery\nof tasks to measure the persuasion ability of generative models automatically.\nWe investigate to what extent LLMs know and leverage linguistic patterns that\ncan help them generate more persuasive language. Our findings indicate that the\npersuasiveness of LLMs correlates positively with model size, but smaller\nmodels can also be made to have a higher persuasiveness than much larger\nmodels. Notably, targeted training using synthetic and natural datasets\nsignificantly enhances smaller models' persuasive capabilities, challenging\nscale-dependent assumptions. Our findings carry key implications for both model\ndevelopers and policymakers. For instance, while the EU AI Act and California's\nSB-1047 aim to regulate AI models based on the number of floating point\noperations, we demonstrate that simple metrics like this alone fail to capture\nthe full scope of AI's societal impact. We invite the community to explore and\ncontribute to PersuasionArena and PersuasionBench, available at\nhttps://bit.ly/measure-persuasion, to advance our understanding of AI-driven\npersuasion and its societal implications.\n","authors":["Somesh Singh","Yaman K Singla","Harini SI","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2410.02653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02650v1","updated":"2024-10-03T16:34:46Z","published":"2024-10-03T16:34:46Z","title":"Undesirable Memorization in Large Language Models: A Survey","summary":" While recent research increasingly showcases the remarkable capabilities of\nLarge Language Models (LLMs), it's vital to confront their hidden pitfalls.\nAmong these challenges, the issue of memorization stands out, posing\nsignificant ethical and legal risks. In this paper, we presents a\nSystematization of Knowledge (SoK) on the topic of memorization in LLMs.\nMemorization is the effect that a model tends to store and reproduce phrases or\npassages from the training data and has been shown to be the fundamental issue\nto various privacy and security attacks against LLMs.\n We begin by providing an overview of the literature on the memorization,\nexploring it across five key dimensions: intentionality, degree,\nretrievability, abstraction, and transparency. Next, we discuss the metrics and\nmethods used to measure memorization, followed by an analysis of the factors\nthat contribute to memorization phenomenon. We then examine how memorization\nmanifests itself in specific model architectures and explore strategies for\nmitigating these effects. 
We conclude our overview by identifying potential\nresearch topics for the near future: to develop methods for balancing\nperformance and privacy in LLMs, and the analysis of memorization in specific\ncontexts, including conversational agents, retrieval-augmented generation,\nmultilingual language models, and diffusion language models.\n","authors":["Ali Satvaty","Suzan Verberne","Fatih Turkmen"],"pdf_url":"https://arxiv.org/pdf/2410.02650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02647v1","updated":"2024-10-03T16:33:35Z","published":"2024-10-03T16:33:35Z","title":"Immunogenicity Prediction with Dual Attention Enables Vaccine Target\n Selection","summary":" Immunogenicity prediction is a central topic in reverse vaccinology for\nfinding candidate vaccines that can trigger protective immune responses.\nExisting approaches typically rely on highly compressed features and simple\nmodel architectures, leading to limited prediction accuracy and poor\ngeneralizability. To address these challenges, we introduce ProVaccine, a novel\ndeep learning solution with a dual attention mechanism that integrates\npre-trained latent vector representations of protein sequences and structures.\nWe also compile the most comprehensive immunogenicity dataset to date,\nencompassing over 9,500 antigen sequences, structures, and immunogenicity\nlabels from bacteria, viruses, and tumors. Extensive experiments demonstrate\nthat ProVaccine outperforms existing methods across a wide range of evaluation\nmetrics. Furthermore, we establish a post-hoc validation protocol to assess the\npractical significance of deep learning models in tackling vaccine design\nchallenges. Our work provides an effective tool for vaccine design and sets\nvaluable benchmarks for future research.\n","authors":["Song Li","Yang Tan","Song Ke","Liang Hong","Bingxin Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.02647v1.pdf","comment":"18 pages, 11 tables, 5 figures"},{"id":"http://arxiv.org/abs/2409.02026v2","updated":"2024-10-03T16:31:59Z","published":"2024-09-03T16:20:22Z","title":"Foundations of Large Language Model Compression -- Part 1: Weight\n Quantization","summary":" In recent years, compression of large language models (LLMs) has emerged as\nan important problem to enable language model deployment on\nresource-constrained devices, reduce computational costs, and mitigate the\nenvironmental footprint of large-scale AI infrastructure. In this paper, we lay\ndown the foundation for LLM quantization from a convex optimization perspective\nand propose a quantization technique that builds on this foundation for optimum\nquantization outcomes. Our quantization framework, CVXQ, scales to models\ncontaining hundreds of billions of weight parameters and provides users with\nthe flexibility to compress models to any specified model size, post-training.\nA reference implementation of CVXQ can be obtained from github.com/seannz/cvxq.\n","authors":["Sean I. Young"],"pdf_url":"https://arxiv.org/pdf/2409.02026v2.pdf","comment":"Preprint. 17 pages, 4 figures, 5 appendices"},{"id":"http://arxiv.org/abs/2409.11295v2","updated":"2024-10-03T16:30:43Z","published":"2024-09-17T15:49:44Z","title":"EIA: Environmental Injection Attack on Generalist Web Agents for Privacy\n Leakage","summary":" Generalist web agents have demonstrated remarkable potential in autonomously\ncompleting a wide range of tasks on real websites, significantly boosting human\nproductivity. 
However, web tasks, such as booking flights, usually involve\nusers' PII, which may be exposed to potential privacy risks if web agents\naccidentally interact with compromised websites, a scenario that remains\nlargely unexplored in the literature. In this work, we narrow this gap by\nconducting the first study on the privacy risks of generalist web agents in\nadversarial environments. First, we present a realistic threat model for\nattacks on the website, where we consider two adversarial targets: stealing\nusers' specific PII or the entire user request. Then, we propose a novel attack\nmethod, termed Environmental Injection Attack (EIA). EIA injects malicious\ncontent designed to adapt well to environments where the agents operate and our\nwork instantiates EIA specifically for privacy scenarios in web environments.\nWe collect 177 action steps that involve diverse PII categories on realistic\nwebsites from the Mind2Web, and conduct experiments using one of the most\ncapable generalist web agent frameworks to date. The results demonstrate that\nEIA achieves up to 70% ASR in stealing specific PII and 16% ASR for full user\nrequest. Additionally, by accessing the stealthiness and experimenting with a\ndefensive system prompt, we indicate that EIA is hard to detect and mitigate.\nNotably, attacks that are not well adapted for a webpage can be detected via\nhuman inspection, leading to our discussion about the trade-off between\nsecurity and autonomy. However, extra attackers' efforts can make EIA\nseamlessly adapted, rendering such supervision ineffective. Thus, we further\ndiscuss the defenses at the pre- and post-deployment stages of the websites\nwithout relying on human supervision and call for more advanced defense\nstrategies.\n","authors":["Zeyi Liao","Lingbo Mo","Chejian Xu","Mintong Kang","Jiawei Zhang","Chaowei Xiao","Yuan Tian","Bo Li","Huan Sun"],"pdf_url":"https://arxiv.org/pdf/2409.11295v2.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2410.02642v1","updated":"2024-10-03T16:25:37Z","published":"2024-10-03T16:25:37Z","title":"Attention in Large Language Models Yields Efficient Zero-Shot Re-Rankers","summary":" Information retrieval (IR) systems have played a vital role in modern digital\nlife and have cemented their continued usefulness in this new era of generative\nAI via retrieval-augmented generation. With strong language processing\ncapabilities and remarkable versatility, large language models (LLMs) have\nbecome popular choices for zero-shot re-ranking in IR systems. So far,\nLLM-based re-ranking methods rely on strong generative capabilities, which\nrestricts their use to either specialized or powerful proprietary models. Given\nthese restrictions, we ask: is autoregressive generation necessary and optimal\nfor LLMs to perform re-ranking? We hypothesize that there are abundant signals\nrelevant to re-ranking within LLMs that might not be used to their full\npotential via generation. To more directly leverage such signals, we propose\nin-context re-ranking (ICR), a novel method that leverages the change in\nattention pattern caused by the search query for accurate and efficient\nre-ranking. To mitigate the intrinsic biases in LLMs, we propose a calibration\nmethod using a content-free query. Due to the absence of generation, ICR only\nrequires two ($O(1)$) forward passes to re-rank $N$ documents, making it\nsubstantially more efficient than generative re-ranking methods that require at\nleast $O(N)$ forward passes. 
Our novel design also enables ICR to be applied to\nany LLM without specialized training while guaranteeing a well-formed ranking.\nExtensive experiments with two popular open-weight LLMs on standard single-hop\nand multi-hop information retrieval benchmarks show that ICR outperforms\nRankGPT while cutting the latency by more than 60% in practice. Through\ndetailed analyses, we show that ICR's performance is specially strong on tasks\nthat require more complex re-ranking signals. Our findings call for further\nexploration on novel ways of utilizing open-weight LLMs beyond text generation.\n","authors":["Shijie Chen","Bernal Jiménez Gutiérrez","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2410.02642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02631v1","updated":"2024-10-03T16:15:04Z","published":"2024-10-03T16:15:04Z","title":"Large Language Model for Multi-Domain Translation: Benchmarking and\n Domain CoT Fine-tuning","summary":" Achieving consistent high-quality machine translation (MT) across diverse\ndomains remains a significant challenge, primarily due to the limited and\nimbalanced parallel training data available in various domains. While large\nlanguage models (LLMs) have demonstrated impressive general understanding and\ngeneration abilities, their potential in multi-domain MT is under-explored. We\nestablish a comprehensive benchmark for multi-domain translation, featuring 25\nGerman$\\Leftrightarrow$English and 22 Chinese$\\Leftrightarrow$English test sets\nrespectively covering 15 domains. Our evaluation of prominent LLMs reveals a\ndiscernible performance gap against traditional MT systems, highlighting domain\noverfitting and catastrophic forgetting issues after fine-tuning on\ndomain-limited corpora. To mitigate this, we propose a domain Chain of Thought\n(CoT) fine-tuning technique that utilizes the intrinsic multi-domain\nintelligence of LLMs to improve translation performance. This method inspires\nthe LLM to perceive domain information from the source text, which then serves\nas a helpful hint to guide the translation process. Despite being trained on a\nsmall dataset of four domains, our CoT fine-tune approach achieves notable\nenhancements in translation accuracy and domain robustness than traditional\nfine-tuning, as evidenced by an average 1.53 BLEU score increase in over 20\nGerman$\\rightarrow$English distinct out-of-domain tests.\n","authors":["Tianxiang Hu","Pei Zhang","Baosong Yang","Jun Xie","Derek F. Wong","Rui Wang"],"pdf_url":"https://arxiv.org/pdf/2410.02631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08702v4","updated":"2024-10-03T16:11:43Z","published":"2024-02-13T16:38:01Z","title":"PRompt Optimization in Multi-Step Tasks (PROMST): Integrating Human\n Feedback and Heuristic-based Sampling","summary":" Prompt optimization aims to find the best prompt to a large language model\n(LLM) for a given task. LLMs have been successfully used to help find and\nimprove prompt candidates for single-step tasks. 
However, realistic tasks for\nagents are multi-step and introduce new challenges: (1) Prompt content is\nlikely to be more extensive and complex, making it more difficult for LLMs to\nanalyze errors, (2) the impact of an individual step is difficult to evaluate,\nand (3) different people may have varied preferences about task execution.\nWhile humans struggle to optimize prompts, they are good at providing feedback\nabout LLM outputs; we therefore introduce a new LLM-driven discrete prompt\noptimization framework PRompt Optimization in Multi-Step Tasks (PROMST) that\nincorporates human-designed feedback rules to automatically offer direct\nsuggestions for improvement. We also use an extra learned heuristic model that\npredicts prompt performance to efficiently sample from prompt candidates. This\napproach significantly outperforms both human-engineered prompts and several\nother prompt optimization methods across 11 representative multi-step tasks (an\naverage 10.6\\%-29.3\\% improvement to current best methods on five LLMs\nrespectively). We believe our work can serve as a benchmark for automatic\nprompt optimization for LLM-driven multi-step tasks. Datasets and Codes are\navailable at https://github.com/yongchao98/PROMST. Project Page is available at\nhttps://yongchao98.github.io/MIT-REALM-PROMST.\n","authors":["Yongchao Chen","Jacob Arkin","Yilun Hao","Yang Zhang","Nicholas Roy","Chuchu Fan"],"pdf_url":"https://arxiv.org/pdf/2402.08702v4.pdf","comment":"62 pages, 14 figures, Published in EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2403.13681v2","updated":"2024-10-03T16:01:01Z","published":"2024-03-20T15:39:54Z","title":"PARAMANU-AYN: Pretrain from scratch or Continual Pretraining of LLMs for\n Legal Domain Adaptation?","summary":" In this paper, we present Paramanu-Ayn, a collection of legal language models\ntrained exclusively on Indian legal case documents. This 97-million-parameter\nAuto-Regressive (AR) decoder-only model was pretrained from scratch with a\ncontext size of 8192 on a single GPU for just 185 hours, achieving an efficient\nMFU of 41.35. We also developed a legal domain specialized BPE tokenizer. We\nevaluated our model using perplexity and zero-shot tasks: case judgment\nprediction with explanation and abstractive case summarization. Paramanu-Ayn\noutperformed Llama-2 7B and Gemini-Pro in case judgment prediction with\nexplanation task on test accuracy by nearly 2 percentage points, despite being\n72 times smaller. In zero-shot abstractive summarization, it surpassed\ndecoder-only LLMs generating fixed-length summaries (5000 tokens) by over 10\npercentage points in BLEU and METEOR metrics, and by nearly 4 percentage points\nin BERTScore. Further evaluations on zero-shot commonsense and mathematical\nbenchmarks showed that Paramanu-Ayn excelled despite being trained exclusively\non legal documents, outperforming Llama-1, Llama-2, and Falcon on\nAGIEVAL-AQuA-RAT and AGIEVAL-SAT-Math tasks. We also instruction-tuned our\nmodel on 10,763 diverse legal tasks, including legal clause generation, legal\ndrafting, case summarization, etc. The Paramanu-Ayn-instruct model scored above\n8 out of 10 in clarity, relevance, completeness, and legal reasoning metrics by\nGPT-3.5-Turbo. We found that our models, were able to learn drafting knowledge\nand generalize to draft legal contracts and legal clauses with limited\ninstruction-tuning. 
Hence, we conclude that for a strong domain-specialized\ngenerative language model (such as legal), domain specialized pretraining from\nscratch is more cost effective, environmentally friendly, and remains\ncompetitive with larger models or even better than adapting LLMs for legal\ndomain tasks.\n","authors":["Mitodru Niyogi","Arnab Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2403.13681v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01744v2","updated":"2024-10-03T15:57:05Z","published":"2024-10-02T16:55:01Z","title":"Leopard: A Vision Language Model For Text-Rich Multi-Image Tasks","summary":" Text-rich images, where text serves as the central visual element guiding the\noverall understanding, are prevalent in real-world applications, such as\npresentation slides, scanned documents, and webpage snapshots. Tasks involving\nmultiple text-rich images are especially challenging, as they require not only\nunderstanding the content of individual images but reasoning about\ninter-relationships and logical flows across multiple visual inputs. Despite\nthe importance of these scenarios, current multimodal large language models\n(MLLMs) struggle to handle such tasks due to two key challenges: (1) the\nscarcity of high-quality instruction tuning datasets for text-rich multi-image\nscenarios, and (2) the difficulty in balancing image resolution with visual\nfeature sequence length. To address these challenges, we propose Leopard, a\nMLLM designed specifically for handling vision-language tasks involving\nmultiple text-rich images. First, we curated about one million high-quality\nmultimodal instruction-tuning data, tailored to text-rich, multi-image\nscenarios. Second, we developed an adaptive high-resolution multi-image\nencoding module to dynamically optimize the allocation of visual sequence\nlength based on the original aspect ratios and resolutions of the input images.\nExperiments across a wide range of benchmarks demonstrate our model's superior\ncapabilities in text-rich, multi-image evaluations and competitive performance\nin general domain evaluations.\n","authors":["Mengzhao Jia","Wenhao Yu","Kaixin Ma","Tianqing Fang","Zhihan Zhang","Siru Ouyang","Hongming Zhang","Meng Jiang","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2410.01744v2.pdf","comment":"Our code is available at https://github.com/Jill0001/Leopard"},{"id":"http://arxiv.org/abs/2409.05197v2","updated":"2024-10-03T15:55:40Z","published":"2024-09-08T19:22:58Z","title":"Seemingly Plausible Distractors in Multi-Hop Reasoning: Are Large\n Language Models Attentive Readers?","summary":" State-of-the-art Large Language Models (LLMs) are accredited with an\nincreasing number of different capabilities, ranging from reading\ncomprehension, over advanced mathematical and reasoning skills to possessing\nscientific knowledge. In this paper we focus on their multi-hop reasoning\ncapability: the ability to identify and integrate information from multiple\ntextual sources.\n Given the concerns with the presence of simplifying cues in existing\nmulti-hop reasoning benchmarks, which allow models to circumvent the reasoning\nrequirement, we set out to investigate, whether LLMs are prone to exploiting\nsuch simplifying cues. We find evidence that they indeed circumvent the\nrequirement to perform multi-hop reasoning, but they do so in more subtle ways\nthan what was reported about their fine-tuned pre-trained language model (PLM)\npredecessors. 
Motivated by this finding, we propose a challenging multi-hop\nreasoning benchmark, by generating seemingly plausible multi-hop reasoning\nchains, which ultimately lead to incorrect answers. We evaluate multiple open\nand proprietary state-of-the-art LLMs, and find that their performance to\nperform multi-hop reasoning is affected, as indicated by up to 45% relative\ndecrease in F1 score when presented with such seemingly plausible alternatives.\nWe conduct a deeper analysis and find evidence that while LLMs tend to ignore\nmisleading lexical cues, misleading reasoning paths indeed present a\nsignificant challenge.\n","authors":["Neeladri Bhuiya","Viktor Schlegel","Stefan Winkler"],"pdf_url":"https://arxiv.org/pdf/2409.05197v2.pdf","comment":"15 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.12191v2","updated":"2024-10-03T15:54:49Z","published":"2024-09-18T17:59:32Z","title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at\n Any Resolution","summary":" We present the Qwen2-VL Series, an advanced upgrade of the previous Qwen-VL\nmodels that redefines the conventional predetermined-resolution approach in\nvisual processing. Qwen2-VL introduces the Naive Dynamic Resolution mechanism,\nwhich enables the model to dynamically process images of varying resolutions\ninto different numbers of visual tokens. This approach allows the model to\ngenerate more efficient and accurate visual representations, closely aligning\nwith human perceptual processes. The model also integrates Multimodal Rotary\nPosition Embedding (M-RoPE), facilitating the effective fusion of positional\ninformation across text, images, and videos. We employ a unified paradigm for\nprocessing both images and videos, enhancing the model's visual perception\ncapabilities. To explore the potential of large multimodal models, Qwen2-VL\ninvestigates the scaling laws for large vision-language models (LVLMs). By\nscaling both the model size-with versions at 2B, 8B, and 72B parameters-and the\namount of training data, the Qwen2-VL Series achieves highly competitive\nperformance. Notably, the Qwen2-VL-72B model achieves results comparable to\nleading models such as GPT-4o and Claude3.5-Sonnet across various multimodal\nbenchmarks, outperforming other generalist models. Code is available at\nhttps://github.com/QwenLM/Qwen2-VL .\n","authors":["Peng Wang","Shuai Bai","Sinan Tan","Shijie Wang","Zhihao Fan","Jinze Bai","Keqin Chen","Xuejing Liu","Jialin Wang","Wenbin Ge","Yang Fan","Kai Dang","Mengfei Du","Xuancheng Ren","Rui Men","Dayiheng Liu","Chang Zhou","Jingren Zhou","Junyang Lin"],"pdf_url":"https://arxiv.org/pdf/2409.12191v2.pdf","comment":"Code is available at https://github.com/QwenLM/Qwen2-VL. arXiv admin\n note: text overlap with arXiv:2408.15262 by other authors"},{"id":"http://arxiv.org/abs/2410.02613v1","updated":"2024-10-03T15:51:36Z","published":"2024-10-03T15:51:36Z","title":"NL-Eye: Abductive NLI for Images","summary":" Will a Visual Language Model (VLM)-based bot warn us about slipping if it\ndetects a wet floor? Recent VLMs have demonstrated impressive capabilities, yet\ntheir ability to infer outcomes and causes remains underexplored. To address\nthis, we introduce NL-Eye, a benchmark designed to assess VLMs' visual\nabductive reasoning skills. NL-Eye adapts the abductive Natural Language\nInference (NLI) task to the visual domain, requiring models to evaluate the\nplausibility of hypothesis images based on a premise image and explain their\ndecisions. 
NL-Eye consists of 350 carefully curated triplet examples (1,050\nimages) spanning diverse reasoning categories: physical, functional, logical,\nemotional, cultural, and social. The data curation process involved two steps -\nwriting textual descriptions and generating images using text-to-image models,\nboth requiring substantial human involvement to ensure high-quality and\nchallenging scenes. Our experiments show that VLMs struggle significantly on\nNL-Eye, often performing at random baseline levels, while humans excel in both\nplausibility prediction and explanation quality. This demonstrates a deficiency\nin the abductive reasoning capabilities of modern VLMs. NL-Eye represents a\ncrucial step toward developing VLMs capable of robust multimodal reasoning for\nreal-world applications, including accident-prevention bots and generated video\nverification.\n","authors":["Mor Ventura","Michael Toker","Nitay Calderon","Zorik Gekhman","Yonatan Bitton","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2410.02613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02611v1","updated":"2024-10-03T15:50:08Z","published":"2024-10-03T15:50:08Z","title":"IndicSentEval: How Effectively do Multilingual Transformer Models encode\n Linguistic Properties for Indic Languages?","summary":" Transformer-based models have revolutionized the field of natural language\nprocessing. To understand why they perform so well and to assess their\nreliability, several studies have focused on questions such as: Which\nlinguistic properties are encoded by these models, and to what extent? How\nrobust are these models in encoding linguistic properties when faced with\nperturbations in the input text? However, these studies have mainly focused on\nBERT and the English language. In this paper, we investigate similar questions\nregarding encoding capability and robustness for 8 linguistic properties across\n13 different perturbations in 6 Indic languages, using 9 multilingual\nTransformer models (7 universal and 2 Indic-specific). To conduct this study,\nwe introduce a novel multilingual benchmark dataset, IndicSentEval, containing\napproximately $\\sim$47K sentences. Surprisingly, our probing analysis of\nsurface, syntactic, and semantic properties reveals that while almost all\nmultilingual models demonstrate consistent encoding performance for English,\nthey show mixed results for Indic languages. As expected, Indic-specific\nmultilingual models capture linguistic properties in Indic languages better\nthan universal models. Intriguingly, universal models broadly exhibit better\nrobustness compared to Indic-specific models, particularly under perturbations\nsuch as dropping both nouns and verbs, dropping only verbs, or keeping only\nnouns. Overall, this study provides valuable insights into probing and\nperturbation-specific strengths and weaknesses of popular multilingual\nTransformer-based models for different Indic languages. 
We make our code and\ndataset publicly available [https://tinyurl.com/IndicSentEval}].\n","authors":["Akhilesh Aravapalli","Mounika Marreddy","Subba Reddy Oota","Radhika Mamidi","Manish Gupta"],"pdf_url":"https://arxiv.org/pdf/2410.02611v1.pdf","comment":"23 pages, 11 figures"},{"id":"http://arxiv.org/abs/2410.02609v1","updated":"2024-10-03T15:49:35Z","published":"2024-10-03T15:49:35Z","title":"Ethio-Fake: Cutting-Edge Approaches to Combat Fake News in\n Under-Resourced Languages Using Explainable AI","summary":" The proliferation of fake news has emerged as a significant threat to the\nintegrity of information dissemination, particularly on social media platforms.\nMisinformation can spread quickly due to the ease of creating and disseminating\ncontent, affecting public opinion and sociopolitical events. Identifying false\ninformation is therefore essential to reducing its negative consequences and\nmaintaining the reliability of online news sources. Traditional approaches to\nfake news detection often rely solely on content-based features, overlooking\nthe crucial role of social context in shaping the perception and propagation of\nnews articles. In this paper, we propose a comprehensive approach that\nintegrates social context-based features with news content features to enhance\nthe accuracy of fake news detection in under-resourced languages. We perform\nseveral experiments utilizing a variety of methodologies, including traditional\nmachine learning, neural networks, ensemble learning, and transfer learning.\nAssessment of the outcomes of the experiments shows that the ensemble learning\napproach has the highest accuracy, achieving a 0.99 F1 score. Additionally,\nwhen compared with monolingual models, the fine-tuned model with the target\nlanguage outperformed others, achieving a 0.94 F1 score. We analyze the\nfunctioning of the models, considering the important features that contribute\nto model performance, using explainable AI techniques.\n","authors":["Mesay Gemeda Yigezu","Melkamu Abay Mersha","Girma Yohannis Bade","Jugal Kalita","Olga Kolesnikova","Alexander Gelbukh"],"pdf_url":"https://arxiv.org/pdf/2410.02609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10960v3","updated":"2024-10-03T15:48:45Z","published":"2024-07-15T17:55:42Z","title":"Fast Matrix Multiplications for Lookup Table-Quantized LLMs","summary":" The deployment of large language models (LLMs) is often constrained by memory\nbandwidth, where the primary bottleneck is the cost of transferring model\nparameters from the GPU's global memory to its registers. When coupled with\ncustom kernels that fuse the dequantization and matmul operations, weight-only\nquantization can thus enable faster inference by reducing the amount of memory\nmovement. However, developing high-performance kernels for weight-quantized\nLLMs presents substantial challenges, especially when the weights are\ncompressed to non-evenly-divisible bit widths (e.g., 3 bits) with non-uniform,\nlookup table (LUT) quantization. This paper describes FLUTE, a flexible lookup\ntable engine for LUT-quantized LLMs, which uses offline restructuring of the\nquantized weight matrix to minimize bit manipulations associated with\nunpacking, and vectorization and duplication of the lookup table to mitigate\nshared memory bandwidth constraints. At batch sizes < 32 and quantization group\nsize of 128 (typical in LLM inference), the FLUTE kernel can be 2-4x faster\nthan existing GEMM kernels. 
As an application of FLUTE, we explore a simple\nextension to lookup table-based NormalFloat quantization and apply it to\nquantize LLaMA3 to various configurations, obtaining competitive quantization\nperformance against strong baselines while obtaining an end-to-end throughput\nincrease of 1.5 to 2 times.\n","authors":["Han Guo","William Brandon","Radostin Cholakov","Jonathan Ragan-Kelley","Eric P. Xing","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2407.10960v3.pdf","comment":"EMNLP 2024 (Findings)"},{"id":"http://arxiv.org/abs/2406.18256v3","updated":"2024-10-03T15:48:31Z","published":"2024-06-26T11:08:17Z","title":"Llamipa: An Incremental Discourse Parser","summary":" This paper provides the first discourse parsing experiments with a large\nlanguage model(LLM) finetuned on corpora annotated in the style of SDRT\n(Segmented Discourse Representation Theory Asher, 1993; Asher and Lascarides,\n2003). The result is a discourse parser, Llamipa (Llama Incremental Parser),\nthat leverages discourse context, leading to substantial performance gains over\napproaches that use encoder-only models to provide local, context-sensitive\nrepresentations of discourse units. Furthermore, it can process discourse data\nincrementally, which is essential for the eventual use of discourse information\nin downstream tasks.\n","authors":["Kate Thompson","Akshay Chaturvedi","Julie Hunter","Nicholas Asher"],"pdf_url":"https://arxiv.org/pdf/2406.18256v3.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2406.18164v3","updated":"2024-10-03T15:46:16Z","published":"2024-06-26T08:24:44Z","title":"Nebula: A discourse aware Minecraft Builder","summary":" When engaging in collaborative tasks, humans efficiently exploit the semantic\nstructure of a conversation to optimize verbal and nonverbal interactions. But\nin recent \"language to code\" or \"language to action\" models, this information\nis lacking. We show how incorporating the prior discourse and nonlinguistic\ncontext of a conversation situated in a nonlinguistic environment can improve\nthe \"language to action\" component of such interactions. We finetune an LLM to\npredict actions based on prior context; our model, Nebula, doubles the\nnet-action F1 score over the baseline on this task of Jayannavar et al.(2020).\nWe also investigate our model's ability to construct shapes and understand\nlocation descriptions using a synthetic dataset\n","authors":["Akshay Chaturvedi","Kate Thompson","Nicholas Asher"],"pdf_url":"https://arxiv.org/pdf/2406.18164v3.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2304.08460v3","updated":"2024-10-03T15:46:13Z","published":"2023-04-17T17:36:35Z","title":"LongForm: Effective Instruction Tuning with Reverse Instructions","summary":" Instruction tuning enables language models to more effectively generalize and\nbetter follow user intent. However, obtaining instruction data is costly and\nchallenging. Prior work employs methods such as expensive human annotation,\ncrowd-sourced datasets with alignment issues, and generating noisy examples via\nLLMs. We introduce the LongForm-C dataset, which is created by reverse\ninstructions. We generate instructions via LLMs for human-written corpus\nexamples using reverse instructions. First we select a diverse set of\nhuman-written documents from corpora such as C4 and Wikipedia; then we generate\ninstructions for these documents via LLMs. 
This approach provides a cheaper and\ncleaner instruction-tuning dataset with natural output and one suitable for\nlong text generation. Our models outperform 10x larger language models without\ninstruction tuning on tasks such as story/recipe generation and long-form\nquestion answering. Moreover, LongForm models outperform prior\ninstruction-tuned models such as FLAN-T5 and Alpaca by a large margin, and\nimprove language understanding capabilities further. We publicly release our\ndata and models: https://github.com/akoksal/LongForm.\n","authors":["Abdullatif Köksal","Timo Schick","Anna Korhonen","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2304.08460v3.pdf","comment":"EMNLP 2024 Findings. This version extends the training with recent\n LLMs, evaluation with new metrics, and NLU tasks"},{"id":"http://arxiv.org/abs/2407.12402v2","updated":"2024-10-03T15:45:52Z","published":"2024-07-17T08:28:55Z","title":"TurkishMMLU: Measuring Massive Multitask Language Understanding in\n Turkish","summary":" Multiple choice question answering tasks evaluate the reasoning,\ncomprehension, and mathematical abilities of Large Language Models (LLMs).\nWhile existing benchmarks employ automatic translation for multilingual\nevaluation, this approach is error-prone and potentially introduces culturally\nbiased questions, especially in social sciences. We introduce the first\nmultitask, multiple-choice Turkish QA benchmark, TurkishMMLU, to evaluate LLMs'\nunderstanding of the Turkish language. TurkishMMLU includes over 10,000\nquestions, covering 9 different subjects from Turkish high-school education\ncurricula. These questions are written by curriculum experts, suitable for the\nhigh-school curricula in Turkey, covering subjects ranging from natural\nsciences and math questions to more culturally representative topics such as\nTurkish Literature and the history of the Turkish Republic. We evaluate over 20\nLLMs, including multilingual open-source (e.g., Gemma, Llama, MT5),\nclosed-source (GPT 4o, Claude, Gemini), and Turkish-adapted (e.g., Trendyol)\nmodels. We provide an extensive evaluation, including zero-shot and few-shot\nevaluation of LLMs, chain-of-thought reasoning, and question difficulty\nanalysis along with model performance. We provide an in-depth analysis of the\nTurkish capabilities and limitations of current LLMs to provide insights for\nfuture LLMs for the Turkish language. We publicly release our code for the\ndataset and evaluation: https://github.com/ArdaYueksel/TurkishMMLU.\n","authors":["Arda Yüksel","Abdullatif Köksal","Lütfi Kerem Şenel","Anna Korhonen","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2407.12402v2.pdf","comment":"EMNLP 2024 - Findings"},{"id":"http://arxiv.org/abs/2404.14741v2","updated":"2024-10-03T15:44:59Z","published":"2024-04-23T04:47:22Z","title":"Generate-on-Graph: Treat LLM as both Agent and KG in Incomplete\n Knowledge Graph Question Answering","summary":" To address the issues of insufficient knowledge and hallucination in Large\nLanguage Models (LLMs), numerous studies have explored integrating LLMs with\nKnowledge Graphs (KGs). However, these methods are typically evaluated on\nconventional Knowledge Graph Question Answering (KGQA) with complete KGs, where\nall factual triples required for each question are entirely covered by the\ngiven KG. In such cases, LLMs primarily act as an agent to find answer entities\nwithin the KG, rather than effectively integrating the internal knowledge of\nLLMs and external knowledge sources such as KGs. 
In fact, KGs are often\nincomplete to cover all the knowledge required to answer questions. To simulate\nthese real-world scenarios and evaluate the ability of LLMs to integrate\ninternal and external knowledge, we propose leveraging LLMs for QA under\nIncomplete Knowledge Graph (IKGQA), where the provided KG lacks some of the\nfactual triples for each question, and construct corresponding datasets. To\nhandle IKGQA, we propose a training-free method called Generate-on-Graph (GoG),\nwhich can generate new factual triples while exploring KGs. Specifically, GoG\nperforms reasoning through a Thinking-Searching-Generating framework, which\ntreats LLM as both Agent and KG in IKGQA. Experimental results on two datasets\ndemonstrate that our GoG outperforms all previous methods.\n","authors":["Yao Xu","Shizhu He","Jiabei Chen","Zihao Wang","Yangqiu Song","Hanghang Tong","Guang Liu","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2404.14741v2.pdf","comment":"Accepted by EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2410.02603v1","updated":"2024-10-03T15:44:42Z","published":"2024-10-03T15:44:42Z","title":"Agents' Room: Narrative Generation through Multi-step Collaboration","summary":" Writing compelling fiction is a multifaceted process combining elements such\nas crafting a plot, developing interesting characters, and using evocative\nlanguage. While large language models (LLMs) show promise for story writing,\nthey currently rely heavily on intricate prompting, which limits their use. We\npropose Agents' Room, a generation framework inspired by narrative theory, that\ndecomposes narrative writing into subtasks tackled by specialized agents. To\nillustrate our method, we introduce Tell Me A Story, a high-quality dataset of\ncomplex writing prompts and human-written stories, and a novel evaluation\nframework designed specifically for assessing long narratives. We show that\nAgents' Room generates stories that are preferred by expert evaluators over\nthose produced by baseline systems by leveraging collaboration and\nspecialization to decompose the complex story writing task into tractable\ncomponents. We provide extensive analysis with automated and human-based\nmetrics of the generated output.\n","authors":["Fantine Huot","Reinald Kim Amplayo","Jennimaria Palomaki","Alice Shoshana Jakobovits","Elizabeth Clark","Mirella Lapata"],"pdf_url":"https://arxiv.org/pdf/2410.02603v1.pdf","comment":"Under review as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2410.01769v2","updated":"2024-10-03T15:30:12Z","published":"2024-10-02T17:25:37Z","title":"Quantifying Generalization Complexity for Large Language Models","summary":" While large language models (LLMs) have shown exceptional capabilities in\nunderstanding complex queries and performing sophisticated tasks, their\ngeneralization abilities are often deeply entangled with memorization,\nnecessitating more precise evaluation. To address this challenge, we introduce\nScylla, a dynamic evaluation framework that quantitatively measures the\ngeneralization abilities of LLMs. Scylla disentangles generalization from\nmemorization via assessing model performance on both in-distribution (ID) and\nout-of-distribution (OOD) data through 20 tasks across 5 levels of complexity.\nThrough extensive experiments, we uncover a non-monotonic relationship between\ntask complexity and the performance gap between ID and OOD data, which we term\nthe generalization valley. 
Specifically, this phenomenon reveals a critical\nthreshold - referred to as critical complexity - where reliance on\nnon-generalizable behavior peaks, indicating the upper bound of LLMs'\ngeneralization capabilities. As model size increases, the critical complexity\nshifts toward higher levels of task complexity, suggesting that larger models\ncan handle more complex reasoning tasks before over-relying on memorization.\nLeveraging Scylla and the concept of critical complexity, we benchmark 28LLMs\nincluding both open-sourced models such as LLaMA and Qwen families, and\nclose-sourced models like Claude and GPT, providing a more robust evaluation\nand establishing a clearer understanding of LLMs' generalization capabilities.\n","authors":["Zhenting Qi","Hongyin Luo","Xuliang Huang","Zhuokai Zhao","Yibo Jiang","Xiangjun Fan","Himabindu Lakkaraju","James Glass"],"pdf_url":"https://arxiv.org/pdf/2410.01769v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02584v1","updated":"2024-10-03T15:28:05Z","published":"2024-10-03T15:28:05Z","title":"Towards Implicit Bias Detection and Mitigation in Multi-Agent LLM\n Interactions","summary":" As Large Language Models (LLMs) continue to evolve, they are increasingly\nbeing employed in numerous studies to simulate societies and execute diverse\nsocial tasks. However, LLMs are susceptible to societal biases due to their\nexposure to human-generated data. Given that LLMs are being used to gain\ninsights into various societal aspects, it is essential to mitigate these\nbiases. To that end, our study investigates the presence of implicit gender\nbiases in multi-agent LLM interactions and proposes two strategies to mitigate\nthese biases. We begin by creating a dataset of scenarios where implicit gender\nbiases might arise, and subsequently develop a metric to assess the presence of\nbiases. Our empirical analysis reveals that LLMs generate outputs characterized\nby strong implicit bias associations (>= 50\\% of the time). Furthermore, these\nbiases tend to escalate following multi-agent interactions. To mitigate them,\nwe propose two strategies: self-reflection with in-context examples (ICE); and\nsupervised fine-tuning. Our research demonstrates that both methods effectively\nmitigate implicit biases, with the ensemble of fine-tuning and self-reflection\nproving to be the most successful.\n","authors":["Angana Borah","Rada Mihalcea"],"pdf_url":"https://arxiv.org/pdf/2410.02584v1.pdf","comment":"Accepted to EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2310.04484v3","updated":"2024-10-03T15:20:17Z","published":"2023-10-06T13:28:04Z","title":"Ada-Instruct: Adapting Instruction Generators for Complex Reasoning","summary":" Instructions augmentation is a crucial step for unleashing the full potential\nof large language models (LLMs) in downstream tasks. Existing Self-Instruct\nmethods primarily simulate new instructions from a few initial instructions\nwith in-context learning. However, our study identifies a critical flaw in this\napproach: even with GPT4o, Self-Instruct cannot generate complex instructions\nof length $\\ge 100$, which is necessary in complex tasks such as code\ncompletion.\n To address this issue, our key insight is that fine-tuning open source LLMs\nwith only ten examples can produce complex instructions that maintain\ndistributional consistency for complex reasoning tasks. 
We introduce\nAda-Instruct, an adaptive instruction generator developed through fine-tuning.\nWe empirically validated Ada-Instruct's efficacy across different applications.\nThe results highlight Ada-Instruct's capacity to generate long, intricate, and\ndistributionally consistent instructions.\n","authors":["Wanyun Cui","Qianle Wang"],"pdf_url":"https://arxiv.org/pdf/2310.04484v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11194v2","updated":"2024-10-03T15:13:58Z","published":"2024-06-17T04:00:04Z","title":"In-Context Editing: Learning Knowledge from Self-Induced Distributions","summary":" In scenarios where language models must incorporate new information\nefficiently without extensive retraining, traditional fine-tuning methods are\nprone to overfitting, degraded generalization, and unnatural language\ngeneration. To address these limitations, we introduce Consistent In-Context\nEditing (ICE), a novel approach leveraging the model's in-context learning\ncapability to optimize toward a contextual distribution rather than a one-hot\ntarget. ICE introduces a simple yet effective optimization framework for the\nmodel to internalize new knowledge by aligning its output distributions with\nand without additional context. This method enhances the robustness and\neffectiveness of gradient-based tuning methods, preventing overfitting and\npreserving the model's integrity. We analyze ICE across four critical aspects\nof knowledge editing: accuracy, locality, generalization, and linguistic\nquality, demonstrating its advantages. Experimental results confirm the\neffectiveness of ICE and demonstrate its potential for continual editing,\nensuring that the integrity of the model is preserved while updating\ninformation.\n","authors":["Siyuan Qi","Bangcheng Yang","Kailin Jiang","Xiaobo Wang","Jiaqi Li","Yifan Zhong","Yaodong Yang","Zilong Zheng"],"pdf_url":"https://arxiv.org/pdf/2406.11194v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02560v1","updated":"2024-10-03T15:04:27Z","published":"2024-10-03T15:04:27Z","title":"Convolutional Variational Autoencoders for Spectrogram Compression in\n Automatic Speech Recognition","summary":" For many Automatic Speech Recognition (ASR) tasks audio features as\nspectrograms show better results than Mel-frequency Cepstral Coefficients\n(MFCC), but in practice they are hard to use due to a complex dimensionality of\na feature space. The following paper presents an alternative approach towards\ngenerating compressed spectrogram representation, based on Convolutional\nVariational Autoencoders (VAE). A Convolutional VAE model was trained on a\nsubsample of the LibriSpeech dataset to reconstruct short fragments of audio\nspectrograms (25 ms) from a 13-dimensional embedding. The trained model for a\n40-dimensional (300 ms) embedding was used to generate features for corpus of\nspoken commands on the GoogleSpeechCommands dataset. 
Using the generated\nfeatures an ASR system was built and compared to the model with MFCC features.\n","authors":["Olga Yakovenko","Ivan Bondarenko"],"pdf_url":"https://arxiv.org/pdf/2410.02560v1.pdf","comment":"Theory and Practice of Natural Computing 9th International\n Conference, TPNC 2020, Taoyuan, Taiwan, 2020, Proceedings 9"},{"id":"http://arxiv.org/abs/2410.02558v1","updated":"2024-10-03T15:04:00Z","published":"2024-10-03T15:04:00Z","title":"Improving Unsupervised Constituency Parsing via Maximizing Semantic\n Information","summary":" Unsupervised constituency parsers organize phrases within a sentence into a\ntree-shaped syntactic constituent structure that reflects the organization of\nsentence semantics. However, the traditional objective of maximizing sentence\nlog-likelihood (LL) does not explicitly account for the close relationship\nbetween the constituent structure and the semantics, resulting in a weak\ncorrelation between LL values and parsing accuracy. In this paper, we introduce\na novel objective for training unsupervised parsers: maximizing the information\nbetween constituent structures and sentence semantics (SemInfo). We introduce a\nbag-of-substrings model to represent the semantics and apply the\nprobability-weighted information metric to estimate the SemInfo. Additionally,\nwe develop a Tree Conditional Random Field (TreeCRF)-based model to apply the\nSemInfo maximization objective to Probabilistic Context-Free Grammar (PCFG)\ninduction, the state-of-the-art method for unsupervised constituency parsing.\nExperiments demonstrate that SemInfo correlates more strongly with parsing\naccuracy than LL. Our algorithm significantly enhances parsing accuracy by an\naverage of 7.85 points across five PCFG variants and in four languages,\nachieving new state-of-the-art results in three of the four languages.\n","authors":["Junjie Chen","Xiangheng He","Yusuke Miyao","Danushka Bollegala"],"pdf_url":"https://arxiv.org/pdf/2410.02558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12471v2","updated":"2024-10-03T14:56:29Z","published":"2024-06-18T10:20:36Z","title":"Fighting Randomness with Randomness: Mitigating Optimisation Instability\n of Fine-Tuning using Delayed Ensemble and Noisy Interpolation","summary":" While fine-tuning of pre-trained language models generally helps to overcome\nthe lack of labelled training samples, it also displays model performance\ninstability. This instability mainly originates from randomness in\ninitialisation or data shuffling. To address this, researchers either modify\nthe training process or augment the available samples, which typically results\nin increased computational costs. We propose a new mitigation strategy, called\nDelayed Ensemble with Noisy Interpolation (DENI), that leverages the strengths\nof ensembling, noise regularisation and model interpolation, while retaining\ncomputational efficiency. We compare DENI with 9 representative mitigation\nstrategies across 3 models, 4 tuning strategies and 7 text classification\ndatasets. 
We show that: 1) DENI outperforms the best performing mitigation\nstrategy (Ensemble), while using only a fraction of its cost; 2) the mitigation\nstrategies are beneficial for parameter-efficient fine-tuning (PEFT) methods,\noutperforming full fine-tuning in specific cases; and 3) combining DENI with\ndata augmentation often leads to even more effective instability mitigation.\n","authors":["Branislav Pecher","Jan Cegin","Robert Belanec","Jakub Simko","Ivan Srba","Maria Bielikova"],"pdf_url":"https://arxiv.org/pdf/2406.12471v2.pdf","comment":"Accepted to the Findings of the EMNLP'24 Conference"},{"id":"http://arxiv.org/abs/2402.12817v2","updated":"2024-10-03T14:56:24Z","published":"2024-02-20T08:38:19Z","title":"On Sensitivity of Learning with Limited Labelled Data to the Effects of\n Randomness: Impact of Interactions and Systematic Choices","summary":" While learning with limited labelled data can improve performance when the\nlabels are lacking, it is also sensitive to the effects of uncontrolled\nrandomness introduced by so-called randomness factors (e.g., varying order of\ndata). We propose a method to systematically investigate the effects of\nrandomness factors while taking the interactions between them into\nconsideration. To measure the true effects of an individual randomness factor,\nour method mitigates the effects of other factors and observes how the\nperformance varies across multiple runs. Applying our method to multiple\nrandomness factors across in-context learning and fine-tuning approaches on 7\nrepresentative text classification tasks and meta-learning on 3 tasks, we show\nthat: 1) disregarding interactions between randomness factors in existing works\ncaused inconsistent findings due to incorrect attribution of the effects of\nrandomness factors, such as disproving the consistent sensitivity of in-context\nlearning to sample order even with random sample selection; and 2) besides\nmutual interactions, the effects of randomness factors, especially sample\norder, are also dependent on more systematic choices unexplored in existing\nworks, such as number of classes, samples per class or choice of prompt format.\n","authors":["Branislav Pecher","Ivan Srba","Maria Bielikova"],"pdf_url":"https://arxiv.org/pdf/2402.12817v2.pdf","comment":"Accepted to the EMNLP'24 Main Conference"},{"id":"http://arxiv.org/abs/2409.19700v2","updated":"2024-10-03T14:56:02Z","published":"2024-09-29T13:16:37Z","title":"2D-TPE: Two-Dimensional Positional Encoding Enhances Table Understanding\n for Large Language Models","summary":" Tables are ubiquitous across various domains for concisely representing\nstructured information. Empowering large language models (LLMs) to reason over\ntabular data represents an actively explored direction. However, since typical\nLLMs only support one-dimensional~(1D) inputs, existing methods often flatten\nthe two-dimensional~(2D) table structure into a sequence of tokens, which can\nseverely disrupt the spatial relationships and result in an inevitable loss of\nvital contextual information. In this paper, we first empirically demonstrate\nthe detrimental impact of such flattening operations on the performance of LLMs\nin capturing the spatial information of tables through two elaborate proxy\ntasks. Subsequently, we introduce a simple yet effective positional encoding\nmethod, termed ``2D-TPE'' (Two-Dimensional Table Positional Encoding), to\naddress this challenge. 
2D-TPE enables each attention head to dynamically\nselect a permutation order of tokens within the context for attending to them,\nwhere each permutation represents a distinct traversal mode for the table, such\nas column-wise or row-wise traversal. 2D-TPE effectively mitigates the risk of\nlosing essential spatial information while preserving computational efficiency,\nthus better preserving the table structure. Extensive experiments across five\nbenchmarks demonstrate that 2D-TPE outperforms strong baselines, underscoring\nthe importance of preserving the table structure for accurate table\ncomprehension. Comprehensive analysis further reveals the substantially better\nscalability of 2D-TPE to large tables than baselines.\n","authors":["Jia-Nan Li","Jian Guan","Wei Wu","Zhengtao Yu","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2409.19700v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02551v1","updated":"2024-10-03T14:55:22Z","published":"2024-10-03T14:55:22Z","title":"ColaCare: Enhancing Electronic Health Record Modeling through Large\n Language Model-Driven Multi-Agent Collaboration","summary":" We introduce ColaCare, a framework that enhances Electronic Health Record\n(EHR) modeling through multi-agent collaboration driven by Large Language\nModels (LLMs). Our approach seamlessly integrates domain-specific expert models\nwith LLMs to bridge the gap between structured EHR data and text-based\nreasoning. Inspired by clinical consultations, ColaCare employs two types of\nagents: DoctorAgent and MetaAgent, which collaboratively analyze patient data.\nExpert models process and generate predictions from numerical EHR data, while\nLLM agents produce reasoning references and decision-making reports within the\ncollaborative consultation framework. We additionally incorporate the Merck\nManual of Diagnosis and Therapy (MSD) medical guideline within a\nretrieval-augmented generation (RAG) module for authoritative evidence support.\nExtensive experiments conducted on four distinct EHR datasets demonstrate\nColaCare's superior performance in mortality prediction tasks, underscoring its\npotential to revolutionize clinical decision support systems and advance\npersonalized precision medicine. The code, complete prompt templates, more case\nstudies, etc. are publicly available at the anonymous link:\nhttps://colacare.netlify.app.\n","authors":["Zixiang Wang","Yinghao Zhu","Huiya Zhao","Xiaochen Zheng","Tianlong Wang","Wen Tang","Yasha Wang","Chengwei Pan","Ewen M. Harrison","Junyi Gao","Liantao Ma"],"pdf_url":"https://arxiv.org/pdf/2410.02551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02458v1","updated":"2024-10-03T14:50:33Z","published":"2024-10-03T14:50:33Z","title":"MedVisionLlama: Leveraging Pre-Trained Large Language Model Layers to\n Enhance Medical Image Segmentation","summary":" Large Language Models (LLMs), known for their versatility in textual data,\nare increasingly being explored for their potential to enhance medical image\nsegmentation, a crucial task for accurate diagnostic imaging. This study\nexplores enhancing Vision Transformers (ViTs) for medical image segmentation by\nintegrating pre-trained LLM transformer blocks. Our approach, which\nincorporates a frozen LLM transformer block into the encoder of a ViT-based\nmodel, leads to substantial improvements in segmentation performance across\nvarious medical imaging modalities. 
We propose a Hybrid Attention Mechanism\nthat combines global and local feature learning with a Multi-Scale Fusion Block\nfor aggregating features across different scales. The enhanced model shows\nsignificant performance gains, including an average Dice score increase from\n0.74 to 0.79 and improvements in accuracy, precision, and the Jaccard Index.\nThese results demonstrate the effectiveness of LLM-based transformers in\nrefining medical image segmentation, highlighting their potential to\nsignificantly boost model accuracy and robustness. The source code and our\nimplementation are available at: https://bit.ly/3zf2CVs\n","authors":["Gurucharan Marthi Krishna Kumar","Aman Chadha","Janine Mendola","Amir Shmuel"],"pdf_url":"https://arxiv.org/pdf/2410.02458v1.pdf","comment":"Submitted to IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2409.15977v3","updated":"2024-10-03T14:45:55Z","published":"2024-09-24T11:18:09Z","title":"TCSinger: Zero-Shot Singing Voice Synthesis with Style Transfer and\n Multi-Level Style Control","summary":" Zero-shot singing voice synthesis (SVS) with style transfer and style control\naims to generate high-quality singing voices with unseen timbres and styles\n(including singing method, emotion, rhythm, technique, and pronunciation) from\naudio and text prompts. However, the multifaceted nature of singing styles\nposes a significant challenge for effective modeling, transfer, and control.\nFurthermore, current SVS models often fail to generate singing voices rich in\nstylistic nuances for unseen singers. To address these challenges, we introduce\nTCSinger, the first zero-shot SVS model for style transfer across cross-lingual\nspeech and singing styles, along with multi-level style control. Specifically,\nTCSinger proposes three primary modules: 1) the clustering style encoder\nemploys a clustering vector quantization model to stably condense style\ninformation into a compact latent space; 2) the Style and Duration Language\nModel (S\\&D-LM) concurrently predicts style information and phoneme duration,\nwhich benefits both; 3) the style adaptive decoder uses a novel mel-style\nadaptive normalization method to generate singing voices with enhanced details.\nExperimental results show that TCSinger outperforms all baseline models in\nsynthesis quality, singer similarity, and style controllability across various\ntasks, including zero-shot style transfer, multi-level style control,\ncross-lingual style transfer, and speech-to-singing style transfer. Singing\nvoice samples can be accessed at https://tcsinger.github.io/.\n","authors":["Yu Zhang","Ziyue Jiang","Ruiqi Li","Changhao Pan","Jinzheng He","Rongjie Huang","Chuxin Wang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.15977v3.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2402.18045v3","updated":"2024-10-03T14:44:44Z","published":"2024-02-28T04:43:46Z","title":"Multi-FAct: Assessing Factuality of Multilingual LLMs using FActScore","summary":" Evaluating the factuality of long-form large language model (LLM)-generated\ntext is an important challenge. Recently there has been a surge of interest in\nfactuality evaluation for English, but little is known about the factuality\nevaluation of multilingual LLMs, specially when it comes to long-form\ngeneration. %This paper systematically evaluates multilingual LLMs' factual\naccuracy across languages and geographic regions. 
We introduce a simple\npipeline for multilingual factuality evaluation, by applying FActScore (Min et\nal., 2023) for diverse languages. In addition to evaluating multilingual\nfactual generation, we evaluate the factual accuracy of long-form text\ngeneration in topics that reflect regional diversity. We also examine the\nfeasibility of running the FActScore pipeline using non-English Wikipedia and\nprovide comprehensive guidelines on multilingual factual evaluation for\nregionally diverse topics.\n","authors":["Sheikh Shafayat","Eunsu Kim","Juhyun Oh","Alice Oh"],"pdf_url":"https://arxiv.org/pdf/2402.18045v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02538v1","updated":"2024-10-03T14:43:43Z","published":"2024-10-03T14:43:43Z","title":"Algorithms For Automatic Accentuation And Transcription Of Russian Texts\n In Speech Recognition Systems","summary":" This paper presents an overview of rule-based system for automatic\naccentuation and phonemic transcription of Russian texts for speech connected\ntasks, such as Automatic Speech Recognition (ASR). Two parts of the developed\nsystem, accentuation and transcription, use different approaches to achieve\ncorrect phonemic representations of input phrases. Accentuation is based on\n\"Grammatical dictionary of the Russian language\" of A.A. Zaliznyak and\nwiktionary corpus. To distinguish homographs, the accentuation system also\nutilises morphological information of the sentences based on Recurrent Neural\nNetworks (RNN). Transcription algorithms apply the rules presented in the\nmonograph of B.M. Lobanov and L.I. Tsirulnik \"Computer Synthesis and Voice\nCloning\". The rules described in the present paper are implemented in an\nopen-source module, which can be of use to any scientific study connected to\nASR or Speech To Text (STT) tasks. Automatically marked up text annotations of\nthe Russian Voxforge database were used as training data for an acoustic model\nin CMU Sphinx. The resulting acoustic model was evaluated on cross-validation,\nmean Word Accuracy being 71.2%. The developed toolkit is written in the Python\nlanguage and is accessible on GitHub for any researcher interested.\n","authors":["Olga Iakovenko","Ivan Bondarenko","Mariya Borovikova","Daniil Vodolazsky"],"pdf_url":"https://arxiv.org/pdf/2410.02538v1.pdf","comment":"Speech and Computer 20th International Conference, SPECOM 2018,\n Leipzig, Germany, Proceedings 20"},{"id":"http://arxiv.org/abs/2402.17512v3","updated":"2024-10-03T14:41:43Z","published":"2024-02-27T13:54:48Z","title":"Latte: Latent Attention for Linear Time Transformers","summary":" The time complexity of the standard attention mechanism in transformers\nscales quadratically with sequence length. We propose a probabilistic framework\nfor attention, enabling us to derive a novel low-rank linear\nre-parameterisation of both bidirectional and causal cases, based on defining a\nlatent variable model. Our method can be seamlessly integrated as a drop-in\nreplacement for the standard attention mechanism. Additionally, this framework\nprovides a natural extension for combining local standard attention with our\nglobal linear attention. 
This approach allows us to extend the context length\nof existing large pre-trained models with only a few additional training steps.\nThe resulting ``Latte Transformer'' achieves performance comparable to standard\nattention and other state-of-the-art models, while maintaining linear time and\nmemory complexity, along with constant-time next-token prediction during\ninference.\n","authors":["Rares Dolga","Marius Cobzarenco","David Barber"],"pdf_url":"https://arxiv.org/pdf/2402.17512v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02525v1","updated":"2024-10-03T14:33:34Z","published":"2024-10-03T14:33:34Z","title":"Contextual Document Embeddings","summary":" Dense document embeddings are central to neural retrieval. The dominant\nparadigm is to train and construct embeddings by running encoders directly on\nindividual documents. In this work, we argue that these embeddings, while\neffective, are implicitly out-of-context for targeted use cases of retrieval,\nand that a contextualized document embedding should take into account both the\ndocument and neighboring documents in context - analogous to contextualized\nword embeddings. We propose two complementary methods for contextualized\ndocument embeddings: first, an alternative contrastive learning objective that\nexplicitly incorporates the document neighbors into the intra-batch contextual\nloss; second, a new contextual architecture that explicitly encodes neighbor\ndocument information into the encoded representation. Results show that both\nmethods achieve better performance than biencoders in several settings, with\ndifferences especially pronounced out-of-domain. We achieve state-of-the-art\nresults on the MTEB benchmark with no hard negative mining, score distillation,\ndataset-specific instructions, intra-GPU example-sharing, or extremely large\nbatch sizes. Our method can be applied to improve performance on any\ncontrastive learning dataset and any biencoder.\n","authors":["John X. Morris","Alexander M. Rush"],"pdf_url":"https://arxiv.org/pdf/2410.02525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17954v3","updated":"2024-10-03T14:29:11Z","published":"2024-02-28T00:24:29Z","title":"Twists, Humps, and Pebbles: Multilingual Speech Recognition Models\n Exhibit Gender Performance Gaps","summary":" Current automatic speech recognition (ASR) models are designed to be used\nacross many languages and tasks without substantial changes. However, this\nbroad language coverage hides performance gaps within languages, for example,\nacross genders. Our study systematically evaluates the performance of two\nwidely used multilingual ASR models on three datasets, encompassing 19\nlanguages from eight language families and two speaking conditions. Our\nfindings reveal clear gender disparities, with the advantaged group varying\nacross languages and models. Surprisingly, those gaps are not explained by\nacoustic or lexical properties. However, probing internal model states reveals\na correlation with gendered performance gap. That is, the easier it is to\ndistinguish speaker gender in a language using probes, the more the gap\nreduces, favoring female speakers. Our results show that gender disparities\npersist even in state-of-the-art models. Our findings have implications for the\nimprovement of multilingual ASR systems, underscoring the importance of\naccessibility to training data and nuanced evaluation to predict and mitigate\ngender gaps. 
We release all code and artifacts at\nhttps://github.com/g8a9/multilingual-asr-gender-gap.\n","authors":["Giuseppe Attanasio","Beatrice Savoldi","Dennis Fucci","Dirk Hovy"],"pdf_url":"https://arxiv.org/pdf/2402.17954v3.pdf","comment":"Accepted at EMNLP 2024. Code and artifacts at\n https://github.com/g8a9/multilingual-asr-gender-gap"},{"id":"http://arxiv.org/abs/2410.02521v1","updated":"2024-10-03T14:28:40Z","published":"2024-10-03T14:28:40Z","title":"Methods for Automatic Matrix Language Determination of Code-Switched\n Speech","summary":" Code-switching (CS) is the process of speakers interchanging between two or\nmore languages which in the modern world becomes increasingly common. In order\nto better describe CS speech the Matrix Language Frame (MLF) theory introduces\nthe concept of a Matrix Language, which is the language that provides the\ngrammatical structure for a CS utterance. In this work the MLF theory was used\nto develop systems for Matrix Language Identity (MLID) determination. The MLID\nof English/Mandarin and English/Spanish CS text and speech was compared to\nacoustic language identity (LID), which is a typical way to identify a language\nin monolingual utterances. MLID predictors from audio show higher correlation\nwith the textual principles than LID in all cases while also outperforming LID\nin an MLID recognition task based on F1 macro (60\\%) and correlation score\n(0.38). This novel approach has identified that non-English languages (Mandarin\nand Spanish) are preferred over the English language as the ML contrary to the\nmonolingual choice of LID.\n","authors":["Olga Iakovenko","Thomas Hain"],"pdf_url":"https://arxiv.org/pdf/2410.02521v1.pdf","comment":"Accepted at EMNLP"},{"id":"http://arxiv.org/abs/2309.15656v2","updated":"2024-10-03T14:27:14Z","published":"2023-09-27T13:45:38Z","title":"Conversational Feedback in Scripted versus Spontaneous Dialogues: A\n Comparative Analysis","summary":" Scripted dialogues such as movie and TV subtitles constitute a widespread\nsource of training data for conversational NLP models. However, there are\nnotable linguistic differences between these dialogues and spontaneous\ninteractions, especially regarding the occurrence of communicative feedback\nsuch as backchannels, acknowledgments, or clarification requests. This paper\npresents a quantitative analysis of such feedback phenomena in both subtitles\nand spontaneous conversations. Based on conversational data spanning eight\nlanguages and multiple genres, we extract lexical statistics, classifications\nfrom a dialogue act tagger, expert annotations and labels derived from a\nfine-tuned Large Language Model (LLM). Our main empirical findings are that (1)\ncommunicative feedback is markedly less frequent in subtitles than in\nspontaneous dialogues and (2) subtitles contain a higher proportion of negative\nfeedback. 
We also show that dialogues generated by standard LLMs lie much\ncloser to scripted dialogues than spontaneous interactions in terms of\ncommunicative feedback.\n","authors":["Ildikó Pilán","Laurent Prévot","Hendrik Buschmeier","Pierre Lison"],"pdf_url":"https://arxiv.org/pdf/2309.15656v2.pdf","comment":"Updated version for SIGdial 2024"},{"id":"http://arxiv.org/abs/2408.03350v2","updated":"2024-10-03T14:20:40Z","published":"2024-08-05T20:19:18Z","title":"miniCTX: Neural Theorem Proving with (Long-)Contexts","summary":" Real-world formal theorem proving often depends on a wealth of context,\nincluding definitions, lemmas, comments, file structure, and other information.\nWe introduce miniCTX, which tests a model's ability to prove formal\nmathematical theorems that depend on new context that is not seen during\ntraining. miniCTX contains theorems sourced from real Lean projects and\ntextbooks, each associated with a context that can span tens of thousands of\ntokens. Models are tasked with proving a theorem given access to code from the\ntheorem's repository, which contains context that is needed for the proof. As a\nbaseline for miniCTX, we tested fine-tuning and prompting methods that\ncondition theorem proving on preceding context. Both approaches substantially\noutperform traditional methods that rely solely on state information. We found\nthat this ability to use context is not captured by previous benchmarks such as\nminiF2F. Alongside miniCTX, we offer ntp-toolkit for automatically extracting\nand annotating theorem proving data, making it easy to add new projects into\nminiCTX to ensure that contexts are not seen during training. miniCTX offers a\nchallenging and realistic evaluation of neural theorem provers.\n","authors":["Jiewen Hu","Thomas Zhu","Sean Welleck"],"pdf_url":"https://arxiv.org/pdf/2408.03350v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02507v1","updated":"2024-10-03T14:15:00Z","published":"2024-10-03T14:15:00Z","title":"Can Large Language Models Grasp Legal Theories? Enhance Legal Reasoning\n with Insights from Multi-Agent Collaboration","summary":" Large Language Models (LLMs) could struggle to fully understand legal\ntheories and perform complex legal reasoning tasks. In this study, we introduce\na challenging task (confusing charge prediction) to better evaluate LLMs'\nunderstanding of legal theories and reasoning capabilities. We also propose a\nnovel framework: Multi-Agent framework for improving complex Legal Reasoning\ncapability (MALR). MALR employs non-parametric learning, encouraging LLMs to\nautomatically decompose complex legal tasks and mimic human learning process to\nextract insights from legal rules, helping LLMs better understand legal\ntheories and enhance their legal reasoning abilities. 
Extensive experiments on\nmultiple real-world datasets demonstrate that the proposed framework\neffectively addresses complex reasoning issues in practical scenarios, paving\nthe way for more reliable applications in the legal domain.\n","authors":["Weikang Yuan","Junjie Cao","Zhuoren Jiang","Yangyang Kang","Jun Lin","Kaisong Song","tianqianjin lin","Pengwei Yan","Changlong Sun","Xiaozhong Liu"],"pdf_url":"https://arxiv.org/pdf/2410.02507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18028v2","updated":"2024-10-03T14:11:23Z","published":"2024-09-26T16:34:35Z","title":"Compositional Hardness of Code in Large Language Models -- A\n Probabilistic Perspective","summary":" A common practice in large language model (LLM) usage for complex analytical\ntasks such as code generation, is to sample a solution for the entire task\nwithin the model's context window. Previous works have shown that subtask\ndecomposition within the model's context (chain of thought), is beneficial for\nsolving such tasks. In this work, we point a limitation of LLMs' ability to\nperform several sub-tasks within the same context window - an in-context\nhardness of composition, pointing to an advantage for distributing a decomposed\nproblem in a multi-agent system of LLMs. The hardness of composition is\nquantified by a generation complexity metric, i.e., the number of LLM\ngenerations required to sample at least one correct solution. We find a gap\nbetween the generation complexity of solving a compositional problem within the\nsame context relative to distributing it among multiple agents, that increases\nexponentially with the solution's length. We prove our results theoretically\nand demonstrate them empirically.\n","authors":["Yotam Wolf","Binyamin Rothberg","Dorin Shteyman","Amnon Shashua"],"pdf_url":"https://arxiv.org/pdf/2409.18028v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02503v1","updated":"2024-10-03T14:06:43Z","published":"2024-10-03T14:06:43Z","title":"Mixed-Session Conversation with Egocentric Memory","summary":" Recently introduced dialogue systems have demonstrated high usability.\nHowever, they still fall short of reflecting real-world conversation scenarios.\nCurrent dialogue systems exhibit an inability to replicate the dynamic,\ncontinuous, long-term interactions involving multiple partners. This shortfall\narises because there have been limited efforts to account for both aspects of\nreal-world dialogues: deeply layered interactions over the long-term dialogue\nand widely expanded conversation networks involving multiple participants. As\nthe effort to incorporate these aspects combined, we introduce Mixed-Session\nConversation, a dialogue system designed to construct conversations with\nvarious partners in a multi-session dialogue setup. We propose a new dataset\ncalled MiSC to implement this system. The dialogue episodes of MiSC consist of\n6 consecutive sessions, with four speakers (one main speaker and three\npartners) appearing in each episode. Also, we propose a new dialogue model with\na novel memory management mechanism, called Egocentric Memory Enhanced\nMixed-Session Conversation Agent (EMMA). EMMA collects and retains memories\nfrom the main speaker's perspective during conversations with partners,\nenabling seamless continuity in subsequent interactions. 
Extensive human\nevaluations validate that the dialogues in MiSC demonstrate a seamless\nconversational flow, even when conversation partners change in each session.\nEMMA trained with MiSC is also evaluated to maintain high memorability without\ncontradiction throughout the entire conversation.\n","authors":["Jihyoung Jang","Taeyoung Kim","Hyounghun Kim"],"pdf_url":"https://arxiv.org/pdf/2410.02503v1.pdf","comment":"EMNLP Findings 2024 (30 pages); Project website:\n https://mixed-session.github.io/"},{"id":"http://arxiv.org/abs/2407.03277v2","updated":"2024-10-03T14:05:14Z","published":"2024-07-03T17:04:17Z","title":"Evaluating Automatic Metrics with Incremental Machine Translation\n Systems","summary":" We introduce a dataset comprising commercial machine translations, gathered\nweekly over six years across 12 translation directions. Since human A/B testing\nis commonly used, we assume commercial systems improve over time, which enables\nus to evaluate machine translation (MT) metrics based on their preference for\nmore recent translations. Our study not only confirms several prior findings,\nsuch as the advantage of neural metrics over non-neural ones, but also explores\nthe debated issue of how MT quality affects metric reliability--an\ninvestigation that smaller datasets in previous research could not sufficiently\nexplore. Overall, our research demonstrates the dataset's value as a testbed\nfor metric evaluation. We release our code at https://github.com/gjwubyron/Evo\n","authors":["Guojun Wu","Shay B. Cohen","Rico Sennrich"],"pdf_url":"https://arxiv.org/pdf/2407.03277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02499v1","updated":"2024-10-03T14:01:01Z","published":"2024-10-03T14:01:01Z","title":"Defining Knowledge: Bridging Epistemology and Large Language Models","summary":" Knowledge claims are abundant in the literature on large language models\n(LLMs); but can we say that GPT-4 truly \"knows\" the Earth is round? To address\nthis question, we review standard definitions of knowledge in epistemology and\nwe formalize interpretations applicable to LLMs. In doing so, we identify\ninconsistencies and gaps in how current NLP research conceptualizes knowledge\nwith respect to epistemological frameworks. Additionally, we conduct a survey\nof 100 professional philosophers and computer scientists to compare their\npreferences in knowledge definitions and their views on whether LLMs can really\nbe said to know. Finally, we suggest evaluation protocols for testing knowledge\nin accordance to the most relevant definitions.\n","authors":["Constanza Fierro","Ruchira Dhar","Filippos Stamatiou","Nicolas Garneau","Anders Søgaard"],"pdf_url":"https://arxiv.org/pdf/2410.02499v1.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.02498v1","updated":"2024-10-03T14:00:44Z","published":"2024-10-03T14:00:44Z","title":"Dynamic Gradient Alignment for Online Data Mixing","summary":" The composition of training data mixtures is critical for effectively\ntraining large language models (LLMs), as it directly impacts their performance\non downstream tasks. Our goal is to identify an optimal data mixture to\nspecialize an LLM for a specific task with access to only a few examples.\nTraditional approaches to this problem include ad-hoc reweighting methods,\nimportance sampling, and gradient alignment techniques. This paper focuses on\ngradient alignment and introduces Dynamic Gradient Alignment (DGA), a scalable\nonline gradient alignment algorithm. 
DGA dynamically estimates the pre-training\ndata mixture on which the models' gradients align as well as possible with\nthose of the model on the specific task. DGA is the first gradient alignment\napproach that incurs minimal overhead compared to standard pre-training and\noutputs a competitive model, eliminating the need for retraining the model.\nExperimentally, we demonstrate significant improvements over importance\nsampling in two key scenarios: (i) when the pre-training set is small and\nimportance sampling overfits due to limited data; and (ii) when there is\ninsufficient specialized data, trapping importance sampling on narrow pockets\nof data. Our findings underscore the effectiveness of gradient alignment\nmethods in optimizing training data mixtures, particularly in data-constrained\nenvironments, and offer a practical solution for enhancing LLM performance on\nspecific tasks with limited data availability.\n","authors":["Simin Fan","David Grangier","Pierre Ablin"],"pdf_url":"https://arxiv.org/pdf/2410.02498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02492v1","updated":"2024-10-03T13:57:07Z","published":"2024-10-03T13:57:07Z","title":"DTVLT: A Multi-modal Diverse Text Benchmark for Visual Language Tracking\n Based on LLM","summary":" Visual language tracking (VLT) has emerged as a cutting-edge research area,\nharnessing linguistic data to enhance algorithms with multi-modal inputs and\nbroadening the scope of traditional single object tracking (SOT) to encompass\nvideo understanding applications. Despite this, most VLT benchmarks still\ndepend on succinct, human-annotated text descriptions for each video. These\ndescriptions often fall short in capturing the nuances of video content\ndynamics and lack stylistic variety in language, constrained by their uniform\nlevel of detail and a fixed annotation frequency. As a result, algorithms tend\nto default to a \"memorize the answer\" strategy, diverging from the core\nobjective of achieving a deeper understanding of video content. Fortunately,\nthe emergence of large language models (LLMs) has enabled the generation of\ndiverse text. This work utilizes LLMs to generate varied semantic annotations\n(in terms of text lengths and granularities) for representative SOT benchmarks,\nthereby establishing a novel multi-modal benchmark. Specifically, we (1)\npropose a new visual language tracking benchmark with diverse texts, named\nDTVLT, based on five prominent VLT and SOT benchmarks, including three\nsub-tasks: short-term tracking, long-term tracking, and global instance\ntracking. (2) We offer four granularity texts in our benchmark, considering the\nextent and density of semantic information. We expect this multi-granular\ngeneration strategy to foster a favorable environment for VLT and video\nunderstanding research. (3) We conduct comprehensive experimental analyses on\nDTVLT, evaluating the impact of diverse text on tracking performance and hope\nthe identified performance bottlenecks of existing algorithms can support\nfurther research in VLT and video understanding. 
The proposed benchmark,\nexperimental results and toolkit will be released gradually on\nhttp://videocube.aitestunion.com/.\n","authors":["Xuchen Li","Shiyu Hu","Xiaokun Feng","Dailing Zhang","Meiqi Wu","Jing Zhang","Kaiqi Huang"],"pdf_url":"https://arxiv.org/pdf/2410.02492v1.pdf","comment":"Preprint, Under Review"},{"id":"http://arxiv.org/abs/2404.07103v3","updated":"2024-10-03T13:55:08Z","published":"2024-04-10T15:41:53Z","title":"Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on\n Graphs","summary":" Large language models (LLMs), while exhibiting exceptional performance,\nsuffer from hallucinations, especially on knowledge-intensive tasks. Existing\nworks propose to augment LLMs with individual text units retrieved from\nexternal knowledge corpora to alleviate the issue. However, in many domains,\ntexts are interconnected (e.g., academic papers in a bibliographic graph are\nlinked by citations and co-authorships) which form a (text-attributed) graph.\nThe knowledge in such graphs is encoded not only in single texts/nodes but also\nin their associated connections. To facilitate the research of augmenting LLMs\nwith graphs, we manually construct a Graph Reasoning Benchmark dataset called\nGRBench, containing 1,740 questions that can be answered with the knowledge\nfrom 10 domain graphs. Then, we propose a simple and effective framework called\nGraph Chain-of-thought (Graph-CoT) to augment LLMs with graphs by encouraging\nLLMs to reason on the graph iteratively. Each Graph-CoT iteration consists of\nthree sub-steps: LLM reasoning, LLM-graph interaction, and graph execution. We\nconduct systematic experiments with three LLM backbones on GRBench, where\nGraph-CoT outperforms the baselines consistently. The code is available at\nhttps://github.com/PeterGriffinJin/Graph-CoT.\n","authors":["Bowen Jin","Chulin Xie","Jiawei Zhang","Kashob Kumar Roy","Yu Zhang","Zheng Li","Ruirui Li","Xianfeng Tang","Suhang Wang","Yu Meng","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2404.07103v3.pdf","comment":"21 pages. Code: https://github.com/PeterGriffinJin/Graph-CoT"},{"id":"http://arxiv.org/abs/2405.13448v2","updated":"2024-10-03T13:53:59Z","published":"2024-05-22T08:38:26Z","title":"Distilling Instruction-following Abilities of Large Language Models with\n Task-aware Curriculum Planning","summary":" Instruction tuning aims to align large language models (LLMs) with\nopen-domain instructions and human-preferred responses. While several studies\nhave explored autonomous approaches to distilling and annotating instructions\nfrom powerful proprietary LLMs, such as ChatGPT, they often neglect the impact\nof the distributions and characteristics of tasks, together with the varying\ndifficulty of instructions in training sets. This oversight can lead to\nimbalanced knowledge capabilities and poor generalization powers of student\nLLMs. To address these challenges, we introduce Task-Aware Curriculum Planning\nfor Instruction Refinement (TAPIR), a multi-round distillation framework that\nutilizes an oracle LLM to select instructions that are difficult for a student\nLLM to follow. To balance the student's capabilities, task distributions in\ntraining sets are adjusted with responses automatically refined according to\ntheir corresponding tasks. In addition, by incorporating curriculum planning,\nour approach systematically escalates the difficulty levels of tasks,\nprogressively enhancing the student LLM's capabilities. 
We rigorously evaluate\nTAPIR using several widely recognized benchmarks (such as AlpacaEval 2.0,\nMT-Bench, etc.) and multiple student LLMs. Empirical results demonstrate that\nstudent LLMs, trained with our method and less training data, outperform larger\ninstruction-tuned models and strong distillation baselines.\n","authors":["Yuanhao Yue","Chengyu Wang","Jun Huang","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.13448v2.pdf","comment":"emnlp 2024 findings"},{"id":"http://arxiv.org/abs/2407.04069v2","updated":"2024-10-03T13:51:53Z","published":"2024-07-04T17:15:37Z","title":"A Systematic Survey and Critical Review on Evaluating Large Language\n Models: Challenges, Limitations, and Recommendations","summary":" Large Language Models (LLMs) have recently gained significant attention due\nto their remarkable capabilities in performing diverse tasks across various\ndomains. However, a thorough evaluation of these models is crucial before\ndeploying them in real-world applications to ensure they produce reliable\nperformance. Despite the well-established importance of evaluating LLMs in the\ncommunity, the complexity of the evaluation process has led to varied\nevaluation setups, causing inconsistencies in findings and interpretations. To\naddress this, we systematically review the primary challenges and limitations\ncausing these inconsistencies and unreliable evaluations in various steps of\nLLM evaluation. Based on our critical review, we present our perspectives and\nrecommendations to ensure LLM evaluations are reproducible, reliable, and\nrobust.\n","authors":["Md Tahmid Rahman Laskar","Sawsan Alqahtani","M Saiful Bari","Mizanur Rahman","Mohammad Abdullah Matin Khan","Haidar Khan","Israt Jahan","Amran Bhuiyan","Chee Wei Tan","Md Rizwan Parvez","Enamul Hoque","Shafiq Joty","Jimmy Huang"],"pdf_url":"https://arxiv.org/pdf/2407.04069v2.pdf","comment":"Accepted at EMNLP 2024 (Main Conference)"},{"id":"http://arxiv.org/abs/2312.02783v3","updated":"2024-10-03T13:47:02Z","published":"2023-12-05T14:14:27Z","title":"Large Language Models on Graphs: A Comprehensive Survey","summary":" Large language models (LLMs), such as GPT4 and LLaMA, are creating\nsignificant advancements in natural language processing, due to their strong\ntext encoding/decoding ability and newly found emergent capability (e.g.,\nreasoning). While LLMs are mainly designed to process pure texts, there are\nmany real-world scenarios where text data is associated with rich structure\ninformation in the form of graphs (e.g., academic networks, and e-commerce\nnetworks) or scenarios where graph data is paired with rich textual information\n(e.g., molecules with descriptions). Besides, although LLMs have shown their\npure text-based reasoning ability, it is underexplored whether such ability can\nbe generalized to graphs (i.e., graph-based reasoning). In this paper, we\nprovide a systematic review of scenarios and techniques related to large\nlanguage models on graphs. We first summarize potential scenarios of adopting\nLLMs on graphs into three categories, namely pure graphs, text-attributed\ngraphs, and text-paired graphs. We then discuss detailed techniques for\nutilizing LLMs on graphs, including LLM as Predictor, LLM as Encoder, and LLM\nas Aligner, and compare the advantages and disadvantages of different schools\nof models. Furthermore, we discuss the real-world applications of such methods\nand summarize open-source codes and benchmark datasets. 
Finally, we conclude\nwith potential future research directions in this fast-growing field. The\nrelated source can be found at\nhttps://github.com/PeterGriffinJin/Awesome-Language-Model-on-Graphs.\n","authors":["Bowen Jin","Gang Liu","Chi Han","Meng Jiang","Heng Ji","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2312.02783v3.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2401.16332v4","updated":"2024-10-03T13:40:39Z","published":"2024-01-29T17:38:14Z","title":"Tradeoffs Between Alignment and Helpfulness in Language Models with\n Representation Engineering","summary":" Language model alignment has become an important component of AI safety,\nallowing safe interactions between humans and language models, by enhancing\ndesired behaviors and inhibiting undesired ones. It is often done by tuning the\nmodel or inserting preset aligning prompts. Recently, representation\nengineering, a method which alters the model's behavior via changing its\nrepresentations post-training, was shown to be effective in aligning LLMs (Zou\net al., 2023a). Representation engineering yields gains in alignment oriented\ntasks such as resistance to adversarial attacks and reduction of social biases,\nbut was also shown to cause a decrease in the ability of the model to perform\nbasic tasks. In this paper we study the tradeoff between the increase in\nalignment and decrease in helpfulness of the model. We propose a theoretical\nframework which provides bounds for these two quantities, and demonstrate their\nrelevance empirically. First, we find that under the conditions of our\nframework, alignment can be guaranteed with representation engineering, and at\nthe same time that helpfulness is harmed in the process. Second, we show that\nhelpfulness is harmed quadratically with the norm of the representation\nengineering vector, while the alignment increases linearly with it, indicating\na regime in which it is efficient to use representation engineering. We\nvalidate our findings empirically, and chart the boundaries to the usefulness\nof representation engineering for alignment.\n","authors":["Yotam Wolf","Noam Wies","Dorin Shteyman","Binyamin Rothberg","Yoav Levine","Amnon Shashua"],"pdf_url":"https://arxiv.org/pdf/2401.16332v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17041v4","updated":"2024-10-03T13:31:39Z","published":"2023-11-28T18:53:06Z","title":"Eliciting In-Context Learning in Vision-Language Models for Videos\n Through Curated Data Distributional Properties","summary":" A major reason behind the recent success of large language models (LLMs) is\ntheir \\textit{in-context learning} capability, which makes it possible to\nrapidly adapt them to downstream text-based tasks by prompting them with a\nsmall number of relevant demonstrations. While large vision-language models\n(VLMs) have recently been developed for tasks requiring both text and images,\nthey largely lack in-context learning over visual information, especially in\nunderstanding and generating text about videos. In this work, we implement\n\\textbf{E}mergent \\textbf{I}n-context \\textbf{Le}arning on \\textbf{V}ideos\n(\\eilev{}), a novel training paradigm that induces in-context learning over\nvideo and text by capturing key properties of pre-training data found by prior\nwork to be essential for in-context learning in transformers. In our\nexperiments, we show that \\eilev-trained models outperform other off-the-shelf\nVLMs in few-shot video narration for novel, rare actions. 
Furthermore, we\ndemonstrate that these key properties of bursty distributions, skewed marginal\ndistributions, and dynamic meaning each contribute to varying degrees to VLMs'\nin-context learning capability in narrating procedural videos. Our results,\nanalysis, and \\eilev{}-trained models yield numerous insights about the\nemergence of in-context learning over video and text, creating a foundation for\nfuture work to optimize and scale VLMs for open-domain video understanding and\nreasoning. Our code and demo are available at\n\\url{https://github.com/yukw777/EILEV}.\n","authors":["Keunwoo Peter Yu","Zheyuan Zhang","Fengyuan Hu","Shane Storks","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2311.17041v4.pdf","comment":"16 pages, LaTeX; Accepted to EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2404.15206v3","updated":"2024-10-03T13:23:59Z","published":"2024-04-23T16:39:03Z","title":"Does Instruction Tuning Make LLMs More Consistent?","summary":" The purpose of instruction tuning is enabling zero-shot performance, but\ninstruction tuning has also been shown to improve chain-of-thought reasoning\nand value alignment (Si et al., 2023). Here we consider the impact on\n$\\textit{consistency}$, i.e., the sensitivity of language models to small\nperturbations in the input. We compare 10 instruction-tuned LLaMA models to the\noriginal LLaMA-7b model and show that almost across-the-board they become more\nconsistent, both in terms of their representations and their predictions in\nzero-shot and downstream tasks. We explain these improvements through\nmechanistic analyses of factual recall.\n","authors":["Constanza Fierro","Jiaang Li","Anders Søgaard"],"pdf_url":"https://arxiv.org/pdf/2404.15206v3.pdf","comment":"We need to run extra experiments to ensure some of the claims in the\n paper are fully correct"},{"id":"http://arxiv.org/abs/2410.02465v1","updated":"2024-10-03T13:15:19Z","published":"2024-10-03T13:15:19Z","title":"Response Tuning: Aligning Large Language Models without Instruction","summary":" Instruction tuning-supervised fine-tuning using instruction-response pairs-is\na foundational step in transitioning pre-trained Large Language Models (LLMs)\ninto helpful and safe chat assistants. Our hypothesis is that establishing an\nadequate output space can enable such a transition given the capabilities\ninherent in pre-trained LLMs. To verify this, we propose Response Tuning (RT),\nwhich eliminates the instruction-conditioning step in instruction tuning and\nsolely focuses on response space supervision. Our experiments demonstrate that\nRT models, trained only using responses, can effectively respond to a wide\nrange of instructions and exhibit helpfulness comparable to that of their\ninstruction-tuned counterparts. Furthermore, we observe that controlling the\ntraining response distribution can significantly improve their user preference\nor elicit target behaviors such as refusing assistance for unsafe queries. 
Our\nfindings illuminate the role of establishing an adequate output space in\nalignment, highlighting the potential of the extensive inherent capabilities of\npre-trained LLMs.\n","authors":["Seokhyun An","Hyounghun Kim"],"pdf_url":"https://arxiv.org/pdf/2410.02465v1.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2410.01242v2","updated":"2024-10-03T13:12:24Z","published":"2024-10-02T05:07:02Z","title":"RGD: Multi-LLM Based Agent Debugger via Refinement and Generation\n Guidance","summary":" Large Language Models (LLMs) have shown incredible potential in code\ngeneration tasks, and recent research in prompt engineering have enhanced LLMs'\nunderstanding of textual information. However, ensuring the accuracy of\ngenerated code often requires extensive testing and validation by programmers.\nWhile LLMs can typically generate code based on task descriptions, their\naccuracy remains limited, especially for complex tasks that require a deeper\nunderstanding of both the problem statement and the code generation process.\nThis limitation is primarily due to the LLMs' need to simultaneously comprehend\ntext and generate syntactically and semantically correct code, without having\nthe capability to automatically refine the code. In real-world software\ndevelopment, programmers rarely produce flawless code in a single attempt based\non the task description alone, they rely on iterative feedback and debugging to\nrefine their programs. Inspired by this process, we introduce a novel\narchitecture of LLM-based agents for code generation and automatic debugging:\nRefinement and Guidance Debugging (RGD). The RGD framework is a multi-LLM-based\nagent debugger that leverages three distinct LLM agents-Guide Agent, Debug\nAgent, and Feedback Agent. RGD decomposes the code generation task into\nmultiple steps, ensuring a clearer workflow and enabling iterative code\nrefinement based on self-reflection and feedback. Experimental results\ndemonstrate that RGD exhibits remarkable code generation capabilities,\nachieving state-of-the-art performance with a 9.8% improvement on the HumanEval\ndataset and a 16.2% improvement on the MBPP dataset compared to the\nstate-of-the-art approaches and traditional direct prompting approaches. We\nhighlight the effectiveness of the RGD framework in enhancing LLMs' ability to\ngenerate and refine code autonomously.\n","authors":["Haolin Jin","Zechao Sun","Huaming Chen"],"pdf_url":"https://arxiv.org/pdf/2410.01242v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07431v2","updated":"2024-10-03T13:07:25Z","published":"2024-09-11T17:21:59Z","title":"Synthetic continued pretraining","summary":" Pretraining on large-scale, unstructured internet text enables language\nmodels to acquire a significant amount of world knowledge. However, this\nknowledge acquisition is data-inefficient--to learn a given fact, models must\nbe trained on hundreds to thousands of diverse representations of it. This\nposes a challenge when adapting a pretrained model to a small corpus of\ndomain-specific documents, where each fact may appear rarely or only once. 
We\npropose to bridge this gap with synthetic continued pretraining: using the\nsmall domain-specific corpus to synthesize a large corpus more amenable to\nlearning, and then performing continued pretraining on the synthesized corpus.\nWe instantiate this proposal with EntiGraph, a synthetic data augmentation\nalgorithm that extracts salient entities from the source documents and then\ngenerates diverse text by drawing connections between the sampled entities.\nSynthetic continued pretraining with EntiGraph enables a language model to\nanswer questions and follow generic instructions related to the source\ndocuments without access to them. If, instead, the source documents are\navailable at inference time, we show that the knowledge acquired through our\napproach compounds with retrieval-augmented generation. To better understand\nthese results, we build a simple mathematical model of EntiGraph, and show how\nsynthetic data augmentation can \"rearrange\" knowledge to enable more\ndata-efficient learning.\n","authors":["Zitong Yang","Neil Band","Shuangping Li","Emmanuel Candès","Tatsunori Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2409.07431v2.pdf","comment":"Updated organization of experimental results and methods\n introduction. Released the dataset and model weights artifact"},{"id":"http://arxiv.org/abs/2406.19999v2","updated":"2024-10-03T13:02:11Z","published":"2024-06-28T15:34:26Z","title":"The SIFo Benchmark: Investigating the Sequential Instruction Following\n Ability of Large Language Models","summary":" Following multiple instructions is a crucial ability for large language\nmodels (LLMs). Evaluating this ability comes with significant challenges: (i)\nlimited coherence between multiple instructions, (ii) positional bias where the\norder of instructions affects model performance, and (iii) a lack of\nobjectively verifiable tasks. To address these issues, we introduce a benchmark\ndesigned to evaluate models' abilities to follow multiple instructions through\nsequential instruction following (SIFo) tasks. In SIFo, the successful\ncompletion of multiple instructions is verifiable by examining only the final\ninstruction. Our benchmark evaluates instruction following using four tasks\n(text modification, question answering, mathematics, and security rules), each\nassessing different aspects of sequential instruction following. Our evaluation\nof popular LLMs, both closed-source and open-source, shows that more recent and\nlarger models significantly outperform their older and smaller counterparts on\nthe SIFo tasks, validating the benchmark's effectiveness. All models struggle\nwith following sequences of instructions, hinting at an important lack of\nrobustness of today's language models.\n","authors":["Xinyi Chen","Baohao Liao","Jirui Qi","Panagiotis Eustratiadis","Christof Monz","Arianna Bisazza","Maarten de Rijke"],"pdf_url":"https://arxiv.org/pdf/2406.19999v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2410.02441v1","updated":"2024-10-03T12:39:14Z","published":"2024-10-03T12:39:14Z","title":"Embedded Topic Models Enhanced by Wikification","summary":" Topic modeling analyzes a collection of documents to learn meaningful\npatterns of words. However, previous topic models consider only the spelling of\nwords and do not take into consideration the homography of words. In this\nstudy, we incorporate the Wikipedia knowledge into a neural topic model to make\nit aware of named entities. 
We evaluate our method on two datasets, 1) news\narticles of the \textit{New York Times} and 2) the AIDA-CoNLL dataset. Our\nexperiments show that our method improves the performance of neural topic\nmodels in terms of generalizability. Moreover, we analyze frequent terms in each topic\nand the temporal dependencies between topics to demonstrate that our\nentity-aware topic models can capture the time-series development of topics\nwell.\n","authors":["Takashi Shibuya","Takehito Utsuro"],"pdf_url":"https://arxiv.org/pdf/2410.02441v1.pdf","comment":"Accepted at EMNLP 2024 Workshop NLP for Wikipedia"},{"id":"http://arxiv.org/abs/2410.02433v1","updated":"2024-10-03T12:28:13Z","published":"2024-10-03T12:28:13Z","title":"Better Call SAUL: Fluent and Consistent Language Model Editing with\n Generation Regularization","summary":" To ensure large language models contain up-to-date knowledge, they need to be\nupdated regularly. However, model editing is challenging as it might also\naffect knowledge that is unrelated to the new data. State-of-the-art methods\nidentify parameters associated with specific knowledge and then modify them via\ndirect weight updates. However, these locate-and-edit methods suffer from heavy\ncomputational overhead and lack theoretical validation. In contrast, directly\nfine-tuning the model on requested edits affects the model's behavior on\nunrelated knowledge, and significantly damages the model's generation fluency\nand consistency. To address these challenges, we propose SAUL, a streamlined\nmodel editing method that uses sentence concatenation with augmented random\nfacts for generation regularization. Evaluations on three model editing\nbenchmarks show that SAUL is a practical and reliable solution for model\nediting, outperforming state-of-the-art methods while maintaining generation\nquality and reducing computational overhead.\n","authors":["Mingyang Wang","Lukas Lange","Heike Adel","Jannik Strötgen","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2410.02433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02429v1","updated":"2024-10-03T12:24:18Z","published":"2024-10-03T12:24:18Z","title":"IoT-LLM: Enhancing Real-World IoT Task Reasoning with Large Language\n Models","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities across\ntextual and visual domains but often generate outputs that violate physical\nlaws, revealing a gap in their understanding of the physical world. Inspired by\nhuman cognition, where perception is fundamental to reasoning, we explore\naugmenting LLMs with enhanced perception abilities using Internet of Things\n(IoT) sensor data and pertinent knowledge for IoT task reasoning in the\nphysical world. In this work, we systematically study LLMs' capability to\naddress real-world IoT tasks by augmenting their perception and knowledge base,\nand then propose a unified framework, IoT-LLM, to enhance such capability. In\nIoT-LLM, we customize three steps for LLMs: preprocessing IoT data into formats\namenable to LLMs, activating their commonsense knowledge through\nchain-of-thought prompting and specialized role definitions, and expanding\ntheir understanding via IoT-oriented retrieval-augmented generation based on\nin-context learning. To evaluate the performance, we design a new benchmark\nwith five real-world IoT tasks with different data types and reasoning\ndifficulties and provide the benchmarking results on six open-source and\nclosed-source LLMs. 
Experimental results demonstrate the limitations of existing\nLLMs with naive textual inputs, which cannot perform these tasks effectively. We\nshow that IoT-LLM significantly enhances the IoT task reasoning performance of\nLLMs, such as GPT-4, achieving an average improvement of 65% across various\ntasks against previous methods. The results also showcase LLMs' ability to\ncomprehend IoT data and the physical laws behind the data by providing a reasoning\nprocess. We also discuss the limitations of our work to inspire future research in this\nnew era.\n","authors":["Tuo An","Yunjiao Zhou","Han Zou","Jianfei Yang"],"pdf_url":"https://arxiv.org/pdf/2410.02429v1.pdf","comment":"21 pages, 10 figures, submitted to ICLR 2025 Conference"},{"id":"http://arxiv.org/abs/2410.02428v1","updated":"2024-10-03T12:21:17Z","published":"2024-10-03T12:21:17Z","title":"Collective Critics for Creative Story Generation","summary":" Generating a long story of several thousand words with narrative coherence\nusing Large Language Models (LLMs) has been a challenging task. Previous\nresearch has addressed this challenge by proposing different frameworks that\ncreate a story plan and generate a long story based on that plan. However,\nthese frameworks have mainly focused on maintaining narrative coherence\nin stories, often overlooking creativity in story planning and the\nexpressiveness of the stories generated from those plans, which are desirable\nproperties to captivate readers' interest. In this paper, we propose the Collective\nCritics for Creative Story Generation framework (CritiCS), which is composed of\na plan refining stage (CrPlan) and a story generation stage (CrText), to integrate\na collective revision mechanism that promotes those properties into the long-form\nstory generation process. Specifically, in each stage, a group of LLM critics\nand one leader collaborate to incrementally refine drafts of the plan and story\nthroughout multiple rounds. Extensive human evaluation shows that CritiCS\ncan significantly enhance story creativity and reader engagement, while also\nmaintaining narrative coherence. Furthermore, the design of the framework\nallows active participation from human writers in any role within the critique\nprocess, enabling interactive human-machine collaboration in story writing.\n","authors":["Minwook Bae","Hyounghun Kim"],"pdf_url":"https://arxiv.org/pdf/2410.02428v1.pdf","comment":"EMNLP 2024 (36 pages)"},{"id":"http://arxiv.org/abs/2406.13092v2","updated":"2024-10-03T12:20:10Z","published":"2024-06-18T22:44:50Z","title":"Multilingual Synopses of Movie Narratives: A Dataset for Vision-Language\n Story Understanding","summary":" Story video-text alignment, a core task in computational story understanding,\naims to align video clips with corresponding sentences in their descriptions.\nHowever, progress on the task has been held back by the scarcity of manually\nannotated video-text correspondence and the heavy concentration on English\nnarrations of Hollywood movies. To address these issues, in this paper, we\nconstruct a large-scale multilingual video story dataset named Multilingual\nSynopses of Movie Narratives (M-SYMON), containing 13,166 movie summary videos\nfrom 7 languages, as well as manual annotation of fine-grained video-text\ncorrespondences for 101.5 hours of video. Training on the human-annotated data\nfrom SyMoN outperforms the SOTA methods by 15.7 and 16.2 percentage points on\nClip Accuracy and Sentence IoU scores, respectively, demonstrating the\neffectiveness of the annotations. 
As benchmarks for future research, we create\n6 baseline approaches with different multilingual training strategies, compare\ntheir performance in both intra-lingual and cross-lingual setups, exemplifying\nthe challenges of multilingual video-text alignment. The dataset is released\nat: https://github.com/insundaycathy/M-SyMoN\n","authors":["Yidan Sun","Jianfei Yu","Boyang Li"],"pdf_url":"https://arxiv.org/pdf/2406.13092v2.pdf","comment":"17 pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.02426v1","updated":"2024-10-03T12:19:49Z","published":"2024-10-03T12:19:49Z","title":"Learning the Latent Rules of a Game from Data: A Chess Story","summary":" We demonstrate that small pretrained foundational generative language models\nwith millions of parameters can learn the latent rules of a process from data\nassociated with the process. Inspired by Stefan Zweig's novella\n\"Schachnovelle,\" also known as \"The Royal Game\" in English, we show that 28M\nand 125M parameter pretrained foundational small language models (SLMs) can be\ninstruction fine-tuned with 1,000-to-1,000,000 examples to learn the rules of\nchess, propose legal moves, and accurately solve chess problems. We also\nexplore the impact of successive language model fine-tuning epochs on improved\noutcomes and demonstrate reductions in model hallucinations by increasing the\nnumber of instruction fine-tuning examples.\n","authors":["Ben Fauber"],"pdf_url":"https://arxiv.org/pdf/2410.02426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02425v1","updated":"2024-10-03T12:19:06Z","published":"2024-10-03T12:19:06Z","title":"LLM-Pilot: Characterize and Optimize Performance of your LLM Inference\n Services","summary":" As Large Language Models (LLMs) are rapidly growing in popularity, LLM\ninference services must be able to serve requests from thousands of users while\nsatisfying performance requirements. The performance of an LLM inference\nservice is largely determined by the hardware onto which it is deployed, but\nunderstanding of which hardware will deliver on performance requirements\nremains challenging. In this work we present LLM-Pilot - a first-of-its-kind\nsystem for characterizing and predicting performance of LLM inference services.\nLLM-Pilot performs benchmarking of LLM inference services, under a realistic\nworkload, across a variety of GPUs, and optimizes the service configuration for\neach considered GPU to maximize performance. Finally, using this\ncharacterization data, LLM-Pilot learns a predictive model, which can be used\nto recommend the most cost-effective hardware for a previously unseen LLM.\nCompared to existing methods, LLM-Pilot can deliver on performance requirements\n33% more frequently, whilst reducing costs by 60% on average.\n","authors":["Małgorzata Łazuka","Andreea Anghel","Thomas Parnell"],"pdf_url":"https://arxiv.org/pdf/2410.02425v1.pdf","comment":"Accepted to the International Conference for High Performance\n Computing, Networking, Storage and Analysis (SC '24)"},{"id":"http://arxiv.org/abs/2410.02417v1","updated":"2024-10-03T12:07:34Z","published":"2024-10-03T12:07:34Z","title":"MenakBERT -- Hebrew Diacriticizer","summary":" Diacritical marks in the Hebrew language give words their vocalized form. The\ntask of adding diacritical marks to plain Hebrew text is still dominated by a\nsystem that relies heavily on human-curated resources. Recent models trained on\ndiacritized Hebrew texts still present a gap in performance. 
We use a recently\ndeveloped character-based PLM to narrow this gap. We present MenakBERT, a\ncharacter-level transformer pretrained on Hebrew text and fine-tuned to produce\ndiacritical marks for Hebrew sentences. We further show how fine-tuning a\nmodel for diacritization transfers to a task such as part-of-speech tagging.\n","authors":["Ido Cohen","Jacob Gidron","Idan Pinto"],"pdf_url":"https://arxiv.org/pdf/2410.02417v1.pdf","comment":"Published at ISCOL2022 as a poster"},{"id":"http://arxiv.org/abs/2406.11096v3","updated":"2024-10-03T11:57:00Z","published":"2024-06-16T22:59:18Z","title":"The Potential and Challenges of Evaluating Attitudes, Opinions, and\n Values in Large Language Models","summary":" Recent advances in Large Language Models (LLMs) have sparked wide interest in\nvalidating and comprehending the human-like cognitive-behavioral traits LLMs\nmay capture and convey. These cognitive-behavioral traits typically include\nAttitudes, Opinions, and Values (AOVs). However, measuring AOVs embedded within\nLLMs remains opaque, and different evaluation methods may yield different\nresults. This has led to a lack of clarity on how different studies are related\nto each other and how they can be interpreted. This paper aims to bridge this\ngap by providing a comprehensive overview of recent works on the evaluation of\nAOVs in LLMs. Moreover, we survey related approaches in different stages of the\nevaluation pipeline in these works. By doing so, we address the potential and\nchallenges with respect to understanding the model, human-AI alignment, and\ndownstream applications in the social sciences. Finally, we provide practical\ninsights into evaluation methods, model enhancement, and interdisciplinary\ncollaboration, thereby contributing to the evolving landscape of evaluating\nAOVs in LLMs.\n","authors":["Bolei Ma","Xinpeng Wang","Tiancheng Hu","Anna-Carolina Haensch","Michael A. Hedderich","Barbara Plank","Frauke Kreuter"],"pdf_url":"https://arxiv.org/pdf/2406.11096v3.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2212.00596v2","updated":"2024-10-03T11:42:43Z","published":"2022-12-01T15:48:51Z","title":"Language models and brains align due to more than next-word prediction\n and word-level information","summary":" Pretrained language models have been shown to significantly predict brain\nrecordings of people comprehending language. Recent work suggests that the\nprediction of the next word is a key mechanism that contributes to this\nalignment. What is not yet understood is whether prediction of the next word is\nnecessary for this observed alignment or simply sufficient, and whether there\nare other shared mechanisms or information that are similarly important. In\nthis work, we take a step towards understanding the reasons for brain alignment\nvia two simple perturbations in popular pretrained language models. These\nperturbations help us design contrasts that can control for different types of\ninformation. 
By contrasting the brain alignment of these differently perturbed\nmodels, we show that improvements in alignment with brain recordings are due to\nmore than improvements in next-word prediction and word-level information.\n","authors":["Gabriele Merlin","Mariya Toneva"],"pdf_url":"https://arxiv.org/pdf/2212.00596v2.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.02396v1","updated":"2024-10-03T11:17:58Z","published":"2024-10-03T11:17:58Z","title":"Parameter Competition Balancing for Model Merging","summary":" While fine-tuning pretrained models has become common practice, these models\noften underperform outside their specific domains. Recently developed model\nmerging techniques enable the direct integration of multiple models, each\nfine-tuned for distinct tasks, into a single model. This strategy promotes\nmultitasking capabilities without requiring retraining on the original\ndatasets. However, existing methods fall short in addressing potential\nconflicts and complex correlations between tasks, especially in parameter-level\nadjustments, posing a challenge in effectively balancing parameter competition\nacross various tasks. This paper introduces an innovative technique named\nPCB-Merging (Parameter Competition Balancing), a lightweight and training-free\ntechnique that adjusts the coefficients of each parameter for effective model\nmerging. PCB-Merging employs intra-balancing to gauge parameter significance\nwithin individual tasks and inter-balancing to assess parameter similarities\nacross different tasks. Parameters with low importance scores are dropped, and\nthe remaining ones are rescaled to form the final merged model. We assessed our\napproach in diverse merging scenarios, including cross-task, cross-domain, and\ncross-training configurations, as well as out-of-domain generalization. The\nexperimental results reveal that our approach achieves substantial performance\nenhancements across multiple modalities, domains, model sizes, number of tasks,\nfine-tuning forms, and large language models, outperforming existing model\nmerging methods. The code is publicly available at:\n\\url{https://github.com/duguodong7/pcb-merging}.\n","authors":["Guodong Du","Junlin Lee","Jing Li","Runhua Jiang","Yifei Guo","Shuyang Yu","Hanting Liu","Sim Kuan Goh","Ho-Kin Tang","Daojing He","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.02396v1.pdf","comment":"Accepted by NeurIPS2024"},{"id":"http://arxiv.org/abs/2406.13560v2","updated":"2024-10-03T11:17:43Z","published":"2024-06-19T13:48:19Z","title":"Lexically Grounded Subword Segmentation","summary":" We present three innovations in tokenization and subword segmentation. First,\nwe propose to use unsupervised morphological analysis with Morfessor as\npre-tokenization. Second, we present an algebraic method for obtaining subword\nembeddings grounded in a word embedding space. Based on that, we design a novel\nsubword segmentation algorithm that uses the embeddings, ensuring that the\nprocedure considers lexical meaning. Third, we introduce an efficient\nsegmentation algorithm based on a subword bigram model that can be initialized\nwith the lexically aware segmentation method to avoid using Morfessor and large\nembedding tables at inference time. We evaluate the proposed approaches using\ntwo intrinsic metrics and measure their performance on two downstream tasks:\npart-of-speech tagging and machine translation. 
Our experiments show\nsignificant improvements in the morphological plausibility of the segmentation\nwhen evaluated using segmentation precision on morpheme boundaries and improved\nR\\'enyi efficiency in 8 languages. Although the proposed tokenization methods\ndo not have a large impact on automatic translation quality, we observe\nconsistent performance gains in the arguably more morphological task of\npart-of-speech tagging.\n","authors":["Jindřich Libovický","Jindřich Helcl"],"pdf_url":"https://arxiv.org/pdf/2406.13560v2.pdf","comment":"Camera-ready, EMNLP Main conf"},{"id":"http://arxiv.org/abs/2406.13663v3","updated":"2024-10-03T11:03:22Z","published":"2024-06-19T16:10:26Z","title":"Model Internals-based Answer Attribution for Trustworthy\n Retrieval-Augmented Generation","summary":" Ensuring the verifiability of model answers is a fundamental challenge for\nretrieval-augmented generation (RAG) in the question answering (QA) domain.\nRecently, self-citation prompting was proposed to make large language models\n(LLMs) generate citations to supporting documents along with their answers.\nHowever, self-citing LLMs often struggle to match the required format, refer to\nnon-existent sources, and fail to faithfully reflect LLMs' context usage\nthroughout the generation. In this work, we present MIRAGE --Model\nInternals-based RAG Explanations -- a plug-and-play approach using model\ninternals for faithful answer attribution in RAG applications. MIRAGE detects\ncontext-sensitive answer tokens and pairs them with retrieved documents\ncontributing to their prediction via saliency methods. We evaluate our proposed\napproach on a multilingual extractive QA dataset, finding high agreement with\nhuman answer attribution. On open-ended QA, MIRAGE achieves citation quality\nand efficiency comparable to self-citation while also allowing for a\nfiner-grained control of attribution parameters. Our qualitative evaluation\nhighlights the faithfulness of MIRAGE's attributions and underscores the\npromising application of model internals for RAG answer attribution.\n","authors":["Jirui Qi","Gabriele Sarti","Raquel Fernández","Arianna Bisazza"],"pdf_url":"https://arxiv.org/pdf/2406.13663v3.pdf","comment":"Accepted by EMNLP 2024 Main Conference. Code and data released at\n https://github.com/Betswish/MIRAGE"},{"id":"http://arxiv.org/abs/2410.02381v1","updated":"2024-10-03T11:01:25Z","published":"2024-10-03T11:01:25Z","title":"MetaMetrics: Calibrating Metrics For Generation Tasks Using Human\n Preferences","summary":" Understanding the quality of a performance evaluation metric is crucial for\nensuring that model outputs align with human preferences. However, it remains\nunclear how well each metric captures the diverse aspects of these preferences,\nas metrics often excel in one particular area but not across all dimensions. To\naddress this, it is essential to systematically calibrate metrics to specific\naspects of human preference, catering to the unique characteristics of each\naspect. We introduce MetaMetrics, a calibrated meta-metric designed to evaluate\ngeneration tasks across different modalities in a supervised manner.\nMetaMetrics optimizes the combination of existing metrics to enhance their\nalignment with human preferences. Our metric demonstrates flexibility and\neffectiveness in both language and vision downstream tasks, showing significant\nbenefits across various multilingual and multi-domain scenarios. 
MetaMetrics\naligns closely with human preferences and is highly extendable and easily\nintegrable into any application. This makes MetaMetrics a powerful tool for\nimproving the evaluation of generation tasks, ensuring that metrics are more\nrepresentative of human judgment across diverse contexts.\n","authors":["Genta Indra Winata","David Anugraha","Lucky Susanto","Garry Kuwanto","Derry Tanti Wijaya"],"pdf_url":"https://arxiv.org/pdf/2410.02381v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2409.02889v2","updated":"2024-10-03T11:01:14Z","published":"2024-09-04T17:25:21Z","title":"LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via a\n Hybrid Architecture","summary":" Expanding the long-context capabilities of Multi-modal Large Language\nModels~(MLLMs) is crucial for video understanding, high-resolution image\nunderstanding, and multi-modal agents. This involves a series of systematic\noptimizations, including model architecture, data construction and training\nstrategy, particularly addressing challenges such as \\textit{degraded\nperformance with more images} and \\textit{high computational costs}. In this\npaper, we adapt the model architecture to a hybrid of Mamba and Transformer\nblocks, approach data construction with both temporal and spatial dependencies\namong multiple images and employ a progressive training strategy. The released\nmodel \\textbf{LongLLaVA}~(\\textbf{Long}-Context \\textbf{L}arge\n\\textbf{L}anguage \\textbf{a}nd \\textbf{V}ision \\textbf{A}ssistant) is the first\nhybrid MLLM, which achieved a better balance between efficiency and\neffectiveness. LongLLaVA not only achieves competitive results across various\nbenchmarks, but also maintains high throughput and low memory consumption.\nEspecially, it could process nearly a thousand images on a single A100 80GB\nGPU, showing promising application prospects for a wide range of tasks.\n","authors":["Xidong Wang","Dingjie Song","Shunian Chen","Chen Zhang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02889v2.pdf","comment":"20 pages, 9 figures, 9 tables"},{"id":"http://arxiv.org/abs/2410.02378v1","updated":"2024-10-03T10:51:02Z","published":"2024-10-03T10:51:02Z","title":"Towards Comprehensive Detection of Chinese Harmful Memes","summary":" This paper has been accepted in the NeurIPS 2024 D & B Track. Harmful memes\nhave proliferated on the Chinese Internet, while research on detecting Chinese\nharmful memes significantly lags behind due to the absence of reliable datasets\nand effective detectors. To this end, we focus on the comprehensive detection\nof Chinese harmful memes. We construct ToxiCN MM, the first Chinese harmful\nmeme dataset, which consists of 12,000 samples with fine-grained annotations\nfor various meme types. Additionally, we propose a baseline detector,\nMultimodal Knowledge Enhancement (MKE), incorporating contextual information of\nmeme content generated by the LLM to enhance the understanding of Chinese\nmemes. During the evaluation phase, we conduct extensive quantitative\nexperiments and qualitative analyses on multiple baselines, including LLMs and\nour MKE. The experimental results indicate that detecting Chinese harmful memes\nis challenging for existing models while demonstrating the effectiveness of\nMKE. 
The resources for this paper are available at\nhttps://github.com/DUT-lujunyu/ToxiCN_MM.\n","authors":["Junyu Lu","Bo Xu","Xiaokun Zhang","Hongbo Wang","Haohao Zhu","Dongyu Zhang","Liang Yang","Hongfei Lin"],"pdf_url":"https://arxiv.org/pdf/2410.02378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02365v1","updated":"2024-10-03T10:24:24Z","published":"2024-10-03T10:24:24Z","title":"From Concrete to Abstract: A Multimodal Generative Approach to Abstract\n Concept Learning","summary":" Understanding and manipulating concrete and abstract concepts is fundamental\nto human intelligence. Yet, they remain challenging for artificial agents. This\npaper introduces a multimodal generative approach to high order abstract\nconcept learning, which integrates visual and categorical linguistic\ninformation from concrete ones. Our model initially grounds subordinate level\nconcrete concepts, combines them to form basic level concepts, and finally\nabstracts to superordinate level concepts via the grounding of basic-level\nconcepts. We evaluate the model language learning ability through\nlanguage-to-visual and visual-to-language tests with high order abstract\nconcepts. Experimental results demonstrate the proficiency of the model in both\nlanguage understanding and language naming tasks.\n","authors":["Haodong Xie","Rahul Singh Maharjan","Federico Tavella","Angelo Cangelosi"],"pdf_url":"https://arxiv.org/pdf/2410.02365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02355v1","updated":"2024-10-03T10:06:27Z","published":"2024-10-03T10:06:27Z","title":"AlphaEdit: Null-Space Constrained Knowledge Editing for Language Models","summary":" Large language models (LLMs) often exhibit hallucinations due to incorrect or\noutdated knowledge. Hence, model editing methods have emerged to enable\ntargeted knowledge updates. To achieve this, a prevailing paradigm is the\nlocating-then-editing approach, which first locates influential parameters and\nthen edits them by introducing a perturbation. While effective, current studies\nhave demonstrated that this perturbation inevitably disrupt the originally\npreserved knowledge within LLMs, especially in sequential editing scenarios. To\naddress this, we introduce AlphaEdit, a novel solution that projects\nperturbation onto the null space of the preserved knowledge before applying it\nto the parameters. We theoretically prove that this projection ensures the\noutput of post-edited LLMs remains unchanged when queried about the preserved\nknowledge, thereby mitigating the issue of disruption. Extensive experiments on\nvarious LLMs, including LLaMA3, GPT2-XL, and GPT-J, show that AlphaEdit boosts\nthe performance of most locating-then-editing methods by an average of 36.4%\nwith a single line of additional code for projection solely. Our code is\navailable at: https://github.com/jianghoucheng/AlphaEdit.\n","authors":["Junfeng Fang","Houcheng Jiang","Kun Wang","Yunshan Ma","Xiang Wang","Xiangnan He","Tat-seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.02355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02343v1","updated":"2024-10-03T09:53:48Z","published":"2024-10-03T09:53:48Z","title":"Listening to the Wise Few: Select-and-Copy Attention Heads for\n Multiple-Choice QA","summary":" A standard way to evaluate the abilities of LLM involves presenting a\nmultiple-choice question and selecting the option with the highest logit as the\nmodel's predicted answer. 
However, such a format for evaluating LLMs has\nlimitations, since even if the model knows the correct answer, it may struggle\nto select the corresponding letter simply due to difficulties in following this\nrigid format. To address this, we introduce new scores that better capture and\nreveal the model's underlying knowledge: the Query-Key Score (QK-score), derived\nfrom the interaction between query and key representations in attention heads,\nand the Attention Score, based on attention weights. These scores are extracted\nfrom specific \textit{select-and-copy} heads, which show consistent performance\nacross popular Multi-Choice Question Answering (MCQA) datasets. Based on these\nscores, our method improves knowledge extraction, yielding up to 16\% gain for\nLLaMA2-7B and up to 10\% for larger models on popular MCQA benchmarks. At the\nsame time, the accuracy on a simple synthetic dataset, where the model\nexplicitly knows the right answer, increases by almost 60\%, achieving nearly\nperfect accuracy, thereby demonstrating the method's efficiency in mitigating\nMCQA format limitations. To support our claims, we conduct experiments on\nmodels ranging from 7 billion to 70 billion parameters in both zero- and\nfew-shot setups.\n","authors":["Eduard Tulchinskii","Laida Kushnareva","Kristian Kuznetsov","Anastasia Voznyuk","Andrei Andriiainen","Irina Piontkovskaya","Evgeny Burnaev","Serguei Barannikov"],"pdf_url":"https://arxiv.org/pdf/2410.02343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02338v1","updated":"2024-10-03T09:48:09Z","published":"2024-10-03T09:48:09Z","title":"How Much Can RAG Help the Reasoning of LLM?","summary":" Retrieval-Augmented Generation (RAG) has gained significant popularity in\nmodern Large Language Models (LLMs) due to its effectiveness in introducing new\nknowledge and reducing hallucinations. However, a deep understanding of RAG\nremains limited: how RAG helps the reasoning process and whether RAG can help\nimprove reasoning capability remain open questions. While external documents are\ntypically considered as a method to incorporate domain-specific information,\nthey also contain intermediate reasoning results related to the query; this\nsuggests that documents could enhance the reasoning capability of LLMs, which\nhas not been previously explored. In this paper, we investigate this issue in\ndepth and find that while RAG can assist with reasoning, the help is limited.\nIf we conceptualize the reasoning process as a tree with fixed depth, then RAG\nstruggles to assist LLMs in performing deeper reasoning. Additionally, the\ninformation in the documents requires preprocessing to filter out noise. We\ndemonstrate that this preprocessing is difficult to achieve by simply fine-tuning\nthe LLM; it often necessitates numerous additional transformer layers to\nsolve the problem. To simplify the problem, we propose DPrompt tuning, which\neffectively resolves the issue within just a limited number of transformer layers,\nleading to improved performance.\n","authors":["Jingyu Liu","Jiaen Lin","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2410.02338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18369v2","updated":"2024-10-03T09:45:47Z","published":"2024-05-28T17:08:31Z","title":"PromptWizard: Task-Aware Prompt Optimization Framework","summary":" Large language models (LLMs) have transformed AI across diverse domains, with\nprompting being central to their success in guiding model outputs. 
However,\nmanual prompt engineering is both labor-intensive and domain-specific,\nnecessitating the need for automated solutions. We introduce PromptWizard, a\nnovel, fully automated framework for discrete prompt optimization, utilizing a\nself-evolving, self-adapting mechanism. Through a feedback-driven critique and\nsynthesis process, PromptWizard achieves an effective balance between\nexploration and exploitation, iteratively refining both prompt instructions and\nin-context examples to generate human-readable, task-specific prompts. This\nguided approach systematically improves prompt quality, resulting in superior\nperformance across 45 tasks. PromptWizard excels even with limited training\ndata, smaller LLMs, and various LLM architectures. Additionally, our cost\nanalysis reveals a substantial reduction in API calls, token usage, and overall\ncost, demonstrating PromptWizard's efficiency, scalability, and advantages over\nexisting prompt optimization strategies.\n","authors":["Eshaan Agarwal","Joykirat Singh","Vivek Dani","Raghav Magazine","Tanuja Ganu","Akshay Nambi"],"pdf_url":"https://arxiv.org/pdf/2405.18369v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12319v2","updated":"2024-10-03T09:38:48Z","published":"2024-06-18T06:43:04Z","title":"On the Adversarial Vulnerability of Pairwise Evaluation Using Large\n Language Models","summary":" Pairwise evaluation using large language models (LLMs) is widely adopted for\nevaluating generated outputs. However, the reliability of LLM evaluators is\noften compromised by their biased preferences, such as favoring verbosity and\nan authoritative tone. In this work, we find that the evaluation setup itself\ncan significantly amplify these biases, where pairwise evaluators exhibit more\nundesirable tendencies than pointwise evaluators. Our analysis further reveals\nthat even when pairwise evaluators make incorrect judgments, they can still\naccurately identify shortcomings in low-quality outputs. As a simple remedy, we\nalso propose incorporating pointwise reasoning into pairwise evaluation.\nExperimental results show that our method improves the performance of pairwise\nevaluators on adversarial samples across various models. We hope our findings\nencourage further exploration into the reliability of LLM evaluators.\n","authors":["Hawon Jeong","ChaeHun Park","Jimin Hong","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2406.12319v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04459v3","updated":"2024-10-03T09:32:31Z","published":"2024-07-05T12:09:40Z","title":"Generalists vs. Specialists: Evaluating Large Language Models for Urdu","summary":" In this paper, we compare general-purpose models, GPT-4-Turbo and Llama-3-8b,\nwith special-purpose models--XLM-Roberta-large, mT5-large, and Llama-3-8b--that\nhave been fine-tuned on specific tasks. We focus on seven classification and\nseven generation tasks to evaluate the performance of these models on Urdu\nlanguage. Urdu has 70 million native speakers, yet it remains underrepresented\nin Natural Language Processing (NLP). Despite the frequent advancements in\nLarge Language Models (LLMs), their performance in low-resource languages,\nincluding Urdu, still needs to be explored. We also conduct a human evaluation\nfor the generation tasks and compare the results with the evaluations performed\nby GPT-4-Turbo, Llama-3-8b and Claude 3.5 Sonnet. We find that special-purpose\nmodels consistently outperform general-purpose models across various tasks. 
We\nalso find that the evaluation done by GPT-4-Turbo for generation tasks aligns\nmore closely with human evaluation than the evaluation\ndone by Llama-3-8b. This paper contributes to the NLP community by providing\ninsights into the effectiveness of general-purpose and special-purpose LLMs for\nlow-resource languages.\n","authors":["Samee Arif","Abdul Hameed Azeemi","Agha Ali Raza","Awais Athar"],"pdf_url":"https://arxiv.org/pdf/2407.04459v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02330v1","updated":"2024-10-03T09:28:59Z","published":"2024-10-03T09:28:59Z","title":"Llama SLayer 8B: Shallow Layers Hold the Key to Knowledge Injection","summary":" As a way to augment pre-trained large language models (LLMs), knowledge\ninjection is critical for developing vertical-domain large models and has been\nwidely studied. Most current approaches, including parameter-efficient\nfine-tuning (PEFT) and block expansion methods, uniformly apply knowledge\nacross all LLM layers, which raises the question: are all layers equally crucial\nfor knowledge injection? We begin by evaluating the importance of each layer in\nfinding the optimal layer range for knowledge injection. Intuitively, the more\nimportant layers should play a more critical role in knowledge injection and\ndeserve a denser injection. We observe performance dips in question-answering\nbenchmarks after the removal or expansion of the shallow layers, and the\ndegradation shrinks as the layer gets deeper, indicating that the shallow\nlayers hold the key to knowledge injection. This insight leads us to propose\nthe S strategy, a post-pretraining strategy of selectively enhancing shallow\nlayers while pruning the less effective deep ones. Based on this strategy, we\nintroduce Llama Slayer-8B and Llama Slayer-8B-Instruct. We experimented on a\ncorpus of code $\&$ math and demonstrated the effectiveness of our strategy.\nFurther experiments on a different LLM, Mistral-7B, and a legal corpus\nconfirmed the general applicability of the approach, underscoring its\nwide-ranging efficacy. Our code is available at:\nhttps://github.com/txchen-USTC/Llama-Slayer\n","authors":["Tianxiang Chen","Zhentao Tan","Tao Gong","Yue Wu","Qi Chu","Bin Liu","Jieping Ye","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2410.02330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16050v2","updated":"2024-10-03T09:24:56Z","published":"2024-02-25T10:27:46Z","title":"Efficient Temporal Extrapolation of Multimodal Large Language Models\n with Temporal Grounding Bridge","summary":" Despite progress in multimodal large language models (MLLMs), the challenge\nof interpreting long-form videos in response to linguistic queries persists,\nlargely due to the inefficiency in temporal grounding and limited pre-trained\ncontext window size. In this work, we introduce Temporal Grounding Bridge\n(TGB), a novel framework that bootstraps MLLMs with advanced temporal grounding\ncapabilities and broadens their contextual scope. Our framework significantly\nenhances the temporal capabilities of current MLLMs through three key\ninnovations: an efficient multi-span temporal grounding algorithm applied to\nlow-dimension temporal features projected from flow; a multimodal length\nextrapolation training paradigm that utilizes low-dimension temporal features\nto extend the training context window size; and a bootstrapping framework that\nbridges our model with pluggable MLLMs without requiring annotation. 
We\nvalidate TGB across seven video benchmarks and demonstrate substantial\nperformance improvements compared with prior MLLMs. Notably, our model,\ninitially trained on sequences of four frames, effectively handles sequences up\nto 16 longer without sacrificing performance, highlighting its scalability and\neffectiveness in real-world applications. Our code is publicly available at\nhttps://github.com/bigai-nlco/VideoTGB\n","authors":["Yuxuan Wang","Yueqian Wang","Pengfei Wu","Jianxin Liang","Dongyan Zhao","Yang Liu","Zilong Zheng"],"pdf_url":"https://arxiv.org/pdf/2402.16050v2.pdf","comment":"To appear at EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.12924v2","updated":"2024-10-03T09:21:57Z","published":"2024-09-04T03:17:19Z","title":"WaveletGPT: Wavelets Meet Large Language Models","summary":" Large Language Models (LLMs) have ushered in a new wave of artificial\nintelligence advancements impacting every scientific field and discipline. They\nare trained on a simple objective: to predict the next token given the previous\ncontext. We live in a world where most of the data around us, e.g., text,\naudio, and music, has a multi-scale structure associated with it. This paper\ninfuses LLMs with traditional signal processing ideas, namely wavelets, during\npre-training to take advantage of the structure. Without adding \\textbf{any\nextra parameters} to a GPT-style LLM architecture, we achieve the same\npre-training performance almost twice as fast in text, raw audio, and symbolic\nmusic. This is achieved by imposing a structure on intermediate embeddings.\nWhen trained for the same number of training steps, we achieve significant\ngains in performance, which is comparable to pre-training a larger neural\narchitecture. Our architecture allows every next token prediction access to\nintermediate embeddings at different temporal resolutions in every Transformer\ndecoder block. This work will hopefully pave the way for incorporating\nmulti-rate signal processing ideas into traditional LLM pre-training. Further,\nwe showcase pushing model performance by improving internal structure instead\nof just going after scale.\n","authors":["Prateek Verma"],"pdf_url":"https://arxiv.org/pdf/2409.12924v2.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.09589v4","updated":"2024-10-03T09:00:35Z","published":"2024-05-15T10:16:25Z","title":"A Comprehensive Survey of Hallucination in Large Language, Image, Video\n and Audio Foundation Models","summary":" The rapid advancement of foundation models (FMs) across language, image,\naudio, and video domains has shown remarkable capabilities in diverse tasks.\nHowever, the proliferation of FMs brings forth a critical challenge: the\npotential to generate hallucinated outputs, particularly in high-stakes\napplications. The tendency of foundation models to produce hallucinated content\narguably represents the biggest hindrance to their widespread adoption in\nreal-world scenarios, especially in domains where reliability and accuracy are\nparamount. This survey paper presents a comprehensive overview of recent\ndevelopments that aim to identify and mitigate the problem of hallucination in\nFMs, spanning text, image, video, and audio modalities. By synthesizing recent\nadvancements in detecting and mitigating hallucination across various\nmodalities, the paper aims to provide valuable insights for researchers,\ndevelopers, and practitioners. 
Essentially, it establishes a clear framework\nencompassing definition, taxonomy, and detection strategies for addressing\nhallucination in multimodal foundation models, laying the foundation for future\nresearch in this pivotal area.\n","authors":["Pranab Sahoo","Prabhash Meharia","Akash Ghosh","Sriparna Saha","Vinija Jain","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2405.09589v4.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2410.02320v1","updated":"2024-10-03T08:56:29Z","published":"2024-10-03T08:56:29Z","title":"Post-edits Are Preferences Too","summary":" Preference Optimization (PO) is currently one of the state-of-the-art\ntechniques for fine-tuning large language models (LLMs) on pairwise\npreference feedback from human annotators. However, in machine translation,\nthis sort of feedback can be difficult to solicit. Additionally, Kreutzer et\nal. (2018) have shown that, for machine translation, pairwise preferences are\nless reliable than other forms of human feedback, such as 5-point ratings.\n We examine post-edits to see if they can be a source of reliable human\npreferences by construction. In PO, a human annotator is shown sequences $s_1$\nand $s_2$ and asked for a preference judgment $s_1 > s_2$; while for\npost-editing, editors \emph{create} $s_1$ and know that it should be better\nthan $s_2$. We attempt to use these implicit preferences for PO and show that\nthis helps the model move towards post-edit-like hypotheses and away from machine\ntranslation-like hypotheses. Furthermore, we show that the best results are\nobtained by pre-training the model with supervised fine-tuning (SFT) on\npost-edits in order to promote post-edit-like hypotheses to the top output\nranks.\n","authors":["Nathaniel Berger","Stefan Riezler","Miriam Exel","Matthias Huck"],"pdf_url":"https://arxiv.org/pdf/2410.02320v1.pdf","comment":"To appear at the Ninth Conference on Machine Translation (WMT24)"},{"id":"http://arxiv.org/abs/2406.02069v3","updated":"2024-10-03T08:46:42Z","published":"2024-06-04T07:51:30Z","title":"PyramidKV: Dynamic KV Cache Compression based on Pyramidal Information\n Funneling","summary":" In this study, we investigate whether attention-based information flow inside\nlarge language models (LLMs) is aggregated through noticeable patterns for long\ncontext processing. Our observations reveal that LLMs aggregate information\nthrough Pyramidal Information Funneling, where attention scatters widely in\nlower layers, progressively consolidates within specific contexts, and\nultimately focuses on critical tokens (a.k.a. massive activations or attention\nsinks) in higher layers. Motivated by these insights, we developed PyramidKV, a\nnovel and effective KV cache compression method. This approach dynamically\nadjusts the KV cache size across different layers, allocating more cache in\nlower layers and less in higher ones, diverging from traditional methods that\nmaintain a uniform KV cache size. Our experimental evaluations, utilizing the\nLongBench benchmark, show that PyramidKV matches the performance of models with\na full KV cache while retaining only 12% of the KV cache, thus significantly\nreducing memory usage. In scenarios emphasizing memory efficiency, where only\n0.7% of the KV cache is maintained, PyramidKV surpasses other KV cache\ncompression techniques, achieving up to a 20.5 absolute accuracy improvement on\nthe TREC dataset. 
In the Needle-in-a-Haystack experiment, PyramidKV outperforms\ncompeting methods in maintaining long-context comprehension in LLMs; notably,\nretaining just 128 KV cache entries enables the LLAMA-3-70B model to achieve\n100% Acc. performance, matching that of a full KV cache.\n","authors":["Zefan Cai","Yichi Zhang","Bofei Gao","Yuliang Liu","Tianyu Liu","Keming Lu","Wayne Xiong","Yue Dong","Baobao Chang","Junjie Hu","Wen Xiao"],"pdf_url":"https://arxiv.org/pdf/2406.02069v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02308v1","updated":"2024-10-03T08:44:17Z","published":"2024-10-03T08:44:17Z","title":"Traffic Light or Light Traffic? Investigating Phrasal Semantics in Large\n Language Models","summary":" Phrases are fundamental linguistic units through which humans convey\nsemantics. This study critically examines the capacity of API-based large\nlanguage models (LLMs) to comprehend phrase semantics, utilizing three\nhuman-annotated datasets. We assess the performance of LLMs in executing phrase\nsemantic reasoning tasks guided by natural language instructions and explore\nthe impact of common prompting techniques, including few-shot demonstrations\nand Chain-of-Thought reasoning. Our findings reveal that LLMs greatly\noutperform traditional embedding methods across the datasets; however, they do\nnot show a significant advantage over fine-tuned methods. The effectiveness of\nadvanced prompting strategies shows variability. We conduct detailed error\nanalyses to interpret the limitations faced by LLMs in comprehending phrase\nsemantics. Code and data can be found at\nhttps://github.com/memray/llm_phrase_semantics.\n","authors":["Rui Meng","Ye Liu","Lifu Tu","Daqing He","Yingbo Zhou","Semih Yavuz"],"pdf_url":"https://arxiv.org/pdf/2410.02308v1.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2406.17233v2","updated":"2024-10-03T08:43:25Z","published":"2024-06-25T02:37:53Z","title":"Self-Constructed Context Decompilation with Fined-grained Alignment\n Enhancement","summary":" Decompilation transforms compiled code back into a high-level programming\nlanguage for analysis when source code is unavailable. Previous work has\nprimarily focused on enhancing decompilation performance by increasing the\nscale of model parameters or training data for pre-training. Based on the\ncharacteristics of the decompilation task, we propose two methods: (1) Without\nfine-tuning, the Self-Constructed Context Decompilation (sc$^2$dec) method\nrecompiles the LLM's decompilation results to construct pairs for in-context\nlearning, helping the model improve decompilation performance. (2) Fine-grained\nAlignment Enhancement (FAE), which meticulously aligns assembly code with\nsource code at the statement level by leveraging debugging information, is\nemployed during the fine-tuning phase to achieve further improvements in\ndecompilation. By integrating these two methods, we achieved a Re-Executability\nperformance improvement of approximately 3.90% on the Decompile-Eval benchmark,\nestablishing a new state-of-the-art performance of 52.41%. 
The code, data, and\nmodels are available at https://github.com/AlongWY/sccdec.\n","authors":["Yunlong Feng","Dechuan Teng","Yang Xu","Honglin Mu","Xiao Xu","Libo Qin","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2406.17233v2.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2410.02298v1","updated":"2024-10-03T08:34:17Z","published":"2024-10-03T08:34:17Z","title":"Jailbreak Antidote: Runtime Safety-Utility Balance via Sparse\n Representation Adjustment in Large Language Models","summary":" As large language models (LLMs) become integral to various applications,\nensuring both their safety and utility is paramount. Jailbreak attacks, which\nmanipulate LLMs into generating harmful content, pose significant challenges to\nthis balance. Existing defenses, such as prompt engineering and safety\nfine-tuning, often introduce computational overhead, increase inference\nlatency, and lack runtime flexibility. Moreover, overly restrictive safety\nmeasures can degrade model utility by causing refusals of benign queries. In\nthis paper, we introduce Jailbreak Antidote, a method that enables real-time\nadjustment of LLM safety preferences by manipulating a sparse subset of the\nmodel's internal states during inference. By shifting the model's hidden\nrepresentations along a safety direction with varying strengths, we achieve\nflexible control over the safety-utility balance without additional token\noverhead or inference delays. Our analysis reveals that safety-related\ninformation in LLMs is sparsely distributed; adjusting approximately 5% of the\ninternal state is as effective as modifying the entire state. Extensive\nexperiments on nine LLMs (ranging from 2 billion to 72 billion parameters),\nevaluated against ten jailbreak attack methods and compared with six defense\nstrategies, validate the effectiveness and efficiency of our approach. By\ndirectly manipulating internal states during reasoning, Jailbreak Antidote\noffers a lightweight, scalable solution that enhances LLM safety while\npreserving utility, opening new possibilities for real-time safety mechanisms\nin widely-deployed AI systems.\n","authors":["Guobin Shen","Dongcheng Zhao","Yiting Dong","Xiang He","Yi Zeng"],"pdf_url":"https://arxiv.org/pdf/2410.02298v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2410.02297v1","updated":"2024-10-03T08:27:59Z","published":"2024-10-03T08:27:59Z","title":"Make Compound Sentences Simple to Analyze: Learning to Split Sentences\n for Aspect-based Sentiment Analysis","summary":" In the domain of Aspect-Based Sentiment Analysis (ABSA), generative methods\nhave shown promising results and achieved substantial advancements. However,\ndespite these advancements, the tasks of extracting sentiment quadruplets,\nwhich capture the nuanced sentiment expressions within a sentence, remain\nsignificant challenges. In particular, compound sentences can potentially\ncontain multiple quadruplets, making the extraction task increasingly difficult\nas sentence complexity grows. To address this issue, we are focusing on\nsimplifying sentence structures to facilitate the easier recognition of these\nelements and crafting a model that integrates seamlessly with various ABSA\ntasks. In this paper, we propose Aspect Term Oriented Sentence Splitter\n(ATOSS), which simplifies compound sentence into simpler and clearer forms,\nthereby clarifying their structure and intent. 
As a plug-and-play module, this\napproach retains the parameters of the ABSA model while making it easier to\nidentify essential intent within input sentences. Extensive experimental\nresults show that utilizing ATOSS outperforms existing methods in both ASQP and\nACOS tasks, which are the primary tasks for extracting sentiment quadruplets.\n","authors":["Yongsik Seo","Sungwon Song","Ryang Heo","Jieyong Kim","Dongha Lee"],"pdf_url":"https://arxiv.org/pdf/2410.02297v1.pdf","comment":"Accepted at EMNLP 2024 (Findings, long paper)"},{"id":"http://arxiv.org/abs/2410.02296v1","updated":"2024-10-03T08:27:54Z","published":"2024-10-03T08:27:54Z","title":"Language Models are Graph Learners","summary":" Language Models (LMs) are increasingly challenging the dominance of\ndomain-specific models, including Graph Neural Networks (GNNs) and Graph\nTransformers (GTs), in graph learning tasks. Following this trend, we propose a\nnovel approach that empowers off-the-shelf LMs to achieve performance\ncomparable to state-of-the-art GNNs on node classification tasks, without\nrequiring any architectural modification. By preserving the LM's original\narchitecture, our approach retains a key benefit of LM instruction tuning: the\nability to jointly train on diverse datasets, fostering greater flexibility and\nefficiency. To achieve this, we introduce two key augmentation strategies: (1)\nEnriching LMs' input using topological and semantic retrieval methods, which\nprovide richer contextual information, and (2) guiding the LMs' classification\nprocess through a lightweight GNN classifier that effectively prunes class\ncandidates. Our experiments on real-world datasets show that backbone Flan-T5\nmodels equipped with these augmentation strategies outperform state-of-the-art\ntext-output node classifiers and are comparable to top-performing vector-output\nnode classifiers. By bridging the gap between specialized task-specific node\nclassifiers and general LMs, this work paves the way for more versatile and\nwidely applicable graph learning models. We will open-source the code upon\npublication.\n","authors":["Zhe Xu","Kaveh Hassani","Si Zhang","Hanqing Zeng","Michihiro Yasunaga","Limei Wang","Dongqi Fu","Ning Yao","Bo Long","Hanghang Tong"],"pdf_url":"https://arxiv.org/pdf/2410.02296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18430v2","updated":"2024-10-03T08:24:40Z","published":"2024-03-27T10:36:17Z","title":"Exploring language relations through syntactic distances and geographic\n proximity","summary":" Languages are grouped into families that share common linguistic traits.\nWhile this approach has been successful in understanding genetic relations\nbetween diverse languages, more analyses are needed to accurately quantify\ntheir relatedness, especially in less studied linguistic levels such as syntax.\nHere, we explore linguistic distances using series of parts of speech (POS)\nextracted from the Universal Dependencies dataset. Within an\ninformation-theoretic framework, we show that employing POS trigrams maximizes\nthe possibility of capturing syntactic variations while being at the same time\ncompatible with the amount of available data. Linguistic connections are then\nestablished by assessing pairwise distances based on the POS distributions.\nIntriguingly, our analysis reveals definite clusters that correspond to well\nknown language families and groups, with exceptions explained by distinct\nmorphological typologies. 
Furthermore, we obtain a significant correlation\nbetween language similarity and geographic distance, which underscores the\ninfluence of spatial proximity on language kinships.\n","authors":["Juan De Gregorio","Raúl Toral","David Sánchez"],"pdf_url":"https://arxiv.org/pdf/2403.18430v2.pdf","comment":"39 pages"},{"id":"http://arxiv.org/abs/2410.02293v1","updated":"2024-10-03T08:23:06Z","published":"2024-10-03T08:23:06Z","title":"Efficient Second-Order Neural Network Optimization via Adaptive Trust\n Region Methods","summary":" Second-order optimization methods offer notable advantages in training deep\nneural networks by utilizing curvature information to achieve faster\nconvergence. However, traditional second-order techniques are computationally\nprohibitive, primarily due to the large matrix inversions and high memory\ndemands they require. While adaptive trust-region methods have been developed\nto mitigate these issues, their performance is often hindered by conservative\nestimates of key parameters, such as the Lipschitz constant of the Hessian,\nresulting in suboptimal outcomes. In this paper, we introduce\nSecondOrderAdaptiveAdam (SOAA), a novel optimization algorithm designed to\novercome these limitations. SOAA approximates the Fisher information matrix\nusing a diagonal representation, reducing computational complexity from\n\\(O(n^{2})\\) to \\(O(n)\\), thereby making it suitable for large-scale deep\nlearning models, including large language models (LLMs). Additionally, the\nalgorithm integrates an adaptive trust-region mechanism that dynamically\nadjusts the trust region size based on observed loss reduction, ensuring both\nrobust convergence and computational efficiency. We empirically demonstrate\nthat SOAA achieves faster and more stable convergence compared to first-order\noptimizers, such as Adam, under similar computational constraints. However, the\ndiagonal approximation of the Fisher information matrix may be less effective\nin capturing higher-order interactions between gradients, suggesting potential\nareas for further refinement and future research.\n","authors":["James Vo"],"pdf_url":"https://arxiv.org/pdf/2410.02293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02284v1","updated":"2024-10-03T08:07:55Z","published":"2024-10-03T08:07:55Z","title":"Correlation and Navigation in the Vocabulary Key Representation Space of\n Language Models","summary":" Language model (LM) decoding is based on the next-token prediction (NTP)\nprobability distribution. For neural LMs (e.g., Transformer-based), NTP\ndistribution is essentially a softmax-regularized dot product between an\nencoded input context (query) and fixed vocabulary representations (keys). In\nthis paper, we study the effect of the key distribution on the NTP\ndistribution, with a focus on whether the similarity between keys will trigger\nspurious correlations in NTP. Through knowledge-probing tasks, we show that in\nthe NTP distribution, the few top-ranked tokens are typically accurate.\nHowever, the middle-ranked prediction is highly biased towards the tokens that\nare distributionally (not necessarily semantically) similar to these top ones.\nFor instance, if \"P\" is predicted as the top-1 token, \"A\"-\"Z\" will all be\nranked high in NTP, no matter whether they can lead to correct decoding\nresults. This hurts the sampling diversity and makes the sampling of correct,\nlong-tail results hopeless and noisy. 
We attempt to alleviate this issue via a\nnovel in-context method that iteratively pushes the query representation away\nfrom explored regions. Specifically, we include the explored decoding results\nin the context and prompt the LM to generate something else, which encourages\nthe LM to produce a query representation that has small dot products with\nexplored keys. Experiments on knowledge-probing tasks show that our method\nleads to efficient navigation away from explored keys to correct new keys. We\nfurther extend our method to open-ended and chain-of-thought (for reasoning)\ngeneration. Experiment results show that ICN contributes to better generation\ndiversity and improved self-consistency voting performance. Finally, we discuss\npotential training issues caused by the fixed key space together with the\nchallenges and possible ways to address them in future research.\n","authors":["Letian Peng","Chenyang An","Jingbo Shang"],"pdf_url":"https://arxiv.org/pdf/2410.02284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02283v1","updated":"2024-10-03T08:07:14Z","published":"2024-10-03T08:07:14Z","title":"Morphological evaluation of subwords vocabulary used by BETO language\n model","summary":" Subword tokenization algorithms used by Large Language Models are\nsignificantly more efficient and can independently build the necessary\nvocabulary of words and subwords without human intervention. However, those\nsubwords do not always align with real morphemes, potentially impacting the\nmodels' performance, though it remains uncertain when this might occur. In\nprevious research, we proposed a method to assess the morphological quality of\nvocabularies, focusing on the overlap between these vocabularies and the\nmorphemes of a given language. Our evaluation method was built on three quality\nmeasures, relevance, cohesion, and morphological accuracy, and a procedure for\ntheir assessment. By applying this method to vocabularies created by three\nsubword tokenization algorithms, BPE, Wordpiece, and Unigram, we concluded that\nthese vocabularies generally exhibit very low morphological quality. In this\narticle, we apply this evaluation to the tokenizer of BETO, a BERT language\nmodel trained on large Spanish corpora. This evaluation, along with our\nprevious results, helped us conclude that its vocabulary has a low\nmorphological quality, and we also found that training the tokenizer in a\nlarger corpus does not improve the morphological quality of the generated\nvocabulary. Additionally, this evaluation helps clarify the algorithm used by\nthe tokenizer, that is, Wordpiece, given the inconsistencies between the\nauthors' claims and the model's configuration.\n","authors":["Óscar García-Sierra","Ana Fernández-Pampillón Cesteros","Miguel Ortega-Martín"],"pdf_url":"https://arxiv.org/pdf/2410.02283v1.pdf","comment":"in Spanish language"},{"id":"http://arxiv.org/abs/2406.11341v2","updated":"2024-10-03T08:07:01Z","published":"2024-06-17T08:59:04Z","title":"A Systematic Analysis of Large Language Models as Soft Reasoners: The\n Case of Syllogistic Inferences","summary":" The reasoning abilities of Large Language Models (LLMs) are becoming a\ncentral focus of study in NLP. In this paper, we consider the case of\nsyllogistic reasoning, an area of deductive reasoning studied extensively in\nlogic and cognitive psychology. 
Previous research has shown that pre-trained\nLLMs exhibit reasoning biases, such as $\\textit{content effects}$, avoid\nanswering that $\\textit{no conclusion follows}$, display human-like\ndifficulties, and struggle with multi-step reasoning. We contribute to this\nresearch line by systematically investigating the effects of chain-of-thought\nreasoning, in-context learning (ICL), and supervised fine-tuning (SFT) on\nsyllogistic reasoning, considering syllogisms with conclusions that support or\nviolate world knowledge, as well as ones with multiple premises. Crucially, we\ngo beyond the standard focus on accuracy, with an in-depth analysis of the\nconclusions generated by the models. Our results suggest that the behavior of\npre-trained LLMs can be explained by heuristics studied in cognitive science\nand that both ICL and SFT improve model performance on valid inferences,\nalthough only the latter mitigates most reasoning biases without harming model\nconsistency.\n","authors":["Leonardo Bertolazzi","Albert Gatt","Raffaella Bernardi"],"pdf_url":"https://arxiv.org/pdf/2406.11341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11016v2","updated":"2024-10-03T08:05:14Z","published":"2024-06-16T17:19:23Z","title":"Optimized Speculative Sampling for GPU Hardware Accelerators","summary":" In this work, we optimize speculative sampling for parallel hardware\naccelerators to improve sampling speed. We notice that substantial portions of\nthe intermediate matrices necessary for speculative sampling can be computed\nconcurrently. This allows us to distribute the workload across multiple GPU\nthreads, enabling simultaneous operations on matrix segments within thread\nblocks. This results in profiling time improvements ranging from 6% to 13%\nrelative to the baseline implementation, without compromising accuracy. To\nfurther accelerate speculative sampling, probability distributions\nparameterized by softmax are approximated by sigmoid. This approximation\napproach results in significantly greater relative improvements in profiling\ntime, ranging from 37% to 94%, with a minor decline in accuracy. We conduct\nextensive experiments on both automatic speech recognition and summarization\ntasks to validate the effectiveness of our optimization methods.\n","authors":["Dominik Wagner","Seanie Lee","Ilja Baumann","Philipp Seeberger","Korbinian Riedhammer","Tobias Bocklet"],"pdf_url":"https://arxiv.org/pdf/2406.11016v2.pdf","comment":"Accepted at EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.02281v1","updated":"2024-10-03T08:03:40Z","published":"2024-10-03T08:03:40Z","title":"Annotation Guidelines for Corpus Novelties: Part 1 -- Named Entity\n Recognition","summary":" The Novelties corpus is a collection of novels (and parts of novels)\nannotated for Named Entity Recognition (NER) among other tasks. This document\ndescribes the guidelines applied during its annotation. 
It contains the\ninstructions used by the annotators, as well as a number of examples retrieved\nfrom the annotated novels, and illustrating expressions that should be marked\nas entities as well as expressions that should not.\n","authors":["Arthur Amalvy","Vincent Labatut"],"pdf_url":"https://arxiv.org/pdf/2410.02281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19335v2","updated":"2024-10-03T07:59:36Z","published":"2024-04-30T08:01:49Z","title":"StablePT: Towards Stable Prompting for Few-shot Learning via Input\n Separation","summary":" Large language models have shown their ability to become effective few-shot\nlearners with prompting, revolutionizing the paradigm of learning with data\nscarcity. However, this approach largely depends on the quality of prompt\ninitialization, and always exhibits large variability among different runs.\nSuch property makes prompt tuning highly unreliable and vulnerable to poorly\nconstructed prompts, which limits its extension to more real-world\napplications. To tackle this issue, we propose to treat the hard prompt and\nsoft prompt as separate inputs to mitigate noise brought by the prompt\ninitialization. Furthermore, we optimize soft prompts with contrastive learning\nfor utilizing class-aware information in the training process to maintain model\nperformance. Experimental results demonstrate that \\sysname outperforms\nstate-of-the-art methods by 6.97% in accuracy and reduces the standard\ndeviation by 1.92 on average. Furthermore, extensive experiments underscore its\nrobustness and stability across 8 datasets covering various tasks. Codes are\navailable at https://github.com/lccc0528/Stable/tree/main.\n","authors":["Xiaoming Liu","Chen Liu","Zhaohan Zhang","Chengzhengxu Li","Longtian Wang","Yu Lan","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2404.19335v2.pdf","comment":"EMNLP 2024 Findings"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2410.02764v1","updated":"2024-10-03T17:59:59Z","published":"2024-10-03T17:59:59Z","title":"Flash-Splat: 3D Reflection Removal with Flash Cues and Gaussian Splats","summary":" We introduce a simple yet effective approach for separating transmitted and\nreflected light. Our key insight is that the powerful novel view synthesis\ncapabilities provided by modern inverse rendering methods (e.g.,~3D Gaussian\nsplatting) allow one to perform flash/no-flash reflection separation using\nunpaired measurements -- this relaxation dramatically simplifies image\nacquisition over conventional paired flash/no-flash reflection separation\nmethods. Through extensive real-world experiments, we demonstrate our method,\nFlash-Splat, accurately reconstructs both transmitted and reflected scenes in\n3D. Our method outperforms existing 3D reflection separation methods, which do\nnot leverage illumination control, by a large margin. Our project webpage is at\nhttps://flash-splat.github.io/.\n","authors":["Mingyang Xie","Haoming Cai","Sachin Shah","Yiran Xu","Brandon Y. Feng","Jia-Bin Huang","Christopher A. Metzler"],"pdf_url":"https://arxiv.org/pdf/2410.02764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02763v1","updated":"2024-10-03T17:59:58Z","published":"2024-10-03T17:59:58Z","title":"Vinoground: Scrutinizing LMMs over Dense Temporal Reasoning with Short\n Videos","summary":" There has been growing sentiment recently that modern large multimodal models\n(LMMs) have addressed most of the key challenges related to short video\ncomprehension. 
As a result, both academia and industry are gradually shifting\ntheir attention towards the more complex challenges posed by understanding\nlong-form videos. However, is this really the case? Our studies indicate that\nLMMs still lack many fundamental reasoning capabilities even when dealing with\nshort videos. We introduce Vinoground, a temporal counterfactual LMM evaluation\nbenchmark encompassing 1000 short and natural video-caption pairs. We\ndemonstrate that existing LMMs severely struggle to distinguish temporal\ndifferences between different actions and object transformations. For example,\nthe best model GPT-4o only obtains ~50% on our text and video scores, showing a\nlarge gap compared to the human baseline of ~90%. All open-source multimodal\nmodels and CLIP-based models perform much worse, producing mostly random chance\nperformance. Through this work, we shed light onto the fact that temporal\nreasoning in short videos is a problem yet to be fully solved. The dataset and\nevaluation code are available at https://vinoground.github.io.\n","authors":["Jianrui Zhang","Mu Cai","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2410.02763v1.pdf","comment":"Project Page: https://vinoground.github.io"},{"id":"http://arxiv.org/abs/2410.02762v1","updated":"2024-10-03T17:59:57Z","published":"2024-10-03T17:59:57Z","title":"Interpreting and Editing Vision-Language Representations to Mitigate\n Hallucinations","summary":" We investigate the internal representations of vision-language models (VLMs)\nto address hallucinations, a persistent challenge despite advances in model\nsize and training. We project VLMs' internal image representations to their\nlanguage vocabulary and observe more confident output probabilities on real\nobjects than hallucinated objects. We additionally use these output\nprobabilities to spatially localize real objects. Building on this approach, we\nintroduce a knowledge erasure algorithm that removes hallucinations by linearly\northogonalizing image features with respect to hallucinated object features. We\nshow that targeted edits to a model's latent representations can reduce\nhallucinations by up to 25.7% on the COCO2014 dataset while preserving\nperformance. Our findings demonstrate how a deeper understanding of VLMs'\nlatent representations can enhance reliability and enable novel capabilities,\nsuch as zero-shot segmentation.\n","authors":["Nick Jiang","Anish Kachinthaya","Suzie Petryk","Yossi Gandelsman"],"pdf_url":"https://arxiv.org/pdf/2410.02762v1.pdf","comment":"Project page and code: http://anishk23733.github.io/vl-interp/"},{"id":"http://arxiv.org/abs/2410.02761v1","updated":"2024-10-03T17:59:34Z","published":"2024-10-03T17:59:34Z","title":"FakeShield: Explainable Image Forgery Detection and Localization via\n Multi-modal Large Language Models","summary":" The rapid development of generative AI is a double-edged sword, which not\nonly facilitates content creation but also makes image manipulation easier and\nmore difficult to detect. Although current image forgery detection and\nlocalization (IFDL) methods are generally effective, they tend to face two\nchallenges: \\textbf{1)} black-box nature with unknown detection principle,\n\\textbf{2)} limited generalization across diverse tampering methods (e.g.,\nPhotoshop, DeepFake, AIGC-Editing). 
To address these issues, we propose the\nexplainable IFDL task and design FakeShield, a multi-modal framework capable of\nevaluating image authenticity, generating tampered region masks, and providing\na judgment basis based on pixel-level and image-level tampering clues.\nAdditionally, we leverage GPT-4o to enhance existing IFDL datasets, creating\nthe Multi-Modal Tamper Description dataSet (MMTD-Set) for training FakeShield's\ntampering analysis capabilities. Meanwhile, we incorporate a Domain Tag-guided\nExplainable Forgery Detection Module (DTE-FDM) and a Multi-modal Forgery\nLocalization Module (MFLM) to address various types of tamper detection\ninterpretation and achieve forgery localization guided by detailed textual\ndescriptions. Extensive experiments demonstrate that FakeShield effectively\ndetects and localizes various tampering techniques, offering an explainable and\nsuperior solution compared to previous IFDL methods.\n","authors":["Zhipei Xu","Xuanyu Zhang","Runyi Li","Zecheng Tang","Qing Huang","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.02761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17916v2","updated":"2024-10-03T17:59:25Z","published":"2024-03-26T17:53:27Z","title":"CMP: Cooperative Motion Prediction with Multi-Agent Communication","summary":" The confluence of the advancement of Autonomous Vehicles (AVs) and the\nmaturity of Vehicle-to-Everything (V2X) communication has enabled the\ncapability of cooperative connected and automated vehicles (CAVs). Building on\ntop of cooperative perception, this paper explores the feasibility and\neffectiveness of cooperative motion prediction. Our method, CMP, takes LiDAR\nsignals as model input to enhance tracking and prediction capabilities. Unlike\nprevious work that focuses separately on either cooperative perception or\nmotion prediction, our framework, to the best of our knowledge, is the first to\naddress the unified problem where CAVs share information in both perception and\nprediction modules. Incorporated into our design is the unique capability to\ntolerate realistic V2X bandwidth limitations and transmission delays, while\ndealing with bulky perception representations. We also propose a prediction\naggregation module, which unifies the predictions obtained by different CAVs\nand generates the final prediction. Through extensive experiments and ablation\nstudies on the OPV2V and V2V4Real datasets, we demonstrate the effectiveness of\nour method in cooperative perception, tracking, and motion prediction. In\nparticular, CMP reduces the average prediction error by 16.4\\% with fewer\nmissing detections compared with the no cooperation setting and by 12.3\\%\ncompared with the strongest baseline. Our work marks a significant step forward\nin the cooperative capabilities of CAVs, showcasing enhanced performance in\ncomplex scenarios. The code can be found on the project website:\nhttps://cmp-cooperative-prediction.github.io/.\n","authors":["Zehao Wang","Yuping Wang","Zhuoyuan Wu","Hengbo Ma","Zhaowei Li","Hang Qiu","Jiachen Li"],"pdf_url":"https://arxiv.org/pdf/2403.17916v2.pdf","comment":"Project website: https://cmp-cooperative-prediction.github.io/"},{"id":"http://arxiv.org/abs/2410.02757v1","updated":"2024-10-03T17:59:02Z","published":"2024-10-03T17:59:02Z","title":"Loong: Generating Minute-level Long Videos with Autoregressive Language\n Models","summary":" It is desirable but challenging to generate content-rich long videos in the\nscale of minutes. 
Autoregressive large language models (LLMs) have achieved\ngreat success in generating coherent and long sequences of tokens in the domain\nof natural language processing, while the exploration of autoregressive LLMs\nfor video generation is limited to generating short videos of several seconds.\nIn this work, we conduct a deep analysis of the challenges that prevent\nautoregressive LLM-based video generators from generating long videos. Based on\nthe observations and analysis, we propose Loong, a new autoregressive LLM-based\nvideo generator that can generate minute-long videos. Specifically, we model\nthe text tokens and video tokens as a unified sequence for autoregressive LLMs\nand train the model from scratch. We propose progressive short-to-long training\nwith a loss re-weighting scheme to mitigate the loss imbalance problem for long\nvideo training. We further investigate inference strategies, including video\ntoken re-encoding and sampling strategies, to diminish error accumulation\nduring inference. Our proposed Loong can be trained on 10-second videos and be\nextended to generate minute-level long videos conditioned on text prompts, as\ndemonstrated by the results. More samples are available at:\nhttps://epiphqny.github.io/Loong-video.\n","authors":["Yuqing Wang","Tianwei Xiong","Daquan Zhou","Zhijie Lin","Yang Zhao","Bingyi Kang","Jiashi Feng","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2410.02757v1.pdf","comment":"Project page: https://epiphqny.github.io/Loong-video/"},{"id":"http://arxiv.org/abs/2307.08695v3","updated":"2024-10-03T17:58:03Z","published":"2023-07-17T17:57:01Z","title":"NVDS+: Towards Efficient and Versatile Neural Stabilizer for Video Depth\n Estimation","summary":" Video depth estimation aims to infer temporally consistent depth. One\napproach is to finetune a single-image model on each video with geometry\nconstraints, which proves inefficient and lacks robustness. An alternative is\nlearning to enforce consistency from data, which requires well-designed models\nand sufficient video depth data. To address both challenges, we introduce NVDS+\nthat stabilizes inconsistent depth estimated by various single-image models in\na plug-and-play manner. We also elaborate a large-scale Video Depth in the Wild\n(VDW) dataset, which contains 14,203 videos with over two million frames,\nmaking it the largest natural-scene video depth dataset. Additionally, a\nbidirectional inference strategy is designed to improve consistency by\nadaptively fusing forward and backward predictions. We instantiate a model\nfamily ranging from small to large scales for different applications. The\nmethod is evaluated on VDW dataset and three public benchmarks. To further\nprove the versatility, we extend NVDS+ to video semantic segmentation and\nseveral downstream applications like bokeh rendering, novel view synthesis, and\n3D reconstruction. Experimental results show that our method achieves\nsignificant improvements in consistency, accuracy, and efficiency. Our work\nserves as a solid baseline and data foundation for learning-based video depth\nestimation. 
Code and dataset are available at:\nhttps://github.com/RaymondWang987/NVDS\n","authors":["Yiran Wang","Min Shi","Jiaqi Li","Chaoyi Hong","Zihao Huang","Juewen Peng","Zhiguo Cao","Jianming Zhang","Ke Xian","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2307.08695v3.pdf","comment":"V1/V2: ICCV 2023 accepted; V3: the journal extension accepted by IEEE\n TPAMI 2024"},{"id":"http://arxiv.org/abs/2404.05717v3","updated":"2024-10-03T17:56:42Z","published":"2024-04-08T17:52:29Z","title":"SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual\n Editing","summary":" Effective editing of personal content holds a pivotal role in enabling\nindividuals to express their creativity, weaving captivating narratives within\ntheir visual stories, and elevate the overall quality and impact of their\nvisual content. Therefore, in this work, we introduce SwapAnything, a novel\nframework that can swap any objects in an image with personalized concepts\ngiven by the reference, while keeping the context unchanged. Compared with\nexisting methods for personalized subject swapping, SwapAnything has three\nunique advantages: (1) precise control of arbitrary objects and parts rather\nthan the main subject, (2) more faithful preservation of context pixels, (3)\nbetter adaptation of the personalized concept to the image. First, we propose\ntargeted variable swapping to apply region control over latent feature maps and\nswap masked variables for faithful context preservation and initial semantic\nconcept swapping. Then, we introduce appearance adaptation, to seamlessly adapt\nthe semantic concept into the original image in terms of target location,\nshape, style, and content during the image generation process. Extensive\nresults on both human and automatic evaluation demonstrate significant\nimprovements of our approach over baseline methods on personalized swapping.\nFurthermore, SwapAnything shows its precise and faithful swapping abilities\nacross single object, multiple objects, partial object, and cross-domain\nswapping tasks. SwapAnything also achieves great performance on text-based\nswapping and tasks beyond swapping such as object insertion.\n","authors":["Jing Gu","Nanxuan Zhao","Wei Xiong","Qing Liu","Zhifei Zhang","He Zhang","Jianming Zhang","HyunJoon Jung","Yilin Wang","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2404.05717v3.pdf","comment":"ECCV 2024, 23 pages, 14 figures, 3 tables"},{"id":"http://arxiv.org/abs/2410.02746v1","updated":"2024-10-03T17:56:09Z","published":"2024-10-03T17:56:09Z","title":"Contrastive Localized Language-Image Pre-Training","summary":" Contrastive Language-Image Pre-training (CLIP) has been a celebrated method\nfor training vision encoders to generate image/text representations\nfacilitating various applications. Recently, CLIP has been widely adopted as\nthe vision backbone of multimodal large language models (MLLMs) to connect\nimage inputs for language interactions. The success of CLIP as a\nvision-language foundation model relies on aligning web-crawled noisy text\nannotations at image levels. Nevertheless, such criteria may become\ninsufficient for downstream tasks in need of fine-grained vision\nrepresentations, especially when region-level understanding is demanding for\nMLLMs. In this paper, we improve the localization capability of CLIP with\nseveral advances. We propose a pre-training method called Contrastive Localized\nLanguage-Image Pre-training (CLOC) by complementing CLIP with region-text\ncontrastive loss and modules. 
We formulate a new concept, promptable\nembeddings, of which the encoder produces image embeddings easy to transform\ninto region representations given spatial hints. To support large-scale\npre-training, we design a visually-enriched and spatially-localized captioning\nframework to effectively generate region-text pseudo-labels at scale. By\nscaling up to billions of annotated images, CLOC enables high-quality regional\nembeddings for image region recognition and retrieval tasks, and can be a\ndrop-in replacement of CLIP to enhance MLLMs, especially on referring and\ngrounding tasks.\n","authors":["Hong-You Chen","Zhengfeng Lai","Haotian Zhang","Xinze Wang","Marcin Eichner","Keen You","Meng Cao","Bowen Zhang","Yinfei Yang","Zhe Gan"],"pdf_url":"https://arxiv.org/pdf/2410.02746v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.02740v1","updated":"2024-10-03T17:54:52Z","published":"2024-10-03T17:54:52Z","title":"Revisit Large-Scale Image-Caption Data in Pre-training Multimodal\n Foundation Models","summary":" Recent advancements in multimodal models highlight the value of rewritten\ncaptions for improving performance, yet key challenges remain. For example,\nwhile synthetic captions often provide superior quality and image-text\nalignment, it is not clear whether they can fully replace AltTexts: the role of\nsynthetic captions and their interaction with original web-crawled AltTexts in\npre-training is still not well understood. Moreover, different multimodal\nfoundation models may have unique preferences for specific caption formats, but\nefforts to identify the optimal captions for each model remain limited. In this\nwork, we propose a novel, controllable, and scalable captioning pipeline\ndesigned to generate diverse caption formats tailored to various multimodal\nmodels. By examining Short Synthetic Captions (SSC) towards Dense Synthetic\nCaptions (DSC+) as case studies, we systematically explore their effects and\ninteractions with AltTexts across models such as CLIP, multimodal LLMs, and\ndiffusion models. Our findings reveal that a hybrid approach that keeps both\nsynthetic captions and AltTexts can outperform the use of synthetic captions\nalone, improving both alignment and performance, with each model demonstrating\npreferences for particular caption formats. This comprehensive analysis\nprovides valuable insights into optimizing captioning strategies, thereby\nadvancing the pre-training of multimodal foundation models.\n","authors":["Zhengfeng Lai","Vasileios Saveris","Chen Chen","Hong-You Chen","Haotian Zhang","Bowen Zhang","Juan Lao Tebar","Wenze Hu","Zhe Gan","Peter Grasch","Meng Cao","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2410.02740v1.pdf","comment":"CV/ML"},{"id":"http://arxiv.org/abs/2303.17051v3","updated":"2024-10-03T17:53:04Z","published":"2023-03-29T22:50:05Z","title":"Towards Foundation Models and Few-Shot Parameter-Efficient Fine-Tuning\n for Volumetric Organ Segmentation","summary":" The recent popularity of foundation models and the pre-train-and-adapt\nparadigm, where a large-scale model is transferred to downstream tasks, is\ngaining attention for volumetric medical image segmentation. However, current\ntransfer learning strategies devoted to full fine-tuning for transfer learning\nmay require significant resources and yield sub-optimal results when the\nlabeled data of the target task is scarce. 
This makes its applicability in real\nclinical settings challenging since these institutions are usually constrained\non data and computational resources to develop proprietary solutions. To\naddress this challenge, we formalize Few-Shot Efficient Fine-Tuning (FSEFT), a\nnovel and realistic scenario for adapting medical image segmentation foundation\nmodels. This setting considers the key role of both data- and parameter-\nefficiency during adaptation. Building on a foundation model pre-trained on\nopen-access CT organ segmentation sources, we propose leveraging\nParameter-Efficient Fine-Tuning and black-box Adapters to address such\nchallenges. Furthermore, novel efficient adaptation methodologies are\nintroduced in this work, which include Spatial black-box Adapters that are more\nappropriate for dense prediction tasks and constrained transductive inference,\nleveraging task-specific prior knowledge. Our comprehensive transfer learning\nexperiments confirm the suitability of foundation models in medical image\nsegmentation and unveil the limitations of popular fine-tuning strategies in\nfew-shot scenarios.\n","authors":["Julio Silva-Rodríguez","Jose Dolz","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2303.17051v3.pdf","comment":"Journal Extension of MICCAI - MedAGI Workshop 2023. Code in\n https://github.com/jusiro/fewshot-finetuning"},{"id":"http://arxiv.org/abs/2410.02730v1","updated":"2024-10-03T17:49:28Z","published":"2024-10-03T17:49:28Z","title":"DivScene: Benchmarking LVLMs for Object Navigation with Diverse Scenes\n and Objects","summary":" Object navigation in unknown environments is crucial for deploying embodied\nagents in real-world applications. While we have witnessed huge progress due to\nlarge-scale scene datasets, faster simulators, and stronger models, previous\nstudies mainly focus on limited scene types and target objects. In this paper,\nwe study a new task of navigating to diverse target objects in a large number\nof scene types. To benchmark the problem, we present a large-scale scene\ndataset, DivScene, which contains 4,614 scenes across 81 different types. With\nthe dataset, we build an end-to-end embodied agent, NatVLM, by fine-tuning a\nLarge Vision Language Model (LVLM) through imitation learning. The LVLM is\ntrained to take previous observations from the environment and generate the\nnext actions. We also introduce CoT explanation traces of the action prediction\nfor better performance when tuning LVLMs. Our extensive experiments find that\nwe can build a performant LVLM-based agent through imitation learning on the\nshortest paths constructed by a BFS planner without any human supervision. Our\nagent achieves a success rate that surpasses GPT-4o by over 20%. Meanwhile, we\ncarry out various analyses showing the generalization ability of our agent.\n","authors":["Zhaowei Wang","Hongming Zhang","Tianqing Fang","Ye Tian","Yue Yang","Kaixin Ma","Xiaoman Pan","Yangqiu Song","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2410.02730v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2404.10710v3","updated":"2024-10-03T17:46:40Z","published":"2024-04-16T16:36:50Z","title":"Autoregressive Pre-Training on Pixels and Texts","summary":" The integration of visual and textual information represents a promising\ndirection in the advancement of language models. In this paper, we explore the\ndual modality of language--both visual and textual--within an autoregressive\nframework, pre-trained on both document images and texts. 
Our method employs a\nmultimodal training strategy, utilizing visual data through next patch\nprediction with a regression head and/or textual data through next token\nprediction with a classification head. We focus on understanding the\ninteraction between these two modalities and their combined impact on model\nperformance. Our extensive evaluation across a wide range of benchmarks shows\nthat incorporating both visual and textual data significantly improves the\nperformance of pixel-based language models. Remarkably, we find that a\nunidirectional pixel-based model trained solely on visual data can achieve\ncomparable results to state-of-the-art bidirectional models on several language\nunderstanding tasks. This work uncovers the untapped potential of integrating\nvisual and textual modalities for more effective language modeling. We release\nour code, data, and model checkpoints at\n\\url{https://github.com/ernie-research/pixelgpt}.\n","authors":["Yekun Chai","Qingyi Liu","Jingwu Xiao","Shuohuan Wang","Yu Sun","Hua Wu"],"pdf_url":"https://arxiv.org/pdf/2404.10710v3.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.02720v1","updated":"2024-10-03T17:39:55Z","published":"2024-10-03T17:39:55Z","title":"Curvature Diversity-Driven Deformation and Domain Alignment for Point\n Cloud","summary":" Unsupervised Domain Adaptation (UDA) is crucial for reducing the need for\nextensive manual data annotation when training deep networks on point cloud\ndata. A significant challenge of UDA lies in effectively bridging the domain\ngap. To tackle this challenge, we propose \\textbf{C}urvature\n\\textbf{D}iversity-Driven \\textbf{N}uclear-Norm Wasserstein \\textbf{D}omain\nAlignment (CDND). Our approach first introduces a \\textit{\\textbf{Curv}ature\nDiversity-driven Deformation \\textbf{Rec}onstruction (CurvRec)} task, which\neffectively mitigates the gap between the source and target domains by enabling\nthe model to extract salient features from semantically rich regions of a given\npoint cloud. We then propose \\textit{\\textbf{D}eformation-based\n\\textbf{N}uclear-norm \\textbf{W}asserstein \\textbf{D}iscrepancy (D-NWD)}, which\napplies the Nuclear-norm Wasserstein Discrepancy to both \\textit{deformed and\noriginal} data samples to align the source and target domains. Furthermore, we\ncontribute a theoretical justification for the effectiveness of D-NWD in\ndistribution alignment and demonstrate that it is \\textit{generic} enough to be\napplied to \\textbf{any} deformations. To validate our method, we conduct\nextensive experiments on two public domain adaptation datasets for point cloud\nclassification and segmentation tasks. Empirical experiment results show that\nour CDND achieves state-of-the-art performance by a noticeable margin over\nexisting approaches.\n","authors":["Mengxi Wu","Hao Huang","Yi Fang","Mohammad Rostami"],"pdf_url":"https://arxiv.org/pdf/2410.02720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02714v1","updated":"2024-10-03T17:37:18Z","published":"2024-10-03T17:37:18Z","title":"AlzhiNet: Traversing from 2DCNN to 3DCNN, Towards Early Detection and\n Diagnosis of Alzheimer's Disease","summary":" Alzheimer's disease (AD) is a progressive neurodegenerative disorder with\nincreasing prevalence among the aging population, necessitating early and\naccurate diagnosis for effective disease management. 
In this study, we present\na novel hybrid deep learning framework that integrates both 2D Convolutional\nNeural Networks (2D-CNN) and 3D Convolutional Neural Networks (3D-CNN), along\nwith a custom loss function and volumetric data augmentation, to enhance\nfeature extraction and improve classification performance in AD diagnosis.\nAccording to extensive experiments, AlzhiNet outperforms standalone 2D and 3D\nmodels, highlighting the importance of combining these complementary\nrepresentations of data. The depth and quality of 3D volumes derived from the\naugmented 2D slices also significantly influence the model's performance. The\nresults indicate that carefully selecting weighting factors in hybrid\npredictions is imperative for achieving optimal results. Our framework has been\nvalidated on the Magnetic Resonance Imaging (MRI) from Kaggle and MIRIAD\ndatasets, obtaining accuracies of 98.9% and 99.99%, respectively, with an AUC\nof 100%. Furthermore, AlzhiNet was studied under a variety of perturbation\nscenarios on the Alzheimer's Kaggle dataset, including Gaussian noise,\nbrightness, contrast, salt and pepper noise, color jitter, and occlusion. The\nresults obtained show that AlzhiNet is more robust to perturbations than\nResNet-18, making it an excellent choice for real-world applications. This\napproach represents a promising advancement in the early diagnosis and\ntreatment planning for Alzheimer's disease.\n","authors":["Romoke Grace Akindele","Samuel Adebayo","Paul Shekonya Kanda","Ming Yu"],"pdf_url":"https://arxiv.org/pdf/2410.02714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02713v1","updated":"2024-10-03T17:36:49Z","published":"2024-10-03T17:36:49Z","title":"Video Instruction Tuning With Synthetic Data","summary":" The development of video large multimodal models (LMMs) has been hindered by\nthe difficulty of curating large amounts of high-quality raw data from the web.\nTo address this, we propose an alternative approach by creating a high-quality\nsynthetic dataset specifically for video instruction-following, namely\nLLaVA-Video-178K. This dataset includes key tasks such as detailed captioning,\nopen-ended question-answering (QA), and multiple-choice QA. By training on this\ndataset, in combination with existing visual instruction tuning data, we\nintroduce LLaVA-Video, a new video LMM. Our experiments demonstrate that\nLLaVA-Video achieves strong performance across various video benchmarks,\nhighlighting the effectiveness of our dataset. We plan to release the dataset,\nits generation pipeline, and the model checkpoints.\n","authors":["Yuanhan Zhang","Jinming Wu","Wei Li","Bo Li","Zejun Ma","Ziwei Liu","Chunyuan Li"],"pdf_url":"https://arxiv.org/pdf/2410.02713v1.pdf","comment":"Project page: https://llava-vl.github.io/blog/2024-09-30-llava-video/"},{"id":"http://arxiv.org/abs/2410.02712v1","updated":"2024-10-03T17:36:33Z","published":"2024-10-03T17:36:33Z","title":"LLaVA-Critic: Learning to Evaluate Multimodal Models","summary":" We introduce LLaVA-Critic, the first open-source large multimodal model (LMM)\ndesigned as a generalist evaluator to assess performance across a wide range of\nmultimodal tasks. LLaVA-Critic is trained using a high-quality critic\ninstruction-following dataset that incorporates diverse evaluation criteria and\nscenarios. 
Our experiments demonstrate the model's effectiveness in two key\nareas: (1) LMM-as-a-Judge, where LLaVA-Critic provides reliable evaluation\nscores, performing on par with or surpassing GPT models on multiple evaluation\nbenchmarks; and (2) Preference Learning, where it generates reward signals for\npreference learning, enhancing model alignment capabilities. This work\nunderscores the potential of open-source LMMs in self-critique and evaluation,\nsetting the stage for future research into scalable, superhuman alignment\nfeedback mechanisms for LMMs.\n","authors":["Tianyi Xiong","Xiyao Wang","Dong Guo","Qinghao Ye","Haoqi Fan","Quanquan Gu","Heng Huang","Chunyuan Li"],"pdf_url":"https://arxiv.org/pdf/2410.02712v1.pdf","comment":"Project Page: https://llava-vl.github.io/blog/2024-10-03-llava-critic"},{"id":"http://arxiv.org/abs/2410.02710v1","updated":"2024-10-03T17:34:55Z","published":"2024-10-03T17:34:55Z","title":"SteerDiff: Steering towards Safe Text-to-Image Diffusion Models","summary":" Text-to-image (T2I) diffusion models have drawn attention for their ability\nto generate high-quality images with precise text alignment. However, these\nmodels can also be misused to produce inappropriate content. Existing safety\nmeasures, which typically rely on text classifiers or ControlNet-like\napproaches, are often insufficient. Traditional text classifiers rely on\nlarge-scale labeled datasets and can be easily bypassed by rephrasing. As\ndiffusion models continue to scale, fine-tuning these safeguards becomes\nincreasingly challenging and lacks flexibility. Recent red-teaming attack\nresearches further underscore the need for a new paradigm to prevent the\ngeneration of inappropriate content. In this paper, we introduce SteerDiff, a\nlightweight adaptor module designed to act as an intermediary between user\ninput and the diffusion model, ensuring that generated images adhere to ethical\nand safety standards with little to no impact on usability. SteerDiff\nidentifies and manipulates inappropriate concepts within the text embedding\nspace to guide the model away from harmful outputs. We conduct extensive\nexperiments across various concept unlearning tasks to evaluate the\neffectiveness of our approach. Furthermore, we benchmark SteerDiff against\nmultiple red-teaming strategies to assess its robustness. Finally, we explore\nthe potential of SteerDiff for concept forgetting tasks, demonstrating its\nversatility in text-conditioned image generation.\n","authors":["Hongxiang Zhang","Yifeng He","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02705v1","updated":"2024-10-03T17:28:07Z","published":"2024-10-03T17:28:07Z","title":"ControlAR: Controllable Image Generation with Autoregressive Models","summary":" Autoregressive (AR) models have reformulated image generation as next-token\nprediction, demonstrating remarkable potential and emerging as strong\ncompetitors to diffusion models. However, control-to-image generation, akin to\nControlNet, remains largely unexplored within AR models. Although a natural\napproach, inspired by advancements in Large Language Models, is to tokenize\ncontrol images into tokens and prefill them into the autoregressive model\nbefore decoding image tokens, it still falls short in generation quality\ncompared to ControlNet and suffers from inefficiency. To this end, we introduce\nControlAR, an efficient and effective framework for integrating spatial\ncontrols into autoregressive image generation models. 
Firstly, we explore\ncontrol encoding for AR models and propose a lightweight control encoder to\ntransform spatial inputs (e.g., canny edges or depth maps) into control tokens.\nThen ControlAR exploits the conditional decoding method to generate the next\nimage token conditioned on the per-token fusion between control and image\ntokens, similar to positional encodings. Compared to prefilling tokens, using\nconditional decoding significantly strengthens the control capability of AR\nmodels but also maintains the model's efficiency. Furthermore, the proposed\nControlAR surprisingly empowers AR models with arbitrary-resolution image\ngeneration via conditional decoding and specific controls. Extensive\nexperiments can demonstrate the controllability of the proposed ControlAR for\nthe autoregressive control-to-image generation across diverse inputs, including\nedges, depths, and segmentation masks. Furthermore, both quantitative and\nqualitative results indicate that ControlAR surpasses previous state-of-the-art\ncontrollable diffusion models, e.g., ControlNet++. Code, models, and demo will\nsoon be available at https://github.com/hustvl/ControlAR.\n","authors":["Zongming Li","Tianheng Cheng","Shoufa Chen","Peize Sun","Haocheng Shen","Longjin Ran","Xiaoxin Chen","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2410.02705v1.pdf","comment":"Preprint. Work in progress"},{"id":"http://arxiv.org/abs/2406.03520v2","updated":"2024-10-03T17:24:40Z","published":"2024-06-05T17:53:55Z","title":"VideoPhy: Evaluating Physical Commonsense for Video Generation","summary":" Recent advances in internet-scale video data pretraining have led to the\ndevelopment of text-to-video generative models that can create high-quality\nvideos across a broad range of visual concepts, synthesize realistic motions\nand render complex objects. Hence, these generative models have the potential\nto become general-purpose simulators of the physical world. However, it is\nunclear how far we are from this goal with the existing text-to-video\ngenerative models. To this end, we present VideoPhy, a benchmark designed to\nassess whether the generated videos follow physical commonsense for real-world\nactivities (e.g. marbles will roll down when placed on a slanted surface).\nSpecifically, we curate diverse prompts that involve interactions between\nvarious material types in the physical world (e.g., solid-solid, solid-fluid,\nfluid-fluid). We then generate videos conditioned on these captions from\ndiverse state-of-the-art text-to-video generative models, including open models\n(e.g., CogVideoX) and closed models (e.g., Lumiere, Dream Machine). Our human\nevaluation reveals that the existing models severely lack the ability to\ngenerate videos adhering to the given text prompts, while also lack physical\ncommonsense. Specifically, the best performing model, CogVideoX-5B, generates\nvideos that adhere to the caption and physical laws for 39.6% of the instances.\nVideoPhy thus highlights that the video generative models are far from\naccurately simulating the physical world. Finally, we propose an\nauto-evaluator, VideoCon-Physics, to assess the performance reliably for the\nnewly released models.\n","authors":["Hritik Bansal","Zongyu Lin","Tianyi Xie","Zeshun Zong","Michal Yarom","Yonatan Bitton","Chenfanfu Jiang","Yizhou Sun","Kai-Wei Chang","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2406.03520v2.pdf","comment":"43 pages, 29 figures, 12 tables. 
Added CogVideo and Dream Machine in\n v2"},{"id":"http://arxiv.org/abs/2410.02698v1","updated":"2024-10-03T17:21:30Z","published":"2024-10-03T17:21:30Z","title":"Lie Algebra Canonicalization: Equivariant Neural Operators under\n arbitrary Lie Groups","summary":" The quest for robust and generalizable machine learning models has driven\nrecent interest in exploiting symmetries through equivariant neural networks.\nIn the context of PDE solvers, recent works have shown that Lie point\nsymmetries can be a useful inductive bias for Physics-Informed Neural Networks\n(PINNs) through data and loss augmentation. Despite this, directly enforcing\nequivariance within the model architecture for these problems remains elusive.\nThis is because many PDEs admit non-compact symmetry groups, oftentimes not\nstudied beyond their infinitesimal generators, making them incompatible with\nmost existing equivariant architectures. In this work, we propose Lie aLgebrA\nCanonicalization (LieLAC), a novel approach that exploits only the action of\ninfinitesimal generators of the symmetry group, circumventing the need for\nknowledge of the full group structure. To achieve this, we address existing\ntheoretical issues in the canonicalization literature, establishing connections\nwith frame averaging in the case of continuous non-compact groups. Operating\nwithin the framework of canonicalization, LieLAC can easily be integrated with\nunconstrained pre-trained models, transforming inputs to a canonical form\nbefore feeding them into the existing model, effectively aligning the input for\nmodel inference according to allowed symmetries. LieLAC utilizes standard Lie\ngroup descent schemes, achieving equivariance in pre-trained models. Finally,\nwe showcase LieLAC's efficacy on tasks of invariant image classification and\nLie point symmetry equivariant neural PDE solvers using pre-trained models.\n","authors":["Zakhar Shumaylov","Peter Zaika","James Rowbottom","Ferdia Sherry","Melanie Weber","Carola-Bibiane Schönlieb"],"pdf_url":"https://arxiv.org/pdf/2410.02698v1.pdf","comment":"40 pages; preprint"},{"id":"http://arxiv.org/abs/2310.10224v4","updated":"2024-10-03T17:13:41Z","published":"2023-10-16T09:34:06Z","title":"Generalizing Medical Image Representations via Quaternion Wavelet\n Networks","summary":" Neural network generalizability is becoming a broad research field due to the\nincreasing availability of datasets from different sources and for various\ntasks. This issue is even wider when processing medical data, where a lack of\nmethodological standards causes large variations being provided by different\nimaging centers or acquired with various devices and cofactors. To overcome\nthese limitations, we introduce a novel, generalizable, data- and task-agnostic\nframework able to extract salient features from medical images. The proposed\nquaternion wavelet network (QUAVE) can be easily integrated with any\npre-existing medical image analysis or synthesis task, and it can be involved\nwith real, quaternion, or hypercomplex-valued models, generalizing their\nadoption to single-channel data. QUAVE first extracts different sub-bands\nthrough the quaternion wavelet transform, resulting in both\nlow-frequency/approximation bands and high-frequency/fine-grained features.\nThen, it weighs the most representative set of sub-bands to be involved as\ninput to any other neural model for image processing, replacing standard data\nsamples. 
We conduct an extensive experimental evaluation comprising different\ndatasets, diverse image analysis, and synthesis tasks including reconstruction,\nsegmentation, and modality translation. We also evaluate QUAVE in combination\nwith both real and quaternion-valued models. Results demonstrate the\neffectiveness and the generalizability of the proposed framework that improves\nnetwork performance while being flexible to be adopted in manifold scenarios\nand robust to domain shifts. The full code is available at:\nhttps://github.com/ispamm/QWT.\n","authors":["Luigi Sigillo","Eleonora Grassucci","Aurelio Uncini","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2310.10224v4.pdf","comment":"This paper is currently under review"},{"id":"http://arxiv.org/abs/2403.10390v2","updated":"2024-10-03T17:10:22Z","published":"2024-03-15T15:21:04Z","title":"Evaluating Perceptual Distance Models by Fitting Binomial Distributions\n to Two-Alternative Forced Choice Data","summary":" The two-alternative forced choice (2AFC) experimental method is popular in\nthe visual perception literature, where practitioners aim to understand how\nhuman observers perceive distances within triplets made of a reference image\nand two distorted versions. In the past, this had been conducted in controlled\nenvironments, with triplets sharing images, so it was possible to rank the\nperceived quality. This ranking would then be used to evaluate perceptual\ndistance models against the experimental data. Recently, crowd-sourced\nperceptual datasets have emerged, with no images shared between triplets,\nmaking ranking infeasible. Evaluating perceptual distance models using this\ndata reduces the judgements on a triplet to a binary decision, namely, whether\nthe distance model agrees with the human decision - which is suboptimal and\nprone to misleading conclusions. Instead, we statistically model the underlying\ndecision-making process during 2AFC experiments using a binomial distribution.\nHaving enough empirical data, we estimate a smooth and consistent distribution\nof the judgements on the reference-distorted distance plane, according to each\ndistance model. By applying maximum likelihood, we estimate the parameter of\nthe local binomial distribution, and a global measurement of the expected\nlog-likelihood of the measured responses. We calculate meaningful and\nwell-founded metrics for the distance model, beyond the mere prediction\naccuracy as percentage agreement, even with variable numbers of judgements per\ntriplet -- key advantages over both classical and neural network methods.\n","authors":["Alexander Hepburn","Raul Santos-Rodriguez","Javier Portilla"],"pdf_url":"https://arxiv.org/pdf/2403.10390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02671v1","updated":"2024-10-03T16:54:35Z","published":"2024-10-03T16:54:35Z","title":"Unsupervised Point Cloud Completion through Unbalanced Optimal Transport","summary":" Unpaired point cloud completion explores methods for learning a completion\nmap from unpaired incomplete and complete point cloud data. In this paper, we\npropose a novel approach for unpaired point cloud completion using the\nunbalanced optimal transport map, called Unbalanced Optimal Transport Map for\nUnpaired Point Cloud Completion (UOT-UPC). 
We demonstrate that the unpaired\npoint cloud completion can be naturally interpreted as the Optimal Transport\n(OT) problem and introduce the Unbalanced Optimal Transport (UOT) approach to\naddress the class imbalance problem, which is prevalent in unpaired point cloud\ncompletion datasets. Moreover, we analyze the appropriate cost function for\nunpaired completion tasks. This analysis shows that the InfoCD cost function is\nparticularly well-suited for this task. Our model is the first attempt to\nleverage UOT for unpaired point cloud completion, achieving competitive or\nsuperior results on both single-category and multi-category datasets. In\nparticular, our model is especially effective in scenarios with class\nimbalance, where the proportions of categories are different between the\nincomplete and complete point cloud datasets.\n","authors":["Taekyung Lee","Jaemoo Choi","Jaewoong Choi"],"pdf_url":"https://arxiv.org/pdf/2410.02671v1.pdf","comment":"20 pages, 10 figures"},{"id":"http://arxiv.org/abs/2410.02653v1","updated":"2024-10-03T16:36:35Z","published":"2024-10-03T16:36:35Z","title":"Measuring and Improving Persuasiveness of Generative Models","summary":" LLMs are increasingly being used in workflows involving generating content to\nbe consumed by humans (e.g., marketing) and also in directly interacting with\nhumans (e.g., through chatbots). The development of such systems that are\ncapable of generating verifiably persuasive messages presents both\nopportunities and challenges for society. On the one hand, such systems could\npositively impact domains like advertising and social good, such as addressing\ndrug addiction, and on the other, they could be misused for spreading\nmisinformation and shaping political opinions. To channel LLMs' impact on\nsociety, we need to develop systems to measure and benchmark their\npersuasiveness. With this motivation, we introduce PersuasionBench and\nPersuasionArena, the first large-scale benchmark and arena containing a battery\nof tasks to measure the persuasion ability of generative models automatically.\nWe investigate to what extent LLMs know and leverage linguistic patterns that\ncan help them generate more persuasive language. Our findings indicate that the\npersuasiveness of LLMs correlates positively with model size, but smaller\nmodels can also be made to have a higher persuasiveness than much larger\nmodels. Notably, targeted training using synthetic and natural datasets\nsignificantly enhances smaller models' persuasive capabilities, challenging\nscale-dependent assumptions. Our findings carry key implications for both model\ndevelopers and policymakers. For instance, while the EU AI Act and California's\nSB-1047 aim to regulate AI models based on the number of floating point\noperations, we demonstrate that simple metrics like this alone fail to capture\nthe full scope of AI's societal impact. 
We invite the community to explore and\ncontribute to PersuasionArena and PersuasionBench, available at\nhttps://bit.ly/measure-persuasion, to advance our understanding of AI-driven\npersuasion and its societal implications.\n","authors":["Somesh Singh","Yaman K Singla","Harini SI","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2410.02653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02646v1","updated":"2024-10-03T16:31:28Z","published":"2024-10-03T16:31:28Z","title":"Learning 3D Perception from Others' Predictions","summary":" Accurate 3D object detection in real-world environments requires a huge\namount of annotated data with high quality. Acquiring such data is tedious and\nexpensive, and often needs repeated effort when a new sensor is adopted or when\nthe detector is deployed in a new environment. We investigate a new scenario to\nconstruct 3D object detectors: learning from the predictions of a nearby unit\nthat is equipped with an accurate detector. For example, when a self-driving\ncar enters a new area, it may learn from other traffic participants whose\ndetectors have been optimized for that area. This setting is label-efficient,\nsensor-agnostic, and communication-efficient: nearby units only need to share\nthe predictions with the ego agent (e.g., car). Naively using the received\npredictions as ground-truths to train the detector for the ego car, however,\nleads to inferior performance. We systematically study the problem and identify\nviewpoint mismatches and mislocalization (due to synchronization and GPS\nerrors) as the main causes, which unavoidably result in false positives, false\nnegatives, and inaccurate pseudo labels. We propose a distance-based\ncurriculum, first learning from closer units with similar viewpoints and\nsubsequently improving the quality of other units' predictions via\nself-training. We further demonstrate that an effective pseudo label refinement\nmodule can be trained with a handful of annotated data, largely reducing the\ndata quantity necessary to train an object detector. We validate our approach\non the recently released real-world collaborative driving dataset, using\nreference cars' predictions as pseudo labels for the ego car. Extensive\nexperiments including several scenarios (e.g., different sensors, detectors,\nand domains) demonstrate the effectiveness of our approach toward\nlabel-efficient learning of 3D perception from other units' predictions.\n","authors":["Jinsu Yoo","Zhenyang Feng","Tai-Yu Pan","Yihong Sun","Cheng Perng Phoo","Xiangyu Chen","Mark Campbell","Kilian Q. Weinberger","Bharath Hariharan","Wei-Lun Chao"],"pdf_url":"https://arxiv.org/pdf/2410.02646v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2410.02643v1","updated":"2024-10-03T16:29:47Z","published":"2024-10-03T16:29:47Z","title":"Why Sample Space Matters: Keyframe Sampling Optimization for LiDAR-based\n Place Recognition","summary":" Recent advances in robotics are pushing real-world autonomy, enabling robots\nto perform long-term and large-scale missions. 
A crucial component for\nsuccessful missions is the incorporation of loop closures through place\nrecognition, which effectively mitigates accumulated pose estimation drift.\nDespite computational advancements, optimizing performance for real-time\ndeployment remains challenging, especially in resource-constrained mobile\nrobots and multi-robot systems since, conventional keyframe sampling practices\nin place recognition often result in retaining redundant information or\noverlooking relevant data, as they rely on fixed sampling intervals or work\ndirectly in the 3D space instead of the feature space. To address these\nconcerns, we introduce the concept of sample space in place recognition and\ndemonstrate how different sampling techniques affect the query process and\noverall performance. We then present a novel keyframe sampling approach for\nLiDAR-based place recognition, which focuses on redundancy minimization and\ninformation preservation in the hyper-dimensional descriptor space. This\napproach is applicable to both learning-based and handcrafted descriptors, and\nthrough the experimental validation across multiple datasets and descriptor\nframeworks, we demonstrate the effectiveness of our proposed method, showing it\ncan jointly minimize redundancy and preserve essential information in\nreal-time. The proposed approach maintains robust performance across various\ndatasets without requiring parameter tuning, contributing to more efficient and\nreliable place recognition for a wide range of robotic applications.\n","authors":["Nikolaos Stathoulopoulos","Vidya Sumathy","Christoforos Kanellakis","George Nikolakopoulos"],"pdf_url":"https://arxiv.org/pdf/2410.02643v1.pdf","comment":"20 pages, 15 figures. Submitted"},{"id":"http://arxiv.org/abs/2405.13675v2","updated":"2024-10-03T16:26:46Z","published":"2024-05-22T14:16:30Z","title":"Context and Geometry Aware Voxel Transformer for Semantic Scene\n Completion","summary":" Vision-based Semantic Scene Completion (SSC) has gained much attention due to\nits widespread applications in various 3D perception tasks. Existing\nsparse-to-dense approaches typically employ shared context-independent queries\nacross various input images, which fails to capture distinctions among them as\nthe focal regions of different inputs vary and may result in undirected feature\naggregation of cross-attention. Additionally, the absence of depth information\nmay lead to points projected onto the image plane sharing the same 2D position\nor similar sampling points in the feature map, resulting in depth ambiguity. In\nthis paper, we present a novel context and geometry aware voxel transformer. It\nutilizes a context aware query generator to initialize context-dependent\nqueries tailored to individual input images, effectively capturing their unique\ncharacteristics and aggregating information within the region of interest.\nFurthermore, it extend deformable cross-attention from 2D to 3D pixel space,\nenabling the differentiation of points with similar image coordinates based on\ntheir depth coordinates. Building upon this module, we introduce a neural\nnetwork named CGFormer to achieve semantic scene completion. Simultaneously,\nCGFormer leverages multiple 3D representations (i.e., voxel and TPV) to boost\nthe semantic and geometric representation abilities of the transformed 3D\nvolume from both local and global perspectives. 
Experimental results\ndemonstrate that CGFormer achieves state-of-the-art performance on the\nSemanticKITTI and SSCBench-KITTI-360 benchmarks, attaining a mIoU of 16.87 and\n20.05, as well as an IoU of 45.99 and 48.07, respectively. Remarkably, CGFormer\neven outperforms approaches employing temporal images as inputs or much larger\nimage backbone networks.\n","authors":["Zhu Yu","Runmin Zhang","Jiacheng Ying","Junchen Yu","Xiaohai Hu","Lun Luo","Si-Yuan Cao","Hui-Liang Shen"],"pdf_url":"https://arxiv.org/pdf/2405.13675v2.pdf","comment":"NIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2410.02640v1","updated":"2024-10-03T16:24:20Z","published":"2024-10-03T16:24:20Z","title":"Diffusion-based Extreme Image Compression with Compressed Feature\n Initialization","summary":" Diffusion-based extreme image compression methods have achieved impressive\nperformance at extremely low bitrates. However, constrained by the iterative\ndenoising process that starts from pure noise, these methods are limited in\nboth fidelity and efficiency. To address these two issues, we present Relay\nResidual Diffusion Extreme Image Compression (RDEIC), which leverages\ncompressed feature initialization and residual diffusion. Specifically, we\nfirst use the compressed latent features of the image with added noise, instead\nof pure noise, as the starting point to eliminate the unnecessary initial\nstages of the denoising process. Second, we design a novel relay residual\ndiffusion that reconstructs the raw image by iteratively removing the added\nnoise and the residual between the compressed and target latent features.\nNotably, our relay residual diffusion network seamlessly integrates pre-trained\nstable diffusion to leverage its robust generative capability for high-quality\nreconstruction. Third, we propose a fixed-step fine-tuning strategy to\neliminate the discrepancy between the training and inference phases, further\nimproving the reconstruction quality. Extensive experiments demonstrate that\nthe proposed RDEIC achieves state-of-the-art visual quality and outperforms\nexisting diffusion-based extreme image compression methods in both fidelity and\nefficiency. The source code will be provided in\nhttps://github.com/huai-chang/RDEIC.\n","authors":["Zhiyuan Li","Yanhui Zhou","Hao Wei","Chenyang Ge","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2410.02640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02638v1","updated":"2024-10-03T16:23:33Z","published":"2024-10-03T16:23:33Z","title":"Spatial-Temporal Multi-Cuts for Online Multiple-Camera Vehicle Tracking","summary":" Accurate online multiple-camera vehicle tracking is essential for intelligent\ntransportation systems, autonomous driving, and smart city applications. Like\nsingle-camera multiple-object tracking, it is commonly formulated as a graph\nproblem of tracking-by-detection. Within this framework, existing online\nmethods usually consist of two-stage procedures that cluster temporally first,\nthen spatially, or vice versa. This is computationally expensive and prone to\nerror accumulation. We introduce a graph representation that allows\nspatial-temporal clustering in a single, combined step: New detections are\nspatially and temporally connected with existing clusters. By keeping sparse\nappearance and positional cues of all detections in a cluster, our method can\ncompare clusters based on the strongest available evidence. The final tracks\nare obtained online using a simple multicut assignment procedure. 
Our method\ndoes not require any training on the target scene, pre-extraction of\nsingle-camera tracks, or additional annotations. Notably, we outperform the\nonline state-of-the-art on the CityFlow dataset in terms of IDF1 by more than\n14%, and on the Synthehicle dataset by more than 25%, respectively. The code is\npublicly available.\n","authors":["Fabian Herzog","Johannes Gilg","Philipp Wolters","Torben Teepe","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2410.02638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02637v1","updated":"2024-10-03T16:23:13Z","published":"2024-10-03T16:23:13Z","title":"Plots Unlock Time-Series Understanding in Multimodal Models","summary":" While multimodal foundation models can now natively work with data beyond\ntext, they remain underutilized in analyzing the considerable amounts of\nmulti-dimensional time-series data in fields like healthcare, finance, and\nsocial sciences, representing a missed opportunity for richer, data-driven\ninsights. This paper proposes a simple but effective method that leverages the\nexisting vision encoders of these models to \"see\" time-series data via plots,\navoiding the need for additional, potentially costly, model training. Our\nempirical evaluations show that this approach outperforms providing the raw\ntime-series data as text, with the additional benefit that visual time-series\nrepresentations demonstrate up to a 90% reduction in model API costs. We\nvalidate our hypothesis through synthetic data tasks of increasing complexity,\nprogressing from simple functional form identification on clean data, to\nextracting trends from noisy scatter plots. To demonstrate generalizability\nfrom synthetic tasks with clear reasoning steps to more complex, real-world\nscenarios, we apply our approach to consumer health tasks - specifically fall\ndetection, activity recognition, and readiness assessment - which involve\nheterogeneous, noisy data and multi-step reasoning. The overall success in plot\nperformance over text performance (up to an 120% performance increase on\nzero-shot synthetic tasks, and up to 150% performance increase on real-world\ntasks), across both GPT and Gemini model families, highlights our approach's\npotential for making the best use of the native capabilities of foundation\nmodels.\n","authors":["Mayank Daswani","Mathias M. J. Bellaiche","Marc Wilson","Desislav Ivanov","Mikhail Papkov","Eva Schnider","Jing Tang","Kay Lamerigts","Gabriela Botea","Michael A. Sanchez","Yojan Patel","Shruthi Prabhakara","Shravya Shetty","Umesh Telang"],"pdf_url":"https://arxiv.org/pdf/2410.02637v1.pdf","comment":"49 pages"},{"id":"http://arxiv.org/abs/2410.02630v1","updated":"2024-10-03T16:14:22Z","published":"2024-10-03T16:14:22Z","title":"Metrics Revolutions: Groundbreaking Insights into the Implementation of\n Metrics for Biomedical Image Segmentation","summary":" The evaluation of segmentation performance is a common task in biomedical\nimage analysis, with its importance emphasized in the recently released metrics\nselection guidelines and computing frameworks. To quantitatively evaluate the\nalignment of two segmentations, researchers commonly resort to counting\nmetrics, such as the Dice similarity coefficient, or distance-based metrics,\nsuch as the Hausdorff distance, which are usually computed by publicly\navailable open-source tools with an inherent assumption that these tools\nprovide consistent results. 
In this study we questioned this assumption, and\nperformed a systematic implementation analysis along with quantitative\nexperiments on real-world clinical data to compare 11 open-source tools for\ndistance-based metrics computation against our highly accurate mesh-based\nreference implementation. The results revealed that statistically significant\ndifferences among all open-source tools are both surprising and concerning,\nsince they question the validity of existing studies. Besides identifying the\nmain sources of variation, we also provide recommendations for distance-based\nmetrics computation.\n","authors":["Gašper Podobnik","Tomaž Vrtovec"],"pdf_url":"https://arxiv.org/pdf/2410.02630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02619v1","updated":"2024-10-03T15:58:18Z","published":"2024-10-03T15:58:18Z","title":"GI-GS: Global Illumination Decomposition on Gaussian Splatting for\n Inverse Rendering","summary":" We present GI-GS, a novel inverse rendering framework that leverages 3D\nGaussian Splatting (3DGS) and deferred shading to achieve photo-realistic novel\nview synthesis and relighting. In inverse rendering, accurately modeling the\nshading processes of objects is essential for achieving high-fidelity results.\nTherefore, it is critical to incorporate global illumination to account for\nindirect lighting that reaches an object after multiple bounces across the\nscene. Previous 3DGS-based methods have attempted to model indirect lighting by\ncharacterizing indirect illumination as learnable lighting volumes or\nadditional attributes of each Gaussian, while using baked occlusion to\nrepresent shadow effects. These methods, however, fail to accurately model the\ncomplex physical interactions between light and objects, making it impossible\nto construct realistic indirect illumination during relighting. To address this\nlimitation, we propose to calculate indirect lighting using efficient path\ntracing with deferred shading. In our framework, we first render a G-buffer to\ncapture the detailed geometry and material properties of the scene. Then, we\nperform physically-based rendering (PBR) only for direct lighting. With the\nG-buffer and previous rendering results, the indirect lighting can be\ncalculated through a lightweight path tracing. Our method effectively models\nindirect lighting under any given lighting conditions, thereby achieving better\nnovel view synthesis and relighting. Quantitative and qualitative results show\nthat our GI-GS outperforms existing baselines in both rendering quality and\nefficiency.\n","authors":["Hongze Chen","Zehong Lin","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.02619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01744v2","updated":"2024-10-03T15:57:05Z","published":"2024-10-02T16:55:01Z","title":"Leopard: A Vision Language Model For Text-Rich Multi-Image Tasks","summary":" Text-rich images, where text serves as the central visual element guiding the\noverall understanding, are prevalent in real-world applications, such as\npresentation slides, scanned documents, and webpage snapshots. Tasks involving\nmultiple text-rich images are especially challenging, as they require not only\nunderstanding the content of individual images but reasoning about\ninter-relationships and logical flows across multiple visual inputs. 
Despite\nthe importance of these scenarios, current multimodal large language models\n(MLLMs) struggle to handle such tasks due to two key challenges: (1) the\nscarcity of high-quality instruction tuning datasets for text-rich multi-image\nscenarios, and (2) the difficulty in balancing image resolution with visual\nfeature sequence length. To address these challenges, we propose Leopard, a\nMLLM designed specifically for handling vision-language tasks involving\nmultiple text-rich images. First, we curated about one million high-quality\nmultimodal instruction-tuning data, tailored to text-rich, multi-image\nscenarios. Second, we developed an adaptive high-resolution multi-image\nencoding module to dynamically optimize the allocation of visual sequence\nlength based on the original aspect ratios and resolutions of the input images.\nExperiments across a wide range of benchmarks demonstrate our model's superior\ncapabilities in text-rich, multi-image evaluations and competitive performance\nin general domain evaluations.\n","authors":["Mengzhao Jia","Wenhao Yu","Kaixin Ma","Tianqing Fang","Zhihan Zhang","Siru Ouyang","Hongming Zhang","Meng Jiang","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2410.01744v2.pdf","comment":"Our code is available at https://github.com/Jill0001/Leopard"},{"id":"http://arxiv.org/abs/2409.12191v2","updated":"2024-10-03T15:54:49Z","published":"2024-09-18T17:59:32Z","title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at\n Any Resolution","summary":" We present the Qwen2-VL Series, an advanced upgrade of the previous Qwen-VL\nmodels that redefines the conventional predetermined-resolution approach in\nvisual processing. Qwen2-VL introduces the Naive Dynamic Resolution mechanism,\nwhich enables the model to dynamically process images of varying resolutions\ninto different numbers of visual tokens. This approach allows the model to\ngenerate more efficient and accurate visual representations, closely aligning\nwith human perceptual processes. The model also integrates Multimodal Rotary\nPosition Embedding (M-RoPE), facilitating the effective fusion of positional\ninformation across text, images, and videos. We employ a unified paradigm for\nprocessing both images and videos, enhancing the model's visual perception\ncapabilities. To explore the potential of large multimodal models, Qwen2-VL\ninvestigates the scaling laws for large vision-language models (LVLMs). By\nscaling both the model size-with versions at 2B, 8B, and 72B parameters-and the\namount of training data, the Qwen2-VL Series achieves highly competitive\nperformance. Notably, the Qwen2-VL-72B model achieves results comparable to\nleading models such as GPT-4o and Claude3.5-Sonnet across various multimodal\nbenchmarks, outperforming other generalist models. Code is available at\nhttps://github.com/QwenLM/Qwen2-VL .\n","authors":["Peng Wang","Shuai Bai","Sinan Tan","Shijie Wang","Zhihao Fan","Jinze Bai","Keqin Chen","Xuejing Liu","Jialin Wang","Wenbin Ge","Yang Fan","Kai Dang","Mengfei Du","Xuancheng Ren","Rui Men","Dayiheng Liu","Chang Zhou","Jingren Zhou","Junyang Lin"],"pdf_url":"https://arxiv.org/pdf/2409.12191v2.pdf","comment":"Code is available at https://github.com/QwenLM/Qwen2-VL. 
arXiv admin\n note: text overlap with arXiv:2408.15262 by other authors"},{"id":"http://arxiv.org/abs/2410.02613v1","updated":"2024-10-03T15:51:36Z","published":"2024-10-03T15:51:36Z","title":"NL-Eye: Abductive NLI for Images","summary":" Will a Visual Language Model (VLM)-based bot warn us about slipping if it\ndetects a wet floor? Recent VLMs have demonstrated impressive capabilities, yet\ntheir ability to infer outcomes and causes remains underexplored. To address\nthis, we introduce NL-Eye, a benchmark designed to assess VLMs' visual\nabductive reasoning skills. NL-Eye adapts the abductive Natural Language\nInference (NLI) task to the visual domain, requiring models to evaluate the\nplausibility of hypothesis images based on a premise image and explain their\ndecisions. NL-Eye consists of 350 carefully curated triplet examples (1,050\nimages) spanning diverse reasoning categories: physical, functional, logical,\nemotional, cultural, and social. The data curation process involved two steps -\nwriting textual descriptions and generating images using text-to-image models,\nboth requiring substantial human involvement to ensure high-quality and\nchallenging scenes. Our experiments show that VLMs struggle significantly on\nNL-Eye, often performing at random baseline levels, while humans excel in both\nplausibility prediction and explanation quality. This demonstrates a deficiency\nin the abductive reasoning capabilities of modern VLMs. NL-Eye represents a\ncrucial step toward developing VLMs capable of robust multimodal reasoning for\nreal-world applications, including accident-prevention bots and generated video\nverification.\n","authors":["Mor Ventura","Michael Toker","Nitay Calderon","Zorik Gekhman","Yonatan Bitton","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2410.02613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05662v4","updated":"2024-10-03T15:50:48Z","published":"2024-04-08T16:46:25Z","title":"BinaryDM: Accurate Weight Binarization for Efficient Diffusion Models","summary":" With the advancement of diffusion models (DMs) and the substantially\nincreased computational requirements, quantization emerges as a practical\nsolution to obtain compact and efficient low-bit DMs. However, the highly\ndiscrete representation leads to severe accuracy degradation, hindering the\nquantization of diffusion models to ultra-low bit-widths. This paper proposes a\nnovel weight binarization approach for DMs, namely BinaryDM, pushing binarized\nDMs to be accurate and efficient by improving the representation and\noptimization. From the representation perspective, we present an\nEvolvable-Basis Binarizer (EBB) to enable a smooth evolution of DMs from\nfull-precision to accurately binarized. EBB enhances information representation\nin the initial stage through the flexible combination of multiple binary bases\nand applies regularization to evolve into efficient single-basis binarization.\nThe evolution only occurs in the head and tail of the DM architecture to retain\nthe stability of training. From the optimization perspective, a Low-rank\nRepresentation Mimicking (LRM) is applied to assist the optimization of\nbinarized DMs. The LRM mimics the representations of full-precision DMs in\nlow-rank space, alleviating the direction ambiguity of the optimization process\ncaused by fine-grained alignment. Comprehensive experiments demonstrate that\nBinaryDM achieves significant accuracy and efficiency gains compared to SOTA\nquantization methods of DMs under ultra-low bit-widths. 
With 1-bit weight and\n4-bit activation (W1A4), BinaryDM achieves as low as 7.74 FID and saves the\nperformance from collapse (baseline FID 10.87). As the first binarization\nmethod for diffusion models, W1A4 BinaryDM achieves impressive 15.2x OPs and\n29.2x model size savings, showcasing its substantial potential for edge\ndeployment.\n","authors":["Xingyu Zheng","Xianglong Liu","Haotong Qin","Xudong Ma","Mingyuan Zhang","Haojie Hao","Jiakai Wang","Zixiang Zhao","Jinyang Guo","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2404.05662v4.pdf","comment":"The code is available at https://github.com/Xingyu-Zheng/BinaryDM"},{"id":"http://arxiv.org/abs/2410.02598v1","updated":"2024-10-03T15:40:58Z","published":"2024-10-03T15:40:58Z","title":"High-Efficiency Neural Video Compression via Hierarchical Predictive\n Learning","summary":" The enhanced Deep Hierarchical Video Compression-DHVC 2.0-has been\nintroduced. This single-model neural video codec operates across a broad range\nof bitrates, delivering not only superior compression performance to\nrepresentative methods but also impressive complexity efficiency, enabling\nreal-time processing with a significantly smaller memory footprint on standard\nGPUs. These remarkable advancements stem from the use of hierarchical\npredictive coding. Each video frame is uniformly transformed into multiscale\nrepresentations through hierarchical variational autoencoders. For a specific\nscale's feature representation of a frame, its corresponding latent residual\nvariables are generated by referencing lower-scale spatial features from the\nsame frame and then conditionally entropy-encoded using a probabilistic model\nwhose parameters are predicted using same-scale temporal reference from\nprevious frames and lower-scale spatial reference of the current frame. This\nfeature-space processing operates from the lowest to the highest scale of each\nframe, completely eliminating the need for the complexity-intensive motion\nestimation and compensation techniques that have been standard in video codecs\nfor decades. The hierarchical approach facilitates parallel processing,\naccelerating both encoding and decoding, and supports transmission-friendly\nprogressive decoding, making it particularly advantageous for networked video\napplications in the presence of packet loss. Source codes will be made\navailable.\n","authors":["Ming Lu","Zhihao Duan","Wuyang Cong","Dandan Ding","Fengqing Zhu","Zhan Ma"],"pdf_url":"https://arxiv.org/pdf/2410.02598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02592v1","updated":"2024-10-03T15:34:41Z","published":"2024-10-03T15:34:41Z","title":"IC3M: In-Car Multimodal Multi-object Monitoring for Abnormal Status of\n Both Driver and Passengers","summary":" Recently, in-car monitoring has emerged as a promising technology for\ndetecting early-stage abnormal status of the driver and providing timely alerts\nto prevent traffic accidents. Although training models with multimodal data\nenhances the reliability of abnormal status detection, the scarcity of labeled\ndata and the imbalance of class distribution impede the extraction of critical\nabnormal state features, significantly deteriorating training performance.\nFurthermore, missing modalities due to environment and hardware limitations\nfurther exacerbate the challenge of abnormal status identification. More\nimportantly, monitoring abnormal health conditions of passengers, particularly\nin elderly care, is of paramount importance but remains underexplored. 
To\naddress these challenges, we introduce our IC3M, an efficient\ncamera-rotation-based multimodal framework for monitoring both driver and\npassengers in a car. Our IC3M comprises two key modules: an adaptive threshold\npseudo-labeling strategy and a missing modality reconstruction. The former\ncustomizes pseudo-labeling thresholds for different classes based on the class\ndistribution, generating class-balanced pseudo labels to guide model training\neffectively, while the latter leverages crossmodality relationships learned\nfrom limited labels to accurately recover missing modalities by distribution\ntransferring from available modalities. Extensive experimental results\ndemonstrate that IC3M outperforms state-of-the-art benchmarks in accuracy,\nprecision, and recall while exhibiting superior robustness under limited\nlabeled data and severe missing modality.\n","authors":["Zihan Fang","Zheng Lin","Senkang Hu","Hangcheng Cao","Yiqin Deng","Xianhao Chen","Yuguang Fang"],"pdf_url":"https://arxiv.org/pdf/2410.02592v1.pdf","comment":"16 pages, 17 figures"},{"id":"http://arxiv.org/abs/2410.02587v1","updated":"2024-10-03T15:29:43Z","published":"2024-10-03T15:29:43Z","title":"An Improved Variational Method for Image Denoising","summary":" The total variation (TV) method is an image denoising technique that aims to\nreduce noise by minimizing the total variation of the image, which measures the\nvariation in pixel intensities. The TV method has been widely applied in image\nprocessing and computer vision for its ability to preserve edges and enhance\nimage quality. In this paper, we propose an improved TV model for image\ndenoising and the associated numerical algorithm to carry out the procedure,\nwhich is particularly effective in removing several types of noises and their\ncombinations. Our improved model admits a unique solution and the associated\nnumerical algorithm guarantees the convergence. Numerical experiments are\ndemonstrated to show improved effectiveness and denoising quality compared to\nother TV models. Such encouraging results further enhance the utility of the TV\nmethod in image processing.\n","authors":["Jing-En Huang","Jia-Wei Liao","Ku-Te Lin","Yu-Ju Tsai","Mei-Heng Yueh"],"pdf_url":"https://arxiv.org/pdf/2410.02587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02572v1","updated":"2024-10-03T15:20:19Z","published":"2024-10-03T15:20:19Z","title":"Combining Pre- and Post-Demosaicking Noise Removal for RAW Video","summary":" Denoising is one of the fundamental steps of the processing pipeline that\nconverts data captured by a camera sensor into a display-ready image or video.\nIt is generally performed early in the pipeline, usually before demosaicking,\nalthough studies swapping their order or even conducting them jointly have been\nproposed. With the advent of deep learning, the quality of denoising algorithms\nhas steadily increased. Even so, modern neural networks still have a hard time\nadapting to new noise levels and scenes, which is indispensable for real-world\napplications. With those in mind, we propose a self-similarity-based denoising\nscheme that weights both a pre- and a post-demosaicking denoiser for\nBayer-patterned CFA video data. We show that a balance between the two leads to\nbetter image quality, and we empirically find that higher noise levels benefit\nfrom a higher influence pre-demosaicking. We also integrate temporal trajectory\nprefiltering steps before each denoiser, which further improve texture\nreconstruction. 
The proposed method only requires an estimation of the noise\nmodel at the sensor, accurately adapts to any noise level, and is competitive\nwith the state of the art, making it suitable for real-world videography.\n","authors":["Marco Sánchez-Beeckman","Antoni Buades","Nicola Brandonisio","Bilel Kanoun"],"pdf_url":"https://arxiv.org/pdf/2410.02572v1.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2410.02571v1","updated":"2024-10-03T15:18:28Z","published":"2024-10-03T15:18:28Z","title":"SuperGS: Super-Resolution 3D Gaussian Splatting via Latent Feature Field\n and Gradient-guided Splitting","summary":" Recently, 3D Gaussian Splatting (3DGS) has exceled in novel view synthesis\nwith its real-time rendering capabilities and superior quality. However, it\nfaces challenges for high-resolution novel view synthesis (HRNVS) due to the\ncoarse nature of primitives derived from low-resolution input views. To address\nthis issue, we propose Super-Resolution 3DGS (SuperGS), which is an expansion\nof 3DGS designed with a two-stage coarse-to-fine training framework, utilizing\npretrained low-resolution scene representation as an initialization for\nsuper-resolution optimization. Moreover, we introduce Multi-resolution Feature\nGaussian Splatting (MFGS) to incorporates a latent feature field for flexible\nfeature sampling and Gradient-guided Selective Splitting (GSS) for effective\nGaussian upsampling. By integrating these strategies within the coarse-to-fine\nframework ensure both high fidelity and memory efficiency. Extensive\nexperiments demonstrate that SuperGS surpasses state-of-the-art HRNVS methods\non challenging real-world datasets using only low-resolution inputs.\n","authors":["Shiyun Xie","Zhiru Wang","Yinghao Zhu","Chengwei Pan"],"pdf_url":"https://arxiv.org/pdf/2410.02571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14407v2","updated":"2024-10-03T15:07:52Z","published":"2024-02-22T09:48:47Z","title":"Learning an Actionable Discrete Diffusion Policy via Large-Scale\n Actionless Video Pre-Training","summary":" Learning a generalist embodied agent capable of completing multiple tasks\nposes challenges, primarily stemming from the scarcity of action-labeled\nrobotic datasets. In contrast, a vast amount of human videos exist, capturing\nintricate tasks and interactions with the physical world. Promising prospects\narise for utilizing actionless human videos for pre-training and transferring\nthe knowledge to facilitate robot policy learning through limited robot\ndemonstrations. However, it remains a challenge due to the domain gap between\nhumans and robots. Moreover, it is difficult to extract useful information\nrepresenting the dynamic world from human videos, because of its noisy and\nmultimodal data structure. In this paper, we introduce a novel framework to\ntackle these challenges, which leverages a unified discrete diffusion to\ncombine generative pre-training on human videos and policy fine-tuning on a\nsmall number of action-labeled robot videos. We start by compressing both human\nand robot videos into unified video tokens. In the pre-training stage, we\nemploy a discrete diffusion model with a mask-and-replace diffusion strategy to\npredict future video tokens in the latent space. In the fine-tuning stage, we\nharness the imagined future videos to guide low-level action learning with a\nlimited set of robot data. 
Experiments demonstrate that our method generates\nhigh-fidelity future videos for planning and enhances the fine-tuned policies\ncompared to previous state-of-the-art approaches with superior performance. Our\nproject website is available at https://video-diff.github.io/.\n","authors":["Haoran He","Chenjia Bai","Ling Pan","Weinan Zhang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2402.14407v2.pdf","comment":"Accepted by NeurIPS 2024. 24 pages"},{"id":"http://arxiv.org/abs/2410.02550v1","updated":"2024-10-03T14:53:42Z","published":"2024-10-03T14:53:42Z","title":"NestedMorph: Enhancing Deformable Medical Image Registration with Nested\n Attention Mechanisms","summary":" Deformable image registration is crucial for aligning medical images in a\nnon-linear fashion across different modalities, allowing for precise spatial\ncorrespondence between varying anatomical structures. This paper presents\nNestedMorph, a novel network utilizing a Nested Attention Fusion approach to\nimprove intra-subject deformable registration between T1-weighted (T1w) MRI and\ndiffusion MRI (dMRI) data. NestedMorph integrates high-resolution spatial\ndetails from an encoder with semantic information from a decoder using a\nmulti-scale framework, enhancing both local and global feature extraction. Our\nmodel notably outperforms existing methods, including CNN-based approaches like\nVoxelMorph, MIDIR, and CycleMorph, as well as Transformer-based models such as\nTransMorph and ViT-V-Net, and traditional techniques like NiftyReg and SyN.\nEvaluations on the HCP dataset demonstrate that NestedMorph achieves superior\nperformance across key metrics, including SSIM, HD95, and SDlogJ, with the\nhighest SSIM of 0.89, and the lowest HD95 of 2.5 and SDlogJ of 0.22. These\nresults highlight NestedMorph's ability to capture both local and global image\nfeatures effectively, leading to superior registration performance. The\npromising outcomes of this study underscore NestedMorph's potential to\nsignificantly advance deformable medical image registration, providing a robust\nframework for future research and clinical applications. The source code and\nour implementation are available at: https://bit.ly/3zdVqcg\n","authors":["Gurucharan Marthi Krishna Kumar","Janine Mendola","Amir Shmuel"],"pdf_url":"https://arxiv.org/pdf/2410.02550v1.pdf","comment":"Submitted to IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2410.02458v1","updated":"2024-10-03T14:50:33Z","published":"2024-10-03T14:50:33Z","title":"MedVisionLlama: Leveraging Pre-Trained Large Language Model Layers to\n Enhance Medical Image Segmentation","summary":" Large Language Models (LLMs), known for their versatility in textual data,\nare increasingly being explored for their potential to enhance medical image\nsegmentation, a crucial task for accurate diagnostic imaging. This study\nexplores enhancing Vision Transformers (ViTs) for medical image segmentation by\nintegrating pre-trained LLM transformer blocks. Our approach, which\nincorporates a frozen LLM transformer block into the encoder of a ViT-based\nmodel, leads to substantial improvements in segmentation performance across\nvarious medical imaging modalities. We propose a Hybrid Attention Mechanism\nthat combines global and local feature learning with a Multi-Scale Fusion Block\nfor aggregating features across different scales. 
The enhanced model shows\nsignificant performance gains, including an average Dice score increase from\n0.74 to 0.79 and improvements in accuracy, precision, and the Jaccard Index.\nThese results demonstrate the effectiveness of LLM-based transformers in\nrefining medical image segmentation, highlighting their potential to\nsignificantly boost model accuracy and robustness. The source code and our\nimplementation are available at: https://bit.ly/3zf2CVs\n","authors":["Gurucharan Marthi Krishna Kumar","Aman Chadha","Janine Mendola","Amir Shmuel"],"pdf_url":"https://arxiv.org/pdf/2410.02458v1.pdf","comment":"Submitted to IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2403.09850v2","updated":"2024-10-03T14:44:51Z","published":"2024-03-14T20:18:08Z","title":"MARVIS: Motion & Geometry Aware Real and Virtual Image Segmentation","summary":" Tasks such as autonomous navigation, 3D reconstruction, and object\nrecognition near the water surfaces are crucial in marine robotics\napplications. However, challenges arise due to dynamic disturbances, e.g.,\nlight reflections and refraction from the random air-water interface, irregular\nliquid flow, and similar factors, which can lead to potential failures in\nperception and navigation systems. Traditional computer vision algorithms\nstruggle to differentiate between real and virtual image regions, significantly\ncomplicating tasks. A virtual image region is an apparent representation formed\nby the redirection of light rays, typically through reflection or refraction,\ncreating the illusion of an object's presence without its actual physical\nlocation. This work proposes a novel approach for segmentation on real and\nvirtual image regions, exploiting synthetic images combined with\ndomain-invariant information, a Motion Entropy Kernel, and Epipolar Geometric\nConsistency. Our segmentation network does not need to be re-trained if the\ndomain changes. We show this by deploying the same segmentation network in two\ndifferent domains: simulation and the real world. By creating realistic\nsynthetic images that mimic the complexities of the water surface, we provide\nfine-grained training data for our network (MARVIS) to discern between real and\nvirtual images effectively. By motion & geometry-aware design choices and\nthrough comprehensive experimental analysis, we achieve state-of-the-art\nreal-virtual image segmentation performance in unseen real world domain,\nachieving an IoU over 78% and a F1-Score over 86% while ensuring a small\ncomputational footprint. MARVIS offers over 43 FPS (8 FPS) inference rates on a\nsingle GPU (CPU core). Our code and dataset are available here\nhttps://github.com/jiayi-wu-umd/MARVIS.\n","authors":["Jiayi Wu","Xiaomin Lin","Shahriar Negahdaripour","Cornelia Fermüller","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2403.09850v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02534v1","updated":"2024-10-03T14:40:17Z","published":"2024-10-03T14:40:17Z","title":"Pseudo-Stereo Inputs: A Solution to the Occlusion Challenge in\n Self-Supervised Stereo Matching","summary":" Self-supervised stereo matching holds great promise for application and\nresearch due to its independence from expensive labeled data. However, direct\nself-supervised stereo matching paradigms based on photometric loss functions\nhave consistently struggled with performance issues due to the occlusion\nchallenge. 
The crux of the occlusion challenge lies in the fact that the\npositions of occluded pixels consistently align with the epipolar search\ndirection defined by the input stereo images, leading to persistent information\nloss and erroneous feedback at fixed locations during self-supervised training.\nIn this work, we propose a simple yet highly effective pseudo-stereo inputs\nstrategy to address the core occlusion challenge. This strategy decouples the\ninput and feedback images, compelling the network to probabilistically sample\ninformation from both sides of the occluding objects. As a result, the\npersistent lack of information in the aforementioned fixed occlusion areas is\nmitigated. Building upon this, we further address feedback conflicts and\noverfitting issues arising from the strategy. By integrating these components,\nour method achieves stable and significant performance improvements compared to\nexisting methods. Quantitative experiments are conducted to evaluate the\nperformance. Qualitative experiments further demonstrate accurate disparity\ninference even at occluded regions. These results demonstrate a significant\nadvancement over previous methods in the field of direct self-supervised stereo\nmatching based on photometric loss. The proposed pseudo-stereo inputs strategy,\ndue to its simplicity and effectiveness, has the potential to serve as a new\nparadigm for direct self-supervised stereo matching. Code is available at\nhttps://github.com/qrzyang/Pseudo-Stereo.\n","authors":["Ruizhi Yang","Xingqiang Li","Jiajun Bai","Jinsong Du"],"pdf_url":"https://arxiv.org/pdf/2410.02534v1.pdf","comment":"Submitted to IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2410.02530v1","updated":"2024-10-03T14:36:32Z","published":"2024-10-03T14:36:32Z","title":"A Foundation Model for the Solar Dynamics Observatory","summary":" SDO-FM is a foundation model using data from NASA's Solar Dynamics\nObservatory (SDO) spacecraft; integrating three separate instruments to\nencapsulate the Sun's complex physical interactions into a multi-modal\nembedding space. This model can be used to streamline scientific investigations\ninvolving SDO by making the enormous datasets more computationally accessible\nfor heliophysics research and enable investigations that require instrument\nfusion. We discuss four key components: an ingestion pipeline to create machine\nlearning ready datasets, the model architecture and training approach,\nresultant embeddings and fine-tunable models, and finally downstream fine-tuned\napplications. A key component of this effort has been to include subject matter\nspecialists at each stage of development; reviewing the scientific value and\nproviding guidance for model architecture, dataset, and training paradigm\ndecisions. This paper marks release of our pretrained models and embedding\ndatasets, available to the community on Hugging Face and sdofm.org.\n","authors":["James Walsh","Daniel G. Gass","Raul Ramos Pollan","Paul J. 
Wright","Richard Galvez","Noah Kasmanoff","Jason Naradowsky","Anne Spalding","James Parr","Atılım Güneş Baydin"],"pdf_url":"https://arxiv.org/pdf/2410.02530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02528v1","updated":"2024-10-03T14:36:22Z","published":"2024-10-03T14:36:22Z","title":"HiFiSeg: High-Frequency Information Enhanced Polyp Segmentation with\n Global-Local Vision Transformer","summary":" Numerous studies have demonstrated the strong performance of Vision\nTransformer (ViT)-based methods across various computer vision tasks. However,\nViT models often struggle to effectively capture high-frequency components in\nimages, which are crucial for detecting small targets and preserving edge\ndetails, especially in complex scenarios. This limitation is particularly\nchallenging in colon polyp segmentation, where polyps exhibit significant\nvariability in structure, texture, and shape. High-frequency information, such\nas boundary details, is essential for achieving precise semantic segmentation\nin this context. To address these challenges, we propose HiFiSeg, a novel\nnetwork for colon polyp segmentation that enhances high-frequency information\nprocessing through a global-local vision transformer framework. HiFiSeg\nleverages the pyramid vision transformer (PVT) as its encoder and introduces\ntwo key modules: the global-local interaction module (GLIM) and the selective\naggregation module (SAM). GLIM employs a parallel structure to fuse global and\nlocal information at multiple scales, effectively capturing fine-grained\nfeatures. SAM selectively integrates boundary details from low-level features\nwith semantic information from high-level features, significantly improving the\nmodel's ability to accurately detect and segment polyps. Extensive experiments\non five widely recognized benchmark datasets demonstrate the effectiveness of\nHiFiSeg for polyp segmentation. Notably, the mDice scores on the challenging\nCVC-ColonDB and ETIS datasets reached 0.826 and 0.822, respectively,\nunderscoring the superior performance of HiFiSeg in handling the specific\ncomplexities of this task.\n","authors":["Jingjing Ren","Xiaoyong Zhang","Lina Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.02528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02527v1","updated":"2024-10-03T14:35:35Z","published":"2024-10-03T14:35:35Z","title":"Learning from Offline Foundation Features with Tensor Augmentations","summary":" We introduce Learning from Offline Foundation Features with Tensor\nAugmentations (LOFF-TA), an efficient training scheme designed to harness the\ncapabilities of foundation models in limited resource settings where their\ndirect development is not feasible. LOFF-TA involves training a compact\nclassifier on cached feature embeddings from a frozen foundation model,\nresulting in up to $37\\times$ faster training and up to $26\\times$ reduced GPU\nmemory usage. Because the embeddings of augmented images would be too numerous\nto store, yet the augmentation process is essential for training, we propose to\napply tensor augmentations to the cached embeddings of the original\nnon-augmented images. LOFF-TA makes it possible to leverage the power of\nfoundation models, regardless of their size, in settings with limited\ncomputational capacity. Moreover, LOFF-TA can be used to apply foundation\nmodels to high-resolution images without increasing compute. 
In certain\nscenarios, we find that training with LOFF-TA yields better results than\ndirectly fine-tuning the foundation model.\n","authors":["Emir Konuk","Christos Matsoukas","Moein Sorkhei","Phitchapha Lertsiravaramet","Kevin Smith"],"pdf_url":"https://arxiv.org/pdf/2410.02527v1.pdf","comment":"Accepted to the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2410.02523v1","updated":"2024-10-03T14:29:46Z","published":"2024-10-03T14:29:46Z","title":"Med-TTT: Vision Test-Time Training model for Medical Image Segmentation","summary":" Medical image segmentation plays a crucial role in clinical diagnosis and\ntreatment planning. Although models based on convolutional neural networks\n(CNNs) and Transformers have achieved remarkable success in medical image\nsegmentation tasks, they still face challenges such as high computational\ncomplexity and the loss of local features when capturing long-range\ndependencies. To address these limitations, we propose Med-TTT, a visual\nbackbone network integrated with Test-Time Training (TTT) layers, which\nincorporates dynamic adjustment capabilities. Med-TTT introduces the Vision-TTT\nlayer, which enables effective modeling of long-range dependencies with linear\ncomputational complexity and adaptive parameter adjustment during inference.\nFurthermore, we designed a multi-resolution fusion mechanism to combine image\nfeatures at different scales, facilitating the identification of subtle lesion\ncharacteristics in complex backgrounds. At the same time, we adopt a frequency\ndomain feature enhancement strategy based on high pass filtering, which can\nbetter capture texture and fine-grained details in images. Experimental results\ndemonstrate that Med-TTT significantly outperforms existing methods on multiple\nmedical image datasets, exhibiting strong segmentation capabilities,\nparticularly in complex image backgrounds. The model achieves leading\nperformance in terms of accuracy, sensitivity, and Dice coefficient, providing\nan efficient and robust solution for the field of medical image\nsegmentation.The code is available at https://github.com/Jiashu-Xu/Med-TTT .\n","authors":["Jiashu Xu"],"pdf_url":"https://arxiv.org/pdf/2410.02523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.20409v2","updated":"2024-10-03T14:26:43Z","published":"2024-09-30T15:36:14Z","title":"Physics-Regularized Multi-Modal Image Assimilation for Brain Tumor\n Localization","summary":" Physical models in the form of partial differential equations represent an\nimportant prior for many under-constrained problems. One example is tumor\ntreatment planning, which heavily depends on accurate estimates of the spatial\ndistribution of tumor cells in a patient's anatomy. Medical imaging scans can\nidentify the bulk of the tumor, but they cannot reveal its full spatial\ndistribution. Tumor cells at low concentrations remain undetectable, for\nexample, in the most frequent type of primary brain tumors, glioblastoma.\nDeep-learning-based approaches fail to estimate the complete tumor cell\ndistribution due to a lack of reliable training data. Most existing works\ntherefore rely on physics-based simulations to match observed tumors, providing\nanatomically and physiologically plausible estimations. However, these\napproaches struggle with complex and unknown initial conditions and are limited\nby overly rigid physical models. In this work, we present a novel method that\nbalances data-driven and physics-based cost functions. 
In particular, we\npropose a unique discretization scheme that quantifies the adherence of our\nlearned spatiotemporal tumor and brain tissue distributions to their\ncorresponding growth and elasticity equations. This quantification, serving as\na regularization term rather than a hard constraint, enables greater\nflexibility and proficiency in assimilating patient data than existing models.\nWe demonstrate improved coverage of tumor recurrence areas compared to existing\ntechniques on real-world data from a cohort of patients. The method holds the\npotential to enhance clinical adoption of model-driven treatment planning for\nglioblastoma.\n","authors":["Michal Balcerak","Tamaz Amiranashvili","Andreas Wagner","Jonas Weidner","Petr Karnakov","Johannes C. Paetzold","Ivan Ezhov","Petros Koumoutsakos","Benedikt Wiestler","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2409.20409v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.02954v3","updated":"2024-10-03T14:25:07Z","published":"2024-05-05T14:48:13Z","title":"Source-Free Domain Adaptation Guided by Vision and Vision-Language\n Pre-Training","summary":" Source-free domain adaptation (SFDA) aims to adapt a source model trained on\na fully-labeled source domain to a related but unlabeled target domain. While\nthe source model is a key avenue for acquiring target pseudolabels, the\ngenerated pseudolabels may exhibit source bias. In the conventional SFDA\npipeline, a large data (e.g. ImageNet) pre-trained feature extractor is used to\ninitialize the source model at the start of source training, and subsequently\ndiscarded. Despite having diverse features important for generalization, the\npre-trained feature extractor can overfit to the source data distribution\nduring source training and forget relevant target domain knowledge. Rather than\ndiscarding this valuable knowledge, we introduce an integrated framework to\nincorporate pre-trained networks into the target adaptation process. The\nproposed framework is flexible and allows us to plug modern pre-trained\nnetworks into the adaptation process to leverage their stronger representation\nlearning capabilities. For adaptation, we propose the Co-learn algorithm to\nimprove target pseudolabel quality collaboratively through the source model and\na pre-trained feature extractor. Building on the recent success of the\nvision-language model CLIP in zero-shot image recognition, we present an\nextension Co-learn++ to further incorporate CLIP's zero-shot classification\ndecisions. We evaluate on 4 benchmark datasets and include more challenging\nscenarios such as open-set, partial-set and open-partial SFDA. Experimental\nresults demonstrate that our proposed strategy improves adaptation performance\nand can be successfully integrated with existing SFDA methods. Project code is\navailable at https://github.com/zwenyu/colearn-plus.\n","authors":["Wenyu Zhang","Li Shen","Chuan-Sheng Foo"],"pdf_url":"https://arxiv.org/pdf/2405.02954v3.pdf","comment":"Extension of ICCV paper arXiv:2212.07585; Published at IJCV"},{"id":"http://arxiv.org/abs/2410.01443v2","updated":"2024-10-03T14:14:29Z","published":"2024-10-02T11:53:28Z","title":"SurgPointTransformer: Vertebrae Shape Completion with RGB-D Data","summary":" State-of-the-art computer- and robot-assisted surgery systems heavily depend\non intraoperative imaging technologies such as CT and fluoroscopy to generate\ndetailed 3D visualization of the patient's anatomy. 
While imaging techniques\nare highly accurate, they are based on ionizing radiation and expose patients\nand clinicians. This study introduces an alternative, radiation-free approach\nfor reconstructing the 3D spine anatomy using RGB-D data. Drawing inspiration\nfrom the 3D \"mental map\" that surgeons form during surgeries, we introduce\nSurgPointTransformer, a shape completion approach for surgical applications\nthat can accurately reconstruct the unexposed spine regions from sparse\nobservations of the exposed surface.\n Our method involves two main steps: segmentation and shape completion. The\nsegmentation step includes spinal column localization and segmentation,\nfollowed by vertebra-wise segmentation. The segmented vertebra point clouds are\nthen subjected to SurgPointTransformer, which leverages an attention mechanism\nto learn patterns between visible surface features and the underlying anatomy.\nFor evaluation, we utilize an ex-vivo dataset of nine specimens. Their CT data\nis used to establish ground truth data that were used to compare to the outputs\nof our methods. Our method significantly outperforms the state-of-the-art\nbaselines, achieving an average Chamfer Distance of 5.39, an F-Score of 0.85,\nan Earth Mover's Distance of 0.011, and a Signal-to-Noise Ratio of 22.90 dB.\n This study demonstrates the potential of our reconstruction method for 3D\nvertebral shape completion. It enables 3D reconstruction of the entire lumbar\nspine and surgical guidance without ionizing radiation or invasive imaging. Our\nwork contributes to computer-aided and robot-assisted surgery, advancing the\nperception and intelligence of these systems.\n","authors":["Aidana Massalimova","Florentin Liebmann","Sascha Jecklin","Fabio Carrillo","Farshad Mazda","Philipp Fürnstahl"],"pdf_url":"https://arxiv.org/pdf/2410.01443v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02505v1","updated":"2024-10-03T14:14:21Z","published":"2024-10-03T14:14:21Z","title":"Dog-IQA: Standard-guided Zero-shot MLLM for Mix-grained Image Quality\n Assessment","summary":" Image quality assessment (IQA) serves as the golden standard for all models'\nperformance in nearly all computer vision fields. However, it still suffers\nfrom poor out-of-distribution generalization ability and expensive training\ncosts. To address these problems, we propose Dog-IQA, a standard-guided\nzero-shot mix-grained IQA method, which is training-free and utilizes the\nexceptional prior knowledge of multimodal large language models (MLLMs). To\nobtain accurate IQA scores, namely scores consistent with humans, we design an\nMLLM-based inference pipeline that imitates human experts. In detail, Dog-IQA\napplies two techniques. First, Dog-IQA objectively scores with specific\nstandards that utilize MLLM's behavior pattern and minimize the influence of\nsubjective factors. Second, Dog-IQA comprehensively takes local semantic\nobjects and the whole image as input and aggregates their scores, leveraging\nlocal and global information. Our proposed Dog-IQA achieves state-of-the-art\n(SOTA) performance compared with training-free methods, and competitive\nperformance compared with training-based methods in cross-dataset scenarios.\nOur code and models will be available at https://github.com/Kai-Liu001/Dog-IQA.\n","authors":["Kai Liu","Ziqing Zhang","Wenbo Li","Renjing Pei","Fenglong Song","Xiaohong Liu","Linghe Kong","Yulun Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.02505v1.pdf","comment":"10 pages, 5 figures. 
The code and models will be available at\n https://github.com/Kai-Liu001/Dog-IQA"},{"id":"http://arxiv.org/abs/2306.11528v3","updated":"2024-10-03T14:02:10Z","published":"2023-06-20T13:31:33Z","title":"TransRef: Multi-Scale Reference Embedding Transformer for\n Reference-Guided Image Inpainting","summary":" Image inpainting for completing complicated semantic environments and diverse\nhole patterns of corrupted images is challenging even for state-of-the-art\nlearning-based inpainting methods trained on large-scale data. A reference\nimage capturing the same scene of a corrupted image offers informative guidance\nfor completing the corrupted image as it shares similar texture and structure\npriors to that of the holes of the corrupted image. In this work, we propose a\ntransformer-based encoder-decoder network, named TransRef, for reference-guided\nimage inpainting. Specifically, the guidance is conducted progressively through\na reference embedding procedure, in which the referencing features are\nsubsequently aligned and fused with the features of the corrupted image. For\nprecise utilization of the reference features for guidance, a reference-patch\nalignment (Ref-PA) module is proposed to align the patch features of the\nreference and corrupted images and harmonize their style differences, while a\nreference-patch transformer (Ref-PT) module is proposed to refine the embedded\nreference feature. Moreover, to facilitate the research of reference-guided\nimage restoration tasks, we construct a publicly accessible benchmark dataset\ncontaining 50K pairs of input and reference images. Both quantitative and\nqualitative evaluations demonstrate the efficacy of the reference information\nand the proposed method over the state-of-the-art methods in completing complex\nholes. Code and dataset can be accessed at https://github.com/Cameltr/TransRef.\n","authors":["Taorong Liu","Liang Liao","Delin Chen","Jing Xiao","Zheng Wang","Chia-Wen Lin","Shin'ichi Satoh"],"pdf_url":"https://arxiv.org/pdf/2306.11528v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.19365v2","updated":"2024-10-03T14:01:03Z","published":"2024-09-28T14:36:38Z","title":"Conditional Image Synthesis with Diffusion Models: A Survey","summary":" Conditional image synthesis based on user-specified requirements is a key\ncomponent in creating complex visual content. In recent years, diffusion-based\ngenerative modeling has become a highly effective way for conditional image\nsynthesis, leading to exponential growth in the literature. However, the\ncomplexity of diffusion-based modeling, the wide range of image synthesis\ntasks, and the diversity of conditioning mechanisms present significant\nchallenges for researchers to keep up with rapid developments and understand\nthe core concepts on this topic. In this survey, we categorize existing works\nbased on how conditions are integrated into the two fundamental components of\ndiffusion-based modeling, i.e., the denoising network and the sampling process.\nWe specifically highlight the underlying principles, advantages, and potential\nchallenges of various conditioning approaches in the training, re-purposing,\nand specialization stages to construct a desired denoising network. We also\nsummarize six mainstream conditioning mechanisms in the essential sampling\nprocess. All discussions are centered around popular applications. Finally, we\npinpoint some critical yet still open problems to be solved in the future and\nsuggest some possible solutions. 
Our reviewed works are itemized at\nhttps://github.com/zju-pi/Awesome-Conditional-Diffusion-Models.\n","authors":["Zheyuan Zhan","Defang Chen","Jian-Ping Mei","Zhenghe Zhao","Jiawei Chen","Chun Chen","Siwei Lyu","Can Wang"],"pdf_url":"https://arxiv.org/pdf/2409.19365v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02492v1","updated":"2024-10-03T13:57:07Z","published":"2024-10-03T13:57:07Z","title":"DTVLT: A Multi-modal Diverse Text Benchmark for Visual Language Tracking\n Based on LLM","summary":" Visual language tracking (VLT) has emerged as a cutting-edge research area,\nharnessing linguistic data to enhance algorithms with multi-modal inputs and\nbroadening the scope of traditional single object tracking (SOT) to encompass\nvideo understanding applications. Despite this, most VLT benchmarks still\ndepend on succinct, human-annotated text descriptions for each video. These\ndescriptions often fall short in capturing the nuances of video content\ndynamics and lack stylistic variety in language, constrained by their uniform\nlevel of detail and a fixed annotation frequency. As a result, algorithms tend\nto default to a \"memorize the answer\" strategy, diverging from the core\nobjective of achieving a deeper understanding of video content. Fortunately,\nthe emergence of large language models (LLMs) has enabled the generation of\ndiverse text. This work utilizes LLMs to generate varied semantic annotations\n(in terms of text lengths and granularities) for representative SOT benchmarks,\nthereby establishing a novel multi-modal benchmark. Specifically, we (1)\npropose a new visual language tracking benchmark with diverse texts, named\nDTVLT, based on five prominent VLT and SOT benchmarks, including three\nsub-tasks: short-term tracking, long-term tracking, and global instance\ntracking. (2) We offer four granularity texts in our benchmark, considering the\nextent and density of semantic information. We expect this multi-granular\ngeneration strategy to foster a favorable environment for VLT and video\nunderstanding research. (3) We conduct comprehensive experimental analyses on\nDTVLT, evaluating the impact of diverse text on tracking performance and hope\nthe identified performance bottlenecks of existing algorithms can support\nfurther research in VLT and video understanding. The proposed benchmark,\nexperimental results and toolkit will be released gradually on\nhttp://videocube.aitestunion.com/.\n","authors":["Xuchen Li","Shiyu Hu","Xiaokun Feng","Dailing Zhang","Meiqi Wu","Jing Zhang","Kaiqi Huang"],"pdf_url":"https://arxiv.org/pdf/2410.02492v1.pdf","comment":"Preprint, Under Review"},{"id":"http://arxiv.org/abs/2409.20195v2","updated":"2024-10-03T13:50:29Z","published":"2024-09-30T11:11:35Z","title":"Forecasting Disease Progression with Parallel Hyperplanes in\n Longitudinal Retinal OCT","summary":" Predicting future disease progression risk from medical images is challenging\ndue to patient heterogeneity, and subtle or unknown imaging biomarkers.\nMoreover, deep learning (DL) methods for survival analysis are susceptible to\nimage domain shifts across scanners. We tackle these issues in the task of\npredicting late dry Age-related Macular Degeneration (dAMD) onset from retinal\nOCT scans. We propose a novel DL method for survival prediction to jointly\npredict from the current scan a risk score, inversely related to\ntime-to-conversion, and the probability of conversion within a time interval\n$t$. 
It uses a family of parallel hyperplanes generated by parameterizing the\nbias term as a function of $t$. In addition, we develop unsupervised losses\nbased on intra-subject image pairs to ensure that risk scores increase over\ntime and that future conversion predictions are consistent with AMD stage\nprediction using actual scans of future visits. Such losses enable\ndata-efficient fine-tuning of the trained model on new unlabeled datasets\nacquired with a different scanner. Extensive evaluation on two large datasets\nacquired with different scanners resulted in a mean AUROCs of 0.82 for\nDataset-1 and 0.83 for Dataset-2, across prediction intervals of 6,12 and 24\nmonths.\n","authors":["Arunava Chakravarty","Taha Emre","Dmitrii Lachinov","Antoine Rivail","Hendrik Scholl","Lars Fritsche","Sobha Sivaprasad","Daniel Rueckert","Andrew Lotery","Ursula Schmidt-Erfurth","Hrvoje Bogunović"],"pdf_url":"https://arxiv.org/pdf/2409.20195v2.pdf","comment":"accepted in MICCAI 2024"},{"id":"http://arxiv.org/abs/2410.02483v1","updated":"2024-10-03T13:41:58Z","published":"2024-10-03T13:41:58Z","title":"Event-Customized Image Generation","summary":" Customized Image Generation, generating customized images with user-specified\nconcepts, has raised significant attention due to its creativity and novelty.\nWith impressive progress achieved in subject customization, some pioneer works\nfurther explored the customization of action and interaction beyond entity\n(i.e., human, animal, and object) appearance. However, these approaches only\nfocus on basic actions and interactions between two entities, and their effects\nare limited by insufficient ''exactly same'' reference images. To extend\ncustomized image generation to more complex scenes for general real-world\napplications, we propose a new task: event-customized image generation. Given a\nsingle reference image, we define the ''event'' as all specific actions, poses,\nrelations, or interactions between different entities in the scene. This task\naims at accurately capturing the complex event and generating customized images\nwith various target entities. To solve this task, we proposed a novel\ntraining-free event customization method: FreeEvent. Specifically, FreeEvent\nintroduces two extra paths alongside the general diffusion denoising process:\n1) Entity switching path: it applies cross-attention guidance and regulation\nfor target entity generation. 2) Event transferring path: it injects the\nspatial feature and self-attention maps from the reference image to the target\nimage for event generation. To further facilitate this new task, we collected\ntwo evaluation benchmarks: SWiG-Event and Real-Event. Extensive experiments and\nablations have demonstrated the effectiveness of FreeEvent.\n","authors":["Zhen Wang","Yilei Jiang","Dong Zheng","Jun Xiao","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17041v4","updated":"2024-10-03T13:31:39Z","published":"2023-11-28T18:53:06Z","title":"Eliciting In-Context Learning in Vision-Language Models for Videos\n Through Curated Data Distributional Properties","summary":" A major reason behind the recent success of large language models (LLMs) is\ntheir \\textit{in-context learning} capability, which makes it possible to\nrapidly adapt them to downstream text-based tasks by prompting them with a\nsmall number of relevant demonstrations. 
While large vision-language models\n(VLMs) have recently been developed for tasks requiring both text and images,\nthey largely lack in-context learning over visual information, especially in\nunderstanding and generating text about videos. In this work, we implement\n\\textbf{E}mergent \\textbf{I}n-context \\textbf{Le}arning on \\textbf{V}ideos\n(\\eilev{}), a novel training paradigm that induces in-context learning over\nvideo and text by capturing key properties of pre-training data found by prior\nwork to be essential for in-context learning in transformers. In our\nexperiments, we show that \\eilev-trained models outperform other off-the-shelf\nVLMs in few-shot video narration for novel, rare actions. Furthermore, we\ndemonstrate that these key properties of bursty distributions, skewed marginal\ndistributions, and dynamic meaning each contribute to varying degrees to VLMs'\nin-context learning capability in narrating procedural videos. Our results,\nanalysis, and \\eilev{}-trained models yield numerous insights about the\nemergence of in-context learning over video and text, creating a foundation for\nfuture work to optimize and scale VLMs for open-domain video understanding and\nreasoning. Our code and demo are available at\n\\url{https://github.com/yukw777/EILEV}.\n","authors":["Keunwoo Peter Yu","Zheyuan Zhang","Fengyuan Hu","Shane Storks","Joyce Chai"],"pdf_url":"https://arxiv.org/pdf/2311.17041v4.pdf","comment":"16 pages, LaTeX; Accepted to EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2410.02467v1","updated":"2024-10-03T13:17:06Z","published":"2024-10-03T13:17:06Z","title":"Towards a Theoretical Understanding of Memorization in Diffusion Models","summary":" As diffusion probabilistic models (DPMs) are being employed as mainstream\nmodels for Generative Artificial Intelligence (GenAI), the study of their\nmemorization of training data has attracted growing attention. Existing works\nin this direction aim to establish an understanding of whether or to what\nextent DPMs learn via memorization. Such an understanding is crucial for\nidentifying potential risks of data leakage and copyright infringement in\ndiffusion models and, more importantly, for trustworthy application of GenAI.\nExisting works revealed that conditional DPMs are more prone to training data\nmemorization than unconditional DPMs, and the motivated data extraction methods\nare mostly for conditional DPMs. However, these understandings are primarily\nempirical, and extracting training data from unconditional models has been\nfound to be extremely challenging. In this work, we provide a theoretical\nunderstanding of memorization in both conditional and unconditional DPMs under\nthe assumption of model convergence. Our theoretical analysis indicates that\nextracting data from unconditional models can also be effective by constructing\na proper surrogate condition. Based on this result, we propose a novel data\nextraction method named \\textbf{Surrogate condItional Data Extraction (SIDE)}\nthat leverages a time-dependent classifier trained on the generated data as a\nsurrogate condition to extract training data from unconditional DPMs. 
Empirical\nresults demonstrate that our SIDE can extract training data in challenging\nscenarios where previous methods fail, and it is, on average, over 50\\% more\neffective across different scales of the CelebA dataset.\n","authors":["Yunhao Chen","Xingjun Ma","Difan Zou","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2410.02467v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2406.12752"},{"id":"http://arxiv.org/abs/2407.07053v5","updated":"2024-10-03T13:12:06Z","published":"2024-07-09T17:18:27Z","title":"Multimodal Self-Instruct: Synthetic Abstract Image and Visual Reasoning\n Instruction Using Language Model","summary":" Although most current large multimodal models (LMMs) can already understand\nphotos of natural scenes and portraits, their understanding of abstract images,\ne.g., charts, maps, or layouts, and visual reasoning capabilities remains quite\nrudimentary. They often struggle with simple daily tasks, such as reading time\nfrom a clock, understanding a flowchart, or planning a route using a road map.\nIn light of this, we design a multi-modal self-instruct, utilizing large\nlanguage models and their code capabilities to synthesize massive abstract\nimages and visual reasoning instructions across daily scenarios. Our strategy\neffortlessly creates a multimodal benchmark with 11,193 instructions for eight\nvisual scenarios: charts, tables, simulated maps, dashboards, flowcharts,\nrelation graphs, floor plans, and visual puzzles. \\textbf{This benchmark,\nconstructed with simple lines and geometric elements, exposes the shortcomings\nof most advanced LMMs} like Claude-3.5-Sonnet and GPT-4o in abstract image\nunderstanding, spatial relations reasoning, and visual element induction.\nBesides, to verify the quality of our synthetic data, we fine-tune an LMM using\n62,476 synthetic chart, table and road map instructions. The results\ndemonstrate improved chart understanding and map navigation performance, and\nalso demonstrate potential benefits for other visual reasoning tasks. Our code\nis available at: \\url{https://github.com/zwq2018/Multi-modal-Self-instruct}.\n","authors":["Wenqi Zhang","Zhenglin Cheng","Yuanyu He","Mengna Wang","Yongliang Shen","Zeqi Tan","Guiyang Hou","Mingqian He","Yanna Ma","Weiming Lu","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2407.07053v5.pdf","comment":"The paper is accepted by EMNLP-24. Code:\n https://github.com/zwq2018/Multi-modal-Self-instruct dataset:\n https://huggingface.co/datasets/zwq2018/Multi-modal-Self-instruct\n Leaderboard: https://multi-modal-self-instruct.github.io/"},{"id":"http://arxiv.org/abs/2410.02456v1","updated":"2024-10-03T13:05:27Z","published":"2024-10-03T13:05:27Z","title":"Recurrent Few-Shot model for Document Verification","summary":" General-purpose ID, or travel, document image- and video-based verification\nsystems have yet to achieve good enough performance to be considered a solved\nproblem. There are several factors that negatively impact their performance,\nincluding low-resolution images and videos and a lack of sufficient data to\ntrain the models. This task is particularly challenging when dealing with\nunseen classes of ID, or travel, documents. In this paper we address this task by\nproposing a recurrent-based model able to detect forged documents in a few-shot\nscenario. The recurrent architecture makes the model robust to document\nresolution variability. Moreover, the few-shot approach allows the model to\nperform well even for unseen classes of documents. 
Preliminary results on the\nSIDTD and Findit datasets show good performance of this model for this task.\n","authors":["Maxime Talarmain","Carlos Boned","Sanket Biswas","Oriol Ramos"],"pdf_url":"https://arxiv.org/pdf/2410.02456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11682v2","updated":"2024-10-03T13:03:39Z","published":"2024-09-18T03:47:24Z","title":"SRIF: Semantic Shape Registration Empowered by Diffusion-based Image\n Morphing and Flow Estimation","summary":" In this paper, we propose SRIF, a novel Semantic shape Registration framework\nbased on diffusion-based Image morphing and Flow estimation. More concretely,\ngiven a pair of extrinsically aligned shapes, we first render them from\nmulti-views, and then utilize an image interpolation framework based on\ndiffusion models to generate sequences of intermediate images between them. The\nimages are later fed into a dynamic 3D Gaussian splatting framework, with which\nwe reconstruct and post-process for intermediate point clouds respecting the\nimage morphing processing. In the end, tailored for the above, we propose a\nnovel registration module to estimate continuous normalizing flow, which\ndeforms source shape consistently towards the target, with intermediate point\nclouds as weak guidance. Our key insight is to leverage large vision models\n(LVMs) to associate shapes and therefore obtain much richer semantic\ninformation on the relationship between shapes than the ad-hoc feature\nextraction and alignment. As a consequence, SRIF achieves high-quality dense\ncorrespondences on challenging shape pairs, but also delivers smooth,\nsemantically meaningful interpolation in between. Empirical evidence justifies\nthe effectiveness and superiority of our method as well as specific design\nchoices. The code is released at https://github.com/rqhuang88/SRIF.\n","authors":["Mingze Sun","Chen Guo","Puhua Jiang","Shiwei Mao","Yurun Chen","Ruqi Huang"],"pdf_url":"https://arxiv.org/pdf/2409.11682v2.pdf","comment":"Accepted as a conference paper of SIGGRAPH Asia 2024"},{"id":"http://arxiv.org/abs/2406.06050v2","updated":"2024-10-03T12:52:10Z","published":"2024-06-10T06:38:11Z","title":"Generalizable Human Gaussians from Single-View Image","summary":" In this work, we tackle the task of learning generalizable 3D human Gaussians\nfrom a single image. The main challenge for this task is to recover detailed\ngeometry and appearance, especially for the unobserved regions. To this end, we\npropose single-view generalizable Human Gaussian model (HGM), a\ndiffusion-guided framework for 3D human modeling from a single image. We design\na diffusion-based coarse-to-fine pipeline, where the diffusion model is adapted\nto refine novel-view images rendered from a coarse human Gaussian model. The\nrefined images are then used together with the input image to learn a refined\nhuman Gaussian model. Although effective in hallucinating the unobserved views,\nthe approach may generate unrealistic human pose and shapes due to the lack of\nsupervision. We circumvent this problem by further encoding the geometric\npriors from SMPL model. Specifically, we propagate geometric features from SMPL\nvolume to the predicted Gaussians via sparse convolution and attention\nmechanism. We validate our approach on publicly available datasets and\ndemonstrate that it significantly surpasses state-of-the-art methods in terms\nof PSNR and SSIM. 
Additionally, our method exhibits strong generalization for\nin-the-wild images.\n","authors":["Jinnan Chen","Chen Li","Jianfeng Zhang","Lingting Zhu","Buzhen Huang","Hanlin Chen","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2406.06050v2.pdf","comment":"https://jinnan-chen.github.io/projects/HGM/"},{"id":"http://arxiv.org/abs/2410.01654v2","updated":"2024-10-03T12:43:14Z","published":"2024-10-02T15:19:31Z","title":"Releasing the Parameter Latency of Neural Representation for\n High-Efficiency Video Compression","summary":" For decades, video compression technology has been a prominent research area.\nTraditional hybrid video compression framework and end-to-end frameworks\ncontinue to explore various intra- and inter-frame reference and prediction\nstrategies based on discrete transforms and deep learning techniques. However,\nthe emerging implicit neural representation (INR) technique models entire\nvideos as basic units, automatically capturing intra-frame and inter-frame\ncorrelations and obtaining promising performance. INR uses a compact neural\nnetwork to store video information in network parameters, effectively\neliminating spatial and temporal redundancy in the original video. However, in\nthis paper, our exploration and verification reveal that current INR video\ncompression methods do not fully exploit their potential to preserve\ninformation. We investigate the potential of enhancing network parameter\nstorage through parameter reuse. By deepening the network, we designed a\nfeasible INR parameter reuse scheme to further improve compression performance.\nExtensive experimental results show that our method significantly enhances the\nrate-distortion performance of INR video compression.\n","authors":["Gai Zhang","Xinfeng Zhang","Lv Tang","Yue Li","Kai Zhang","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.01654v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02443v1","updated":"2024-10-03T12:40:52Z","published":"2024-10-03T12:40:52Z","title":"Clinnova Federated Learning Proof of Concept: Key Takeaways from a\n Cross-border Collaboration","summary":" Clinnova, a collaborative initiative involving France, Germany, Switzerland,\nand Luxembourg, is dedicated to unlocking the power of precision medicine\nthrough data federation, standardization, and interoperability. This European\nGreater Region initiative seeks to create an interoperable European standard\nusing artificial intelligence (AI) and data science to enhance healthcare\noutcomes and efficiency. Key components include multidisciplinary research\ncenters, a federated biobanking strategy, a digital health innovation platform,\nand a federated AI strategy. It targets inflammatory bowel disease, rheumatoid\ndiseases, and multiple sclerosis (MS), emphasizing data quality to develop AI\nalgorithms for personalized treatment and translational research.\n The IHU Strasbourg (Institute of Minimal-invasive Surgery) has the lead in\nthis initiative to develop the federated learning (FL) proof of concept (POC)\nthat will serve as a foundation for advancing AI in healthcare. At its core,\nClinnova-MS aims to enhance MS patient care by using FL to develop more\naccurate models that detect disease progression, guide interventions, and\nvalidate digital biomarkers across multiple sites. This technical report\npresents insights and key takeaways from the first cross-border federated POC\non MS segmentation of MRI images within the Clinnova framework. 
While our work\nmarks a significant milestone in advancing MS segmentation through cross-border\ncollaboration, it also underscores the importance of addressing technical,\nlogistical, and ethical considerations to realize the full potential of FL in\nhealthcare settings.\n","authors":["Julia Alekseenko","Bram Stieltjes","Michael Bach","Melanie Boerries","Oliver Opitz","Alexandros Karargyris","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2410.02443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02430v1","updated":"2024-10-03T12:25:01Z","published":"2024-10-03T12:25:01Z","title":"Predictive Attractor Models","summary":" Sequential memory, the ability to form and accurately recall a sequence of\nevents or stimuli in the correct order, is a fundamental prerequisite for\nbiological and artificial intelligence as it underpins numerous cognitive\nfunctions (e.g., language comprehension, planning, episodic memory formation,\netc.) However, existing methods of sequential memory suffer from catastrophic\nforgetting, limited capacity, slow iterative learning procedures, low-order\nMarkov memory, and, most importantly, the inability to represent and generate\nmultiple valid future possibilities stemming from the same context. Inspired by\nbiologically plausible neuroscience theories of cognition, we propose\n\\textit{Predictive Attractor Models (PAM)}, a novel sequence memory\narchitecture with desirable generative properties. PAM is a streaming model\nthat learns a sequence in an online, continuous manner by observing each input\n\\textit{only once}. Additionally, we find that PAM avoids catastrophic\nforgetting by uniquely representing past context through lateral inhibition in\ncortical minicolumns, which prevents new memories from overwriting previously\nlearned knowledge. PAM generates future predictions by sampling from a union\nset of predicted possibilities; this generative ability is realized through an\nattractor model trained alongside the predictor. We show that PAM is trained\nwith local computations through Hebbian plasticity rules in a biologically\nplausible framework. Other desirable traits (e.g., noise tolerance, CPU-based\nlearning, capacity scaling) are discussed throughout the paper. Our findings\nsuggest that PAM represents a significant step forward in the pursuit of\nbiologically plausible and computationally efficient sequential memory models,\nwith broad implications for cognitive science and artificial intelligence\nresearch.\n","authors":["Ramy Mounir","Sudeep Sarkar"],"pdf_url":"https://arxiv.org/pdf/2410.02430v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.02423v1","updated":"2024-10-03T12:13:56Z","published":"2024-10-03T12:13:56Z","title":"PnP-Flow: Plug-and-Play Image Restoration with Flow Matching","summary":" In this paper, we introduce Plug-and-Play (PnP) Flow Matching, an algorithm\nfor solving imaging inverse problems. PnP methods leverage the strength of\npre-trained denoisers, often deep neural networks, by integrating them in\noptimization schemes. While they achieve state-of-the-art performance on\nvarious inverse problems in imaging, PnP approaches face inherent limitations\non more generative tasks like inpainting. On the other hand, generative models\nsuch as Flow Matching pushed the boundary in image sampling yet lack a clear\nmethod for efficient use in image restoration. We propose to combine the PnP\nframework with Flow Matching (FM) by defining a time-dependent denoiser using a\npre-trained FM model. 
Our algorithm alternates between gradient descent steps\non the data-fidelity term, reprojections onto the learned FM path, and\ndenoising. Notably, our method is computationally efficient and\nmemory-friendly, as it avoids backpropagation through ODEs and trace\ncomputations. We evaluate its performance on denoising, super-resolution,\ndeblurring, and inpainting tasks, demonstrating superior results compared to\nexisting PnP algorithms and Flow Matching based state-of-the-art methods.\n","authors":["Ségolène Martin","Anne Gagneux","Paul Hagemann","Gabriele Steidl"],"pdf_url":"https://arxiv.org/pdf/2410.02423v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02420v1","updated":"2024-10-03T12:11:22Z","published":"2024-10-03T12:11:22Z","title":"LoGDesc: Local geometric features aggregation for robust point cloud\n registration","summary":" This paper introduces a new hybrid descriptor for 3D point matching and point\ncloud registration, combining local geometrical properties and learning-based\nfeature propagation for each point's neighborhood structure description. The\nproposed architecture first extracts prior geometrical information by computing\neach point's planarity, anisotropy, and omnivariance using a Principal\nComponents Analysis (PCA). This prior information is completed by a descriptor\nbased on the normal vectors estimated thanks to constructing a neighborhood\nbased on triangles. The final geometrical descriptor is propagated between the\npoints using local graph convolutions and attention mechanisms. The new feature\nextractor is evaluated on ModelNet40, Bunny Stanford dataset, KITTI and MVP\n(Multi-View Partial)-RG for point cloud registration and shows interesting\nresults, particularly on noisy and low overlapping point clouds.\n","authors":["Karim Slimani","Brahim Tamadazte","Catherine Achard"],"pdf_url":"https://arxiv.org/pdf/2410.02420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02416v1","updated":"2024-10-03T12:06:29Z","published":"2024-10-03T12:06:29Z","title":"Eliminating Oversaturation and Artifacts of High Guidance Scales in\n Diffusion Models","summary":" Classifier-free guidance (CFG) is crucial for improving both generation\nquality and alignment between the input condition and final output in diffusion\nmodels. While a high guidance scale is generally required to enhance these\naspects, it also causes oversaturation and unrealistic artifacts. In this\npaper, we revisit the CFG update rule and introduce modifications to address\nthis issue. We first decompose the update term in CFG into parallel and\northogonal components with respect to the conditional model prediction and\nobserve that the parallel component primarily causes oversaturation, while the\northogonal component enhances image quality. Accordingly, we propose\ndown-weighting the parallel component to achieve high-quality generations\nwithout oversaturation. Additionally, we draw a connection between CFG and\ngradient ascent and introduce a new rescaling and momentum method for the CFG\nupdate rule based on this insight. Our approach, termed adaptive projected\nguidance (APG), retains the quality-boosting advantages of CFG while enabling\nthe use of higher guidance scales without oversaturation. APG is easy to\nimplement and introduces practically no additional computational overhead to\nthe sampling process. 
Through extensive experiments, we demonstrate that APG is\ncompatible with various conditional diffusion models and samplers, leading to\nimproved FID, recall, and saturation scores while maintaining precision\ncomparable to CFG, making our method a superior plug-and-play alternative to\nstandard classifier-free guidance.\n","authors":["Seyedmorteza Sadat","Otmar Hilliges","Romann M. Weber"],"pdf_url":"https://arxiv.org/pdf/2410.02416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02401v1","updated":"2024-10-03T11:29:09Z","published":"2024-10-03T11:29:09Z","title":"SynCo: Synthetic Hard Negatives in Contrastive Learning for Better\n Unsupervised Visual Representations","summary":" Contrastive learning has become a dominant approach in self-supervised visual\nrepresentation learning, with hard negatives-samples that closely resemble the\nanchor-being key to enhancing the discriminative power of learned\nrepresentations. However, efficiently leveraging hard negatives remains a\nchallenge due to the difficulty in identifying and incorporating them without\nsignificantly increasing computational costs. To address this, we introduce\nSynCo (Synthetic Negatives in Contrastive learning), a novel contrastive\nlearning approach that improves model performance by generating synthetic hard\nnegatives. Built on the MoCo framework, SynCo introduces six novel strategies\nfor creating diverse synthetic hard negatives that can be generated on-the-fly\nwith minimal computational overhead. SynCo achieves faster training and better\nrepresentation learning, achieving a top-1 accuracy of 68.1% in ImageNet linear\nevaluation after only 200 epochs on pretraining, surpassing MoCo's 67.5% with\nthe same ResNet-50 encoder. Additionally, it transfers more effectively to\ndetection tasks: on the PASCAL VOC, it outperforms both the supervised baseline\nand MoCo, achieving an AP of 82.5%; on the COCO dataset, it sets a new\nbenchmark with 40.4% AP for bounding box detection and 35.4% AP for instance\nsegmentation. Our synthetic hard negative generation procedure significantly\nenhances the quality of visual representations learned through self-supervised\ncontrastive learning. Code is available at\nhttps://github.com/giakoumoglou/synco.\n","authors":["Nikolaos Giakoumoglou","Tania Stathaki"],"pdf_url":"https://arxiv.org/pdf/2410.02401v1.pdf","comment":"10 pages, 6 figures, 4 tables. arXiv admin note: text overlap with\n arXiv:2010.01028 by other authors"},{"id":"http://arxiv.org/abs/2410.02396v1","updated":"2024-10-03T11:17:58Z","published":"2024-10-03T11:17:58Z","title":"Parameter Competition Balancing for Model Merging","summary":" While fine-tuning pretrained models has become common practice, these models\noften underperform outside their specific domains. Recently developed model\nmerging techniques enable the direct integration of multiple models, each\nfine-tuned for distinct tasks, into a single model. This strategy promotes\nmultitasking capabilities without requiring retraining on the original\ndatasets. However, existing methods fall short in addressing potential\nconflicts and complex correlations between tasks, especially in parameter-level\nadjustments, posing a challenge in effectively balancing parameter competition\nacross various tasks. This paper introduces an innovative technique named\nPCB-Merging (Parameter Competition Balancing), a lightweight and training-free\ntechnique that adjusts the coefficients of each parameter for effective model\nmerging. 
PCB-Merging employs intra-balancing to gauge parameter significance\nwithin individual tasks and inter-balancing to assess parameter similarities\nacross different tasks. Parameters with low importance scores are dropped, and\nthe remaining ones are rescaled to form the final merged model. We assessed our\napproach in diverse merging scenarios, including cross-task, cross-domain, and\ncross-training configurations, as well as out-of-domain generalization. The\nexperimental results reveal that our approach achieves substantial performance\nenhancements across multiple modalities, domains, model sizes, number of tasks,\nfine-tuning forms, and large language models, outperforming existing model\nmerging methods. The code is publicly available at:\n\\url{https://github.com/duguodong7/pcb-merging}.\n","authors":["Guodong Du","Junlin Lee","Jing Li","Runhua Jiang","Yifei Guo","Shuyang Yu","Hanting Liu","Sim Kuan Goh","Ho-Kin Tang","Daojing He","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.02396v1.pdf","comment":"Accepted by NeurIPS2024"},{"id":"http://arxiv.org/abs/2410.02381v1","updated":"2024-10-03T11:01:25Z","published":"2024-10-03T11:01:25Z","title":"MetaMetrics: Calibrating Metrics For Generation Tasks Using Human\n Preferences","summary":" Understanding the quality of a performance evaluation metric is crucial for\nensuring that model outputs align with human preferences. However, it remains\nunclear how well each metric captures the diverse aspects of these preferences,\nas metrics often excel in one particular area but not across all dimensions. To\naddress this, it is essential to systematically calibrate metrics to specific\naspects of human preference, catering to the unique characteristics of each\naspect. We introduce MetaMetrics, a calibrated meta-metric designed to evaluate\ngeneration tasks across different modalities in a supervised manner.\nMetaMetrics optimizes the combination of existing metrics to enhance their\nalignment with human preferences. Our metric demonstrates flexibility and\neffectiveness in both language and vision downstream tasks, showing significant\nbenefits across various multilingual and multi-domain scenarios. MetaMetrics\naligns closely with human preferences and is highly extendable and easily\nintegrable into any application. This makes MetaMetrics a powerful tool for\nimproving the evaluation of generation tasks, ensuring that metrics are more\nrepresentative of human judgment across diverse contexts.\n","authors":["Genta Indra Winata","David Anugraha","Lucky Susanto","Garry Kuwanto","Derry Tanti Wijaya"],"pdf_url":"https://arxiv.org/pdf/2410.02381v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2409.02889v2","updated":"2024-10-03T11:01:14Z","published":"2024-09-04T17:25:21Z","title":"LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via a\n Hybrid Architecture","summary":" Expanding the long-context capabilities of Multi-modal Large Language\nModels~(MLLMs) is crucial for video understanding, high-resolution image\nunderstanding, and multi-modal agents. This involves a series of systematic\noptimizations, including model architecture, data construction and training\nstrategy, particularly addressing challenges such as \\textit{degraded\nperformance with more images} and \\textit{high computational costs}. 
In this\npaper, we adapt the model architecture to a hybrid of Mamba and Transformer\nblocks, approach data construction with both temporal and spatial dependencies\namong multiple images and employ a progressive training strategy. The released\nmodel \\textbf{LongLLaVA}~(\\textbf{Long}-Context \\textbf{L}arge\n\\textbf{L}anguage \\textbf{a}nd \\textbf{V}ision \\textbf{A}ssistant) is the first\nhybrid MLLM, which achieved a better balance between efficiency and\neffectiveness. LongLLaVA not only achieves competitive results across various\nbenchmarks, but also maintains high throughput and low memory consumption.\nEspecially, it could process nearly a thousand images on a single A100 80GB\nGPU, showing promising application prospects for a wide range of tasks.\n","authors":["Xidong Wang","Dingjie Song","Shunian Chen","Chen Zhang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02889v2.pdf","comment":"20 pages, 9 figures, 9 tables"},{"id":"http://arxiv.org/abs/2410.01536v2","updated":"2024-10-03T10:57:59Z","published":"2024-10-02T13:26:53Z","title":"EUFCC-CIR: a Composed Image Retrieval Dataset for GLAM Collections","summary":" The intersection of Artificial Intelligence and Digital Humanities enables\nresearchers to explore cultural heritage collections with greater depth and\nscale. In this paper, we present EUFCC-CIR, a dataset designed for Composed\nImage Retrieval (CIR) within Galleries, Libraries, Archives, and Museums (GLAM)\ncollections. Our dataset is built on top of the EUFCC-340K image labeling\ndataset and contains over 180K annotated CIR triplets. Each triplet is composed\nof a multi-modal query (an input image plus a short text describing the desired\nattribute manipulations) and a set of relevant target images. The EUFCC-CIR\ndataset fills an existing gap in CIR-specific resources for Digital Humanities.\nWe demonstrate the value of the EUFCC-CIR dataset by highlighting its unique\nqualities in comparison to other existing CIR datasets and evaluating the\nperformance of several zero-shot CIR baselines.\n","authors":["Francesc Net","Lluis Gomez"],"pdf_url":"https://arxiv.org/pdf/2410.01536v2.pdf","comment":"ECCV Workshop (AI4DH2024)"},{"id":"http://arxiv.org/abs/2410.02369v1","updated":"2024-10-03T10:33:49Z","published":"2024-10-03T10:33:49Z","title":"Unleashing the Potential of the Diffusion Model in Few-shot Semantic\n Segmentation","summary":" The Diffusion Model has not only garnered noteworthy achievements in the\nrealm of image generation but has also demonstrated its potential as an\neffective pretraining method utilizing unlabeled data. Drawing from the\nextensive potential unveiled by the Diffusion Model in both semantic\ncorrespondence and open vocabulary segmentation, our work initiates an\ninvestigation into employing the Latent Diffusion Model for Few-shot Semantic\nSegmentation. Recently, inspired by the in-context learning ability of large\nlanguage models, Few-shot Semantic Segmentation has evolved into In-context\nSegmentation tasks, morphing into a crucial element in assessing generalist\nsegmentation models. In this context, we concentrate on Few-shot Semantic\nSegmentation, establishing a solid foundation for the future development of a\nDiffusion-based generalist model for segmentation. Our initial focus lies in\nunderstanding how to facilitate interaction between the query image and the\nsupport image, resulting in the proposal of a KV fusion method within the\nself-attention framework. 
Subsequently, we delve deeper into optimizing the\ninfusion of information from the support mask and simultaneously re-evaluating\nhow to provide reasonable supervision from the query mask. Based on our\nanalysis, we establish a simple and effective framework named DiffewS,\nmaximally retaining the original Latent Diffusion Model's generative framework\nand effectively utilizing the pre-training prior. Experimental results\ndemonstrate that our method significantly outperforms the previous SOTA models\nin multiple settings.\n","authors":["Muzhi Zhu","Yang Liu","Zekai Luo","Chenchen Jing","Hao Chen","Guangkai Xu","Xinlong Wang","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2410.02369v1.pdf","comment":"Accepted to Proc. Annual Conference on Neural Information Processing\n Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2410.02362v1","updated":"2024-10-03T10:23:03Z","published":"2024-10-03T10:23:03Z","title":"A Comprehensive Survey of Mamba Architectures for Medical Image\n Analysis: Classification, Segmentation, Restoration and Beyond","summary":" Mamba, a special case of the State Space Model, is gaining popularity as an\nalternative to template-based deep learning approaches in medical image\nanalysis. While transformers are powerful architectures, they have drawbacks,\nincluding quadratic computational complexity and an inability to address\nlong-range dependencies efficiently. This limitation affects the analysis of\nlarge and complex datasets in medical imaging, where there are many spatial and\ntemporal relationships. In contrast, Mamba offers benefits that make it\nwell-suited for medical image analysis. It has linear time complexity, which is\na significant improvement over transformers. Mamba processes longer sequences\nwithout attention mechanisms, enabling faster inference and requiring less\nmemory. Mamba also demonstrates strong performance in merging multimodal data,\nimproving diagnosis accuracy and patient outcomes. The organization of this\npaper allows readers to appreciate the capabilities of Mamba in medical imaging\nstep by step. We begin by defining core concepts of SSMs and models, including\nS4, S5, and S6, followed by an exploration of Mamba architectures such as pure\nMamba, U-Net variants, and hybrid models with convolutional neural networks,\ntransformers, and Graph Neural Networks. We also cover Mamba optimizations,\ntechniques and adaptations, scanning, datasets, applications, experimental\nresults, and conclude with its challenges and future directions in medical\nimaging. This review aims to demonstrate the transformative potential of Mamba\nin overcoming existing barriers within medical imaging while paving the way for\ninnovative advancements in the field. 
A comprehensive list of Mamba\narchitectures applied in the medical field, reviewed in this work, is available\nat Github.\n","authors":["Shubhi Bansal","Sreeharish A","Madhava Prasath J","Manikandan S","Sreekanth Madisetty","Mohammad Zia Ur Rehman","Chandravardhan Singh Raghaw","Gaurav Duggal","Nagendra Kumar"],"pdf_url":"https://arxiv.org/pdf/2410.02362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01574v2","updated":"2024-10-03T10:11:53Z","published":"2024-10-02T14:11:29Z","title":"Fake It Until You Break It: On the Adversarial Robustness of\n AI-generated Image Detectors","summary":" While generative AI (GenAI) offers countless possibilities for creative and\nproductive tasks, artificially generated media can be misused for fraud,\nmanipulation, scams, misinformation campaigns, and more. To mitigate the risks\nassociated with maliciously generated media, forensic classifiers are employed\nto identify AI-generated content. However, current forensic classifiers are\noften not evaluated in practically relevant scenarios, such as the presence of\nan attacker or when real-world artifacts like social media degradations affect\nimages. In this paper, we evaluate state-of-the-art AI-generated image (AIGI)\ndetectors under different attack scenarios. We demonstrate that forensic\nclassifiers can be effectively attacked in realistic settings, even when the\nattacker does not have access to the target model and post-processing occurs\nafter the adversarial examples are created, which is standard on social media\nplatforms. These attacks can significantly reduce detection accuracy to the\nextent that the risks of relying on detectors outweigh their benefits. Finally,\nwe propose a simple defense mechanism to make CLIP-based detectors, which are\ncurrently the best-performing detectors, robust against these attacks.\n","authors":["Sina Mavali","Jonas Ricker","David Pape","Yash Sharma","Asja Fischer","Lea Schönherr"],"pdf_url":"https://arxiv.org/pdf/2410.01574v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02352v1","updated":"2024-10-03T10:05:27Z","published":"2024-10-03T10:05:27Z","title":"ProtoSeg: A Prototype-Based Point Cloud Instance Segmentation Method","summary":" 3D instance segmentation is crucial for obtaining an understanding of a point\ncloud scene. This paper presents a novel neural network architecture for\nperforming instance segmentation on 3D point clouds. We propose to jointly\nlearn coefficients and prototypes in parallel which can be combined to obtain\nthe instance predictions. The coefficients are computed using an overcomplete\nset of sampled points with a novel multi-scale module, dubbed dilated point\ninception. As the set of obtained instance mask predictions is overcomplete, we\nemploy a non-maximum suppression algorithm to retrieve the final predictions.\nThis approach allows to omit the time-expensive clustering step and leads to a\nmore stable inference time. The proposed method is not only 28% faster than the\nstate-of-the-art, it also exhibits the lowest standard deviation. Our\nexperiments have shown that the standard deviation of the inference time is\nonly 1.0% of the total time while it ranges between 10.8 and 53.1% for the\nstate-of-the-art methods. 
Lastly, our method outperforms the state-of-the-art\nboth on S3DIS-blocks (4.9% in mRec on Fold-5) and PartNet (2.0% on average in\nmAP).\n","authors":["Remco Royen","Leon Denis","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2410.02352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13991v3","updated":"2024-10-03T10:05:24Z","published":"2023-08-27T02:59:59Z","title":"Optimal Projections for Discriminative Dictionary Learning using the\n JL-lemma","summary":" Dimensionality reduction-based dictionary learning methods in the literature\nhave often used iterative random projections. The dimensionality of such a\nrandom projection matrix is a random number that might not lead to a separable\nsubspace structure in the transformed space. The convergence of such methods\nhighly depends on the initial seed values used. Also, gradient descent-based\nupdates might result in local minima. This paper proposes a constructive\napproach to derandomize the projection matrix using the Johnson-Lindenstrauss\nlemma. Rather than reducing dimensionality via random projections, a projection\nmatrix derived from the proposed Modified Supervised PC analysis is used. A\nheuristic is proposed to decide the data perturbation levels and the dictionary\natom's corresponding suitable description length. The projection matrix is\nderived in a single step, provides maximum feature-label consistency of the\ntransformed space, and preserves the geometry of the original data. The\nprojection matrix thus constructed is proved to be a JL-embedding. Despite\nconfusing classes in the OCR datasets, the dictionary trained in the\ntransformed space generates discriminative sparse coefficients with reduced\ncomplexity. Empirical study demonstrates that the proposed method performs well\neven when the number of classes and dimensionality increase. Experimentation on\nOCR and face recognition datasets shows better classification performance than\nother algorithms.\n","authors":["G. Madhuri","Atul Negi","Kaluri V. Rangarao"],"pdf_url":"https://arxiv.org/pdf/2308.13991v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02331v1","updated":"2024-10-03T09:29:28Z","published":"2024-10-03T09:29:28Z","title":"Self-eXplainable AI for Medical Image Analysis: A Survey and New\n Outlooks","summary":" The increasing demand for transparent and reliable models, particularly in\nhigh-stakes decision-making areas such as medical image analysis, has led to\nthe emergence of eXplainable Artificial Intelligence (XAI). Post-hoc XAI\ntechniques, which aim to explain black-box models after training, have been\ncontroversial in recent works concerning their fidelity to the models'\npredictions. In contrast, Self-eXplainable AI (S-XAI) offers a compelling\nalternative by incorporating explainability directly into the training process\nof deep learning models. This approach allows models to generate inherent\nexplanations that are closely aligned with their internal decision-making\nprocesses. Such enhanced transparency significantly supports the\ntrustworthiness, robustness, and accountability of AI systems in real-world\nmedical applications. To facilitate the development of S-XAI methods for\nmedical image analysis, this survey presents a comprehensive review across\nvarious image modalities and clinical applications. 
It covers more than 200\npapers from three key perspectives: 1) input explainability through the\nintegration of explainable feature engineering and knowledge graph, 2) model\nexplainability via attention-based learning, concept-based learning, and\nprototype-based learning, and 3) output explainability by providing\ncounterfactual explanation and textual explanation. Additionally, this paper\noutlines the desired characteristics of explainability and existing evaluation\nmethods for assessing explanation quality. Finally, it discusses the major\nchallenges and future research directions in developing S-XAI for medical image\nanalysis.\n","authors":["Junlin Hou","Sicen Liu","Yequan Bie","Hongmei Wang","Andong Tan","Luyang Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01697v2","updated":"2024-10-03T09:28:48Z","published":"2024-10-02T16:05:03Z","title":"MOREL: Enhancing Adversarial Robustness through Multi-Objective\n Representation Learning","summary":" Extensive research has shown that deep neural networks (DNNs) are vulnerable\nto slight adversarial perturbations$-$small changes to the input data that\nappear insignificant but cause the model to produce drastically different\noutputs. In addition to augmenting training data with adversarial examples\ngenerated from a specific attack method, most of the current defense strategies\nnecessitate modifying the original model architecture components to improve\nrobustness or performing test-time data purification to handle adversarial\nattacks. In this work, we demonstrate that strong feature representation\nlearning during training can significantly enhance the original model's\nrobustness. We propose MOREL, a multi-objective feature representation learning\napproach, encouraging classification models to produce similar features for\ninputs within the same class, despite perturbations. Our training method\ninvolves an embedding space where cosine similarity loss and multi-positive\ncontrastive loss are used to align natural and adversarial features from the\nmodel encoder and ensure tight clustering. Concurrently, the classifier is\nmotivated to achieve accurate predictions. Through extensive experiments, we\ndemonstrate that our approach significantly enhances the robustness of DNNs\nagainst white-box and black-box adversarial attacks, outperforming other\nmethods that similarly require no architectural changes or test-time data\npurification. Our code is available at https://github.com/salomonhotegni/MOREL\n","authors":["Sedjro Salomon Hotegni","Sebastian Peitz"],"pdf_url":"https://arxiv.org/pdf/2410.01697v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16050v2","updated":"2024-10-03T09:24:56Z","published":"2024-02-25T10:27:46Z","title":"Efficient Temporal Extrapolation of Multimodal Large Language Models\n with Temporal Grounding Bridge","summary":" Despite progress in multimodal large language models (MLLMs), the challenge\nof interpreting long-form videos in response to linguistic queries persists,\nlargely due to the inefficiency in temporal grounding and limited pre-trained\ncontext window size. In this work, we introduce Temporal Grounding Bridge\n(TGB), a novel framework that bootstraps MLLMs with advanced temporal grounding\ncapabilities and broadens their contextual scope. 
Our framework significantly\nenhances the temporal capabilities of current MLLMs through three key\ninnovations: an efficient multi-span temporal grounding algorithm applied to\nlow-dimension temporal features projected from flow; a multimodal length\nextrapolation training paradigm that utilizes low-dimension temporal features\nto extend the training context window size; and a bootstrapping framework that\nbridges our model with pluggable MLLMs without requiring annotation. We\nvalidate TGB across seven video benchmarks and demonstrate substantial\nperformance improvements compared with prior MLLMs. Notably, our model,\ninitially trained on sequences of four frames, effectively handles sequences up\nto 16 longer without sacrificing performance, highlighting its scalability and\neffectiveness in real-world applications. Our code is publicly available at\nhttps://github.com/bigai-nlco/VideoTGB\n","authors":["Yuxuan Wang","Yueqian Wang","Pengfei Wu","Jianxin Liang","Dongyan Zhao","Yang Liu","Zilong Zheng"],"pdf_url":"https://arxiv.org/pdf/2402.16050v2.pdf","comment":"To appear at EMNLP 2024"},{"id":"http://arxiv.org/abs/2311.10794v2","updated":"2024-10-03T09:10:47Z","published":"2023-11-17T03:00:29Z","title":"Text-to-Sticker: Style Tailoring Latent Diffusion Models for Human\n Expression","summary":" We introduce Style Tailoring, a recipe to finetune Latent Diffusion Models\n(LDMs) in a distinct domain with high visual quality, prompt alignment and\nscene diversity. We choose sticker image generation as the target domain, as\nthe images significantly differ from photorealistic samples typically generated\nby large-scale LDMs. We start with a competent text-to-image model, like Emu,\nand show that relying on prompt engineering with a photorealistic model to\ngenerate stickers leads to poor prompt alignment and scene diversity. To\novercome these drawbacks, we first finetune Emu on millions of sticker-like\nimages collected using weak supervision to elicit diversity. Next, we curate\nhuman-in-the-loop (HITL) Alignment and Style datasets from model generations,\nand finetune to improve prompt alignment and style alignment respectively.\nSequential finetuning on these datasets poses a tradeoff between better style\nalignment and prompt alignment gains. To address this tradeoff, we propose a\nnovel fine-tuning method called Style Tailoring, which jointly fits the content\nand style distribution and achieves best tradeoff. Evaluation results show our\nmethod improves visual quality by 14%, prompt alignment by 16.2% and scene\ndiversity by 15.3%, compared to prompt engineering the base Emu model for\nstickers generation.\n","authors":["Animesh Sinha","Bo Sun","Anmol Kalia","Arantxa Casanova","Elliot Blanchard","David Yan","Winnie Zhang","Tony Nelli","Jiahui Chen","Hardik Shah","Licheng Yu","Mitesh Kumar Singh","Ankit Ramchandani","Maziar Sanjabi","Sonal Gupta","Amy Bearman","Dhruv Mahajan"],"pdf_url":"https://arxiv.org/pdf/2311.10794v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2410.02323v1","updated":"2024-10-03T09:10:42Z","published":"2024-10-03T09:10:42Z","title":"RESSCAL3D++: Joint Acquisition and Semantic Segmentation of 3D Point\n Clouds","summary":" 3D scene understanding is crucial for facilitating seamless interaction\nbetween digital devices and the physical world. Real-time capturing and\nprocessing of the 3D scene are essential for achieving this seamless\nintegration. 
While existing approaches typically separate acquisition and\nprocessing for each frame, the advent of resolution-scalable 3D sensors offers\nan opportunity to overcome this paradigm and fully leverage the otherwise\nwasted acquisition time to initiate processing. In this study, we introduce\nVX-S3DIS, a novel point cloud dataset accurately simulating the behavior of a\nresolution-scalable 3D sensor. Additionally, we present RESSCAL3D++, an\nimportant improvement over our prior work, RESSCAL3D, by incorporating an\nupdate module and processing strategy. By applying our method to the new\ndataset, we practically demonstrate the potential of joint acquisition and\nsemantic segmentation of 3D point clouds. Our resolution-scalable approach\nsignificantly reduces scalability costs from 2% to just 0.2% in mIoU while\nachieving impressive speed-ups of 15.6 to 63.9% compared to the non-scalable\nbaseline. Furthermore, our scalable approach enables early predictions, with\nthe first one occurring after only 7% of the total inference time of the\nbaseline. The new VX-S3DIS dataset is available at\nhttps://github.com/remcoroyen/vx-s3dis.\n","authors":["Remco Royen","Kostas Pataridis","Ward van der Tempel","Adrian Munteanu"],"pdf_url":"https://arxiv.org/pdf/2410.02323v1.pdf","comment":"2024 IEEE International Conference on Image Processing (ICIP). IEEE,\n 2024"},{"id":"http://arxiv.org/abs/2105.14711v4","updated":"2024-10-03T09:08:20Z","published":"2021-05-31T05:34:27Z","title":"CTSpine1K: A Large-Scale Dataset for Spinal Vertebrae Segmentation in\n Computed Tomography","summary":" Spine-related diseases have high morbidity and cause a huge burden of social\ncost. Spine imaging is an essential tool for noninvasively visualizing and\nassessing spinal pathology. Segmenting vertebrae in computed tomography (CT)\nimages is the basis of quantitative medical image analysis for clinical\ndiagnosis and surgery planning of spine diseases. Current publicly available\nannotated datasets on spinal vertebrae are small in size. Due to the lack of a\nlarge-scale annotated spine image dataset, the mainstream deep learning-based\nsegmentation methods, which are data-driven, are heavily restricted. In this\npaper, we introduce a large-scale spine CT dataset, called CTSpine1K, curated\nfrom multiple sources for vertebra segmentation, which contains 1,005 CT\nvolumes with over 11,100 labeled vertebrae belonging to different spinal\nconditions. Based on this dataset, we conduct several spinal vertebrae\nsegmentation experiments to set the first benchmark. We believe that this\nlarge-scale dataset will facilitate further research in many spine-related\nimage analysis tasks, including but not limited to vertebrae segmentation,\nlabeling, 3D spine reconstruction from biplanar radiographs, image\nsuper-resolution, and enhancement.\n","authors":["Yang Deng","Ce Wang","Yuan Hui","Qian Li","Jun Li","Shiwei Luo","Mengke Sun","Quan Quan","Shuxin Yang","You Hao","Pengbo Liu","Honghu Xiao","Chunpeng Zhao","Xinbao Wu","S. 
Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2105.14711v4.pdf","comment":"Accepted by MICCAI2024 Open Data for oral presentation and will be\n published as a part of the journal MELBA special issue"},{"id":"http://arxiv.org/abs/2405.09589v4","updated":"2024-10-03T09:00:35Z","published":"2024-05-15T10:16:25Z","title":"A Comprehensive Survey of Hallucination in Large Language, Image, Video\n and Audio Foundation Models","summary":" The rapid advancement of foundation models (FMs) across language, image,\naudio, and video domains has shown remarkable capabilities in diverse tasks.\nHowever, the proliferation of FMs brings forth a critical challenge: the\npotential to generate hallucinated outputs, particularly in high-stakes\napplications. The tendency of foundation models to produce hallucinated content\narguably represents the biggest hindrance to their widespread adoption in\nreal-world scenarios, especially in domains where reliability and accuracy are\nparamount. This survey paper presents a comprehensive overview of recent\ndevelopments that aim to identify and mitigate the problem of hallucination in\nFMs, spanning text, image, video, and audio modalities. By synthesizing recent\nadvancements in detecting and mitigating hallucination across various\nmodalities, the paper aims to provide valuable insights for researchers,\ndevelopers, and practitioners. Essentially, it establishes a clear framework\nencompassing definition, taxonomy, and detection strategies for addressing\nhallucination in multimodal foundation models, laying the foundation for future\nresearch in this pivotal area.\n","authors":["Pranab Sahoo","Prabhash Meharia","Akash Ghosh","Sriparna Saha","Vinija Jain","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2405.09589v4.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2406.15735v2","updated":"2024-10-03T08:52:55Z","published":"2024-06-22T04:56:16Z","title":"Identifying and Solving Conditional Image Leakage in Image-to-Video\n Diffusion Model","summary":" Diffusion models have obtained substantial progress in image-to-video\ngeneration. However, in this paper, we find that these models tend to generate\nvideos with less motion than expected. We attribute this to the issue called\nconditional image leakage, where the image-to-video diffusion models (I2V-DMs)\ntend to over-rely on the conditional image at large time steps. We further\naddress this challenge from both inference and training aspects. First, we\npropose to start the generation process from an earlier time step to avoid the\nunreliable large-time steps of I2V-DMs, as well as an initial noise\ndistribution with optimal analytic expressions (Analytic-Init) by minimizing\nthe KL divergence between it and the actual marginal distribution to bridge the\ntraining-inference gap. Second, we design a time-dependent noise distribution\n(TimeNoise) for the conditional image during training, applying higher noise\nlevels at larger time steps to disrupt it and reduce the model's dependency on\nit. We validate these general strategies on various I2V-DMs on our collected\nopen-domain image benchmark and the UCF101 dataset. Extensive results show that\nour methods outperform baselines by producing higher motion scores with lower\nerrors while maintaining image alignment and temporal consistency, thereby\nyielding superior overall performance and enabling more accurate motion\ncontrol. 
The project page: \\url{https://cond-image-leak.github.io/}.\n","authors":["Min Zhao","Hongzhou Zhu","Chendong Xiang","Kaiwen Zheng","Chongxuan Li","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.15735v2.pdf","comment":"NeurIPS 2024. Project page: https://cond-image-leak.github.io/"},{"id":"http://arxiv.org/abs/2410.02316v1","updated":"2024-10-03T08:52:21Z","published":"2024-10-03T08:52:21Z","title":"CTARR: A fast and robust method for identifying anatomical regions on CT\n images via atlas registration","summary":" Medical image analysis tasks often focus on regions or structures located in\na particular location within the patient's body. Often large parts of the image\nmay not be of interest for the image analysis task. When using deep-learning\nbased approaches, this causes an unnecessary increase in the computational burden\nduring inference and raises the chance of errors. In this paper, we introduce\nCTARR, a novel generic method for CT Anatomical Region Recognition. The method\nserves as a pre-processing step for any deep learning-based CT image analysis\npipeline by automatically identifying the pre-defined anatomical region that is\nrelevant for the follow-up task and removing the rest. It can be used in (i)\nimage segmentation to prevent false positives in anatomically implausible\nregions and speed up the inference, (ii) image classification to produce\nimage crops that are consistent in their anatomical context, and (iii) image\nregistration by serving as a fast pre-registration step. Our proposed method is\nbased on atlas registration and provides a fast and robust way to crop any\nanatomical region encoded as one or multiple bounding box(es) from any\nunlabeled CT scan of the brain, chest, abdomen and/or pelvis. We demonstrate\nthe utility and robustness of the proposed method in the context of medical\nimage segmentation by evaluating it on six datasets of public segmentation\nchallenges. The foreground voxels in the regions of interest are preserved in\nthe vast majority of cases and tasks (97.45-100%) while taking only fractions\nof a second to compute (0.1-0.21s) on a deep learning workstation and greatly\nreducing the segmentation runtime (2.0-12.7x). Our code is available at\nhttps://github.com/ThomasBudd/ctarr.\n","authors":["Thomas Buddenkotte","Roland Opfer","Julia Krüger","Alessa Hering","Mireia Crispin-Ortuzar"],"pdf_url":"https://arxiv.org/pdf/2410.02316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02309v1","updated":"2024-10-03T08:46:17Z","published":"2024-10-03T08:46:17Z","title":"Decoupling Layout from Glyph in Online Chinese Handwriting Generation","summary":" Text plays a crucial role in the transmission of human civilization, and\nteaching machines to generate online handwritten text in various styles\npresents an interesting and significant challenge. However, most prior work has\nconcentrated on generating individual Chinese fonts, leaving complete text\nline generation largely unexplored. In this paper, we identify that text lines\ncan naturally be divided into two components: layout and glyphs. Based on this\ndivision, we designed a text line layout generator coupled with a\ndiffusion-based stylized font synthesizer to address this challenge\nhierarchically. More concretely, the layout generator performs in-context-like\nlearning based on the text content and the provided style references to\ngenerate positions for each glyph autoregressively. 
Meanwhile, the font\nsynthesizer which consists of a character embedding dictionary, a multi-scale\ncalligraphy style encoder, and a 1D U-Net based diffusion denoiser will\ngenerate each font on its position while imitating the calligraphy style\nextracted from the given style references. Qualitative and quantitative\nexperiments on the CASIA-OLHWDB demonstrate that our method is capable of\ngenerating structurally correct and indistinguishable imitation samples.\n","authors":["Ren-Min Si","Yan-Ming Zhang","Yi Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02305v1","updated":"2024-10-03T08:40:14Z","published":"2024-10-03T08:40:14Z","title":"The Comparison of Individual Cat Recognition Using Neural Networks","summary":" Facial recognition using deep learning has been widely used in social life\nfor applications such as authentication, smart door locks, and photo grouping,\netc. More and more networks have been developed to facilitate computer vision\ntasks, such as ResNet, DenseNet, EfficientNet, ConvNeXt, and Siamese networks.\nHowever, few studies have systematically compared the advantages and\ndisadvantages of such neural networks in identifying individuals from images,\nespecially for pet animals like cats. In the present study, by systematically\ncomparing the efficacy of different neural networks in cat recognition, we\nfound traditional CNNs trained with transfer learning have better performance\nthan models trained with the fine-tuning method or Siamese networks in\nindividual cat recognition. In addition, ConvNeXt and DenseNet yield\nsignificant results which could be further optimized for individual cat\nrecognition in pet stores and in the wild. These results provide a method to\nimprove cat management in pet stores and monitoring of cats in the wild.\n","authors":["Mingxuan Li","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.02305v1.pdf","comment":"13 pages,7 figures"},{"id":"http://arxiv.org/abs/2410.02304v1","updated":"2024-10-03T08:39:06Z","published":"2024-10-03T08:39:06Z","title":"A Novel Method for Accurate & Real-time Food Classification: The\n Synergistic Integration of EfficientNetB7, CBAM, Transfer Learning, and Data\n Augmentation","summary":" Integrating artificial intelligence into modern society is profoundly\ntransformative, significantly enhancing productivity by streamlining various\ndaily tasks. AI-driven recognition systems provide notable advantages in the\nfood sector, including improved nutrient tracking, tackling food waste, and\nboosting food production and consumption efficiency. Accurate food\nclassification is a crucial initial step in utilizing advanced AI models, as\nthe effectiveness of this process directly influences the success of subsequent\noperations; therefore, achieving high accuracy at a reasonable speed is\nessential. Despite existing research efforts, a gap persists in improving\nperformance while ensuring rapid processing times, prompting researchers to\npursue cost-effective and precise models. This study addresses this gap by\nemploying the state-of-the-art EfficientNetB7 architecture, enhanced through\ntransfer learning, data augmentation, and the CBAM attention module. This\nmethodology results in a robust model that surpasses previous studies in\naccuracy while maintaining rapid processing suitable for real-world\napplications. 
The Food11 dataset from Kaggle was utilized, comprising 16643\nimbalanced images across 11 diverse classes with significant intra-category\ndiversities and inter-category similarities. Furthermore, the proposed\nmethodology, bolstered by various deep learning techniques, consistently\nachieves an impressive average accuracy of 96.40%. Notably, it can classify\nover 60 images within one second during inference on unseen data, demonstrating\nits ability to deliver high accuracy promptly. This underscores its potential\nfor practical applications in accurate food classification and enhancing\nefficiency in subsequent processes.\n","authors":["Shayan Rokhva","Babak Teimourpour"],"pdf_url":"https://arxiv.org/pdf/2410.02304v1.pdf","comment":"20 pages, six figures, two tables"},{"id":"http://arxiv.org/abs/2406.00093v2","updated":"2024-10-03T08:20:17Z","published":"2024-05-31T17:59:56Z","title":"Bootstrap3D: Improving Multi-view Diffusion Model with Synthetic Data","summary":" Recent years have witnessed remarkable progress in multi-view diffusion\nmodels for 3D content creation. However, there remains a significant gap in\nimage quality and prompt-following ability compared to 2D diffusion models. A\ncritical bottleneck is the scarcity of high-quality 3D objects with detailed\ncaptions. To address this challenge, we propose Bootstrap3D, a novel framework\nthat automatically generates an arbitrary quantity of multi-view images to\nassist in training multi-view diffusion models. Specifically, we introduce a\ndata generation pipeline that employs (1) 2D and video diffusion models to\ngenerate multi-view images based on constructed text prompts, and (2) our\nfine-tuned 3D-aware MV-LLaVA for filtering high-quality data and rewriting\ninaccurate captions. Leveraging this pipeline, we have generated 1 million\nhigh-quality synthetic multi-view images with dense descriptive captions to\naddress the shortage of high-quality 3D data. Furthermore, we present a\nTraining Timestep Reschedule (TTR) strategy that leverages the denoising\nprocess to learn multi-view consistency while maintaining the original 2D\ndiffusion prior. Extensive experiments demonstrate that Bootstrap3D can\ngenerate high-quality multi-view images with superior aesthetic quality,\nimage-text alignment, and maintained view consistency.\n","authors":["Zeyi Sun","Tong Wu","Pan Zhang","Yuhang Zang","Xiaoyi Dong","Yuanjun Xiong","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2406.00093v2.pdf","comment":"Project Page: https://sunzey.github.io/Bootstrap3D/"},{"id":"http://arxiv.org/abs/2410.02288v1","updated":"2024-10-03T08:13:26Z","published":"2024-10-03T08:13:26Z","title":"Computer-aided Colorization State-of-the-science: A Survey","summary":" This paper reviews published research in the field of computer-aided\ncolorization technology. We argue that the colorization task originates from\ncomputer graphics, prospers by introducing computer vision, and tends to the\nfusion of vision and graphics, so we put forward our taxonomy and organize the\nwhole paper chronologically. We extend the existing reconstruction-based\ncolorization evaluation techniques, considering that aesthetic assessment of\ncolored images should be introduced to ensure that colorization satisfies human\nvisual-related requirements and emotions more closely. We perform the\ncolorization aesthetic assessment on seven representative unconditional\ncolorization models and discuss the difference between our assessment and the\nexisting reconstruction-based metrics. 
Finally, this paper identifies\nunresolved issues and proposes fruitful areas for future research and\ndevelopment. Access to the project associated with this survey can be obtained\nat https://github.com/DanielCho-HK/Colorization.\n","authors":["Yu Cao","Xin Duan","Xiangqiao Meng","P. Y. Mok","Ping Li","Tong-Yee Lee"],"pdf_url":"https://arxiv.org/pdf/2410.02288v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02268v1","updated":"2024-10-03T07:40:14Z","published":"2024-10-03T07:40:14Z","title":"Structural-Entropy-Based Sample Selection for Efficient and Effective\n Learning","summary":" Sample selection improves the efficiency and effectiveness of machine\nlearning models by providing informative and representative samples. Typically,\nsamples can be modeled as a sample graph, where nodes are samples and edges\nrepresent their similarities. Most existing methods are based on local\ninformation, such as the training difficulty of samples, thereby overlooking\nglobal information, such as connectivity patterns. This oversight can result in\nsuboptimal selection because global information is crucial for ensuring that\nthe selected samples well represent the structural properties of the graph. To\naddress this issue, we employ structural entropy to quantify global information\nand losslessly decompose it from the whole graph to individual nodes using the\nShapley value. Based on the decomposition, we present\n$\\textbf{S}$tructural-$\\textbf{E}$ntropy-based sample $\\textbf{S}$election\n($\\textbf{SES}$), a method that integrates both global and local information to\nselect informative and representative samples. SES begins by constructing a\n$k$NN-graph among samples based on their similarities. It then measures sample\nimportance by combining structural entropy (global metric) with training\ndifficulty (local metric). Finally, SES applies importance-biased blue noise\nsampling to select a set of diverse and representative samples. Comprehensive\nexperiments on three learning scenarios -- supervised learning, active\nlearning, and continual learning -- clearly demonstrate the effectiveness of\nour method.\n","authors":["Tianchi Xie","Jiangning Zhu","Guozu Ma","Minzhi Lin","Wei Chen","Weikai Yang","Shixia Liu"],"pdf_url":"https://arxiv.org/pdf/2410.02268v1.pdf","comment":"Submitted to ICLR 2025"},{"id":"http://arxiv.org/abs/2404.19460v2","updated":"2024-10-03T07:07:25Z","published":"2024-04-30T11:19:05Z","title":"AttackBench: Evaluating Gradient-based Attacks for Adversarial Examples","summary":" Adversarial examples are typically optimized with gradient-based attacks.\nWhile novel attacks are continuously proposed, each is shown to outperform its\npredecessors using different experimental setups, hyperparameter settings, and\nnumber of forward and backward calls to the target models. This provides\noverly-optimistic and even biased evaluations that may unfairly favor one\nparticular attack over the others. In this work, we aim to overcome these\nlimitations by proposing AttackBench, i.e., the first evaluation framework that\nenables a fair comparison among different attacks. To this end, we first\npropose a categorization of gradient-based attacks, identifying their main\ncomponents and differences. We then introduce our framework, which evaluates\ntheir effectiveness and efficiency. 
We measure these characteristics by (i)\ndefining an optimality metric that quantifies how close an attack is to the\noptimal solution, and (ii) limiting the number of forward and backward queries\nto the model, such that all attacks are compared within a given maximum query\nbudget. Our extensive experimental analysis compares more than $100$ attack\nimplementations with a total of over $800$ different configurations against\nCIFAR-10 and ImageNet models, highlighting that only very few attacks\noutperform all the competing approaches. Within this analysis, we shed light on\nseveral implementation issues that prevent many attacks from finding better\nsolutions or running at all. We release AttackBench as a publicly-available\nbenchmark, aiming to continuously update it to include and evaluate novel\ngradient-based attacks for optimizing adversarial examples.\n","authors":["Antonio Emanuele Cinà","Jérôme Rony","Maura Pintor","Luca Demetrio","Ambra Demontis","Battista Biggio","Ismail Ben Ayed","Fabio Roli"],"pdf_url":"https://arxiv.org/pdf/2404.19460v2.pdf","comment":"https://attackbench.github.io"},{"id":"http://arxiv.org/abs/2410.02250v1","updated":"2024-10-03T06:43:09Z","published":"2024-10-03T06:43:09Z","title":"Probabilistic road classification in historical maps using synthetic\n data and deep learning","summary":" Historical maps are invaluable for analyzing long-term changes in\ntransportation and spatial development, offering a rich source of data for\nevolutionary studies. However, digitizing and classifying road networks from\nthese maps is often expensive and time-consuming, limiting their widespread\nuse. Recent advancements in deep learning have made automatic road extraction\nfrom historical maps feasible, yet these methods typically require large\namounts of labeled training data. To address this challenge, we introduce a\nnovel framework that integrates deep learning with geoinformation,\ncomputer-based painting, and image processing methodologies. This framework\nenables the extraction and classification of roads from historical maps using\nonly road geometries without needing road class labels for training. The\nprocess begins with training of a binary segmentation model to extract road\ngeometries, followed by morphological operations, skeletonization,\nvectorization, and filtering algorithms. Synthetic training data is then\ngenerated by a painting function that artificially re-paints road segments\nusing predefined symbology for road classes. Using this synthetic data, a deep\nensemble is trained to generate pixel-wise probabilities for road classes to\nmitigate distribution shift. These predictions are then discretized along the\nextracted road geometries. Subsequently, further processing is employed to\nclassify entire roads, enabling the identification of potential changes in road\nclasses and resulting in a labeled road class dataset. Our method achieved\ncompleteness and correctness scores of over 94% and 92%, respectively, for road\nclass 2, the most prevalent class in the two Siegfried Map sheets from\nSwitzerland used for testing. This research offers a powerful tool for urban\nplanning and transportation decision-making by efficiently extracting and\nclassifying roads from historical maps.\n","authors":["Dominik J. 
Mühlematter","Sebastian Schweizer","Chenjing Jiao","Xue Xia","Magnus Heitzler","Lorenz Hurni"],"pdf_url":"https://arxiv.org/pdf/2410.02250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02249v1","updated":"2024-10-03T06:41:10Z","published":"2024-10-03T06:41:10Z","title":"Spiking Neural Network as Adaptive Event Stream Slicer","summary":" Event-based cameras are attracting significant interest as they provide rich\nedge information, high dynamic range, and high temporal resolution. Many\nstate-of-the-art event-based algorithms rely on splitting the events into fixed\ngroups, resulting in the omission of crucial temporal information, particularly\nwhen dealing with diverse motion scenarios (e.g., high/low speed). In this\nwork, we propose SpikeSlicer, a novel-designed plug-and-play event processing\nmethod capable of splitting events stream adaptively. SpikeSlicer utilizes a\nlightweight (0.41M) and low-energy spiking neural network (SNN) to trigger\nevent slicing. To guide the SNN to fire spikes at optimal time steps, we\npropose the Spiking Position-aware Loss (SPA-Loss) to modulate the neuron's\nstate. Additionally, we develop a Feedback-Update training strategy that\nrefines the slicing decisions using feedback from the downstream artificial\nneural network (ANN). Extensive experiments demonstrate that our method yields\nsignificant performance improvements in event-based object tracking and\nrecognition. Notably, SpikeSlicer provides a brand-new SNN-ANN cooperation\nparadigm, where the SNN acts as an efficient, low-energy data processor to\nassist the ANN in improving downstream performance, injecting new perspectives\nand potential avenues of exploration.\n","authors":["Jiahang Cao","Mingyuan Sun","Ziqing Wang","Hao Cheng","Qiang Zhang","Shibo Zhou","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2410.02249v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.02230v2","updated":"2024-10-03T06:36:14Z","published":"2024-06-04T11:48:44Z","title":"I4VGen: Image as Free Stepping Stone for Text-to-Video Generation","summary":" Text-to-video generation has trailed behind text-to-image generation in terms\nof quality and diversity, primarily due to the inherent complexities of\nspatio-temporal modeling and the limited availability of video-text datasets.\nRecent text-to-video diffusion models employ the image as an intermediate step,\nsignificantly enhancing overall performance but incurring high training costs.\nIn this paper, we present I4VGen, a novel video diffusion inference pipeline to\nleverage advanced image techniques to enhance pre-trained text-to-video\ndiffusion models, which requires no additional training. Instead of the vanilla\ntext-to-video inference pipeline, I4VGen consists of two stages: anchor image\nsynthesis and anchor image-augmented text-to-video synthesis. Correspondingly,\na simple yet effective generation-selection strategy is employed to achieve\nvisually-realistic and semantically-faithful anchor image, and an innovative\nnoise-invariant video score distillation sampling (NI-VSDS) is developed to\nanimate the image to a dynamic video by distilling motion knowledge from video\ndiffusion models, followed by a video regeneration process to refine the video.\nExtensive experiments show that the proposed method produces videos with higher\nvisual realism and textual fidelity. 
Furthermore, I4VGen also supports being\nseamlessly integrated into existing image-to-video diffusion models, thereby\nimproving overall video quality.\n","authors":["Xiefan Guo","Jinlin Liu","Miaomiao Cui","Liefeng Bo","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2406.02230v2.pdf","comment":"Project page: https://xiefan-guo.github.io/i4vgen"},{"id":"http://arxiv.org/abs/2410.02244v1","updated":"2024-10-03T06:33:43Z","published":"2024-10-03T06:33:43Z","title":"Visual Prompting in LLMs for Enhancing Emotion Recognition","summary":" Vision Large Language Models (VLLMs) are transforming the intersection of\ncomputer vision and natural language processing. Nonetheless, the potential of\nusing visual prompts for emotion recognition in these models remains largely\nunexplored and untapped. Traditional methods in VLLMs struggle with spatial\nlocalization and often discard valuable global context. To address this\nproblem, we propose a Set-of-Vision prompting (SoV) approach that enhances\nzero-shot emotion recognition by using spatial information, such as bounding\nboxes and facial landmarks, to mark targets precisely. SoV improves accuracy in\nface count and emotion categorization while preserving the enriched image\ncontext. Through a battery of experimentation and analysis of recent commercial\nor open-source VLLMs, we evaluate the SoV model's ability to comprehend facial\nexpressions in natural environments. Our findings demonstrate the effectiveness\nof integrating spatial visual prompts into VLLMs for improving emotion\nrecognition performance.\n","authors":["Qixuan Zhang","Zhifeng Wang","Dylan Zhang","Wenjia Niu","Sabrina Caldwell","Tom Gedeon","Yang Liu","Zhenyue Qin"],"pdf_url":"https://arxiv.org/pdf/2410.02244v1.pdf","comment":"Accepted by EMNLP2024 (Main, Long paper)"},{"id":"http://arxiv.org/abs/2409.18147v2","updated":"2024-10-03T06:29:20Z","published":"2024-09-25T02:41:58Z","title":"SSP-RACL: Classification of Noisy Fundus Images with Self-Supervised\n Pretraining and Robust Adaptive Credal Loss","summary":" Fundus image classification is crucial in the computer aided diagnosis tasks,\nbut label noise significantly impairs the performance of deep neural networks.\nTo address this challenge, we propose a robust framework, Self-Supervised\nPre-training with Robust Adaptive Credal Loss (SSP-RACL), for handling label\nnoise in fundus image datasets. First, we use Masked Autoencoders (MAE) for\npre-training to extract features, unaffected by label noise. Subsequently, RACL\nemploy a superset learning framework, setting confidence thresholds and\nadaptive label relaxation parameter to construct possibility distributions and\nprovide more reliable ground-truth estimates, thus effectively suppressing the\nmemorization effect. 
Additionally, we introduce clinical knowledge-based\nasymmetric noise generation to simulate real-world noisy fundus image datasets.\nExperimental results demonstrate that our proposed method outperforms existing\napproaches in handling label noise, showing superior performance.\n","authors":["Mengwen Ye","Yingzi Huangfu","You Li","Zekuan Yu"],"pdf_url":"https://arxiv.org/pdf/2409.18147v2.pdf","comment":"IEEE BioCAS 2024"},{"id":"http://arxiv.org/abs/2406.13527v3","updated":"2024-10-03T06:26:49Z","published":"2024-06-19T13:11:02Z","title":"4K4DGen: Panoramic 4D Generation at 4K Resolution","summary":" The blooming of virtual reality and augmented reality (VR/AR) technologies\nhas driven an increasing demand for the creation of high-quality, immersive,\nand dynamic environments. However, existing generative techniques either focus\nsolely on dynamic objects or perform outpainting from a single perspective\nimage, failing to meet the requirements of VR/AR applications that need\nfree-viewpoint, 360$^{\\circ}$ virtual views where users can move in all\ndirections. In this work, we tackle the challenging task of elevating a single\npanorama to an immersive 4D experience. For the first time, we demonstrate the\ncapability to generate omnidirectional dynamic scenes with 360$^{\\circ}$ views\nat 4K (4096 $\\times$ 2048) resolution, thereby providing an immersive user\nexperience. Our method introduces a pipeline that facilitates natural scene\nanimations and optimizes a set of dynamic Gaussians using efficient splatting\ntechniques for real-time exploration. To overcome the lack of scene-scale\nannotated 4D data and models, especially in panoramic formats, we propose a\nnovel \\textbf{Panoramic Denoiser} that adapts generic 2D diffusion priors to\nanimate consistently in 360$^{\\circ}$ images, transforming them into panoramic\nvideos with dynamic scenes at targeted regions. Subsequently, we propose\n\\textbf{Dynamic Panoramic Lifting} to elevate the panoramic video into a 4D\nimmersive environment while preserving spatial and temporal consistency. By\ntransferring prior knowledge from 2D models in the perspective domain to the\npanoramic domain and the 4D lifting with spatial appearance and geometry\nregularization, we achieve high-quality Panorama-to-4D generation at a\nresolution of 4K for the first time.\n","authors":["Renjie Li","Panwang Pan","Bangbang Yang","Dejia Xu","Shijie Zhou","Xuanyang Zhang","Zeming Li","Achuta Kadambi","Zhangyang Wang","Zhengzhong Tu","Zhiwen Fan"],"pdf_url":"https://arxiv.org/pdf/2406.13527v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02240v1","updated":"2024-10-03T06:25:53Z","published":"2024-10-03T06:25:53Z","title":"SCA: Highly Efficient Semantic-Consistent Unrestricted Adversarial\n Attack","summary":" Unrestricted adversarial attacks typically manipulate the semantic content of\nan image (e.g., color or texture) to create adversarial examples that are both\neffective and photorealistic. Recent works have utilized the diffusion\ninversion process to map images into a latent space, where high-level semantics\nare manipulated by introducing perturbations. However, they often results in\nsubstantial semantic distortions in the denoised output and suffers from low\nefficiency. 
In this study, we propose a novel framework called\nSemantic-Consistent Unrestricted Adversarial Attacks (SCA), which employs an\ninversion method to extract edit-friendly noise maps and utilizes a Multimodal\nLarge Language Model (MLLM) to provide semantic guidance throughout the\nprocess. Under the condition of rich semantic information provided by MLLM, we\nperform the DDPM denoising process of each step using a series of edit-friendly\nnoise maps, and leverage DPM Solver++ to accelerate this process, enabling\nefficient sampling with semantic consistency. Compared to existing methods, our\nframework enables the efficient generation of adversarial examples that exhibit\nminimal discernible semantic changes. Consequently, we for the first time\nintroduce Semantic-Consistent Adversarial Examples (SCAE). Extensive\nexperiments and visualizations have demonstrated the high efficiency of SCA,\nparticularly in being on average 12 times faster than the state-of-the-art\nattacks. Our code can be found at\nhttps://github.com/Pan-Zihao/SCA.\n","authors":["Zihao Pan","Weibin Wu","Yuhang Cao","Zibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2410.02240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02237v1","updated":"2024-10-03T06:16:50Z","published":"2024-10-03T06:16:50Z","title":"Key-Grid: Unsupervised 3D Keypoints Detection using Grid Heatmap\n Features","summary":" Detecting 3D keypoints with semantic consistency is widely used in many\nscenarios such as pose estimation, shape registration and robotics. Currently,\nmost unsupervised 3D keypoint detection methods focus on the rigid-body\nobjects. However, when faced with deformable objects, the keypoints they\nidentify do not preserve semantic consistency well. In this paper, we introduce\nan innovative unsupervised keypoint detector Key-Grid for both the rigid-body\nand deformable objects, which is an autoencoder framework. The encoder predicts\nkeypoints and the decoder utilizes the generated keypoints to reconstruct the\nobjects. Unlike previous work, we leverage the identified keypoint information\nto form a 3D grid feature heatmap called grid heatmap, which is used in the\ndecoder section. Grid heatmap is a novel concept that represents the latent\nvariables for grid points sampled uniformly in the 3D cubic space, where these\nvariables are the shortest distance between the grid points and the skeleton\nconnected by keypoint pairs. Meanwhile, we incorporate the information from\neach layer of the encoder into the decoder section. We conduct an extensive\nevaluation of Key-Grid on a list of benchmark datasets. Key-Grid achieves the\nstate-of-the-art performance on the semantic consistency and position accuracy\nof keypoints. Moreover, we demonstrate the robustness of Key-Grid to noise and\ndownsampling. In addition, we achieve SE(3) invariance of keypoints through\ngeneralizing Key-Grid to a SE(3)-invariant backbone.\n","authors":["Chengkai Hou","Zhengrong Xue","Bingyang Zhou","Jinghan Ke","Lin Shao","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2410.02237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01506v2","updated":"2024-10-03T05:50:09Z","published":"2024-10-02T12:58:55Z","title":"LEGO: Learnable Expansion of Graph Operators for Multi-Modal Feature\n Fusion","summary":" In computer vision tasks, features often come from diverse representations,\ndomains, and modalities, such as text, images, and videos. 
Effectively fusing\nthese features is essential for robust performance, especially with the\navailability of powerful pre-trained models like vision-language models.\nHowever, common fusion methods, such as concatenation, element-wise operations,\nand non-linear techniques, often fail to capture structural relationships, deep\nfeature interactions, and suffer from inefficiency or misalignment of features\nacross domains. In this paper, we shift from high-dimensional feature space to\na lower-dimensional, interpretable graph space by constructing similarity\ngraphs that encode feature relationships at different levels, e.g., clip,\nframe, patch, token, etc. To capture deeper interactions, we use graph power\nexpansions and introduce a learnable graph fusion operator to combine these\ngraph powers for more effective fusion. Our approach is relationship-centric,\noperates in a homogeneous space, and is mathematically principled, resembling\nelement-wise similarity score aggregation via multilinear polynomials. We\ndemonstrate the effectiveness of our graph-based fusion method on video anomaly\ndetection, showing strong performance across multi-representational,\nmulti-modal, and multi-domain feature fusion tasks.\n","authors":["Dexuan Ding","Lei Wang","Liyun Zhu","Tom Gedeon","Piotr Koniusz"],"pdf_url":"https://arxiv.org/pdf/2410.01506v2.pdf","comment":"Research paper"},{"id":"http://arxiv.org/abs/2410.02224v1","updated":"2024-10-03T05:45:24Z","published":"2024-10-03T05:45:24Z","title":"Efficient Semantic Segmentation via Lightweight Multiple-Information\n Interaction Network","summary":" Recently, the integration of the local modeling capabilities of Convolutional\nNeural Networks (CNNs) with the global dependency strengths of Transformers has\ncreated a sensation in the semantic segmentation community. However,\nsubstantial computational workloads and high hardware memory demands remain\nmajor obstacles to their further application in real-time scenarios. In this\nwork, we propose a lightweight multiple-information interaction network for\nreal-time semantic segmentation, called LMIINet, which effectively combines\nCNNs and Transformers while reducing redundant computations and memory\nfootprint. It features Lightweight Feature Interaction Bottleneck (LFIB)\nmodules comprising efficient convolutions that enhance context integration.\nAdditionally, improvements are made to the Flatten Transformer by enhancing\nlocal and global feature interaction to capture detailed semantic information.\nThe incorporation of a combination coefficient learning scheme in both LFIB and\nTransformer blocks facilitates improved feature interaction. Extensive\nexperiments demonstrate that LMIINet excels in balancing accuracy and\nefficiency. 
With only 0.72M parameters and 11.74G FLOPs, LMIINet achieves 72.0%\nmIoU at 100 FPS on the Cityscapes test set and 69.94% mIoU at 160 FPS on the\nCamVid test dataset using a single RTX2080Ti GPU.\n","authors":["Yangyang Qiu","Guoan Xu","Guangwei Gao","Zhenhua Guo","Yi Yu","Chia-Wen Lin"],"pdf_url":"https://arxiv.org/pdf/2410.02224v1.pdf","comment":"10 pages, 6 figures, 9 tables"},{"id":"http://arxiv.org/abs/2410.02221v1","updated":"2024-10-03T05:32:16Z","published":"2024-10-03T05:32:16Z","title":"Capturing complex hand movements and object interactions using machine\n learning-powered stretchable smart textile gloves","summary":" Accurate real-time tracking of dexterous hand movements and interactions has\nnumerous applications in human-computer interaction, metaverse, robotics, and\ntele-health. Capturing realistic hand movements is challenging because of the\nlarge number of articulations and degrees of freedom. Here, we report accurate\nand dynamic tracking of articulated hand and finger movements using\nstretchable, washable smart gloves with embedded helical sensor yarns and\ninertial measurement units. The sensor yarns have a high dynamic range,\nresponding to low 0.005 % to high 155 % strains, and show stability during\nextensive use and washing cycles. We use multi-stage machine learning to report\naverage joint angle estimation root mean square errors of 1.21 and 1.45 degrees\nfor intra- and inter-subjects cross-validation, respectively, matching accuracy\nof costly motion capture cameras without occlusion or field of view\nlimitations. We report a data augmentation technique that enhances robustness\nto noise and variations of sensors. We demonstrate accurate tracking of\ndexterous hand movements during object interactions, opening new avenues of\napplications including accurate typing on a mock paper keyboard, recognition of\ncomplex dynamic and static gestures adapted from American Sign Language and\nobject identification.\n","authors":["Arvin Tashakori","Zenan Jiang","Amir Servati","Saeid Soltanian","Harishkumar Narayana","Katherine Le","Caroline Nakayama","Chieh-ling Yang","Z. Jane Wang","Janice J. Eng","Peyman Servati"],"pdf_url":"https://arxiv.org/pdf/2410.02221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02217v1","updated":"2024-10-03T05:18:28Z","published":"2024-10-03T05:18:28Z","title":"Stochastic Sampling from Deterministic Flow Models","summary":" Deterministic flow models, such as rectified flows, offer a general framework\nfor learning a deterministic transport map between two distributions, realized\nas the vector field for an ordinary differential equation (ODE). However, they\nare sensitive to model estimation and discretization errors and do not permit\ndifferent samples conditioned on an intermediate state, limiting their\napplication. We present a general method to turn the underlying ODE of such\nflow models into a family of stochastic differential equations (SDEs) that have\nthe same marginal distributions. This method permits us to derive families of\n\\emph{stochastic samplers}, for fixed (e.g., previously trained)\n\\emph{deterministic} flow models, that continuously span the spectrum of\ndeterministic and stochastic sampling, given access to the flow field and the\nscore function. Our method provides additional degrees of freedom that help\nalleviate the issues with the deterministic samplers and empirically\noutperforms them. 
We empirically demonstrate advantages of our method on a toy\nGaussian setup and on the large scale ImageNet generation task. Further, our\nfamily of stochastic samplers provide an additional knob for controlling the\ndiversity of generation, which we qualitatively demonstrate in our experiments.\n","authors":["Saurabh Singh","Ian Fischer"],"pdf_url":"https://arxiv.org/pdf/2410.02217v1.pdf","comment":"Submitted to ICLR 2025"},{"id":"http://arxiv.org/abs/2410.02212v1","updated":"2024-10-03T04:58:33Z","published":"2024-10-03T04:58:33Z","title":"Hard Negative Sample Mining for Whole Slide Image Classification","summary":" Weakly supervised whole slide image (WSI) classification is challenging due\nto the lack of patch-level labels and high computational costs.\nState-of-the-art methods use self-supervised patch-wise feature representations\nfor multiple instance learning (MIL). Recently, methods have been proposed to\nfine-tune the feature representation on the downstream task using pseudo\nlabeling, but mostly focusing on selecting high-quality positive patches. In\nthis paper, we propose to mine hard negative samples during fine-tuning. This\nallows us to obtain better feature representations and reduce the training\ncost. Furthermore, we propose a novel patch-wise ranking loss in MIL to better\nexploit these hard negative samples. Experiments on two public datasets\ndemonstrate the efficacy of these proposed ideas. Our codes are available at\nhttps://github.com/winston52/HNM-WSI\n","authors":["Wentao Huang","Xiaoling Hu","Shahira Abousamra","Prateek Prasanna","Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02212v1.pdf","comment":"13 pages, 4 figures, accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2410.02207v1","updated":"2024-10-03T04:40:18Z","published":"2024-10-03T04:40:18Z","title":"Adapting Segment Anything Model to Melanoma Segmentation in Microscopy\n Slide Images","summary":" Melanoma segmentation in Whole Slide Images (WSIs) is useful for prognosis\nand the measurement of crucial prognostic factors such as Breslow depth and\nprimary invasive tumor size. In this paper, we present a novel approach that\nuses the Segment Anything Model (SAM) for automatic melanoma segmentation in\nmicroscopy slide images. Our method employs an initial semantic segmentation\nmodel to generate preliminary segmentation masks that are then used to prompt\nSAM. We design a dynamic prompting strategy that uses a combination of centroid\nand grid prompts to achieve optimal coverage of the super high-resolution slide\nimages while maintaining the quality of generated prompts. To optimize for\ninvasive melanoma segmentation, we further refine the prompt generation process\nby implementing in-situ melanoma detection and low-confidence region filtering.\nWe select Segformer as the initial segmentation model and EfficientSAM as the\nsegment anything model for parameter-efficient fine-tuning. 
Our experimental\nresults demonstrate that this approach not only surpasses other\nstate-of-the-art melanoma segmentation methods but also significantly\noutperforms the baseline Segformer by 9.1% in terms of IoU.\n","authors":["Qingyuan Liu","Avideh Zakhor"],"pdf_url":"https://arxiv.org/pdf/2410.02207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02201v1","updated":"2024-10-03T04:32:21Z","published":"2024-10-03T04:32:21Z","title":"Remember and Recall: Associative-Memory-based Trajectory Prediction","summary":" Trajectory prediction is a pivotal component of autonomous driving systems,\nenabling the application of accumulated movement experience to current\nscenarios. Although most existing methods concentrate on learning continuous\nrepresentations to gain valuable experience, they often suffer from\ncomputational inefficiencies and struggle with unfamiliar situations. To\naddress this issue, we propose the Fragmented-Memory-based Trajectory\nPrediction (FMTP) model, inspired by the remarkable learning capabilities of\nhumans, particularly their ability to leverage accumulated experience and\nrecall relevant memories in unfamiliar situations. The FMTP model employs\ndiscrete representations to enhance computational efficiency by reducing\ninformation redundancy while maintaining the flexibility to utilize past\nexperiences. Specifically, we design a learnable memory array by consolidating\ncontinuous trajectory representations from the training set using defined\nquantization operations during the training phase. This approach further\neliminates redundant information while preserving essential features in\ndiscrete form. Additionally, we develop an advanced reasoning engine based on\nlanguage models to deeply learn the associative rules among these discrete\nrepresentations. Our method has been evaluated on various public datasets,\nincluding ETH-UCY, inD, SDD, nuScenes, Waymo, and VTL-TP. The extensive\nexperimental results demonstrate that our approach achieves significant\nperformance and extracts more valuable experience from past trajectories to\ninform the current state.\n","authors":["Hang Guo","Yuzhen Zhang","Tianci Gao","Junning Su","Pei Lv","Mingliang Xu"],"pdf_url":"https://arxiv.org/pdf/2410.02201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03717v2","updated":"2024-10-03T04:09:33Z","published":"2024-08-07T12:10:32Z","title":"Pick of the Bunch: Detecting Infrared Small Targets Beyond Hit-Miss\n Trade-Offs via Selective Rank-Aware Attention","summary":" Infrared small target detection faces the inherent challenge of precisely\nlocalizing dim targets amidst complex background clutter. Traditional\napproaches struggle to balance detection precision and false alarm rates. To\nbreak this dilemma, we propose SeRankDet, a deep network that achieves high\naccuracy beyond the conventional hit-miss trade-off, by following the ``Pick of\nthe Bunch'' principle. At its core lies our Selective Rank-Aware Attention\n(SeRank) module, employing a non-linear Top-K selection process that preserves\nthe most salient responses, preventing target signal dilution while maintaining\nconstant complexity. Furthermore, we replace the static concatenation typical\nin U-Net structures with our Large Selective Feature Fusion (LSFF) module, a\ndynamic fusion strategy that empowers SeRankDet with adaptive feature\nintegration, enhancing its ability to discriminate true targets from false\nalarms. 
The network's discernment is further refined by our Dilated Difference\nConvolution (DDC) module, which merges differential convolution aimed at\namplifying subtle target characteristics with dilated convolution to expand the\nreceptive field, thereby substantially improving target-background separation.\nDespite its lightweight architecture, the proposed SeRankDet sets new\nbenchmarks in state-of-the-art performance across multiple public datasets. The\ncode is available at https://github.com/GrokCV/SeRankDet.\n","authors":["Yimian Dai","Peiwen Pan","Yulei Qian","Yuxuan Li","Xiang Li","Jian Yang","Huan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.03717v2.pdf","comment":"IEEE TGRS 2024"},{"id":"http://arxiv.org/abs/2309.04148v3","updated":"2024-10-03T03:55:04Z","published":"2023-09-08T06:24:44Z","title":"Representation Synthesis by Probabilistic Many-Valued Logic Operation in\n Self-Supervised Learning","summary":" In this paper, we propose a new self-supervised learning (SSL) method for\nrepresentations that enable logic operations. Representation learning has been\napplied to various tasks, such as image generation and retrieval. The logical\ncontrollability of representations is important for these tasks. Although some\nmethods have been shown to enable the intuitive control of representations\nusing natural languages as the inputs, representation control via logic\noperations between representations has not been demonstrated. Some SSL methods\nusing representation synthesis (e.g., elementwise mean and maximum operations)\nhave been proposed, but the operations performed in these methods do not\nincorporate logic operations. In this work, we propose a logic-operable\nself-supervised representation learning method by replacing the existing\nrepresentation synthesis with the OR operation on the probabilistic extension\nof many-valued logic. The representations comprise a set of feature-possession\ndegrees, which are truth values indicating the presence or absence of each\nfeature in the image, and realize the logic operations (e.g., OR and AND). Our\nmethod can generate a representation that has the features of both\nrepresentations or only those features common to both representations. In\naddition, the expression of the ambiguous presence of a feature is realized by\nindicating the feature-possession degree by the probability distribution of\ntruth values of the many-valued logic. We showed that our method performs\ncompetitively in single and multi-label classification tasks compared with\nprior SSL methods using synthetic representations. 
Moreover, experiments on\nimage retrieval using MNIST and PascalVOC showed that the representations of\nour method can be operated by OR and AND operations.\n","authors":["Hiroki Nakamura","Masashi Okada","Tadahiro Taniguchi"],"pdf_url":"https://arxiv.org/pdf/2309.04148v3.pdf","comment":"Accepted to the IEEE Open Journal of Signal Processing (ICIP2024\n track)"},{"id":"http://arxiv.org/abs/2410.02182v1","updated":"2024-10-03T03:51:53Z","published":"2024-10-03T03:51:53Z","title":"BadCM: Invisible Backdoor Attack Against Cross-Modal Learning","summary":" Despite remarkable successes in unimodal learning tasks, backdoor attacks\nagainst cross-modal learning are still underexplored due to the limited\ngeneralization and inferior stealthiness when involving multiple modalities.\nNotably, since works in this area mainly inherit ideas from unimodal visual\nattacks, they struggle with dealing with diverse cross-modal attack\ncircumstances and manipulating imperceptible trigger samples, which hinders\ntheir practicability in real-world applications. In this paper, we introduce a\nnovel bilateral backdoor to fill in the missing pieces of the puzzle in the\ncross-modal backdoor and propose a generalized invisible backdoor framework\nagainst cross-modal learning (BadCM). Specifically, a cross-modal mining scheme\nis developed to capture the modality-invariant components as target poisoning\nareas, where well-designed trigger patterns injected into these regions can be\nefficiently recognized by the victim models. This strategy is adapted to\ndifferent image-text cross-modal models, making our framework available to\nvarious attack scenarios. Furthermore, for generating poisoned samples of high\nstealthiness, we conceive modality-specific generators for visual and\nlinguistic modalities that facilitate hiding explicit trigger patterns in\nmodality-invariant regions. To the best of our knowledge, BadCM is the first\ninvisible backdoor method deliberately designed for diverse cross-modal attacks\nwithin one unified framework. Comprehensive experimental evaluations on two\ntypical applications, i.e., cross-modal retrieval and VQA, demonstrate the\neffectiveness and generalization of our method under multiple kinds of attack\nscenarios. Moreover, we show that BadCM can robustly evade existing backdoor\ndefenses. Our code is available at https://github.com/xandery-geek/BadCM.\n","authors":["Zheng Zhang","Xu Yuan","Lei Zhu","Jingkuan Song","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2410.02182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14637v2","updated":"2024-10-03T03:51:22Z","published":"2023-10-23T07:21:40Z","title":"Semantic-Aware Adversarial Training for Reliable Deep Hashing Retrieval","summary":" Deep hashing has been intensively studied and successfully applied in\nlarge-scale image retrieval systems due to its efficiency and effectiveness.\nRecent studies have recognized that the existence of adversarial examples poses\na security threat to deep hashing models, that is, adversarial vulnerability.\nNotably, it is challenging to efficiently distill reliable semantic\nrepresentatives for deep hashing to guide adversarial learning, and thereby it\nhinders the enhancement of adversarial robustness of deep hashing-based\nretrieval models. Moreover, current researches on adversarial training for deep\nhashing are hard to be formalized into a unified minimax structure. 
In this\npaper, we explore Semantic-Aware Adversarial Training (SAAT) for improving the\nadversarial robustness of deep hashing models. Specifically, we conceive a\ndiscriminative mainstay features learning (DMFL) scheme to construct semantic\nrepresentatives for guiding adversarial learning in deep hashing. Particularly,\nour DMFL with the strict theoretical guarantee is adaptively optimized in a\ndiscriminative learning manner, where both discriminative and semantic\nproperties are jointly considered. Moreover, adversarial examples are\nfabricated by maximizing the Hamming distance between the hash codes of\nadversarial samples and mainstay features, the efficacy of which is validated\nin the adversarial attack trials. Further, we, for the first time, formulate\nthe formalized adversarial training of deep hashing into a unified minimax\noptimization under the guidance of the generated mainstay codes. Extensive\nexperiments on benchmark datasets show superb attack performance against the\nstate-of-the-art algorithms, meanwhile, the proposed adversarial training can\neffectively eliminate adversarial perturbations for trustworthy deep\nhashing-based retrieval. Our code is available at\nhttps://github.com/xandery-geek/SAAT.\n","authors":["Xu Yuan","Zheng Zhang","Xunguang Wang","Lin Wu"],"pdf_url":"https://arxiv.org/pdf/2310.14637v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03722v2","updated":"2024-10-03T03:46:05Z","published":"2023-09-07T13:58:31Z","title":"A boundary-aware point clustering approach in Euclidean and embedding\n spaces for roof plane segmentation","summary":" Roof plane segmentation from airborne LiDAR point clouds is an important\ntechnology for 3D building model reconstruction. One of the key issues of plane\nsegmentation is how to design powerful features that can exactly distinguish\nadjacent planar patches. The quality of point feature directly determines the\naccuracy of roof plane segmentation. Most of existing approaches use\nhandcrafted features to extract roof planes. However, the abilities of these\nfeatures are relatively low, especially in boundary area. To solve this\nproblem, we propose a boundary-aware point clustering approach in Euclidean and\nembedding spaces constructed by a multi-task deep network for roof plane\nsegmentation. We design a three-branch network to predict semantic labels,\npoint offsets and extract deep embedding features. In the first branch, we\nclassify the input data as non-roof, boundary and plane points. In the second\nbranch, we predict point offsets for shifting each point toward its respective\ninstance center. In the third branch, we constrain that points of the same\nplane instance should have the similar embeddings. We aim to ensure that points\nof the same plane instance are close as much as possible in both Euclidean and\nembedding spaces. However, although deep network has strong feature\nrepresentative ability, it is still hard to accurately distinguish points near\nplane instance boundary. Therefore, we first group plane points into many\nclusters in the two spaces, and then we assign the rest boundary points to\ntheir closest clusters to generate final complete roof planes. In this way, we\ncan effectively reduce the influence of unreliable boundary points. In\naddition, we prepare a synthetic dataset and two real datasets to train and\nevaluate our approach. 
The experimental results show that the proposed approach\nsignificantly outperforms the existing state-of-the-art approaches.\n","authors":["Li Li","Qingqing Li","Guozheng Xu","Pengwei Zhou","Jingmin Tu","Jie Li","Mingming Li","Jian Yao"],"pdf_url":"https://arxiv.org/pdf/2309.03722v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02179v1","updated":"2024-10-03T03:43:29Z","published":"2024-10-03T03:43:29Z","title":"HATFormer: Historic Handwritten Arabic Text Recognition with\n Transformers","summary":" Arabic handwritten text recognition (HTR) is challenging, especially for\nhistorical texts, due to diverse writing styles and the intrinsic features of\nArabic script. Additionally, Arabic handwriting datasets are smaller compared\nto English ones, making it difficult to train generalizable Arabic HTR models.\nTo address these challenges, we propose HATFormer, a transformer-based\nencoder-decoder architecture that builds on a state-of-the-art English HTR\nmodel. By leveraging the transformer's attention mechanism, HATFormer captures\nspatial contextual information to address the intrinsic challenges of Arabic\nscript through differentiating cursive characters, decomposing visual\nrepresentations, and identifying diacritics. Our customization to historical\nhandwritten Arabic includes an image processor for effective ViT information\npreprocessing, a text tokenizer for compact Arabic text representation, and a\ntraining pipeline that accounts for a limited amount of historic Arabic\nhandwriting data. HATFormer achieves a character error rate (CER) of 8.6% on\nthe largest public historical handwritten Arabic dataset, with a 51%\nimprovement over the best baseline in the literature. HATFormer also attains a\ncomparable CER of 4.2% on the largest private non-historical dataset. Our work\ndemonstrates the feasibility of adapting an English HTR method to a\nlow-resource language with complex, language-specific challenges, contributing\nto advancements in document digitization, information retrieval, and cultural\npreservation.\n","authors":["Adrian Chan","Anupam Mijar","Mehreen Saeed","Chau-Wai Wong","Akram Khater"],"pdf_url":"https://arxiv.org/pdf/2410.02179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03757v2","updated":"2024-10-03T03:40:20Z","published":"2024-02-06T06:48:46Z","title":"The Instinctive Bias: Spurious Images lead to Illusion in MLLMs","summary":" Large language models (LLMs) have recently experienced remarkable progress,\nwhere the advent of multi-modal large language models (MLLMs) has endowed LLMs\nwith visual capabilities, leading to impressive performances in various\nmulti-modal tasks. However, those powerful MLLMs such as GPT-4V still fail\nspectacularly when presented with certain image and text inputs. In this paper,\nwe identify a typical class of inputs that baffles MLLMs, which consists of\nimages that are highly relevant but inconsistent with answers, causing MLLMs to\nsuffer from visual illusion. To quantify the effect, we propose CorrelationQA,\nthe first benchmark that assesses the visual illusion level given spurious\nimages. This benchmark contains 7,308 text-image pairs across 13 categories.\nBased on the proposed CorrelationQA, we conduct a thorough analysis on 9\nmainstream MLLMs, illustrating that they universally suffer from this\ninstinctive bias to varying degrees. We hope that our curated benchmark and\nevaluation results aid in better assessments of the MLLMs' robustness in the\npresence of misleading images. 
The code and datasets are available at\nhttps://github.com/MasaiahHan/CorrelationQA.\n","authors":["Tianyang Han","Qing Lian","Rui Pan","Renjie Pi","Jipeng Zhang","Shizhe Diao","Yong Lin","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.03757v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18944v4","updated":"2024-10-03T03:35:53Z","published":"2024-06-27T07:14:14Z","title":"Rethinking and Defending Protective Perturbation in Personalized\n Diffusion Models","summary":" Personalized diffusion models (PDMs) have become prominent for adapting\npretrained text-to-image models to generate images of specific subjects using\nminimal training data. However, PDMs are susceptible to minor adversarial\nperturbations, leading to significant degradation when fine-tuned on corrupted\ndatasets. These vulnerabilities are exploited to create protective\nperturbations that prevent unauthorized image generation. Existing purification\nmethods attempt to mitigate this issue but often over-purify images, resulting\nin information loss. In this work, we conduct an in-depth analysis of the\nfine-tuning process of PDMs through the lens of shortcut learning. We\nhypothesize and empirically demonstrate that adversarial perturbations induce a\nlatent-space misalignment between images and their text prompts in the CLIP\nembedding space. This misalignment causes the model to erroneously associate\nnoisy patterns with unique identifiers during fine-tuning, resulting in poor\ngeneralization. Based on these insights, we propose a systematic defense\nframework that includes data purification and contrastive decoupling learning.\nWe first employ off-the-shelf image restoration techniques to realign images\nwith their original semantic meanings in latent space. Then, we introduce\ncontrastive decoupling learning with noise tokens to decouple the learning of\npersonalized concepts from spurious noise patterns. Our study not only uncovers\nfundamental shortcut learning vulnerabilities in PDMs but also provides a\ncomprehensive evaluation framework for developing stronger protection. Our\nextensive evaluation demonstrates its superiority over existing purification\nmethods and stronger robustness against adaptive perturbation.\n","authors":["Yixin Liu","Ruoxi Chen","Xun Chen","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2406.18944v4.pdf","comment":"Our code is available at\n https://github.com/liuyixin-louis/DiffShortcut"},{"id":"http://arxiv.org/abs/2410.02155v1","updated":"2024-10-03T02:34:31Z","published":"2024-10-03T02:34:31Z","title":"From Pixels to Tokens: Byte-Pair Encoding on Quantized Visual Modalities","summary":" Multimodal Large Language Models have made significant strides in integrating\nvisual and textual information, yet they often struggle with effectively\naligning these modalities. We introduce a novel image tokenizer that bridges\nthis gap by applying the principle of Byte-Pair Encoding (BPE) to visual data.\nUnlike conventional approaches that rely on separate visual encoders, our\nmethod directly incorporates structural prior information into image tokens,\nmirroring the successful tokenization strategies used in text-only Large\nLanguage Models. This innovative approach enables Transformer models to more\neffectively learn and reason across modalities. Through theoretical analysis\nand extensive experiments, we demonstrate that our BPE Image Tokenizer\nsignificantly enhances MLLMs' multimodal understanding capabilities, even with\nlimited training data. 
Our method not only improves performance across various\nbenchmarks but also shows promising scalability, potentially paving the way for\nmore efficient and capable multimodal foundation models.\n","authors":["Wanpeng Zhang","Zilong Xie","Yicheng Feng","Yijiang Li","Xingrun Xing","Sipeng Zheng","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2410.02155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02152v1","updated":"2024-10-03T02:31:14Z","published":"2024-10-03T02:31:14Z","title":"An Evaluation of Large Pre-Trained Models for Gesture Recognition using\n Synthetic Videos","summary":" In this work, we explore the possibility of using synthetically generated\ndata for video-based gesture recognition with large pre-trained models. We\nconsider whether these models have sufficiently robust and expressive\nrepresentation spaces to enable \"training-free\" classification. Specifically,\nwe utilize various state-of-the-art video encoders to extract features for use\nin k-nearest neighbors classification, where the training data points are\nderived from synthetic videos only. We compare these results with another\ntraining-free approach -- zero-shot classification using text descriptions of\neach gesture. In our experiments with the RoCoG-v2 dataset, we find that using\nsynthetic training videos yields significantly lower classification accuracy on\nreal test videos compared to using a relatively small number of real training\nvideos. We also observe that video backbones that were fine-tuned on\nclassification tasks serve as superior feature extractors, and that the choice\nof fine-tuning data has a substantial impact on k-nearest neighbors\nperformance. Lastly, we find that zero-shot text-based classification performs\npoorly on the gesture recognition task, as gestures are not easily described\nthrough natural language.\n","authors":["Arun Reddy","Ketul Shah","Corban Rivera","William Paul","Celso M. De Melo","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2410.02152v1.pdf","comment":"Synthetic Data for Artificial Intelligence and Machine Learning:\n Tools, Techniques, and Applications II (SPIE Defense + Commercial Sensing,\n 2024)"},{"id":"http://arxiv.org/abs/2410.01620v2","updated":"2024-10-03T02:29:12Z","published":"2024-10-02T14:57:58Z","title":"LMOD: A Large Multimodal Ophthalmology Dataset and Benchmark for Large\n Vision-Language Models","summary":" Ophthalmology relies heavily on detailed image analysis for diagnosis and\ntreatment planning. While large vision-language models (LVLMs) have shown\npromise in understanding complex visual information, their performance on\nophthalmology images remains underexplored. We introduce LMOD, a dataset and\nbenchmark for evaluating LVLMs on ophthalmology images, covering anatomical\nunderstanding, diagnostic analysis, and demographic extraction. LMOD includes\n21,993 images spanning optical coherence tomography, scanning laser\nophthalmoscopy, eye photos, surgical scenes, and color fundus photographs. We\nbenchmark 13 state-of-the-art LVLMs and find that they are far from perfect for\ncomprehending ophthalmology images. 
Models struggle with diagnostic analysis\nand demographic extraction, reveal weaknesses in spatial reasoning, diagnostic\nanalysis, handling out-of-domain queries, and safeguards for handling\nbiomarkers of ophthalmology images.\n","authors":["Zhenyue Qin","Yu Yin","Dylan Campbell","Xuansheng Wu","Ke Zou","Yih-Chung Tham","Ninghao Liu","Xiuzhen Zhang","Qingyu Chen"],"pdf_url":"https://arxiv.org/pdf/2410.01620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19340v4","updated":"2024-10-03T02:23:50Z","published":"2024-07-27T21:00:36Z","title":"Integrating Large Language Models into a Tri-Modal Architecture for\n Automated Depression Classification","summary":" Major Depressive Disorder (MDD) is a pervasive mental health condition that\naffects 300 million people worldwide. This work presents a novel, BiLSTM-based\ntri-modal model-level fusion architecture for the binary classification of\ndepression from clinical interview recordings. The proposed architecture\nincorporates Mel Frequency Cepstral Coefficients, Facial Action Units, and uses\na two-shot learning based GPT-4 model to process text data. This is the first\nwork to incorporate large language models into a multi-modal architecture for\nthis task. It achieves impressive results on the DAIC-WOZ AVEC 2016 Challenge\ncross-validation split and Leave-One-Subject-Out cross-validation split,\nsurpassing all baseline models and multiple state-of-the-art models. In\nLeave-One-Subject-Out testing, it achieves an accuracy of 91.01%, an F1-Score\nof 85.95%, a precision of 80%, and a recall of 92.86%.\n","authors":["Santosh V. Patapati"],"pdf_url":"https://arxiv.org/pdf/2407.19340v4.pdf","comment":"Keywords: Multi-Modal Neural Networks, Deep Learning, Large Language\n Models, Depression Diagnosis, Biomedical Informatics, DAIC-WOZ"},{"id":"http://arxiv.org/abs/2210.15889v5","updated":"2024-10-03T02:17:04Z","published":"2022-10-28T04:38:10Z","title":"Towards Data-and Knowledge-Driven Artificial Intelligence: A Survey on\n Neuro-Symbolic Computing","summary":" Neural-symbolic computing (NeSy), which pursues the integration of the\nsymbolic and statistical paradigms of cognition, has been an active research\narea of Artificial Intelligence (AI) for many years. As NeSy shows promise of\nreconciling the advantages of reasoning and interpretability of symbolic\nrepresentation and robust learning in neural networks, it may serve as a\ncatalyst for the next generation of AI. In the present paper, we provide a\nsystematic overview of the recent developments and important contributions of\nNeSy research. Firstly, we introduce study history of this area, covering early\nwork and foundations. We further discuss background concepts and identify key\ndriving factors behind the development of NeSy. Afterward, we categorize recent\nlandmark approaches along several main characteristics that underline this\nresearch paradigm, including neural-symbolic integration, knowledge\nrepresentation, knowledge embedding, and functionality. Next, we briefly\ndiscuss the successful application of modern NeSy approaches in several\ndomains. Then, we benchmark several NeSy methods on three representative\napplication tasks. Finally, we identify the open problems together with\npotential future research directions. 
This survey is expected to help new\nresearchers enter this rapidly evolving field and accelerate the progress\ntowards data-and knowledge-driven AI.\n","authors":["Wenguan Wang","Yi Yang","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2210.15889v5.pdf","comment":"PAMI 2024"},{"id":"http://arxiv.org/abs/2406.09588v2","updated":"2024-10-03T01:48:47Z","published":"2024-06-13T21:02:03Z","title":"Color Equivariant Network","summary":" Group equivariant convolutional neural networks have been designed for a\nvariety of geometric transformations from 2D and 3D rotation groups, to\nsemi-groups such as scale. Despite the improved interpretability, accuracy and\ngeneralizability afforded by these architectures, group equivariant networks\nhave seen limited application in the context of perceptual quantities such as\nhue and saturation, even though their variation can lead to significant\nreductions in classification performance. In this paper, we introduce\nconvolutional neural networks equivariant to variations in hue and saturation\nby design. To achieve this, we leverage the observation that hue and saturation\ntransformations can be identified with the 2D rotation and 1D translation\ngroups respectively. Our hue-, saturation-, and fully color-equivariant\nnetworks achieve equivariance to these perceptual transformations without an\nincrease in network parameters. We demonstrate the utility of our networks on\nsynthetic and real world datasets where color and lighting variations are\ncommonplace.\n","authors":["Felix O'Mahony","Yulong Yang","Christine Allen-Blanchette"],"pdf_url":"https://arxiv.org/pdf/2406.09588v2.pdf","comment":"Accepted at CVPR 2024 Equivariant Vision Workshop"},{"id":"http://arxiv.org/abs/2410.02130v1","updated":"2024-10-03T01:23:44Z","published":"2024-10-03T01:23:44Z","title":"MDSGen: Fast and Efficient Masked Diffusion Temporal-Aware Transformers\n for Open-Domain Sound Generation","summary":" We introduce MDSGen, a novel framework for vision-guided open-domain sound\ngeneration optimized for model parameter size, memory consumption, and\ninference speed. This framework incorporates two key innovations: (1) a\nredundant video feature removal module that filters out unnecessary visual\ninformation, and (2) a temporal-aware masking strategy that leverages temporal\ncontext for enhanced audio generation accuracy. In contrast to existing\nresource-heavy Unet-based models, MDSGen employs denoising masked diffusion\ntransformers, facilitating efficient generation without reliance on pre-trained\ndiffusion models. Evaluated on the benchmark VGGSound dataset, our smallest\nmodel (5M parameters) achieves 97.9% alignment accuracy, using 172x fewer\nparameters, 371% less memory, and offering 36x faster inference than the\ncurrent 860M-parameter state-of-the-art model (93.9% accuracy). The larger\nmodel (131M parameters) reaches nearly 99% accuracy while requiring 6.5x fewer\nparameters. These results highlight the scalability and effectiveness of our\napproach.\n","authors":["Trung X. Pham","Tri Ton","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2410.02130v1.pdf","comment":"21 pages, 16 figures"},{"id":"http://arxiv.org/abs/2410.02129v1","updated":"2024-10-03T01:19:21Z","published":"2024-10-03T01:19:21Z","title":"DMC-Net: Lightweight Dynamic Multi-Scale and Multi-Resolution\n Convolution Network for Pancreas Segmentation in CT Images","summary":" Convolutional neural networks (CNNs) have shown great effectiveness in\nmedical image segmentation. 
However, they may be limited in modeling large\ninter-subject variations in organ shapes and sizes and exploiting global\nlong-range contextual information. This is because CNNs typically employ\nconvolutions with fixed-sized local receptive fields and lack the mechanisms to\nutilize global information. To address these limitations, we developed Dynamic\nMulti-Resolution Convolution (DMRC) and Dynamic Multi-Scale Convolution (DMSC)\nmodules. Both modules enhance the representation capabilities of single\nconvolutions to capture varying scaled features and global contextual\ninformation. This is achieved in the DMRC module by employing a convolutional\nfilter on images with different resolutions and subsequently utilizing dynamic\nmechanisms to model global inter-dependencies between features. In contrast,\nthe DMSC module extracts features at different scales by employing convolutions\nwith different kernel sizes and utilizing dynamic mechanisms to extract global\ncontextual information. The utilization of convolutions with different kernel\nsizes in the DMSC module may increase computational complexity. To lessen this\nburden, we propose to use a lightweight design for convolution layers with a\nlarge kernel size. Thus, DMSC and DMRC modules are designed as lightweight\ndrop-in replacements for single convolutions, and they can be easily integrated\ninto general CNN architectures for end-to-end training. The segmentation\nnetwork was proposed by incorporating our DMSC and DMRC modules into a standard\nU-Net architecture, termed Dynamic Multi-scale and Multi-resolution Convolution\nnetwork (DMC-Net). The results demonstrate that our proposed DMSC and DMRC can\nenhance the representation capabilities of single convolutions and improve\nsegmentation accuracy.\n","authors":["Jin Yang","Daniel S. Marcus","Aristeidis Sotiras"],"pdf_url":"https://arxiv.org/pdf/2410.02129v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.13438v2","updated":"2024-10-03T00:51:50Z","published":"2024-08-24T02:26:42Z","title":"Explainable Concept Generation through Vision-Language Preference\n Learning","summary":" Concept-based explanations have become a popular choice for explaining deep\nneural networks post-hoc because, unlike most other explainable AI techniques,\nthey can be used to test high-level visual \"concepts\" that are not directly\nrelated to feature attributes. For instance, the concept of \"stripes\" is\nimportant to classify an image as a zebra. Concept-based explanation methods,\nhowever, require practitioners to guess and collect multiple candidate concept\nimage sets, which can often be imprecise and labor-intensive. Addressing this\nlimitation, in this paper, we frame concept image set creation as an image\ngeneration problem. However, since naively using a generative model does not\nresult in meaningful concepts, we devise a reinforcement learning-based\npreference optimization (RLPO) algorithm that fine-tunes the vision-language\ngenerative model from approximate textual descriptions of concepts. Through a\nseries of experiments, we demonstrate the capability of our method to\narticulate complex and abstract concepts which aligns with the test class that\nare otherwise challenging to craft manually. 
In addition to showing the\nefficacy and reliability of our method, we show how our method can be used as a\ndiagnostic tool for analyzing neural networks.\n","authors":["Aditya Taparia","Som Sagar","Ransalu Senanayake"],"pdf_url":"https://arxiv.org/pdf/2408.13438v2.pdf","comment":"25 pages, 27 figures"},{"id":"http://arxiv.org/abs/2409.06183v2","updated":"2024-10-03T00:48:28Z","published":"2024-09-10T03:25:24Z","title":"EDADepth: Enhanced Data Augmentation for Monocular Depth Estimation","summary":" Due to their text-to-image synthesis feature, diffusion models have recently\nseen a rise in visual perception tasks, such as depth estimation. The lack of\ngood-quality datasets makes the extraction of a fine-grain semantic context\nchallenging for the diffusion models. The semantic context with fewer details\nfurther worsens the process of creating effective text embeddings that will be\nused as input for diffusion models. In this paper, we propose a novel EDADepth,\nan enhanced data augmentation method to estimate monocular depth without using\nadditional training data. We use Swin2SR, a super-resolution model, to enhance\nthe quality of input images. We employ the BEiT pre-trained semantic\nsegmentation model for better extraction of text embeddings. We use BLIP-2\ntokenizer to generate tokens from these text embeddings. The novelty of our\napproach is the introduction of Swin2SR, the BEiT model, and the BLIP-2\ntokenizer in the diffusion-based pipeline for the monocular depth estimation.\nOur model achieves state-of-the-art results (SOTA) on the delta3 metric on\nNYUv2 and KITTI datasets. It also achieves results comparable to those of the\nSOTA models in the RMSE and REL metrics. Finally, we also show improvements in\nthe visualization of the estimated depth compared to the SOTA diffusion-based\nmonocular depth estimation models. Code:\nhttps://github.com/edadepthmde/EDADepth_ICMLA.\n","authors":["Nischal Khanal","Shivanand Venkanna Sheshappanavar"],"pdf_url":"https://arxiv.org/pdf/2409.06183v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.18957v2","updated":"2024-10-03T17:57:07Z","published":"2024-09-27T17:58:50Z","title":"LML-DAP: Language Model Learning a Dataset for Data-Augmented Prediction","summary":" Classification tasks are typically handled using Machine Learning (ML)\nmodels, which lack a balance between accuracy and interpretability. This paper\nintroduces a new approach to using Large Language Models (LLMs) for\nclassification tasks in an explainable way. Unlike ML models that rely heavily\non data cleaning and feature engineering, this method streamlines the process\nusing LLMs. This paper proposes a new concept called \"Language Model Learning\n(LML)\" powered by a new method called \"Data-Augmented Prediction (DAP)\". The\nclassification is performed by LLMs using a method similar to humans manually\nexploring and understanding the data and deciding classifications using data as\na reference. In the LML process, a dataset is summarized and evaluated to\ndetermine the features that lead to the classification of each label the most.\nIn the process of DAP, the system uses the data summary and a row of the\ntesting dataset to automatically generate a query, which is used to retrieve\nrelevant rows from the dataset. A classification is generated by the LLM using\ndata summary and relevant rows, ensuring satisfactory accuracy even with\ncomplex data using context-aware decision-making. LML and DAP unlock the\npossibilities of new applications. 
The proposed method uses the words \"Act as\nan Explainable Machine Learning Model\" in the prompt to enhance the\ninterpretability of the predictions by allowing users to review the logic\nbehind each prediction. In some test cases, the system scored an accuracy above\n90%, proving the effectiveness of the system and its potential to outperform\nconventional ML models in various scenarios. The code is available at\nhttps://github.com/Pro-GenAI/LML-DAP\n","authors":["Praneeth Vadlapati"],"pdf_url":"https://arxiv.org/pdf/2409.18957v2.pdf","comment":"Updated title, abstract, and images"},{"id":"http://arxiv.org/abs/2410.02729v1","updated":"2024-10-03T17:49:09Z","published":"2024-10-03T17:49:09Z","title":"Unified Multi-Modal Interleaved Document Representation for Information\n Retrieval","summary":" Information Retrieval (IR) methods aim to identify relevant documents in\nresponse to a given query, which have gained remarkable attention due to their\nsuccessful application in various natural language tasks. However, existing\napproaches typically consider only the textual information within the\ndocuments, which overlooks the fact that documents can contain multiple\nmodalities, including texts, images, and tables. Further, they often segment\neach long document into multiple discrete passages for embedding, preventing\nthem from capturing the overall document context and interactions between\nparagraphs. We argue that these two limitations lead to suboptimal document\nrepresentations for retrieval. In this work, to address them, we aim to produce\nmore comprehensive and nuanced document representations by holistically\nembedding documents interleaved with different modalities. Specifically, we\nachieve this by leveraging the capability of recent vision-language models that\nenable the processing and integration of text, images, and tables into a\nunified format and representation. Moreover, to mitigate the information loss\nfrom segmenting documents into passages, instead of representing and retrieving\npassages individually, we further merge the representations of segmented\npassages into one single document representation, while we additionally\nintroduce a reranking strategy to decouple and identify the relevant passage\nwithin the document if necessary. Then, through extensive experiments on\ndiverse information retrieval scenarios considering both the textual and\nmultimodal queries, we show that our approach substantially outperforms\nrelevant baselines, thanks to the consideration of the multimodal information\ninterleaved within the documents in a unified way.\n","authors":["Jaewoo Lee","Joonho Ko","Jinheon Baek","Soyeong Jeong","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2410.02729v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.02721v1","updated":"2024-10-03T17:40:55Z","published":"2024-10-03T17:40:55Z","title":"Domain-Specific Retrieval-Augmented Generation Using Vector Stores,\n Knowledge Graphs, and Tensor Factorization","summary":" Large Language Models (LLMs) are pre-trained on large-scale corpora and excel\nin numerous general natural language processing (NLP) tasks, such as question\nanswering (QA). Despite their advanced language capabilities, when it comes to\ndomain-specific and knowledge-intensive tasks, LLMs suffer from hallucinations,\nknowledge cut-offs, and lack of knowledge attributions. Additionally, fine\ntuning LLMs' intrinsic knowledge to highly specific domains is an expensive and\ntime consuming process. 
The retrieval-augmented generation (RAG) process has\nrecently emerged as a method capable of optimization of LLM responses, by\nreferencing them to a predetermined ontology. It was shown that using a\nKnowledge Graph (KG) ontology for RAG improves the QA accuracy, by taking into\naccount relevant sub-graphs that preserve the information in a structured\nmanner. In this paper, we introduce SMART-SLIC, a highly domain-specific LLM\nframework, that integrates RAG with KG and a vector store (VS) that store\nfactual domain specific information. Importantly, to avoid hallucinations in\nthe KG, we build these highly domain-specific KGs and VSs without the use of\nLLMs, but via NLP, data mining, and nonnegative tensor factorization with\nautomatic model selection. Pairing our RAG with a domain-specific: (i) KG\n(containing structured information), and (ii) VS (containing unstructured\ninformation) enables the development of domain-specific chat-bots that\nattribute the source of information, mitigate hallucinations, lessen the need\nfor fine-tuning, and excel in highly domain-specific question answering tasks.\nWe pair SMART-SLIC with chain-of-thought prompting agents. The framework is\ndesigned to be generalizable to adapt to any specific or specialized domain. In\nthis paper, we demonstrate the question answering capabilities of our framework\non a corpus of scientific publications on malware analysis and anomaly\ndetection.\n","authors":["Ryan C. Barron","Ves Grantcharov","Selma Wanna","Maksim E. Eren","Manish Bhattarai","Nicholas Solovyev","George Tompkins","Charles Nicholas","Kim Ø. Rasmussen","Cynthia Matuszek","Boian S. Alexandrov"],"pdf_url":"https://arxiv.org/pdf/2410.02721v1.pdf","comment":"9 pages 7 figures, 1 table, 1 cypher code Accepted to ICMLA 2024"},{"id":"http://arxiv.org/abs/2410.02642v1","updated":"2024-10-03T16:25:37Z","published":"2024-10-03T16:25:37Z","title":"Attention in Large Language Models Yields Efficient Zero-Shot Re-Rankers","summary":" Information retrieval (IR) systems have played a vital role in modern digital\nlife and have cemented their continued usefulness in this new era of generative\nAI via retrieval-augmented generation. With strong language processing\ncapabilities and remarkable versatility, large language models (LLMs) have\nbecome popular choices for zero-shot re-ranking in IR systems. So far,\nLLM-based re-ranking methods rely on strong generative capabilities, which\nrestricts their use to either specialized or powerful proprietary models. Given\nthese restrictions, we ask: is autoregressive generation necessary and optimal\nfor LLMs to perform re-ranking? We hypothesize that there are abundant signals\nrelevant to re-ranking within LLMs that might not be used to their full\npotential via generation. To more directly leverage such signals, we propose\nin-context re-ranking (ICR), a novel method that leverages the change in\nattention pattern caused by the search query for accurate and efficient\nre-ranking. To mitigate the intrinsic biases in LLMs, we propose a calibration\nmethod using a content-free query. Due to the absence of generation, ICR only\nrequires two ($O(1)$) forward passes to re-rank $N$ documents, making it\nsubstantially more efficient than generative re-ranking methods that require at\nleast $O(N)$ forward passes. 
Our novel design also enables ICR to be applied to\nany LLM without specialized training while guaranteeing a well-formed ranking.\nExtensive experiments with two popular open-weight LLMs on standard single-hop\nand multi-hop information retrieval benchmarks show that ICR outperforms\nRankGPT while cutting the latency by more than 60% in practice. Through\ndetailed analyses, we show that ICR's performance is specially strong on tasks\nthat require more complex re-ranking signals. Our findings call for further\nexploration on novel ways of utilizing open-weight LLMs beyond text generation.\n","authors":["Shijie Chen","Bernal Jiménez Gutiérrez","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2410.02642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02604v1","updated":"2024-10-03T15:45:15Z","published":"2024-10-03T15:45:15Z","title":"Long-Sequence Recommendation Models Need Decoupled Embeddings","summary":" Lifelong user behavior sequences, comprising up to tens of thousands of\nhistory behaviors, are crucial for capturing user interests and predicting user\nresponses in modern recommendation systems. A two-stage paradigm is typically\nadopted to handle these long sequences: a few relevant behaviors are first\nsearched from the original long sequences via an attention mechanism in the\nfirst stage and then aggregated with the target item to construct a\ndiscriminative representation for prediction in the second stage. In this work,\nwe identify and characterize, for the first time, a neglected deficiency in\nexisting long-sequence recommendation models: a single set of embeddings\nstruggles with learning both attention and representation, leading to\ninterference between these two processes. Initial attempts to address this\nissue using linear projections -- a technique borrowed from language processing\n-- proved ineffective, shedding light on the unique challenges of\nrecommendation models. To overcome this, we propose the Decoupled Attention and\nRepresentation Embeddings (DARE) model, where two distinct embedding tables are\ninitialized and learned separately to fully decouple attention and\nrepresentation. Extensive experiments and analysis demonstrate that DARE\nprovides more accurate search of correlated behaviors and outperforms baselines\nwith AUC gains up to 0.9% on public datasets and notable online system\nimprovements. Furthermore, decoupling embedding spaces allows us to reduce the\nattention embedding dimension and accelerate the search procedure by 50%\nwithout significant performance impact, enabling more efficient,\nhigh-performance online serving.\n","authors":["Ningya Feng","Junwei Pan","Jialong Wu","Baixu Chen","Ximei Wang","Qian Li","Xian Hu","Jie Jiang","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2410.02604v1.pdf","comment":"First three authors contributed equally"},{"id":"http://arxiv.org/abs/2305.18952v4","updated":"2024-10-03T15:08:07Z","published":"2023-05-27T16:05:00Z","title":"Exploring the Practicality of Generative Retrieval on Dynamic Corpora","summary":" Benchmarking the performance of information retrieval (IR) is mostly\nconducted with a fixed set of documents (static corpora). However, in realistic\nscenarios, this is rarely the case and the documents to be retrieved are\nconstantly updated and added. In this paper, we focus on Generative Retrievals\n(GR), which apply autoregressive language models to IR problems, and explore\ntheir adaptability and robustness in dynamic scenarios. 
We also conduct an\nextensive evaluation of computational and memory efficiency, crucial factors\nfor real-world deployment of IR systems handling vast and ever-changing\ndocument collections. Our results on the StreamingQA benchmark demonstrate that\nGR is more adaptable to evolving knowledge (4 -- 11%), robust in learning\nknowledge with temporal information, and efficient in terms of inference FLOPs\n(x 2), indexing time (x 6), and storage footprint (x 4) compared to Dual\nEncoders (DE), which are commonly used in retrieval systems. Our paper\nhighlights the potential of GR for future use in practical IR systems within\ndynamic environments.\n","authors":["Chaeeun Kim","Soyoung Yoon","Hyunji Lee","Joel Jang","Sohee Yang","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2305.18952v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07103v3","updated":"2024-10-03T13:55:08Z","published":"2024-04-10T15:41:53Z","title":"Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on\n Graphs","summary":" Large language models (LLMs), while exhibiting exceptional performance,\nsuffer from hallucinations, especially on knowledge-intensive tasks. Existing\nworks propose to augment LLMs with individual text units retrieved from\nexternal knowledge corpora to alleviate the issue. However, in many domains,\ntexts are interconnected (e.g., academic papers in a bibliographic graph are\nlinked by citations and co-authorships) which form a (text-attributed) graph.\nThe knowledge in such graphs is encoded not only in single texts/nodes but also\nin their associated connections. To facilitate the research of augmenting LLMs\nwith graphs, we manually construct a Graph Reasoning Benchmark dataset called\nGRBench, containing 1,740 questions that can be answered with the knowledge\nfrom 10 domain graphs. Then, we propose a simple and effective framework called\nGraph Chain-of-thought (Graph-CoT) to augment LLMs with graphs by encouraging\nLLMs to reason on the graph iteratively. Each Graph-CoT iteration consists of\nthree sub-steps: LLM reasoning, LLM-graph interaction, and graph execution. We\nconduct systematic experiments with three LLM backbones on GRBench, where\nGraph-CoT outperforms the baselines consistently. The code is available at\nhttps://github.com/PeterGriffinJin/Graph-CoT.\n","authors":["Bowen Jin","Chulin Xie","Jiawei Zhang","Kashob Kumar Roy","Yu Zhang","Zheng Li","Ruirui Li","Xianfeng Tang","Suhang Wang","Yu Meng","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2404.07103v3.pdf","comment":"21 pages. Code: https://github.com/PeterGriffinJin/Graph-CoT"},{"id":"http://arxiv.org/abs/2410.02453v1","updated":"2024-10-03T13:02:07Z","published":"2024-10-03T13:02:07Z","title":"Quantifying User Coherence: A Unified Framework for Cross-Domain\n Recommendation Analysis","summary":" The effectiveness of Recommender Systems (RS) is closely tied to the quality\nand distinctiveness of user profiles, yet despite many advancements in raw\nperformance, the sensitivity of RS to user profile quality remains\nunder-researched. This paper introduces novel information-theoretic measures\nfor understanding recommender systems: a \"surprise\" measure quantifying users'\ndeviations from popular choices, and a \"conditional surprise\" measure capturing\nuser interaction coherence. We evaluate 7 recommendation algorithms across 9\ndatasets, revealing the relationships between our measures and standard\nperformance metrics. 
Using a rigorous statistical framework, our analysis\nquantifies how much user profile density and information measures impact\nalgorithm performance across domains. By segmenting users based on these\nmeasures, we achieve improved performance with reduced data and show that\nsimpler algorithms can match complex ones for low-coherence users.\nAdditionally, we employ our measures to analyze how well different\nrecommendation algorithms maintain the coherence and diversity of user\npreferences in their predictions, providing insights into algorithm behavior.\nThis work advances the theoretical understanding of user behavior and practical\nheuristics for personalized recommendation systems, promoting more efficient\nand adaptive architectures.\n","authors":["Michaël Soumm","Alexandre Fournier-Montgieux","Adrian Popescu","Bertrand Delezoide"],"pdf_url":"https://arxiv.org/pdf/2410.02453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15558v2","updated":"2024-10-03T10:40:23Z","published":"2024-09-23T21:29:03Z","title":"Stalactite: Toolbox for Fast Prototyping of Vertical Federated Learning\n Systems","summary":" Machine learning (ML) models trained on datasets owned by different\norganizations and physically located in remote databases offer benefits in many\nreal-world use cases. State regulations or business requirements often prevent\ndata transfer to a central location, making it difficult to utilize standard\nmachine learning algorithms. Federated Learning (FL) is a technique that\nenables models to learn from distributed datasets without revealing the\noriginal data. Vertical Federated Learning (VFL) is a type of FL where data\nsamples are divided by features across several data owners. For instance, in a\nrecommendation task, a user can interact with various sets of items, and the\nlogs of these interactions are stored by different organizations. In this demo\npaper, we present \emph{Stalactite} - an open-source framework for VFL that\nprovides the necessary functionality for building prototypes of VFL systems. It\nhas several advantages over the existing frameworks. In particular, it allows\nresearchers to focus on the algorithmic side rather than engineering and to\neasily deploy learning in a distributed environment. It implements several VFL\nalgorithms and has a built-in homomorphic encryption layer. We demonstrate its\nuse on real-world recommendation datasets.\n","authors":["Anastasiia Zakharova","Dmitriy Alexandrov","Maria Khodorchenko","Nikolay Butakov","Alexey Vasilev","Maxim Savchenko","Alexander Grigorievskiy"],"pdf_url":"https://arxiv.org/pdf/2409.15558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02219v1","updated":"2024-10-03T05:23:39Z","published":"2024-10-03T05:23:39Z","title":"Multi-modal clothing recommendation model based on large model and VAE\n enhancement","summary":" Accurately recommending products has long been a subject requiring in-depth\nresearch. This study proposes a multimodal paradigm for clothing\nrecommendations. Specifically, it designs a multimodal analysis method that\nintegrates clothing description texts and images, utilizing a pre-trained large\nlanguage model to deeply explore the hidden meanings of users and products.\nAdditionally, a variational encoder is employed to learn the relationship\nbetween user information and products to address the cold start problem in\nrecommendation systems. 
This study also validates the significant performance\nadvantages of this method over various recommendation system methods through\nextensive ablation experiments, providing crucial practical guidance for the\ncomprehensive optimization of recommendation systems.\n","authors":["Bingjie Huang","Qingyu Lu","Shuaishuai Huang","Xue-she Wang","Haowei Yang"],"pdf_url":"https://arxiv.org/pdf/2410.02219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02191v1","updated":"2024-10-03T04:11:42Z","published":"2024-10-03T04:11:42Z","title":"A Survey on Point-of-Interest Recommendation: Models, Architectures, and\n Security","summary":" The widespread adoption of smartphones and Location-Based Social Networks has\nled to a massive influx of spatio-temporal data, creating unparalleled\nopportunities for enhancing Point-of-Interest (POI) recommendation systems.\nThese advanced POI systems are crucial for enriching user experiences, enabling\npersonalized interactions, and optimizing decision-making processes in the\ndigital landscape. However, existing surveys tend to focus on traditional\napproaches and few of them delve into cutting-edge developments, emerging\narchitectures, as well as security considerations in POI recommendations. To\naddress this gap, our survey stands out by offering a comprehensive, up-to-date\nreview of POI recommendation systems, covering advancements in models,\narchitectures, and security aspects. We systematically examine the transition\nfrom traditional models to advanced techniques such as large language models.\nAdditionally, we explore the architectural evolution from centralized to\ndecentralized and federated learning systems, highlighting the improvements in\nscalability and privacy. Furthermore, we address the increasing importance of\nsecurity, examining potential vulnerabilities and privacy-preserving\napproaches. Our taxonomy provides a structured overview of the current state of\nPOI recommendation, while we also identify promising directions for future\nresearch in this rapidly advancing field.\n","authors":["Qianru Zhang","Peng Yang","Junliang Yu","Haixin Wang","Xingwei He","Siu-Ming Yiu","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2410.02191v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2407.05441v2","updated":"2024-10-03T03:41:56Z","published":"2024-07-07T17:05:24Z","title":"Language Representations Can be What Recommenders Need: Findings and\n Potentials","summary":" Recent studies empirically indicate that language models (LMs) encode rich\nworld knowledge beyond mere semantics, attracting significant attention across\nvarious fields. However, in the recommendation domain, it remains uncertain\nwhether LMs implicitly encode user preference information. Contrary to\nprevailing understanding that LMs and traditional recommenders learn two\ndistinct representation spaces due to the huge gap in language and behavior\nmodeling objectives, this work re-examines such understanding and explores\nextracting a recommendation space directly from the language representation\nspace. Surprisingly, our findings demonstrate that item representations, when\nlinearly mapped from advanced LM representations, yield superior recommendation\nperformance. This outcome suggests the possible homomorphism between the\nadvanced language representation space and an effective item representation\nspace for recommendation, implying that collaborative signals may be implicitly\nencoded within LMs. 
Motivated by these findings, we explore the possibility of\ndesigning advanced collaborative filtering (CF) models purely based on language\nrepresentations without ID-based embeddings. To be specific, we incorporate\nseveral crucial components to build a simple yet effective model, with item\ntitles as the input. Empirical results show that such a simple model can\noutperform leading ID-based CF models, which sheds light on using language\nrepresentations for better recommendation. Moreover, we systematically analyze\nthis simple model and find several key features for using advanced language\nrepresentations: a good initialization for item representations, zero-shot\nrecommendation abilities, and being aware of user intention. Our findings\nhighlight the connection between language modeling and behavior modeling, which\ncan inspire both natural language processing and recommender system\ncommunities.\n","authors":["Leheng Sheng","An Zhang","Yi Zhang","Yuxin Chen","Xiang Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2407.05441v2.pdf","comment":"Codes are available at https://github.com/LehengTHU/AlphaRec"},{"id":"http://arxiv.org/abs/2407.17451v2","updated":"2024-10-03T01:51:55Z","published":"2024-07-24T17:31:48Z","title":"BlueTempNet: A Temporal Multi-network Dataset of Social Interactions in\n Bluesky Social","summary":" Decentralized social media platforms like Bluesky Social (Bluesky) have made\nit possible to publicly disclose some user behaviors with millisecond-level\nprecision. Embracing Bluesky's principles of open-source and open-data, we\npresent the first collection of the temporal dynamics of user-driven social\ninteractions. BlueTempNet integrates multiple types of networks into a single\nmulti-network, including user-to-user interactions (following and blocking\nusers) and user-to-community interactions (creating and joining communities).\nCommunities are user-formed groups in custom Feeds, where users subscribe to\nposts aligned with their interests. Following Bluesky's public data policy, we\ncollect existing Bluesky Feeds, including the users who liked and generated\nthese Feeds, and provide tools to gather users' social interactions within a\ndate range. This data-collection strategy captures past user behaviors and\nsupports the future data collection of user behavior.\n","authors":["Ujun Jeong","Bohan Jiang","Zhen Tan","H. Russell Bernard","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2407.17451v2.pdf","comment":"accepted to IEEE Data Descriptions 24"},{"id":"http://arxiv.org/abs/2409.18511v3","updated":"2024-10-03T01:44:40Z","published":"2024-09-27T07:46:06Z","title":"Do We Need Domain-Specific Embedding Models? An Empirical Investigation","summary":" Embedding models play a crucial role in representing and retrieving\ninformation across various NLP applications. Recent advancements in Large\nLanguage Models (LLMs) have further enhanced the performance of embedding\nmodels, which are trained on massive amounts of text covering almost every\ndomain. These models are often benchmarked on general-purpose datasets like\nMassive Text Embedding Benchmark (MTEB), where they demonstrate superior\nperformance. However, a critical question arises: Is the development of\ndomain-specific embedding models necessary when general-purpose models are\ntrained on vast corpora that already include specialized domain texts? In this\npaper, we empirically investigate this question, choosing the finance domain as\nan example. 
We introduce the Finance Massive Text Embedding Benchmark\n(FinMTEB), a counterpart to MTEB that consists of financial domain-specific\ntext datasets. We evaluate the performance of seven state-of-the-art embedding\nmodels on FinMTEB and observe a significant performance drop compared to their\nperformance on MTEB. To account for the possibility that this drop is driven by\nFinMTEB's higher complexity, we propose four measures to quantify dataset\ncomplexity and control for this factor in our analysis. Our analysis provides\ncompelling evidence that state-of-the-art embedding models struggle to capture\ndomain-specific linguistic and semantic patterns. Moreover, we find that the\nperformance of general-purpose embedding models on MTEB is not correlated with\ntheir performance on FinMTEB, indicating the need for domain-specific embedding\nbenchmarks for domain-specific embedding models. This study sheds light on\ndeveloping domain-specific embedding models in the LLM era. FinMTEB comes with\nopen-source code at https://github.com/yixuantt/FinMTEB\n","authors":["Yixuan Tang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2409.18511v3.pdf","comment":"https://github.com/yixuantt/FinMTEB"},{"id":"http://arxiv.org/abs/2410.02126v1","updated":"2024-10-03T01:14:30Z","published":"2024-10-03T01:14:30Z","title":"BayesCNS: A Unified Bayesian Approach to Address Cold Start and\n Non-Stationarity in Search Systems at Scale","summary":" Information Retrieval (IR) systems used in search and recommendation\nplatforms frequently employ Learning-to-Rank (LTR) models to rank items in\nresponse to user queries. These models heavily rely on features derived from\nuser interactions, such as clicks and engagement data. This dependence\nintroduces cold start issues for items lacking user engagement and poses\nchallenges in adapting to non-stationary shifts in user behavior over time. We\naddress both challenges holistically as an online learning problem and propose\nBayesCNS, a Bayesian approach designed to handle cold start and non-stationary\ndistribution shifts in search systems at scale. BayesCNS achieves this by\nestimating prior distributions for user-item interactions, which are\ncontinuously updated with new user interactions gathered online. This online\nlearning procedure is guided by a ranker model, enabling efficient exploration\nof relevant items using contextual information provided by the ranker. We\nsuccessfully deployed BayesCNS in a large-scale search system and demonstrated\nits efficacy through comprehensive offline and online experiments. Notably, an\nonline A/B experiment showed a 10.60% increase in new item interactions and a\n1.05% improvement in overall success metrics over the existing production\nbaseline.\n","authors":["Randy Ardywibowo","Rakesh Sunki","Lucy Kuo","Sankalp Nayak"],"pdf_url":"https://arxiv.org/pdf/2410.02126v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2410.02764v1","updated":"2024-10-03T17:59:59Z","published":"2024-10-03T17:59:59Z","title":"Flash-Splat: 3D Reflection Removal with Flash Cues and Gaussian Splats","summary":" We introduce a simple yet effective approach for separating transmitted and\nreflected light. 
Our key insight is that the powerful novel view synthesis\ncapabilities provided by modern inverse rendering methods (e.g.,~3D Gaussian\nsplatting) allow one to perform flash/no-flash reflection separation using\nunpaired measurements -- this relaxation dramatically simplifies image\nacquisition over conventional paired flash/no-flash reflection separation\nmethods. Through extensive real-world experiments, we demonstrate our method,\nFlash-Splat, accurately reconstructs both transmitted and reflected scenes in\n3D. Our method outperforms existing 3D reflection separation methods, which do\nnot leverage illumination control, by a large margin. Our project webpage is at\nhttps://flash-splat.github.io/.\n","authors":["Mingyang Xie","Haoming Cai","Sachin Shah","Yiran Xu","Brandon Y. Feng","Jia-Bin Huang","Christopher A. Metzler"],"pdf_url":"https://arxiv.org/pdf/2410.02764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02763v1","updated":"2024-10-03T17:59:58Z","published":"2024-10-03T17:59:58Z","title":"Vinoground: Scrutinizing LMMs over Dense Temporal Reasoning with Short\n Videos","summary":" There has been growing sentiment recently that modern large multimodal models\n(LMMs) have addressed most of the key challenges related to short video\ncomprehension. As a result, both academia and industry are gradually shifting\ntheir attention towards the more complex challenges posed by understanding\nlong-form videos. However, is this really the case? Our studies indicate that\nLMMs still lack many fundamental reasoning capabilities even when dealing with\nshort videos. We introduce Vinoground, a temporal counterfactual LMM evaluation\nbenchmark encompassing 1000 short and natural video-caption pairs. We\ndemonstrate that existing LMMs severely struggle to distinguish temporal\ndifferences between different actions and object transformations. For example,\nthe best model GPT-4o only obtains ~50% on our text and video scores, showing a\nlarge gap compared to the human baseline of ~90%. All open-source multimodal\nmodels and CLIP-based models perform much worse, producing mostly random chance\nperformance. Through this work, we shed light onto the fact that temporal\nreasoning in short videos is a problem yet to be fully solved. The dataset and\nevaluation code are available at https://vinoground.github.io.\n","authors":["Jianrui Zhang","Mu Cai","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2410.02763v1.pdf","comment":"Project Page: https://vinoground.github.io"},{"id":"http://arxiv.org/abs/2410.02762v1","updated":"2024-10-03T17:59:57Z","published":"2024-10-03T17:59:57Z","title":"Interpreting and Editing Vision-Language Representations to Mitigate\n Hallucinations","summary":" We investigate the internal representations of vision-language models (VLMs)\nto address hallucinations, a persistent challenge despite advances in model\nsize and training. We project VLMs' internal image representations to their\nlanguage vocabulary and observe more confident output probabilities on real\nobjects than hallucinated objects. We additionally use these output\nprobabilities to spatially localize real objects. Building on this approach, we\nintroduce a knowledge erasure algorithm that removes hallucinations by linearly\northogonalizing image features with respect to hallucinated object features. We\nshow that targeted edits to a model's latent representations can reduce\nhallucinations by up to 25.7% on the COCO2014 dataset while preserving\nperformance. 
Our findings demonstrate how a deeper understanding of VLMs'\nlatent representations can enhance reliability and enable novel capabilities,\nsuch as zero-shot segmentation.\n","authors":["Nick Jiang","Anish Kachinthaya","Suzie Petryk","Yossi Gandelsman"],"pdf_url":"https://arxiv.org/pdf/2410.02762v1.pdf","comment":"Project page and code: http://anishk23733.github.io/vl-interp/"},{"id":"http://arxiv.org/abs/2410.02760v1","updated":"2024-10-03T17:59:30Z","published":"2024-10-03T17:59:30Z","title":"Erasing Conceptual Knowledge from Language Models","summary":" Concept erasure in language models has traditionally lacked a comprehensive\nevaluation framework, leading to incomplete assessments of effectiveness of\nerasure methods. We propose an evaluation paradigm centered on three critical\ncriteria: innocence (complete knowledge removal), seamlessness (maintaining\nconditional fluent generation), and specificity (preserving unrelated task\nperformance). Our evaluation metrics naturally motivate the development of\nErasure of Language Memory (ELM), a new method designed to address all three\ndimensions. ELM employs targeted low-rank updates to alter output distributions\nfor erased concepts while preserving overall model capabilities including\nfluency when prompted for an erased concept. We demonstrate ELM's efficacy on\nbiosecurity, cybersecurity, and literary domain erasure tasks. Comparative\nanalysis shows that ELM achieves superior performance across our proposed\nmetrics, including near-random scores on erased topic assessments, generation\nfluency, maintained accuracy on unrelated benchmarks, and robustness under\nadversarial attacks. Our code, data, and trained models are available at\nhttps://elm.baulab.info\n","authors":["Rohit Gandikota","Sheridan Feucht","Samuel Marks","David Bau"],"pdf_url":"https://arxiv.org/pdf/2410.02760v1.pdf","comment":"Project Page: https://elm.baulab.info"},{"id":"http://arxiv.org/abs/2403.17916v2","updated":"2024-10-03T17:59:25Z","published":"2024-03-26T17:53:27Z","title":"CMP: Cooperative Motion Prediction with Multi-Agent Communication","summary":" The confluence of the advancement of Autonomous Vehicles (AVs) and the\nmaturity of Vehicle-to-Everything (V2X) communication has enabled the\ncapability of cooperative connected and automated vehicles (CAVs). Building on\ntop of cooperative perception, this paper explores the feasibility and\neffectiveness of cooperative motion prediction. Our method, CMP, takes LiDAR\nsignals as model input to enhance tracking and prediction capabilities. Unlike\nprevious work that focuses separately on either cooperative perception or\nmotion prediction, our framework, to the best of our knowledge, is the first to\naddress the unified problem where CAVs share information in both perception and\nprediction modules. Incorporated into our design is the unique capability to\ntolerate realistic V2X bandwidth limitations and transmission delays, while\ndealing with bulky perception representations. We also propose a prediction\naggregation module, which unifies the predictions obtained by different CAVs\nand generates the final prediction. Through extensive experiments and ablation\nstudies on the OPV2V and V2V4Real datasets, we demonstrate the effectiveness of\nour method in cooperative perception, tracking, and motion prediction. In\nparticular, CMP reduces the average prediction error by 16.4\\% with fewer\nmissing detections compared with the no cooperation setting and by 12.3\\%\ncompared with the strongest baseline. 
Our work marks a significant step forward\nin the cooperative capabilities of CAVs, showcasing enhanced performance in\ncomplex scenarios. The code can be found on the project website:\nhttps://cmp-cooperative-prediction.github.io/.\n","authors":["Zehao Wang","Yuping Wang","Zhuoyuan Wu","Hengbo Ma","Zhaowei Li","Hang Qiu","Jiachen Li"],"pdf_url":"https://arxiv.org/pdf/2403.17916v2.pdf","comment":"Project website: https://cmp-cooperative-prediction.github.io/"},{"id":"http://arxiv.org/abs/2410.02759v1","updated":"2024-10-03T17:59:13Z","published":"2024-10-03T17:59:13Z","title":"Forecasting Smog Clouds With Deep Learning","summary":" In this proof-of-concept study, we conduct multivariate timeseries\nforecasting for the concentrations of nitrogen dioxide (NO2), ozone (O3), and\n(fine) particulate matter (PM10 & PM2.5) with meteorological covariates between\ntwo locations using various deep learning models, with a focus on long\nshort-term memory (LSTM) and gated recurrent unit (GRU) architectures. In\nparticular, we propose an integrated, hierarchical model architecture inspired\nby air pollution dynamics and atmospheric science that employs multi-task\nlearning and is benchmarked by unidirectional and fully-connected models.\nResults demonstrate that, above all, the hierarchical GRU proves itself as a\ncompetitive and efficient method for forecasting the concentration of\nsmog-related pollutants.\n","authors":["Valentijn Oldenburg","Juan Cardenas-Cartagena","Matias Valdenegro-Toro"],"pdf_url":"https://arxiv.org/pdf/2410.02759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02755v1","updated":"2024-10-03T17:58:29Z","published":"2024-10-03T17:58:29Z","title":"SIEVE: General Purpose Data Filtering System Matching GPT-4o Accuracy at\n 1% the Cost","summary":" Creating specialized large language models requires vast amounts of clean,\nspecial purpose data for training and fine-tuning. With only a handful of\nexisting large-scale, domain-specific datasets, creation of new datasets is\nrequired in most applications. This requires the development of new\napplication-specific filtering of web-scale data. Filtering with a\nhigh-performance, general-purpose LLM such as GPT-4o can be highly effective,\nbut this is extremely expensive at web-scale. This paper proposes SIEVE, a\nlightweight alternative that matches GPT-4o accuracy at a fraction of the cost.\nSIEVE can perform up to 500 filtering operations for the cost of one GPT-4o\nfiltering call. The key to SIEVE is a seamless integration of GPT-4o and\nlightweight T5 models, using active learning to fine-tune T5 in the background\nwith a small number of calls to GPT-4o. Once trained, it performs as well as\nGPT-4o at a tiny fraction of the cost. We experimentally validate SIEVE on the\nOpenWebText dataset, using five highly customized filter tasks targeting high\nquality and domain-specific content. Our results demonstrate the effectiveness\nand efficiency of our method in curating large, high-quality datasets for\nlanguage model training at a substantially lower cost (1%) than existing\ntechniques. 
To further validate SIEVE, experiments show that SIEVE and GPT-4o\nachieve similar accuracy, with human evaluators preferring SIEVE's filtering\nresults to those of GPT-4o.\n","authors":["Jifan Zhang","Robert Nowak"],"pdf_url":"https://arxiv.org/pdf/2410.02755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02751v1","updated":"2024-10-03T17:58:11Z","published":"2024-10-03T17:58:11Z","title":"ReLIC: A Recipe for 64k Steps of In-Context Reinforcement Learning for\n Embodied AI","summary":" Intelligent embodied agents need to quickly adapt to new scenarios by\nintegrating long histories of experience into decision-making. For instance, a\nrobot in an unfamiliar house initially wouldn't know the locations of objects\nneeded for tasks and might perform inefficiently. However, as it gathers more\nexperience, it should learn the layout of its environment and remember where\nobjects are, allowing it to complete new tasks more efficiently. To enable such\nrapid adaptation to new tasks, we present ReLIC, a new approach for in-context\nreinforcement learning (RL) for embodied agents. With ReLIC, agents are capable\nof adapting to new environments using 64,000 steps of in-context experience\nwith full attention while being trained through self-generated experience via\nRL. We achieve this by proposing a novel policy update scheme for on-policy RL\ncalled \"partial updates'' as well as a Sink-KV mechanism that enables effective\nutilization of a long observation history for embodied agents. Our method\noutperforms a variety of meta-RL baselines in adapting to unseen houses in an\nembodied multi-object navigation task. In addition, we find that ReLIC is\ncapable of few-shot imitation learning despite never being trained with expert\ndemonstrations. We also provide a comprehensive analysis of ReLIC, highlighting\nthat the combination of large-scale RL training, the proposed partial updates\nscheme, and the Sink-KV are essential for effective in-context learning. The\ncode for ReLIC and all our experiments is at https://github.com/aielawady/relic\n","authors":["Ahmad Elawady","Gunjan Chhablani","Ram Ramrakhya","Karmesh Yadav","Dhruv Batra","Zsolt Kira","Andrew Szot"],"pdf_url":"https://arxiv.org/pdf/2410.02751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04434v2","updated":"2024-10-03T17:57:59Z","published":"2024-09-06T17:55:49Z","title":"Accelerating Training with Neuron Interaction and Nowcasting Networks","summary":" Neural network training can be accelerated when a learnable update rule is\nused in lieu of classic adaptive optimizers (e.g. Adam). However, learnable\nupdate rules can be costly and unstable to train and use. Recently, Jang et al.\n(2023) proposed a simpler approach to accelerate training based on weight\nnowcaster networks (WNNs). In their approach, Adam is used for most of the\noptimization steps and periodically, only every few steps, a WNN nowcasts\n(predicts near future) parameters. We improve WNNs by proposing neuron\ninteraction and nowcasting (NiNo) networks. In contrast to WNNs, NiNo leverages\nneuron connectivity and graph neural networks to more accurately nowcast\nparameters. We further show that in some networks, such as Transformers,\nmodeling neuron connectivity accurately is challenging. 
We address this and\nother limitations, which allows NiNo to accelerate Adam training by up to 50%\nin vision and language tasks.\n","authors":["Boris Knyazev","Abhinav Moudgil","Guillaume Lajoie","Eugene Belilovsky","Simon Lacoste-Julien"],"pdf_url":"https://arxiv.org/pdf/2409.04434v2.pdf","comment":"added Llama3-based results and other updates, code is\n https://github.com/SamsungSAILMontreal/nino"},{"id":"http://arxiv.org/abs/2410.02750v1","updated":"2024-10-03T17:57:50Z","published":"2024-10-03T17:57:50Z","title":"An Online Automatic Modulation Classification Scheme Based on Isolation\n Distributional Kernel","summary":" Automatic Modulation Classification (AMC), as a crucial technique in modern\nnon-cooperative communication networks, plays a key role in various civil and\nmilitary applications. However, existing AMC methods usually are complicated\nand can work in batch mode only due to their high computational complexity.\nThis paper introduces a new online AMC scheme based on Isolation Distributional\nKernel. Our method stands out in two aspects. Firstly, it is the first proposal\nto represent baseband signals using a distributional kernel. Secondly, it\nintroduces a pioneering AMC technique that works well in online settings under\nrealistic time-varying channel conditions. Through extensive experiments in\nonline settings, we demonstrate the effectiveness of the proposed classifier.\nOur results indicate that the proposed approach outperforms existing baseline\nmodels, including two state-of-the-art deep learning classifiers. Moreover, it\ndistinguishes itself as the first online classifier for AMC with linear time\ncomplexity, which marks a significant efficiency boost for real-time\napplications.\n","authors":["Xinpeng Li","Zile Jiang","Kai Ming Ting","Ye Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.02750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02749v1","updated":"2024-10-03T17:57:22Z","published":"2024-10-03T17:57:22Z","title":"Training Language Models on Synthetic Edit Sequences Improves Code\n Synthesis","summary":" Software engineers mainly write code by editing existing programs. In\ncontrast, large language models (LLMs) autoregressively synthesize programs in\na single pass. One explanation for this is the scarcity of open-sourced edit\ndata. While high-quality instruction data for code synthesis is already scarce,\nhigh-quality edit data is even scarcer. To fill this gap, we develop a\nsynthetic data generation algorithm called LintSeq. This algorithm refactors\nexisting code into a sequence of code edits by using a linter to procedurally\nsample across the error-free insertions that can be used to sequentially write\nprograms. It outputs edit sequences as text strings consisting of consecutive\nprogram diffs. To test LintSeq, we use it to refactor a dataset of instruction\n+ program pairs into instruction + program-diff-sequence tuples. Then, we\ninstruction finetune a series of smaller LLMs ranging from 2.6B to 14B\nparameters on both the re-factored and original versions of this dataset,\ncomparing zero-shot performance on code synthesis benchmarks. We show that\nduring repeated sampling, edit sequence finetuned models produce more diverse\nprograms than baselines. This results in better inference-time scaling for\nbenchmark coverage as a function of samples, i.e. the fraction of problems\n\"pass@k\" solved by any attempt given \"k\" tries. 
For example, on HumanEval\npass@50, small LLMs finetuned on synthetic edit sequences are competitive with\nGPT-4 and outperform models finetuned on the baseline dataset by +20% (+/-3%)\nin absolute score. Finally, we also pretrain our own tiny LMs for code\nunderstanding. We show that finetuning tiny models on synthetic code edits\nresults in state-of-the-art code synthesis for the on-device model class. Our\n150M parameter edit sequence LM matches or outperforms code models with twice\nas many parameters, both with and without repeated sampling, including Codex\nand AlphaCode.\n","authors":["Ulyana Piterbarg","Lerrel Pinto","Rob Fergus"],"pdf_url":"https://arxiv.org/pdf/2410.02749v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18957v2","updated":"2024-10-03T17:57:07Z","published":"2024-09-27T17:58:50Z","title":"LML-DAP: Language Model Learning a Dataset for Data-Augmented Prediction","summary":" Classification tasks are typically handled using Machine Learning (ML)\nmodels, which lack a balance between accuracy and interpretability. This paper\nintroduces a new approach to using Large Language Models (LLMs) for\nclassification tasks in an explainable way. Unlike ML models that rely heavily\non data cleaning and feature engineering, this method streamlines the process\nusing LLMs. This paper proposes a new concept called \"Language Model Learning\n(LML)\" powered by a new method called \"Data-Augmented Prediction (DAP)\". The\nclassification is performed by LLMs using a method similar to humans manually\nexploring and understanding the data and deciding classifications using data as\na reference. In the LML process, a dataset is summarized and evaluated to\ndetermine the features that lead to the classification of each label the most.\nIn the process of DAP, the system uses the data summary and a row of the\ntesting dataset to automatically generate a query, which is used to retrieve\nrelevant rows from the dataset. A classification is generated by the LLM using\ndata summary and relevant rows, ensuring satisfactory accuracy even with\ncomplex data using context-aware decision-making. LML and DAP unlock the\npossibilities of new applications. The proposed method uses the words \"Act as\nan Explainable Machine Learning Model\" in the prompt to enhance the\ninterpretability of the predictions by allowing users to review the logic\nbehind each prediction. In some test cases, the system scored an accuracy above\n90%, proving the effectiveness of the system and its potential to outperform\nconventional ML models in various scenarios. The code is available at\nhttps://github.com/Pro-GenAI/LML-DAP\n","authors":["Praneeth Vadlapati"],"pdf_url":"https://arxiv.org/pdf/2409.18957v2.pdf","comment":"Updated title, abstract, and images"},{"id":"http://arxiv.org/abs/2410.02748v1","updated":"2024-10-03T17:57:01Z","published":"2024-10-03T17:57:01Z","title":"CriSPO: Multi-Aspect Critique-Suggestion-guided Automatic Prompt\n Optimization for Text Generation","summary":" Large language models (LLMs) can generate fluent summaries across domains\nusing prompting techniques, reducing the need to train models for summarization\napplications. However, crafting effective prompts that guide LLMs to generate\nsummaries with the appropriate level of detail and writing style remains a\nchallenge. In this paper, we explore the use of salient information extracted\nfrom the source document to enhance summarization prompts. 
We show that adding\nkeyphrases in prompts can improve ROUGE F1 and recall, making the generated\nsummaries more similar to the reference and more complete. The number of\nkeyphrases can control the precision-recall trade-off. Furthermore, our\nanalysis reveals that incorporating phrase-level salient information is\nsuperior to word- or sentence-level. However, the impact on hallucination is\nnot universally positive across LLMs. To conduct this analysis, we introduce\nKeyphrase Signal Extractor (CriSPO), a lightweight model that can be finetuned\nto extract salient keyphrases. By using CriSPO, we achieve consistent ROUGE\nimprovements across datasets and open-weight and proprietary LLMs without any\nLLM customization. Our findings provide insights into leveraging salient\ninformation in building prompt-based summarization systems.\n","authors":["Han He","Qianchu Liu","Lei Xu","Chaitanya Shivade","Yi Zhang","Sundararajan Srinivasan","Katrin Kirchhoff"],"pdf_url":"https://arxiv.org/pdf/2410.02748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07840v3","updated":"2024-10-03T17:56:12Z","published":"2024-04-11T15:27:56Z","title":"On Training Data Influence of GPT Models","summary":" Amidst the rapid advancements in generative language models, the\ninvestigation of how training data shapes the performance of GPT models is\nstill emerging. This paper presents GPTfluence, a novel approach that leverages\na featurized simulation to assess the impact of training examples on the\ntraining dynamics of GPT models. Our approach not only traces the influence of\nindividual training instances on performance trajectories, such as loss and\nother key metrics, on targeted test points but also enables a comprehensive\ncomparison with existing methods across various training scenarios in GPT\nmodels, ranging from 14 million to 2.8 billion parameters, across a range of\ndownstream tasks. Contrary to earlier methods that struggle with generalization\nto new data, GPTfluence introduces a parameterized simulation of training\ndynamics, demonstrating robust generalization capabilities to unseen training\ndata. This adaptability is evident across both fine-tuning and\ninstruction-tuning scenarios, spanning tasks in natural language understanding\nand generation. We make our code and data publicly available at\nhttps://github.com/ernie-research/gptfluence.\n","authors":["Yekun Chai","Qingyi Liu","Shuohuan Wang","Yu Sun","Qiwei Peng","Hua Wu"],"pdf_url":"https://arxiv.org/pdf/2404.07840v3.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.02746v1","updated":"2024-10-03T17:56:09Z","published":"2024-10-03T17:56:09Z","title":"Contrastive Localized Language-Image Pre-Training","summary":" Contrastive Language-Image Pre-training (CLIP) has been a celebrated method\nfor training vision encoders to generate image/text representations\nfacilitating various applications. Recently, CLIP has been widely adopted as\nthe vision backbone of multimodal large language models (MLLMs) to connect\nimage inputs for language interactions. The success of CLIP as a\nvision-language foundation model relies on aligning web-crawled noisy text\nannotations at image levels. Nevertheless, such criteria may become\ninsufficient for downstream tasks in need of fine-grained vision\nrepresentations, especially when region-level understanding is demanding for\nMLLMs. In this paper, we improve the localization capability of CLIP with\nseveral advances. 
We propose a pre-training method called Contrastive Localized\nLanguage-Image Pre-training (CLOC) by complementing CLIP with region-text\ncontrastive loss and modules. We formulate a new concept, promptable\nembeddings, of which the encoder produces image embeddings easy to transform\ninto region representations given spatial hints. To support large-scale\npre-training, we design a visually-enriched and spatially-localized captioning\nframework to effectively generate region-text pseudo-labels at scale. By\nscaling up to billions of annotated images, CLOC enables high-quality regional\nembeddings for image region recognition and retrieval tasks, and can be a\ndrop-in replacement of CLIP to enhance MLLMs, especially on referring and\ngrounding tasks.\n","authors":["Hong-You Chen","Zhengfeng Lai","Haotian Zhang","Xinze Wang","Marcin Eichner","Keen You","Meng Cao","Bowen Zhang","Yinfei Yang","Zhe Gan"],"pdf_url":"https://arxiv.org/pdf/2410.02746v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.02744v1","updated":"2024-10-03T17:55:17Z","published":"2024-10-03T17:55:17Z","title":"Neutral residues: revisiting adapters for model extension","summary":" We address the problem of extending a pretrained large language model to a\nnew domain that was not seen at training time, like adding a language for which\nthe original model has seen no or little training data. Popular solutions like\nfine-tuning or low-rank adaptation are successful at domain adaptation, but\nformally they do not add any extra capacity and degrade the performance in the\noriginal domain.\n Our paper analyzes this extension problem under three angles: data,\narchitecture and training procedure, which are advantageously considered\njointly. In particular, we improve adapters and make it possible to learn an\nentire new language while ensuring that the output of the neural network is\nalmost unchanged in the original domain. For this purpose, we modify the new\nresidual blocks in a way that leads each new residual block to output\nnear-zeros in the original domain.\n This solution of neutral residues, which borrows architectural components\nfrom mixture of experts, is effective: with only 20% extra learnable weights\ncompared to an original model trained on English, we get results that are\nsignificantly better than concurrent approaches (fine-tuning, low-rank or\nvanilla adapters) in terms of the trade-off between learning a new language and\nnot forgetting English.\n","authors":["Franck Signe Talla","Herve Jegou","Edouard Grave"],"pdf_url":"https://arxiv.org/pdf/2410.02744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02742v1","updated":"2024-10-03T17:55:09Z","published":"2024-10-03T17:55:09Z","title":"Grounding Large Language Models In Embodied Environment With Imperfect\n World Models","summary":" Despite a widespread success in various applications, large language models\n(LLMs) often stumble when tackling basic physical reasoning or executing\nrobotics tasks, due to a lack of direct experience with the physical nuances of\nthe real world. To address these issues, we propose a Grounding Large language\nmodel with Imperfect world MOdel (GLIMO), which utilizes proxy world models\nsuch as simulators to collect and synthesize trining data. GLIMO incorporates\nan LLM agent-based data generator to automatically create high-quality and\ndiverse instruction datasets. 
The generator includes an iterative self-refining\nmodule for temporally consistent experience sampling, a diverse set of\nquestion-answering instruction seeds, and a retrieval-augmented generation\nmodule for reflecting on prior experiences. Comprehensive experiments show that\nour approach improve the performance of strong open-source LLMs like LLaMA-3\nwith a performance boost of 2.04 $\\times$, 1.54 $\\times$, and 1.82 $\\times$\nacross three different benchmarks, respectively. The performance is able to\ncompete with or surpass their larger counterparts such as GPT-4.\n","authors":["Haolan Liu","Jishen Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.02742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02741v1","updated":"2024-10-03T17:54:56Z","published":"2024-10-03T17:54:56Z","title":"Salient Information Prompting to Steer Content in Prompt-based\n Abstractive Summarization","summary":" Large language models (LLMs) can generate fluent summaries across domains\nusing prompting techniques, reducing the need to train models for summarization\napplications. However, crafting effective prompts that guide LLMs to generate\nsummaries with the appropriate level of detail and writing style remains a\nchallenge. In this paper, we explore the use of salient information extracted\nfrom the source document to enhance summarization prompts. We show that adding\nkeyphrases in prompts can improve ROUGE F1 and recall, making the generated\nsummaries more similar to the reference and more complete. The number of\nkeyphrases can control the precision-recall trade-off. Furthermore, our\nanalysis reveals that incorporating phrase-level salient information is\nsuperior to word- or sentence-level. However, the impact on hallucination is\nnot universally positive across LLMs. To conduct this analysis, we introduce\nKeyphrase Signal Extractor (SigExt), a lightweight model that can be finetuned\nto extract salient keyphrases. By using SigExt, we achieve consistent ROUGE\nimprovements across datasets and open-weight and proprietary LLMs without any\nLLM customization. Our findings provide insights into leveraging salient\ninformation in building prompt-based summarization systems.\n","authors":["Lei Xu","Mohammed Asad Karim","Saket Dingliwal","Aparna Elangovan"],"pdf_url":"https://arxiv.org/pdf/2410.02741v1.pdf","comment":"Accepted to EMNLP 2024 Industry Track"},{"id":"http://arxiv.org/abs/2410.02740v1","updated":"2024-10-03T17:54:52Z","published":"2024-10-03T17:54:52Z","title":"Revisit Large-Scale Image-Caption Data in Pre-training Multimodal\n Foundation Models","summary":" Recent advancements in multimodal models highlight the value of rewritten\ncaptions for improving performance, yet key challenges remain. For example,\nwhile synthetic captions often provide superior quality and image-text\nalignment, it is not clear whether they can fully replace AltTexts: the role of\nsynthetic captions and their interaction with original web-crawled AltTexts in\npre-training is still not well understood. Moreover, different multimodal\nfoundation models may have unique preferences for specific caption formats, but\nefforts to identify the optimal captions for each model remain limited. In this\nwork, we propose a novel, controllable, and scalable captioning pipeline\ndesigned to generate diverse caption formats tailored to various multimodal\nmodels. 
By examining Short Synthetic Captions (SSC) towards Dense Synthetic\nCaptions (DSC+) as case studies, we systematically explore their effects and\ninteractions with AltTexts across models such as CLIP, multimodal LLMs, and\ndiffusion models. Our findings reveal that a hybrid approach that keeps both\nsynthetic captions and AltTexts can outperform the use of synthetic captions\nalone, improving both alignment and performance, with each model demonstrating\npreferences for particular caption formats. This comprehensive analysis\nprovides valuable insights into optimizing captioning strategies, thereby\nadvancing the pre-training of multimodal foundation models.\n","authors":["Zhengfeng Lai","Vasileios Saveris","Chen Chen","Hong-You Chen","Haotian Zhang","Bowen Zhang","Juan Lao Tebar","Wenze Hu","Zhe Gan","Peter Grasch","Meng Cao","Yinfei Yang"],"pdf_url":"https://arxiv.org/pdf/2410.02740v1.pdf","comment":"CV/ML"},{"id":"http://arxiv.org/abs/2410.02735v1","updated":"2024-10-03T17:52:42Z","published":"2024-10-03T17:52:42Z","title":"OOD-Chameleon: Is Algorithm Selection for OOD Generalization Learnable?","summary":" Out-of-distribution (OOD) generalization is challenging because distribution\nshifts come in many forms. A multitude of learning algorithms exist and each\ncan improve performance in specific OOD situations. We posit that much of the\nchallenge of OOD generalization lies in choosing the right algorithm for the\nright dataset. However, such algorithm selection is often elusive under complex\nreal-world shifts. In this work, we formalize the task of algorithm selection\nfor OOD generalization and investigate whether it could be approached by\nlearning. We propose a solution, dubbed OOD-Chameleon that treats the task as a\nsupervised classification over candidate algorithms. We construct a dataset of\ndatasets to learn from, which represents diverse types, magnitudes and\ncombinations of shifts (covariate shift, label shift, spurious correlations).\nWe train the model to predict the relative performance of algorithms given a\ndataset's characteristics. This enables a priori selection of the best learning\nstrategy, i.e. without training various models as needed with traditional model\nselection. Our experiments show that the adaptive selection outperforms any\nindividual algorithm and simple selection heuristics, on unseen datasets of\ncontrollable and realistic image data. Inspecting the model shows that it\nlearns non-trivial data/algorithms interactions, and reveals the conditions for\nany one algorithm to surpass another. This opens new avenues for (1) enhancing\nOOD generalization with existing algorithms instead of designing new ones, and\n(2) gaining insights into the applicability of existing algorithms with respect\nto datasets' properties.\n","authors":["Liangze Jiang","Damien Teney"],"pdf_url":"https://arxiv.org/pdf/2410.02735v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02733v1","updated":"2024-10-03T17:51:21Z","published":"2024-10-03T17:51:21Z","title":"Data Similarity-Based One-Shot Clustering for Multi-Task Hierarchical\n Federated Learning","summary":" We address the problem of cluster identity estimation in a hierarchical\nfederated learning setting in which users work toward learning different tasks.\nTo overcome the challenge of task heterogeneity, users need to be grouped in a\nway such that users with the same task are in the same group, conducting\ntraining together, while sharing the weights of feature extraction layers with\nthe other groups. 
Toward that end, we propose a one-shot clustering algorithm\nthat can effectively identify and group users based on their data similarity.\nThis enables more efficient collaboration and sharing of a common layer\nrepresentation within the federated learning system. Our proposed algorithm not\nonly enhances the clustering process, but also overcomes challenges related to\nprivacy concerns, communication overhead, and the need for prior knowledge\nabout learning models or loss function behaviors. We validate our proposed\nalgorithm using various datasets such as CIFAR-10 and Fashion MNIST, and show\nthat it outperforms the baseline in terms of accuracy and variance reduction.\n","authors":["Abdulmoneam Ali","Ahmed Arafa"],"pdf_url":"https://arxiv.org/pdf/2410.02733v1.pdf","comment":"To appear in Asilomar 2024"},{"id":"http://arxiv.org/abs/2407.00023v2","updated":"2024-10-03T17:50:33Z","published":"2024-05-08T06:30:58Z","title":"Preble: Efficient Distributed Prompt Scheduling for LLM Serving","summary":" Prompts to large language models (LLMs) have evolved beyond simple user\nquestions. For LLMs to solve complex problems, today's practices are to include\ndomain-specific instructions, illustration of tool usages, and/or long context\nsuch as textbook chapters in prompts. As such, many parts of prompts are\nrepetitive across requests. Recent works propose to cache and reuse KV state of\nprompts. However, they are all confined to a single-GPU optimization, while\nproduction LLM serving systems are distributed by nature.\n This paper proposes Preble, the first distributed LLM serving platform that\ntargets and optimizes for prompt sharing. We designed a distributed scheduling\nsystem that co-optimizes KV state reuse and computation load-balancing with a\nnew scheduling algorithm and a hierarchical scheduling mechanism. Our\nevaluation of Preble with real workloads and request arrival patterns on two\nopen-source LLMs shows that Preble outperforms the SOTA serving systems by 1.5X\nto 14.5X on average latency and 2X to 10X on p99 latency.\n","authors":["Vikranth Srivatsa","Zijian He","Reyna Abhyankar","Dongming Li","Yiying Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.00023v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02725v1","updated":"2024-10-03T17:47:29Z","published":"2024-10-03T17:47:29Z","title":"Adaptive Inference-Time Compute: LLMs Can Predict if They Can Do Better,\n Even Mid-Generation","summary":" Inference-time computation is a powerful paradigm to enhance the performance\nof large language models (LLMs), with Best-of-N sampling being a widely used\ntechnique. However, this method is computationally expensive, requiring both\n(1) an external reward model and (2) the generation of multiple samples. In\nthis work, we introduce a new generative self-evaluation scheme designed to\nadaptively reduce the number of generated samples while maintaining or even\nimproving performance. We use a generative reward model formulation, allowing\nthe LLM to predict mid-generation the probability that restarting the\ngeneration will yield a better response. These predictions are obtained without\nan external reward model and can be used to decide whether or not to generate\nmore samples, prune unpromising samples early on, or to pick the best sample.\nThis capability is very inexpensive as it involves generating a single\npredefined token. 
Trained using a dataset constructed with real unfiltered\nLMSYS user prompts, Llama 3.1 8B's win rate against GPT-4 on AlpacaEval\nincreases from 21% to 34% with 16 samples and math performance on GSM8K\nimproves from 84% to 91%. By sampling only when the LLM determines that it is\nbeneficial to do so and adaptively adjusting temperature annealing, we\ndemonstrate that 74% of the improvement from using 16 samples can be achieved\nwith only 1.2 samples on average. We further demonstrate that 50-75% of samples\ncan be pruned early in generation with minimal degradation in performance.\nOverall, our methods enable more efficient and scalable compute utilization\nduring inference for LLMs.\n","authors":["Rohin Manvi","Anikait Singh","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2410.02725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02724v1","updated":"2024-10-03T17:45:31Z","published":"2024-10-03T17:45:31Z","title":"Large Language Models as Markov Chains","summary":" Large language models (LLMs) have proven to be remarkably efficient, both\nacross a wide range of natural language processing tasks and well beyond them.\nHowever, a comprehensive theoretical analysis of the origins of their\nimpressive performance remains elusive. In this paper, we approach this\nchallenging task by drawing an equivalence between generic autoregressive\nlanguage models with vocabulary of size $T$ and context window of size $K$ and\nMarkov chains defined on a finite state space of size $\\mathcal{O}(T^K)$. We\nderive several surprising findings related to the existence of a stationary\ndistribution of Markov chains that capture the inference power of LLMs, their\nspeed of convergence to it, and the influence of the temperature on the latter.\nWe then prove pre-training and in-context generalization bounds and show how\nthe drawn equivalence allows us to enrich their interpretation. Finally, we\nillustrate our theoretical guarantees with experiments on several recent LLMs\nto highlight how they capture the behavior observed in practice.\n","authors":["Oussama Zekri","Ambroise Odonnat","Abdelhakim Benechehab","Linus Bleistein","Nicolas Boullé","Ievgen Redko"],"pdf_url":"https://arxiv.org/pdf/2410.02724v1.pdf","comment":"49 pages, 17 figures"},{"id":"http://arxiv.org/abs/2405.15429v4","updated":"2024-10-03T17:44:27Z","published":"2024-05-24T10:55:38Z","title":"E(n) Equivariant Topological Neural Networks","summary":" Graph neural networks excel at modeling pairwise interactions, but they\ncannot flexibly accommodate higher-order interactions and features. Topological\ndeep learning (TDL) has emerged recently as a promising tool for addressing\nthis issue. TDL enables the principled modeling of arbitrary multi-way,\nhierarchical higher-order interactions by operating on combinatorial\ntopological spaces, such as simplicial or cell complexes, instead of graphs.\nHowever, little is known about how to leverage geometric features such as\npositions and velocities for TDL. This paper introduces E(n)-Equivariant\nTopological Neural Networks (ETNNs), which are E(n)-equivariant message-passing\nnetworks operating on combinatorial complexes, formal objects unifying graphs,\nhypergraphs, simplicial, path, and cell complexes. ETNNs incorporate geometric\nnode features while respecting rotation, reflection, and translation\nequivariance. Moreover, ETNNs are natively ready for settings with\nheterogeneous interactions. 
We provide a theoretical analysis to show the\nimproved expressiveness of ETNNs over architectures for geometric graphs. We\nalso show how E(n)-equivariant variants of TDL models can be directly derived\nfrom our framework. The broad applicability of ETNNs is demonstrated through\ntwo tasks of vastly different scales: i) molecular property prediction on the\nQM9 benchmark and ii) land-use regression for hyper-local estimation of air\npollution with multi-resolution irregular geospatial data. The results indicate\nthat ETNNs are an effective tool for learning from diverse types of richly\nstructured data, as they match or surpass SotA equivariant TDL models with a\nsignificantly smaller computational burden, thus highlighting the benefits of a\nprincipled geometric inductive bias.\n","authors":["Claudio Battiloro","Ege Karaismailoğlu","Mauricio Tec","George Dasoulas","Michelle Audirac","Francesca Dominici"],"pdf_url":"https://arxiv.org/pdf/2405.15429v4.pdf","comment":"41 pages, 11 figures, 12 tables"},{"id":"http://arxiv.org/abs/2410.02718v1","updated":"2024-10-03T17:38:46Z","published":"2024-10-03T17:38:46Z","title":"SynthFormer: Equivariant Pharmacophore-based Generation of Molecules for\n Ligand-Based Drug Design","summary":" Drug discovery is a complex and resource-intensive process, with significant\ntime and cost investments required to bring new medicines to patients. Recent\nadvancements in generative machine learning (ML) methods offer promising\navenues to accelerate early-stage drug discovery by efficiently exploring\nchemical space. This paper addresses the gap between in silico generative\napproaches and practical in vitro methodologies, highlighting the need for\ntheir integration to optimize molecule discovery. We introduce SynthFormer, a\nnovel ML model that utilizes a 3D equivariant encoder for pharmacophores to\ngenerate fully synthesizable molecules, constructed as synthetic trees. Unlike\nprevious methods, SynthFormer incorporates 3D information and provides\nsynthetic paths, enhancing its ability to produce molecules with good docking\nscores across various proteins. Our contributions include a new methodology for\nefficient chemical space exploration using 3D information, a novel architecture\ncalled Synthformer for translating 3D pharmacophore representations into\nmolecules, and a meaningful embedding space that organizes reagents for drug\ndiscovery optimization. Synthformer generates molecules that dock well and\nenables effective late-stage optimization restricted by synthesis paths.\n","authors":["Zygimantas Jocys","Henriette M. G. Willems","Katayoun Farrahi"],"pdf_url":"https://arxiv.org/pdf/2410.02718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02717v1","updated":"2024-10-03T17:38:43Z","published":"2024-10-03T17:38:43Z","title":"Measurements with Noise: Bayesian Optimization for Co-optimizing Noise\n and Property Discovery in Automated Experiments","summary":" We have developed a Bayesian optimization (BO) workflow that integrates\nintra-step noise optimization into automated experimental cycles. Traditional\nBO approaches in automated experiments focus on optimizing experimental\ntrajectories but often overlook the impact of measurement noise on data quality\nand cost. Our proposed framework simultaneously optimizes both the target\nproperty and the associated measurement noise by introducing time as an\nadditional input parameter, thereby balancing the signal-to-noise ratio and\nexperimental duration. 
Two approaches are explored: a reward-driven noise\noptimization and a double-optimization acquisition function, both enhancing the\nefficiency of automated workflows by considering noise and cost within the\noptimization process. We validate our method through simulations and real-world\nexperiments using Piezoresponse Force Microscopy (PFM), demonstrating the\nsuccessful optimization of measurement duration and property exploration. Our\napproach offers a scalable solution for optimizing multiple variables in\nautomated experimental workflows, improving data quality, and reducing resource\nexpenditure in materials science and beyond.\n","authors":["Boris N. Slautin","Yu Liu","Jan Dec","Vladimir V. Shvartsman","Doru C. Lupascu","Maxim Ziatdinov","Sergei V. Kalinin"],"pdf_url":"https://arxiv.org/pdf/2410.02717v1.pdf","comment":"22 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.05689v3","updated":"2024-10-03T17:37:33Z","published":"2024-02-08T14:07:20Z","title":"Unichain and Aperiodicity are Sufficient for Asymptotic Optimality of\n Average-Reward Restless Bandits","summary":" We consider the infinite-horizon, average-reward restless bandit problem in\ndiscrete time. We propose a new class of policies that are designed to drive a\nprogressively larger subset of arms toward the optimal distribution. We show\nthat our policies are asymptotically optimal with an $O(1/\\sqrt{N})$ optimality\ngap for an $N$-armed problem, assuming only a unichain and aperiodicity\nassumption. Our approach departs from most existing work that focuses on index\nor priority policies, which rely on the Global Attractor Property (GAP) to\nguarantee convergence to the optimum, or a recently developed simulation-based\npolicy, which requires a Synchronization Assumption (SA).\n","authors":["Yige Hong","Qiaomin Xie","Yudong Chen","Weina Wang"],"pdf_url":"https://arxiv.org/pdf/2402.05689v3.pdf","comment":"58 pages, 14 figures. This version includes a restructured main\n result section and new experiments"},{"id":"http://arxiv.org/abs/2410.02714v1","updated":"2024-10-03T17:37:18Z","published":"2024-10-03T17:37:18Z","title":"AlzhiNet: Traversing from 2DCNN to 3DCNN, Towards Early Detection and\n Diagnosis of Alzheimer's Disease","summary":" Alzheimer's disease (AD) is a progressive neurodegenerative disorder with\nincreasing prevalence among the aging population, necessitating early and\naccurate diagnosis for effective disease management. In this study, we present\na novel hybrid deep learning framework that integrates both 2D Convolutional\nNeural Networks (2D-CNN) and 3D Convolutional Neural Networks (3D-CNN), along\nwith a custom loss function and volumetric data augmentation, to enhance\nfeature extraction and improve classification performance in AD diagnosis.\nAccording to extensive experiments, AlzhiNet outperforms standalone 2D and 3D\nmodels, highlighting the importance of combining these complementary\nrepresentations of data. The depth and quality of 3D volumes derived from the\naugmented 2D slices also significantly influence the model's performance. The\nresults indicate that carefully selecting weighting factors in hybrid\npredictions is imperative for achieving optimal results. Our framework has been\nvalidated on the Magnetic Resonance Imaging (MRI) from Kaggle and MIRIAD\ndatasets, obtaining accuracies of 98.9% and 99.99%, respectively, with an AUC\nof 100%. 
Furthermore, AlzhiNet was studied under a variety of perturbation\nscenarios on the Alzheimer's Kaggle dataset, including Gaussian noise,\nbrightness, contrast, salt and pepper noise, color jitter, and occlusion. The\nresults obtained show that AlzhiNet is more robust to perturbations than\nResNet-18, making it an excellent choice for real-world applications. This\napproach represents a promising advancement in the early diagnosis and\ntreatment planning for Alzheimer's disease.\n","authors":["Romoke Grace Akindele","Samuel Adebayo","Paul Shekonya Kanda","Ming Yu"],"pdf_url":"https://arxiv.org/pdf/2410.02714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02711v1","updated":"2024-10-03T17:35:38Z","published":"2024-10-03T17:35:38Z","title":"NETS: A Non-Equilibrium Transport Sampler","summary":" We propose an algorithm, termed the Non-Equilibrium Transport Sampler (NETS),\nto sample from unnormalized probability distributions. NETS can be viewed as a\nvariant of annealed importance sampling (AIS) based on Jarzynski's equality, in\nwhich the stochastic differential equation used to perform the non-equilibrium\nsampling is augmented with an additional learned drift term that lowers the\nimpact of the unbiasing weights used in AIS. We show that this drift is the\nminimizer of a variety of objective functions, which can all be estimated in an\nunbiased fashion without backpropagating through solutions of the stochastic\ndifferential equations governing the sampling. We also prove that some these\nobjectives control the Kullback-Leibler divergence of the estimated\ndistribution from its target. NETS is shown to be unbiased and, in addition,\nhas a tunable diffusion coefficient which can be adjusted post-training to\nmaximize the effective sample size. We demonstrate the efficacy of the method\non standard benchmarks, high-dimensional Gaussian mixture distributions, and a\nmodel from statistical lattice field theory, for which it surpasses the\nperformances of related work and existing baselines.\n","authors":["Michael S. Albergo","Eric Vanden-Eijnden"],"pdf_url":"https://arxiv.org/pdf/2410.02711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02703v1","updated":"2024-10-03T17:27:30Z","published":"2024-10-03T17:27:30Z","title":"Selective Attention Improves Transformer","summary":" Unneeded elements in the attention's context degrade performance. We\nintroduce Selective Attention, a simple parameter-free change to the standard\nattention mechanism which reduces attention to unneeded elements. Selective\nattention improves language modeling performance in a variety of model sizes\nand context lengths. For example, a range of transformers trained with the\nlanguage modeling objective on C4 with selective attention perform equivalently\nto standard transformers with ~2X more heads and parameters in their attention\nmodules. Selective attention also allows decreasing the size of the attention's\ncontext buffer, leading to meaningful reductions in the memory and compute\nrequirements during inference. 
For example, transformers with 100M parameters\ntrained on C4 with context sizes of 512, 1,024, and 2,048 need 16X, 25X, and\n47X less memory for their attention module, respectively, when equipped with\nselective attention, as those without selective attention, with the same\nvalidation perplexity.\n","authors":["Yaniv Leviathan","Matan Kalman","Yossi Matias"],"pdf_url":"https://arxiv.org/pdf/2410.02703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07071v2","updated":"2024-10-03T17:26:48Z","published":"2024-07-09T17:44:34Z","title":"Lookback Lens: Detecting and Mitigating Contextual Hallucinations in\n Large Language Models Using Only Attention Maps","summary":" When asked to summarize articles or answer questions given a passage, large\nlanguage models (LLMs) can hallucinate details and respond with unsubstantiated\nanswers that are inaccurate with respect to the input context. This paper\ndescribes a simple approach for detecting such contextual hallucinations. We\nhypothesize that contextual hallucinations are related to the extent to which\nan LLM attends to information in the provided context versus its own\ngenerations. Based on this intuition, we propose a simple hallucination\ndetection model whose input features are given by the ratio of attention\nweights on the context versus newly generated tokens (for each attention head).\nWe find that a linear classifier based on these lookback ratio features is as\neffective as a richer detector that utilizes the entire hidden states of an LLM\nor a text-based entailment model. The lookback ratio-based detector -- Lookback\nLens -- is found to transfer across tasks and even models, allowing a detector\nthat is trained on a 7B model to be applied (without retraining) to a larger\n13B model. We further apply this detector to mitigate contextual\nhallucinations, and find that a simple classifier-guided decoding approach is\nable to reduce the amount of hallucination, for example by 9.6% in the XSum\nsummarization task.\n","authors":["Yung-Sung Chuang","Linlu Qiu","Cheng-Yu Hsieh","Ranjay Krishna","Yoon Kim","James Glass"],"pdf_url":"https://arxiv.org/pdf/2407.07071v2.pdf","comment":"EMNLP 2024 main conference long paper. The source code is available\n at https://github.com/voidism/Lookback-Lens"},{"id":"http://arxiv.org/abs/2406.03520v2","updated":"2024-10-03T17:24:40Z","published":"2024-06-05T17:53:55Z","title":"VideoPhy: Evaluating Physical Commonsense for Video Generation","summary":" Recent advances in internet-scale video data pretraining have led to the\ndevelopment of text-to-video generative models that can create high-quality\nvideos across a broad range of visual concepts, synthesize realistic motions\nand render complex objects. Hence, these generative models have the potential\nto become general-purpose simulators of the physical world. However, it is\nunclear how far we are from this goal with the existing text-to-video\ngenerative models. To this end, we present VideoPhy, a benchmark designed to\nassess whether the generated videos follow physical commonsense for real-world\nactivities (e.g. marbles will roll down when placed on a slanted surface).\nSpecifically, we curate diverse prompts that involve interactions between\nvarious material types in the physical world (e.g., solid-solid, solid-fluid,\nfluid-fluid). We then generate videos conditioned on these captions from\ndiverse state-of-the-art text-to-video generative models, including open models\n(e.g., CogVideoX) and closed models (e.g., Lumiere, Dream Machine). 
Our human\nevaluation reveals that the existing models severely lack the ability to\ngenerate videos adhering to the given text prompts, while also lack physical\ncommonsense. Specifically, the best performing model, CogVideoX-5B, generates\nvideos that adhere to the caption and physical laws for 39.6% of the instances.\nVideoPhy thus highlights that the video generative models are far from\naccurately simulating the physical world. Finally, we propose an\nauto-evaluator, VideoCon-Physics, to assess the performance reliably for the\nnewly released models.\n","authors":["Hritik Bansal","Zongyu Lin","Tianyi Xie","Zeshun Zong","Michal Yarom","Yonatan Bitton","Chenfanfu Jiang","Yizhou Sun","Kai-Wei Chang","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2406.03520v2.pdf","comment":"43 pages, 29 figures, 12 tables. Added CogVideo and Dream Machine in\n v2"},{"id":"http://arxiv.org/abs/2410.02698v1","updated":"2024-10-03T17:21:30Z","published":"2024-10-03T17:21:30Z","title":"Lie Algebra Canonicalization: Equivariant Neural Operators under\n arbitrary Lie Groups","summary":" The quest for robust and generalizable machine learning models has driven\nrecent interest in exploiting symmetries through equivariant neural networks.\nIn the context of PDE solvers, recent works have shown that Lie point\nsymmetries can be a useful inductive bias for Physics-Informed Neural Networks\n(PINNs) through data and loss augmentation. Despite this, directly enforcing\nequivariance within the model architecture for these problems remains elusive.\nThis is because many PDEs admit non-compact symmetry groups, oftentimes not\nstudied beyond their infinitesimal generators, making them incompatible with\nmost existing equivariant architectures. In this work, we propose Lie aLgebrA\nCanonicalization (LieLAC), a novel approach that exploits only the action of\ninfinitesimal generators of the symmetry group, circumventing the need for\nknowledge of the full group structure. To achieve this, we address existing\ntheoretical issues in the canonicalization literature, establishing connections\nwith frame averaging in the case of continuous non-compact groups. Operating\nwithin the framework of canonicalization, LieLAC can easily be integrated with\nunconstrained pre-trained models, transforming inputs to a canonical form\nbefore feeding them into the existing model, effectively aligning the input for\nmodel inference according to allowed symmetries. LieLAC utilizes standard Lie\ngroup descent schemes, achieving equivariance in pre-trained models. Finally,\nwe showcase LieLAC's efficacy on tasks of invariant image classification and\nLie point symmetry equivariant neural PDE solvers using pre-trained models.\n","authors":["Zakhar Shumaylov","Peter Zaika","James Rowbottom","Ferdia Sherry","Melanie Weber","Carola-Bibiane Schönlieb"],"pdf_url":"https://arxiv.org/pdf/2410.02698v1.pdf","comment":"40 pages; preprint"},{"id":"http://arxiv.org/abs/2402.17917v2","updated":"2024-10-03T17:18:53Z","published":"2024-02-27T22:10:51Z","title":"Collaborative learning of common latent representations in routinely\n collected multivariate ICU physiological signals","summary":" In Intensive Care Units (ICU), the abundance of multivariate time series\npresents an opportunity for machine learning (ML) to enhance patient\nphenotyping. In contrast to previous research focused on electronic health\nrecords (EHR), here we propose an ML approach for phenotyping using routinely\ncollected physiological time series data. 
Our new algorithm integrates Long\nShort-Term Memory (LSTM) networks with collaborative filtering concepts to\nidentify common physiological states across patients. Tested on real-world ICU\nclinical data for intracranial hypertension (IH) detection in patients with\nbrain injury, our method achieved an area under the curve (AUC) of 0.889 and\naverage precision (AP) of 0.725. Moreover, our algorithm outperforms\nautoencoders in learning more structured latent representations of the\nphysiological signals. These findings highlight the promise of our methodology\nfor patient phenotyping, leveraging routinely collected multivariate time\nseries to improve clinical care practices.\n","authors":["Hollan Haule","Ian Piper","Patricia Jones","Tsz-Yan Milly Lo","Javier Escudero"],"pdf_url":"https://arxiv.org/pdf/2402.17917v2.pdf","comment":"Published in 2024 IEEE International Conference on Acoustics, Speech,\n and Signal Processing Workshops (ICASSPW)"},{"id":"http://arxiv.org/abs/2410.02693v1","updated":"2024-10-03T17:18:37Z","published":"2024-10-03T17:18:37Z","title":"Discovering Clues of Spoofed LM Watermarks","summary":" LLM watermarks stand out as a promising way to attribute ownership of\nLLM-generated text. One threat to watermark credibility comes from spoofing\nattacks, where an unauthorized third party forges the watermark, enabling it to\nfalsely attribute arbitrary texts to a particular LLM. While recent works have\ndemonstrated that state-of-the-art schemes are in fact vulnerable to spoofing,\nthey lack deeper qualitative analysis of the texts produced by spoofing\nmethods. In this work, we for the first time reveal that there are observable\ndifferences between genuine and spoofed watermark texts. Namely, we show that\nregardless of their underlying approach, all current spoofing methods\nconsistently leave observable artifacts in spoofed texts, indicative of\nwatermark forgery. We build upon these findings to propose rigorous statistical\ntests that reliably reveal the presence of such artifacts, effectively\ndiscovering that a watermark was spoofed. Our experimental evaluation shows\nhigh test power across all current spoofing methods, providing insights into\ntheir fundamental limitations, and suggesting a way to mitigate this threat.\n","authors":["Thibaud Gloaguen","Nikola Jovanović","Robin Staab","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2410.02693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10224v4","updated":"2024-10-03T17:13:41Z","published":"2023-10-16T09:34:06Z","title":"Generalizing Medical Image Representations via Quaternion Wavelet\n Networks","summary":" Neural network generalizability is becoming a broad research field due to the\nincreasing availability of datasets from different sources and for various\ntasks. This issue is even wider when processing medical data, where a lack of\nmethodological standards causes large variations being provided by different\nimaging centers or acquired with various devices and cofactors. To overcome\nthese limitations, we introduce a novel, generalizable, data- and task-agnostic\nframework able to extract salient features from medical images. The proposed\nquaternion wavelet network (QUAVE) can be easily integrated with any\npre-existing medical image analysis or synthesis task, and it can be involved\nwith real, quaternion, or hypercomplex-valued models, generalizing their\nadoption to single-channel data. 
QUAVE first extracts different sub-bands\nthrough the quaternion wavelet transform, resulting in both\nlow-frequency/approximation bands and high-frequency/fine-grained features.\nThen, it weighs the most representative set of sub-bands to be involved as\ninput to any other neural model for image processing, replacing standard data\nsamples. We conduct an extensive experimental evaluation comprising different\ndatasets, diverse image analysis, and synthesis tasks including reconstruction,\nsegmentation, and modality translation. We also evaluate QUAVE in combination\nwith both real and quaternion-valued models. Results demonstrate the\neffectiveness and the generalizability of the proposed framework that improves\nnetwork performance while being flexible to be adopted in manifold scenarios\nand robust to domain shifts. The full code is available at:\nhttps://github.com/ispamm/QWT.\n","authors":["Luigi Sigillo","Eleonora Grassucci","Aurelio Uncini","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2310.10224v4.pdf","comment":"This paper is currently under review"},{"id":"http://arxiv.org/abs/2409.03650v2","updated":"2024-10-03T17:13:04Z","published":"2024-09-05T16:08:19Z","title":"On the Limited Generalization Capability of the Implicit Reward Model\n Induced by Direct Preference Optimization","summary":" Reinforcement Learning from Human Feedback (RLHF) is an effective approach\nfor aligning language models to human preferences. Central to RLHF is learning\na reward function for scoring human preferences. Two main approaches for\nlearning a reward model are 1) training an EXplicit Reward Model (EXRM) as in\nRLHF, and 2) using an implicit reward learned from preference data through\nmethods such as Direct Preference Optimization (DPO). Prior work has shown that\nthe implicit reward model of DPO (denoted as DPORM) can approximate an EXRM in\nthe limit. DPORM's effectiveness directly implies the optimality of the learned\npolicy, and also has practical implication for LLM alignment methods including\niterative DPO. However, it is unclear how well DPORM empirically matches the\nperformance of EXRM. This work studies the accuracy at distinguishing preferred\nand rejected answers for both DPORM and EXRM. Our findings indicate that even\nthough DPORM fits the training dataset comparably, it generalizes less\neffectively than EXRM, especially when the validation datasets contain\ndistribution shifts. Across five out-of-distribution settings, DPORM has a mean\ndrop in accuracy of 3% and a maximum drop of 7%. These findings highlight that\nDPORM has limited generalization ability and substantiates the integration of\nan explicit reward model in iterative DPO approaches.\n","authors":["Yong Lin","Skyler Seto","Maartje ter Hoeve","Katherine Metcalf","Barry-John Theobald","Xuan Wang","Yizhe Zhang","Chen Huang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03650v2.pdf","comment":"12 pages, 8 tables, 3 figures; Paper Accepted at EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2406.18725v2","updated":"2024-10-03T17:10:09Z","published":"2024-06-26T19:48:48Z","title":"Jailbreaking LLMs with Arabic Transliteration and Arabizi","summary":" This study identifies the potential vulnerabilities of Large Language Models\n(LLMs) to 'jailbreak' attacks, specifically focusing on the Arabic language and\nits various forms. While most research has concentrated on English-based prompt\nmanipulation, our investigation broadens the scope to investigate the Arabic\nlanguage. 
We initially tested the AdvBench benchmark in Standardized Arabic,\nfinding that even with prompt manipulation techniques like prefix injection, it\nwas insufficient to provoke LLMs into generating unsafe content. However, when\nusing Arabic transliteration and chatspeak (or arabizi), we found that unsafe\ncontent could be produced on platforms like OpenAI GPT-4 and Anthropic Claude 3\nSonnet. Our findings suggest that using Arabic and its various forms could\nexpose information that might remain hidden, potentially increasing the risk of\njailbreak attacks. We hypothesize that this exposure could be due to the\nmodel's learned connection to specific words, highlighting the need for more\ncomprehensive safety training across all language forms.\n","authors":["Mansour Al Ghanim","Saleh Almohaimeed","Mengxin Zheng","Yan Solihin","Qian Lou"],"pdf_url":"https://arxiv.org/pdf/2406.18725v2.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.02683v1","updated":"2024-10-03T17:08:52Z","published":"2024-10-03T17:08:52Z","title":"DailyDilemmas: Revealing Value Preferences of LLMs with Quandaries of\n Daily Life","summary":" As we increasingly seek guidance from LLMs for decision-making in daily life,\nmany of these decisions are not clear-cut and depend significantly on the\npersonal values and ethical standards of the users. We present DailyDilemmas, a\ndataset of 1,360 moral dilemmas encountered in everyday life. Each dilemma\nincludes two possible actions and with each action, the affected parties and\nhuman values invoked. Based on these dilemmas, we consolidated a set of human\nvalues across everyday topics e.g., interpersonal relationships, workplace, and\nenvironmental issues. We evaluated LLMs on these dilemmas to determine what\naction they will take and the values represented by these actions. Then, we\nanalyzed these values through the lens of five popular theories inspired by\nsociology, psychology and philosophy. These theories are: World Value Survey,\nMoral Foundation Theory, Maslow's Hierarchy of Needs, Aristotle's Virtues, and\nPlutchik Wheel of Emotion. We find that LLMs are most aligned with the\nself-expression over survival values in terms of World Value Survey, care over\nloyalty in Moral Foundation Theory. Interestingly, we find large preferences\ndifferences in models for some core values such as truthfulness e.g.,\nMixtral-8x7B model tends to neglect it by 9.7% while GPT-4-turbo model tends to\nselect it by 9.4%. We also study the recent guidance released by OpenAI\n(ModelSpec), and Anthropic (Constitutional AI) to understand how their released\nprinciples reflect their actual value prioritization when facing nuanced moral\nreasoning in daily-life settings. We find that end users cannot effectively\nsteer such prioritization using system prompts.\n","authors":["Yu Ying Chiu","Liwei Jiang","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2410.02683v1.pdf","comment":"Preprint. Under Review"},{"id":"http://arxiv.org/abs/2410.02681v1","updated":"2024-10-03T17:06:21Z","published":"2024-10-03T17:06:21Z","title":"Understanding and Mitigating Miscalibration in Prompt Tuning for\n Vision-Language Models","summary":" Confidence calibration is critical for the safe deployment of machine\nlearning models in the real world. 
However, such issue in vision-language\nmodels like CLIP, particularly after fine-tuning, has not been fully addressed.\nIn this work, we demonstrate that existing prompt tuning methods usually lead\nto a trade-off of calibration between base and new classes: the cross-entropy\nloss in CoOp causes overconfidence in new classes by increasing textual label\ndivergence, whereas the regularization of KgCoOp maintains the confidence level\nbut results in underconfidence in base classes due to the improved accuracy.\nInspired by the observations, we introduce Dynamic Outlier Regularization (DOR)\nto ensure the confidence calibration on both base and new classes after\nfine-tuning. In particular, we propose to minimize the feature deviation of\nnovel textual labels (instead of base classes) sampled from a large vocabulary.\nIn effect, DOR prevents the increase in textual divergence for new labels while\neasing restrictions on base classes. Extensive experiments demonstrate that DOR\ncan enhance the calibration performance of current fine-tuning methods on base\nand new classes.\n","authors":["Shuoyuan Wang","Yixuan Li","Hongxin Wei"],"pdf_url":"https://arxiv.org/pdf/2410.02681v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.02680v1","updated":"2024-10-03T17:06:06Z","published":"2024-10-03T17:06:06Z","title":"Highly Adaptive Ridge","summary":" In this paper we propose the Highly Adaptive Ridge (HAR): a regression method\nthat achieves a $n^{-1/3}$ dimension-free L2 convergence rate in the class of\nright-continuous functions with square-integrable sectional derivatives. This\nis a large nonparametric function class that is particularly appropriate for\ntabular data. HAR is exactly kernel ridge regression with a specific\ndata-adaptive kernel based on a saturated zero-order tensor-product spline\nbasis expansion. We use simulation and real data to confirm our theory. We\ndemonstrate empirical performance better than state-of-the-art algorithms for\nsmall datasets in particular.\n","authors":["Alejandro Schuler","Alexander Hagemeister","Mark van der Laan"],"pdf_url":"https://arxiv.org/pdf/2410.02680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00147v3","updated":"2024-10-03T17:05:51Z","published":"2024-05-31T19:26:05Z","title":"Fair Allocation in Dynamic Mechanism Design","summary":" We consider a dynamic mechanism design problem where an auctioneer sells an\nindivisible good to groups of buyers in every round, for a total of $T$ rounds.\nThe auctioneer aims to maximize their discounted overall revenue while adhering\nto a fairness constraint that guarantees a minimum average allocation for each\ngroup. We begin by studying the static case ($T=1$) and establish that the\noptimal mechanism involves two types of subsidization: one that increases the\noverall probability of allocation to all buyers, and another that favors the\ngroups which otherwise have a lower probability of winning the item. We then\nextend our results to the dynamic case by characterizing a set of recursive\nfunctions that determine the optimal allocation and payments in each round.\nNotably, our results establish that in the dynamic case, the seller, on the one\nhand, commits to a participation bonus to incentivize truth-telling, and on the\nother hand, charges an entry fee for every round. 
Moreover, the optimal\nallocation once more involves subsidization, the extent of which depends on the\ndifference in future utilities for both the seller and buyers when allocating\nthe item to one group versus the others. Finally, we present an approximation\nscheme to solve the recursive equations and determine an approximately optimal\nand fair allocation efficiently.\n","authors":["Alireza Fallah","Michael I. Jordan","Annie Ulichney"],"pdf_url":"https://arxiv.org/pdf/2406.00147v3.pdf","comment":"A shorter conference version has been accepted at the Advances in\n Neural Information Processing Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2403.04405v2","updated":"2024-10-03T17:05:49Z","published":"2024-03-07T11:00:35Z","title":"Signature Isolation Forest","summary":" Functional Isolation Forest (FIF) is a recent state-of-the-art Anomaly\nDetection (AD) algorithm designed for functional data. It relies on a tree\npartition procedure where an abnormality score is computed by projecting each\ncurve observation on a drawn dictionary through a linear inner product. Such\nlinear inner product and the dictionary are a priori choices that highly\ninfluence the algorithm's performances and might lead to unreliable results,\nparticularly with complex datasets. This work addresses these challenges by\nintroducing \\textit{Signature Isolation Forest}, a novel AD algorithm class\nleveraging the rough path theory's signature transform. Our objective is to\nremove the constraints imposed by FIF through the proposition of two algorithms\nwhich specifically target the linearity of the FIF inner product and the choice\nof the dictionary. We provide several numerical experiments, including a\nreal-world applications benchmark showing the relevance of our methods.\n","authors":["Marta Campi","Guillaume Staerman","Gareth W. Peters","Tomoko Matsui"],"pdf_url":"https://arxiv.org/pdf/2403.04405v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02677v1","updated":"2024-10-03T17:04:31Z","published":"2024-10-03T17:04:31Z","title":"CulturalBench: a Robust, Diverse and Challenging Benchmark on Measuring\n the (Lack of) Cultural Knowledge of LLMs","summary":" To make large language models (LLMs) more helpful across diverse cultures, it\nis essential to have effective cultural knowledge benchmarks to measure and\ntrack our progress. Effective benchmarks need to be robust, diverse, and\nchallenging. We introduce CulturalBench: a set of 1,227 human-written and\nhuman-verified questions for effectively assessing LLMs' cultural knowledge,\ncovering 45 global regions including the underrepresented ones like Bangladesh,\nZimbabwe, and Peru. Questions - each verified by five independent annotators -\nspan 17 diverse topics ranging from food preferences to greeting etiquettes. We\nevaluate models on two setups: CulturalBench-Easy and CulturalBench-Hard which\nshare the same questions but asked differently. We find that LLMs are sensitive\nto such difference in setups (e.g., GPT-4o with 27.3% difference). Compared to\nhuman performance (92.6% accuracy), CulturalBench-Hard is more challenging for\nfrontier LLMs with the best performing model (GPT-4o) at only 61.5% and the\nworst (Llama3-8b) at 21.4%. Moreover, we find that LLMs often struggle with\ntricky questions that have multiple correct answers (e.g., What utensils do the\nChinese usually use?), revealing a tendency to converge to a single answer. 
Our\nresults also indicate that OpenAI GPT-4o substantially outperforms other\nproprietary and open source models in questions related to all but one region\n(Oceania). Nonetheless, all models consistently underperform on questions\nrelated to South America and the Middle East.\n","authors":["Yu Ying Chiu","Liwei Jiang","Bill Yuchen Lin","Chan Young Park","Shuyue Stella Li","Sahithya Ravi","Mehar Bhatia","Maria Antoniak","Yulia Tsvetkov","Vered Shwartz","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2410.02677v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2410.02675v1","updated":"2024-10-03T17:02:21Z","published":"2024-10-03T17:02:21Z","title":"FAN: Fourier Analysis Networks","summary":" Despite the remarkable success achieved by neural networks, particularly\nthose represented by MLP and Transformer, we reveal that they exhibit potential\nflaws in the modeling and reasoning of periodicity, i.e., they tend to memorize\nthe periodic data rather than genuinely understanding the underlying principles\nof periodicity. However, periodicity is a crucial trait in various forms of\nreasoning and generalization, underpinning predictability across natural and\nengineered systems through recurring patterns in observations. In this paper,\nwe propose FAN, a novel network architecture based on Fourier Analysis, which\nempowers the ability to efficiently model and reason about periodic phenomena.\nBy introducing Fourier Series, the periodicity is naturally integrated into the\nstructure and computational processes of the neural network, thus achieving a\nmore accurate expression and prediction of periodic patterns. As a promising\nsubstitute to multi-layer perceptron (MLP), FAN can seamlessly replace MLP in\nvarious models with fewer parameters and FLOPs. Through extensive experiments,\nwe demonstrate the effectiveness of FAN in modeling and reasoning about\nperiodic functions, and the superiority and generalizability of FAN across a\nrange of real-world tasks, including symbolic formula representation, time\nseries forecasting, and language modeling.\n","authors":["Yihong Dong","Ge Li","Yongding Tao","Xue Jiang","Kechi Zhang","Jia Li","Jing Su","Jun Zhang","Jingjing Xu"],"pdf_url":"https://arxiv.org/pdf/2410.02675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13937v6","updated":"2024-10-03T16:59:18Z","published":"2024-05-22T19:10:24Z","title":"DyGPrompt: Learning Feature and Time Prompts on Dynamic Graphs","summary":" Dynamic graphs capture evolving interactions between entities, such as in\nsocial networks, online learning platforms, and crowdsourcing projects. For\ndynamic graph modeling, dynamic graph neural networks (DGNNs) have emerged as a\nmainstream technique. However, they are generally pre-trained on the link\nprediction task, leaving a significant gap from the objectives of downstream\ntasks such as node classification. To bridge the gap, prompt-based learning has\ngained traction on graphs, but most existing efforts focus on static graphs,\nneglecting the evolution of dynamic graphs. In this paper, we propose\nDYGPROMPT, a novel pre-training and prompt learning framework for dynamic graph\nmodeling. First, we design dual prompts to address the gap in both task\nobjectives and temporal variations across pre-training and downstream tasks.\nSecond, we recognize that node and time features mutually characterize each\nother, and propose dual condition-nets to model the evolving node-time patterns\nin downstream tasks. 
Finally, we thoroughly evaluate and analyze DYGPROMPT\nthrough extensive experiments on four public datasets.\n","authors":["Xingtong Yu","Zhenghao Liu","Yuan Fang","Xinming Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.13937v6.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2410.02667v1","updated":"2024-10-03T16:51:14Z","published":"2024-10-03T16:51:14Z","title":"GUD: Generation with Unified Diffusion","summary":" Diffusion generative models transform noise into data by inverting a process\nthat progressively adds noise to data samples. Inspired by concepts from the\nrenormalization group in physics, which analyzes systems across different\nscales, we revisit diffusion models by exploring three key design aspects: 1)\nthe choice of representation in which the diffusion process operates (e.g.\npixel-, PCA-, Fourier-, or wavelet-basis), 2) the prior distribution that data\nis transformed into during diffusion (e.g. Gaussian with covariance $\\Sigma$),\nand 3) the scheduling of noise levels applied separately to different parts of\nthe data, captured by a component-wise noise schedule. Incorporating the\nflexibility in these choices, we develop a unified framework for diffusion\ngenerative models with greatly enhanced design freedom. In particular, we\nintroduce soft-conditioning models that smoothly interpolate between standard\ndiffusion models and autoregressive models (in any basis), conceptually\nbridging these two approaches. Our framework opens up a wide design space which\nmay lead to more efficient training and data generation, and paves the way to\nnovel architectures integrating different generative approaches and generation\ntasks.\n","authors":["Mathis Gerdes","Max Welling","Miranda C. N. Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.02667v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.02666v1","updated":"2024-10-03T16:50:30Z","published":"2024-10-03T16:50:30Z","title":"AlphaIntegrator: Transformer Action Search for Symbolic Integration\n Proofs","summary":" We present the first correct-by-construction learning-based system for\nstep-by-step mathematical integration. The key idea is to learn a policy,\nrepresented by a GPT transformer model, which guides the search for the right\nmathematical integration rule, to be carried out by a symbolic solver.\nConcretely, we introduce a symbolic engine with axiomatically correct actions\non mathematical expressions, as well as the first dataset for step-by-step\nintegration. Our GPT-style transformer model, trained on this synthetic data,\ndemonstrates strong generalization by surpassing its own data generator in\naccuracy and efficiency, using 50% fewer search steps. Our experimental results\nwith SoTA LLMs also demonstrate that the standard approach of fine-tuning LLMs\non a set of question-answer pairs is insufficient for solving this mathematical\ntask. This motivates the importance of discovering creative methods for\ncombining LLMs with symbolic reasoning engines, of which our work is an\ninstance.\n","authors":["Mert Ünsal","Timon Gehr","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2410.02666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02660v1","updated":"2024-10-03T16:46:52Z","published":"2024-10-03T16:46:52Z","title":"How to Train Long-Context Language Models (Effectively)","summary":" We study continued training and supervised fine-tuning (SFT) of a language\nmodel (LM) to make effective use of long-context information. 
We first\nestablish a reliable evaluation protocol to guide model development -- Instead\nof perplexity or simple needle-in-a-haystack (NIAH) tests, we use a broad set\nof long-context tasks, and we evaluate models after SFT with instruction data\nas this better reveals long-context abilities. Supported by our robust\nevaluations, we run thorough experiments to decide the data mix for continued\npre-training, the instruction tuning dataset, and many other design choices. We\nfind that (1) code repositories and books are excellent sources of long data,\nbut it is crucial to combine them with high-quality short data; (2) training\nwith a sequence length beyond the evaluation length boosts long-context\nperformance; (3) for SFT, using only short instruction datasets yields strong\nperformance on long-context tasks. Our final model, ProLong-8B, which is\ninitialized from Llama-3 and trained on 40B tokens, demonstrates\nstate-of-the-art long-context performance among similarly sized models at a\nlength of 128K. ProLong outperforms Llama-3.1-8B-Instruct on the majority of\nlong-context tasks despite having seen only 5% as many tokens during\nlong-context training. Additionally, ProLong can effectively process up to 512K\ntokens, one of the longest context windows of publicly available LMs.\n","authors":["Tianyu Gao","Alexander Wettig","Howard Yen","Danqi Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02660v1.pdf","comment":"Our code, data, and models are available at\n https://github.com/princeton-nlp/ProLong"},{"id":"http://arxiv.org/abs/2407.11969v3","updated":"2024-10-03T16:46:09Z","published":"2024-07-16T17:59:55Z","title":"Does Refusal Training in LLMs Generalize to the Past Tense?","summary":" Refusal training is widely used to prevent LLMs from generating harmful,\nundesirable, or illegal outputs. We reveal a curious generalization gap in the\ncurrent refusal training approaches: simply reformulating a harmful request in\nthe past tense (e.g., \"How to make a Molotov cocktail?\" to \"How did people make\na Molotov cocktail?\") is often sufficient to jailbreak many state-of-the-art\nLLMs. We systematically evaluate this method on Llama-3 8B, Claude-3.5 Sonnet,\nGPT-3.5 Turbo, Gemma-2 9B, Phi-3-Mini, GPT-4o mini, GPT-4o, o1-mini,\no1-preview, and R2D2 models using GPT-3.5 Turbo as a reformulation model. For\nexample, the success rate of this simple attack on GPT-4o increases from 1%\nusing direct requests to 88% using 20 past tense reformulation attempts on\nharmful requests from JailbreakBench with GPT-4 as a jailbreak judge.\nInterestingly, we also find that reformulations in the future tense are less\neffective, suggesting that refusal guardrails tend to consider past historical\nquestions more benign than hypothetical future questions. Moreover, our\nexperiments on fine-tuning GPT-3.5 Turbo show that defending against past\nreformulations is feasible when past tense examples are explicitly included in\nthe fine-tuning data. Overall, our findings highlight that the widely used\nalignment techniques -- such as SFT, RLHF, and adversarial training -- employed\nto align the studied models can be brittle and do not always generalize as\nintended. We provide code and jailbreak artifacts at\nhttps://github.com/tml-epfl/llm-past-tense.\n","authors":["Maksym Andriushchenko","Nicolas Flammarion"],"pdf_url":"https://arxiv.org/pdf/2407.11969v3.pdf","comment":"Update in v3: o1-mini and o1-preview results (on top of GPT-4o and\n Claude 3.5 Sonnet added in v2). 
We provide code and jailbreak artifacts at\n https://github.com/tml-epfl/llm-past-tense"},{"id":"http://arxiv.org/abs/2410.02656v1","updated":"2024-10-03T16:43:00Z","published":"2024-10-03T16:43:00Z","title":"Scalable Simulation-free Entropic Unbalanced Optimal Transport","summary":" The Optimal Transport (OT) problem investigates a transport map that connects\ntwo distributions while minimizing a given cost function. Finding such a\ntransport map has diverse applications in machine learning, such as generative\nmodeling and image-to-image translation. In this paper, we introduce a scalable\nand simulation-free approach for solving the Entropic Unbalanced Optimal\nTransport (EUOT) problem. We derive the dynamical form of this EUOT problem,\nwhich is a generalization of the Schr\\\"odinger bridges (SB) problem. Based on\nthis, we derive dual formulation and optimality conditions of the EUOT problem\nfrom the stochastic optimal control interpretation. By leveraging these\nproperties, we propose a simulation-free algorithm to solve EUOT, called\nSimulation-free EUOT (SF-EUOT). While existing SB models require expensive\nsimulation costs during training and evaluation, our model achieves\nsimulation-free training and one-step generation by utilizing the reciprocal\nproperty. Our model demonstrates significantly improved scalability in\ngenerative modeling and image-to-image translation tasks compared to previous\nSB methods.\n","authors":["Jaemoo Choi","Jaewoong Choi"],"pdf_url":"https://arxiv.org/pdf/2410.02656v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2311.16556v2","updated":"2024-10-03T16:42:08Z","published":"2023-11-28T06:52:53Z","title":"Scalable Label Distribution Learning for Multi-Label Classification","summary":" Multi-label classification (MLC) refers to the problem of tagging a given\ninstance with a set of relevant labels. Most existing MLC methods are based on\nthe assumption that the correlation of two labels in each label pair is\nsymmetric, which is violated in many real-world scenarios. Moreover, most\nexisting methods design learning processes associated with the number of\nlabels, which makes their computational complexity a bottleneck when scaling up\nto large-scale output space. To tackle these issues, we propose a novel method\nnamed Scalable Label Distribution Learning (SLDL) for multi-label\nclassification which can describe different labels as distributions in a latent\nspace, where the label correlation is asymmetric and the dimension is\nindependent of the number of labels. Specifically, SLDL first converts labels\ninto continuous distributions within a low-dimensional latent space and\nleverages the asymmetric metric to establish the correlation between different\nlabels. Then, it learns the mapping from the feature space to the latent space,\nso that the computational complexity is no longer related to the number of\nlabels. Finally, SLDL leverages a nearest-neighbor-based strategy to decode the\nlatent representations and obtain the final predictions. 
Extensive experiments\nillustrate that SLDL achieves very competitive classification performances with\nlittle computational consumption.\n","authors":["Xingyu Zhao","Yuexuan An","Lei Qi","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2311.16556v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02654v1","updated":"2024-10-03T16:41:51Z","published":"2024-10-03T16:41:51Z","title":"Deconstructing Recurrence, Attention, and Gating: Investigating the\n transferability of Transformers and Gated Recurrent Neural Networks in\n forecasting of dynamical systems","summary":" Machine learning architectures, including transformers and recurrent neural\nnetworks (RNNs) have revolutionized forecasting in applications ranging from\ntext processing to extreme weather. Notably, advanced network architectures,\ntuned for applications such as natural language processing, are transferable to\nother tasks such as spatiotemporal forecasting tasks. However, there is a\nscarcity of ablation studies to illustrate the key components that enable this\nforecasting accuracy. The absence of such studies, although explainable due to\nthe associated computational cost, intensifies the belief that these models\nought to be considered as black boxes. In this work, we decompose the key\narchitectural components of the most powerful neural architectures, namely\ngating and recurrence in RNNs, and attention mechanisms in transformers. Then,\nwe synthesize and build novel hybrid architectures from the standard blocks,\nperforming ablation studies to identify which mechanisms are effective for each\ntask. The importance of considering these components as hyper-parameters that\ncan augment the standard architectures is exhibited on various forecasting\ndatasets, from the spatiotemporal chaotic dynamics of the multiscale Lorenz 96\nsystem, the Kuramoto-Sivashinsky equation, as well as standard real world\ntime-series benchmarks. A key finding is that neural gating and attention\nimproves the performance of all standard RNNs in most tasks, while the addition\nof a notion of recurrence in transformers is detrimental. Furthermore, our\nstudy reveals that a novel, sparsely used, architecture which integrates\nRecurrent Highway Networks with neural gating and attention mechanisms, emerges\nas the best performing architecture in high-dimensional spatiotemporal\nforecasting of dynamical systems.\n","authors":["Hunter S. Heidenreich","Pantelis R. Vlachas","Petros Koumoutsakos"],"pdf_url":"https://arxiv.org/pdf/2410.02654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02651v1","updated":"2024-10-03T16:36:05Z","published":"2024-10-03T16:36:05Z","title":"CAX: Cellular Automata Accelerated in JAX","summary":" Cellular automata have become a cornerstone for investigating emergence and\nself-organization across diverse scientific disciplines, spanning neuroscience,\nartificial life, and theoretical physics. However, the absence of a\nhardware-accelerated cellular automata library limits the exploration of new\nresearch directions, hinders collaboration, and impedes reproducibility. In\nthis work, we introduce CAX (Cellular Automata Accelerated in JAX), a\nhigh-performance and flexible open-source library designed to accelerate\ncellular automata research. CAX offers cutting-edge performance and a modular\ndesign through a user-friendly interface, and can support both discrete and\ncontinuous cellular automata with any number of dimensions. 
We demonstrate\nCAX's performance and flexibility through a wide range of benchmarks and\napplications. From classic models like elementary cellular automata and\nConway's Game of Life to advanced applications such as growing neural cellular\nautomata and self-classifying MNIST digits, CAX speeds up simulations up to\n2,000 times faster. Furthermore, we demonstrate CAX's potential to accelerate\nresearch by presenting a collection of three novel cellular automata\nexperiments, each implemented in just a few lines of code thanks to the\nlibrary's modular architecture. Notably, we show that a simple one-dimensional\ncellular automaton can outperform GPT-4 on the 1D-ARC challenge.\n","authors":["Maxence Faldor","Antoine Cully"],"pdf_url":"https://arxiv.org/pdf/2410.02651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02647v1","updated":"2024-10-03T16:33:35Z","published":"2024-10-03T16:33:35Z","title":"Immunogenicity Prediction with Dual Attention Enables Vaccine Target\n Selection","summary":" Immunogenicity prediction is a central topic in reverse vaccinology for\nfinding candidate vaccines that can trigger protective immune responses.\nExisting approaches typically rely on highly compressed features and simple\nmodel architectures, leading to limited prediction accuracy and poor\ngeneralizability. To address these challenges, we introduce ProVaccine, a novel\ndeep learning solution with a dual attention mechanism that integrates\npre-trained latent vector representations of protein sequences and structures.\nWe also compile the most comprehensive immunogenicity dataset to date,\nencompassing over 9,500 antigen sequences, structures, and immunogenicity\nlabels from bacteria, viruses, and tumors. Extensive experiments demonstrate\nthat ProVaccine outperforms existing methods across a wide range of evaluation\nmetrics. Furthermore, we establish a post-hoc validation protocol to assess the\npractical significance of deep learning models in tackling vaccine design\nchallenges. Our work provides an effective tool for vaccine design and sets\nvaluable benchmarks for future research.\n","authors":["Song Li","Yang Tan","Song Ke","Liang Hong","Bingxin Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.02647v1.pdf","comment":"18 pages, 11 tables, 5 figures"},{"id":"http://arxiv.org/abs/2409.02026v2","updated":"2024-10-03T16:31:59Z","published":"2024-09-03T16:20:22Z","title":"Foundations of Large Language Model Compression -- Part 1: Weight\n Quantization","summary":" In recent years, compression of large language models (LLMs) has emerged as\nan important problem to enable language model deployment on\nresource-constrained devices, reduce computational costs, and mitigate the\nenvironmental footprint of large-scale AI infrastructure. In this paper, we lay\ndown the foundation for LLM quantization from a convex optimization perspective\nand propose a quantization technique that builds on this foundation for optimum\nquantization outcomes. Our quantization framework, CVXQ, scales to models\ncontaining hundreds of billions of weight parameters and provides users with\nthe flexibility to compress models to any specified model size, post-training.\nA reference implementation of CVXQ can be obtained from github.com/seannz/cvxq.\n","authors":["Sean I. Young"],"pdf_url":"https://arxiv.org/pdf/2409.02026v2.pdf","comment":"Preprint. 
17 pages, 4 figures, 5 appendices"},{"id":"http://arxiv.org/abs/2410.00712v2","updated":"2024-10-03T16:31:23Z","published":"2024-10-01T14:05:30Z","title":"NECOMIMI: Neural-Cognitive Multimodal EEG-informed Image Generation with\n Diffusion Models","summary":" NECOMIMI (NEural-COgnitive MultImodal EEG-Informed Image Generation with\nDiffusion Models) introduces a novel framework for generating images directly\nfrom EEG signals using advanced diffusion models. Unlike previous works that\nfocused solely on EEG-image classification through contrastive learning,\nNECOMIMI extends this task to image generation. The proposed NERV EEG encoder\ndemonstrates state-of-the-art (SoTA) performance across multiple zero-shot\nclassification tasks, including 2-way, 4-way, and 200-way, and achieves top\nresults in our newly proposed Category-based Assessment Table (CAT) Score,\nwhich evaluates the quality of EEG-generated images based on semantic concepts.\nA key discovery of this work is that the model tends to generate abstract or\ngeneralized images, such as landscapes, rather than specific objects,\nhighlighting the inherent challenges of translating noisy and low-resolution\nEEG data into detailed visual outputs. Additionally, we introduce the CAT Score\nas a new metric tailored for EEG-to-image evaluation and establish a benchmark\non the ThingsEEG dataset. This study underscores the potential of EEG-to-image\ngeneration while revealing the complexities and challenges that remain in\nbridging neural activity with visual representation.\n","authors":["Chi-Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2410.00712v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11295v2","updated":"2024-10-03T16:30:43Z","published":"2024-09-17T15:49:44Z","title":"EIA: Environmental Injection Attack on Generalist Web Agents for Privacy\n Leakage","summary":" Generalist web agents have demonstrated remarkable potential in autonomously\ncompleting a wide range of tasks on real websites, significantly boosting human\nproductivity. However, web tasks, such as booking flights, usually involve\nusers' PII, which may be exposed to potential privacy risks if web agents\naccidentally interact with compromised websites, a scenario that remains\nlargely unexplored in the literature. In this work, we narrow this gap by\nconducting the first study on the privacy risks of generalist web agents in\nadversarial environments. First, we present a realistic threat model for\nattacks on the website, where we consider two adversarial targets: stealing\nusers' specific PII or the entire user request. Then, we propose a novel attack\nmethod, termed Environmental Injection Attack (EIA). EIA injects malicious\ncontent designed to adapt well to environments where the agents operate and our\nwork instantiates EIA specifically for privacy scenarios in web environments.\nWe collect 177 action steps that involve diverse PII categories on realistic\nwebsites from the Mind2Web, and conduct experiments using one of the most\ncapable generalist web agent frameworks to date. The results demonstrate that\nEIA achieves up to 70% ASR in stealing specific PII and 16% ASR for full user\nrequest. Additionally, by accessing the stealthiness and experimenting with a\ndefensive system prompt, we indicate that EIA is hard to detect and mitigate.\nNotably, attacks that are not well adapted for a webpage can be detected via\nhuman inspection, leading to our discussion about the trade-off between\nsecurity and autonomy. 
However, extra attackers' efforts can make EIA\nseamlessly adapted, rendering such supervision ineffective. Thus, we further\ndiscuss the defenses at the pre- and post-deployment stages of the websites\nwithout relying on human supervision and call for more advanced defense\nstrategies.\n","authors":["Zeyi Liao","Lingbo Mo","Chejian Xu","Mintong Kang","Jiawei Zhang","Chaowei Xiao","Yuan Tian","Bo Li","Huan Sun"],"pdf_url":"https://arxiv.org/pdf/2409.11295v2.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2401.13858v3","updated":"2024-10-03T16:29:02Z","published":"2024-01-24T23:45:31Z","title":"Graph Diffusion Transformers for Multi-Conditional Molecular Generation","summary":" Inverse molecular design with diffusion models holds great potential for\nadvancements in material and drug discovery. Despite success in unconditional\nmolecular generation, integrating multiple properties such as synthetic score\nand gas permeability as condition constraints into diffusion models remains\nunexplored. We present the Graph Diffusion Transformer (Graph DiT) for\nmulti-conditional molecular generation. Graph DiT integrates an encoder to\nlearn numerical and categorical property representations with the\nTransformer-based denoiser. Unlike previous graph diffusion models that add\nnoise separately on the atoms and bonds in the forward diffusion process, Graph\nDiT is trained with a novel graph-dependent noise model for accurate estimation\nof graph-related noise in molecules. We extensively validate Graph DiT for\nmulti-conditional polymer and small molecule generation. Results demonstrate\nthe superiority of Graph DiT across nine metrics from distribution learning to\ncondition control for molecular properties. A polymer inverse design task for\ngas separation with feedback from domain experts further demonstrates its\npractical utility.\n","authors":["Gang Liu","Jiaxin Xu","Tengfei Luo","Meng Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.13858v3.pdf","comment":"Accepted by NeurIPS 2024 (Oral). 21 pages, 11 figures, 8 tables"},{"id":"http://arxiv.org/abs/2410.02639v1","updated":"2024-10-03T16:24:14Z","published":"2024-10-03T16:24:14Z","title":"Labor Migration Modeling through Large-scale Job Query Data","summary":" Accurate and timely modeling of labor migration is crucial for various urban\ngovernance and commercial tasks, such as local policy-making and business site\nselection. However, existing studies on labor migration largely rely on limited\nsurvey data with statistical methods, which fail to deliver timely and\nfine-grained insights for time-varying regional trends. To this end, we propose\na deep learning-based spatial-temporal labor migration analysis framework,\nDHG-SIL, by leveraging large-scale job query data. Specifically, we first\nacquire labor migration intention as a proxy of labor migration via job queries\nfrom one of the world's largest search engines. Then, a Disprepant Homophily\nco-preserved Graph Convolutional Network (DH-GCN) and an interpretable temporal\nmodule are respectively proposed to capture cross-city and sequential labor\nmigration dependencies. Besides, we introduce four interpretable variables to\nquantify city migration properties, which are co-optimized with city\nrepresentations via tailor-designed contrastive losses. 
Extensive experiments\non three real-world datasets demonstrate the superiority of our DHG-SIL.\nNotably, DHG-SIL has been deployed as a core component of a cooperative\npartner's intelligent human resource system, and the system supported a series\nof city talent attraction reports.\n","authors":["Zhuoning Guo","Le Zhang","Hengshu Zhu","Weijia Zhang","Hui Xiong","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2410.02639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04840v2","updated":"2024-10-03T16:23:07Z","published":"2024-09-07T14:38:05Z","title":"Sample and Oracle Efficient Reinforcement Learning for MDPs with\n Linearly-Realizable Value Functions","summary":" Designing sample-efficient and computationally feasible reinforcement\nlearning (RL) algorithms is particularly challenging in environments with large\nor infinite state and action spaces. In this paper, we advance this effort by\npresenting an efficient algorithm for Markov Decision Processes (MDPs) where\nthe state-action value function of any policy is linear in a given feature map.\nThis challenging setting can model environments with infinite states and\nactions, strictly generalizes classic linear MDPs, and currently lacks a\ncomputationally efficient algorithm under online access to the MDP.\nSpecifically, we introduce a new RL algorithm that efficiently finds a\nnear-optimal policy in this setting, using a number of episodes and calls to a\ncost-sensitive classification (CSC) oracle that are both polynomial in the\nproblem parameters. Notably, our CSC oracle can be efficiently implemented when\nthe feature dimension is constant, representing a clear improvement over\nstate-of-the-art methods, which require solving non-convex problems with\nhorizon-many variables and can incur computational costs that are exponential\nin the horizon.\n","authors":["Zakaria Mhammedi"],"pdf_url":"https://arxiv.org/pdf/2409.04840v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.04901v3","updated":"2024-10-03T16:17:50Z","published":"2022-05-10T13:50:10Z","title":"Adjusted Expected Improvement for Cumulative Regret Minimization in\n Noisy Bayesian Optimization","summary":" The expected improvement (EI) is one of the most popular acquisition\nfunctions for Bayesian optimization (BO) and has demonstrated good empirical\nperformances in many applications for the minimization of simple regret.\nHowever, under the evaluation metric of cumulative regret, the performance of\nEI may not be competitive, and its existing theoretical regret upper bound\nstill has room for improvement. To adapt the EI for better performance under\ncumulative regret, we introduce a novel quantity called the evaluation cost\nwhich is compared against the acquisition function, and with this, develop the\nexpected improvement-cost (EIC) algorithm. In each iteration of EIC, a new\npoint with the largest acquisition function value is sampled, only if that\nvalue exceeds its evaluation cost. If none meets this criteria, the current\nbest point is resampled. This evaluation cost quantifies the potential downside\nof sampling a point, which is important under the cumulative regret metric as\nthe objective function value in every iteration affects the performance\nmeasure. We establish in theory a high-probability regret upper bound of EIC\nbased on the maximum information gain, which is tighter than the bound of\nexisting EI-based algorithms. 
It is also comparable to the regret bound of\nother popular BO algorithms such as Thompson sampling (GP-TS) and upper\nconfidence bound (GP-UCB). We further perform experiments to illustrate the\nimprovement of EIC over several popular BO algorithms.\n","authors":["Shouri Hu","Haowei Wang","Zhongxiang Dai","Bryan Kian Hsiang Low","Szu Hui Ng"],"pdf_url":"https://arxiv.org/pdf/2205.04901v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02629v1","updated":"2024-10-03T16:13:42Z","published":"2024-10-03T16:13:42Z","title":"Estimating Generalization Performance Along the Trajectory of Proximal\n SGD in Robust Regression","summary":" This paper studies the generalization performance of iterates obtained by\nGradient Descent (GD), Stochastic Gradient Descent (SGD) and their proximal\nvariants in high-dimensional robust regression problems. The number of features\nis comparable to the sample size and errors may be heavy-tailed. We introduce\nestimators that precisely track the generalization error of the iterates along\nthe trajectory of the iterative algorithm. These estimators are provably\nconsistent under suitable conditions. The results are illustrated through\nseveral examples, including Huber regression, pseudo-Huber regression, and\ntheir penalized variants with non-smooth regularizer. We provide explicit\ngeneralization error estimates for iterates generated from GD and SGD, or from\nproximal SGD in the presence of a non-smooth regularizer. The proposed risk\nestimates serve as effective proxies for the actual generalization error,\nallowing us to determine the optimal stopping iteration that minimizes the\ngeneralization error. Extensive simulations confirm the effectiveness of the\nproposed generalization error estimates.\n","authors":["Kai Tan","Pierre C. Bellec"],"pdf_url":"https://arxiv.org/pdf/2410.02629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02628v1","updated":"2024-10-03T16:12:59Z","published":"2024-10-03T16:12:59Z","title":"Inverse Entropic Optimal Transport Solves Semi-supervised Learning via\n Data Likelihood Maximization","summary":" Learning conditional distributions $\\pi^*(\\cdot|x)$ is a central problem in\nmachine learning, which is typically approached via supervised methods with\npaired data $(x,y) \\sim \\pi^*$. However, acquiring paired data samples is often\nchallenging, especially in problems such as domain translation. This\nnecessitates the development of $\\textit{semi-supervised}$ models that utilize\nboth limited paired data and additional unpaired i.i.d. samples $x \\sim\n\\pi^*_x$ and $y \\sim \\pi^*_y$ from the marginal distributions. The usage of\nsuch combined data is complex and often relies on heuristic approaches. To\ntackle this issue, we propose a new learning paradigm that integrates both\npaired and unpaired data $\\textbf{seamlessly}$ through the data likelihood\nmaximization techniques. We demonstrate that our approach also connects\nintriguingly with inverse entropic optimal transport (OT). This finding allows\nus to apply recent advances in computational OT to establish a $\\textbf{light}$\nlearning algorithm to get $\\pi^*(\\cdot|x)$. 
Furthermore, we demonstrate through\nempirical tests that our method effectively learns conditional distributions\nusing paired and unpaired data simultaneously.\n","authors":["Mikhail Persiianov","Arip Asadulaev","Nikita Andreev","Nikita Starodubcev","Dmitry Baranchuk","Anastasis Kratsios","Evgeny Burnaev","Alexander Korotin"],"pdf_url":"https://arxiv.org/pdf/2410.02628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06121v2","updated":"2024-10-03T16:10:43Z","published":"2024-08-12T13:03:34Z","title":"A Methodological Report on Anomaly Detection on Dynamic Knowledge Graphs","summary":" In this paper, we explore different approaches to anomaly detection on\ndynamic knowledge graphs, specifically in a microservices environment for\nKubernetes applications. Our approach explores three dynamic knowledge graph\nrepresentations: sequential data, one-hop graph structure, and two-hop graph\nstructure, with each representation incorporating increasingly complex\nstructural information. Each phase includes different machine learning and deep\nlearning models. We empirically analyse their performance and propose an\napproach based on ensemble learning of these models. Our approach significantly\noutperforms the baseline on the ISWC 2024 Dynamic Knowledge Graph Anomaly\nDetection dataset, providing a robust solution for anomaly detection in dynamic\ncomplex data.\n","authors":["Xiaohua Lu","Leshanshui Yang"],"pdf_url":"https://arxiv.org/pdf/2408.06121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03731v2","updated":"2024-10-03T16:09:10Z","published":"2024-09-05T17:42:19Z","title":"A Deep Generative Learning Approach for Two-stage Adaptive Robust\n Optimization","summary":" Two-stage adaptive robust optimization (ARO) is a powerful approach for\nplanning under uncertainty, balancing first-stage decisions with recourse\ndecisions made after uncertainty is realized. To account for uncertainty,\nmodelers typically define a simple uncertainty set over which potential\noutcomes are considered. However, classical methods for defining these sets\nunintentionally capture a wide range of unrealistic outcomes, resulting in\noverly-conservative and costly planning in anticipation of unlikely\ncontingencies. In this work, we introduce AGRO, a solution algorithm that\nperforms adversarial generation for two-stage adaptive robust optimization\nusing a variational autoencoder. AGRO generates high-dimensional contingencies\nthat are simultaneously adversarial and realistic, improving the robustness of\nfirst-stage decisions at a lower planning cost than standard methods. To ensure\ngenerated contingencies lie in high-density regions of the uncertainty\ndistribution, AGRO defines a tight uncertainty set as the image of \"latent\"\nuncertainty sets under the VAE decoding transformation. Projected gradient\nascent is then used to maximize recourse costs over the latent uncertainty sets\nby leveraging differentiable optimization methods. We demonstrate the\ncost-efficiency of AGRO by applying it to both a synthetic\nproduction-distribution problem and a real-world power system expansion\nsetting. 
We show that AGRO outperforms the standard column-and-constraint\nalgorithm by up to 1.8% in production-distribution planning and up to 11.6% in\npower system expansion.\n","authors":["Aron Brenner","Rahman Khorramfar","Jennifer Sun","Saurabh Amin"],"pdf_url":"https://arxiv.org/pdf/2409.03731v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02626v1","updated":"2024-10-03T16:08:16Z","published":"2024-10-03T16:08:16Z","title":"Online Learning Guided Quasi-Newton Methods with Global Non-Asymptotic\n Convergence","summary":" In this paper, we propose a quasi-Newton method for solving smooth and\nmonotone nonlinear equations, including unconstrained minimization and minimax\noptimization as special cases. For the strongly monotone setting, we establish\ntwo global convergence bounds: (i) a linear convergence rate that matches the\nrate of the celebrated extragradient method, and (ii) an explicit global\nsuperlinear convergence rate that provably surpasses the linear convergence\nrate after at most ${O}(d)$ iterations, where $d$ is the problem's dimension.\nIn addition, for the case where the operator is only monotone, we prove a\nglobal convergence rate of ${O}(\\min\\{{1}/{k},{\\sqrt{d}}/{k^{1.25}}\\})$ in\nterms of the duality gap. This matches the rate of the extragradient method\nwhen $k = {O}(d^2)$ and is faster when $k = \\Omega(d^2)$. These results are the\nfirst global convergence results to demonstrate a provable advantage of a\nquasi-Newton method over the extragradient method, without querying the\nJacobian of the operator. Unlike classical quasi-Newton methods, we achieve\nthis by using the hybrid proximal extragradient framework and a novel online\nlearning approach for updating the Jacobian approximation matrices.\nSpecifically, guided by the convergence analysis, we formulate the Jacobian\napproximation update as an online convex optimization problem over\nnon-symmetric matrices, relating the regret of the online problem to the\nconvergence rate of our method. To facilitate efficient implementation, we\nfurther develop a tailored online learning algorithm based on an approximate\nseparation oracle, which preserves structures such as symmetry and sparsity in\nthe Jacobian matrices.\n","authors":["Ruichen Jiang","Aryan Mokhtari"],"pdf_url":"https://arxiv.org/pdf/2410.02626v1.pdf","comment":"54 pages"},{"id":"http://arxiv.org/abs/2410.02622v1","updated":"2024-10-03T16:02:02Z","published":"2024-10-03T16:02:02Z","title":"Diss-l-ECT: Dissecting Graph Data with local Euler Characteristic\n Transforms","summary":" The Euler Characteristic Transform (ECT) is an efficiently-computable\ngeometrical-topological invariant that characterizes the global shape of data.\nIn this paper, we introduce the Local Euler Characteristic Transform\n($\\ell$-ECT), a novel extension of the ECT particularly designed to enhance\nexpressivity and interpretability in graph representation learning. Unlike\ntraditional Graph Neural Networks (GNNs), which may lose critical local details\nthrough aggregation, the $\\ell$-ECT provides a lossless representation of local\nneighborhoods. This approach addresses key limitations in GNNs by preserving\nnuanced local structures while maintaining global interpretability. Moreover,\nwe construct a rotation-invariant metric based on $\\ell$-ECTs for spatial\nalignment of data spaces. 
Our method exhibits superior performance compared to\nstandard GNNs on a variety of node classification tasks, particularly in graphs\nwith high heterophily.\n","authors":["Julius von Rohrscheidt","Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2410.02622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13681v2","updated":"2024-10-03T16:01:01Z","published":"2024-03-20T15:39:54Z","title":"PARAMANU-AYN: Pretrain from scratch or Continual Pretraining of LLMs for\n Legal Domain Adaptation?","summary":" In this paper, we present Paramanu-Ayn, a collection of legal language models\ntrained exclusively on Indian legal case documents. This 97-million-parameter\nAuto-Regressive (AR) decoder-only model was pretrained from scratch with a\ncontext size of 8192 on a single GPU for just 185 hours, achieving an efficient\nMFU of 41.35. We also developed a legal domain specialized BPE tokenizer. We\nevaluated our model using perplexity and zero-shot tasks: case judgment\nprediction with explanation and abstractive case summarization. Paramanu-Ayn\noutperformed Llama-2 7B and Gemini-Pro in case judgment prediction with\nexplanation task on test accuracy by nearly 2 percentage points, despite being\n72 times smaller. In zero-shot abstractive summarization, it surpassed\ndecoder-only LLMs generating fixed-length summaries (5000 tokens) by over 10\npercentage points in BLEU and METEOR metrics, and by nearly 4 percentage points\nin BERTScore. Further evaluations on zero-shot commonsense and mathematical\nbenchmarks showed that Paramanu-Ayn excelled despite being trained exclusively\non legal documents, outperforming Llama-1, Llama-2, and Falcon on\nAGIEVAL-AQuA-RAT and AGIEVAL-SAT-Math tasks. We also instruction-tuned our\nmodel on 10,763 diverse legal tasks, including legal clause generation, legal\ndrafting, case summarization, etc. The Paramanu-Ayn-instruct model scored above\n8 out of 10 in clarity, relevance, completeness, and legal reasoning metrics by\nGPT-3.5-Turbo. We found that our models were able to learn drafting knowledge\nand generalize to draft legal contracts and legal clauses with limited\ninstruction-tuning. Hence, we conclude that for a strong domain-specialized\ngenerative language model (such as legal), domain specialized pretraining from\nscratch is more cost effective, environmentally friendly, and remains\ncompetitive with larger models or even better than adapting LLMs for legal\ndomain tasks.\n","authors":["Mitodru Niyogi","Arnab Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2403.13681v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02618v1","updated":"2024-10-03T15:56:03Z","published":"2024-10-03T15:56:03Z","title":"Achieving Fairness in Predictive Process Analytics via Adversarial\n Learning (Extended Version)","summary":" Predictive business process analytics has become important for organizations,\noffering real-time operational support for their processes. However, these\nalgorithms often perform unfair predictions because they are based on biased\nvariables (e.g., gender or nationality), namely variables embodying\ndiscrimination. This paper addresses the challenge of integrating a debiasing\nphase into predictive business process analytics to ensure that predictions are\nnot influenced by biased variables. Our framework, which leverages adversarial\ndebiasing, is evaluated on four case studies, showing a significant reduction in\nthe contribution of biased variables to the predicted value. 
The proposed\ntechnique is also compared with the state of the art in fairness in process\nmining, illustrating that our framework allows for a more enhanced level of\nfairness, while retaining a better prediction quality.\n","authors":["Massimiliano de Leoni","Alessandro Padella"],"pdf_url":"https://arxiv.org/pdf/2410.02618v1.pdf","comment":"17 pages, 5 figures"},{"id":"http://arxiv.org/abs/2406.14662v2","updated":"2024-10-03T15:52:17Z","published":"2024-06-20T18:30:09Z","title":"Advantage Alignment Algorithms","summary":" Artificially intelligent agents are increasingly being integrated into human\ndecision-making: from large language model (LLM) assistants to autonomous\nvehicles. These systems often optimize their individual objective, leading to\nconflicts, particularly in general-sum games where naive reinforcement learning\nagents empirically converge to Pareto-suboptimal Nash equilibria. To address\nthis issue, opponent shaping has emerged as a paradigm for finding socially\nbeneficial equilibria in general-sum games. In this work, we introduce\nAdvantage Alignment, a family of algorithms derived from first principles that\nperform opponent shaping efficiently and intuitively. We achieve this by\naligning the advantages of interacting agents, increasing the probability of\nmutually beneficial actions when their interaction has been positive. We prove\nthat existing opponent shaping methods implicitly perform Advantage Alignment.\nCompared to these methods, Advantage Alignment simplifies the mathematical\nformulation of opponent shaping, reduces the computational burden and extends\nto continuous action domains. We demonstrate the effectiveness of our\nalgorithms across a range of social dilemmas, achieving state-of-the-art\ncooperation and robustness against exploitation.\n","authors":["Juan Agustin Duque","Milad Aghajohari","Tim Cooijmans","Razvan Ciuca","Tianyu Zhang","Gauthier Gidel","Aaron Courville"],"pdf_url":"https://arxiv.org/pdf/2406.14662v2.pdf","comment":"25 Pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.02615v1","updated":"2024-10-03T15:52:03Z","published":"2024-10-03T15:52:03Z","title":"LoGra-Med: Long Context Multi-Graph Alignment for Medical\n Vision-Language Model","summary":" State-of-the-art medical multi-modal large language models (med-MLLM), like\nLLaVA-Med or BioMedGPT, leverage instruction-following data in pre-training.\nHowever, those models primarily focus on scaling the model size and data volume\nto boost performance while mainly relying on the autoregressive learning\nobjectives. Surprisingly, we reveal that such learning schemes might result in\na weak alignment between vision and language modalities, making these models\nhighly reliant on extensive pre-training datasets - a significant challenge in\nmedical domains due to the expensive and time-consuming nature of curating\nhigh-quality instruction-following instances. We address this with LoGra-Med, a\nnew multi-graph alignment algorithm that enforces triplet correlations across\nimage modalities, conversation-based descriptions, and extended captions. This\nhelps the model capture contextual meaning, handle linguistic variability, and\nbuild cross-modal associations between visuals and text. To scale our approach,\nwe designed an efficient end-to-end learning scheme using black-box gradient\nestimation, enabling faster LLaMa 7B training. 
Our results show LoGra-Med\nmatches LLAVA-Med performance on 600K image-text pairs for Medical VQA and\nsignificantly outperforms it when trained on 10% of the data. For example, on\nVQA-RAD, we exceed LLAVA-Med by 20.13% and nearly match the 100% pre-training\nscore (72.52% vs. 72.64%). We also surpass SOTA methods like BiomedGPT on\nvisual chatbots and RadFM on zero-shot image classification with VQA,\nhighlighting the effectiveness of multi-graph alignment.\n","authors":["Duy M. H. Nguyen","Nghiem T. Diep","Trung Q. Nguyen","Hoang-Bao Le","Tai Nguyen","Tien Nguyen","TrungTin Nguyen","Nhat Ho","Pengtao Xie","Roger Wattenhofer","James Zhou","Daniel Sonntag","Mathias Niepert"],"pdf_url":"https://arxiv.org/pdf/2410.02615v1.pdf","comment":"First version"},{"id":"http://arxiv.org/abs/2410.02611v1","updated":"2024-10-03T15:50:08Z","published":"2024-10-03T15:50:08Z","title":"IndicSentEval: How Effectively do Multilingual Transformer Models encode\n Linguistic Properties for Indic Languages?","summary":" Transformer-based models have revolutionized the field of natural language\nprocessing. To understand why they perform so well and to assess their\nreliability, several studies have focused on questions such as: Which\nlinguistic properties are encoded by these models, and to what extent? How\nrobust are these models in encoding linguistic properties when faced with\nperturbations in the input text? However, these studies have mainly focused on\nBERT and the English language. In this paper, we investigate similar questions\nregarding encoding capability and robustness for 8 linguistic properties across\n13 different perturbations in 6 Indic languages, using 9 multilingual\nTransformer models (7 universal and 2 Indic-specific). To conduct this study,\nwe introduce a novel multilingual benchmark dataset, IndicSentEval, containing\napproximately $\\sim$47K sentences. Surprisingly, our probing analysis of\nsurface, syntactic, and semantic properties reveals that while almost all\nmultilingual models demonstrate consistent encoding performance for English,\nthey show mixed results for Indic languages. As expected, Indic-specific\nmultilingual models capture linguistic properties in Indic languages better\nthan universal models. Intriguingly, universal models broadly exhibit better\nrobustness compared to Indic-specific models, particularly under perturbations\nsuch as dropping both nouns and verbs, dropping only verbs, or keeping only\nnouns. Overall, this study provides valuable insights into probing and\nperturbation-specific strengths and weaknesses of popular multilingual\nTransformer-based models for different Indic languages. We make our code and\ndataset publicly available [https://tinyurl.com/IndicSentEval}].\n","authors":["Akhilesh Aravapalli","Mounika Marreddy","Subba Reddy Oota","Radhika Mamidi","Manish Gupta"],"pdf_url":"https://arxiv.org/pdf/2410.02611v1.pdf","comment":"23 pages, 11 figures"},{"id":"http://arxiv.org/abs/2407.10960v3","updated":"2024-10-03T15:48:45Z","published":"2024-07-15T17:55:42Z","title":"Fast Matrix Multiplications for Lookup Table-Quantized LLMs","summary":" The deployment of large language models (LLMs) is often constrained by memory\nbandwidth, where the primary bottleneck is the cost of transferring model\nparameters from the GPU's global memory to its registers. When coupled with\ncustom kernels that fuse the dequantization and matmul operations, weight-only\nquantization can thus enable faster inference by reducing the amount of memory\nmovement. 
However, developing high-performance kernels for weight-quantized\nLLMs presents substantial challenges, especially when the weights are\ncompressed to non-evenly-divisible bit widths (e.g., 3 bits) with non-uniform,\nlookup table (LUT) quantization. This paper describes FLUTE, a flexible lookup\ntable engine for LUT-quantized LLMs, which uses offline restructuring of the\nquantized weight matrix to minimize bit manipulations associated with\nunpacking, and vectorization and duplication of the lookup table to mitigate\nshared memory bandwidth constraints. At batch sizes < 32 and quantization group\nsize of 128 (typical in LLM inference), the FLUTE kernel can be 2-4x faster\nthan existing GEMM kernels. As an application of FLUTE, we explore a simple\nextension to lookup table-based NormalFloat quantization and apply it to\nquantize LLaMA3 to various configurations, obtaining competitive quantization\nperformance against strong baselines while obtaining an end-to-end throughput\nincrease of 1.5 to 2 times.\n","authors":["Han Guo","William Brandon","Radostin Cholakov","Jonathan Ragan-Kelley","Eric P. Xing","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2407.10960v3.pdf","comment":"EMNLP 2024 (Findings)"},{"id":"http://arxiv.org/abs/2406.18164v3","updated":"2024-10-03T15:46:16Z","published":"2024-06-26T08:24:44Z","title":"Nebula: A discourse aware Minecraft Builder","summary":" When engaging in collaborative tasks, humans efficiently exploit the semantic\nstructure of a conversation to optimize verbal and nonverbal interactions. But\nin recent \"language to code\" or \"language to action\" models, this information\nis lacking. We show how incorporating the prior discourse and nonlinguistic\ncontext of a conversation situated in a nonlinguistic environment can improve\nthe \"language to action\" component of such interactions. We finetune an LLM to\npredict actions based on prior context; our model, Nebula, doubles the\nnet-action F1 score over the baseline on this task of Jayannavar et al.(2020).\nWe also investigate our model's ability to construct shapes and understand\nlocation descriptions using a synthetic dataset\n","authors":["Akshay Chaturvedi","Kate Thompson","Nicholas Asher"],"pdf_url":"https://arxiv.org/pdf/2406.18164v3.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2304.08460v3","updated":"2024-10-03T15:46:13Z","published":"2023-04-17T17:36:35Z","title":"LongForm: Effective Instruction Tuning with Reverse Instructions","summary":" Instruction tuning enables language models to more effectively generalize and\nbetter follow user intent. However, obtaining instruction data is costly and\nchallenging. Prior work employs methods such as expensive human annotation,\ncrowd-sourced datasets with alignment issues, and generating noisy examples via\nLLMs. We introduce the LongForm-C dataset, which is created by reverse\ninstructions. We generate instructions via LLMs for human-written corpus\nexamples using reverse instructions. First we select a diverse set of\nhuman-written documents from corpora such as C4 and Wikipedia; then we generate\ninstructions for these documents via LLMs. This approach provides a cheaper and\ncleaner instruction-tuning dataset with natural output and one suitable for\nlong text generation. Our models outperform 10x larger language models without\ninstruction tuning on tasks such as story/recipe generation and long-form\nquestion answering. 
Moreover, LongForm models outperform prior\ninstruction-tuned models such as FLAN-T5 and Alpaca by a large margin, and\nimprove language understanding capabilities further. We publicly release our\ndata and models: https://github.com/akoksal/LongForm.\n","authors":["Abdullatif Köksal","Timo Schick","Anna Korhonen","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2304.08460v3.pdf","comment":"EMNLP 2024 Findings. This version extends the training with recent\n LLMs, evaluation with new metrics, and NLU tasks"},{"id":"http://arxiv.org/abs/2410.02605v1","updated":"2024-10-03T15:45:39Z","published":"2024-10-03T15:45:39Z","title":"Beyond Expected Returns: A Policy Gradient Algorithm for Cumulative\n Prospect Theoretic Reinforcement Learning","summary":" The widely used expected utility theory has been shown to be empirically\ninconsistent with human preferences in the psychology and behavioral economics\nliteratures. Cumulative Prospect Theory (CPT) has been developed to fill in\nthis gap and provide a better model for human-based decision-making supported\nby empirical evidence. It allows expressing a wide range of attitudes and\nperceptions towards risk, gains and losses. A few years ago, CPT was\ncombined with Reinforcement Learning (RL) to formulate a CPT policy\noptimization problem where the goal of the agent is to search for a policy\ngenerating long-term returns which are aligned with their preferences. In this\nwork, we revisit this policy optimization problem and provide new insights into\noptimal policies and their nature depending on the utility function under\nconsideration. We further derive a novel policy gradient theorem for the CPT\npolicy optimization objective generalizing the seminal corresponding result in\nstandard RL. This result enables us to design a model-free policy gradient\nalgorithm to solve the CPT-RL problem. We illustrate the performance of our\nalgorithm in simple examples motivated by traffic control and electricity\nmanagement applications. We also demonstrate that our policy gradient algorithm\nscales better to larger state spaces compared to the existing zeroth order\nalgorithm for solving the same problem.\n","authors":["Olivier Lepel","Anas Barakat"],"pdf_url":"https://arxiv.org/pdf/2410.02605v1.pdf","comment":"33 pages, 19 figures"},{"id":"http://arxiv.org/abs/2410.02604v1","updated":"2024-10-03T15:45:15Z","published":"2024-10-03T15:45:15Z","title":"Long-Sequence Recommendation Models Need Decoupled Embeddings","summary":" Lifelong user behavior sequences, comprising up to tens of thousands of\nhistory behaviors, are crucial for capturing user interests and predicting user\nresponses in modern recommendation systems. A two-stage paradigm is typically\nadopted to handle these long sequences: a few relevant behaviors are first\nsearched from the original long sequences via an attention mechanism in the\nfirst stage and then aggregated with the target item to construct a\ndiscriminative representation for prediction in the second stage. In this work,\nwe identify and characterize, for the first time, a neglected deficiency in\nexisting long-sequence recommendation models: a single set of embeddings\nstruggles with learning both attention and representation, leading to\ninterference between these two processes. Initial attempts to address this\nissue using linear projections -- a technique borrowed from language processing\n-- proved ineffective, shedding light on the unique challenges of\nrecommendation models. 
To overcome this, we propose the Decoupled Attention and\nRepresentation Embeddings (DARE) model, where two distinct embedding tables are\ninitialized and learned separately to fully decouple attention and\nrepresentation. Extensive experiments and analysis demonstrate that DARE\nprovides more accurate search of correlated behaviors and outperforms baselines\nwith AUC gains up to 0.9% on public datasets and notable online system\nimprovements. Furthermore, decoupling embedding spaces allows us to reduce the\nattention embedding dimension and accelerate the search procedure by 50%\nwithout significant performance impact, enabling more efficient,\nhigh-performance online serving.\n","authors":["Ningya Feng","Junwei Pan","Jialong Wu","Baixu Chen","Ximei Wang","Qian Li","Xian Hu","Jie Jiang","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2410.02604v1.pdf","comment":"First three authors contributed equally"},{"id":"http://arxiv.org/abs/2410.02603v1","updated":"2024-10-03T15:44:42Z","published":"2024-10-03T15:44:42Z","title":"Agents' Room: Narrative Generation through Multi-step Collaboration","summary":" Writing compelling fiction is a multifaceted process combining elements such\nas crafting a plot, developing interesting characters, and using evocative\nlanguage. While large language models (LLMs) show promise for story writing,\nthey currently rely heavily on intricate prompting, which limits their use. We\npropose Agents' Room, a generation framework inspired by narrative theory, that\ndecomposes narrative writing into subtasks tackled by specialized agents. To\nillustrate our method, we introduce Tell Me A Story, a high-quality dataset of\ncomplex writing prompts and human-written stories, and a novel evaluation\nframework designed specifically for assessing long narratives. We show that\nAgents' Room generates stories that are preferred by expert evaluators over\nthose produced by baseline systems by leveraging collaboration and\nspecialization to decompose the complex story writing task into tractable\ncomponents. We provide extensive analysis with automated and human-based\nmetrics of the generated output.\n","authors":["Fantine Huot","Reinald Kim Amplayo","Jennimaria Palomaki","Alice Shoshana Jakobovits","Elizabeth Clark","Mirella Lapata"],"pdf_url":"https://arxiv.org/pdf/2410.02603v1.pdf","comment":"Under review as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2410.02601v1","updated":"2024-10-03T15:43:17Z","published":"2024-10-03T15:43:17Z","title":"Diffusion & Adversarial Schrödinger Bridges via Iterative Proportional\n Markovian Fitting","summary":" The Iterative Markovian Fitting (IMF) procedure based on iterative reciprocal\nand Markovian projections has recently been proposed as a powerful method for\nsolving the Schr\\\"odinger Bridge problem. However, it has been observed that\nfor the practical implementation of this procedure, it is crucial to alternate\nbetween fitting a forward and backward time diffusion at each iteration. Such\nimplementation is thought to be a practical heuristic, which is required to\nstabilize training and obtain good results in applications such as unpaired\ndomain translation. In our work, we show that this heuristic closely connects\nwith the pioneer approaches for the Schr\\\"odinger Bridge based on the Iterative\nProportional Fitting (IPF) procedure. 
Namely, we find that the practical\nimplementation of IMF is, in fact, a combination of IMF and IPF procedures, and\nwe call this combination the Iterative Proportional Markovian Fitting (IPMF)\nprocedure. We show both theoretically and practically that this combined IPMF\nprocedure can converge under more general settings, thus, showing that the IPMF\nprocedure opens a door towards developing a unified framework for solving\nSchr\\\"odinger Bridge problems.\n","authors":["Sergei Kholkin","Grigoriy Ksenofontov","David Li","Nikita Kornilov","Nikita Gushchin","Evgeny Burnaev","Alexander Korotin"],"pdf_url":"https://arxiv.org/pdf/2410.02601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02597v1","updated":"2024-10-03T15:38:20Z","published":"2024-10-03T15:38:20Z","title":"Three-in-One: Fast and Accurate Transducer for Hybrid-Autoregressive ASR","summary":" We present \\textbf{H}ybrid-\\textbf{A}utoregressive \\textbf{IN}ference\nTr\\textbf{AN}sducers (HAINAN), a novel architecture for speech recognition that\nextends the Token-and-Duration Transducer (TDT) model. Trained with randomly\nmasked predictor network outputs, HAINAN supports both autoregressive inference\nwith all network components and non-autoregressive inference without the\npredictor. Additionally, we propose a novel semi-autoregressive inference\nparadigm that first generates an initial hypothesis using non-autoregressive\ninference, followed by refinement steps where each token prediction is\nregenerated using parallelized autoregression on the initial hypothesis.\nExperiments on multiple datasets across different languages demonstrate that\nHAINAN achieves efficiency parity with CTC in non-autoregressive mode and with\nTDT in autoregressive mode. In terms of accuracy, autoregressive HAINAN\noutperforms TDT and RNN-T, while non-autoregressive HAINAN significantly\noutperforms CTC. Semi-autoregressive inference further enhances the model's\naccuracy with minimal computational overhead, and even outperforms TDT results\nin some cases. These results highlight HAINAN's flexibility in balancing\naccuracy and speed, positioning it as a strong candidate for real-world speech\nrecognition applications.\n","authors":["Hainan Xu","Travis M. Bartley","Vladimir Bataev","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2410.02597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02596v1","updated":"2024-10-03T15:37:22Z","published":"2024-10-03T15:37:22Z","title":"Beyond Squared Error: Exploring Loss Design for Enhanced Training of\n Generative Flow Networks","summary":" Generative Flow Networks (GFlowNets) are a novel class of generative models\ndesigned to sample from unnormalized distributions and have found applications\nin various important tasks, attracting great research interest in their\ntraining algorithms. In general, GFlowNets are trained by fitting the forward\nflow to the backward flow on sampled training objects. Prior work focused on\nthe choice of training objects, parameterizations, sampling and resampling\nstrategies, and backward policies, aiming to enhance credit assignment,\nexploration, or exploitation of the training process. However, the choice of\nregression loss, which can highly influence the exploration and exploitation\nbehavior of the under-training policy, has been overlooked. 
Due to the lack of\ntheoretical understanding for choosing an appropriate regression loss, most\nexisting algorithms train the flow network by minimizing the squared error of\nthe forward and backward flows in log-space, i.e., using the quadratic\nregression loss. In this work, we rigorously prove that distinct regression\nlosses correspond to specific divergence measures, enabling us to design and\nanalyze regression losses according to the desired properties of the\ncorresponding divergence measures. Specifically, we examine two key properties:\nzero-forcing and zero-avoiding, where the former promotes exploitation and\nhigher rewards, and the latter encourages exploration and enhances diversity.\nBased on our theoretical framework, we propose three novel regression losses,\nnamely, Shifted-Cosh, Linex(1/2), and Linex(1). We evaluate them across three\nbenchmarks: hyper-grid, bit-sequence generation, and molecule generation. Our\nproposed losses are compatible with most existing training algorithms, and\nsignificantly improve the performances of the algorithms concerning convergence\nspeed, sample diversity, and robustness.\n","authors":["Rui Hu","Yifan Zhang","Zhuoran Li","Longbo Huang"],"pdf_url":"https://arxiv.org/pdf/2410.02596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02592v1","updated":"2024-10-03T15:34:41Z","published":"2024-10-03T15:34:41Z","title":"IC3M: In-Car Multimodal Multi-object Monitoring for Abnormal Status of\n Both Driver and Passengers","summary":" Recently, in-car monitoring has emerged as a promising technology for\ndetecting early-stage abnormal status of the driver and providing timely alerts\nto prevent traffic accidents. Although training models with multimodal data\nenhances the reliability of abnormal status detection, the scarcity of labeled\ndata and the imbalance of class distribution impede the extraction of critical\nabnormal state features, significantly deteriorating training performance.\nFurthermore, missing modalities due to environment and hardware limitations\nfurther exacerbate the challenge of abnormal status identification. More\nimportantly, monitoring abnormal health conditions of passengers, particularly\nin elderly care, is of paramount importance but remains underexplored. To\naddress these challenges, we introduce our IC3M, an efficient\ncamera-rotation-based multimodal framework for monitoring both driver and\npassengers in a car. Our IC3M comprises two key modules: an adaptive threshold\npseudo-labeling strategy and a missing modality reconstruction. The former\ncustomizes pseudo-labeling thresholds for different classes based on the class\ndistribution, generating class-balanced pseudo labels to guide model training\neffectively, while the latter leverages crossmodality relationships learned\nfrom limited labels to accurately recover missing modalities by distribution\ntransferring from available modalities. 
Extensive experimental results\ndemonstrate that IC3M outperforms state-of-the-art benchmarks in accuracy,\nprecision, and recall while exhibiting superior robustness under limited\nlabeled data and severe missing modality.\n","authors":["Zihan Fang","Zheng Lin","Senkang Hu","Hangcheng Cao","Yiqin Deng","Xianhao Chen","Yuguang Fang"],"pdf_url":"https://arxiv.org/pdf/2410.02592v1.pdf","comment":"16 pages, 17 figures"},{"id":"http://arxiv.org/abs/2405.06443v2","updated":"2024-10-03T15:34:02Z","published":"2024-05-10T12:48:57Z","title":"Residual-based Attention Physics-informed Neural Networks for\n Spatio-Temporal Ageing Assessment of Transformers Operated in Renewable Power\n Plants","summary":" Transformers are crucial for reliable and efficient power system operations,\nparticularly in supporting the integration of renewable energy. Effective\nmonitoring of transformer health is critical to maintain grid stability and\nperformance. Thermal insulation ageing is a key transformer failure mode, which\nis generally tracked by monitoring the hotspot temperature (HST). However, HST\nmeasurement is complex, costly, and often estimated from indirect measurements.\nExisting HST models focus on space-agnostic thermal models, providing\nworst-case HST estimates. This article introduces a spatio-temporal model for\ntransformer winding temperature and ageing estimation, which leverages\nphysics-based partial differential equations (PDEs) with data-driven Neural\nNetworks (NN) in a Physics Informed Neural Networks (PINNs) configuration to\nimprove prediction accuracy and acquire spatio-temporal resolution. The\ncomputational accuracy of the PINN model is improved through the implementation\nof the Residual-Based Attention (PINN-RBA) scheme that accelerates the PINN\nmodel convergence. The PINN-RBA model is benchmarked against self-adaptive\nattention schemes and classical vanilla PINN configurations. For the first\ntime, PINN based oil temperature predictions are used to estimate\nspatio-temporal transformer winding temperature values, validated through PDE\nnumerical solution and fiber optic sensor measurements. Furthermore, the\nspatio-temporal transformer ageing model is inferred, which supports\ntransformer health management decision-making. Results are validated with a\ndistribution transformer operating on a floating photovoltaic power plant.\n","authors":["Ibai Ramirez","Joel Pino","David Pardo","Mikel Sanz","Luis del Rio","Alvaro Ortiz","Kateryna Morozovska","Jose I. Aizpurua"],"pdf_url":"https://arxiv.org/pdf/2405.06443v2.pdf","comment":"23 pages, 18 figures"},{"id":"http://arxiv.org/abs/2410.02590v1","updated":"2024-10-03T15:32:08Z","published":"2024-10-03T15:32:08Z","title":"Generalization emerges from local optimization in a self-organized\n learning network","summary":" We design and analyze a new paradigm for building supervised learning\nnetworks, driven only by local optimization rules without relying on a global\nerror function. Traditional neural networks with a fixed topology are made up\nof identical nodes and derive their expressiveness from an appropriate\nadjustment of connection weights. In contrast, our network stores new knowledge\nin the nodes accurately and instantaneously, in the form of a lookup table.\nOnly then is some of this information structured and incorporated into the\nnetwork geometry. The training error is initially zero by construction and\nremains so throughout the network topology transformation phase. 
The latter\ninvolves a small number of local topological transformations, such as splitting\nor merging of nodes and adding binary connections between them. The choice of\noperations to be carried out is only driven by optimization of expressivity at\nthe local scale. What we are primarily looking for in a learning network is its\nability to generalize, i.e. its capacity to correctly answer questions for\nwhich it has never learned the answers. We show on numerous examples of\nclassification tasks that the networks generated by our algorithm\nsystematically reach such a state of perfect generalization when the number of\nlearned examples becomes sufficiently large. We report on the dynamics of the\nchange of state and show that it is abrupt and has the distinctive\ncharacteristics of a first order phase transition, a phenomenon already\nobserved for traditional learning networks and known as grokking. In addition\nto proposing a non-potential approach for the construction of learning\nnetworks, our algorithm makes it possible to rethink the grokking transition in\na new light, under which acquisition of training data and topological\nstructuring of data are completely decoupled phenomena.\n","authors":["S. Barland","L. Gil"],"pdf_url":"https://arxiv.org/pdf/2410.02590v1.pdf","comment":"This paper is submitted to Phys. Rev. X. It's a physicist's study\n that focuses on a new paradigm for deep learning networks. We would have liked\n to choose other keywords for arXiv to reach a wider community, but don't have\n the rights to do so"},{"id":"http://arxiv.org/abs/2410.02581v1","updated":"2024-10-03T15:25:37Z","published":"2024-10-03T15:25:37Z","title":"Boosting Sample Efficiency and Generalization in Multi-agent\n Reinforcement Learning via Equivariance","summary":" Multi-Agent Reinforcement Learning (MARL) struggles with sample inefficiency\nand poor generalization [1]. These challenges are partially due to a lack of\nstructure or inductive bias in the neural networks typically used in learning\nthe policy. One such form of structure that is commonly observed in multi-agent\nscenarios is symmetry. The field of Geometric Deep Learning has developed\nEquivariant Graph Neural Networks (EGNN) that are equivariant (or symmetric) to\nrotations, translations, and reflections of nodes. Incorporating equivariance\nhas been shown to improve learning efficiency and decrease error [2]. In this\npaper, we demonstrate that EGNNs improve the sample efficiency and\ngeneralization in MARL. However, we also show that a naive application of EGNNs\nto MARL results in poor early exploration due to a bias in the EGNN structure.\nTo mitigate this bias, we present Exploration-enhanced Equivariant Graph Neural\nNetworks or E2GN2. We compare E2GN2 to other common function approximators\nusing common MARL benchmarks MPE and SMACv2. E2GN2 demonstrates a significant\nimprovement in sample efficiency, greater final reward convergence, and a 2x-5x\ngain over standard GNNs in our generalization tests. 
These results pave the\nway for more reliable and effective solutions in complex multi-agent systems.\n","authors":["Joshua McClellan","Naveed Haghani","John Winder","Furong Huang","Pratap Tokekar"],"pdf_url":"https://arxiv.org/pdf/2410.02581v1.pdf","comment":"accepted as a poster at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.18313v3","updated":"2024-10-03T15:17:22Z","published":"2024-09-26T21:44:11Z","title":"Embodied-RAG: General Non-parametric Embodied Memory for Retrieval and\n Generation","summary":" There is no limit to how much a robot might explore and learn, but all of\nthat knowledge needs to be searchable and actionable. Within language research,\nretrieval augmented generation (RAG) has become the workhorse of large-scale\nnon-parametric knowledge; however, existing techniques do not directly transfer\nto the embodied domain, where data is multimodal and highly correlated, and\nperception requires abstraction.\n To address these challenges, we introduce Embodied-RAG, a framework that\nenhances the foundational model of an embodied agent with a non-parametric\nmemory system capable of autonomously constructing hierarchical knowledge for\nboth navigation and language generation. Embodied-RAG handles a full range of\nspatial and semantic resolutions across diverse environments and query types,\nwhether for a specific object or a holistic description of ambiance. At its\ncore, Embodied-RAG's memory is structured as a semantic forest, storing\nlanguage descriptions at varying levels of detail. This hierarchical\norganization allows the system to efficiently generate context-sensitive\noutputs across different robotic platforms. We demonstrate that Embodied-RAG\neffectively bridges RAG to the robotics domain, successfully handling over 200\nexplanation and navigation queries across 19 environments, highlighting its\npromise as a general-purpose non-parametric system for embodied agents.\n","authors":["Quanting Xie","So Yeon Min","Tianyi Zhang","Aarav Bajaj","Ruslan Salakhutdinov","Matthew Johnson-Roberson","Yonatan Bisk"],"pdf_url":"https://arxiv.org/pdf/2409.18313v3.pdf","comment":"Web: https://quanting-xie.github.io/Embodied-RAG-web/"},{"id":"http://arxiv.org/abs/2405.17829v2","updated":"2024-10-03T15:14:29Z","published":"2024-05-28T04:59:13Z","title":"LDMol: Text-to-Molecule Diffusion Model with Structurally Informative\n Latent Space","summary":" With the emergence of diffusion models as the frontline of generative models,\nmany researchers have proposed molecule generation techniques with conditional\ndiffusion models. However, the unavoidable discreteness of a molecule makes it\ndifficult for a diffusion model to connect raw data with highly complex\nconditions like natural language. To address this, we present a novel latent\ndiffusion model dubbed LDMol for text-conditioned molecule generation. LDMol\ncomprises a molecule autoencoder that produces a learnable and structurally\ninformative feature space, and a natural language-conditioned latent diffusion\nmodel. In particular, recognizing that multiple SMILES notations can represent\nthe same molecule, we employ a contrastive learning strategy to extract a feature\nspace that is aware of the unique characteristics of the molecule structure.\nLDMol outperforms the existing baselines on the text-to-molecule generation\nbenchmark, suggesting that diffusion models can outperform\nautoregressive models in text data generation with a better choice of the\nlatent domain. 
Furthermore, we show that LDMol can be applied to downstream\ntasks such as molecule-to-text retrieval and text-guided molecule editing,\ndemonstrating its versatility as a diffusion model.\n","authors":["Jinho Chang","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2405.17829v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02566v1","updated":"2024-10-03T15:10:02Z","published":"2024-10-03T15:10:02Z","title":"Deep Learning-Based Prediction of Suspension Dynamics Performance in\n Multi-Axle Vehicles","summary":" This paper presents a deep learning-based framework for predicting the\ndynamic performance of suspension systems in multi-axle vehicles, emphasizing\nthe integration of machine learning with traditional vehicle dynamics modeling.\nA Multi-Task Deep Belief Network Deep Neural Network (MTL-DBN-DNN) was\ndeveloped to capture the relationships between key vehicle parameters and\nsuspension performance metrics. The model was trained on data generated from\nnumerical simulations and demonstrated superior prediction accuracy compared to\nconventional DNN models. A comprehensive sensitivity analysis was conducted to\nassess the impact of various vehicle and suspension parameters on dynamic\nsuspension performance. Additionally, the Suspension Dynamic Performance Index\n(SDPI) was introduced as a holistic measure to quantify overall suspension\nperformance, accounting for the combined effects of multiple parameters. The\nfindings highlight the effectiveness of multitask learning in improving\npredictive models for complex vehicle systems.\n","authors":["Kai Chun Lin","Bo-Yi Lin"],"pdf_url":"https://arxiv.org/pdf/2410.02566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14407v2","updated":"2024-10-03T15:07:52Z","published":"2024-02-22T09:48:47Z","title":"Learning an Actionable Discrete Diffusion Policy via Large-Scale\n Actionless Video Pre-Training","summary":" Learning a generalist embodied agent capable of completing multiple tasks\nposes challenges, primarily stemming from the scarcity of action-labeled\nrobotic datasets. In contrast, a vast amount of human videos exist, capturing\nintricate tasks and interactions with the physical world. Promising prospects\narise for utilizing actionless human videos for pre-training and transferring\nthe knowledge to facilitate robot policy learning through limited robot\ndemonstrations. However, it remains a challenge due to the domain gap between\nhumans and robots. Moreover, it is difficult to extract useful information\nrepresenting the dynamic world from human videos, because of its noisy and\nmultimodal data structure. In this paper, we introduce a novel framework to\ntackle these challenges, which leverages a unified discrete diffusion to\ncombine generative pre-training on human videos and policy fine-tuning on a\nsmall number of action-labeled robot videos. We start by compressing both human\nand robot videos into unified video tokens. In the pre-training stage, we\nemploy a discrete diffusion model with a mask-and-replace diffusion strategy to\npredict future video tokens in the latent space. In the fine-tuning stage, we\nharness the imagined future videos to guide low-level action learning with a\nlimited set of robot data. Experiments demonstrate that our method generates\nhigh-fidelity future videos for planning and enhances the fine-tuned policies\ncompared to previous state-of-the-art approaches with superior performance. 
Our\nproject website is available at https://video-diff.github.io/.\n","authors":["Haoran He","Chenjia Bai","Ling Pan","Weinan Zhang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2402.14407v2.pdf","comment":"Accepted by NeurIPS 2024. 24 pages"},{"id":"http://arxiv.org/abs/2410.02561v1","updated":"2024-10-03T15:04:47Z","published":"2024-10-03T15:04:47Z","title":"The Benefit of Being Bayesian in Online Conformal Prediction","summary":" Based on the framework of Conformal Prediction (CP), we study the online\nconstruction of valid confidence sets given a black-box machine learning model.\nBy converting the target confidence levels into quantile levels, the problem\ncan be reduced to predicting the quantiles (in hindsight) of a sequentially\nrevealed data sequence. Two very different approaches have been studied\npreviously. (i) Direct approach: Assuming the data sequence is iid or\nexchangeable, one could maintain the empirical distribution of the observed\ndata as an algorithmic belief, and directly predict its quantiles. (ii)\nIndirect approach: As statistical assumptions often do not hold in practice, a\nrecent trend is to consider the adversarial setting and apply first-order\nonline optimization to moving quantile losses (Gibbs & Cand\\`es, 2021). It\nrequires knowing the target quantile level beforehand, and suffers from certain\nvalidity issues on the obtained confidence sets, due to the associated loss\nlinearization.\n This paper presents a novel Bayesian CP framework that combines their\nstrengths. Without any statistical assumption, it is able to both: (i) answer\nmultiple arbitrary confidence level queries online, with provably low regret;\nand (ii) overcome the validity issues suffered by first-order optimization\nbaselines, due to being \"data-centric\" rather than \"iterate-centric\".\n From a technical perspective, our key idea is to regularize the algorithmic\nbelief of the above direct approach by a Bayesian prior, which \"robustifies\" it\nby simulating a non-linearized Follow the Regularized Leader (FTRL) algorithm\non the output. For statisticians, this can be regarded as an online adversarial\nview of Bayesian inference. Importantly, the proposed belief update backbone is\nshared by prediction heads targeting different confidence levels, bringing\npractical benefits analogous to U-calibration (Kleinberg et al., 2023).\n","authors":["Zhiyu Zhang","Zhou Lu","Heng Yang"],"pdf_url":"https://arxiv.org/pdf/2410.02561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02559v1","updated":"2024-10-03T15:04:01Z","published":"2024-10-03T15:04:01Z","title":"Obtaining Lower Query Complexities through Lightweight Zeroth-Order\n Proximal Gradient Algorithms","summary":" Zeroth-order (ZO) optimization is one key technique for machine learning\nproblems where gradient calculation is expensive or impossible. Several\nvariance reduced ZO proximal algorithms have been proposed to speed up ZO\noptimization for non-smooth problems, and all of them opted for the coordinated\nZO estimator against the random ZO estimator when approximating the true\ngradient, since the former is more accurate. While the random ZO estimator\nintroduces bigger error and makes convergence analysis more challenging\ncompared to coordinated ZO estimator, it requires only $\\mathcal{O}(1)$\ncomputation, which is significantly less than $\\mathcal{O}(d)$ computation of\nthe coordinated ZO estimator, with $d$ being dimension of the problem space. 
To\ntake advantage of the computationally efficient nature of the random ZO\nestimator, we first propose a ZO objective decrease (ZOOD) property which can\nincorporate two different types of errors in the upper bound of convergence\nrate. Next, we propose two generic reduction frameworks for ZO optimization\nwhich can automatically derive the convergence results for convex and\nnon-convex problems respectively, as long as the convergence rate for the inner\nsolver satisfies the ZOOD property. With the application of two reduction\nframeworks on our proposed ZOR-ProxSVRG and ZOR-ProxSAGA, two variance reduced\nZO proximal algorithms with fully random ZO estimators, we improve the\nstate-of-the-art function query complexities from\n$\\mathcal{O}\\left(\\min\\{\\frac{dn^{1/2}}{\\epsilon^2},\n\\frac{d}{\\epsilon^3}\\}\\right)$ to\n$\\tilde{\\mathcal{O}}\\left(\\frac{n+d}{\\epsilon^2}\\right)$ under $d >\nn^{\\frac{1}{2}}$ for non-convex problems, and from\n$\\mathcal{O}\\left(\\frac{d}{\\epsilon^2}\\right)$ to\n$\\tilde{\\mathcal{O}}\\left(n\\log\\frac{1}{\\epsilon}+\\frac{d}{\\epsilon}\\right)$\nfor convex problems.\n","authors":["Bin Gu","Xiyuan Wei","Hualin Zhang","Yi Chang","Heng Huang"],"pdf_url":"https://arxiv.org/pdf/2410.02559v1.pdf","comment":"Neural Computation 36 (5), 897-935"},{"id":"http://arxiv.org/abs/2402.12817v2","updated":"2024-10-03T14:56:24Z","published":"2024-02-20T08:38:19Z","title":"On Sensitivity of Learning with Limited Labelled Data to the Effects of\n Randomness: Impact of Interactions and Systematic Choices","summary":" While learning with limited labelled data can improve performance when the\nlabels are lacking, it is also sensitive to the effects of uncontrolled\nrandomness introduced by so-called randomness factors (e.g., varying order of\ndata). We propose a method to systematically investigate the effects of\nrandomness factors while taking the interactions between them into\nconsideration. To measure the true effects of an individual randomness factor,\nour method mitigates the effects of other factors and observes how the\nperformance varies across multiple runs. Applying our method to multiple\nrandomness factors across in-context learning and fine-tuning approaches on 7\nrepresentative text classification tasks and meta-learning on 3 tasks, we show\nthat: 1) disregarding interactions between randomness factors in existing works\ncaused inconsistent findings due to incorrect attribution of the effects of\nrandomness factors, such as disproving the consistent sensitivity of in-context\nlearning to sample order even with random sample selection; and 2) besides\nmutual interactions, the effects of randomness factors, especially sample\norder, are also dependent on more systematic choices unexplored in existing\nworks, such as number of classes, samples per class or choice of prompt format.\n","authors":["Branislav Pecher","Ivan Srba","Maria Bielikova"],"pdf_url":"https://arxiv.org/pdf/2402.12817v2.pdf","comment":"Accepted to the EMNLP'24 Main Conference"},{"id":"http://arxiv.org/abs/2309.16519v3","updated":"2024-10-03T14:55:41Z","published":"2023-09-28T15:25:17Z","title":"AtomSurf : Surface Representation for Learning on Protein Structures","summary":" While there has been significant progress in evaluating and comparing\ndifferent representations for learning on protein data, the role of\nsurface-based learning approaches remains not well-understood. 
In particular,\nthere is a lack of direct and fair benchmark comparison between the best\navailable surface-based learning methods against alternative representations\nsuch as graphs. Moreover, the few existing surface-based approaches either use\nsurface information in isolation or, at best, perform global pooling between\nsurface and graph-based architectures.\n In this work, we fill this gap by first adapting a state-of-the-art surface\nencoder for protein learning tasks. We then perform a direct and fair\ncomparison of the resulting method against alternative approaches within the\nAtom3D benchmark, highlighting the limitations of pure surface-based learning.\nFinally, we propose an integrated approach, which allows learned feature\nsharing between graphs and surface representations on the level of nodes and\nvertices $\\textit{across all layers}$.\n We demonstrate that the resulting architecture achieves state-of-the-art\nresults on all tasks in the Atom3D benchmark, while adhering to the strict\nbenchmark protocol, as well as more broadly on binding site identification and\nbinding pocket classification. Furthermore, we use coarsened surfaces and\noptimize our approach for efficiency, making our tool competitive in training\nand inference time with existing techniques. Our code and data can be found\nonline: $\\texttt{github.com/Vincentx15/atomsurf}$\n","authors":["Vincent Mallet","Souhaib Attaiki","Yangyang Miao","Bruno Correia","Maks Ovsjanikov"],"pdf_url":"https://arxiv.org/pdf/2309.16519v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2410.02551v1","updated":"2024-10-03T14:55:22Z","published":"2024-10-03T14:55:22Z","title":"ColaCare: Enhancing Electronic Health Record Modeling through Large\n Language Model-Driven Multi-Agent Collaboration","summary":" We introduce ColaCare, a framework that enhances Electronic Health Record\n(EHR) modeling through multi-agent collaboration driven by Large Language\nModels (LLMs). Our approach seamlessly integrates domain-specific expert models\nwith LLMs to bridge the gap between structured EHR data and text-based\nreasoning. Inspired by clinical consultations, ColaCare employs two types of\nagents: DoctorAgent and MetaAgent, which collaboratively analyze patient data.\nExpert models process and generate predictions from numerical EHR data, while\nLLM agents produce reasoning references and decision-making reports within the\ncollaborative consultation framework. We additionally incorporate the Merck\nManual of Diagnosis and Therapy (MSD) medical guideline within a\nretrieval-augmented generation (RAG) module for authoritative evidence support.\nExtensive experiments conducted on four distinct EHR datasets demonstrate\nColaCare's superior performance in mortality prediction tasks, underscoring its\npotential to revolutionize clinical decision support systems and advance\npersonalized precision medicine. The code, complete prompt templates, more case\nstudies, etc. are publicly available at the anonymous link:\nhttps://colacare.netlify.app.\n","authors":["Zixiang Wang","Yinghao Zhu","Huiya Zhao","Xiaochen Zheng","Tianlong Wang","Wen Tang","Yasha Wang","Chengwei Pan","Ewen M. 
Harrison","Junyi Gao","Liantao Ma"],"pdf_url":"https://arxiv.org/pdf/2410.02551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02548v1","updated":"2024-10-03T14:53:10Z","published":"2024-10-03T14:53:10Z","title":"Local Flow Matching Generative Models","summary":" Flow Matching (FM) is a simulation-free method for learning a continuous and\ninvertible flow to interpolate between two distributions, and in particular to\ngenerate data from noise in generative modeling. In this paper, we introduce\nLocal Flow Matching (LFM), which learns a sequence of FM sub-models and each\nmatches a diffusion process up to the time of the step size in the\ndata-to-noise direction. In each step, the two distributions to be interpolated\nby the sub-model are closer to each other than data vs. noise, and this enables\nthe use of smaller models with faster training. The stepwise structure of LFM\nis natural to be distilled and different distillation techniques can be adopted\nto speed up generation. Theoretically, we prove a generation guarantee of the\nproposed flow model in terms of the $\\chi^2$-divergence between the generated\nand true data distributions. In experiments, we demonstrate the improved\ntraining efficiency and competitive generative performance of LFM compared to\nFM on the unconditional generation of tabular data and image datasets, and also\non the conditional generation of robotic manipulation policies.\n","authors":["Chen Xu","Xiuyuan Cheng","Yao Xie"],"pdf_url":"https://arxiv.org/pdf/2410.02548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02543v1","updated":"2024-10-03T14:47:46Z","published":"2024-10-03T14:47:46Z","title":"Diffusion Models are Evolutionary Algorithms","summary":" In a convergence of machine learning and biology, we reveal that diffusion\nmodels are evolutionary algorithms. By considering evolution as a denoising\nprocess and reversed evolution as diffusion, we mathematically demonstrate that\ndiffusion models inherently perform evolutionary algorithms, naturally\nencompassing selection, mutation, and reproductive isolation. Building on this\nequivalence, we propose the Diffusion Evolution method: an evolutionary\nalgorithm utilizing iterative denoising -- as originally introduced in the\ncontext of diffusion models -- to heuristically refine solutions in parameter\nspaces. Unlike traditional approaches, Diffusion Evolution efficiently\nidentifies multiple optimal solutions and outperforms prominent mainstream\nevolutionary algorithms. Furthermore, leveraging advanced concepts from\ndiffusion models, namely latent space diffusion and accelerated sampling, we\nintroduce Latent Space Diffusion Evolution, which finds solutions for\nevolutionary tasks in high-dimensional complex parameter space while\nsignificantly reducing computational steps. 
This parallel between diffusion and\nevolution not only bridges two different fields but also opens new avenues for\nmutual enhancement, raising questions about open-ended evolution and\npotentially utilizing non-Gaussian or discrete diffusion models in the context\nof Diffusion Evolution.\n","authors":["Yanbo Zhang","Benedikt Hartl","Hananel Hazan","Michael Levin"],"pdf_url":"https://arxiv.org/pdf/2410.02543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02541v1","updated":"2024-10-03T14:45:23Z","published":"2024-10-03T14:45:23Z","title":"Fair Decentralized Learning","summary":" Decentralized learning (DL) is an emerging approach that enables nodes to\ncollaboratively train a machine learning model without sharing raw data. In\nmany application domains, such as healthcare, this approach faces challenges\ndue to the high level of heterogeneity in the training data's feature space.\nSuch feature heterogeneity lowers model utility and negatively impacts\nfairness, particularly for nodes with under-represented training data. In this\npaper, we introduce \\textsc{Facade}, a clustering-based DL algorithm\nspecifically designed for fair model training when the training data exhibits\nseveral distinct features. The challenge of \\textsc{Facade} is to assign nodes\nto clusters, one for each feature, based on the similarity in the features of\ntheir local data, without requiring individual nodes to know a priori which\ncluster they belong to. \\textsc{Facade} (1) dynamically assigns nodes to their\nappropriate clusters over time, and (2) enables nodes to collaboratively train\na specialized model for each cluster in a fully decentralized manner. We\ntheoretically prove the convergence of \\textsc{Facade}, implement our\nalgorithm, and compare it against three state-of-the-art baselines. Our\nexperimental results on three datasets demonstrate the superiority of our\napproach in terms of model accuracy and fairness compared to all three\ncompetitors. Compared to the best-performing baseline, \\textsc{Facade} on the\nCIFAR-10 dataset also reduces communication costs by 32.3\\% to reach a target\naccuracy when cluster sizes are imbalanced.\n","authors":["Sayan Biswas","Anne-Marie Kermarrec","Rishi Sharma","Thibaud Trinca","Martijn de Vos"],"pdf_url":"https://arxiv.org/pdf/2410.02541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04057v3","updated":"2024-10-03T14:40:12Z","published":"2024-08-07T19:39:37Z","title":"PowerPM: Foundation Model for Power Systems","summary":" The emergence of abundant electricity time series (ETS) data provides ample\nopportunities for various applications in power systems, including\ndemand-side management, grid stability, and consumer behavior analysis. Deep\nlearning models have advanced ETS modeling by effectively capturing sequence\ndependence. Nevertheless, learning a generic representation of ETS data for\nvarious applications remains challenging due to the inherently complex\nhierarchical structure of ETS data. Moreover, ETS data exhibits intricate\ntemporal dependencies and is susceptible to the influence of exogenous\nvariables. Furthermore, different instances exhibit diverse electricity\nconsumption behavior. In this paper, we propose a foundation model PowerPM to\nmodel ETS data, providing a large-scale, off-the-shelf model for power systems.\nPowerPM consists of a temporal encoder and a hierarchical encoder. The temporal\nencoder captures temporal dependencies in ETS data, considering exogenous\nvariables. 
The hierarchical encoder models the correlations within the hierarchy.\nFurthermore, PowerPM leverages a novel self-supervised pretraining framework\nconsisting of masked ETS modeling and dual-view contrastive learning, which\nenable PowerPM to capture temporal dependencies within ETS windows and remain\naware of the discrepancies across ETS windows, providing two different\nperspectives to learn generic representations. Our experiments involve five\nreal-world scenario datasets, comprising private and public data. Through\npre-training on massive ETS data, PowerPM achieves SOTA performance on diverse\ndownstream tasks within the private dataset. Impressively, when transferred to\nthe public datasets, PowerPM maintains its superiority, showcasing its\nremarkable generalization ability across various tasks and domains. Moreover,\nablation studies and few-shot experiments provide additional evidence of the\neffectiveness of our model.\n","authors":["Shihao Tu","Yupeng Zhang","Jing Zhang","Zhendong Fu","Yin Zhang","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2408.04057v3.pdf","comment":"23 pages, 5 figures, 8 tables"},{"id":"http://arxiv.org/abs/2401.07961v4","updated":"2024-10-03T14:35:40Z","published":"2024-01-15T20:57:50Z","title":"Solution of the Probabilistic Lambert Problem: Connections with Optimal\n Mass Transport, Schrödinger Bridge and Reaction-Diffusion PDEs","summary":" The Lambert problem originated in orbital mechanics. It concerns\ndetermining the initial velocity for a boundary value problem involving the\ndynamical constraint due to gravitational potential with additional time\nhorizon and endpoint position constraints. Its solution has application in\ntransferring a spacecraft from a given initial to a given terminal position\nwithin prescribed flight time via velocity control. We consider a probabilistic\nvariant of the Lambert problem where the knowledge of the endpoint constraints\nin position vectors is replaced by the knowledge of their respective joint\nprobability density functions. We show that the Lambert problem with endpoint\njoint probability density constraints is a generalized optimal mass transport\n(OMT) problem, thereby connecting this classical astrodynamics problem with a\nburgeoning area of research in modern stochastic control and stochastic machine\nlearning. This newfound connection allows us to rigorously establish the\nexistence and uniqueness of the solution for the probabilistic Lambert problem.\nThe same connection also helps to numerically solve the probabilistic Lambert\nproblem via diffusion regularization, i.e., by leveraging further connection of\nthe OMT with the Schr\\\"odinger bridge problem (SBP). This also shows that the\nprobabilistic Lambert problem with additive dynamic process noise is a\ngeneralized SBP, and can be solved numerically using the so-called\nSchr\\\"odinger factors, as we do in this work. Our analysis leads to solving a\nsystem of reaction-diffusion PDEs where the gravitational potential appears as\nthe reaction rate.\n","authors":["Alexis M. H. Teter","Iman Nodozi","Abhishek Halder"],"pdf_url":"https://arxiv.org/pdf/2401.07961v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02519v1","updated":"2024-10-03T14:28:05Z","published":"2024-10-03T14:28:05Z","title":"Semantic-Guided RL for Interpretable Feature Engineering","summary":" The quality of Machine Learning (ML) models strongly depends on the input\ndata; as such, generating high-quality features is often required to improve\npredictive accuracy. 
This process is referred to as Feature Engineering (FE).\nHowever, since manual feature engineering is time-consuming and requires\ncase-by-case domain knowledge, Automated Feature Engineering (AutoFE) is\ncrucial. A major challenge that remains is to generate interpretable features.\nTo tackle this problem, we introduce SMART, a hybrid approach that uses\nsemantic technologies to guide the generation of interpretable features through\na two-step process: Exploitation and Exploration. The former uses Description\nLogics (DL) to reason on the semantics embedded in Knowledge Graphs (KG) to\ninfer domain-specific features, while the latter exploits the knowledge graph\nto conduct a guided exploration of the search space through Deep Reinforcement\nLearning (DRL). Our experiments on public datasets demonstrate that SMART\nsignificantly improves prediction accuracy while ensuring a high level of\ninterpretability.\n","authors":["Mohamed Bouadi","Arta Alavi","Salima Benbernou","Mourad Ouziri"],"pdf_url":"https://arxiv.org/pdf/2410.02519v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2406.00544"},{"id":"http://arxiv.org/abs/2405.02954v3","updated":"2024-10-03T14:25:07Z","published":"2024-05-05T14:48:13Z","title":"Source-Free Domain Adaptation Guided by Vision and Vision-Language\n Pre-Training","summary":" Source-free domain adaptation (SFDA) aims to adapt a source model trained on\na fully-labeled source domain to a related but unlabeled target domain. While\nthe source model is a key avenue for acquiring target pseudolabels, the\ngenerated pseudolabels may exhibit source bias. In the conventional SFDA\npipeline, a large data (e.g. ImageNet) pre-trained feature extractor is used to\ninitialize the source model at the start of source training, and subsequently\ndiscarded. Despite having diverse features important for generalization, the\npre-trained feature extractor can overfit to the source data distribution\nduring source training and forget relevant target domain knowledge. Rather than\ndiscarding this valuable knowledge, we introduce an integrated framework to\nincorporate pre-trained networks into the target adaptation process. The\nproposed framework is flexible and allows us to plug modern pre-trained\nnetworks into the adaptation process to leverage their stronger representation\nlearning capabilities. For adaptation, we propose the Co-learn algorithm to\nimprove target pseudolabel quality collaboratively through the source model and\na pre-trained feature extractor. Building on the recent success of the\nvision-language model CLIP in zero-shot image recognition, we present an\nextension Co-learn++ to further incorporate CLIP's zero-shot classification\ndecisions. We evaluate on 4 benchmark datasets and include more challenging\nscenarios such as open-set, partial-set and open-partial SFDA. Experimental\nresults demonstrate that our proposed strategy improves adaptation performance\nand can be successfully integrated with existing SFDA methods. 
Project code is\navailable at https://github.com/zwenyu/colearn-plus.\n","authors":["Wenyu Zhang","Li Shen","Chuan-Sheng Foo"],"pdf_url":"https://arxiv.org/pdf/2405.02954v3.pdf","comment":"Extension of ICCV paper arXiv:2212.07585; Published at IJCV"},{"id":"http://arxiv.org/abs/2410.02516v1","updated":"2024-10-03T14:25:02Z","published":"2024-10-03T14:25:02Z","title":"Learning Emergence of Interaction Patterns across Independent RL Agents\n in Multi-Agent Environments","summary":" Many real-world problems, such as controlling swarms of drones and urban\ntraffic, naturally lend themselves to modeling as multi-agent reinforcement\nlearning (RL) problems. However, existing multi-agent RL methods often suffer\nfrom scalability challenges, primarily due to the introduction of communication\namong agents. Consequently, a key challenge lies in adapting the success of\ndeep learning in single-agent RL to the multi-agent setting. In response to\nthis challenge, we propose an approach that fundamentally reimagines\nmulti-agent environments. Unlike conventional methods that model each agent\nindividually with separate networks, our approach, the Bottom Up Network (BUN),\nadopts a unique perspective. BUN treats the collective of multi-agents as a\nunified entity while employing a specialized weight initialization strategy\nthat promotes independent learning. Furthermore, we dynamically establish\nconnections among agents using gradient information, enabling coordination when\nnecessary while maintaining these connections as limited and sparse to\neffectively manage the computational budget. Our extensive empirical\nevaluations across a variety of cooperative multi-agent scenarios, including\ntasks such as cooperative navigation and traffic control, consistently\ndemonstrate BUN's superiority over baseline methods with substantially reduced\ncomputational costs.\n","authors":["Vasanth Reddy Baddam","Suat Gumussoy","Almuatazbellah Boker","Hoda Eldardiry"],"pdf_url":"https://arxiv.org/pdf/2410.02516v1.pdf","comment":"13 pages, 24 figures"},{"id":"http://arxiv.org/abs/2410.02513v1","updated":"2024-10-03T14:22:55Z","published":"2024-10-03T14:22:55Z","title":"Minimax Group Fairness in Strategic Classification","summary":" In strategic classification, agents manipulate their features, at a cost, to\nreceive a positive classification outcome from the learner's classifier. The\ngoal of the learner in such settings is to learn a classifier that is robust to\nstrategic manipulations. While the majority of works in this domain consider\naccuracy as the primary objective of the learner, in this work, we consider\nlearning objectives that have group fairness guarantees in addition to accuracy\nguarantees. We work with the minimax group fairness notion that asks for\nminimizing the maximal group error rate across population groups.\n We formalize a fairness-aware Stackelberg game between a population of agents\nconsisting of several groups, with each group having its own cost function, and\na learner in the agnostic PAC setting in which the learner is working with a\nhypothesis class H. When the cost functions of the agents are separable, we\nshow the existence of an efficient algorithm that finds an approximately\noptimal deterministic classifier for the learner when the number of groups is\nsmall. This algorithm remains efficient, both statistically and\ncomputationally, even when H is the set of all classifiers. 
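A minimal sketch of the weight-initialization idea behind the Bottom Up Network (BUN) summarized above: the multi-agent collective is treated as one network whose cross-agent weight blocks start at zero, so agents initially learn independently, and cross-agent connections would later be introduced sparingly from gradient information. The block layout and scale below are illustrative assumptions, not the paper's released code.

```python
import numpy as np

def block_diagonal_init(n_agents, in_per_agent, out_per_agent, rng=None):
    """Initialize one large weight matrix for the whole collective with zero
    cross-agent blocks (assumption: a simple block-diagonal scheme)."""
    if rng is None:
        rng = np.random.default_rng(0)
    W = np.zeros((n_agents * out_per_agent, n_agents * in_per_agent))
    for a in range(n_agents):
        rows = slice(a * out_per_agent, (a + 1) * out_per_agent)
        cols = slice(a * in_per_agent, (a + 1) * in_per_agent)
        # Each agent's own block is randomly initialized; off-diagonal blocks stay zero.
        W[rows, cols] = rng.normal(scale=0.1, size=(out_per_agent, in_per_agent))
    return W

W0 = block_diagonal_init(n_agents=3, in_per_agent=8, out_per_agent=4)
```

Sparse cross-agent entries could then be unmasked only where accumulated gradient magnitude is largest, keeping the coordination budget small.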
We then consider\ncost functions that are not necessarily separable and show the existence of\noracle-efficient algorithms that find approximately optimal randomized\nclassifiers for the learner when H has finite strategic VC dimension. These\nalgorithms work under the assumption that the learner is fully transparent: the\nlearner draws a classifier from its distribution (randomized classifier) before\nthe agents respond by manipulating their feature vectors. We highlight the\neffectiveness of such transparency in developing oracle-efficient algorithms.\nWe conclude with verifying the efficacy of our algorithms on real data by\nconducting an experimental analysis.\n","authors":["Emily Diana","Saeed Sharifi-Malvajerdi","Ali Vakilian"],"pdf_url":"https://arxiv.org/pdf/2410.02513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16791v2","updated":"2024-10-03T14:22:02Z","published":"2024-09-25T10:09:47Z","title":"Symbolic State Partitioning for Reinforcement Learning","summary":" Tabular reinforcement learning methods cannot operate directly on continuous\nstate spaces. One solution for this problem is to partition the state space. A\ngood partitioning enables generalization during learning and more efficient\nexploitation of prior experiences. Consequently, the learning process becomes\nfaster and produces more reliable policies. However, partitioning introduces\napproximation, which is particularly harmful in the presence of nonlinear\nrelations between state components. An ideal partition should be as coarse as\npossible, while capturing the key structure of the state space for the given\nproblem. This work extracts partitions from the environment dynamics by\nsymbolic execution. We show that symbolic partitioning improves state space\ncoverage with respect to environmental behavior and allows reinforcement\nlearning to perform better for sparse rewards. We evaluate symbolic state space\npartitioning with respect to precision, scalability, learning agent performance\nand state space coverage for the learnt policies.\n","authors":["Mohsen Ghaffari","Mahsa Varshosaz","Einar Broch Johnsen","Andrzej Wąsowski"],"pdf_url":"https://arxiv.org/pdf/2409.16791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02512v1","updated":"2024-10-03T14:21:49Z","published":"2024-10-03T14:21:49Z","title":"SAFLEX: Self-Adaptive Augmentation via Feature Label Extrapolation","summary":" Data augmentation, a cornerstone technique in deep learning, is crucial in\nenhancing model performance, especially with scarce labeled data. While\ntraditional techniques are effective, their reliance on hand-crafted methods\nlimits their applicability across diverse data types and tasks. Although modern\nlearnable augmentation methods offer increased adaptability, they are\ncomputationally expensive and challenging to incorporate within prevalent\naugmentation workflows. In this work, we present a novel, efficient method for\ndata augmentation, effectively bridging the gap between existing augmentation\nstrategies and emerging datasets and learning tasks. We introduce SAFLEX\n(Self-Adaptive Augmentation via Feature Label EXtrapolation), which learns the\nsample weights and soft labels of augmented samples provided by any given\nupstream augmentation pipeline, using a specifically designed efficient bilevel\noptimization algorithm. Remarkably, SAFLEX effectively reduces the noise and\nlabel errors of the upstream augmentation pipeline with a marginal\ncomputational cost. 
As a versatile module, SAFLEX excels across diverse\ndatasets, including natural and medical images and tabular data, showcasing its\nprowess in few-shot learning and out-of-distribution generalization. SAFLEX\nseamlessly integrates with common augmentation strategies like RandAug, CutMix,\nand those from large pre-trained generative models like stable diffusion and is\nalso compatible with frameworks such as CLIP's fine-tuning. Our findings\nhighlight the potential to adapt existing augmentation pipelines for new data\ntypes and tasks, signaling a move towards more adaptable and resilient training\nframeworks.\n","authors":["Mucong Ding","Bang An","Yuancheng Xu","Anirudh Satheesh","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2410.02512v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2408.03350v2","updated":"2024-10-03T14:20:40Z","published":"2024-08-05T20:19:18Z","title":"miniCTX: Neural Theorem Proving with (Long-)Contexts","summary":" Real-world formal theorem proving often depends on a wealth of context,\nincluding definitions, lemmas, comments, file structure, and other information.\nWe introduce miniCTX, which tests a model's ability to prove formal\nmathematical theorems that depend on new context that is not seen during\ntraining. miniCTX contains theorems sourced from real Lean projects and\ntextbooks, each associated with a context that can span tens of thousands of\ntokens. Models are tasked with proving a theorem given access to code from the\ntheorem's repository, which contains context that is needed for the proof. As a\nbaseline for miniCTX, we tested fine-tuning and prompting methods that\ncondition theorem proving on preceding context. Both approaches substantially\noutperform traditional methods that rely solely on state information. We found\nthat this ability to use context is not captured by previous benchmarks such as\nminiF2F. Alongside miniCTX, we offer ntp-toolkit for automatically extracting\nand annotating theorem proving data, making it easy to add new projects into\nminiCTX to ensure that contexts are not seen during training. miniCTX offers a\nchallenging and realistic evaluation of neural theorem provers.\n","authors":["Jiewen Hu","Thomas Zhu","Sean Welleck"],"pdf_url":"https://arxiv.org/pdf/2408.03350v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14488v2","updated":"2024-10-03T14:16:47Z","published":"2024-03-21T15:36:26Z","title":"A Causal Bayesian Network and Probabilistic Programming Based Reasoning\n Framework for Robot Manipulation Under Uncertainty","summary":" Robot object manipulation in real-world environments is challenging because\nrobot operation must be robust to a range of sensing, estimation, and actuation\nuncertainties to avoid potentially unsafe and costly mistakes that are a\nbarrier to their adoption. In this paper, we propose a flexible and\ngeneralisable physics-informed causal Bayesian network (CBN) based framework\nfor a robot to probabilistically reason about candidate manipulation actions,\nto enable robot decision-making robust to arbitrary robot system uncertainties\n-- the first of its kind to use a probabilistic programming language\nimplementation. Using experiments in high-fidelity Gazebo simulation of an\nexemplar block stacking task, we demonstrate our framework's ability to: (1)\npredict manipulation outcomes with high accuracy (Pred Acc: 88.6%); and, (2)\nperform greedy next-best action selection with 94.2% task success rate. 
We also\ndemonstrate our framework's suitability for real-world robot systems with a\ndomestic robot. Thus, we show that by combining probabilistic causal modelling\nwith physics simulations, we can make robot manipulation more robust to system\nuncertainties and hence more feasible for real-world applications. Further, our\ngeneralised reasoning framework can be used and extended for future robotics\nand causality research.\n","authors":["Ricardo Cannizzaro","Michael Groom","Jonathan Routley","Robert Osazuwa Ness","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2403.14488v2.pdf","comment":"7 pages, 7 figures, submitted to the 2025 IEEE Conference on Robotics\n and Automation (ICRA 2025)"},{"id":"http://arxiv.org/abs/2410.02506v1","updated":"2024-10-03T14:14:31Z","published":"2024-10-03T14:14:31Z","title":"Cut the Crap: An Economical Communication Pipeline for LLM-based\n Multi-Agent Systems","summary":" Recent advancements in large language model (LLM)-powered agents have shown\nthat collective intelligence can significantly outperform individual\ncapabilities, largely attributed to the meticulously designed inter-agent\ncommunication topologies. Though impressive in performance, existing\nmulti-agent pipelines inherently introduce substantial token overhead, as well\nas increased economic costs, which pose challenges for their large-scale\ndeployments. In response to this challenge, we propose an economical, simple,\nand robust multi-agent communication framework, termed $\\texttt{AgentPrune}$,\nwhich can seamlessly integrate into mainstream multi-agent systems and prunes\nredundant or even malicious communication messages. Technically,\n$\\texttt{AgentPrune}$ is the first to identify and formally define the\n\\textit{communication redundancy} issue present in current LLM-based\nmulti-agent pipelines, and efficiently performs one-shot pruning on the\nspatial-temporal message-passing graph, yielding a token-economic and\nhigh-performing communication topology. Extensive experiments across six\nbenchmarks demonstrate that $\\texttt{AgentPrune}$ \\textbf{(I)} achieves\ncomparable results as state-of-the-art topologies at merely $\\$5.6$ cost\ncompared to their $\\$43.7$, \\textbf{(II)} integrates seamlessly into existing\nmulti-agent frameworks with $28.1\\%\\sim72.8\\%\\downarrow$ token reduction, and\n\\textbf{(III)} successfully defend against two types of agent-based adversarial\nattacks with $3.5\\%\\sim10.8\\%\\uparrow$ performance boost.\n","authors":["Guibin Zhang","Yanwei Yue","Zhixun Li","Sukwon Yun","Guancheng Wan","Kun Wang","Dawei Cheng","Jeffrey Xu Yu","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02504v1","updated":"2024-10-03T14:09:58Z","published":"2024-10-03T14:09:58Z","title":"Dual Active Learning for Reinforcement Learning from Human Feedback","summary":" Aligning large language models (LLMs) with human preferences is critical to\nrecent advances in generative artificial intelligence. Reinforcement learning\nfrom human feedback (RLHF) is widely applied to achieve this objective. A key\nstep in RLHF is to learn the reward function from human feedback. However,\nhuman feedback is costly and time-consuming, making it essential to collect\nhigh-quality conversation data for human teachers to label. Additionally,\ndifferent human teachers have different levels of expertise. It is thus\ncritical to query the most appropriate teacher for their opinions. 
In this\npaper, we use offline reinforcement learning (RL) to formulate the alignment\nproblem. Motivated by the idea of $D$-optimal design, we first propose a dual\nactive reward learning algorithm for the simultaneous selection of\nconversations and teachers. Next, we apply pessimistic RL to solve the\nalignment problem, based on the learned reward estimator. Theoretically, we\nshow that the reward estimator obtained through our proposed adaptive selection\nstrategy achieves minimal generalized variance asymptotically, and prove that\nthe sub-optimality of our pessimistic policy scales as $O(1/\\sqrt{T})$ with a\ngiven sample budget $T$. Through simulations and experiments on LLMs, we\ndemonstrate the effectiveness of our algorithm and its superiority over\nstate-of-the-arts.\n","authors":["Pangpang Liu","Chengchun Shi","Will Wei Sun"],"pdf_url":"https://arxiv.org/pdf/2410.02504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04557v2","updated":"2024-10-03T14:08:12Z","published":"2023-09-08T19:17:03Z","title":"Regret-Optimal Federated Transfer Learning for Kernel Regression with\n Applications in American Option Pricing","summary":" We propose an optimal iterative scheme for federated transfer learning, where\na central planner has access to datasets ${\\cal D}_1,\\dots,{\\cal D}_N$ for the\nsame learning model $f_{\\theta}$. Our objective is to minimize the cumulative\ndeviation of the generated parameters $\\{\\theta_i(t)\\}_{t=0}^T$ across all $T$\niterations from the specialized parameters\n$\\theta^\\star_{1},\\ldots,\\theta^\\star_N$ obtained for each dataset, while\nrespecting the loss function for the model $f_{\\theta(T)}$ produced by the\nalgorithm upon halting. We only allow for continual communication between each\nof the specialized models (nodes/agents) and the central planner (server), at\neach iteration (round). For the case where the model $f_{\\theta}$ is a\nfinite-rank kernel regression, we derive explicit updates for the\nregret-optimal algorithm. By leveraging symmetries within the regret-optimal\nalgorithm, we further develop a nearly regret-optimal heuristic that runs with\n$\\mathcal{O}(Np^2)$ fewer elementary operations, where $p$ is the dimension of\nthe parameter space. Additionally, we investigate the adversarial robustness of\nthe regret-optimal algorithm showing that an adversary which perturbs $q$\ntraining pairs by at-most $\\varepsilon>0$, across all training sets, cannot\nreduce the regret-optimal algorithm's regret by more than\n$\\mathcal{O}(\\varepsilon q \\bar{N}^{1/2})$, where $\\bar{N}$ is the aggregate\nnumber of training pairs. To validate our theoretical findings, we conduct\nnumerical experiments in the context of American option pricing, utilizing a\nrandomly generated finite-rank kernel.\n","authors":["Xuwei Yang","Anastasis Kratsios","Florian Krach","Matheus Grasselli","Aurelien Lucchi"],"pdf_url":"https://arxiv.org/pdf/2309.04557v2.pdf","comment":"51 pages, 2 figures"},{"id":"http://arxiv.org/abs/2410.02498v1","updated":"2024-10-03T14:00:44Z","published":"2024-10-03T14:00:44Z","title":"Dynamic Gradient Alignment for Online Data Mixing","summary":" The composition of training data mixtures is critical for effectively\ntraining large language models (LLMs), as it directly impacts their performance\non downstream tasks. 
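To make the D-optimal-design intuition mentioned in the dual active reward learning abstract above concrete, the sketch below greedily picks the candidate query (e.g., a conversation/teacher pair represented as a feature vector) whose addition maximizes the log-determinant of the information matrix. This is a generic textbook heuristic offered as illustration, not the paper's exact selection rule.

```python
import numpy as np

def d_optimal_pick(candidates, info_matrix):
    """Greedy D-optimal selection: maximize det(A + x x^T) over candidate x."""
    best_idx, best_logdet = None, -np.inf
    for i, x in enumerate(candidates):
        sign, logdet = np.linalg.slogdet(info_matrix + np.outer(x, x))
        if sign > 0 and logdet > best_logdet:
            best_idx, best_logdet = i, logdet
    return best_idx

rng = np.random.default_rng(0)
pool = [rng.normal(size=4) for _ in range(20)]   # hypothetical query features
chosen = d_optimal_pick(pool, np.eye(4))         # identity as a regularized information matrix
```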
Our goal is to identify an optimal data mixture to\nspecialize an LLM for a specific task with access to only a few examples.\nTraditional approaches to this problem include ad-hoc reweighting methods,\nimportance sampling, and gradient alignment techniques. This paper focuses on\ngradient alignment and introduces Dynamic Gradient Alignment (DGA), a scalable\nonline gradient alignment algorithm. DGA dynamically estimates the pre-training\ndata mixture on which the models' gradients align as well as possible with\nthose of the model on the specific task. DGA is the first gradient alignment\napproach that incurs minimal overhead compared to standard pre-training and\noutputs a competitive model, eliminating the need for retraining the model.\nExperimentally, we demonstrate significant improvements over importance\nsampling in two key scenarios: (i) when the pre-training set is small and\nimportance sampling overfits due to limited data; and (ii) when there is\ninsufficient specialized data, trapping importance sampling on narrow pockets\nof data. Our findings underscore the effectiveness of gradient alignment\nmethods in optimizing training data mixtures, particularly in data-constrained\nenvironments, and offer a practical solution for enhancing LLM performance on\nspecific tasks with limited data availability.\n","authors":["Simin Fan","David Grangier","Pierre Ablin"],"pdf_url":"https://arxiv.org/pdf/2410.02498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02496v1","updated":"2024-10-03T13:59:38Z","published":"2024-10-03T13:59:38Z","title":"Efficient learning of differential network in multi-source\n non-paranormal graphical models","summary":" This paper addresses learning of sparse structural changes, or the differential\nnetwork, between two classes of non-paranormal graphical models. We assume a\nmulti-source and heterogeneous dataset is available for each class, where the\ncovariance matrices are identical for all non-paranormal graphical models. The\ndifferential network, which is encoded by the difference precision matrix, can\nthen be decoded by optimizing a lasso penalized D-trace loss function. To this\naim, an efficient approach is proposed that outputs the exact solution path,\noutperforming the previous methods that only sample from the solution path at\npre-selected regularization parameters. Notably, our proposed method has low\ncomputational complexity, especially when the differential network is sparse.\nOur simulations on synthetic data demonstrate superior performance for our\nstrategy in terms of speed and accuracy compared to an existing method.\nMoreover, our strategy in combining datasets from multiple sources is shown to\nbe very effective in inferring differential networks in real-world problems.\nThis is backed by our experimental results on drug resistance in tumor cancers.\nIn the latter case, our strategy outputs important genes for drug resistance\nwhich are already confirmed by various independent studies.\n","authors":["Mojtaba Nikahd","Seyed Abolfazl Motahari"],"pdf_url":"https://arxiv.org/pdf/2410.02496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02490v1","updated":"2024-10-03T13:55:49Z","published":"2024-10-03T13:55:49Z","title":"Stochastic variance-reduced Gaussian variational inference on the\n Bures-Wasserstein manifold","summary":" Optimization in the Bures-Wasserstein space has been gaining popularity in\nthe machine learning community since it draws connections between variational\ninference and Wasserstein gradient flows. 
The variational inference objective\nfunction of Kullback-Leibler divergence can be written as the sum of the\nnegative entropy and the potential energy, making forward-backward Euler the\nmethod of choice. Notably, the backward step admits a closed-form solution in\nthis case, facilitating the practicality of the scheme. However, the forward\nstep is no longer exact since the Bures-Wasserstein gradient of the potential\nenergy involves \"intractable\" expectations. Recent approaches propose using the\nMonte Carlo method -- in practice a single-sample estimator -- to approximate\nthese terms, resulting in high variance and poor performance. We propose a\nnovel variance-reduced estimator based on the principle of control variates. We\ntheoretically show that this estimator has a smaller variance than the\nMonte-Carlo estimator in scenarios of interest. We also prove that variance\nreduction helps improve the optimization bounds of the current analysis. We\ndemonstrate that the proposed estimator gains order-of-magnitude improvements\nover the previous Bures-Wasserstein methods.\n","authors":["Hoang Phuc Hau Luu","Hanlin Yu","Bernardo Williams","Marcelo Hartmann","Arto Klami"],"pdf_url":"https://arxiv.org/pdf/2410.02490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07103v3","updated":"2024-10-03T13:55:08Z","published":"2024-04-10T15:41:53Z","title":"Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on\n Graphs","summary":" Large language models (LLMs), while exhibiting exceptional performance,\nsuffer from hallucinations, especially on knowledge-intensive tasks. Existing\nworks propose to augment LLMs with individual text units retrieved from\nexternal knowledge corpora to alleviate the issue. However, in many domains,\ntexts are interconnected (e.g., academic papers in a bibliographic graph are\nlinked by citations and co-authorships) which form a (text-attributed) graph.\nThe knowledge in such graphs is encoded not only in single texts/nodes but also\nin their associated connections. To facilitate the research of augmenting LLMs\nwith graphs, we manually construct a Graph Reasoning Benchmark dataset called\nGRBench, containing 1,740 questions that can be answered with the knowledge\nfrom 10 domain graphs. Then, we propose a simple and effective framework called\nGraph Chain-of-thought (Graph-CoT) to augment LLMs with graphs by encouraging\nLLMs to reason on the graph iteratively. Each Graph-CoT iteration consists of\nthree sub-steps: LLM reasoning, LLM-graph interaction, and graph execution. We\nconduct systematic experiments with three LLM backbones on GRBench, where\nGraph-CoT outperforms the baselines consistently. The code is available at\nhttps://github.com/PeterGriffinJin/Graph-CoT.\n","authors":["Bowen Jin","Chulin Xie","Jiawei Zhang","Kashob Kumar Roy","Yu Zhang","Zheng Li","Ruirui Li","Xianfeng Tang","Suhang Wang","Yu Meng","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2404.07103v3.pdf","comment":"21 pages. Code: https://github.com/PeterGriffinJin/Graph-CoT"},{"id":"http://arxiv.org/abs/2407.04069v2","updated":"2024-10-03T13:51:53Z","published":"2024-07-04T17:15:37Z","title":"A Systematic Survey and Critical Review on Evaluating Large Language\n Models: Challenges, Limitations, and Recommendations","summary":" Large Language Models (LLMs) have recently gained significant attention due\nto their remarkable capabilities in performing diverse tasks across various\ndomains. 
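The Graph-CoT loop summarized above alternates LLM reasoning, LLM-graph interaction, and graph execution. The following schematic shows one way such a loop could look; the prompt formats, the `llm` callable, and the dictionary-backed `graph` are placeholders assumed for illustration, not the released Graph-CoT implementation.

```python
def graph_cot(question, llm, graph, max_iters=5):
    """Iterate: reason, emit a graph lookup, execute it, and fold the result back in.
    `llm` is any callable mapping a prompt string to a response string."""
    context = []
    for _ in range(max_iters):
        # (1) LLM reasoning: answer if possible, otherwise ask for more graph information.
        thought = llm(f"Question: {question}\nContext: {context}\nAnswer with FINAL: ... if known.")
        if "FINAL:" in thought:
            return thought.split("FINAL:", 1)[1].strip()
        # (2) LLM-graph interaction: request one lookup in a simple 'node|relation' format.
        query = llm(f"Thought: {thought}\nEmit one lookup as 'node|relation'.")
        node, relation = (part.strip() for part in query.split("|", 1))
        # (3) Graph execution: run the lookup (here, a nested dict stands in for the graph).
        context.append(graph.get(node, {}).get(relation, "not found"))
    return "no answer within the iteration budget"
```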
However, a thorough evaluation of these models is crucial before\ndeploying them in real-world applications to ensure they produce reliable\nperformance. Despite the well-established importance of evaluating LLMs in the\ncommunity, the complexity of the evaluation process has led to varied\nevaluation setups, causing inconsistencies in findings and interpretations. To\naddress this, we systematically review the primary challenges and limitations\ncausing these inconsistencies and unreliable evaluations in various steps of\nLLM evaluation. Based on our critical review, we present our perspectives and\nrecommendations to ensure LLM evaluations are reproducible, reliable, and\nrobust.\n","authors":["Md Tahmid Rahman Laskar","Sawsan Alqahtani","M Saiful Bari","Mizanur Rahman","Mohammad Abdullah Matin Khan","Haidar Khan","Israt Jahan","Amran Bhuiyan","Chee Wei Tan","Md Rizwan Parvez","Enamul Hoque","Shafiq Joty","Jimmy Huang"],"pdf_url":"https://arxiv.org/pdf/2407.04069v2.pdf","comment":"Accepted at EMNLP 2024 (Main Conference)"},{"id":"http://arxiv.org/abs/2409.20195v2","updated":"2024-10-03T13:50:29Z","published":"2024-09-30T11:11:35Z","title":"Forecasting Disease Progression with Parallel Hyperplanes in\n Longitudinal Retinal OCT","summary":" Predicting future disease progression risk from medical images is challenging\ndue to patient heterogeneity, and subtle or unknown imaging biomarkers.\nMoreover, deep learning (DL) methods for survival analysis are susceptible to\nimage domain shifts across scanners. We tackle these issues in the task of\npredicting late dry Age-related Macular Degeneration (dAMD) onset from retinal\nOCT scans. We propose a novel DL method for survival prediction to jointly\npredict from the current scan a risk score, inversely related to\ntime-to-conversion, and the probability of conversion within a time interval\n$t$. It uses a family of parallel hyperplanes generated by parameterizing the\nbias term as a function of $t$. In addition, we develop unsupervised losses\nbased on intra-subject image pairs to ensure that risk scores increase over\ntime and that future conversion predictions are consistent with AMD stage\nprediction using actual scans of future visits. Such losses enable\ndata-efficient fine-tuning of the trained model on new unlabeled datasets\nacquired with a different scanner. Extensive evaluation on two large datasets\nacquired with different scanners resulted in a mean AUROCs of 0.82 for\nDataset-1 and 0.83 for Dataset-2, across prediction intervals of 6,12 and 24\nmonths.\n","authors":["Arunava Chakravarty","Taha Emre","Dmitrii Lachinov","Antoine Rivail","Hendrik Scholl","Lars Fritsche","Sobha Sivaprasad","Daniel Rueckert","Andrew Lotery","Ursula Schmidt-Erfurth","Hrvoje Bogunović"],"pdf_url":"https://arxiv.org/pdf/2409.20195v2.pdf","comment":"accepted in MICCAI 2024"},{"id":"http://arxiv.org/abs/2410.02486v1","updated":"2024-10-03T13:48:35Z","published":"2024-10-03T13:48:35Z","title":"Encryption-Friendly LLM Architecture","summary":" Large language models (LLMs) offer personalized responses based on user\ninteractions, but this use case raises serious privacy concerns. Homomorphic\nencryption (HE) is a cryptographic protocol supporting arithmetic computations\nin encrypted states and provides a potential solution for privacy-preserving\nmachine learning (PPML). However, the computational intensity of transformers\nposes challenges for applying HE to LLMs. 
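A rough sketch of the parallel-hyperplanes idea from the disease-progression abstract above: a single shared direction produces the risk score, and a horizon-dependent bias turns it into a probability of conversion within each interval. The cumulative-softplus parameterization that keeps the hyperplanes ordered is an assumption made for illustration.

```python
import torch
import torch.nn as nn

class ParallelHyperplaneHead(nn.Module):
    """Shared normal vector w plus one bias per horizon t (e.g., 6, 12, 24 months)."""
    def __init__(self, feat_dim, n_horizons=3):
        super().__init__()
        self.w = nn.Linear(feat_dim, 1, bias=False)            # shared direction -> risk score
        self.raw_bias = nn.Parameter(torch.zeros(n_horizons))  # one offset per horizon

    def forward(self, features):
        risk = self.w(features)  # (B, 1), inversely related to time-to-conversion
        # Cumulative positive offsets keep the hyperplanes parallel and ordered,
        # so the predicted conversion probability is non-decreasing in the horizon.
        offsets = torch.cumsum(nn.functional.softplus(self.raw_bias), dim=0)
        return risk.squeeze(-1), torch.sigmoid(risk + offsets)  # risk score, P(conversion within t)
```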
In this work, we propose a modified\nHE-friendly transformer architecture with an emphasis on inference following\npersonalized (private) fine-tuning. Utilizing LoRA fine-tuning and Gaussian\nkernels, we achieve significant computational speedups -- 6.94x for fine-tuning\nand 2.3x for inference -- while maintaining performance comparable to plaintext\nmodels. Our findings provide a viable proof of concept for offering\nprivacy-preserving LLM services in areas where data protection is crucial.\n","authors":["Donghwan Rho","Taeseong Kim","Minje Park","Jung Woo Kim","Hyunsik Chae","Jung Hee Cheon","Ernest K. Ryu"],"pdf_url":"https://arxiv.org/pdf/2410.02486v1.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2312.02783v3","updated":"2024-10-03T13:47:02Z","published":"2023-12-05T14:14:27Z","title":"Large Language Models on Graphs: A Comprehensive Survey","summary":" Large language models (LLMs), such as GPT4 and LLaMA, are creating\nsignificant advancements in natural language processing, due to their strong\ntext encoding/decoding ability and newly found emergent capability (e.g.,\nreasoning). While LLMs are mainly designed to process pure texts, there are\nmany real-world scenarios where text data is associated with rich structure\ninformation in the form of graphs (e.g., academic networks, and e-commerce\nnetworks) or scenarios where graph data is paired with rich textual information\n(e.g., molecules with descriptions). Besides, although LLMs have shown their\npure text-based reasoning ability, it is underexplored whether such ability can\nbe generalized to graphs (i.e., graph-based reasoning). In this paper, we\nprovide a systematic review of scenarios and techniques related to large\nlanguage models on graphs. We first summarize potential scenarios of adopting\nLLMs on graphs into three categories, namely pure graphs, text-attributed\ngraphs, and text-paired graphs. We then discuss detailed techniques for\nutilizing LLMs on graphs, including LLM as Predictor, LLM as Encoder, and LLM\nas Aligner, and compare the advantages and disadvantages of different schools\nof models. Furthermore, we discuss the real-world applications of such methods\nand summarize open-source codes and benchmark datasets. Finally, we conclude\nwith potential future research directions in this fast-growing field. The\nrelated source can be found at\nhttps://github.com/PeterGriffinJin/Awesome-Language-Model-on-Graphs.\n","authors":["Bowen Jin","Gang Liu","Chi Han","Meng Jiang","Heng Ji","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2312.02783v3.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2410.02479v1","updated":"2024-10-03T13:36:02Z","published":"2024-10-03T13:36:02Z","title":"Cross-Embodiment Dexterous Grasping with Reinforcement Learning","summary":" Dexterous hands exhibit significant potential for complex real-world grasping\ntasks. While recent studies have primarily focused on learning policies for\nspecific robotic hands, the development of a universal policy that controls\ndiverse dexterous hands remains largely unexplored. In this work, we study the\nlearning of cross-embodiment dexterous grasping policies using reinforcement\nlearning (RL). Inspired by the capability of human hands to control various\ndexterous hands through teleoperation, we propose a universal action space\nbased on the human hand's eigengrasps. The policy outputs eigengrasp actions\nthat are then converted into specific joint actions for each robot hand through\na retargeting mapping. 
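The eigengrasp action space described in the cross-embodiment grasping abstract above can be illustrated with a simple linear retargeting step: a low-dimensional eigengrasp action is mapped to per-hand joint targets through a basis matrix and clipped to joint limits. The basis and dimensions below are synthetic placeholders, not the paper's learned mapping.

```python
import numpy as np

def eigengrasp_to_joints(eigen_action, mean_pose, basis, joint_low, joint_high):
    """Map a low-dimensional eigengrasp action to joint targets for one hand."""
    joints = mean_pose + basis @ eigen_action       # linear combination of grasp synergies
    return np.clip(joints, joint_low, joint_high)   # respect per-joint limits

# Toy example: a 22-DoF hand driven by a 5-D eigengrasp action.
rng = np.random.default_rng(0)
basis = rng.normal(scale=0.1, size=(22, 5))
targets = eigengrasp_to_joints(rng.uniform(-1, 1, size=5), np.zeros(22), basis, -1.5, 1.5)
```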
We simplify the robot hand's proprioception to include\nonly the positions of fingertips and the palm, offering a unified observation\nspace across different robot hands. Our approach demonstrates an 80% success\nrate in grasping objects from the YCB dataset across four distinct embodiments\nusing a single vision-based policy. Additionally, our policy exhibits zero-shot\ngeneralization to two previously unseen embodiments and significant improvement\nin efficient finetuning. For further details and videos, visit our project page\nhttps://sites.google.com/view/crossdex.\n","authors":["Haoqi Yuan","Bohan Zhou","Yuhui Fu","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2410.02479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02478v1","updated":"2024-10-03T13:35:28Z","published":"2024-10-03T13:35:28Z","title":"Temporal Predictive Coding for Gradient Compression in Distributed\n Learning","summary":" This paper proposes a prediction-based gradient compression method for\ndistributed learning with event-triggered communication. Our goal is to reduce\nthe amount of information transmitted from the distributed agents to the\nparameter server by exploiting temporal correlation in the local gradients. We\nuse a linear predictor that \\textit{combines past gradients to form a\nprediction of the current gradient}, with coefficients that are optimized by\nsolving a least-square problem. In each iteration, every agent transmits the\npredictor coefficients to the server such that the predicted local gradient can\nbe computed. The difference between the true local gradient and the predicted\none, termed the \\textit{prediction residual, is only transmitted when its norm\nis above some threshold.} When this additional communication step is omitted,\nthe server uses the prediction as the estimated gradient. This proposed design\nshows notable performance gains compared to existing methods in the literature,\nachieving convergence with reduced communication costs.\n","authors":["Adrian Edin","Zheng Chen","Michel Kieffer","Mikael Johansson"],"pdf_url":"https://arxiv.org/pdf/2410.02478v1.pdf","comment":"8 pages, 3 figures, presented at the 60th Allerton conference on\n Communication, Control, and Computing"},{"id":"http://arxiv.org/abs/2410.02477v1","updated":"2024-10-03T13:35:15Z","published":"2024-10-03T13:35:15Z","title":"Learning Diverse Bimanual Dexterous Manipulation Skills from Human\n Demonstrations","summary":" Bimanual dexterous manipulation is a critical yet underexplored area in\nrobotics. Its high-dimensional action space and inherent task complexity\npresent significant challenges for policy learning, and the limited task\ndiversity in existing benchmarks hinders general-purpose skill development.\nExisting approaches largely depend on reinforcement learning, often constrained\nby intricately designed reward functions tailored to a narrow set of tasks. In\nthis work, we present a novel approach for efficiently learning diverse\nbimanual dexterous skills from abundant human demonstrations. Specifically, we\nintroduce BiDexHD, a framework that unifies task construction from existing\nbimanual datasets and employs teacher-student policy learning to address all\ntasks. The teacher learns state-based policies using a general two-stage reward\nfunction across tasks with shared behaviors, while the student distills the\nlearned multi-task policies into a vision-based policy. 
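The prediction-based gradient compression scheme in the temporal predictive coding abstract above is concrete enough for a small sketch: fit linear predictor coefficients over past gradients by least squares, transmit the coefficients, and transmit the prediction residual only when its norm exceeds a threshold. The memory length, threshold, and reconstruction below are illustrative assumptions rather than the authors' protocol.

```python
import numpy as np

def compress_step(past_grads, current_grad, threshold):
    """One agent-side step: coefficients always sent, residual sent only if large."""
    A = np.stack(past_grads, axis=1)                            # (dim, K) matrix of past gradients
    coeffs, *_ = np.linalg.lstsq(A, current_grad, rcond=None)   # least-squares predictor
    predicted = A @ coeffs
    residual = current_grad - predicted
    send_residual = np.linalg.norm(residual) > threshold        # event-triggered transmission
    return coeffs, residual if send_residual else None

# Server-side reconstruction from its own copy of the gradient history.
rng = np.random.default_rng(0)
history = [rng.normal(size=10) for _ in range(3)]
g_t = 0.9 * history[0] + 0.05 * rng.normal(size=10)
coeffs, residual = compress_step(history, g_t, threshold=0.5)
estimate = np.stack(history, axis=1) @ coeffs + (residual if residual is not None else 0.0)
```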
With BiDexHD, scalable\nlearning of numerous bimanual dexterous skills from auto-constructed tasks\nbecomes feasible, offering promising advances toward universal bimanual\ndexterous manipulation. Our empirical evaluation on the TACO dataset, spanning\n141 tasks across six categories, demonstrates a task fulfillment rate of 74.59%\non trained tasks and 51.07% on unseen tasks, showcasing the effectiveness and\ncompetitive zero-shot generalization capabilities of BiDexHD. For videos and\nmore information, visit our project page https://sites.google.com/view/bidexhd.\n","authors":["Bohan Zhou","Haoqi Yuan","Yuhui Fu","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2410.02477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02476v1","updated":"2024-10-03T13:35:08Z","published":"2024-10-03T13:35:08Z","title":"Online Convex Optimization with a Separation Oracle","summary":" In this paper, we introduce a new projection-free algorithm for Online Convex\nOptimization (OCO) with a state-of-the-art regret guarantee among\nseparation-based algorithms. Existing projection-free methods based on the\nclassical Frank-Wolfe algorithm achieve a suboptimal regret bound of\n$O(T^{3/4})$, while more recent separation-based approaches guarantee a regret\nbound of $O(\\kappa \\sqrt{T})$, where $\\kappa$ denotes the asphericity of the\nfeasible set, defined as the ratio of the radii of the containing and contained\nballs. However, for ill-conditioned sets, $\\kappa$ can be arbitrarily large,\npotentially leading to poor performance. Our algorithm achieves a regret bound\nof $\\tilde{O}(\\sqrt{dT} + \\kappa d)$, while requiring only $\\tilde{O}(1)$ calls\nto a separation oracle per round. Crucially, the main term in the bound,\n$\\tilde{O}(\\sqrt{d T})$, is independent of $\\kappa$, addressing the limitations\nof previous methods. Additionally, as a by-product of our analysis, we recover\nthe $O(\\kappa \\sqrt{T})$ regret bound of existing OCO algorithms with a more\nstraightforward analysis and improve the regret bound for projection-free\nonline exp-concave optimization. Finally, for constrained stochastic convex\noptimization, we achieve a state-of-the-art convergence rate of\n$\\tilde{O}(\\sigma/\\sqrt{T} + \\kappa d/T)$, where $\\sigma$ represents the noise\nin the stochastic gradients, while requiring only $\\tilde{O}(1)$ calls to a\nseparation oracle per iteration.\n","authors":["Zakaria Mhammedi"],"pdf_url":"https://arxiv.org/pdf/2410.02476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02475v1","updated":"2024-10-03T13:33:02Z","published":"2024-10-03T13:33:02Z","title":"Efficient Residual Learning with Mixture-of-Experts for Universal\n Dexterous Grasping","summary":" Universal dexterous grasping across diverse objects presents a fundamental\nyet formidable challenge in robot learning. Existing approaches using\nreinforcement learning (RL) to develop policies on extensive object datasets\nface critical limitations, including complex curriculum design for multi-task\nlearning and limited generalization to unseen objects. To overcome these\nchallenges, we introduce ResDex, a novel approach that integrates residual\npolicy learning with a mixture-of-experts (MoE) framework. ResDex is\ndistinguished by its use of geometry-unaware base policies that are efficiently\nacquired on individual objects and capable of generalizing across a wide range\nof unseen objects. Our MoE framework incorporates several base policies to\nfacilitate diverse grasping styles suitable for various objects. 
By learning\nresidual actions alongside weights that combine these base policies, ResDex\nenables efficient multi-task RL for universal dexterous grasping. ResDex\nachieves state-of-the-art performance on the DexGraspNet dataset comprising\n3,200 objects with an 88.8% success rate. It exhibits no generalization gap\nwith unseen objects and demonstrates superior training efficiency, mastering\nall tasks within only 12 hours on a single GPU.\n","authors":["Ziye Huang","Haoqi Yuan","Yuhui Fu","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2410.02475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02472v1","updated":"2024-10-03T13:25:15Z","published":"2024-10-03T13:25:15Z","title":"Meta-Models: An Architecture for Decoding LLM Behaviors Through\n Interpreted Embeddings and Natural Language","summary":" As Large Language Models (LLMs) become increasingly integrated into our daily\nlives, the potential harms from deceptive behavior underlie the need for\nfaithfully interpreting their decision-making. While traditional probing\nmethods have shown some effectiveness, they remain best for narrowly scoped\ntasks while more comprehensive explanations are still necessary. To this end,\nwe investigate meta-models-an architecture using a \"meta-model\" that takes\nactivations from an \"input-model\" and answers natural language questions about\nthe input-model's behaviors. We evaluate the meta-model's ability to generalize\nby training them on selected task types and assessing their out-of-distribution\nperformance in deceptive scenarios. Our findings show that meta-models\ngeneralize well to out-of-distribution tasks and point towards opportunities\nfor future research in this area.\n","authors":["Anthony Costarelli","Mat Allen","Severin Field","Joshua Clymer"],"pdf_url":"https://arxiv.org/pdf/2410.02472v1.pdf","comment":"11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2410.02467v1","updated":"2024-10-03T13:17:06Z","published":"2024-10-03T13:17:06Z","title":"Towards a Theoretical Understanding of Memorization in Diffusion Models","summary":" As diffusion probabilistic models (DPMs) are being employed as mainstream\nmodels for Generative Artificial Intelligence (GenAI), the study of their\nmemorization of training data has attracted growing attention. Existing works\nin this direction aim to establish an understanding of whether or to what\nextent DPMs learn via memorization. Such an understanding is crucial for\nidentifying potential risks of data leakage and copyright infringement in\ndiffusion models and, more importantly, for trustworthy application of GenAI.\nExisting works revealed that conditional DPMs are more prone to training data\nmemorization than unconditional DPMs, and the motivated data extraction methods\nare mostly for conditional DPMs. However, these understandings are primarily\nempirical, and extracting training data from unconditional models has been\nfound to be extremely challenging. In this work, we provide a theoretical\nunderstanding of memorization in both conditional and unconditional DPMs under\nthe assumption of model convergence. Our theoretical analysis indicates that\nextracting data from unconditional models can also be effective by constructing\na proper surrogate condition. Based on this result, we propose a novel data\nextraction method named \\textbf{Surrogate condItional Data Extraction (SIDE)}\nthat leverages a time-dependent classifier trained on the generated data as a\nsurrogate condition to extract training data from unconditional DPMs. 
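A minimal sketch of the ResDex-style action composition described above: a softmax-weighted mixture of frozen, geometry-unaware base policies plus a learned residual action. The gating weights and residual are passed in as plain arrays here; in the paper they would come from the learned multi-task policy.

```python
import numpy as np

def resdex_action(obs, base_policies, gate_logits, residual):
    """Weighted mixture of base policies plus a residual correction (illustrative)."""
    w = np.exp(gate_logits) / np.sum(np.exp(gate_logits))      # softmax over experts
    mixture = sum(wi * policy(obs) for wi, policy in zip(w, base_policies))
    return mixture + residual

# Toy usage with two dummy base policies over a 6-D action space.
base = [lambda o: np.full(6, 0.10), lambda o: np.full(6, -0.05)]
action = resdex_action(np.zeros(10), base, gate_logits=np.array([0.2, -0.1]), residual=np.zeros(6))
```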
Empirical\nresults demonstrate that our SIDE can extract training data in challenging\nscenarios where previous methods fail, and it is, on average, over 50\\% more\neffective across different scales of the CelebA dataset.\n","authors":["Yunhao Chen","Xingjun Ma","Difan Zou","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2410.02467v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2406.12752"},{"id":"http://arxiv.org/abs/2409.07431v2","updated":"2024-10-03T13:07:25Z","published":"2024-09-11T17:21:59Z","title":"Synthetic continued pretraining","summary":" Pretraining on large-scale, unstructured internet text enables language\nmodels to acquire a significant amount of world knowledge. However, this\nknowledge acquisition is data-inefficient--to learn a given fact, models must\nbe trained on hundreds to thousands of diverse representations of it. This\nposes a challenge when adapting a pretrained model to a small corpus of\ndomain-specific documents, where each fact may appear rarely or only once. We\npropose to bridge this gap with synthetic continued pretraining: using the\nsmall domain-specific corpus to synthesize a large corpus more amenable to\nlearning, and then performing continued pretraining on the synthesized corpus.\nWe instantiate this proposal with EntiGraph, a synthetic data augmentation\nalgorithm that extracts salient entities from the source documents and then\ngenerates diverse text by drawing connections between the sampled entities.\nSynthetic continued pretraining with EntiGraph enables a language model to\nanswer questions and follow generic instructions related to the source\ndocuments without access to them. If, instead, the source documents are\navailable at inference time, we show that the knowledge acquired through our\napproach compounds with retrieval-augmented generation. To better understand\nthese results, we build a simple mathematical model of EntiGraph, and show how\nsynthetic data augmentation can \"rearrange\" knowledge to enable more\ndata-efficient learning.\n","authors":["Zitong Yang","Neil Band","Shuangping Li","Emmanuel Candès","Tatsunori Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2409.07431v2.pdf","comment":"Updated organization of experimental results and methods\n introduction. Released the dataset and model weights artifact"},{"id":"http://arxiv.org/abs/2410.02453v1","updated":"2024-10-03T13:02:07Z","published":"2024-10-03T13:02:07Z","title":"Quantifying User Coherence: A Unified Framework for Cross-Domain\n Recommendation Analysis","summary":" The effectiveness of Recommender Systems (RS) is closely tied to the quality\nand distinctiveness of user profiles, yet despite many advancements in raw\nperformance, the sensitivity of RS to user profile quality remains\nunder-researched. This paper introduces novel information-theoretic measures\nfor understanding recommender systems: a \"surprise\" measure quantifying users'\ndeviations from popular choices, and a \"conditional surprise\" measure capturing\nuser interaction coherence. We evaluate 7 recommendation algorithms across 9\ndatasets, revealing the relationships between our measures and standard\nperformance metrics. Using a rigorous statistical framework, our analysis\nquantifies how much user profile density and information measures impact\nalgorithm performance across domains. 
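The EntiGraph recipe summarized above (extract salient entities, then synthesize text about relations between sampled entity pairs) can be outlined in a few lines. The `extract_entities` and `generate_text` callables stand in for LLM calls and are assumptions, not the released pipeline.

```python
import itertools
import random

def entigraph_augment(documents, extract_entities, generate_text, max_pairs=100, seed=0):
    """Build a synthetic corpus by describing connections between entity pairs."""
    entities = set()
    for doc in documents:
        entities.update(extract_entities(doc))   # salient entities from the source corpus
    pairs = list(itertools.combinations(sorted(entities), 2))
    random.Random(seed).shuffle(pairs)
    # Each generated passage draws a connection between one sampled pair, grounded in the documents.
    return [generate_text(a, b, documents) for a, b in pairs[:max_pairs]]
```

The resulting synthetic corpus would then be used for continued pretraining, optionally combined with retrieval at inference time as the abstract notes.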
By segmenting users based on these\nmeasures, we achieve improved performance with reduced data and show that\nsimpler algorithms can match complex ones for low-coherence users.\nAdditionally, we employ our measures to analyze how well different\nrecommendation algorithms maintain the coherence and diversity of user\npreferences in their predictions, providing insights into algorithm behavior.\nThis work advances the theoretical understanding of user behavior and practical\nheuristics for personalized recommendation systems, promoting more efficient\nand adaptive architectures.\n","authors":["Michaël Soumm","Alexandre Fournier-Montgieux","Adrian Popescu","Bertrand Delezoide"],"pdf_url":"https://arxiv.org/pdf/2410.02453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02450v1","updated":"2024-10-03T12:52:36Z","published":"2024-10-03T12:52:36Z","title":"Personalized Federated Learning for Generative AI-Assisted Semantic\n Communications","summary":" Semantic Communication (SC) focuses on transmitting only the semantic\ninformation rather than the raw data. This approach offers an efficient\nsolution to the issue of spectrum resource utilization caused by the various\nintelligent applications on Mobile Users (MUs). Generative Artificial\nIntelligence (GAI) models have recently exhibited remarkable content generation\nand signal processing capabilities, presenting new opportunities for enhancing\nSC. Therefore, we propose a GAI-assisted SC (GSC) model deployed between MUs\nand the Base Station (BS). Then, to train the GSC model using the local data of\nMUs while ensuring privacy and accommodating heterogeneous requirements of MUs,\nwe introduce Personalized Semantic Federated Learning (PSFL). This approach\nincorporates a novel Personalized Local Distillation (PLD) and Adaptive Global\nPruning (AGP). In PLD, each MU selects a personalized GSC model as a mentor\ntailored to its local resources and a unified Convolutional Neural Networks\n(CNN)-based SC (CSC) model as a student. This mentor model is then distilled\ninto the student model for global aggregation. In AGP, we perform network\npruning on the aggregated global model according to real-time communication\nenvironments, reducing communication energy. Finally, numerical results\ndemonstrate the feasibility and efficiency of the proposed PSFL scheme.\n","authors":["Yubo Peng","Feibo Jiang","Li Dong","Kezhi Wang","Kun Yang"],"pdf_url":"https://arxiv.org/pdf/2410.02450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03086v2","updated":"2024-10-03T12:45:48Z","published":"2024-07-03T13:15:12Z","title":"Effective Heterogeneous Federated Learning via Efficient\n Hypernetwork-based Weight Generation","summary":" While federated learning leverages distributed client resources, it faces\nchallenges due to heterogeneous client capabilities. This necessitates\nallocating models suited to clients' resources and careful parameter\naggregation to accommodate this heterogeneity. We propose HypeMeFed, a novel\nfederated learning framework for supporting client heterogeneity by combining a\nmulti-exit network architecture with hypernetwork-based model weight\ngeneration. This approach aligns the feature spaces of heterogeneous model\nlayers and resolves per-layer information disparity during weight aggregation.\nTo practically realize HypeMeFed, we also propose a low-rank factorization\napproach to minimize computation and memory overhead associated with\nhypernetworks. 
Our evaluations on a real-world heterogeneous device testbed\nindicate that HypeMeFed enhances accuracy by 5.12% over FedAvg, reduces the\nhypernetwork memory requirements by 98.22%, and accelerates its operations by\n1.86x compared to a naive hypernetwork approach. These results demonstrate\nHypeMeFed's effectiveness in leveraging and engaging heterogeneous clients for\nfederated learning.\n","authors":["Yujin Shin","Kichang Lee","Sungmin Lee","You Rim Choi","Hyung-Sin Kim","JeongGil Ko"],"pdf_url":"https://arxiv.org/pdf/2407.03086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14260v2","updated":"2024-10-03T12:42:40Z","published":"2024-05-23T07:40:21Z","title":"Graph Sparsification via Mixture of Graphs","summary":" Graph Neural Networks (GNNs) have demonstrated superior performance across\nvarious graph learning tasks but face significant computational challenges when\napplied to large-scale graphs. One effective approach to mitigate these\nchallenges is graph sparsification, which involves removing non-essential edges\nto reduce computational overhead. However, previous graph sparsification\nmethods often rely on a single global sparsity setting and uniform pruning\ncriteria, failing to provide customized sparsification schemes for each node's\ncomplex local context. In this paper, we introduce Mixture-of-Graphs (MoG),\nleveraging the concept of Mixture-of-Experts (MoE), to dynamically select\ntailored pruning solutions for each node. Specifically, MoG incorporates\nmultiple sparsifier experts, each characterized by unique sparsity levels and\npruning criteria, and selects the appropriate experts for each node.\nSubsequently, MoG performs a mixture of the sparse graphs produced by different\nexperts on the Grassmann manifold to derive an optimal sparse graph. One\nnotable property of MoG is its entirely local nature, as it depends on the\nspecific circumstances of each individual node. Extensive experiments on four\nlarge-scale OGB datasets and two superpixel datasets, equipped with five GNN\nbackbones, demonstrate that MoG (I) identifies subgraphs at higher sparsity\nlevels ($8.67\\%\\sim 50.85\\%$), with performance equal to or better than the\ndense graph, (II) achieves $1.47-2.62\\times$ speedup in GNN inference with\nnegligible performance drop, and (III) boosts ``top-student'' GNN performance\n($1.02\\%\\uparrow$ on RevGNN+\\textsc{ogbn-proteins} and $1.74\\%\\uparrow$ on\nDeeperGCN+\\textsc{ogbg-ppa}).\n","authors":["Guibin Zhang","Xiangguo Sun","Yanwei Yue","Chonghe Jiang","Kun Wang","Tianlong Chen","Shirui Pan"],"pdf_url":"https://arxiv.org/pdf/2405.14260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02443v1","updated":"2024-10-03T12:40:52Z","published":"2024-10-03T12:40:52Z","title":"Clinnova Federated Learning Proof of Concept: Key Takeaways from a\n Cross-border Collaboration","summary":" Clinnova, a collaborative initiative involving France, Germany, Switzerland,\nand Luxembourg, is dedicated to unlocking the power of precision medicine\nthrough data federation, standardization, and interoperability. This European\nGreater Region initiative seeks to create an interoperable European standard\nusing artificial intelligence (AI) and data science to enhance healthcare\noutcomes and efficiency. Key components include multidisciplinary research\ncenters, a federated biobanking strategy, a digital health innovation platform,\nand a federated AI strategy. 
It targets inflammatory bowel disease, rheumatoid\ndiseases, and multiple sclerosis (MS), emphasizing data quality to develop AI\nalgorithms for personalized treatment and translational research.\n The IHU Strasbourg (Institute of Minimal-invasive Surgery) has the lead in\nthis initiative to develop the federated learning (FL) proof of concept (POC)\nthat will serve as a foundation for advancing AI in healthcare. At its core,\nClinnova-MS aims to enhance MS patient care by using FL to develop more\naccurate models that detect disease progression, guide interventions, and\nvalidate digital biomarkers across multiple sites. This technical report\npresents insights and key takeaways from the first cross-border federated POC\non MS segmentation of MRI images within the Clinnova framework. While our work\nmarks a significant milestone in advancing MS segmentation through cross-border\ncollaboration, it also underscores the importance of addressing technical,\nlogistical, and ethical considerations to realize the full potential of FL in\nhealthcare settings.\n","authors":["Julia Alekseenko","Bram Stieltjes","Michael Bach","Melanie Boerries","Oliver Opitz","Alexandros Karargyris","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2410.02443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02438v1","updated":"2024-10-03T12:35:17Z","published":"2024-10-03T12:35:17Z","title":"Learning K-U-Net with constant complexity: An Application to time series\n forecasting","summary":" Training deep models for time series forecasting is a critical task with an\ninherent challenge of time complexity. While current methods generally ensure\nlinear time complexity, our observations on temporal redundancy show that\nhigh-level features are learned 98.44\\% slower than low-level features. To\naddress this issue, we introduce a new exponentially weighted stochastic\ngradient descent algorithm designed to achieve constant time complexity in deep\nlearning models. We prove that the theoretical complexity of this learning\nmethod is constant. Evaluation of this method on Kernel U-Net (K-U-Net) on\nsynthetic datasets shows a significant reduction in complexity while improving\nthe accuracy of the test set.\n","authors":["Jiang You","Arben Cela","René Natowicz","Jacob Ouanounou","Patrick Siarry"],"pdf_url":"https://arxiv.org/pdf/2410.02438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02433v1","updated":"2024-10-03T12:28:13Z","published":"2024-10-03T12:28:13Z","title":"Better Call SAUL: Fluent and Consistent Language Model Editing with\n Generation Regularization","summary":" To ensure large language models contain up-to-date knowledge, they need to be\nupdated regularly. However, model editing is challenging as it might also\naffect knowledge that is unrelated to the new data. State-of-the-art methods\nidentify parameters associated with specific knowledge and then modify them via\ndirect weight updates. However, these locate-and-edit methods suffer from heavy\ncomputational overhead and lack theoretical validation. In contrast, directly\nfine-tuning the model on requested edits affects the model's behavior on\nunrelated knowledge, and significantly damages the model's generation fluency\nand consistency. To address these challenges, we propose SAUL, a streamlined\nmodel editing method that uses sentence concatenation with augmented random\nfacts for generation regularization. 
Evaluations on three model editing\nbenchmarks show that SAUL is a practical and reliable solution for model\nediting outperforming state-of-the-art methods while maintaining generation\nquality and reducing computational overhead.\n","authors":["Mingyang Wang","Lukas Lange","Heike Adel","Jannik Strötgen","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2410.02433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02430v1","updated":"2024-10-03T12:25:01Z","published":"2024-10-03T12:25:01Z","title":"Predictive Attractor Models","summary":" Sequential memory, the ability to form and accurately recall a sequence of\nevents or stimuli in the correct order, is a fundamental prerequisite for\nbiological and artificial intelligence as it underpins numerous cognitive\nfunctions (e.g., language comprehension, planning, episodic memory formation,\netc.) However, existing methods of sequential memory suffer from catastrophic\nforgetting, limited capacity, slow iterative learning procedures, low-order\nMarkov memory, and, most importantly, the inability to represent and generate\nmultiple valid future possibilities stemming from the same context. Inspired by\nbiologically plausible neuroscience theories of cognition, we propose\n\\textit{Predictive Attractor Models (PAM)}, a novel sequence memory\narchitecture with desirable generative properties. PAM is a streaming model\nthat learns a sequence in an online, continuous manner by observing each input\n\\textit{only once}. Additionally, we find that PAM avoids catastrophic\nforgetting by uniquely representing past context through lateral inhibition in\ncortical minicolumns, which prevents new memories from overwriting previously\nlearned knowledge. PAM generates future predictions by sampling from a union\nset of predicted possibilities; this generative ability is realized through an\nattractor model trained alongside the predictor. We show that PAM is trained\nwith local computations through Hebbian plasticity rules in a biologically\nplausible framework. Other desirable traits (e.g., noise tolerance, CPU-based\nlearning, capacity scaling) are discussed throughout the paper. Our findings\nsuggest that PAM represents a significant step forward in the pursuit of\nbiologically plausible and computationally efficient sequential memory models,\nwith broad implications for cognitive science and artificial intelligence\nresearch.\n","authors":["Ramy Mounir","Sudeep Sarkar"],"pdf_url":"https://arxiv.org/pdf/2410.02430v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.02425v1","updated":"2024-10-03T12:19:06Z","published":"2024-10-03T12:19:06Z","title":"LLM-Pilot: Characterize and Optimize Performance of your LLM Inference\n Services","summary":" As Large Language Models (LLMs) are rapidly growing in popularity, LLM\ninference services must be able to serve requests from thousands of users while\nsatisfying performance requirements. The performance of an LLM inference\nservice is largely determined by the hardware onto which it is deployed, but\nunderstanding of which hardware will deliver on performance requirements\nremains challenging. In this work we present LLM-Pilot - a first-of-its-kind\nsystem for characterizing and predicting performance of LLM inference services.\nLLM-Pilot performs benchmarking of LLM inference services, under a realistic\nworkload, across a variety of GPUs, and optimizes the service configuration for\neach considered GPU to maximize performance. 
Finally, using this\ncharacterization data, LLM-Pilot learns a predictive model, which can be used\nto recommend the most cost-effective hardware for a previously unseen LLM.\nCompared to existing methods, LLM-Pilot can deliver on performance requirements\n33% more frequently, whilst reducing costs by 60% on average.\n","authors":["Małgorzata Łazuka","Andreea Anghel","Thomas Parnell"],"pdf_url":"https://arxiv.org/pdf/2410.02425v1.pdf","comment":"Accepted to the International Conference for High Performance\n Computing, Networking, Storage and Analysis (SC '24)"},{"id":"http://arxiv.org/abs/2405.03582v2","updated":"2024-10-03T12:18:09Z","published":"2024-05-06T15:53:55Z","title":"Functional Latent Dynamics for Irregularly Sampled Time Series\n Forecasting","summary":" Irregularly sampled time series with missing values are often observed in\nmultiple real-world applications such as healthcare, climate and astronomy.\nThey pose a significant challenge to standard deep learning models that operate\nonly on fully observed and regularly sampled time series. In order to capture\nthe continuous dynamics of the irregular time series, many models rely on\nsolving an Ordinary Differential Equation (ODE) in the hidden state. These\nODE-based models tend to perform slow and require large memory due to\nsequential operations and a complex ODE solver. As an alternative to complex\nODE-based models, we propose a family of models called Functional Latent\nDynamics (FLD). Instead of solving the ODE, we use simple curves which exist at\nall time points to specify the continuous latent state in the model. The\ncoefficients of these curves are learned only from the observed values in the\ntime series ignoring the missing values. Through extensive experiments, we\ndemonstrate that FLD achieves better performance compared to the best ODE-based\nmodel while reducing the runtime and memory overhead. Specifically, FLD\nrequires an order of magnitude less time to infer the forecasts compared to the\nbest performing forecasting model.\n","authors":["Christian Klötergens","Vijaya Krishna Yalavarthi","Maximilian Stubbemann","Lars Schmidt-Thieme"],"pdf_url":"https://arxiv.org/pdf/2405.03582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02423v1","updated":"2024-10-03T12:13:56Z","published":"2024-10-03T12:13:56Z","title":"PnP-Flow: Plug-and-Play Image Restoration with Flow Matching","summary":" In this paper, we introduce Plug-and-Play (PnP) Flow Matching, an algorithm\nfor solving imaging inverse problems. PnP methods leverage the strength of\npre-trained denoisers, often deep neural networks, by integrating them in\noptimization schemes. While they achieve state-of-the-art performance on\nvarious inverse problems in imaging, PnP approaches face inherent limitations\non more generative tasks like inpainting. On the other hand, generative models\nsuch as Flow Matching pushed the boundary in image sampling yet lack a clear\nmethod for efficient use in image restoration. We propose to combine the PnP\nframework with Flow Matching (FM) by defining a time-dependent denoiser using a\npre-trained FM model. Our algorithm alternates between gradient descent steps\non the data-fidelity term, reprojections onto the learned FM path, and\ndenoising. Notably, our method is computationally efficient and\nmemory-friendly, as it avoids backpropagation through ODEs and trace\ncomputations. 
We evaluate its performance on denoising, super-resolution,\ndeblurring, and inpainting tasks, demonstrating superior results compared to\nexisting PnP algorithms and Flow Matching based state-of-the-art methods.\n","authors":["Ségolène Martin","Anne Gagneux","Paul Hagemann","Gabriele Steidl"],"pdf_url":"https://arxiv.org/pdf/2410.02423v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02417v1","updated":"2024-10-03T12:07:34Z","published":"2024-10-03T12:07:34Z","title":"MenakBERT -- Hebrew Diacriticizer","summary":" Diacritical marks in the Hebrew language give words their vocalized form. The\ntask of adding diacritical marks to plain Hebrew text is still dominated by a\nsystem that relies heavily on human-curated resources. Recent models trained on\ndiacritized Hebrew texts still present a gap in performance. We use a recently\ndeveloped char-based PLM to narrowly bridge this gap. Presenting MenakBERT, a\ncharacter level transformer pretrained on Hebrew text and fine-tuned to produce\ndiacritical marks for Hebrew sentences. We continue to show how finetuning a\nmodel for diacritizing transfers to a task such as part of speech tagging.\n","authors":["Ido Cohen","Jacob Gidron","Idan Pinto"],"pdf_url":"https://arxiv.org/pdf/2410.02417v1.pdf","comment":"Published at ISCOL2022 as a poster"},{"id":"http://arxiv.org/abs/2410.02416v1","updated":"2024-10-03T12:06:29Z","published":"2024-10-03T12:06:29Z","title":"Eliminating Oversaturation and Artifacts of High Guidance Scales in\n Diffusion Models","summary":" Classifier-free guidance (CFG) is crucial for improving both generation\nquality and alignment between the input condition and final output in diffusion\nmodels. While a high guidance scale is generally required to enhance these\naspects, it also causes oversaturation and unrealistic artifacts. In this\npaper, we revisit the CFG update rule and introduce modifications to address\nthis issue. We first decompose the update term in CFG into parallel and\northogonal components with respect to the conditional model prediction and\nobserve that the parallel component primarily causes oversaturation, while the\northogonal component enhances image quality. Accordingly, we propose\ndown-weighting the parallel component to achieve high-quality generations\nwithout oversaturation. Additionally, we draw a connection between CFG and\ngradient ascent and introduce a new rescaling and momentum method for the CFG\nupdate rule based on this insight. Our approach, termed adaptive projected\nguidance (APG), retains the quality-boosting advantages of CFG while enabling\nthe use of higher guidance scales without oversaturation. APG is easy to\nimplement and introduces practically no additional computational overhead to\nthe sampling process. Through extensive experiments, we demonstrate that APG is\ncompatible with various conditional diffusion models and samplers, leading to\nimproved FID, recall, and saturation scores while maintaining precision\ncomparable to CFG, making our method a superior plug-and-play alternative to\nstandard classifier-free guidance.\n","authors":["Seyedmorteza Sadat","Otmar Hilliges","Romann M. 
Weber"],"pdf_url":"https://arxiv.org/pdf/2410.02416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05754v3","updated":"2024-10-03T11:58:22Z","published":"2024-03-09T01:34:26Z","title":"Hybrid Quantum-inspired Resnet and Densenet for Pattern Recognition","summary":" In this paper, we propose two hybrid quantum-inspired neural networks with\nresidual and dense connections respectively for pattern recognition. We explain\nthe concrete frameworks and illustrate the potential superiority to prevent\ngradient explosion of our hybrid models. A group of numerical experiments about\ngeneralization power shows that our hybrid models possess the same\ngeneralization power as the pure classical models with different noisy datasets\nutilized. More importantly, another group of numerical experiments of\nrobustness demonstrates that our hybrid models outperform pure classical models\nnotably in resistance to parameter attacks with various asymmetric noises.\nAlso, an ablation study indicate that the recognition accuracy of our hybrid\nmodels is 2\\%-3\\% higher than that of the quantum neural network without\nresidual or dense connection. Eventually, we discuss the application scenarios\nof our hybrid models by analyzing their computational complexities.\n","authors":["Andi Chen","Hua-Lei Yin","Zeng-Bing Chen","Shengjun Wu"],"pdf_url":"https://arxiv.org/pdf/2403.05754v3.pdf","comment":"12 pages for main paper with a hyperlink of a 18-page supplementary\n material in the last page of the main paper"},{"id":"http://arxiv.org/abs/2402.04051v4","updated":"2024-10-03T11:36:28Z","published":"2024-02-06T14:53:28Z","title":"Analysis of Linear Mode Connectivity via Permutation-Based Weight\n Matching","summary":" Recently, Ainsworth et al. showed that using weight matching (WM) to minimize\nthe $L_2$ distance in a permutation search of model parameters effectively\nidentifies permutations that satisfy linear mode connectivity (LMC), where the\nloss along a linear path between two independently trained models with\ndifferent seeds remains nearly constant. This paper analyzes LMC using WM,\nwhich is useful for understanding stochastic gradient descent's effectiveness\nand its application in areas like model merging. We first empirically show that\npermutations found by WM do not significantly reduce the $L_2$ distance between\ntwo models, and the occurrence of LMC is not merely due to distance reduction\nby WM itself. We then demonstrate that permutations can change the directions\nof the singular vectors, but not the singular values, of the weight matrices in\neach layer. This finding shows that permutations found by WM primarily align\nthe directions of singular vectors associated with large singular values across\nmodels. This alignment brings the singular vectors with large singular values,\nwhich determine the model's functionality, closer between the original and\nmerged models, allowing the merged model to retain functionality similar to the\noriginal models, thereby satisfying LMC. This paper also analyzes activation\nmatching (AM) in terms of singular vectors and finds that the principle of AM\nis the same as that of WM. 
Finally, we analyze the difference between WM and\nthe straight-through estimator (STE), a dataset-dependent permutation search\nmethod, and show that WM can be more advantageous than STE in achieving LMC\namong three or more models.\n","authors":["Akira Ito","Masanori Yamada","Atsutoshi Kumagai"],"pdf_url":"https://arxiv.org/pdf/2402.04051v4.pdf","comment":"36 pages"},{"id":"http://arxiv.org/abs/2410.02400v1","updated":"2024-10-03T11:27:55Z","published":"2024-10-03T11:27:55Z","title":"An Online Feasible Point Method for Benign Generalized Nash Equilibrium\n Problems","summary":" We consider a repeatedly played generalized Nash equilibrium game. This\ninduces a multi-agent online learning problem with joint constraints. An\nimportant challenge in this setting is that the feasible set for each agent\ndepends on the simultaneous moves of the other agents and, therefore, varies\nover time. As a consequence, the agents face time-varying constraints, which\nare not adversarial but rather endogenous to the system. Prior work in this\nsetting focused on convergence to a feasible solution in the limit via\nintegrating the constraints in the objective as a penalty function. However, no\nexisting work can guarantee that the constraints are satisfied for all\niterations while simultaneously guaranteeing convergence to a generalized Nash\nequilibrium. This is a problem of fundamental theoretical interest and\npractical relevance. In this work, we introduce a new online feasible point\nmethod. Under the assumption that limited communication between the agents is\nallowed, this method guarantees feasibility. We identify the class of benign\ngeneralized Nash equilibrium problems, for which the convergence of our method\nto the equilibrium is guaranteed. We set this class of benign generalized Nash\nequilibrium games in context with existing definitions and illustrate our\nmethod with examples.\n","authors":["Sarah Sachs","Hedi Hadiji","Tim van Erven","Mathias Staudigl"],"pdf_url":"https://arxiv.org/pdf/2410.02400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02396v1","updated":"2024-10-03T11:17:58Z","published":"2024-10-03T11:17:58Z","title":"Parameter Competition Balancing for Model Merging","summary":" While fine-tuning pretrained models has become common practice, these models\noften underperform outside their specific domains. Recently developed model\nmerging techniques enable the direct integration of multiple models, each\nfine-tuned for distinct tasks, into a single model. This strategy promotes\nmultitasking capabilities without requiring retraining on the original\ndatasets. However, existing methods fall short in addressing potential\nconflicts and complex correlations between tasks, especially in parameter-level\nadjustments, posing a challenge in effectively balancing parameter competition\nacross various tasks. This paper introduces an innovative technique named\nPCB-Merging (Parameter Competition Balancing), a lightweight and training-free\ntechnique that adjusts the coefficients of each parameter for effective model\nmerging. PCB-Merging employs intra-balancing to gauge parameter significance\nwithin individual tasks and inter-balancing to assess parameter similarities\nacross different tasks. Parameters with low importance scores are dropped, and\nthe remaining ones are rescaled to form the final merged model. We assessed our\napproach in diverse merging scenarios, including cross-task, cross-domain, and\ncross-training configurations, as well as out-of-domain generalization. 
The\nexperimental results reveal that our approach achieves substantial performance\nenhancements across multiple modalities, domains, model sizes, number of tasks,\nfine-tuning forms, and large language models, outperforming existing model\nmerging methods. The code is publicly available at:\n\\url{https://github.com/duguodong7/pcb-merging}.\n","authors":["Guodong Du","Junlin Lee","Jing Li","Runhua Jiang","Yifei Guo","Shuyang Yu","Hanting Liu","Sim Kuan Goh","Ho-Kin Tang","Daojing He","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.02396v1.pdf","comment":"Accepted by NeurIPS2024"},{"id":"http://arxiv.org/abs/2410.02394v1","updated":"2024-10-03T11:16:43Z","published":"2024-10-03T11:16:43Z","title":"Online Multi-Label Classification under Noisy and Changing Label\n Distribution","summary":" Multi-label data stream usually contains noisy labels in the real-world\napplications, namely occuring in both relevant and irrelevant labels. However,\nexisting online multi-label classification methods are mostly limited in terms\nof label quality and fail to deal with the case of noisy labels. On the other\nhand, the ground-truth label distribution may vary with the time changing,\nwhich is hidden in the observed noisy label distribution and difficult to\ntrack, posing a major challenge for concept drift adaptation. Motivated by\nthis, we propose an online multi-label classification algorithm under Noisy and\nChanging Label Distribution (NCLD). The convex objective is designed to\nsimultaneously model the label scoring and the label ranking for high accuracy,\nwhose robustness to NCLD benefits from three novel works: 1) The local feature\ngraph is used to reconstruct the label scores jointly with the observed labels,\nand an unbiased ranking loss is derived and applied to learn reliable ranking\ninformation. 2) By detecting the difference between two adjacent chunks with\nthe unbiased label cardinality, we identify the change in the ground-truth\nlabel distribution and reset the ranking or all information learned from the\npast to match the new distribution. 3) Efficient and accurate updating is\nachieved based on the updating rule derived from the closed-form optimal model\nsolution. Finally, empirical experimental results validate the effectiveness of\nour method in classifying instances under NCLD.\n","authors":["Yizhang Zou","Xuegang Hu","Peipei Li","Jun Hu","You Wu"],"pdf_url":"https://arxiv.org/pdf/2410.02394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02392v1","updated":"2024-10-03T11:13:55Z","published":"2024-10-03T11:13:55Z","title":"MANTRA: The Manifold Triangulations Assemblage","summary":" The rising interest in leveraging higher-order interactions present in\ncomplex systems has led to a surge in more expressive models exploiting\nhigh-order structures in the data, especially in topological deep learning\n(TDL), which designs neural networks on high-order domains such as simplicial\ncomplexes. However, progress in this field is hindered by the scarcity of\ndatasets for benchmarking these architectures. To address this gap, we\nintroduce MANTRA, the first large-scale, diverse, and intrinsically high order\ndataset for benchmarking high-order models, comprising over 43,000 and 249,000\ntriangulations of surfaces and three-dimensional manifolds, respectively. With\nMANTRA, we assess several graph- and simplicial complex-based models on three\ntopological classification tasks. 
We demonstrate that while simplicial\ncomplex-based neural networks generally outperform their graph-based\ncounterparts in capturing simple topological invariants, they also struggle,\nsuggesting a rethink of TDL. Thus, MANTRA serves as a benchmark for assessing\nand advancing topological methods, leading the way for more effective\nhigh-order models.\n","authors":["Rubén Ballester","Ernst Röell","Daniel Bin Schmid","Mathieu Alain","Sergio Escalera","Carles Casacuberta","Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2410.02392v1.pdf","comment":"26 pages, 2 figures, 22 tables"},{"id":"http://arxiv.org/abs/2410.02389v1","updated":"2024-10-03T11:10:37Z","published":"2024-10-03T11:10:37Z","title":"Diffusion Meets Options: Hierarchical Generative Skill Composition for\n Temporally-Extended Tasks","summary":" Safe and successful deployment of robots requires not only the ability to\ngenerate complex plans but also the capacity to frequently replan and correct\nexecution errors. This paper addresses the challenge of long-horizon trajectory\nplanning under temporally extended objectives in a receding horizon manner. To\nthis end, we propose DOPPLER, a data-driven hierarchical framework that\ngenerates and updates plans based on instruction specified by linear temporal\nlogic (LTL). Our method decomposes temporal tasks into chain of options with\nhierarchical reinforcement learning from offline non-expert datasets. It\nleverages diffusion models to generate options with low-level actions. We\ndevise a determinantal-guided posterior sampling technique during batch\ngeneration, which improves the speed and diversity of diffusion generated\noptions, leading to more efficient querying. Experiments on robot navigation\nand manipulation tasks demonstrate that DOPPLER can generate sequences of\ntrajectories that progressively satisfy the specified formulae for obstacle\navoidance and sequential visitation. Demonstration videos are available online\nat: https://philiptheother.github.io/doppler/.\n","authors":["Zeyu Feng","Hao Luan","Kevin Yuchen Ma","Harold Soh"],"pdf_url":"https://arxiv.org/pdf/2410.02389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02387v1","updated":"2024-10-03T11:07:43Z","published":"2024-10-03T11:07:43Z","title":"BiSSL: Bilevel Optimization for Self-Supervised Pre-Training and\n Fine-Tuning","summary":" In this work, we present BiSSL, a first-of-its-kind training framework that\nintroduces bilevel optimization to enhance the alignment between the pretext\npre-training and downstream fine-tuning stages in self-supervised learning.\nBiSSL formulates the pretext and downstream task objectives as the lower- and\nupper-level objectives in a bilevel optimization problem and serves as an\nintermediate training stage within the self-supervised learning pipeline. By\nmore explicitly modeling the interdependence of these training stages, BiSSL\nfacilitates enhanced information sharing between them, ultimately leading to a\nbackbone parameter initialization that is better suited for the downstream\ntask. We propose a training algorithm that alternates between optimizing the\ntwo objectives defined in BiSSL. Using a ResNet-18 backbone pre-trained with\nSimCLR on the STL10 dataset, we demonstrate that our proposed framework\nconsistently achieves improved or competitive classification accuracies across\nvarious downstream image classification datasets compared to the conventional\nself-supervised learning pipeline. 
Qualitative analyses of the backbone\nfeatures further suggest that BiSSL enhances the alignment of downstream\nfeatures in the backbone prior to fine-tuning.\n","authors":["Gustav Wagner Zakarias","Lars Kai Hansen","Zheng-Hua Tan"],"pdf_url":"https://arxiv.org/pdf/2410.02387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13663v3","updated":"2024-10-03T11:03:22Z","published":"2024-06-19T16:10:26Z","title":"Model Internals-based Answer Attribution for Trustworthy\n Retrieval-Augmented Generation","summary":" Ensuring the verifiability of model answers is a fundamental challenge for\nretrieval-augmented generation (RAG) in the question answering (QA) domain.\nRecently, self-citation prompting was proposed to make large language models\n(LLMs) generate citations to supporting documents along with their answers.\nHowever, self-citing LLMs often struggle to match the required format, refer to\nnon-existent sources, and fail to faithfully reflect LLMs' context usage\nthroughout the generation. In this work, we present MIRAGE --Model\nInternals-based RAG Explanations -- a plug-and-play approach using model\ninternals for faithful answer attribution in RAG applications. MIRAGE detects\ncontext-sensitive answer tokens and pairs them with retrieved documents\ncontributing to their prediction via saliency methods. We evaluate our proposed\napproach on a multilingual extractive QA dataset, finding high agreement with\nhuman answer attribution. On open-ended QA, MIRAGE achieves citation quality\nand efficiency comparable to self-citation while also allowing for a\nfiner-grained control of attribution parameters. Our qualitative evaluation\nhighlights the faithfulness of MIRAGE's attributions and underscores the\npromising application of model internals for RAG answer attribution.\n","authors":["Jirui Qi","Gabriele Sarti","Raquel Fernández","Arianna Bisazza"],"pdf_url":"https://arxiv.org/pdf/2406.13663v3.pdf","comment":"Accepted by EMNLP 2024 Main Conference. Code and data released at\n https://github.com/Betswish/MIRAGE"}],"Multimedia":[{"id":"http://arxiv.org/abs/2406.17932v2","updated":"2024-10-03T14:34:52Z","published":"2024-06-25T20:47:10Z","title":"SonicSense: Object Perception from In-Hand Acoustic Vibration","summary":" We introduce SonicSense, a holistic design of hardware and software to enable\nrich robot object perception through in-hand acoustic vibration sensing. While\nprevious studies have shown promising results with acoustic sensing for object\nperception, current solutions are constrained to a handful of objects with\nsimple geometries and homogeneous materials, single-finger sensing, and mixing\ntraining and testing on the same objects. SonicSense enables container\ninventory status differentiation, heterogeneous material prediction, 3D shape\nreconstruction, and object re-identification from a diverse set of 83\nreal-world objects. 
Our system employs a simple but effective heuristic\nexploration policy to interact with the objects as well as end-to-end\nlearning-based algorithms to fuse vibration signals to infer object properties.\nOur framework underscores the significance of in-hand acoustic vibration\nsensing in advancing robot tactile perception.\n","authors":["Jiaxun Liu","Boyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2406.17932v2.pdf","comment":"Our project website is at: http://generalroboticslab.com/SonicSense"},{"id":"http://arxiv.org/abs/2410.01654v2","updated":"2024-10-03T12:43:14Z","published":"2024-10-02T15:19:31Z","title":"Releasing the Parameter Latency of Neural Representation for\n High-Efficiency Video Compression","summary":" For decades, video compression technology has been a prominent research area.\nTraditional hybrid video compression framework and end-to-end frameworks\ncontinue to explore various intra- and inter-frame reference and prediction\nstrategies based on discrete transforms and deep learning techniques. However,\nthe emerging implicit neural representation (INR) technique models entire\nvideos as basic units, automatically capturing intra-frame and inter-frame\ncorrelations and obtaining promising performance. INR uses a compact neural\nnetwork to store video information in network parameters, effectively\neliminating spatial and temporal redundancy in the original video. However, in\nthis paper, our exploration and verification reveal that current INR video\ncompression methods do not fully exploit their potential to preserve\ninformation. We investigate the potential of enhancing network parameter\nstorage through parameter reuse. By deepening the network, we designed a\nfeasible INR parameter reuse scheme to further improve compression performance.\nExtensive experimental results show that our method significantly enhances the\nrate-distortion performance of INR video compression.\n","authors":["Gai Zhang","Xinfeng Zhang","Lv Tang","Yue Li","Kai Zhang","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.01654v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02889v2","updated":"2024-10-03T11:01:14Z","published":"2024-09-04T17:25:21Z","title":"LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via a\n Hybrid Architecture","summary":" Expanding the long-context capabilities of Multi-modal Large Language\nModels~(MLLMs) is crucial for video understanding, high-resolution image\nunderstanding, and multi-modal agents. This involves a series of systematic\noptimizations, including model architecture, data construction and training\nstrategy, particularly addressing challenges such as \\textit{degraded\nperformance with more images} and \\textit{high computational costs}. In this\npaper, we adapt the model architecture to a hybrid of Mamba and Transformer\nblocks, approach data construction with both temporal and spatial dependencies\namong multiple images and employ a progressive training strategy. The released\nmodel \\textbf{LongLLaVA}~(\\textbf{Long}-Context \\textbf{L}arge\n\\textbf{L}anguage \\textbf{a}nd \\textbf{V}ision \\textbf{A}ssistant) is the first\nhybrid MLLM, which achieved a better balance between efficiency and\neffectiveness. 
LongLLaVA not only achieves competitive results across various\nbenchmarks, but also maintains high throughput and low memory consumption.\nEspecially, it could process nearly a thousand images on a single A100 80GB\nGPU, showing promising application prospects for a wide range of tasks.\n","authors":["Xidong Wang","Dingjie Song","Shunian Chen","Chen Zhang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02889v2.pdf","comment":"20 pages, 9 figures, 9 tables"},{"id":"http://arxiv.org/abs/2406.00093v2","updated":"2024-10-03T08:20:17Z","published":"2024-05-31T17:59:56Z","title":"Bootstrap3D: Improving Multi-view Diffusion Model with Synthetic Data","summary":" Recent years have witnessed remarkable progress in multi-view diffusion\nmodels for 3D content creation. However, there remains a significant gap in\nimage quality and prompt-following ability compared to 2D diffusion models. A\ncritical bottleneck is the scarcity of high-quality 3D objects with detailed\ncaptions. To address this challenge, we propose Bootstrap3D, a novel framework\nthat automatically generates an arbitrary quantity of multi-view images to\nassist in training multi-view diffusion models. Specifically, we introduce a\ndata generation pipeline that employs (1) 2D and video diffusion models to\ngenerate multi-view images based on constructed text prompts, and (2) our\nfine-tuned 3D-aware MV-LLaVA for filtering high-quality data and rewriting\ninaccurate captions. Leveraging this pipeline, we have generated 1 million\nhigh-quality synthetic multi-view images with dense descriptive captions to\naddress the shortage of high-quality 3D data. Furthermore, we present a\nTraining Timestep Reschedule (TTR) strategy that leverages the denoising\nprocess to learn multi-view consistency while maintaining the original 2D\ndiffusion prior. Extensive experiments demonstrate that Bootstrap3D can\ngenerate high-quality multi-view images with superior aesthetic quality,\nimage-text alignment, and maintained view consistency.\n","authors":["Zeyi Sun","Tong Wu","Pan Zhang","Yuhang Zang","Xiaoyi Dong","Yuanjun Xiong","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2406.00093v2.pdf","comment":"Project Page: https://sunzey.github.io/Bootstrap3D/"},{"id":"http://arxiv.org/abs/2410.02182v1","updated":"2024-10-03T03:51:53Z","published":"2024-10-03T03:51:53Z","title":"BadCM: Invisible Backdoor Attack Against Cross-Modal Learning","summary":" Despite remarkable successes in unimodal learning tasks, backdoor attacks\nagainst cross-modal learning are still underexplored due to the limited\ngeneralization and inferior stealthiness when involving multiple modalities.\nNotably, since works in this area mainly inherit ideas from unimodal visual\nattacks, they struggle with dealing with diverse cross-modal attack\ncircumstances and manipulating imperceptible trigger samples, which hinders\ntheir practicability in real-world applications. In this paper, we introduce a\nnovel bilateral backdoor to fill in the missing pieces of the puzzle in the\ncross-modal backdoor and propose a generalized invisible backdoor framework\nagainst cross-modal learning (BadCM). Specifically, a cross-modal mining scheme\nis developed to capture the modality-invariant components as target poisoning\nareas, where well-designed trigger patterns injected into these regions can be\nefficiently recognized by the victim models. This strategy is adapted to\ndifferent image-text cross-modal models, making our framework available to\nvarious attack scenarios. 
Furthermore, for generating poisoned samples of high\nstealthiness, we conceive modality-specific generators for visual and\nlinguistic modalities that facilitate hiding explicit trigger patterns in\nmodality-invariant regions. To the best of our knowledge, BadCM is the first\ninvisible backdoor method deliberately designed for diverse cross-modal attacks\nwithin one unified framework. Comprehensive experimental evaluations on two\ntypical applications, i.e., cross-modal retrieval and VQA, demonstrate the\neffectiveness and generalization of our method under multiple kinds of attack\nscenarios. Moreover, we show that BadCM can robustly evade existing backdoor\ndefenses. Our code is available at https://github.com/xandery-geek/BadCM.\n","authors":["Zheng Zhang","Xu Yuan","Lei Zhu","Jingkuan Song","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2410.02182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14637v2","updated":"2024-10-03T03:51:22Z","published":"2023-10-23T07:21:40Z","title":"Semantic-Aware Adversarial Training for Reliable Deep Hashing Retrieval","summary":" Deep hashing has been intensively studied and successfully applied in\nlarge-scale image retrieval systems due to its efficiency and effectiveness.\nRecent studies have recognized that the existence of adversarial examples poses\na security threat to deep hashing models, that is, adversarial vulnerability.\nNotably, it is challenging to efficiently distill reliable semantic\nrepresentatives for deep hashing to guide adversarial learning, and thereby it\nhinders the enhancement of adversarial robustness of deep hashing-based\nretrieval models. Moreover, current researches on adversarial training for deep\nhashing are hard to be formalized into a unified minimax structure. In this\npaper, we explore Semantic-Aware Adversarial Training (SAAT) for improving the\nadversarial robustness of deep hashing models. Specifically, we conceive a\ndiscriminative mainstay features learning (DMFL) scheme to construct semantic\nrepresentatives for guiding adversarial learning in deep hashing. Particularly,\nour DMFL with the strict theoretical guarantee is adaptively optimized in a\ndiscriminative learning manner, where both discriminative and semantic\nproperties are jointly considered. Moreover, adversarial examples are\nfabricated by maximizing the Hamming distance between the hash codes of\nadversarial samples and mainstay features, the efficacy of which is validated\nin the adversarial attack trials. Further, we, for the first time, formulate\nthe formalized adversarial training of deep hashing into a unified minimax\noptimization under the guidance of the generated mainstay codes. Extensive\nexperiments on benchmark datasets show superb attack performance against the\nstate-of-the-art algorithms, meanwhile, the proposed adversarial training can\neffectively eliminate adversarial perturbations for trustworthy deep\nhashing-based retrieval. Our code is available at\nhttps://github.com/xandery-geek/SAAT.\n","authors":["Xu Yuan","Zheng Zhang","Xunguang Wang","Lin Wu"],"pdf_url":"https://arxiv.org/pdf/2310.14637v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19340v4","updated":"2024-10-03T02:23:50Z","published":"2024-07-27T21:00:36Z","title":"Integrating Large Language Models into a Tri-Modal Architecture for\n Automated Depression Classification","summary":" Major Depressive Disorder (MDD) is a pervasive mental health condition that\naffects 300 million people worldwide. 
This work presents a novel, BiLSTM-based\ntri-modal model-level fusion architecture for the binary classification of\ndepression from clinical interview recordings. The proposed architecture\nincorporates Mel Frequency Cepstral Coefficients, Facial Action Units, and uses\na two-shot learning based GPT-4 model to process text data. This is the first\nwork to incorporate large language models into a multi-modal architecture for\nthis task. It achieves impressive results on the DAIC-WOZ AVEC 2016 Challenge\ncross-validation split and Leave-One-Subject-Out cross-validation split,\nsurpassing all baseline models and multiple state-of-the-art models. In\nLeave-One-Subject-Out testing, it achieves an accuracy of 91.01%, an F1-Score\nof 85.95%, a precision of 80%, and a recall of 92.86%.\n","authors":["Santosh V. Patapati"],"pdf_url":"https://arxiv.org/pdf/2407.19340v4.pdf","comment":"Keywords: Multi-Modal Neural Networks, Deep Learning, Large Language\n Models, Depression Diagnosis, Biomedical Informatics, DAIC-WOZ"}]},"2024-10-02T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2410.02103v1","updated":"2024-10-02T23:48:31Z","published":"2024-10-02T23:48:31Z","title":"MVGS: Multi-view-regulated Gaussian Splatting for Novel View Synthesis","summary":" Recent works in volume rendering, \\textit{e.g.} NeRF and 3D Gaussian\nSplatting (3DGS), significantly advance the rendering quality and efficiency\nwith the help of the learned implicit neural radiance field or 3D Gaussians.\nRendering on top of an explicit representation, the vanilla 3DGS and its\nvariants deliver real-time efficiency by optimizing the parametric model with\nsingle-view supervision per iteration during training which is adopted from\nNeRF. Consequently, certain views are overfitted, leading to unsatisfying\nappearance in novel-view synthesis and imprecise 3D geometries. To solve\naforementioned problems, we propose a new 3DGS optimization method embodying\nfour key novel contributions: 1) We transform the conventional single-view\ntraining paradigm into a multi-view training strategy. With our proposed\nmulti-view regulation, 3D Gaussian attributes are further optimized without\noverfitting certain training views. As a general solution, we improve the\noverall accuracy in a variety of scenarios and different Gaussian variants. 2)\nInspired by the benefit introduced by additional views, we further propose a\ncross-intrinsic guidance scheme, leading to a coarse-to-fine training procedure\nconcerning different resolutions. 3) Built on top of our multi-view regulated\ntraining, we further propose a cross-ray densification strategy, densifying\nmore Gaussian kernels in the ray-intersect regions from a selection of views.\n4) By further investigating the densification strategy, we found that the\neffect of densification should be enhanced when certain views are distinct\ndramatically. 
As a solution, we propose a novel multi-view augmented\ndensification strategy, where 3D Gaussians are encouraged to get densified to a\nsufficient number accordingly, resulting in improved reconstruction accuracy.\n","authors":["Xiaobiao Du","Yida Wang","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2410.02103v1.pdf","comment":"Project Page:https://xiaobiaodu.github.io/mvgs-project/"},{"id":"http://arxiv.org/abs/2410.02101v1","updated":"2024-10-02T23:46:45Z","published":"2024-10-02T23:46:45Z","title":"Orient Anything","summary":" Orientation estimation is a fundamental task in 3D shape analysis which\nconsists of estimating a shape's orientation axes: its side-, up-, and\nfront-axes. Using this data, one can rotate a shape into canonical orientation,\nwhere its orientation axes are aligned with the coordinate axes. Developing an\norientation algorithm that reliably estimates complete orientations of general\nshapes remains an open problem. We introduce a two-stage orientation pipeline\nthat achieves state of the art performance on up-axis estimation and further\ndemonstrate its efficacy on full-orientation estimation, where one seeks all\nthree orientation axes. Unlike previous work, we train and evaluate our method\non all of Shapenet rather than a subset of classes. We motivate our engineering\ncontributions by theory describing fundamental obstacles to orientation\nestimation for rotationally-symmetric shapes, and show how our method avoids\nthese obstacles.\n","authors":["Christopher Scarvelis","David Benhaim","Paul Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.02101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02098v1","updated":"2024-10-02T23:39:10Z","published":"2024-10-02T23:39:10Z","title":"EC-DIT: Scaling Diffusion Transformers with Adaptive Expert-Choice\n Routing","summary":" Diffusion transformers have been widely adopted for text-to-image synthesis.\nWhile scaling these models up to billions of parameters shows promise, the\neffectiveness of scaling beyond current sizes remains underexplored and\nchallenging. By explicitly exploiting the computational heterogeneity of image\ngenerations, we develop a new family of Mixture-of-Experts (MoE) models\n(EC-DIT) for diffusion transformers with expert-choice routing. EC-DIT learns\nto adaptively optimize the compute allocated to understand the input texts and\ngenerate the respective image patches, enabling heterogeneous computation\naligned with varying text-image complexities. This heterogeneity provides an\nefficient way of scaling EC-DIT up to 97 billion parameters and achieving\nsignificant improvements in training convergence, text-to-image alignment, and\noverall generation quality over dense models and conventional MoE models.\nThrough extensive ablations, we show that EC-DIT demonstrates superior\nscalability and adaptive compute allocation by recognizing varying textual\nimportance through end-to-end training. 
Notably, in text-to-image alignment\nevaluation, our largest models achieve a state-of-the-art GenEval score of\n71.68% and still maintain competitive inference speed with intuitive\ninterpretability.\n","authors":["Haotian Sun","Bowen Zhang","Yanghao Li","Haoshuo Huang","Tao Lei","Ruoming Pang","Bo Dai","Nan Du"],"pdf_url":"https://arxiv.org/pdf/2410.02098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02094v1","updated":"2024-10-02T23:30:05Z","published":"2024-10-02T23:30:05Z","title":"Tracking objects that change in appearance with phase synchrony","summary":" Objects we encounter often change appearance as we interact with them.\nChanges in illumination (shadows), object pose, or movement of nonrigid objects\ncan drastically alter available image features. How do biological visual\nsystems track objects as they change? It may involve specific attentional\nmechanisms for reasoning about the locations of objects independently of their\nappearances -- a capability that prominent neuroscientific theories have\nassociated with computing through neural synchrony. We computationally test the\nhypothesis that the implementation of visual attention through neural synchrony\nunderlies the ability of biological visual systems to track objects that change\nin appearance over time. We first introduce a novel deep learning circuit that\ncan learn to precisely control attention to features separately from their\nlocation in the world through neural synchrony: the complex-valued recurrent\nneural network (CV-RNN). Next, we compare object tracking in humans, the\nCV-RNN, and other deep neural networks (DNNs), using FeatureTracker: a\nlarge-scale challenge that asks observers to track objects as their locations\nand appearances change in precisely controlled ways. While humans effortlessly\nsolved FeatureTracker, state-of-the-art DNNs did not. In contrast, our CV-RNN\nbehaved similarly to humans on the challenge, providing a computational\nproof-of-concept for the role of phase synchronization as a neural substrate\nfor tracking appearance-morphing objects as they move about.\n","authors":["Sabine Muzellec","Drew Linsley","Alekh K. Ashok","Ennio Mingolla","Girik Malik","Rufin VanRullen","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2410.02094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02086v1","updated":"2024-10-02T23:19:23Z","published":"2024-10-02T23:19:23Z","title":"Anchors Aweigh! Sail for Optimal Unified Multi-Modal Representations","summary":" Multimodal learning plays a crucial role in enabling machine learning models\nto fuse and utilize diverse data sources, such as text, images, and audio, to\nsupport a variety of downstream tasks. A unified representation across various\nmodalities is particularly important for improving efficiency and performance.\nRecent binding methods, such as ImageBind (Girdhar et al., 2023), typically use\na fixed anchor modality to align multimodal data in the anchor modal embedding\nspace. 
In this paper, we mathematically analyze the fixed anchor binding\nmethods and uncover notable limitations: (1) over-reliance on the choice of the\nanchor modality, (2) failure to capture intra-modal information, and (3)\nfailure to account for inter-modal correlation among non-anchored modalities.\nTo address these limitations, we propose CentroBind, a simple yet powerful\napproach that eliminates the need for a fixed anchor; instead, it employs\ndynamically adjustable centroid-based anchors generated from all available\nmodalities, resulting in a balanced and rich representation space. We\ntheoretically demonstrate that our method captures three crucial properties of\nmultimodal learning: intra-modal learning, inter-modal learning, and multimodal\nalignment, while also constructing a robust unified representation across all\nmodalities. Our experiments on both synthetic and real-world datasets\ndemonstrate the superiority of the proposed method, showing that dynamic anchor\nmethods outperform all fixed anchor binding methods as the former captures more\nnuanced multimodal interactions.\n","authors":["Minoh Jeong","Min Namgung","Zae Myung Kim","Dongyeop Kang","Yao-Yi Chiang","Alfred Hero"],"pdf_url":"https://arxiv.org/pdf/2410.02086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02080v1","updated":"2024-10-02T23:00:31Z","published":"2024-10-02T23:00:31Z","title":"EMMA: Efficient Visual Alignment in Multi-Modal LLMs","summary":" Multi-modal Large Language Models (MLLMs) have recently exhibited impressive\ngeneral-purpose capabilities by leveraging vision foundation models to encode\nthe core concepts of images into representations. These are then combined with\ninstructions and processed by the language model to generate high-quality\nresponses. Despite significant progress in enhancing the language component,\nchallenges persist in optimally fusing visual encodings within the language\nmodel for task-specific adaptability. Recent research has focused on improving\nthis fusion through modality adaptation modules but at the cost of\nsignificantly increased model complexity and training data needs. In this\npaper, we propose EMMA (Efficient Multi-Modal Adaptation), a lightweight\ncross-modality module designed to efficiently fuse visual and textual\nencodings, generating instruction-aware visual representations for the language\nmodel. Our key contributions include: (1) an efficient early fusion mechanism\nthat integrates vision and language representations with minimal added\nparameters (less than 0.2% increase in model size), (2) an in-depth\ninterpretability analysis that sheds light on the internal mechanisms of the\nproposed method; (3) comprehensive experiments that demonstrate notable\nimprovements on both specialized and general benchmarks for MLLMs. Empirical\nresults show that EMMA boosts performance across multiple tasks by up to 9.3%\nwhile significantly improving robustness against hallucinations. 
Our code is\navailable at https://github.com/SaraGhazanfari/EMMA\n","authors":["Sara Ghazanfari","Alexandre Araujo","Prashanth Krishnamurthy","Siddharth Garg","Farshad Khorrami"],"pdf_url":"https://arxiv.org/pdf/2410.02080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02078v1","updated":"2024-10-02T22:57:47Z","published":"2024-10-02T22:57:47Z","title":"Posterior sampling via Langevin dynamics based on generative priors","summary":" Posterior sampling in high-dimensional spaces using generative models holds\nsignificant promise for various applications, including but not limited to\ninverse problems and guided generation tasks. Despite many recent developments,\ngenerating diverse posterior samples remains a challenge, as existing methods\nrequire restarting the entire generative process for each new sample, making\nthe procedure computationally expensive. In this work, we propose efficient\nposterior sampling by simulating Langevin dynamics in the noise space of a\npre-trained generative model. By exploiting the mapping between the noise and\ndata spaces which can be provided by distilled flows or consistency models, our\nmethod enables seamless exploration of the posterior without the need to re-run\nthe full sampling chain, drastically reducing computational overhead.\nTheoretically, we prove a guarantee for the proposed noise-space Langevin\ndynamics to approximate the posterior, assuming that the generative model\nsufficiently approximates the prior distribution. Our framework is\nexperimentally validated on image restoration tasks involving noisy linear and\nnonlinear forward operators applied to LSUN-Bedroom (256 x 256) and ImageNet\n(64 x 64) datasets. The results demonstrate that our approach generates\nhigh-fidelity samples with enhanced semantic diversity even under a limited\nnumber of function evaluations, offering superior efficiency and performance\ncompared to existing diffusion-based posterior sampling techniques.\n","authors":["Vishal Purohit","Matthew Repasky","Jianfeng Lu","Qiang Qiu","Yao Xie","Xiuyuan Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.02078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02077v1","updated":"2024-10-02T22:56:00Z","published":"2024-10-02T22:56:00Z","title":"Kolmogorov-Arnold Network Autoencoders","summary":" Deep learning models have revolutionized various domains, with Multi-Layer\nPerceptrons (MLPs) being a cornerstone for tasks like data regression and image\nclassification. However, a recent study has introduced Kolmogorov-Arnold\nNetworks (KANs) as promising alternatives to MLPs, leveraging activation\nfunctions placed on edges rather than nodes. This structural shift aligns KANs\nclosely with the Kolmogorov-Arnold representation theorem, potentially\nenhancing both model accuracy and interpretability. In this study, we explore\nthe efficacy of KANs in the context of data representation via autoencoders,\ncomparing their performance with traditional Convolutional Neural Networks\n(CNNs) on the MNIST, SVHN, and CIFAR-10 datasets. 
Our results demonstrate that\nKAN-based autoencoders achieve competitive performance in terms of\nreconstruction accuracy, thereby suggesting their viability as effective tools\nin data analysis tasks.\n","authors":["Mohammadamin Moradi","Shirin Panahi","Erik Bollt","Ying-Cheng Lai"],"pdf_url":"https://arxiv.org/pdf/2410.02077v1.pdf","comment":"12 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2410.02073v1","updated":"2024-10-02T22:42:20Z","published":"2024-10-02T22:42:20Z","title":"Depth Pro: Sharp Monocular Metric Depth in Less Than a Second","summary":" We present a foundation model for zero-shot metric monocular depth\nestimation. Our model, Depth Pro, synthesizes high-resolution depth maps with\nunparalleled sharpness and high-frequency details. The predictions are metric,\nwith absolute scale, without relying on the availability of metadata such as\ncamera intrinsics. And the model is fast, producing a 2.25-megapixel depth map\nin 0.3 seconds on a standard GPU. These characteristics are enabled by a number\nof technical contributions, including an efficient multi-scale vision\ntransformer for dense prediction, a training protocol that combines real and\nsynthetic datasets to achieve high metric accuracy alongside fine boundary\ntracing, dedicated evaluation metrics for boundary accuracy in estimated depth\nmaps, and state-of-the-art focal length estimation from a single image.\nExtensive experiments analyze specific design choices and demonstrate that\nDepth Pro outperforms prior work along multiple dimensions. We release code and\nweights at https://github.com/apple/ml-depth-pro\n","authors":["Aleksei Bochkovskii","Amaël Delaunoy","Hugo Germain","Marcel Santos","Yichao Zhou","Stephan R. Richter","Vladlen Koltun"],"pdf_url":"https://arxiv.org/pdf/2410.02073v1.pdf","comment":"Code and weights available at https://github.com/apple/ml-depth-pro"},{"id":"http://arxiv.org/abs/2410.02072v1","updated":"2024-10-02T22:41:12Z","published":"2024-10-02T22:41:12Z","title":"Learning from the Giants: A Practical Approach to Underwater Depth and\n Surface Normals Estimation","summary":" Monocular Depth and Surface Normals Estimation (MDSNE) is crucial for tasks\nsuch as 3D reconstruction, autonomous navigation, and underwater exploration.\nCurrent methods rely either on discriminative models, which struggle with\ntransparent or reflective surfaces, or generative models, which, while\naccurate, are computationally expensive. This paper presents a novel deep\nlearning model for MDSNE, specifically tailored for underwater environments,\nusing a hybrid architecture that integrates Convolutional Neural Networks\n(CNNs) with Transformers, leveraging the strengths of both approaches. Training\neffective MDSNE models is often hampered by noisy real-world datasets and the\nlimited generalization of synthetic datasets. To address this, we generate\npseudo-labeled real data using multiple pre-trained MDSNE models. To ensure the\nquality of this data, we propose the Depth Normal Evaluation and Selection\nAlgorithm (DNESA), which evaluates and selects the most reliable pseudo-labeled\nsamples using domain-specific metrics. A lightweight student model is then\ntrained on this curated dataset. Our model reduces parameters by 90% and\ntraining costs by 80%, allowing real-time 3D perception on resource-constrained\ndevices. Key contributions include: a novel and efficient MDSNE model, the\nDNESA algorithm, a domain-specific data pipeline, and a focus on real-time\nperformance and scalability. 
Designed for real-world underwater applications,\nour model facilitates low-cost deployments in underwater robots and autonomous\nvehicles, bridging the gap between research and practical implementation.\n","authors":["Alzayat Saleh","Melanie Olsen","Bouchra Senadji","Mostafa Rahimi Azghadi"],"pdf_url":"https://arxiv.org/pdf/2410.02072v1.pdf","comment":"18 pages, 6 figures, 8 tables. Submitted to Elsevier"},{"id":"http://arxiv.org/abs/2410.02069v1","updated":"2024-10-02T22:36:12Z","published":"2024-10-02T22:36:12Z","title":"Semi-Supervised Fine-Tuning of Vision Foundation Models with\n Content-Style Decomposition","summary":" In this paper, we present a semi-supervised fine-tuning approach designed to\nimprove the performance of foundation models on downstream tasks with limited\nlabeled data. By leveraging content-style decomposition within an\ninformation-theoretic framework, our method enhances the latent representations\nof pre-trained vision foundation models, aligning them more effectively with\nspecific task objectives and addressing the problem of distribution shift. We\nevaluate our approach on multiple datasets, including MNIST, its augmented\nvariations (with yellow and white stripes), CIFAR-10, SVHN, and GalaxyMNIST.\nThe experiments show improvements over purely supervised baselines,\nparticularly in low-labeled data regimes, across both frozen and trainable\nbackbones for the majority of the tested datasets.\n","authors":["Mariia Drozdova","Vitaliy Kinakh","Yury Belousov","Erica Lastufka","Slava Voloshynovskiy"],"pdf_url":"https://arxiv.org/pdf/2410.02069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11262v2","updated":"2024-10-02T22:33:08Z","published":"2024-06-17T07:06:58Z","title":"Generative Visual Instruction Tuning","summary":" We propose to use automatically generated instruction-following data to\nimprove the zero-shot capabilities of a large multimodal model with additional\nsupport for generative and image editing tasks. We achieve this by curating a\nnew multimodal instruction-following set using GPT-4V and existing datasets for\nimage generation and editing. Using this instruction set and the existing\nLLaVA-Finetune instruction set for visual understanding tasks, we produce\nGenLLaVA, a Generative Large Language and Visual Assistant. GenLLaVA is built\nthrough a strategy that combines three types of large pretrained models through\ninstruction finetuning: Mistral for language modeling, SigLIP for image-text\nmatching, and StableDiffusion for text-to-image generation. Our model\ndemonstrates visual understanding capabilities superior to LLaVA and\nadditionally demonstrates competitive results with native multimodal models\nsuch as Unified-IO 2, paving the way for building advanced general-purpose\nvisual assistants by effectively re-using existing multimodal models. 
We\nopen-source our dataset, codebase, and model checkpoints to foster further\nresearch and application in this domain.\n","authors":["Jefferson Hernandez","Ruben Villegas","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2406.11262v2.pdf","comment":"Add more results using task tokens, expand the introduction and\n related work FIX: error in LLM-as-judge evaluation that was over-inflating\n the results"},{"id":"http://arxiv.org/abs/2410.02067v1","updated":"2024-10-02T22:29:14Z","published":"2024-10-02T22:29:14Z","title":"DisEnvisioner: Disentangled and Enriched Visual Prompt for Customized\n Image Generation","summary":" In the realm of image generation, creating customized images from visual\nprompt with additional textual instruction emerges as a promising endeavor.\nHowever, existing methods, both tuning-based and tuning-free, struggle with\ninterpreting the subject-essential attributes from the visual prompt. This\nleads to subject-irrelevant attributes infiltrating the generation process,\nultimately compromising the personalization quality in both editability and ID\npreservation. In this paper, we present DisEnvisioner, a novel approach for\neffectively extracting and enriching the subject-essential features while\nfiltering out -irrelevant information, enabling exceptional customization\nperformance, in a tuning-free manner and using only a single image.\nSpecifically, the feature of the subject and other irrelevant components are\neffectively separated into distinctive visual tokens, enabling a much more\naccurate customization. Aiming to further improving the ID consistency, we\nenrich the disentangled features, sculpting them into more granular\nrepresentations. Experiments demonstrate the superiority of our approach over\nexisting methods in instruction response (editability), ID consistency,\ninference speed, and the overall image quality, highlighting the effectiveness\nand efficiency of DisEnvisioner. Project page:\nhttps://disenvisioner.github.io/.\n","authors":["Jing He","Haodong Li","Yongzhe Hu","Guibao Shen","Yingjie Cai","Weichao Qiu","Ying-Cong Chen"],"pdf_url":"https://arxiv.org/pdf/2410.02067v1.pdf","comment":"The first two authors contributed equally. Project page:\n https://disenvisioner.github.io/"},{"id":"http://arxiv.org/abs/2409.19951v2","updated":"2024-10-02T22:24:44Z","published":"2024-09-30T05:12:01Z","title":"Law of the Weakest Link: Cross Capabilities of Large Language Models","summary":" The development and evaluation of Large Language Models (LLMs) have largely\nfocused on individual capabilities. However, this overlooks the intersection of\nmultiple abilities across different types of expertise that are often required\nfor real-world tasks, which we term cross capabilities. To systematically\nexplore this concept, we first define seven core individual capabilities and\nthen pair them to form seven common cross capabilities, each supported by a\nmanually constructed taxonomy. Building on these definitions, we introduce\nCrossEval, a benchmark comprising 1,400 human-annotated prompts, with 100\nprompts for each individual and cross capability. To ensure reliable\nevaluation, we involve expert annotators to assess 4,200 model responses,\ngathering 8,400 human ratings with detailed explanations to serve as reference\nexamples. 
Our findings reveal that, in both static evaluations and attempts to\nenhance specific abilities, current LLMs consistently exhibit the \"Law of the\nWeakest Link,\" where cross-capability performance is significantly constrained\nby the weakest component. Specifically, across 58 cross-capability scores from\n17 models, 38 scores are lower than all individual capabilities, while 20 fall\nbetween strong and weak, but closer to the weaker ability. These results\nhighlight the under-performance of LLMs in cross-capability tasks, making the\nidentification and improvement of the weakest capabilities a critical priority\nfor future research to optimize performance in complex, multi-dimensional\nscenarios.\n","authors":["Ming Zhong","Aston Zhang","Xuewei Wang","Rui Hou","Wenhan Xiong","Chenguang Zhu","Zhengxing Chen","Liang Tan","Chloe Bi","Mike Lewis","Sravya Popuri","Sharan Narang","Melanie Kambadur","Dhruv Mahajan","Sergey Edunov","Jiawei Han","Laurens van der Maaten"],"pdf_url":"https://arxiv.org/pdf/2409.19951v2.pdf","comment":"Data, Code, & Benchmark: www.llm-cross-capabilities.org"},{"id":"http://arxiv.org/abs/2409.18124v2","updated":"2024-10-02T22:17:56Z","published":"2024-09-26T17:58:55Z","title":"Lotus: Diffusion-based Visual Foundation Model for High-quality Dense\n Prediction","summary":" Leveraging the visual priors of pre-trained text-to-image diffusion models\noffers a promising solution to enhance zero-shot generalization in dense\nprediction tasks. However, existing methods often uncritically use the original\ndiffusion formulation, which may not be optimal due to the fundamental\ndifferences between dense prediction and image generation. In this paper, we\nprovide a systemic analysis of the diffusion formulation for the dense\nprediction, focusing on both quality and efficiency. And we find that the\noriginal parameterization type for image generation, which learns to predict\nnoise, is harmful for dense prediction; the multi-step noising/denoising\ndiffusion process is also unnecessary and challenging to optimize. Based on\nthese insights, we introduce Lotus, a diffusion-based visual foundation model\nwith a simple yet effective adaptation protocol for dense prediction.\nSpecifically, Lotus is trained to directly predict annotations instead of\nnoise, thereby avoiding harmful variance. We also reformulate the diffusion\nprocess into a single-step procedure, simplifying optimization and\nsignificantly boosting inference speed. Additionally, we introduce a novel\ntuning strategy called detail preserver, which achieves more accurate and\nfine-grained predictions. Without scaling up the training data or model\ncapacity, Lotus achieves SoTA performance in zero-shot depth and normal\nestimation across various datasets. It also enhances efficiency, being\nsignificantly faster than most existing diffusion-based methods. Lotus'\nsuperior quality and efficiency also enable a wide range of practical\napplications, such as joint estimation, single/multi-view 3D reconstruction,\netc. Project page: https://lotus3d.github.io/.\n","authors":["Jing He","Haodong Li","Wei Yin","Yixun Liang","Leheng Li","Kaiqiang Zhou","Hongbo Zhang","Bingbing Liu","Ying-Cong Chen"],"pdf_url":"https://arxiv.org/pdf/2409.18124v2.pdf","comment":"The first two authors contributed equally. 
Project page:\n https://lotus3d.github.io/"},{"id":"http://arxiv.org/abs/2410.02055v1","updated":"2024-10-02T22:05:30Z","published":"2024-10-02T22:05:30Z","title":"Using Style Ambiguity Loss to Improve Aesthetics of Diffusion Models","summary":" Teaching text-to-image models to be creative involves using style ambiguity\nloss. In this work, we explore using the style ambiguity training objective,\nused to approximate creativity, on a diffusion model. We then experiment with\nforms of style ambiguity loss that do not require training a classifier or a\nlabeled dataset, and find that the models trained with style ambiguity loss can\ngenerate better images than the baseline diffusion models and GANs. Code is\navailable at https://github.com/jamesBaker361/clipcreate.\n","authors":["James Baker"],"pdf_url":"https://arxiv.org/pdf/2410.02055v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2407.12009"},{"id":"http://arxiv.org/abs/2303.12001v3","updated":"2024-10-02T21:58:04Z","published":"2023-03-21T16:33:40Z","title":"ViC-MAE: Self-Supervised Representation Learning from Images and Video\n with Contrastive Masked Autoencoders","summary":" We propose ViC-MAE, a model that combines both Masked AutoEncoders (MAE) and\ncontrastive learning. ViC-MAE is trained using a global featured obtained by\npooling the local representations learned under an MAE reconstruction loss and\nleveraging this representation under a contrastive objective across images and\nvideo frames. We show that visual representations learned under ViC-MAE\ngeneralize well to both video and image classification tasks. Particularly,\nViC-MAE obtains state-of-the-art transfer learning performance from video to\nimages on Imagenet-1k compared to the recently proposed OmniMAE by achieving a\ntop-1 accuracy of 86% (+1.3% absolute improvement) when trained on the same\ndata and 87.1% (+2.4% absolute improvement) when training on extra data. At the\nsame time ViC-MAE outperforms most other methods on video benchmarks by\nobtaining 75.9% top-1 accuracy on the challenging Something something-v2 video\nbenchmark . When training on videos and images from a diverse combination of\ndatasets, our method maintains a balanced transfer-learning performance between\nvideo and image classification benchmarks, coming only as a close second to the\nbest supervised method.\n","authors":["Jefferson Hernandez","Ruben Villegas","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2303.12001v3.pdf","comment":"Published at ECCV 2024"},{"id":"http://arxiv.org/abs/2409.19291v2","updated":"2024-10-02T21:50:58Z","published":"2024-09-28T09:28:51Z","title":"CLIP-MoE: Towards Building Mixture of Experts for CLIP with Diversified\n Multiplet Upcycling","summary":" In recent years, Contrastive Language-Image Pre-training (CLIP) has become a\ncornerstone in multimodal intelligence. However, recent studies have identified\nthat the information loss in the CLIP encoding process is substantial, and CLIP\ntends to capture only coarse-grained features from the input. This deficiency\nsignificantly limits the ability of a single CLIP model to handle images rich\nin visual detail. In this work, we propose a simple yet effective\nmodel-agnostic strategy, Diversified Multiplet Upcycling (DMU), for CLIP. DMU\nefficiently fine-tunes a series of CLIP models that capture different feature\nspaces, from a dense pre-trained CLIP checkpoint, sharing parameters except for\nthe Feed-Forward Network (FFN). 
These models can then be transformed into a\nCLIP-MoE with a larger model capacity, leading to significantly enhanced\nperformance with minimal computational overhead. To the best of our knowledge,\nDiversified Multiplet Upcycling is the first approach to introduce sparsely\nactivated MoE into CLIP foundation models. Extensive experiments demonstrate\nthe significant performance of CLIP-MoE across various zero-shot retrieval,\nzero-shot image classification tasks, and downstream Multimodal Large Language\nModel (MLLM) benchmarks by serving as a vision encoder. Furthermore,\nDiversified Multiplet Upcycling enables the conversion of any dense CLIP model\ninto CLIP-MoEs, which can seamlessly replace CLIP in a plug-and-play manner\nwithout requiring further adaptation in downstream frameworks. Through\nDiversified Multiplet Upcycling, we aim to provide valuable insights for future\nresearch on developing more efficient and effective multimodal learning\nsystems.\n","authors":["Jihai Zhang","Xiaoye Qu","Tong Zhu","Yu Cheng"],"pdf_url":"https://arxiv.org/pdf/2409.19291v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02052v1","updated":"2024-10-02T21:42:35Z","published":"2024-10-02T21:42:35Z","title":"Improving Autonomous AI Agents with Reflective Tree Search and\n Self-Learning","summary":" Autonomous agents have demonstrated significant potential in automating\ncomplex multistep decision-making tasks. However, even state-of-the-art\nvision-language models (VLMs), such as GPT-4o, still fall short of human-level\nperformance, particularly in intricate web environments and long-horizon\nplanning tasks. To address these limitations, we introduce Reflective Monte\nCarlo Tree Search (R-MCTS), a novel test-time algorithm designed to enhance the\nability of AI agents, e.g., powered by GPT-4o, to explore decision space on the\nfly. R-MCTS extends traditional MCTS by 1) incorporating contrastive\nreflection, allowing agents to learn from past interactions and dynamically\nimprove their search efficiency; and 2) using multi-agent debate to provide\nreliable state evaluation. Moreover, we improve the agent's performance by\nfine-tuning GPT-4o through self-learning, using R-MCTS generated tree\ntraversals without any human-provided labels. On the challenging VisualWebArena\nbenchmark, our GPT-4o-based R-MCTS agent achieves a 6% to 30% relative\nimprovement across various tasks compared to the previous state-of-the-art.\nAdditionally, we show that the knowledge gained from test-time search can be\neffectively transferred back to GPT-4o via fine-tuning. The fine-tuned GPT-4o\nmatches 97% of R-MCTS's performance while reducing compute usage by a factor of\nfour at test time. Furthermore, qualitative results reveal that the fine-tuned\nGPT-4o model demonstrates the ability to explore the environment, evaluate a\nstate, and backtrack to viable ones when it detects that the current state\ncannot lead to success. 
Moreover, our work demonstrates the compute scaling\nproperties in both training - data collection with R-MCTS - and testing time.\nThese results suggest a promising research direction to enhance VLMs' reasoning\nand planning capabilities for agentic applications via test-time search and\nself-learning.\n","authors":["Xiao Yu","Baolin Peng","Vineeth Vajipey","Hao Cheng","Michel Galley","Jianfeng Gao","Zhou Yu"],"pdf_url":"https://arxiv.org/pdf/2410.02052v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2410.02074v1","updated":"2024-10-02T22:46:51Z","published":"2024-10-02T22:46:51Z","title":"Price-guided user attention in large-scale E-commerce group\n recommendation","summary":" Existing group recommender systems utilize attention mechanisms to identify\ncritical users who influence group decisions the most. We analyzed user\nattention scores from a widely-used group recommendation model on a real-world\nE-commerce dataset and found that item price and user interaction history\nsignificantly influence the selection of critical users. When item prices are\nlow, users with extensive interaction histories are more influential in group\ndecision-making. Conversely, their influence diminishes with higher item\nprices. Based on these observations, we propose a novel group recommendation\napproach that incorporates item price as a guiding factor for user aggregation.\nOur model employs an adaptive sigmoid function to adjust output logits based on\nitem prices, enhancing the accuracy of user aggregation. Our model can be\nplugged into any attention-based group recommender system if the price\ninformation is available. We evaluate our model's performance on a public\nbenchmark and a real-world dataset. We compare it with other state-of-the-art\ngroup recommendation methods. Our results demonstrate that our price-guided\nuser attention approach outperforms the state-of-the-art methods in terms of\nhit ratio and mean square error.\n","authors":["Yang Shi","Young-joo Chung"],"pdf_url":"https://arxiv.org/pdf/2410.02074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00026v5","updated":"2024-10-02T20:45:53Z","published":"2024-03-20T21:02:16Z","title":"Ink and Individuality: Crafting a Personalised Narrative in the Age of\n LLMs","summary":" Individuality and personalization comprise the distinctive characteristics\nthat make each writer unique and influence their words in order to effectively\nengage readers while conveying authenticity. However, our growing reliance on\nLLM-based writing assistants risks compromising our creativity and\nindividuality over time. We often overlook the negative impacts of this trend\non our creativity and uniqueness, despite the possible consequences. This study\ninvestigates these concerns by performing a brief survey to explore different\nperspectives and concepts, as well as trying to understand people's viewpoints,\nin conjunction with past studies in the area. Addressing these issues is\nessential for improving human-computer interaction systems and enhancing\nwriting assistants for personalization and individuality.\n","authors":["Azmine Toushik Wasi","Raima Islam","Mst Rafia Islam"],"pdf_url":"https://arxiv.org/pdf/2404.00026v5.pdf","comment":"5 Pages, 4 Figures. 
Accepted in The Third Workshop on Intelligent and\n Interactive Writing Assistants at CHI 2024"},{"id":"http://arxiv.org/abs/2410.01987v1","updated":"2024-10-02T19:48:17Z","published":"2024-10-02T19:48:17Z","title":"Financial Sentiment Analysis on News and Reports Using Large Language\n Models and FinBERT","summary":" Financial sentiment analysis (FSA) is crucial for evaluating market sentiment\nand making well-informed financial decisions. The advent of large language\nmodels (LLMs) such as BERT and its financial variant, FinBERT, has notably\nenhanced sentiment analysis capabilities. This paper investigates the\napplication of LLMs and FinBERT for FSA, comparing their performance on news\narticles, financial reports and company announcements. The study emphasizes the\nadvantages of prompt engineering with zero-shot and few-shot strategy to\nimprove sentiment classification accuracy. Experimental results indicate that\nGPT-4o, with few-shot examples of financial texts, can be as competent as a\nwell fine-tuned FinBERT in this specialized field.\n","authors":["Yanxin Shen","Pulin Kirin Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.01987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16627v2","updated":"2024-10-02T15:57:50Z","published":"2024-09-25T05:12:07Z","title":"Train Once, Deploy Anywhere: Matryoshka Representation Learning for\n Multimodal Recommendation","summary":" Despite recent advancements in language and vision modeling, integrating rich\nmultimodal knowledge into recommender systems continues to pose significant\nchallenges. This is primarily due to the need for efficient recommendation,\nwhich requires adaptive and interactive responses. In this study, we focus on\nsequential recommendation and introduce a lightweight framework called\nfull-scale Matryoshka representation learning for multimodal recommendation\n(fMRLRec). Our fMRLRec captures item features at different granularities,\nlearning informative representations for efficient recommendation across\nmultiple dimensions. To integrate item features from diverse modalities,\nfMRLRec employs a simple mapping to project multimodal item features into an\naligned feature space. Additionally, we design an efficient linear\ntransformation that embeds smaller features into larger ones, substantially\nreducing memory requirements for large-scale training on recommendation data.\nCombined with improved state space modeling techniques, fMRLRec scales to\ndifferent dimensions and only requires one-time training to produce multiple\nmodels tailored to various granularities. We demonstrate the effectiveness and\nefficiency of fMRLRec on multiple benchmark datasets, which consistently\nachieves superior performance over state-of-the-art baseline methods. 
We make\nour code and data publicly available at https://github.com/yueqirex/fMRLRec.\n","authors":["Yueqi Wang","Zhenrui Yue","Huimin Zeng","Dong Wang","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2409.16627v2.pdf","comment":"Accepted to EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2409.04701v2","updated":"2024-10-02T15:07:09Z","published":"2024-09-07T03:54:46Z","title":"Late Chunking: Contextual Chunk Embeddings Using Long-Context Embedding\n Models","summary":" Many use cases require retrieving smaller portions of text, and dense\nvector-based retrieval systems often perform better with shorter text segments,\nas the semantics are less likely to be over-compressed in the embeddings.\nConsequently, practitioners often split text documents into smaller chunks and\nencode them separately. However, chunk embeddings created in this way can lose\ncontextual information from surrounding chunks, resulting in sub-optimal\nrepresentations. In this paper, we introduce a novel method called late\nchunking, which leverages long context embedding models to first embed all\ntokens of the long text, with chunking applied after the transformer model and\njust before mean pooling - hence the term late in its naming. The resulting\nchunk embeddings capture the full contextual information, leading to superior\nresults across various retrieval tasks. The method is generic enough to be\napplied to a wide range of long-context embedding models and works without\nadditional training. To further increase the effectiveness of late chunking, we\npropose a dedicated fine-tuning approach for embedding models.\n","authors":["Michael Günther","Isabelle Mohr","Daniel James Williams","Bo Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.04701v2.pdf","comment":"11 pages, 3rd draft"},{"id":"http://arxiv.org/abs/2410.01598v1","updated":"2024-10-02T14:36:18Z","published":"2024-10-02T14:36:18Z","title":"Elaborative Subtopic Query Reformulation for Broad and Indirect Queries\n in Travel Destination Recommendation","summary":" In Query-driven Travel Recommender Systems (RSs), it is crucial to understand\nthe user intent behind challenging natural language(NL) destination queries\nsuch as the broadly worded \"youth-friendly activities\" or the indirect\ndescription \"a high school graduation trip\". Such queries are challenging due\nto the wide scope and subtlety of potential user intents that confound the\nability of retrieval methods to infer relevant destinations from available\ntextual descriptions such as WikiVoyage. While query reformulation (QR) has\nproven effective in enhancing retrieval by addressing user intent, existing QR\nmethods tend to focus only on expanding the range of potentially matching query\nsubtopics (breadth) or elaborating on the potential meaning of a query (depth),\nbut not both. In this paper, we introduce Elaborative Subtopic Query\nReformulation (EQR), a large language model-based QR method that combines both\nbreadth and depth by generating potential query subtopics with information-rich\nelaborations. We also release TravelDest, a novel dataset for query-driven\ntravel destination RSs. 
Experiments on TravelDest show that EQR achieves\nsignificant improvements in recall and precision over existing state-of-the-art\nQR methods.\n","authors":["Qianfeng Wen","Yifan Liu","Joshua Zhang","George Saad","Anton Korikov","Yury Sambale","Scott Sanner"],"pdf_url":"https://arxiv.org/pdf/2410.01598v1.pdf","comment":"9 pages, 7 figures,The 1st Workshop on Risks, Opportunities, and\n Evaluation of Generative Models in Recommender Systems (ROEGEN@RecSys 2024),\n October 2024, Bari, Italy"},{"id":"http://arxiv.org/abs/2409.13385v2","updated":"2024-10-02T14:30:28Z","published":"2024-09-20T10:36:49Z","title":"Contextual Compression in Retrieval-Augmented Generation for Large\n Language Models: A Survey","summary":" Large Language Models (LLMs) showcase remarkable abilities, yet they struggle\nwith limitations such as hallucinations, outdated knowledge, opacity, and\ninexplicable reasoning. To address these challenges, Retrieval-Augmented\nGeneration (RAG) has proven to be a viable solution, leveraging external\ndatabases to improve the consistency and coherence of generated content,\nespecially valuable for complex, knowledge-rich tasks, and facilitates\ncontinuous improvement by leveraging domain-specific insights. By combining the\nintrinsic knowledge of LLMs with the vast, dynamic repositories of external\ndatabases, RAG achieves a synergistic effect. However, RAG is not without its\nlimitations, including a limited context window, irrelevant information, and\nthe high processing overhead for extensive contextual data. In this\ncomprehensive work, we explore the evolution of Contextual Compression\nparadigms, providing an in-depth examination of the field. Finally, we outline\nthe current challenges and suggest potential research and development\ndirections, paving the way for future advancements in this area.\n","authors":["Sourav Verma"],"pdf_url":"https://arxiv.org/pdf/2409.13385v2.pdf","comment":"Ongoing Work"},{"id":"http://arxiv.org/abs/2407.04528v2","updated":"2024-10-02T12:38:39Z","published":"2024-07-05T14:16:47Z","title":"GPT vs RETRO: Exploring the Intersection of Retrieval and\n Parameter-Efficient Fine-Tuning","summary":" Parameter-Efficient Fine-Tuning (PEFT) and Retrieval-Augmented Generation\n(RAG) have become popular methods for adapting large language models while\nminimizing compute requirements. In this paper, we apply PEFT methods\n(P-tuning, Adapters, and LoRA) to a modified Retrieval-Enhanced Transformer\n(RETRO) and a baseline GPT model across several sizes, ranging from 823 million\nto 48 billion parameters. We show that RETRO models outperform GPT models in\nzero-shot settings due to their unique pre-training process but GPT models have\nhigher performance potential with PEFT. Additionally, our study indicates that\n8B parameter models strike an optimal balance between cost and performance and\nP-tuning lags behind other PEFT techniques. We further provide a comparative\nanalysis between applying PEFT to an Instruction-tuned RETRO model and base\nRETRO model. 
This work presents the first comprehensive comparison of various\nPEFT methods integrated with RAG, applied to both GPT and RETRO models,\nhighlighting their relative performance.\n","authors":["Aleksander Ficek","Jiaqi Zeng","Oleksii Kuchaiev"],"pdf_url":"https://arxiv.org/pdf/2407.04528v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.01470v1","updated":"2024-10-02T12:21:31Z","published":"2024-10-02T12:21:31Z","title":"Peeling Back the Layers: An In-Depth Evaluation of Encoder Architectures\n in Neural News Recommenders","summary":" Encoder architectures play a pivotal role in neural news recommenders by\nembedding the semantic and contextual information of news and users. Thus,\nresearch has heavily focused on enhancing the representational capabilities of\nnews and user encoders to improve recommender performance. Despite the\nsignificant impact of encoder architectures on the quality of news and user\nrepresentations, existing analyses of encoder designs focus only on the overall\ndownstream recommendation performance. This offers a one-sided assessment of\nthe encoders' similarity, ignoring more nuanced differences in their behavior,\nand potentially resulting in sub-optimal model selection. In this work, we\nperform a comprehensive analysis of encoder architectures in neural news\nrecommender systems. We systematically evaluate the most prominent news and\nuser encoder architectures, focusing on their (i) representational similarity,\nmeasured with the Central Kernel Alignment, (ii) overlap of generated\nrecommendation lists, quantified with the Jaccard similarity, and (iii) the\noverall recommendation performance. Our analysis reveals that the complexity of\ncertain encoding techniques is often empirically unjustified, highlighting the\npotential for simpler, more efficient architectures. By isolating the effects\nof individual components, we provide valuable insights for researchers and\npractitioners to make better informed decisions about encoder selection and\navoid unnecessary complexity in the design of news recommenders.\n","authors":["Andreea Iana","Goran Glavaš","Heiko Paulheim"],"pdf_url":"https://arxiv.org/pdf/2410.01470v1.pdf","comment":"Accepted at the 12th International Workshop on News Recommendation\n and Analytics (INRA 2024) in conjunction with ACM RecSys 2024"},{"id":"http://arxiv.org/abs/2410.01448v1","updated":"2024-10-02T11:59:58Z","published":"2024-10-02T11:59:58Z","title":"Analyzing Byte-Pair Encoding on Monophonic and Polyphonic Symbolic\n Music: A Focus on Musical Phrase Segmentation","summary":" Byte-Pair Encoding (BPE) is an algorithm commonly used in Natural Language\nProcessing to build a vocabulary of subwords, which has been recently applied\nto symbolic music. Given that symbolic music can differ significantly from\ntext, particularly with polyphony, we investigate how BPE behaves with\ndifferent types of musical content. This study provides a qualitative analysis\nof BPE's behavior across various instrumentations and evaluates its impact on a\nmusical phrase segmentation task for both monophonic and polyphonic music. Our\nfindings show that the BPE training process is highly dependent on the\ninstrumentation and that BPE \"supertokens\" succeed in capturing abstract\nmusical content. 
In a musical phrase segmentation task, BPE notably improves\nperformance in a polyphonic setting, but enhances performance in monophonic\ntunes only within a specific range of BPE merges.\n","authors":["Dinh-Viet-Toan Le","Louis Bigo","Mikaela Keller"],"pdf_url":"https://arxiv.org/pdf/2410.01448v1.pdf","comment":"Accepted to 3rd Workshop on NLP for Music and Audio (NLP4MusA,\n co-located with ISMIR 2024)"},{"id":"http://arxiv.org/abs/2410.01396v1","updated":"2024-10-02T10:16:54Z","published":"2024-10-02T10:16:54Z","title":"Can We Delegate Learning to Automation?: A Comparative Study of LLM\n Chatbots, Search Engines, and Books","summary":" Learning is a key motivator behind information search behavior. With the\nemergence of LLM-based chatbots, students are increasingly turning to these\ntools as their primary resource for acquiring knowledge. However, the\ntransition from traditional resources like textbooks and web searches raises\nconcerns among educators. They worry that these fully-automated LLMs might lead\nstudents to delegate critical steps of search as learning. In this paper, we\nsystematically uncover three main concerns from educators' perspectives. In\nresponse to these concerns, we conducted a mixed-methods study with 92\nuniversity students to compare three learning sources with different automation\nlevels. Our results show that LLMs support comprehensive understanding of key\nconcepts without promoting passive learning, though their effectiveness in\nknowledge retention was limited. Additionally, we found that academic\nperformance impacted both learning outcomes and search patterns. Notably,\nhigher-competence learners engaged more deeply with content through\nreading-intensive behaviors rather than relying on search activities.\n","authors":["Yeonsun Yang","Ahyeon Shin","Mincheol Kang","Jiheon Kang","Jean Young Song"],"pdf_url":"https://arxiv.org/pdf/2410.01396v1.pdf","comment":"21 pages, 14 figures"},{"id":"http://arxiv.org/abs/2410.01383v1","updated":"2024-10-02T09:51:42Z","published":"2024-10-02T09:51:42Z","title":"PairDistill: Pairwise Relevance Distillation for Dense Retrieval","summary":" Effective information retrieval (IR) from vast datasets relies on advanced\ntechniques to extract relevant information in response to queries. Recent\nadvancements in dense retrieval have showcased remarkable efficacy compared to\ntraditional sparse retrieval methods. To further enhance retrieval performance,\nknowledge distillation techniques, often leveraging robust cross-encoder\nrerankers, have been extensively explored. However, existing approaches\nprimarily distill knowledge from pointwise rerankers, which assign absolute\nrelevance scores to documents, thus facing challenges related to inconsistent\ncomparisons. This paper introduces Pairwise Relevance Distillation\n(PairDistill) to leverage pairwise reranking, offering fine-grained\ndistinctions between similarly relevant documents to enrich the training of\ndense retrieval models. Our experiments demonstrate that PairDistill\noutperforms existing methods, achieving new state-of-the-art results across\nmultiple benchmarks. This highlights the potential of PairDistill in advancing\ndense retrieval techniques effectively. 
Our source code and trained models are\nreleased at https://github.com/MiuLab/PairDistill\n","authors":["Chao-Wei Huang","Yun-Nung Chen"],"pdf_url":"https://arxiv.org/pdf/2410.01383v1.pdf","comment":"Accepted to EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2402.16508v3","updated":"2024-10-02T07:51:47Z","published":"2024-02-26T11:42:29Z","title":"Pre-training Cross-lingual Open Domain Question Answering with\n Large-scale Synthetic Supervision","summary":" Cross-lingual open domain question answering (CLQA) is a complex problem,\ncomprising cross-lingual retrieval from a multilingual knowledge base, followed\nby answer generation in the query language. Both steps are usually tackled by\nseparate models, requiring substantial annotated datasets, and typically\nauxiliary resources, like machine translation systems to bridge between\nlanguages. In this paper, we show that CLQA can be addressed using a single\nencoder-decoder model. To effectively train this model, we propose a\nself-supervised method based on exploiting the cross-lingual link structure\nwithin Wikipedia. We demonstrate how linked Wikipedia pages can be used to\nsynthesise supervisory signals for cross-lingual retrieval, through a form of\ncloze query, and generate more natural questions to supervise answer\ngeneration. Together, we show our approach, \\texttt{CLASS}, outperforms\ncomparable methods on both supervised and zero-shot language adaptation\nsettings, including those using machine translation.\n","authors":["Fan Jiang","Tom Drummond","Trevor Cohn"],"pdf_url":"https://arxiv.org/pdf/2402.16508v3.pdf","comment":"EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2409.13621v2","updated":"2024-10-02T06:14:17Z","published":"2024-09-20T16:32:54Z","title":"Advancing Event Causality Identification via Heuristic Semantic\n Dependency Inquiry Network","summary":" Event Causality Identification (ECI) focuses on extracting causal relations\nbetween events in texts. Existing methods for ECI primarily rely on causal\nfeatures and external knowledge. However, these approaches fall short in two\ndimensions: (1) causal features between events in a text often lack explicit\nclues, and (2) external knowledge may introduce bias, while specific problems\nrequire tailored analyses. To address these issues, we propose SemDI - a simple\nand effective Semantic Dependency Inquiry Network for ECI. SemDI captures\nsemantic dependencies within the context using a unified encoder. Then, it\nutilizes a Cloze Analyzer to generate a fill-in token based on comprehensive\ncontext understanding. Finally, this fill-in token is used to inquire about the\ncausal relation between two events. Extensive experiments demonstrate the\neffectiveness of SemDI, surpassing state-of-the-art methods on three widely\nused benchmarks. Code is available at https://github.com/hrlics/SemDI.\n","authors":["Haoran Li","Qiang Gao","Hongmei Wu","Li Huang"],"pdf_url":"https://arxiv.org/pdf/2409.13621v2.pdf","comment":"EMNLP 2024 camera-ready version. Code is released at\n https://github.com/hrlics/SemDI"},{"id":"http://arxiv.org/abs/2402.11202v2","updated":"2024-10-02T05:39:46Z","published":"2024-02-17T05:36:13Z","title":"Towards Scalability and Extensibility of Query Reformulation Modeling in\n E-commerce Search","summary":" Customer behavioral data significantly impacts e-commerce search systems.\nHowever, in the case of less common queries, the associated behavioral data\ntends to be sparse and noisy, offering inadequate support to the search\nmechanism. 
To address this challenge, the concept of query reformulation has\nbeen introduced. It suggests that less common queries could utilize the\nbehavior patterns of their popular counterparts with similar meanings. In\nAmazon product search, query reformulation has displayed its effectiveness in\nimproving search relevance and bolstering overall revenue. Nonetheless,\nadapting this method for smaller or emerging businesses operating in regions\nwith lower traffic and complex multilingual settings poses the challenge in\nterms of scalability and extensibility. This study focuses on overcoming this\nchallenge by constructing a query reformulation solution capable of functioning\neffectively, even when faced with limited training data, in terms of quality\nand scale, along with relatively complex linguistic characteristics. In this\npaper we provide an overview of the solution implemented within Amazon product\nsearch infrastructure, which encompasses a range of elements, including\nrefining the data mining process, redefining model training objectives, and\nreshaping training strategies. The effectiveness of the proposed solution is\nvalidated through online A/B testing on search ranking and Ads matching.\nNotably, employing the proposed solution in search ranking resulted in 0.14%\nand 0.29% increase in overall revenue in Japanese and Hindi cases,\nrespectively, and a 0.08% incremental gain in the English case compared to the\nlegacy implementation; while in search Ads matching led to a 0.36% increase in\nAds revenue in the Japanese case.\n","authors":["Ziqi Zhang","Yupin Huang","Quan Deng","Jinghui Xiao","Vivek Mittal","Jingyuan Deng"],"pdf_url":"https://arxiv.org/pdf/2402.11202v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05152v2","updated":"2024-10-02T05:02:02Z","published":"2024-09-08T16:35:19Z","title":"OneGen: Efficient One-Pass Unified Generation and Retrieval for LLMs","summary":" Despite the recent advancements in Large Language Models (LLMs), which have\nsignificantly enhanced the generative capabilities for various NLP tasks, LLMs\nstill face limitations in directly handling retrieval tasks. However, many\npractical applications demand the seamless integration of both retrieval and\ngeneration. This paper introduces a novel and efficient One-pass Generation and\nretrieval framework (OneGen), designed to improve LLMs' performance on tasks\nthat require both generation and retrieval. The proposed framework bridges the\ntraditionally separate training approaches for generation and retrieval by\nincorporating retrieval tokens generated autoregressively. This enables a\nsingle LLM to handle both tasks simultaneously in a unified forward pass. We\nconduct experiments on two distinct types of composite tasks, RAG and Entity\nLinking, to validate the pluggability, effectiveness, and efficiency of OneGen\nin training and inference. Furthermore, our results show that integrating\ngeneration and retrieval within the same context preserves the generative\ncapabilities of LLMs while improving retrieval performance. 
To the best of our\nknowledge, OneGen is the first to enable LLMs to conduct vector retrieval\nduring the generation.\n","authors":["Jintian Zhang","Cheng Peng","Mengshu Sun","Xiang Chen","Lei Liang","Zhiqiang Zhang","Jun Zhou","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05152v2.pdf","comment":"EMNLP 2024 Findings; code is available at\n https://github.com/zjunlp/OneGen"},{"id":"http://arxiv.org/abs/2401.05967v3","updated":"2024-10-02T04:17:36Z","published":"2024-01-11T15:13:00Z","title":"Block-Diagonal Orthogonal Relation and Matrix Entity for Knowledge Graph\n Embedding","summary":" The primary aim of Knowledge Graph embeddings (KGE) is to learn\nlow-dimensional representations of entities and relations for predicting\nmissing facts. While rotation-based methods like RotatE and QuatE perform well\nin KGE, they face two challenges: limited model flexibility requiring\nproportional increases in relation size with entity dimension, and difficulties\nin generalizing the model for higher-dimensional rotations. To address these\nissues, we introduce OrthogonalE, a novel KGE model employing matrices for\nentities and block-diagonal orthogonal matrices with Riemannian optimization\nfor relations. This approach enhances the generality and flexibility of KGE\nmodels. The experimental results indicate that our new KGE model, OrthogonalE,\nis both general and flexible, significantly outperforming state-of-the-art KGE\nmodels while substantially reducing the number of relation parameters.\n","authors":["Yihua Zhu","Hidetoshi Shimodaira"],"pdf_url":"https://arxiv.org/pdf/2401.05967v3.pdf","comment":"EMNLP2024 findings (Long)"},{"id":"http://arxiv.org/abs/2410.01190v1","updated":"2024-10-02T02:51:02Z","published":"2024-10-02T02:51:02Z","title":"Integrating Visual and Textual Inputs for Searching Large-Scale Map\n Collections with CLIP","summary":" Despite the prevalence and historical importance of maps in digital\ncollections, current methods of navigating and exploring map collections are\nlargely restricted to catalog records and structured metadata. In this paper,\nwe explore the potential for interactively searching large-scale map\ncollections using natural language inputs (\"maps with sea monsters\"), visual\ninputs (i.e., reverse image search), and multimodal inputs (an example map +\n\"more grayscale\"). As a case study, we adopt 562,842 images of maps publicly\naccessible via the Library of Congress's API. To accomplish this, we use the\nmulitmodal Contrastive Language-Image Pre-training (CLIP) machine learning\nmodel to generate embeddings for these maps, and we develop code to implement\nexploratory search capabilities with these input strategies. We present results\nfor example searches created in consultation with staff in the Library of\nCongress's Geography and Map Division and describe the strengths, weaknesses,\nand possibilities for these search queries. Moreover, we introduce a\nfine-tuning dataset of 10,504 map-caption pairs, along with an architecture for\nfine-tuning a CLIP model on this dataset. To facilitate re-use, we provide all\nof our code in documented, interactive Jupyter notebooks and place all code\ninto the public domain. 
Lastly, we discuss the opportunities and challenges for\napplying these approaches across both digitized and born-digital collections\nheld by galleries, libraries, archives, and museums.\n","authors":["Jamie Mahowald","Benjamin Charles Germain Lee"],"pdf_url":"https://arxiv.org/pdf/2410.01190v1.pdf","comment":"18 pages, 7 figures, accepted at the Computational Humanities\n Research Conference (CHR 2024)"},{"id":"http://arxiv.org/abs/2410.01160v1","updated":"2024-10-02T01:29:49Z","published":"2024-10-02T01:29:49Z","title":"GraphRevisedIE: Multimodal Information Extraction with Graph-Revised\n Network","summary":" Key information extraction (KIE) from visually rich documents (VRD) has been\na challenging task in document intelligence because of not only the complicated\nand diverse layouts of VRD that make the model hard to generalize but also the\nlack of methods to exploit the multimodal features in VRD. In this paper, we\npropose a light-weight model named GraphRevisedIE that effectively embeds\nmultimodal features such as textual, visual, and layout features from VRD and\nleverages graph revision and graph convolution to enrich the multimodal\nembedding with global context. Extensive experiments on multiple real-world\ndatasets show that GraphRevisedIE generalizes to documents of varied layouts\nand achieves comparable or better performance compared to previous KIE methods.\nWe also publish a business license dataset that contains both real-life and\nsynthesized documents to facilitate research of document KIE.\n","authors":["Panfeng Cao","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2410.01160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01154v1","updated":"2024-10-02T01:12:54Z","published":"2024-10-02T01:12:54Z","title":"Unleashing the Power of Large Language Models in Zero-shot Relation\n Extraction via Self-Prompting","summary":" Recent research in zero-shot Relation Extraction (RE) has focused on using\nLarge Language Models (LLMs) due to their impressive zero-shot capabilities.\nHowever, current methods often perform suboptimally, mainly due to a lack of\ndetailed, context-specific prompts needed for understanding various sentences\nand relations. To address this, we introduce the Self-Prompting framework, a\nnovel method designed to fully harness the embedded RE knowledge within LLMs.\nSpecifically, our framework employs a three-stage diversity approach to prompt\nLLMs, generating multiple synthetic samples that encapsulate specific relations\nfrom scratch. These generated samples act as in-context learning samples,\noffering explicit and context-specific guidance to efficiently prompt LLMs for\nRE. Experimental evaluations on benchmark datasets show our approach\noutperforms existing LLM-based zero-shot RE methods. Additionally, our\nexperiments confirm the effectiveness of our generation pipeline in producing\nhigh-quality synthetic data that enhances performance.\n","authors":["Siyi Liu","Yang Li","Jiang Li","Shan Yang","Yunshi Lan"],"pdf_url":"https://arxiv.org/pdf/2410.01154v1.pdf","comment":"EMNLP 2024 Short"}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.01906v1","updated":"2024-10-02T18:05:03Z","published":"2024-10-02T18:05:03Z","title":"Social Media Authentication and Combating Deepfakes using Semi-fragile\n Invisible Image Watermarking","summary":" With the significant advances in deep generative models for image and video\nsynthesis, Deepfakes and manipulated media have raised severe societal\nconcerns. 
Conventional machine learning classifiers for deepfake detection\noften fail to cope with evolving deepfake generation technology and are\nsusceptible to adversarial attacks. Alternatively, invisible image watermarking\nis being researched as a proactive defense technique that allows media\nauthentication by verifying an invisible secret message embedded in the image\npixels. A handful of invisible image watermarking techniques introduced for\nmedia authentication have proven vulnerable to basic image processing\noperations and watermark removal attacks. In response, we have proposed a\nsemi-fragile image watermarking technique that embeds an invisible secret\nmessage into real images for media authentication. Our proposed watermarking\nframework is designed to be fragile to facial manipulations or tampering while\nbeing robust to benign image-processing operations and watermark removal\nattacks. This is facilitated through a unique architecture of our proposed\ntechnique consisting of critic and adversarial networks that enforce high image\nquality and resiliency to watermark removal efforts, respectively, along with\nthe backbone encoder-decoder and the discriminator networks. Thorough\nexperimental investigations on SOTA facial Deepfake datasets demonstrate that\nour proposed model can embed a $64$-bit secret as an imperceptible image\nwatermark that can be recovered with a high-bit recovery accuracy when benign\nimage processing operations are applied while being non-recoverable when unseen\nDeepfake manipulations are applied. In addition, our proposed watermarking\ntechnique demonstrates high resilience to several white-box and black-box\nwatermark removal attacks. Thus, obtaining state-of-the-art performance.\n","authors":["Aakash Varma Nadimpalli","Ajita Rattani"],"pdf_url":"https://arxiv.org/pdf/2410.01906v1.pdf","comment":"ACM Transactions (Digital Threats: Research and Practice)"},{"id":"http://arxiv.org/abs/2303.08336v3","updated":"2024-10-02T18:01:46Z","published":"2023-03-15T02:54:27Z","title":"Progressive Frame Patching for FoV-based Point Cloud Video Streaming","summary":" Many XR applications require the delivery of volumetric video to users with\nsix degrees of freedom (6-DoF) movements. Point Cloud has become a popular\nvolumetric video format. A dense point cloud consumes much higher bandwidth\nthan a 2D/360 degree video frame. User Field of View (FoV) is more dynamic with\n6-DoF movement than 3-DoF movement. To save bandwidth, FoV-adaptive streaming\npredicts a user's FoV and only downloads point cloud data falling in the\npredicted FoV. However, it is vulnerable to FoV prediction errors, which can be\nsignificant when a long buffer is utilized for smoothed streaming. In this\nwork, we propose a multi-round progressive refinement framework for point cloud\nvideo streaming. Instead of sequentially downloading point cloud frames, our\nsolution simultaneously downloads/patches multiple frames falling into a\nsliding time-window, leveraging the inherent scalability of octree-based\npoint-cloud coding. The optimal rate allocation among all tiles of active\nframes are solved analytically using the heterogeneous tile rate-quality\nfunctions calibrated by the predicted user FoV. Multi-frame\ndownloading/patching simultaneously takes advantage of the streaming smoothness\nresulting from long buffer and the FoV prediction accuracy at short buffer\nlength. 
We evaluate our streaming solution using simulations driven by real\npoint cloud videos, real bandwidth traces, and 6-DoF FoV traces of real users.\nOur solution is robust against the bandwidth/FoV prediction errors, and can\ndeliver high and smooth view quality in the face of bandwidth variations and\ndynamic user and point cloud movements.\n","authors":["Tongyu Zong","Yixiang Mao","Chen Li","Yong Liu","Yao Wang"],"pdf_url":"https://arxiv.org/pdf/2303.08336v3.pdf","comment":"Transactions on Multimedia (under review)"},{"id":"http://arxiv.org/abs/2410.01737v1","updated":"2024-10-02T16:47:55Z","published":"2024-10-02T16:47:55Z","title":"RADAR: Robust Two-stage Modality-incomplete Industrial Anomaly Detection","summary":" Multimodal Industrial Anomaly Detection (MIAD), utilizing 3D point clouds and\n2D RGB images to identify the abnormal region of products, plays a crucial role\nin industrial quality inspection. However, the conventional MIAD setting\npresupposes that all 2D and 3D modalities are paired, overlooking the fact that\nmultimodal data collected from the real world is often imperfect due to missing\nmodalities. Consequently, MIAD models that demonstrate robustness against\nmodal-incomplete data are highly desirable in practice. To address this\npractical challenge, we introduce a first-of-its-kind study that\ncomprehensively investigates Modality-Incomplete Industrial Anomaly Detection\n(MIIAD), to consider the imperfect learning environment in which the multimodal\ninformation may be incomplete. Not surprisingly, we discovered that most\nexisting MIAD approaches are inadequate for addressing MIIAD challenges,\nleading to significant performance degradation on the MIIAD benchmark we\ndeveloped. In this paper, we propose a novel two-stage Robust\nmodAlity-imcomplete fusing and Detecting frAmewoRk, abbreviated as RADAR. Our\nbootstrapping philosophy is to enhance two stages in MIIAD, improving the\nrobustness of the Multimodal Transformer: i) In feature fusion, we first\nexplore learning modality-incomplete instruction, guiding the pre-trained\nMultimodal Transformer to robustly adapt to various modality-incomplete\nscenarios, and implement adaptive parameter learning based on a HyperNetwork;\nii) In anomaly detection, we construct a real-pseudo hybrid module to highlight\nthe distinctiveness of modality combinations, further enhancing the robustness\nof the MIIAD model. Our experimental results demonstrate that the proposed\nRADAR significantly surpasses conventional MIAD methods in terms of\neffectiveness and robustness on our newly created MIIAD dataset, underscoring\nits practical application value.\n","authors":["Bingchen Miao","Wenqiao Zhang","Juncheng Li","Siliang Tang","Zhaocheng Li","Haochen Shi","Jun Xiao","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2410.01737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13049v2","updated":"2024-10-02T13:04:02Z","published":"2024-09-19T18:55:13Z","title":"DiffSSD: A Diffusion-Based Dataset For Speech Forensics","summary":" Diffusion-based speech generators are ubiquitous. These methods can generate\nvery high quality synthetic speech and several recent incidents report their\nmalicious use. To counter such misuse, synthetic speech detectors have been\ndeveloped. Many of these detectors are trained on datasets which do not include\ndiffusion-based synthesizers. 
In this paper, we demonstrate that existing\ndetectors trained on one such dataset, ASVspoof2019, do not perform well in\ndetecting synthetic speech from recent diffusion-based synthesizers. We propose\nthe Diffusion-Based Synthetic Speech Dataset (DiffSSD), a dataset consisting of\nabout 200 hours of labeled speech, including synthetic speech generated by 8\ndiffusion-based open-source and 2 commercial generators. We also examine the\nperformance of existing synthetic speech detectors on DiffSSD in both\nclosed-set and open-set scenarios. The results highlight the importance of this\ndataset in detecting synthetic speech generated from recent open-source and\ncommercial speech generators.\n","authors":["Kratika Bhagtani","Amit Kumar Singh Yadav","Paolo Bestagini","Edward J. Delp"],"pdf_url":"https://arxiv.org/pdf/2409.13049v2.pdf","comment":"Submitted to IEEE International Conference on Acoustics, Speech, and\n Signal Processing (ICASSP) 2025"},{"id":"http://arxiv.org/abs/2410.01366v1","updated":"2024-10-02T09:28:21Z","published":"2024-10-02T09:28:21Z","title":"Harnessing the Latent Diffusion Model for Training-Free Image Style\n Transfer","summary":" Diffusion models have recently shown the ability to generate high-quality\nimages. However, controlling its generation process still poses challenges. The\nimage style transfer task is one of those challenges that transfers the visual\nattributes of a style image to another content image. Typical obstacle of this\ntask is the requirement of additional training of a pre-trained model. We\npropose a training-free style transfer algorithm, Style Tracking Reverse\nDiffusion Process (STRDP) for a pretrained Latent Diffusion Model (LDM). Our\nalgorithm employs Adaptive Instance Normalization (AdaIN) function in a\ndistinct manner during the reverse diffusion process of an LDM while tracking\nthe encoding history of the style image. This algorithm enables style transfer\nin the latent space of LDM for reduced computational cost, and provides\ncompatibility for various LDM models. Through a series of experiments and a\nuser study, we show that our method can quickly transfer the style of an image\nwithout additional training. The speed, compatibility, and training-free aspect\nof our algorithm facilitates agile experiments with combinations of styles and\nLDMs for extensive application.\n","authors":["Kento Masui","Mayu Otani","Masahiro Nomura","Hideki Nakayama"],"pdf_url":"https://arxiv.org/pdf/2410.01366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18680v2","updated":"2024-10-02T01:45:40Z","published":"2024-09-27T12:06:53Z","title":"Beyond Single-Audio: Advancing Multi-Audio Processing in Audio Large\n Language Models","summary":" Various audio-LLMs (ALLMs) have been explored recently for tackling different\naudio tasks simultaneously using a single, unified model. While existing\nevaluations of ALLMs primarily focus on single-audio tasks, real-world\napplications often involve processing multiple audio streams simultaneously. To\nbridge this gap, we propose the first multi-audio evaluation (MAE) benchmark\nthat consists of 20 datasets from 11 multi-audio tasks encompassing both speech\nand sound scenarios. Comprehensive experiments on MAE demonstrate that the\nexisting ALLMs, while being powerful in comprehending primary audio elements in\nindividual audio inputs, struggling to handle multi-audio scenarios. 
To this\nend, we propose a novel multi-audio-LLM (MALLM) to capture audio context among\nmultiple similar audios using discriminative learning on our proposed synthetic\ndata. The results demonstrate that the proposed MALLM outperforms all baselines\nand achieves high data efficiency using synthetic data without requiring human\nannotations. The proposed MALLM opens the door for ALLMs towards multi-audio\nprocessing era and brings us closer to replicating human auditory capabilities\nin machines.\n","authors":["Yiming Chen","Xianghu Yue","Xiaoxue Gao","Chen Zhang","Luis Fernando D'Haro","Robby T. Tan","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2409.18680v2.pdf","comment":"EMNLP24 Findings"}]},"2024-10-01T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.10463v3","updated":"2024-10-01T22:53:01Z","published":"2023-12-16T14:42:46Z","title":"RecPrompt: A Self-tuning Prompting Framework for News Recommendation\n Using Large Language Models","summary":" News recommendations heavily rely on Natural Language Processing (NLP)\nmethods to analyze, understand, and categorize content, enabling personalized\nsuggestions based on user interests and reading behaviors. Large Language\nModels (LLMs) like GPT-4 have shown promising performance in understanding\nnatural language. However, the extent of their applicability to news\nrecommendation systems remains to be validated. This paper introduces\nRecPrompt, the first self-tuning prompting framework for news recommendation,\nleveraging the capabilities of LLMs to perform complex news recommendation\ntasks. This framework incorporates a news recommender and a prompt optimizer\nthat applies an iterative bootstrapping process to enhance recommendations\nthrough automatic prompt engineering. Extensive experimental results with 400\nusers show that RecPrompt can achieve an improvement of 3.36% in AUC, 10.49% in\nMRR, 9.64% in nDCG@5, and 6.20% in nDCG@10 compared to deep neural models.\nAdditionally, we introduce TopicScore, a novel metric to assess explainability\nby evaluating LLM's ability to summarize topics of interest for users. The\nresults show LLM's effectiveness in accurately identifying topics of interest\nand delivering comprehensive topic-based explanations.\n","authors":["Dairui Liu","Boming Yang","Honghui Du","Derek Greene","Aonghus Lawlor","Ruihai Dong","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2312.10463v3.pdf","comment":"5 pages, 2 figures, and 2 tables"},{"id":"http://arxiv.org/abs/2204.11970v5","updated":"2024-10-01T20:19:50Z","published":"2022-04-25T21:20:27Z","title":"Visual Acuity Prediction on Real-Life Patient Data Using a Machine\n Learning Based Multistage System","summary":" In ophthalmology, intravitreal operative medication therapy (IVOM) is a\nwidespread treatment for diseases related to the age-related macular\ndegeneration (AMD), the diabetic macular edema (DME), as well as the retinal\nvein occlusion (RVO). However, in real-world settings, patients often suffer\nfrom loss of vision on time scales of years despite therapy, whereas the\nprediction of the visual acuity (VA) and the earliest possible detection of\ndeterioration under real-life conditions is challenging due to heterogeneous\nand incomplete data. In this contribution, we present a workflow for the\ndevelopment of a research-compatible data corpus fusing different IT systems of\nthe department of ophthalmology of a German maximum care hospital. 
The\nextensive data corpus allows predictive statements of the expected progression\nof a patient and his or her VA in each of the three diseases. For the disease\nAMD, we found out a significant deterioration of the visual acuity over time.\nWithin our proposed multistage system, we subsequently classify the VA\nprogression into the three groups of therapy \"winners\", \"stabilizers\", and\n\"losers\" (WSL classification scheme). Our OCT biomarker classification using an\nensemble of deep neural networks results in a classification accuracy\n(F1-score) of over 98 %, enabling us to complete incomplete OCT documentations\nwhile allowing us to exploit them for a more precise VA modeling process. Our\nVA prediction requires at least four VA examinations and optionally OCT\nbiomarkers from the same time period to predict the VA progression within a\nforecasted time frame, whereas our prediction is currently restricted to IVOM /\nno therapy. We achieve a final prediction accuracy of 69 % in macro average\nF1-score, while being in the same range as the ophthalmologists with 57.8 and\n50 +- 10.7 % F1-score.\n","authors":["Tobias Schlosser","Frederik Beuth","Trixy Meyer","Arunodhayan Sampath Kumar","Gabriel Stolze","Olga Furashova","Katrin Engelmann","Danny Kowerko"],"pdf_url":"https://arxiv.org/pdf/2204.11970v5.pdf","comment":"Accepted for: Scientific Reports"},{"id":"http://arxiv.org/abs/2407.12857v2","updated":"2024-10-01T17:13:38Z","published":"2024-07-09T15:06:14Z","title":"Automated Peer Reviewing in Paper SEA: Standardization, Evaluation, and\n Analysis","summary":" In recent years, the rapid increase in scientific papers has overwhelmed\ntraditional review mechanisms, resulting in varying quality of publications.\nAlthough existing methods have explored the capabilities of Large Language\nModels (LLMs) for automated scientific reviewing, their generated contents are\noften generic or partial. To address the issues above, we introduce an\nautomated paper reviewing framework SEA. It comprises of three modules:\nStandardization, Evaluation, and Analysis, which are represented by models\nSEA-S, SEA-E, and SEA-A, respectively. Initially, SEA-S distills data\nstandardization capabilities of GPT-4 for integrating multiple reviews for a\npaper. Then, SEA-E utilizes standardized data for fine-tuning, enabling it to\ngenerate constructive reviews. Finally, SEA-A introduces a new evaluation\nmetric called mismatch score to assess the consistency between paper contents\nand reviews. Moreover, we design a self-correction strategy to enhance the\nconsistency. Extensive experimental results on datasets collected from eight\nvenues show that SEA can generate valuable insights for authors to improve\ntheir papers.\n","authors":["Jianxiang Yu","Zichen Ding","Jiaqi Tan","Kangyang Luo","Zhenmin Weng","Chenghua Gong","Long Zeng","Renjing Cui","Chengcheng Han","Qiushi Sun","Zhiyong Wu","Yunshi Lan","Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2407.12857v2.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2210.16928v2","updated":"2024-10-01T14:39:12Z","published":"2022-10-30T19:08:38Z","title":"FELRec: Efficient Handling of Item Cold-Start With Dynamic\n Representation Learning in Recommender Systems","summary":" Recommender systems suffer from the cold-start problem whenever a new user\njoins the platform or a new item is added to the catalog. 
To address item\ncold-start, we propose to replace the embedding layer in sequential\nrecommenders with a dynamic storage that has no learnable weights and can keep\nan arbitrary number of representations. In this paper, we present FELRec, a\nlarge embedding network that refines the existing representations of users and\nitems in a recursive manner, as new information becomes available. In contrast\nto similar approaches, our model represents new users and items without side\ninformation and time-consuming finetuning, instead it runs a single forward\npass over a sequence of existing representations. During item cold-start, our\nmethod outperforms similar method by 29.50%-47.45%. Further, our proposed model\ngeneralizes well to previously unseen datasets in zero-shot settings. The\nsource code is publicly available at https://github.com/kweimann/FELRec .\n","authors":["Kuba Weimann","Tim O. F. Conrad"],"pdf_url":"https://arxiv.org/pdf/2210.16928v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19979v2","updated":"2024-10-01T13:04:55Z","published":"2024-09-30T06:07:12Z","title":"Enhancing High-order Interaction Awareness in LLM-based Recommender\n Model","summary":" Large language models (LLMs) have demonstrated prominent reasoning\ncapabilities in recommendation tasks by transforming them into text-generation\ntasks. However, existing approaches either disregard or ineffectively model the\nuser-item high-order interactions. To this end, this paper presents an enhanced\nLLM-based recommender (ELMRec). We enhance whole-word embeddings to\nsubstantially enhance LLMs' interpretation of graph-constructed interactions\nfor recommendations, without requiring graph pre-training. This finding may\ninspire endeavors to incorporate rich knowledge graphs into LLM-based\nrecommenders via whole-word embedding. We also found that LLMs often recommend\nitems based on users' earlier interactions rather than recent ones, and present\na reranking solution. Our ELMRec outperforms state-of-the-art (SOTA) methods in\nboth direct and sequential recommendations.\n","authors":["Xinfeng Wang","Jin Cui","Fumiyo Fukumoto","Yoshimi Suzuki"],"pdf_url":"https://arxiv.org/pdf/2409.19979v2.pdf","comment":"Long paper accepted to EMNLP 2024 Main. 16 pages"},{"id":"http://arxiv.org/abs/2310.17373v2","updated":"2024-10-01T12:43:55Z","published":"2023-10-26T13:10:59Z","title":"Causality-Inspired Fair Representation Learning for Multimodal\n Recommendation","summary":" Recently, multimodal recommendations (MMR) have gained increasing attention\nfor alleviating the data sparsity problem of traditional recommender systems by\nincorporating modality-based representations. Although MMR exhibit notable\nimprovement in recommendation accuracy, we empirically validate that an\nincrease in the quantity or variety of modalities leads to a higher degree of\nusers' sensitive information leakage due to entangled causal relationships,\nrisking fair representation learning. On the other hand, existing fair\nrepresentation learning approaches are mostly based on the assumption that\nsensitive information is solely leaked from users' interaction data and do not\nexplicitly model the causal relationships introduced by multimodal data, which\nlimits their applicability in multimodal scenarios. 
Particularly, we\ndisentangle biased and filtered modal embeddings inspired by causal inference\ntechniques, enabling the mining of modality-based unfair and fair user-user\nrelations, thereby enhancing the fairness and informativeness of user\nrepresentations. By addressing the causal effects of sensitive attributes on\nuser preferences, our approach aims to achieve counterfactual fairness in\nmultimodal recommendations. Experiments on two public datasets demonstrate the\nsuperiority of our FMMRec relative to the state-of-the-art baselines. Our\nsource code is available at https://github.com/WeixinChen98/FMMRec.\n","authors":["Weixin Chen","Li Chen","Yongxin Ni","Yuhan Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.17373v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09818v3","updated":"2024-10-01T08:55:44Z","published":"2024-06-14T08:21:42Z","title":"ClimRetrieve: A Benchmarking Dataset for Information Retrieval from\n Corporate Climate Disclosures","summary":" To handle the vast amounts of qualitative data produced in corporate climate\ncommunication, stakeholders increasingly rely on Retrieval Augmented Generation\n(RAG) systems. However, a significant gap remains in evaluating domain-specific\ninformation retrieval - the basis for answer generation. To address this\nchallenge, this work simulates the typical tasks of a sustainability analyst by\nexamining 30 sustainability reports with 16 detailed climate-related questions.\nAs a result, we obtain a dataset with over 8.5K unique question-source-answer\npairs labeled by different levels of relevance. Furthermore, we develop a use\ncase with the dataset to investigate the integration of expert knowledge into\ninformation retrieval with embeddings. Although we show that incorporating\nexpert knowledge works, we also outline the critical limitations of embeddings\nin knowledge-intensive downstream domains like climate change communication.\n","authors":["Tobias Schimanski","Jingwei Ni","Roberto Spacey","Nicola Ranger","Markus Leippold"],"pdf_url":"https://arxiv.org/pdf/2406.09818v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.00427v1","updated":"2024-10-01T06:16:07Z","published":"2024-10-01T06:16:07Z","title":"Conversational Exploratory Search of Scholarly Publications Using\n Knowledge Graphs","summary":" Traditional search methods primarily depend on string matches, while semantic\nsearch targets concept-based matches by recognizing underlying intents and\ncontextual meanings of search terms. Semantic search is particularly beneficial\nfor discovering scholarly publications where differences in vocabulary between\nusers' search terms and document content are common, often yielding irrelevant\nsearch results. Many scholarly search engines have adopted knowledge graphs to\nrepresent semantic relations between authors, publications, and research\nconcepts. However, users may face challenges when navigating these graphical\nsearch interfaces due to the complexity and volume of data, which impedes their\nability to discover publications effectively. To address this problem, we\ndeveloped a conversational search system for exploring scholarly publications\nusing a knowledge graph. We outline the methodical approach for designing and\nimplementing the proposed system, detailing its architecture and functional\ncomponents. 
To assess the system's effectiveness, we employed various\nperformance metrics and conducted a human evaluation with 40 participants,\ndemonstrating how the conversational interface compares against a graphical\ninterface with traditional text search. The findings from our evaluation\nprovide practical insights for advancing the design of conversational search\nsystems.\n","authors":["Phillip Schneider","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2410.00427v1.pdf","comment":"Accepted to ICNLSP 2024"},{"id":"http://arxiv.org/abs/2409.19014v2","updated":"2024-10-01T05:55:33Z","published":"2024-09-24T01:40:50Z","title":"FLEX: Expert-level False-Less EXecution Metric for Reliable Text-to-SQL\n Benchmark","summary":" Text-to-SQL technology has become crucial for translating natural language\ninto SQL queries in various industries, enabling non-technical users to perform\ncomplex data operations. The need for accurate evaluation methods has increased\nas these systems have grown more sophisticated. However, we found that the\nExecution Accuracy (EX), the most promising evaluation metric, still shows a\nsubstantial portion of false positives and negatives compared to human\nevaluation. Thus, this paper introduces FLEX (False-Less EXecution), a novel\napproach to evaluating text-to-SQL systems using large language models (LLMs)\nto emulate human expert-level evaluation of SQL queries. Our method shows\nsignificantly higher agreement with human expert judgments, improving Cohen's\nkappa from 61 to 78.17. Re-evaluating top-performing models on the Spider and\nBIRD benchmarks using FLEX reveals substantial shifts in performance rankings,\nwith an average performance decrease of 3.15 due to false positive corrections\nand an increase of 6.07 from addressing false negatives. This work contributes\nto a more accurate and nuanced evaluation of text-to-SQL systems, potentially\nreshaping our understanding of state-of-the-art performance in this field.\n","authors":["Heegyu Kim","Taeyang Jeon","Seunghwan Choi","Seungtaek Choi","Hyunsouk Cho"],"pdf_url":"https://arxiv.org/pdf/2409.19014v2.pdf","comment":"preprint, under review"},{"id":"http://arxiv.org/abs/2408.11345v3","updated":"2024-10-01T05:53:36Z","published":"2024-08-21T05:09:53Z","title":"Deep Tree-based Retrieval for Efficient Recommendation: Theory and\n Method","summary":" With the development of deep learning techniques, deep recommendation models\nalso achieve remarkable improvements in terms of recommendation accuracy.\nHowever, due to the large number of candidate items in practice and the high\ncost of preference computation, these methods also suffer from low efficiency\nof recommendation. The recently proposed tree-based deep recommendation models\nalleviate the problem by directly learning tree structure and representations\nunder the guidance of recommendation objectives. However, such models have\nshortcomings. The max-heap assumption in the hierarchical tree, in which the\npreference for a parent node should be the maximum between the preferences for\nits children, is difficult to satisfy in their binary classification\nobjectives. To this end, we propose Tree-based Deep Retrieval (TDR for short)\nfor efficient recommendation. In TDR, all the trees generated during the\ntraining process are retained to form the forest. 
When learning the node\nrepresentation of each tree, we have to satisfy the max-heap assumption as much\nas possible and mimic beam search behavior over the tree in the training stage.\nThis is achieved by TDR to regard the training task as multi-classification\nover tree nodes at the same level. However, the number of tree nodes grows\nexponentially with levels, making us train the preference model with the\nguidance of the sampled-softmax technique. The experiments are conducted on\nreal-world datasets, validating the effectiveness of the proposed preference\nmodel learning method and tree learning method.\n","authors":["Ze Liu","Jin Zhang","Chao Feng","Defu Lian","Jie Wang","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2408.11345v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.00412v1","updated":"2024-10-01T05:37:31Z","published":"2024-10-01T05:37:31Z","title":"TPN: Transferable Proto-Learning Network towards Few-shot Document-Level\n Relation Extraction","summary":" Few-shot document-level relation extraction suffers from poor performance due\nto the challenging cross-domain transferability of NOTA (none-of-the-above)\nrelation representation. In this paper, we introduce a Transferable\nProto-Learning Network (TPN) to address the challenging issue. It comprises\nthree core components: Hybrid Encoder hierarchically encodes semantic content\nof input text combined with attention information to enhance the relation\nrepresentations. As a plug-and-play module for Out-of-Domain (OOD) Detection,\nTransferable Proto-Learner computes NOTA prototype through an adaptive\nlearnable block, effectively mitigating NOTA bias across various domains.\nDynamic Weighting Calibrator detects relation-specific classification\nconfidence, serving as dynamic weights to calibrate the NOTA-dominant loss\nfunction. Finally, to bolster the model's cross-domain performance, we\ncomplement it with virtual adversarial training (VAT). We conduct extensive\nexperimental analyses on FREDo and ReFREDo, demonstrating the superiority of\nTPN. Compared to state-of-the-art methods, our approach achieves competitive\nperformance with approximately half the parameter size. Data and code are\navailable at https://github.com/EchoDreamer/TPN.\n","authors":["Yu Zhang","Zhao Kang"],"pdf_url":"https://arxiv.org/pdf/2410.00412v1.pdf","comment":"Few shot document-level relation extraction"},{"id":"http://arxiv.org/abs/2410.00408v1","updated":"2024-10-01T05:06:07Z","published":"2024-10-01T05:06:07Z","title":"ECORS: An Ensembled Clustering Approach to Eradicate The Local And\n Global Outlier In Collaborative Filtering Recommender System","summary":" Recommender systems are designed to suggest items based on user preferences,\nhelping users navigate the vast amount of information available on the\ninternet. Given the overwhelming content, outlier detection has emerged as a\nkey research area in recommender systems. It involves identifying unusual or\nsuspicious patterns in user behavior. However, existing studies in this field\nface several challenges, including the limited universality of algorithms,\ndifficulties in selecting users, and a lack of optimization. In this paper, we\npropose an approach that addresses these challenges by employing various\nclustering algorithms. Specifically, we utilize a user-user matrix-based\nclustering technique to detect outliers. By constructing a user-user matrix, we\ncan identify suspicious users in the system. Both local and global outliers are\ndetected to ensure comprehensive analysis. 
Our experimental results demonstrate\nthat this approach significantly improves the accuracy of outlier detection in\nrecommender systems.\n","authors":["Mahamudul Hasan"],"pdf_url":"https://arxiv.org/pdf/2410.00408v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.16674v3","updated":"2024-10-01T04:45:04Z","published":"2024-09-25T07:06:14Z","title":"A Prompting-Based Representation Learning Method for Recommendation with\n Large Language Models","summary":" In recent years, Recommender Systems (RS) have witnessed a transformative\nshift with the advent of Large Language Models (LLMs) in the field of Natural\nLanguage Processing (NLP). Models such as GPT-3.5/4, Llama, have demonstrated\nunprecedented capabilities in understanding and generating human-like text. The\nextensive information pre-trained by these LLMs allows for the potential to\ncapture a more profound semantic representation from different contextual\ninformation of users and items.\n While the great potential lies behind the thriving of LLMs, the challenge of\nleveraging user-item preferences from contextual information and its alignment\nwith the improvement of Recommender Systems needs to be addressed. Believing\nthat a better understanding of the user or item itself can be the key factor in\nimproving recommendation performance, we conduct research on generating\ninformative profiles using state-of-the-art LLMs.\n To boost the linguistic abilities of LLMs in Recommender Systems, we\nintroduce the Prompting-Based Representation Learning Method for Recommendation\n(P4R). In our P4R framework, we utilize the LLM prompting strategy to create\npersonalized item profiles. These profiles are then transformed into semantic\nrepresentation spaces using a pre-trained BERT model for text embedding.\nFurthermore, we incorporate a Graph Convolution Network (GCN) for collaborative\nfiltering representation. The P4R framework aligns these two embedding spaces\nin order to address the general recommendation tasks. In our evaluation, we\ncompare P4R with state-of-the-art Recommender models and assess the quality of\nprompt-based profile generation.\n","authors":["Junyi Chen","Toyotaro Suzumura"],"pdf_url":"https://arxiv.org/pdf/2409.16674v3.pdf","comment":"Risks: The 1st International Workshop on Risks, Opportunities, and\n Evaluation of Generative Models in Recommendation"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.18236v2","updated":"2024-10-01T21:53:44Z","published":"2024-09-26T19:27:11Z","title":"Spatial Visibility and Temporal Dynamics: Revolutionizing Field of View\n Prediction in Adaptive Point Cloud Video Streaming","summary":" Field-of-View (FoV) adaptive streaming significantly reduces bandwidth\nrequirement of immersive point cloud video (PCV) by only transmitting visible\npoints in a viewer's FoV. The traditional approaches often focus on\ntrajectory-based 6 degree-of-freedom (6DoF) FoV predictions. The predicted FoV\nis then used to calculate point visibility. Such approaches do not explicitly\nconsider video content's impact on viewer attention, and the conversion from\nFoV to point visibility is often error-prone and time-consuming. We reformulate\nthe PCV FoV prediction problem from the cell visibility perspective, allowing\nfor precise decision-making regarding the transmission of 3D data at the cell\nlevel based on the predicted visibility distribution. 
We develop a novel\nspatial visibility and object-aware graph model that leverages the historical\n3D visibility data and incorporates spatial perception, neighboring cell\ncorrelation, and occlusion information to predict the cell visibility in the\nfuture. Our model significantly improves the long-term cell visibility\nprediction, reducing the prediction MSE loss by up to 50% compared to the\nstate-of-the-art models while maintaining real-time performance (more than\n30fps) for point cloud videos with over 1 million points.\n","authors":["Chen Li","Tongyu Zong","Yueyu Hu","Yao Wang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18236v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01027v1","updated":"2024-10-01T19:35:07Z","published":"2024-10-01T19:35:07Z","title":"Graph-based Scalable Sampling of 3D Point Cloud Attributes","summary":" 3D Point clouds (PCs) are commonly used to represent 3D scenes. They can have\nmillions of points, making subsequent downstream tasks such as compression and\nstreaming computationally expensive. PC sampling (selecting a subset of points)\ncan be used to reduce complexity. Existing PC sampling algorithms focus on\npreserving geometry features and often do not scale to handle large PCs. In\nthis work, we develop scalable graph-based sampling algorithms for PC color\nattributes, assuming the full geometry is available. Our sampling algorithms\nare optimized for a signal reconstruction method that minimizes the graph\nLaplacian quadratic form. We first develop a global sampling algorithm that can\nbe applied to PCs with millions of points by exploiting sparsity and sampling\nrate adaptive parameter selection. Further, we propose a block-based sampling\nstrategy where each block is sampled independently. We show that sampling the\ncorresponding sub-graphs with optimally chosen self-loop weights (node weights)\nwill produce a sampling set that approximates the results of global sampling\nwhile reducing complexity by an order of magnitude. Our empirical results on\ntwo large PC datasets show that our algorithms outperform the existing fast PC\nsubsampling techniques (uniform and geometry feature preserving random\nsampling) by 2dB. Our algorithm is up to 50 times faster than existing graph\nsignal sampling algorithms while providing better reconstruction accuracy.\nFinally, we illustrate the efficacy of PC attribute sampling within a\ncompression scenario, showing that pre-compression sampling of PC attributes\ncan lower the bitrate by 11% while having minimal effect on reconstruction.\n","authors":["Shashank N. Sridhara","Eduardo Pavez","Ajinkya Jayawant","Antonio Ortega","Ryosuke Watanabe","Keisuke Nonaka"],"pdf_url":"https://arxiv.org/pdf/2410.01027v1.pdf","comment":"13 pages, 13 Figures"},{"id":"http://arxiv.org/abs/2410.00849v1","updated":"2024-10-01T16:31:29Z","published":"2024-10-01T16:31:29Z","title":"Energy-Quality-aware Variable Framerate Pareto-Front for Adaptive Video\n Streaming","summary":" Optimizing framerate for a given bitrate-spatial resolution pair in adaptive\nvideo streaming is essential to maintain perceptual quality while considering\ndecoding complexity. Low framerates at low bitrates reduce compression\nartifacts and decrease decoding energy. We propose a novel method,\nDecoding-complexity aware Framerate Prediction (DECODRA), which employs a\nVariable Framerate Pareto-front approach to predict an optimized framerate that\nminimizes decoding energy under quality degradation constraints. 
DECODRA\ndynamically adjusts the framerate based on current bitrate and spatial\nresolution, balancing trade-offs between framerate, perceptual quality, and\ndecoding complexity. Extensive experimentation with the Inter-4K dataset\ndemonstrates DECODRA's effectiveness, yielding an average decoding energy\nreduction of up to 13.45%, with minimal VMAF reduction of 0.33 points at a\nlow-quality degradation threshold, compared to the default 60 fps encoding.\nEven at an aggressive threshold, DECODRA achieves significant energy savings of\n13.45% while only reducing VMAF by 2.11 points. In this way, DECODRA extends\nmobile device battery life and reduces the energy footprint of streaming\nservices by providing a more energy-efficient video streaming pipeline.\n","authors":["Prajit T Rajendran","Samira Afzal","Vignesh V Menon","Christian Timmerer"],"pdf_url":"https://arxiv.org/pdf/2410.00849v1.pdf","comment":"Accepted at IEEE International Conference on Visual Communications\n and Image Processing (VCIP) 2024"},{"id":"http://arxiv.org/abs/2410.00817v1","updated":"2024-10-01T16:00:17Z","published":"2024-10-01T16:00:17Z","title":"Maximum entropy and quantized metric models for absolute category\n ratings","summary":" The datasets of most image quality assessment studies contain ratings on a\ncategorical scale with five levels, from bad (1) to excellent (5). For each\nstimulus, the number of ratings from 1 to 5 is summarized and given in the form\nof the mean opinion score. In this study, we investigate families of\nmultinomial probability distributions parameterized by mean and variance that\nare used to fit the empirical rating distributions. To this end, we consider\nquantized metric models based on continuous distributions that model perceived\nstimulus quality on a latent scale. The probabilities for the rating categories\nare determined by quantizing the corresponding random variables using threshold\nvalues. Furthermore, we introduce a novel discrete maximum entropy distribution\nfor a given mean and variance. We compare the performance of these models and\nthe state of the art given by the generalized score distribution for two large\ndata sets, KonIQ-10k and VQEG HDTV. Given an input distribution of ratings, our\nfitted two-parameter models predict unseen ratings better than the empirical\ndistribution. In contrast to empirical ACR distributions and their discrete\nmodels, our continuous models can provide fine-grained estimates of quantiles\nof quality of experience that are relevant to service providers to satisfy a\ntarget fraction of the user population.\n","authors":["Dietmar Saupe","Krzysztof Rusek","David Hägele","Daniel Weiskopf","Lucjan Janowski"],"pdf_url":"https://arxiv.org/pdf/2410.00817v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2410.00741v1","updated":"2024-10-01T14:33:22Z","published":"2024-10-01T14:33:22Z","title":"VideoCLIP-XL: Advancing Long Description Understanding for Video CLIP\n Models","summary":" Contrastive Language-Image Pre-training (CLIP) has been widely studied and\napplied in numerous applications. However, the emphasis on brief summary texts\nduring pre-training prevents CLIP from understanding long descriptions. This\nissue is particularly acute regarding videos given that videos often contain\nabundant detailed contents. In this paper, we propose the VideoCLIP-XL (eXtra\nLength) model, which aims to unleash the long-description understanding\ncapability of video CLIP models. 
Firstly, we establish an automatic data\ncollection system and gather a large-scale VILD pre-training dataset with VIdeo\nand Long-Description pairs. Then, we propose Text-similarity-guided Primary\nComponent Matching (TPCM) to better learn the distribution of feature space\nwhile expanding the long description capability. We also introduce two new\ntasks namely Detail-aware Description Ranking (DDR) and Hallucination-aware\nDescription Ranking (HDR) for further understanding improvement. Finally, we\nconstruct a Long Video Description Ranking (LVDR) benchmark for evaluating the\nlong-description capability more comprehensively. Extensive experimental\nresults on widely-used text-video retrieval benchmarks with both short and long\ndescriptions and our LVDR benchmark can fully demonstrate the effectiveness of\nour method.\n","authors":["Jiapeng Wang","Chengyu Wang","Kunzhe Huang","Jun Huang","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2410.00741v1.pdf","comment":"EMNLP 2024 Main conference"},{"id":"http://arxiv.org/abs/2410.00557v1","updated":"2024-10-01T10:10:43Z","published":"2024-10-01T10:10:43Z","title":"STanH : Parametric Quantization for Variable Rate Learned Image\n Compression","summary":" In end-to-end learned image compression, encoder and decoder are jointly\ntrained to minimize a $R + {\\lambda}D$ cost function, where ${\\lambda}$\ncontrols the trade-off between rate of the quantized latent representation and\nimage quality. Unfortunately, a distinct encoder-decoder pair with millions of\nparameters must be trained for each ${\\lambda}$, hence the need to switch\nencoders and to store multiple encoders and decoders on the user device for\nevery target rate. This paper proposes to exploit a differentiable quantizer\ndesigned around a parametric sum of hyperbolic tangents, called STanH , that\nrelaxes the step-wise quantization function. STanH is implemented as a\ndifferentiable activation layer with learnable quantization parameters that can\nbe plugged into a pre-trained fixed rate model and refined to achieve different\ntarget bitrates. Experimental results show that our method enables variable\nrate coding with comparable efficiency to the state-of-the-art, yet with\nsignificant savings in terms of ease of deployment, training time, and storage\ncosts\n","authors":["Alberto Presta","Enzo Tartaglione","Attilio Fiandrotti","Marco Grangetto"],"pdf_url":"https://arxiv.org/pdf/2410.00557v1.pdf","comment":"Submitted to IEEE Transactions on Image Processing"},{"id":"http://arxiv.org/abs/2401.00416v2","updated":"2024-10-01T07:55:22Z","published":"2023-12-31T07:44:05Z","title":"SVFAP: Self-supervised Video Facial Affect Perceiver","summary":" Video-based facial affect analysis has recently attracted increasing\nattention owing to its critical role in human-computer interaction. Previous\nstudies mainly focus on developing various deep learning architectures and\ntraining them in a fully supervised manner. Although significant progress has\nbeen achieved by these supervised methods, the longstanding lack of large-scale\nhigh-quality labeled data severely hinders their further improvements.\nMotivated by the recent success of self-supervised learning in computer vision,\nthis paper introduces a self-supervised approach, termed Self-supervised Video\nFacial Affect Perceiver (SVFAP), to address the dilemma faced by supervised\nmethods. 
Specifically, SVFAP leverages masked facial video autoencoding to\nperform self-supervised pre-training on massive unlabeled facial videos.\nConsidering that large spatiotemporal redundancy exists in facial videos, we\npropose a novel temporal pyramid and spatial bottleneck Transformer as the\nencoder of SVFAP, which not only largely reduces computational costs but also\nachieves excellent performance. To verify the effectiveness of our method, we\nconduct experiments on nine datasets spanning three downstream tasks, including\ndynamic facial expression recognition, dimensional emotion recognition, and\npersonality recognition. Comprehensive results demonstrate that SVFAP can learn\npowerful affect-related representations via large-scale self-supervised\npre-training and it significantly outperforms previous state-of-the-art methods\non all datasets. Code is available at https://github.com/sunlicai/SVFAP.\n","authors":["Licai Sun","Zheng Lian","Kexin Wang","Yu He","Mingyu Xu","Haiyang Sun","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2401.00416v2.pdf","comment":"Published in: IEEE Transactions on Affective Computing (Early\n Access). The code and models are available at\n https://github.com/sunlicai/SVFAP"},{"id":"http://arxiv.org/abs/2309.04023v2","updated":"2024-10-01T04:17:05Z","published":"2023-09-07T21:30:57Z","title":"BOLA360: Near-optimal View and Bitrate Adaptation for 360-degree Video\n Streaming","summary":" Recent advances in omnidirectional cameras and AR/VR headsets have spurred\nthe adoption of 360-degree videos that are widely believed to be the future of\nonline video streaming. 360-degree videos allow users to wear a head-mounted\ndisplay (HMD) and experience the video as if they are physically present in the\nscene. Streaming high-quality 360-degree videos at scale is an unsolved problem\nthat is more challenging than traditional (2D) video delivery. The data rate\nrequired to stream 360-degree videos is an order of magnitude more than\ntraditional videos. Further, the penalty for rebuffering events where the video\nfreezes or displays a blank screen is more severe as it may cause\ncybersickness. We propose an online adaptive bitrate (ABR) algorithm for\n360-degree videos called BOLA360 that runs inside the client's video player and\norchestrates the download of video segments from the server so as to maximize\nthe quality-of-experience (QoE) of the user. BOLA360 conserves bandwidth by\ndownloading only those video segments that are likely to fall within the\nfield-of-view (FOV) of the user. In addition, BOLA360 continually adapts the\nbitrate of the downloaded video segments so as to enable a smooth playback\nwithout rebuffering. We prove that BOLA360 is near-optimal with respect to an\noptimal offline algorithm that maximizes QoE. Further, we evaluate BOLA360 on a\nwide range of network and user head movement profiles and show that it provides\n$13.6\\%$ to $372.5\\%$ more QoE than state-of-the-art algorithms. While ABR\nalgorithms for traditional (2D) videos have been well-studied over the last\ndecade, our work is the first ABR algorithm for 360-degree videos with both\ntheoretical and empirical guarantees on its performance.\n","authors":["Ali Zeynali","Mahsa Sahebdel","Mohammad Hajiesmaili","Ramesh K. 
Sitaraman"],"pdf_url":"https://arxiv.org/pdf/2309.04023v2.pdf","comment":"27 pages"}]},"2024-09-30T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2405.06242v3","updated":"2024-09-30T21:46:20Z","published":"2024-05-10T04:44:34Z","title":"Impedance vs. Power Side-channel Vulnerabilities: A Comparative Study","summary":" Physical side channels emerge from the relation between internal computation\nor data with observable physical parameters of a chip. Previous works mostly\nfocus on properties related to current consumption such as power consumption.\nThe fundamental property behind current consumption occur from the impedance of\nthe chip. Contemporary works have stared using chip impedance as a physical\nside channel in extracting sensitive information from computing systems. It\nleverages variations in intrinsic impedance of a chip across different logic\nstates. However, there has been a lack of comparative studies. In this study,\nwe conduct a comparative analysis of the impedance side channel, which has been\nlimitedly explored, and the well-established power side channel. Through\nexperimental evaluation, we investigate the efficacy of these side channels in\nextracting stored advanced encryption standard (AES) cryptographic key on a\nmemory and analyze their performance. Our findings indicate that impedance\nanalysis demonstrates a higher potential for cryptographic key extraction\ncompared to power side-channel analysis (SCA). Moreover, we identify scenarios\nwhere power SCA does not yield satisfactory results, whereas impedance analysis\nproves to be more robust and effective. This work not only underscores the\nsignificance of impedance SCA in enhancing cryptographic security but also\nemphasizes the necessity for a deeper understanding of its mechanisms and\nimplications.\n","authors":["Md Sadik Awal","Buddhipriya Gayanath","Md Tauhidur Rahman"],"pdf_url":"https://arxiv.org/pdf/2405.06242v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14732v2","updated":"2024-09-30T21:25:22Z","published":"2024-06-20T20:55:38Z","title":"TTQA-RS- A break-down prompting approach for Multi-hop Table-Text\n Question Answering with Reasoning and Summarization","summary":" Question answering (QA) over tables and text has gained much popularity over\nthe years. Multi-hop table-text QA requires multiple hops between the table and\ntext, making it a challenging QA task. Although several works have attempted to\nsolve the table-text QA task, most involve training the models and requiring\nlabeled data. In this paper, we have proposed a Retrieval Augmented Generation\n(RAG) based model - TTQA-RS: A break-down prompting approach for Multi-hop\nTable-Text Question Answering with Reasoning and Summarization. Our model uses\nan enhanced retriever for table-text information retrieval and uses augmented\nknowledge, including table-text summary with decomposed sub-questions with\nanswers for a reasoning-based table-text QA. Using open-source language models,\nour model outperformed all existing prompting methods for table-text QA tasks\non existing table-text QA datasets, such as HybridQA and OTT-QA's development\nset. Our experiments demonstrate the potential of prompt-based approaches using\nopen-source LLMs. 
Additionally, by using LLaMA3-70B, our model achieved\nstate-of-the-art performance for prompting-based methods on multi-hop\ntable-text QA.\n","authors":["Jayetri Bardhan","Bushi Xiao","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2406.14732v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.00927v1","updated":"2024-09-30T16:57:34Z","published":"2024-09-30T16:57:34Z","title":"Text Clustering as Classification with LLMs","summary":" Text clustering remains valuable in real-world applications where manual\nlabeling is cost-prohibitive. It facilitates efficient organization and\nanalysis of information by grouping similar texts based on their\nrepresentations. However, implementing this approach necessitates fine-tuned\nembedders for downstream data and sophisticated similarity metrics. To address\nthis issue, this study presents a novel framework for text clustering that\neffectively leverages the in-context learning capacity of Large Language Models\n(LLMs). Instead of fine-tuning embedders, we propose to transform the text\nclustering into a classification task via LLM. First, we prompt LLM to generate\npotential labels for a given dataset. Second, after integrating similar labels\ngenerated by the LLM, we prompt the LLM to assign the most appropriate label to\neach sample in the dataset. Our framework has been experimentally proven to\nachieve comparable or superior performance to state-of-the-art clustering\nmethods that employ embeddings, without requiring complex fine-tuning or\nclustering algorithms. We make our code available to the public for utilization\nat https://anonymous.4open.science/r/Text-Clustering-via-LLM-E500.\n","authors":["Chen Huang","Guoxiu He"],"pdf_url":"https://arxiv.org/pdf/2410.00927v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.20483v1","updated":"2024-09-30T16:42:57Z","published":"2024-09-30T16:42:57Z","title":"RecSys Challenge 2024: Balancing Accuracy and Editorial Values in News\n Recommendations","summary":" The RecSys Challenge 2024 aims to advance news recommendation by addressing\nboth the technical and normative challenges inherent in designing effective and\nresponsible recommender systems for news publishing. This paper describes the\nchallenge, including its objectives, problem setting, and the dataset provided\nby the Danish news publishers Ekstra Bladet and JP/Politikens Media Group\n(\"Ekstra Bladet\"). The challenge explores the unique aspects of news\nrecommendation, such as modeling user preferences based on behavior, accounting\nfor the influence of the news agenda on user interests, and managing the rapid\ndecay of news items. Additionally, the challenge embraces normative\ncomplexities, investigating the effects of recommender systems on news flow and\ntheir alignment with editorial values. We summarize the challenge setup,\ndataset characteristics, and evaluation metrics. Finally, we announce the\nwinners and highlight their contributions. 
The dataset is available at:\nhttps://recsys.eb.dk.\n","authors":["Johannes Kruse","Kasper Lindskow","Saikishore Kalloori","Marco Polignano","Claudio Pomo","Abhishek Srivastava","Anshuk Uppal","Michael Riis Andersen","Jes Frellsen"],"pdf_url":"https://arxiv.org/pdf/2409.20483v1.pdf","comment":"5 pages, 3 tables, RecSys' 24"},{"id":"http://arxiv.org/abs/2409.20305v1","updated":"2024-09-30T14:04:27Z","published":"2024-09-30T14:04:27Z","title":"Mixed-Precision Embeddings for Large-Scale Recommendation Models","summary":" Embedding techniques have become essential components of large databases in\nthe deep learning era. By encoding discrete entities, such as words, items, or\ngraph nodes, into continuous vector spaces, embeddings facilitate more\nefficient storage, retrieval, and processing in large databases. Especially in\nthe domain of recommender systems, millions of categorical features are encoded\nas unique embedding vectors, which facilitates the modeling of similarities and\ninteractions among features. However, numerous embedding vectors can result in\nsignificant storage overhead. In this paper, we aim to compress the embedding\ntable through quantization techniques. Given that features vary in importance\nlevels, we seek to identify an appropriate precision for each feature to\nbalance model accuracy and memory usage. To this end, we propose a novel\nembedding compression method, termed Mixed-Precision Embeddings (MPE).\nSpecifically, to reduce the size of the search space, we first group features\nby frequency and then search precision for each feature group. MPE further\nlearns the probability distribution over precision levels for each feature\ngroup, which can be used to identify the most suitable precision with a\nspecially designed sampling strategy. Extensive experiments on three public\ndatasets demonstrate that MPE significantly outperforms existing embedding\ncompression methods. Remarkably, MPE achieves about 200x compression on the\nCriteo dataset without comprising the prediction accuracy.\n","authors":["Shiwei Li","Zhuoqi Hu","Fuyuan Lyu","Xing Tang","Haozhao Wang","Shijie Xu","Weihong Luo","Yuhua Li","Xue Liu","Xiuqiang He","Ruixuan Li"],"pdf_url":"https://arxiv.org/pdf/2409.20305v1.pdf","comment":"under submision"},{"id":"http://arxiv.org/abs/2409.20302v1","updated":"2024-09-30T14:00:04Z","published":"2024-09-30T14:00:04Z","title":"OM4OV: Leveraging Ontology Matching for Ontology Versioning","summary":" Due to the dynamic nature of the semantic web, ontology version control is\nrequired to capture time-varying information, most importantly for widely-used\nontologies. Despite the long-standing recognition of ontology versioning (OV)\nas a crucial component for efficient ontology management, the growing size of\nontologies and accumulating errors caused by manual labour overwhelm current OV\napproaches. In this paper, we propose yet another approach to performing OV\nusing existing ontology matching (OM) techniques and systems. We introduce a\nunified OM4OV pipeline. From an OM perspective, we reconstruct a new task\nformulation, performance measurement, and dataset construction for OV tasks.\nReusing the prior alignment(s) from OM, we also propose a cross-reference\nmechanism to effectively reduce the matching candidature and improve overall OV\nperformance. 
We experimentally validate the OM4OV pipeline and its\ncross-reference mechanism using three datasets from the Alignment Evaluation\nInitiative (OAEI) and exploit insights on OM used for OV tasks.\n","authors":["Zhangcheng Qiang","Kerry Taylor"],"pdf_url":"https://arxiv.org/pdf/2409.20302v1.pdf","comment":"7 pages, 7 figures, 1 table"},{"id":"http://arxiv.org/abs/2406.05666v6","updated":"2024-09-30T12:20:40Z","published":"2024-06-09T06:49:22Z","title":"Probability Distribution Learning: A theoretical framework for Deep\n Learning","summary":" This paper introduces Probability Distribution Learning (PD learning), a\nnovel theoretical learning framework designed to address a comprehensive range\nof machine learning and statistical tasks, including classification,\nregression, and parameter estimation. Departing from the traditional\nstatistical learning framework, PD learning focuses on learning the underlying\nprobability distribution of a dataset, which is modeled as a random variable\nwithin the probability simplex. In this framework, the learning error is\ndecomposed into uncertainty and the model's fitting error to the optimal\nestimate. Uncertainty, which is non-optimizable and independent of both the\nmodel and optimization algorithm, depends solely on prior knowledge and\nsampling data, constituting the optimal bound of the learning error. Minimizing\nthe fitting error represents a typical non-convex optimization problem. To\naddress this, we initially demonstrate that under the conditions of unique\noptimum and sampling stability, the loss function exhibits a unified\nmathematical form, which we refer to as the standard loss function. Moreover,\nwe prove that by employing the standard loss function, the optima of fitting\nerror minimization can be approached by reducing the gradient norm and\nstructural error. Subsequently, we demonstrate that with random parameter\ninitialization, increasing network depth and the parameter count can reduce\nstructural error. Consequently, from the perspective of structural error,\ntechniques such as over-parameterization, non-convex optimization, and the flat\nminima in deep learning are beneficial in reducing structural error, thereby\nensuring that gradient-based iterative algorithms can attain an approximate\nglobal optimum for fitting error minimization. Ultimately, the experimental\nresults on various models have validated the effectiveness of the framework\nproposed in this paper.\n","authors":["Binchuan Qi"],"pdf_url":"https://arxiv.org/pdf/2406.05666v6.pdf","comment":"arXiv admin note: text overlap with arXiv:2105.04026 by other\n authors. arXiv admin note: text overlap with arXiv:2105.04026 by other\n authors"},{"id":"http://arxiv.org/abs/2407.06716v3","updated":"2024-09-30T10:49:26Z","published":"2024-07-09T09:43:42Z","title":"Analyzing the Effectiveness of Listwise Reranking with Positional\n Invariance on Temporal Generalizability","summary":" This working note outlines our participation in the retrieval task at CLEF\n2024. We highlight the considerable gap between studying retrieval performance\non static knowledge documents and understanding performance in real-world\nenvironments. Therefore, Addressing these discrepancies and measuring the\ntemporal persistence of IR systems is crucial. 
By investigating the LongEval\nbenchmark, specifically designed for such dynamic environments, our findings\ndemonstrate the effectiveness of a listwise reranking approach, which\nproficiently handles inaccuracies induced by temporal distribution shifts.\nAmong listwise rerankers, our findings show that ListT5, which effectively\nmitigates the positional bias problem by adopting the Fusion-in-Decoder\narchitecture, is especially effective, and more so, as temporal drift\nincreases, on the test-long subset.\n","authors":["Soyoung Yoon","Jongyoon Kim","Seung-won Hwang"],"pdf_url":"https://arxiv.org/pdf/2407.06716v3.pdf","comment":"Accepted at CLEF 2024 LongEval track. Abstract revised: its first two\n (background) sentences were too similar to an earlier paper arXiv:2305.18952"},{"id":"http://arxiv.org/abs/2409.20156v1","updated":"2024-09-30T10:07:28Z","published":"2024-09-30T10:07:28Z","title":"ASTRA: Accurate and Scalable ANNS-based Training of Extreme Classifiers","summary":" `Extreme Classification'' (or XC) is the task of annotating data points\n(queries) with relevant labels (documents), from an extremely large set of $L$\npossible labels, arising in search and recommendations. The most successful\ndeep learning paradigm that has emerged over the last decade or so for XC is to\nembed the queries (and labels) using a deep encoder (e.g. DistilBERT), and use\nlinear classifiers on top of the query embeddings. This architecture is of\nappeal because it enables millisecond-time inference using approximate nearest\nneighbor search (ANNS). The key question is how do we design training\nalgorithms that are accurate as well as scale to $O(100M)$ labels on a limited\nnumber of GPUs.\n State-of-the-art XC techniques that demonstrate high accuracies (e.g., DEXML,\nRen\\'ee, DEXA) on standard datasets have per-epoch training time that scales as\n$O(L)$ or employ expensive negative sampling strategies, which are prohibitive\nin XC scenarios. In this work, we develop an accurate and scalable XC algorithm\nASTRA with two key observations: (a) building ANNS index on the classifier\nvectors and retrieving hard negatives using the classifiers aligns the negative\nsampling strategy to the loss function optimized; (b) keeping the ANNS indices\ncurrent as the classifiers change through the epochs is prohibitively expensive\nwhile using stale negatives (refreshed periodically) results in poor accuracy;\nto remedy this, we propose a negative sampling strategy that uses a mixture of\nimportance sampling and uniform sampling. By extensive evaluation on standard\nXC as well as proprietary datasets with 120M labels, we demonstrate that ASTRA\nachieves SOTA precision, while reducing training time by 4x-15x relative to the\nsecond best.\n","authors":["Sonu Mehta","Jayashree Mohan","Nagarajan Natarajan","Ramachandran Ramjee","Manik Varma"],"pdf_url":"https://arxiv.org/pdf/2409.20156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14263v3","updated":"2024-09-30T08:17:01Z","published":"2023-08-28T02:38:17Z","title":"Cross-Modal Retrieval: A Systematic Review of Methods and Future\n Directions","summary":" With the exponential surge in diverse multi-modal data, traditional uni-modal\nretrieval methods struggle to meet the needs of users seeking access to data\nacross various modalities. To address this, cross-modal retrieval has emerged,\nenabling interaction across modalities, facilitating semantic matching, and\nleveraging complementarity and consistency between heterogeneous data. 
Although\nprior literature has reviewed the field of cross-modal retrieval, it suffers\nfrom numerous deficiencies in terms of timeliness, taxonomy, and\ncomprehensiveness. This paper conducts a comprehensive review of cross-modal\nretrieval's evolution, spanning from shallow statistical analysis techniques to\nvision-language pre-training models. Commencing with a comprehensive taxonomy\ngrounded in machine learning paradigms, mechanisms, and models, the paper\ndelves deeply into the principles and architectures underpinning existing\ncross-modal retrieval methods. Furthermore, it offers an overview of\nwidely-used benchmarks, metrics, and performances. Lastly, the paper probes the\nprospects and challenges that confront contemporary cross-modal retrieval,\nwhile engaging in a discourse on potential directions for further progress in\nthe field. To facilitate the ongoing research on cross-modal retrieval, we\ndevelop a user-friendly toolbox and an open-source repository at\nhttps://cross-modal-retrieval.github.io.\n","authors":["Tianshi Wang","Fengling Li","Lei Zhu","Jingjing Li","Zheng Zhang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.14263v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.20055v1","updated":"2024-09-30T08:00:04Z","published":"2024-09-30T08:00:04Z","title":"Neural Click Models for Recommender Systems","summary":" We develop and evaluate neural architectures to model the user behavior in\nrecommender systems (RS) inspired by click models for Web search but going\nbeyond standard click models. Proposed architectures include recurrent\nnetworks, Transformer-based models that alleviate the quadratic complexity of\nself-attention, adversarial and hierarchical architectures. Our models\noutperform baselines on the ContentWise and RL4RS datasets and can be used in\nRS simulators to model user response for RS evaluation and pretraining.\n","authors":["Mikhail Shirokikh","Ilya Shenbin","Anton Alekseev","Anna Volodkevich","Alexey Vasilev","Andrey V. Savchenko","Sergey Nikolenko"],"pdf_url":"https://arxiv.org/pdf/2409.20055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.20052v1","updated":"2024-09-30T07:57:13Z","published":"2024-09-30T07:57:13Z","title":"Mitigating Propensity Bias of Large Language Models for Recommender\n Systems","summary":" The rapid development of Large Language Models (LLMs) creates new\nopportunities for recommender systems, especially by exploiting the side\ninformation (e.g., descriptions and analyses of items) generated by these\nmodels. However, aligning this side information with collaborative information\nfrom historical interactions poses significant challenges. The inherent biases\nwithin LLMs can skew recommendations, resulting in distorted and potentially\nunfair user experiences. On the other hand, propensity bias causes side\ninformation to be aligned in such a way that it often tends to represent all\ninputs in a low-dimensional subspace, leading to a phenomenon known as\ndimensional collapse, which severely restricts the recommender system's ability\nto capture user preferences and behaviours. To address these issues, we\nintroduce a novel framework named Counterfactual LLM Recommendation (CLLMR).\nSpecifically, we propose a spectrum-based side information encoder that\nimplicitly embeds structural information from historical interactions into the\nside information representation, thereby circumventing the risk of dimension\ncollapse. 
Furthermore, our CLLMR approach explores the causal relationships\ninherent in LLM-based recommender systems. By leveraging counterfactual\ninference, we counteract the biases introduced by LLMs. Extensive experiments\ndemonstrate that our CLLMR approach consistently enhances the performance of\nvarious recommender models.\n","authors":["Guixian Zhang","Guan Yuan","Debo Cheng","Lin Liu","Jiuyong Li","Shichao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.20052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21034v3","updated":"2024-09-30T07:40:15Z","published":"2024-07-17T06:51:24Z","title":"Watermarking Recommender Systems","summary":" Recommender systems embody significant commercial value and represent crucial\nintellectual property. However, the integrity of these systems is constantly\nchallenged by malicious actors seeking to steal their underlying models.\nSafeguarding against such threats is paramount to upholding the rights and\ninterests of the model owner. While model watermarking has emerged as a potent\ndefense mechanism in various domains, its direct application to recommender\nsystems remains unexplored and non-trivial. In this paper, we address this gap\nby introducing Autoregressive Out-of-distribution Watermarking (AOW), a novel\ntechnique tailored specifically for recommender systems. Our approach entails\nselecting an initial item and querying it through the oracle model, followed by\nthe selection of subsequent items with small prediction scores. This iterative\nprocess generates a watermark sequence autoregressively, which is then\ningrained into the model's memory through training. To assess the efficacy of\nthe watermark, the model is tasked with predicting the subsequent item given a\ntruncated watermark sequence. Through extensive experimentation and analysis,\nwe demonstrate the superior performance and robust properties of AOW. Notably,\nour watermarking technique exhibits high-confidence extraction capabilities and\nmaintains effectiveness even in the face of distillation and fine-tuning\nprocesses.\n","authors":["Sixiao Zhang","Cheng Long","Wei Yuan","Hongxu Chen","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2407.21034v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19925v1","updated":"2024-09-30T03:59:06Z","published":"2024-09-30T03:59:06Z","title":"Large Language Model Empowered Embedding Generator for Sequential\n Recommendation","summary":" Sequential Recommender Systems (SRS) are extensively applied across various\ndomains to predict users' next interaction by modeling their interaction\nsequences. However, these systems typically grapple with the long-tail problem,\nwhere they struggle to recommend items that are less popular. This challenge\nresults in a decline in user discovery and reduced earnings for vendors,\nnegatively impacting the system as a whole. Large Language Model (LLM) has the\npotential to understand the semantic connections between items, regardless of\ntheir popularity, positioning them as a viable solution to this dilemma. In our\npaper, we present LLMEmb, an innovative technique that harnesses LLM to create\nitem embeddings that bolster the performance of SRS. To align the capabilities\nof general-purpose LLM with the needs of the recommendation domain, we\nintroduce a method called Supervised Contrastive Fine-Tuning (SCFT). This\nmethod involves attribute-level data augmentation and a custom contrastive loss\ndesigned to tailor LLM for enhanced recommendation performance. 
Moreover, we\nhighlight the necessity of incorporating collaborative filtering signals into\nLLM-generated embeddings and propose Recommendation Adaptation Training (RAT)\nfor this purpose. RAT refines the embeddings to be optimally suited for SRS.\nThe embeddings derived from LLMEmb can be easily integrated with any SRS model,\nshowcasing its practical utility. Extensive experimentation on three real-world\ndatasets has shown that LLMEmb significantly improves upon current methods when\napplied across different SRS models.\n","authors":["Qidong Liu","Xian Wu","Wanyu Wang","Yejing Wang","Yuanshao Zhu","Xiangyu Zhao","Feng Tian","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.19925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16326v4","updated":"2024-09-30T03:11:04Z","published":"2023-05-10T13:40:06Z","title":"A systematic evaluation of large language models for biomedical natural\n language processing: benchmarks, baselines, and recommendations","summary":" The biomedical literature is rapidly expanding, posing a significant\nchallenge for manual curation and knowledge discovery. Biomedical Natural\nLanguage Processing (BioNLP) has emerged as a powerful solution, enabling the\nautomated extraction of information and knowledge from this extensive\nliterature. Recent attention has been directed towards Large Language Models\n(LLMs) due to their impressive performance. However, there remains a critical\ngap in understanding the effectiveness of LLMs in BioNLP tasks and their\nbroader implications for method development and downstream users. Currently,\nthere is a lack of baseline performance data, benchmarks, and practical\nrecommendations for using LLMs in the biomedical domain. To address this gap,\nwe present a systematic evaluation of four representative LLMs: GPT-3.5 and\nGPT-4 (closed-source), LLaMA 2 (open-sourced), and PMC LLaMA (domain-specific)\nacross 12 BioNLP datasets covering six applications (named entity recognition,\nrelation extraction, multi-label document classification, question answering,\ntext summarization, and text simplification). The evaluation is conducted under\nfour settings: zero-shot, static few-shot, dynamic K-nearest few-shot, and\nfine-tuning. We compare these models against state-of-the-art (SOTA) approaches\nthat fine-tune (domain-specific) BERT or BART models, which are\nwell-established methods in BioNLP tasks. The evaluation covers both\nquantitative and qualitative evaluations, where the latter involves manually\nreviewing collectively hundreds of thousands of LLM outputs for\ninconsistencies, missing information, and hallucinations in extractive and\nclassification tasks. The qualitative review also examines accuracy, 1\ncompleteness, and readability in text summarization tasks. Additionally, a cost\nanalysis of closed-source GPT models is conducted.\n","authors":["Qingyu Chen","Yan Hu","Xueqing Peng","Qianqian Xie","Qiao Jin","Aidan Gilson","Maxwell B. Singer","Xuguang Ai","Po-Ting Lai","Zhizheng Wang","Vipina Kuttichi Keloth","Kalpana Raja","Jiming Huang","Huan He","Fongci Lin","Jingcheng Du","Rui Zhang","W. Jim Zheng","Ron A. 
Adelman","Zhiyong Lu","Hua Xu"],"pdf_url":"https://arxiv.org/pdf/2305.16326v4.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.00289v1","updated":"2024-09-30T23:57:07Z","published":"2024-09-30T23:57:07Z","title":"Delving Deep into Engagement Prediction of Short Videos","summary":" Understanding and modeling the popularity of User Generated Content (UGC)\nshort videos on social media platforms presents a critical challenge with broad\nimplications for content creators and recommendation systems. This study delves\ndeep into the intricacies of predicting engagement for newly published videos\nwith limited user interactions. Surprisingly, our findings reveal that Mean\nOpinion Scores from previous video quality assessment datasets do not strongly\ncorrelate with video engagement levels. To address this, we introduce a\nsubstantial dataset comprising 90,000 real-world UGC short videos from\nSnapchat. Rather than relying on view count, average watch time, or rate of\nlikes, we propose two metrics: normalized average watch percentage (NAWP) and\nengagement continuation rate (ECR) to describe the engagement levels of short\nvideos. Comprehensive multi-modal features, including visual content,\nbackground music, and text data, are investigated to enhance engagement\nprediction. With the proposed dataset and two key metrics, our method\ndemonstrates its ability to predict engagements of short videos purely from\nvideo content.\n","authors":["Dasong Li","Wenjie Li","Baili Lu","Hongsheng Li","Sizhuo Ma","Gurunandan Krishnan","Jian Wang"],"pdf_url":"https://arxiv.org/pdf/2410.00289v1.pdf","comment":"Accepted to ECCV 2024. Project page:\n https://github.com/dasongli1/SnapUGC_Engagement"},{"id":"http://arxiv.org/abs/2409.20500v1","updated":"2024-09-30T17:01:26Z","published":"2024-09-30T17:01:26Z","title":"FreeMask: Rethinking the Importance of Attention Masks for Zero-Shot\n Video Editing","summary":" Text-to-video diffusion models have made remarkable advancements. Driven by\ntheir ability to generate temporally coherent videos, research on zero-shot\nvideo editing using these fundamental models has expanded rapidly. To enhance\nediting quality, structural controls are frequently employed in video editing.\nAmong these techniques, cross-attention mask control stands out for its\neffectiveness and efficiency. However, when cross-attention masks are naively\napplied to video editing, they can introduce artifacts such as blurring and\nflickering. Our experiments uncover a critical factor overlooked in previous\nvideo editing research: cross-attention masks are not consistently clear but\nvary with model structure and denoising timestep. To address this issue, we\npropose the metric Mask Matching Cost (MMC) that quantifies this variability\nand propose FreeMask, a method for selecting optimal masks tailored to specific\nvideo editing tasks. Using MMC-selected masks, we further improve the masked\nfusion mechanism within comprehensive attention features, e.g., temp, cross,\nand self-attention modules. Our approach can be seamlessly integrated into\nexisting zero-shot video editing frameworks with better performance, requiring\nno control assistance or parameter fine-tuning but enabling adaptive decoupling\nof unedited semantic layouts with mask precision control. 
Extensive experiments\ndemonstrate that FreeMask achieves superior semantic fidelity, temporal\nconsistency, and editing quality compared to state-of-the-art methods.\n","authors":["Lingling Cai","Kang Zhao","Hangjie Yuan","Yingya Zhang","Shiwei Zhang","Kejie Huang"],"pdf_url":"https://arxiv.org/pdf/2409.20500v1.pdf","comment":"Video Editing"},{"id":"http://arxiv.org/abs/2409.20260v1","updated":"2024-09-30T12:50:46Z","published":"2024-09-30T12:50:46Z","title":"Computer-mediated therapies for stroke rehabilitation: a systematic\n review and meta-Analysis","summary":" OBJECTIVE: To evaluate the efficacy of different forms of virtual reality\n(VR) treatments as either immersive virtual reality (IVR) or non-immersive\nvirtual reality (NIVR) in comparison to conventional therapy (CT) in improving\nphysical and psychological status among stroke patients. METHODS: The\nliterature search was conducted on seven databases. ACM Digital Library,\nMedline (via PubMed), Cochrane, IEEE Xplore, Web of Science, and Scopus. The\neffect sizes of the main outcomes were calculated using Cohen's d. Pooled\nresults were used to present an overall estimate of the treatment effect using\na random-effects model. RESULTS: A total of 22 randomized controlled trials\nwere evaluated. 3 trials demonstrated that immersive virtual reality improved\nupper limb activity, function and activity of daily life in a way comparable to\nCT. 18 trials showed that NIVR had similar benefits to CT for upper limb\nactivity and function, balance and mobility, activities of daily living and\nparticipation. A comparison between the different forms of VR showed that IVR\nmay be more beneficial than NIVR for upper limb training and activities of\ndaily life. CONCLUSIONS: This study found out that IVR therapies may be more\neffective than NIVR but not CT to improve upper limb activity, function, and\ndaily life activities. However, there is no evidence of the durability of IVR\ntreatment. More research involving studies with larger samples is needed to\nassess the long-term effects and promising benefits of immersive virtual\nreality technology.\n","authors":["Stanley Mugisha. Mirko Job. Matteo Zoppi","Marco Testa","Rezia Molfino"],"pdf_url":"https://arxiv.org/pdf/2409.20260v1.pdf","comment":"32 pages"},{"id":"http://arxiv.org/abs/2409.20142v1","updated":"2024-09-30T09:45:08Z","published":"2024-09-30T09:45:08Z","title":"Signal Processing for Haptic Surface Modeling: a Review","summary":" Haptic feedback has been integrated into Virtual and Augmented Reality,\ncomplementing acoustic and visual information and contributing to an all-round\nimmersive experience in multiple fields, spanning from the medical domain to\nentertainment and gaming. Haptic technologies involve complex\ncross-disciplinary research that encompasses sensing, data representation,\ninteractive rendering, perception, and quality of experience. The standard\nprocessing pipeline, consists of (I) sensing physical features in the real\nworld using a transducer, (II) modeling and storing the collected information\nin some digital format, (III) communicating the information, and finally, (IV)\nrendering the haptic information through appropriate devices, thus producing a\nuser experience (V) perceptually close to the original physical world. Among\nthese areas, sensing, rendering and perception have been deeply investigated\nand are the subject of different comprehensive surveys available in the\nliterature. 
Differently, research dealing with haptic surface modeling and data\nrepresentation still lacks a comprehensive dissection. In this work, we aim at\nproviding an overview on modeling and representation of haptic surfaces from a\nsignal processing perspective, covering the aspects that lie in between haptic\ninformation acquisition on one side and rendering and perception on the other\nside. We analyze, categorize, and compare research papers that address the\nhaptic surface modeling and data representation, pointing out existing gaps and\npossible research directions.\n","authors":["Antonio Luigi Stefani","Niccolò Bisagno","Andrea Rosani","Nicola Conci","Francesco De Natale"],"pdf_url":"https://arxiv.org/pdf/2409.20142v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.20081v1","updated":"2024-09-30T08:31:14Z","published":"2024-09-30T08:31:14Z","title":"ProFD: Prompt-Guided Feature Disentangling for Occluded Person\n Re-Identification","summary":" To address the occlusion issues in person Re-Identification (ReID) tasks,\nmany methods have been proposed to extract part features by introducing\nexternal spatial information. However, due to missing part appearance\ninformation caused by occlusion and noisy spatial information from external\nmodel, these purely vision-based approaches fail to correctly learn the\nfeatures of human body parts from limited training data and struggle in\naccurately locating body parts, ultimately leading to misaligned part features.\nTo tackle these challenges, we propose a Prompt-guided Feature Disentangling\nmethod (ProFD), which leverages the rich pre-trained knowledge in the textual\nmodality facilitate model to generate well-aligned part features. ProFD first\ndesigns part-specific prompts and utilizes noisy segmentation mask to\npreliminarily align visual and textual embedding, enabling the textual prompts\nto have spatial awareness. Furthermore, to alleviate the noise from external\nmasks, ProFD adopts a hybrid-attention decoder, ensuring spatial and semantic\nconsistency during the decoding process to minimize noise impact. Additionally,\nto avoid catastrophic forgetting, we employ a self-distillation strategy,\nretaining pre-trained knowledge of CLIP to mitigate over-fitting. Evaluation\nresults on the Market1501, DukeMTMC-ReID, Occluded-Duke, Occluded-ReID, and\nP-DukeMTMC datasets demonstrate that ProFD achieves state-of-the-art results.\nOur project is available at: https://github.com/Cuixxx/ProFD.\n","authors":["Can Cui","Siteng Huang","Wenxuan Song","Pengxiang Ding","Min Zhang","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.20081v1.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2308.14263v3","updated":"2024-09-30T08:17:01Z","published":"2023-08-28T02:38:17Z","title":"Cross-Modal Retrieval: A Systematic Review of Methods and Future\n Directions","summary":" With the exponential surge in diverse multi-modal data, traditional uni-modal\nretrieval methods struggle to meet the needs of users seeking access to data\nacross various modalities. To address this, cross-modal retrieval has emerged,\nenabling interaction across modalities, facilitating semantic matching, and\nleveraging complementarity and consistency between heterogeneous data. Although\nprior literature has reviewed the field of cross-modal retrieval, it suffers\nfrom numerous deficiencies in terms of timeliness, taxonomy, and\ncomprehensiveness. 
This paper conducts a comprehensive review of cross-modal\nretrieval's evolution, spanning from shallow statistical analysis techniques to\nvision-language pre-training models. Commencing with a comprehensive taxonomy\ngrounded in machine learning paradigms, mechanisms, and models, the paper\ndelves deeply into the principles and architectures underpinning existing\ncross-modal retrieval methods. Furthermore, it offers an overview of\nwidely-used benchmarks, metrics, and performances. Lastly, the paper probes the\nprospects and challenges that confront contemporary cross-modal retrieval,\nwhile engaging in a discourse on potential directions for further progress in\nthe field. To facilitate the ongoing research on cross-modal retrieval, we\ndevelop a user-friendly toolbox and an open-source repository at\nhttps://cross-modal-retrieval.github.io.\n","authors":["Tianshi Wang","Fengling Li","Lei Zhu","Jingjing Li","Zheng Zhang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.14263v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.20012v1","updated":"2024-09-30T07:14:31Z","published":"2024-09-30T07:14:31Z","title":"Towards Robust Multimodal Sentiment Analysis with Incomplete Data","summary":" The field of Multimodal Sentiment Analysis (MSA) has recently witnessed an\nemerging direction seeking to tackle the issue of data incompleteness.\nRecognizing that the language modality typically contains dense sentiment\ninformation, we consider it as the dominant modality and present an innovative\nLanguage-dominated Noise-resistant Learning Network (LNLN) to achieve robust\nMSA. The proposed LNLN features a dominant modality correction (DMC) module and\ndominant modality based multimodal learning (DMML) module, which enhances the\nmodel's robustness across various noise scenarios by ensuring the quality of\ndominant modality representations. Aside from the methodical design, we perform\ncomprehensive experiments under random data missing scenarios, utilizing\ndiverse and meaningful settings on several popular datasets (\\textit{e.g.,}\nMOSI, MOSEI, and SIMS), providing additional uniformity, transparency, and\nfairness compared to existing evaluations in the literature. Empirically, LNLN\nconsistently outperforms existing baselines, demonstrating superior performance\nacross these challenging and extensive evaluation metrics.\n","authors":["Haoyu Zhang","Wenbin Wang","Tianshu Yu"],"pdf_url":"https://arxiv.org/pdf/2409.20012v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.19904v1","updated":"2024-09-30T03:18:10Z","published":"2024-09-30T03:18:10Z","title":"WildFusion: Multimodal Implicit 3D Reconstructions in the Wild","summary":" We propose WildFusion, a novel approach for 3D scene reconstruction in\nunstructured, in-the-wild environments using multimodal implicit neural\nrepresentations. WildFusion integrates signals from LiDAR, RGB camera, contact\nmicrophones, tactile sensors, and IMU. This multimodal fusion generates\ncomprehensive, continuous environmental representations, including pixel-level\ngeometry, color, semantics, and traversability. 
Through real-world experiments\non legged robot navigation in challenging forest environments, WildFusion\ndemonstrates improved route selection by accurately predicting traversability.\nOur results highlight its potential to advance robotic navigation and 3D\nmapping in complex outdoor terrains.\n","authors":["Yanbaihui Liu","Boyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2409.19904v1.pdf","comment":"Our project website is at: http://generalroboticslab.com/WildFusion"}]},"2024-09-29T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.19824v1","updated":"2024-09-29T23:12:04Z","published":"2024-09-29T23:12:04Z","title":"Counterfactual Evaluation of Ads Ranking Models through Domain\n Adaptation","summary":" We propose a domain-adapted reward model that works alongside an Offline A/B\ntesting system for evaluating ranking models. This approach effectively\nmeasures reward for ranking model changes in large-scale Ads recommender\nsystems, where model-free methods like IPS are not feasible. Our experiments\ndemonstrate that the proposed technique outperforms both the vanilla IPS method\nand approaches using non-generalized reward models.\n","authors":["Mohamed A. Radwan","Himaghna Bhattacharjee","Quinn Lanners","Jiasheng Zhang","Serkan Karakulak","Houssam Nassif","Murat Ali Bayir"],"pdf_url":"https://arxiv.org/pdf/2409.19824v1.pdf","comment":"Accepted at the CONSEQUENCES'24 workshop, co-located with ACM\n RecSys'24"},{"id":"http://arxiv.org/abs/2406.10250v2","updated":"2024-09-29T10:18:08Z","published":"2024-06-09T15:42:54Z","title":"Robust portfolio optimization for recommender systems considering\n uncertainty of estimated statistics","summary":" This paper is concerned with portfolio optimization models for creating\nhigh-quality lists of recommended items to balance the accuracy and diversity\nof recommendations. However, the statistics (i.e., expectation and covariance\nof ratings) required for mean--variance portfolio optimization are subject to\ninevitable estimation errors. To remedy this situation, we focus on robust\noptimization techniques that derive reliable solutions to uncertain\noptimization problems. Specifically, we propose a robust portfolio optimization\nmodel that copes with the uncertainty of estimated statistics based on the\ncardinality-based uncertainty sets. This robust portfolio optimization model\ncan be reduced to a mixed-integer linear optimization problem, which can be\nsolved exactly using mathematical optimization solvers. Experimental results\nusing two publicly available rating datasets demonstrate that our method can\nimprove not only the recommendation accuracy but also the diversity of\nrecommendations compared with conventional mean--variance portfolio\noptimization models. Notably, our method has the potential to improve the\nrecommendation quality of various rating prediction algorithms.\n","authors":["Tomoya Yanagi","Shunnosuke Ikeda","Yuichi Takano"],"pdf_url":"https://arxiv.org/pdf/2406.10250v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19692v2","updated":"2024-09-29T10:12:40Z","published":"2024-07-29T04:30:38Z","title":"High-Order Fusion Graph Contrastive Learning for Recommendation","summary":" Self-supervised learning (SSL) has recently attracted significant attention\nin the field of recommender systems. Contrastive learning (CL) stands out as a\nmajor SSL paradigm due to its robust ability to generate self-supervised\nsignals. 
Mainstream graph contrastive learning (GCL)-based methods typically\nimplement CL by creating contrastive views through various data augmentation\ntechniques. Despite these methods are effective, we argue that there still\nexist several challenges. i) Data augmentation (e.g., discarding edges or\nadding noise) necessitates additional graph convolution (GCN) or modeling\noperations, which are highly time-consuming and potentially harm the embedding\nquality. ii) Existing CL-based methods use traditional CL objectives to capture\nself-supervised signals. However, few studies have explored obtaining CL\nobjectives from more perspectives and have attempted to fuse the varying\nsignals from these CL objectives to enhance recommendation performance.\n To overcome these challenges, we propose a High-order Fusion Graph\nContrastive Learning (HFGCL) framework for recommendation. Specifically,\ninstead of facilitating data augmentations, we use high-order information from\nGCN process to create contrastive views. Additionally, to integrate\nself-supervised signals from various CL objectives, we propose an advanced CL\nobjective. By ensuring that positive pairs are distanced from negative samples\nderived from both contrastive views, we effectively fuse self-supervised\nsignals from distinct CL objectives, thereby enhancing the mutual information\nbetween positive pairs. Experimental results on three public datasets\ndemonstrate the superior recommendation performance and efficiency of HFGCL\ncompared to the state-of-the-art baselines.\n","authors":["Yu Zhang","Lei Sang","Yi Zhang","Yiwen Zhang","Yun Yang"],"pdf_url":"https://arxiv.org/pdf/2407.19692v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19574v1","updated":"2024-09-29T06:30:44Z","published":"2024-09-29T06:30:44Z","title":"The Devil is in the Sources! Knowledge Enhanced Cross-Domain\n Recommendation in an Information Bottleneck Perspective","summary":" Cross-domain Recommendation (CDR) aims to alleviate the data sparsity and the\ncold-start problems in traditional recommender systems by leveraging knowledge\nfrom an informative source domain. However, previously proposed CDR models\npursue an imprudent assumption that the entire information from the source\ndomain is equally contributed to the target domain, neglecting the evil part\nthat is completely irrelevant to users' intrinsic interest. To address this\nconcern, in this paper, we propose a novel knowledge enhanced cross-domain\nrecommendation framework named CoTrans, which remolds the core procedures of\nCDR models with: Compression on the knowledge from the source domain and\nTransfer of the purity to the target domain. Specifically, following the theory\nof Graph Information Bottleneck, CoTrans first compresses the source behaviors\nwith the perception of information from the target domain. Then to preserve all\nthe important information for the CDR task, the feedback signals from both\ndomains are utilized to promote the effectiveness of the transfer procedure.\nAdditionally, a knowledge-enhanced encoder is employed to narrow gaps caused by\nthe non-overlapped items across separate domains. 
Comprehensive experiments on\nthree widely used cross-domain datasets demonstrate that CoTrans significantly\noutperforms both single-domain and state-of-the-art cross-domain recommendation\napproaches.\n","authors":["Binbin Hu","Weifan Wang","Hanshu Wang","Ziqi Liu","Bin Shen","Yong He","Jiawei Chen"],"pdf_url":"https://arxiv.org/pdf/2409.19574v1.pdf","comment":"Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2409.19548v1","updated":"2024-09-29T04:24:38Z","published":"2024-09-29T04:24:38Z","title":"Meta Learning to Rank for Sparsely Supervised Queries","summary":" Supervisory signals are a critical resource for training learning to rank\nmodels. In many real-world search and retrieval scenarios, these signals may\nnot be readily available or could be costly to obtain for some queries. The\nexamples include domains where labeling requires professional expertise,\napplications with strong privacy constraints, and user engagement information\nthat are too scarce. We refer to these scenarios as sparsely supervised queries\nwhich pose significant challenges to traditional learning to rank models. In\nthis work, we address sparsely supervised queries by proposing a novel meta\nlearning to rank framework which leverages fast learning and adaption\ncapability of meta-learning. The proposed approach accounts for the fact that\ndifferent queries have different optimal parameters for their rankers, in\ncontrast to traditional learning to rank models which only learn a global\nranking model applied to all the queries. In consequence, the proposed method\nwould yield significant advantages especially when new queries are of different\ncharacteristics with the training queries. Moreover, the proposed meta learning\nto rank framework is generic and flexible. We conduct a set of comprehensive\nexperiments on both public datasets and a real-world e-commerce dataset. The\nresults demonstrate that the proposed meta-learning approach can significantly\nenhance the performance of learning to rank models with sparsely labeled\nqueries.\n","authors":["Xuyang Wu","Ajit Puthenputhussery","Hongwei Shang","Changsung Kang","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2409.19548v1.pdf","comment":"Accepted at TOIS"},{"id":"http://arxiv.org/abs/2408.11557v3","updated":"2024-09-29T02:28:45Z","published":"2024-08-21T12:09:37Z","title":"A Quick, trustworthy spectral knowledge Q&A system\n leveragingretrieval-augmented generation on LLM","summary":" Large Language Model (LLM) has demonstrated significant success in a range of\nnatural language processing (NLP) tasks within general domain. The emergence of\nLLM has introduced innovative methodologies across diverse fields, including\nthe natural sciences. Researchers aim to implement automated, concurrent\nprocess driven by LLM to supplant conventional manual, repetitive and\nlabor-intensive work. In the domain of spectral analysis and detection, it is\nimperative for researchers to autonomously acquire pertinent knowledge across\nvarious research objects, which encompasses the spectroscopic techniques and\nthe chemometric methods that are employed in experiments and analysis.\nParadoxically, despite the recognition of spectroscopic detection as an\neffective analytical method, the fundamental process of knowledge retrieval\nremains both time-intensive and repetitive. 
In response to this challenge, we\nfirst introduced the Spectral Detection and Analysis Based Paper(SDAAP)\ndataset, which is the first open-source textual knowledge dataset for spectral\nanalysis and detection and contains annotated literature data as well as\ncorresponding knowledge instruction data. Subsequently, we also designed an\nautomated Q\\&A framework based on the SDAAP dataset, which can retrieve\nrelevant knowledge and generate high-quality responses by extracting entities\nin the input as retrieval parameters. It is worth noting that: within this\nframework, LLM is only used as a tool to provide generalizability, while RAG\ntechnique is used to accurately capture the source of the knowledge.This\napproach not only improves the quality of the generated responses, but also\nensures the traceability of the knowledge. Experimental results show that our\nframework generates responses with more reliable expertise compared to the\nbaseline.\n","authors":["Jiheng Liang","Ziru Yu","Zujie Xie","Xiangyang Yu"],"pdf_url":"https://arxiv.org/pdf/2408.11557v3.pdf","comment":"16 pages,10 figures,3 tables"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.19672v1","updated":"2024-09-29T12:00:57Z","published":"2024-09-29T12:00:57Z","title":"Modeling Layout Reading Order as Ordering Relations for Visually-rich\n Document Understanding","summary":" Modeling and leveraging layout reading order in visually-rich documents\n(VrDs) is critical in document intelligence as it captures the rich structure\nsemantics within documents. Previous works typically formulated layout reading\norder as a permutation of layout elements, i.e. a sequence containing all the\nlayout elements. However, we argue that this formulation does not adequately\nconvey the complete reading order information in the layout, which may\npotentially lead to performance decline in downstream VrD tasks. To address\nthis issue, we propose to model the layout reading order as ordering relations\nover the set of layout elements, which have sufficient expressive capability\nfor the complete reading order information. To enable empirical evaluation on\nmethods towards the improved form of reading order prediction (ROP), we\nestablish a comprehensive benchmark dataset including the reading order\nannotation as relations over layout elements, together with a\nrelation-extraction-based method that outperforms previous methods. Moreover,\nto highlight the practical benefits of introducing the improved form of layout\nreading order, we propose a reading-order-relation-enhancing pipeline to\nimprove model performance on any arbitrary VrD task by introducing additional\nreading order relation inputs. 
Comprehensive results demonstrate that the\npipeline generally benefits downstream VrD tasks: (1) with utilizing the\nreading order relation information, the enhanced downstream models achieve SOTA\nresults on both two task settings of the targeted dataset; (2) with utilizing\nthe pseudo reading order information generated by the proposed ROP model, the\nperformance of the enhanced models has improved across all three models and\neight cross-domain VrD-IE/QA task settings without targeted optimization.\n","authors":["Chong Zhang","Yi Tu","Yixi Zhao","Chenshu Yuan","Huan Chen","Yue Zhang","Mingxu Chai","Ya Guo","Huijia Zhu","Qi Zhang","Tao Gui"],"pdf_url":"https://arxiv.org/pdf/2409.19672v1.pdf","comment":"Accepted as a long paper in the main conference of EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.19627v1","updated":"2024-09-29T09:32:54Z","published":"2024-09-29T09:32:54Z","title":"IDEAW: Robust Neural Audio Watermarking with Invertible Dual-Embedding","summary":" The audio watermarking technique embeds messages into audio and accurately\nextracts messages from the watermarked audio. Traditional methods develop\nalgorithms based on expert experience to embed watermarks into the time-domain\nor transform-domain of signals. With the development of deep neural networks,\ndeep learning-based neural audio watermarking has emerged. Compared to\ntraditional algorithms, neural audio watermarking achieves better robustness by\nconsidering various attacks during training. However, current neural\nwatermarking methods suffer from low capacity and unsatisfactory\nimperceptibility. Additionally, the issue of watermark locating, which is\nextremely important and even more pronounced in neural audio watermarking, has\nnot been adequately studied. In this paper, we design a dual-embedding\nwatermarking model for efficient locating. We also consider the impact of the\nattack layer on the invertible neural network in robustness training, improving\nthe model to enhance both its reasonableness and stability. Experiments show\nthat the proposed model, IDEAW, can withstand various attacks with higher\ncapacity and more efficient locating ability compared to existing methods.\n","authors":["Pengcheng Li","Xulong Zhang","Jing Xiao","Jianzong Wang"],"pdf_url":"https://arxiv.org/pdf/2409.19627v1.pdf","comment":"Accepted by the 2024 Conference on Empirical Methods in Natural\n Language Processing (EMNLP 2024)"},{"id":"http://arxiv.org/abs/2409.19575v1","updated":"2024-09-29T06:30:46Z","published":"2024-09-29T06:30:46Z","title":"Quantitative Analysis of Audio-Visual Tasks: An Information-Theoretic\n Perspective","summary":" In the field of spoken language processing, audio-visual speech processing is\nreceiving increasing research attention. Key components of this research\ninclude tasks such as lip reading, audio-visual speech recognition, and\nvisual-to-speech synthesis. Although significant success has been achieved,\ntheoretical analysis is still insufficient for audio-visual tasks. This paper\npresents a quantitative analysis based on information theory, focusing on\ninformation intersection between different modalities. 
Our results show that\nthis analysis is valuable for understanding the difficulties of audio-visual\nprocessing tasks as well as the benefits that could be obtained by modality\nintegration.\n","authors":["Chen Chen","Xiaolou Li","Zehua Liu","Lantian Li","Dong Wang"],"pdf_url":"https://arxiv.org/pdf/2409.19575v1.pdf","comment":"Accepted by ISCSLP2024"},{"id":"http://arxiv.org/abs/2409.19532v1","updated":"2024-09-29T03:33:35Z","published":"2024-09-29T03:33:35Z","title":"Video DataFlywheel: Resolving the Impossible Data Trinity in\n Video-Language Understanding","summary":" Recently, video-language understanding has achieved great success through\nlarge-scale pre-training. However, data scarcity remains a prevailing\nchallenge. This study quantitatively reveals an \"impossible trinity\" among data\nquantity, diversity, and quality in pre-training datasets. Recent efforts seek\nto refine large-scale, diverse ASR datasets compromised by low quality through\nsynthetic annotations. These methods successfully leverage useful information\nin multimodal video content (frames, tags, ASR transcripts, etc.) to refine the\noriginal annotations. Nevertheless, they struggle to mitigate noise within\nsynthetic annotations and lack scalability as the dataset size expands. To\naddress these issues, we introduce the Video DataFlywheel framework, which\niteratively refines video annotations with improved noise control methods. For\niterative refinement, we first leverage a video-language model to generate\nsynthetic annotations, resulting in a refined dataset. Then, we pre-train on it\nand fine-tune on human refinement examples for a stronger model. These\nprocesses are repeated for continuous improvement. For noise control, we\npresent AdaTaiLr, a novel noise control method that requires weaker assumptions\non noise distribution, thereby proving more effective in large datasets with\ntheoretical guarantees. The combination of iterative refinement and AdaTaiLr\ncan achieve better scalability in video-language understanding. Extensive\nexperiments show that our framework outperforms existing data refinement\nbaselines, delivering a 3% performance boost and improving dataset quality with\nminimal diversity loss. Furthermore, our refined dataset facilitates\nsignificant improvements in various video-language understanding tasks,\nincluding video question answering and text-video retrieval.\n","authors":["Xiao Wang","Jianlong Wu","Zijia Lin","Fuzheng Zhang","Di Zhang","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2409.19532v1.pdf","comment":"Under peer review"},{"id":"http://arxiv.org/abs/2409.19506v1","updated":"2024-09-29T01:29:34Z","published":"2024-09-29T01:29:34Z","title":"IWN: Image Watermarking Based on Idempotency","summary":" In the expanding field of digital media, maintaining the strength and\nintegrity of watermarking technology is becoming increasingly challenging. This\npaper, inspired by the Idempotent Generative Network (IGN), explores the\nprospects of introducing idempotency into image watermark processing and\nproposes an innovative neural network model - the Idempotent Watermarking\nNetwork (IWN). The proposed model, which focuses on enhancing the recovery\nquality of color image watermarks, leverages idempotency to ensure superior\nimage reversibility. This feature ensures that, even if color image watermarks\nare attacked or damaged, they can be effectively projected and mapped back to\ntheir original state. Therefore, the extracted watermarks have unquestionably\nincreased quality. 
The IWN model achieves a balance between embedding capacity\nand robustness, alleviating to some extent the inherent contradiction between\nthese two factors in traditional watermarking techniques and steganography\nmethods.\n","authors":["Kaixin Deng"],"pdf_url":"https://arxiv.org/pdf/2409.19506v1.pdf","comment":null}]},"2024-09-28T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.19445v1","updated":"2024-09-28T19:58:29Z","published":"2024-09-28T19:58:29Z","title":"HTML-LSTM: Information Extraction from HTML Tables in Web Pages using\n Tree-Structured LSTM","summary":" In this paper, we propose a novel method for extracting information from HTML\ntables with similar contents but with a different structure. We aim to\nintegrate multiple HTML tables into a single table for retrieval of information\ncontaining in various Web pages. The method is designed by extending\ntree-structured LSTM, the neural network for tree-structured data, in order to\nextract information that is both linguistic and structural information of HTML\ndata. We evaluate the proposed method through experiments using real data\npublished on the WWW.\n","authors":["Kazuki Kawamura","Akihiro Yamamoto"],"pdf_url":"https://arxiv.org/pdf/2409.19445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19401v1","updated":"2024-09-28T16:22:53Z","published":"2024-09-28T16:22:53Z","title":"Crafting Personalized Agents through Retrieval-Augmented Generation on\n Editable Memory Graphs","summary":" In the age of mobile internet, user data, often referred to as memories, is\ncontinuously generated on personal devices. Effectively managing and utilizing\nthis data to deliver services to users is a compelling research topic. In this\npaper, we introduce a novel task of crafting personalized agents powered by\nlarge language models (LLMs), which utilize a user's smartphone memories to\nenhance downstream applications with advanced LLM capabilities. To achieve this\ngoal, we introduce EMG-RAG, a solution that combines Retrieval-Augmented\nGeneration (RAG) techniques with an Editable Memory Graph (EMG). This approach\nis further optimized using Reinforcement Learning to address three distinct\nchallenges: data collection, editability, and selectability. Extensive\nexperiments on a real-world dataset validate the effectiveness of EMG-RAG,\nachieving an improvement of approximately 10% over the best existing approach.\nAdditionally, the personalized agents have been transferred into a real\nsmartphone AI assistant, which leads to enhanced usability.\n","authors":["Zheng Wang","Zhongyang Li","Zeren Jiang","Dandan Tu","Wei Shi"],"pdf_url":"https://arxiv.org/pdf/2409.19401v1.pdf","comment":"This paper has been accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.19267v1","updated":"2024-09-28T06:47:30Z","published":"2024-09-28T06:47:30Z","title":"Utilizing Collaborative Filtering in a Personalized Research-Paper\n Recommendation System","summary":" Recommendation system is such a platform that helps people to easily find out\nthe things they need within a few seconds. It is implemented based on the\npreferences of similar users or items. In this digital era, the internet has\nprovided us with huge opportunities to use a lot of open resources for our own\nneeds. But there are too many resources on the internet from which finding the\nprecise one is a difficult job. Recommendation system has made this easier for\npeople. 
Research-paper recommendation system is a system that is developed for\npeople with common research interests using a collaborative filtering\nrecommender system. In this paper, coauthor, keyword, reference, and common\ncitation similarities are calculated using Jaccard Similarity to find the final\nsimilarity and to find the top-n similar users. Based on the test of top-n\nsimilar users of the target user research paper recommendations have been made.\nFinally, the accuracy of our recommendation system has been calculated. An\nimpressive result has been found using our proposed system.\n","authors":["Mahamudul Hasan","Anika Tasnim Islam","Nabila Islam"],"pdf_url":"https://arxiv.org/pdf/2409.19267v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.19262v1","updated":"2024-09-28T06:33:18Z","published":"2024-09-28T06:33:18Z","title":"An Efficient Multi-threaded Collaborative Filtering Approach in\n Recommendation System","summary":" Recommender systems are a subset of information filtering systems designed to\npredict and suggest items that users may find interesting or relevant based on\ntheir preferences, behaviors, or interactions. By analyzing user data such as\npast activities, ratings, and preferences, these systems generate personalized\nrecommendations for products, services, or content, with common applications\nincluding online retail, media streaming platforms, and social media.\nRecommender systems are typically categorized into three types: content-based\nfiltering, which recommends items similar to those the user has shown interest\nin; collaborative filtering, which analyzes the preferences of similar users;\nand hybrid methods, which combine both approaches to improve accuracy. These\nsystems enhance user experience by reducing information overload and providing\npersonalized suggestions, thus increasing engagement and satisfaction. However,\nbuilding a scalable recommendation system capable of handling numerous users\nefficiently is a significant challenge, particularly when considering both\nperformance consistency and user data security, which are emerging research\ntopics. The primary objective of this research is to address these challenges\nby reducing the processing time in recommendation systems. A multithreaded\nsimilarity approach is employed to achieve this, where users are divided into\nindependent threads that run in parallel. This parallelization significantly\nreduces computation time compared to traditional methods, resulting in a\nfaster, more efficient, and scalable recommendation system that ensures\nimproved performance without compromising user data security.\n","authors":["Mahamudul Hasan"],"pdf_url":"https://arxiv.org/pdf/2409.19262v1.pdf","comment":"6 Pages 6 Figure, Paper got accepted at the 2nd International\n Conference on Artificial Intelligence, Blockchain, and Internet of Things,\n (AIBThings)"},{"id":"http://arxiv.org/abs/2406.14900v2","updated":"2024-09-28T05:47:38Z","published":"2024-06-21T06:47:28Z","title":"Decoding Matters: Addressing Amplification Bias and Homogeneity Issue\n for LLM-based Recommendation","summary":" Adapting Large Language Models (LLMs) for recommendation requires careful\nconsideration of the decoding process, given the inherent differences between\ngenerating items and natural language. Existing approaches often directly apply\nLLMs' original decoding methods. 
However, we find these methods encounter\nsignificant challenges: 1) amplification bias -- where standard length\nnormalization inflates scores for items containing tokens with generation\nprobabilities close to 1 (termed ghost tokens), and 2) homogeneity issue --\ngenerating multiple similar or repetitive items for a user. To tackle these\nchallenges, we introduce a new decoding approach named Debiasing-Diversifying\nDecoding (D3). D3 disables length normalization for ghost tokens to alleviate\namplification bias, and it incorporates a text-free assistant model to\nencourage tokens less frequently generated by LLMs for counteracting\nrecommendation homogeneity. Extensive experiments on real-world datasets\ndemonstrate the method's effectiveness in enhancing accuracy and diversity.\n","authors":["Keqin Bao","Jizhi Zhang","Yang Zhang","Xinyue Huo","Chong Chen","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2406.14900v2.pdf","comment":"Accepted at EMNLP 2024 Main Conference"}],"Multimedia":[{"id":"http://arxiv.org/abs/2404.18202v2","updated":"2024-09-28T17:00:44Z","published":"2024-04-28T14:42:02Z","title":"WorldGPT: Empowering LLM as Multimodal World Model","summary":" World models are progressively being employed across diverse fields,\nextending from basic environment simulation to complex scenario construction.\nHowever, existing models are mainly trained on domain-specific states and\nactions, and confined to single-modality state representations. In this paper,\nWe introduce WorldGPT, a generalist world model built upon Multimodal Large\nLanguage Model (MLLM). WorldGPT acquires an understanding of world dynamics\nthrough analyzing millions of videos across various domains. To further enhance\nWorldGPT's capability in specialized scenarios and long-term tasks, we have\nintegrated it with a novel cognitive architecture that combines memory\noffloading, knowledge retrieval, and context reflection. As for evaluation, we\nbuild WorldNet, a multimodal state transition prediction benchmark encompassing\nvaried real-life scenarios. Conducting evaluations on WorldNet directly\ndemonstrates WorldGPT's capability to accurately model state transition\npatterns, affirming its effectiveness in understanding and predicting the\ndynamics of complex scenarios. We further explore WorldGPT's emerging potential\nin serving as a world simulator, helping multimodal agents generalize to\nunfamiliar domains through efficiently synthesising multimodal instruction\ninstances which are proved to be as reliable as authentic data for fine-tuning\npurposes. The project is available on\n\\url{https://github.com/DCDmllm/WorldGPT}.\n","authors":["Zhiqi Ge","Hongzhe Huang","Mingze Zhou","Juncheng Li","Guoming Wang","Siliang Tang","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2404.18202v2.pdf","comment":"update v2"},{"id":"http://arxiv.org/abs/2409.10994v2","updated":"2024-09-28T14:04:43Z","published":"2024-09-17T08:56:27Z","title":"Less is More: A Simple yet Effective Token Reduction Method for\n Efficient Multi-modal LLMs","summary":" The rapid advancement of Multimodal Large Language Models (MLLMs) has led to\nremarkable performances across various domains. However, this progress is\naccompanied by a substantial surge in the resource consumption of these models.\nWe address this pressing issue by introducing a new approach, Token Reduction\nusing CLIP Metric (TRIM), aimed at improving the efficiency of MLLMs without\nsacrificing their performance. 
Inspired by human attention patterns in Visual\nQuestion Answering (VQA) tasks, TRIM presents a fresh perspective on the\nselection and reduction of image tokens. The TRIM method has been extensively\ntested across 12 datasets, and the results demonstrate a significant reduction\nin computational overhead while maintaining a consistent level of performance.\nThis research marks a critical stride in efficient MLLM development, promoting\ngreater accessibility and sustainability of high-performing models.\n","authors":["Dingjie Song","Wenjun Wang","Shunian Chen","Xidong Wang","Michael Guan","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.10994v2.pdf","comment":"9 pages, 3 figures, 6 tables Code and Model:\n https://github.com/FreedomIntelligence/TRIM"},{"id":"http://arxiv.org/abs/2409.19220v1","updated":"2024-09-28T03:24:18Z","published":"2024-09-28T03:24:18Z","title":"Extending Depth of Field for Varifocal Multiview Images","summary":" Optical imaging systems are generally limited by the depth of field because\nof the nature of the optics. Therefore, extending depth of field (EDoF) is a\nfundamental task for meeting the requirements of emerging visual applications.\nTo solve this task, the common practice is using multi-focus images from a\nsingle viewpoint. This method can obtain acceptable quality of EDoF under the\ncondition of fixed field of view, but it is only applicable to static scenes\nand the field of view is limited and fixed. An emerging data type, varifocal\nmultiview images have the potential to become a new paradigm for solving the\nEDoF, because the data contains more field of view information than multi-focus\nimages. To realize EDoF of varifocal multiview images, we propose an end-to-end\nmethod for the EDoF, including image alignment, image optimization and image\nfusion. 
Experimental results demonstrate the efficiency of the proposed method.\n","authors":["Zhilong Li","Kejun Wu","Qiong Liu","You Yang"],"pdf_url":"https://arxiv.org/pdf/2409.19220v1.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% 
+ + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 150 + +
+
+
+ + ☆ Vinoground: Scrutinizing LMMs over Dense Temporal Reasoning with Short + Videos + + +
+ There has been growing sentiment recently that modern large multimodal models +(LMMs) have addressed most of the key challenges related to short video +comprehension. As a result, both academia and industry are gradually shifting +their attention towards the more complex challenges posed by understanding +long-form videos. However, is this really the case? Our studies indicate that +LMMs still lack many fundamental reasoning capabilities even when dealing with +short videos. We introduce Vinoground, a temporal counterfactual LMM evaluation +benchmark encompassing 1000 short and natural video-caption pairs. We +demonstrate that existing LMMs severely struggle to distinguish temporal +differences between different actions and object transformations. For example, +the best model GPT-4o only obtains ~50% on our text and video scores, showing a +large gap compared to the human baseline of ~90%. All open-source multimodal +models and CLIP-based models perform much worse, producing mostly random chance +performance. Through this work, we shed light onto the fact that temporal +reasoning in short videos is a problem yet to be fully solved. The dataset and +evaluation code are available at https://vinoground.github.io. + +
+
+ comment: Project Page: https://vinoground.github.io +
+
+
+
+
+ + ☆ Erasing Conceptual Knowledge from Language Models + + +
+ Concept erasure in language models has traditionally lacked a comprehensive +evaluation framework, leading to incomplete assessments of effectiveness of +erasure methods. We propose an evaluation paradigm centered on three critical +criteria: innocence (complete knowledge removal), seamlessness (maintaining +conditional fluent generation), and specificity (preserving unrelated task +performance). Our evaluation metrics naturally motivate the development of +Erasure of Language Memory (ELM), a new method designed to address all three +dimensions. ELM employs targeted low-rank updates to alter output distributions +for erased concepts while preserving overall model capabilities including +fluency when prompted for an erased concept. We demonstrate ELM's efficacy on +biosecurity, cybersecurity, and literary domain erasure tasks. Comparative +analysis shows that ELM achieves superior performance across our proposed +metrics, including near-random scores on erased topic assessments, generation +fluency, maintained accuracy on unrelated benchmarks, and robustness under +adversarial attacks. Our code, data, and trained models are available at +https://elm.baulab.info + +
+
+ comment: Project Page: https://elm.baulab.info +
+
+
+
+
+ + ☆ CorPipe at CRAC 2024: Predicting Zero Mentions from Raw Text + + +
+ We present CorPipe 24, the winning entry to the CRAC 2024 Shared Task on +Multilingual Coreference Resolution. In this third iteration of the shared +task, a novel objective is to also predict empty nodes needed for zero +coreference mentions (while the empty nodes were given on input in previous +years). This way, coreference resolution can be performed on raw text. We +evaluate two model variants: a~two-stage approach (where the empty nodes are +predicted first using a pretrained encoder model and then processed together +with sentence words by another pretrained model) and a single-stage approach +(where a single pretrained encoder model generates empty nodes, coreference +mentions, and coreference links jointly). In both settings, CorPipe surpasses +other participants by a large margin of 3.9 and 2.8 percent points, +respectively. The source code and the trained model are available at +https://github.com/ufal/crac2024-corpipe . + +
+
+ comment: Accepted to CRAC 2024 +
+
+
+
+
+ + ☆ SIEVE: General Purpose Data Filtering System Matching GPT-4o Accuracy at + 1% the Cost + + +
+ Creating specialized large language models requires vast amounts of clean, +special purpose data for training and fine-tuning. With only a handful of +existing large-scale, domain-specific datasets, creation of new datasets is +required in most applications. This requires the development of new +application-specific filtering of web-scale data. Filtering with a +high-performance, general-purpose LLM such as GPT-4o can be highly effective, +but this is extremely expensive at web-scale. This paper proposes SIEVE, a +lightweight alternative that matches GPT-4o accuracy at a fraction of the cost. +SIEVE can perform up to 500 filtering operations for the cost of one GPT-4o +filtering call. The key to SIEVE is a seamless integration of GPT-4o and +lightweight T5 models, using active learning to fine-tune T5 in the background +with a small number of calls to GPT-4o. Once trained, it performs as well as +GPT-4o at a tiny fraction of the cost. We experimentally validate SIEVE on the +OpenWebText dataset, using five highly customized filter tasks targeting high +quality and domain-specific content. Our results demonstrate the effectiveness +and efficiency of our method in curating large, high-quality datasets for +language model training at a substantially lower cost (1%) than existing +techniques. To further validate SIEVE, experiments show that SIEVE and GPT-4o +achieve similar accuracy, with human evaluators preferring SIEVE's filtering +results to those of GPT-4o. + +
+
+
+
+
+ + ☆ Training Language Models on Synthetic Edit Sequences Improves Code + Synthesis + + +
+ Software engineers mainly write code by editing existing programs. In +contrast, large language models (LLMs) autoregressively synthesize programs in +a single pass. One explanation for this is the scarcity of open-sourced edit +data. While high-quality instruction data for code synthesis is already scarce, +high-quality edit data is even scarcer. To fill this gap, we develop a +synthetic data generation algorithm called LintSeq. This algorithm refactors +existing code into a sequence of code edits by using a linter to procedurally +sample across the error-free insertions that can be used to sequentially write +programs. It outputs edit sequences as text strings consisting of consecutive +program diffs. To test LintSeq, we use it to refactor a dataset of instruction ++ program pairs into instruction + program-diff-sequence tuples. Then, we +instruction finetune a series of smaller LLMs ranging from 2.6B to 14B +parameters on both the re-factored and original versions of this dataset, +comparing zero-shot performance on code synthesis benchmarks. We show that +during repeated sampling, edit sequence finetuned models produce more diverse +programs than baselines. This results in better inference-time scaling for +benchmark coverage as a function of samples, i.e. the fraction of problems +"pass@k" solved by any attempt given "k" tries. For example, on HumanEval +pass@50, small LLMs finetuned on synthetic edit sequences are competitive with +GPT-4 and outperform models finetuned on the baseline dataset by +20% (+/-3%) +in absolute score. Finally, we also pretrain our own tiny LMs for code +understanding. We show that finetuning tiny models on synthetic code edits +results in state-of-the-art code synthesis for the on-device model class. Our +150M parameter edit sequence LM matches or outperforms code models with twice +as many parameters, both with and without repeated sampling, including Codex +and AlphaCode. + +
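To make the procedure concrete, a simplified edit-sequence generator in the spirit of the description above could look like the sketch below: repeatedly delete lines from a finished program, keep only states a linter accepts, and emit the reversed sequence of diffs as training text. `lint_ok` is a hypothetical stand-in for a real linter call; this is not the released LintSeq implementation.

    import difflib
    import random

    def edit_sequence(program_lines, lint_ok, steps=5):
        """Turn a finished program into a sequence of diffs that rebuilds it."""
        states = [list(program_lines)]
        current = list(program_lines)
        for _ in range(steps):
            if not current:
                break
            candidate = list(current)
            del candidate[random.randrange(len(candidate))]  # drop one line
            if lint_ok(candidate):                           # keep only error-free states
                current = candidate
                states.append(list(current))
        states.reverse()  # smallest program first, full program last
        diffs = []
        for before, after in zip(states, states[1:]):
            diffs.append("\n".join(difflib.unified_diff(before, after, lineterm="")))
        return diffs  # consecutive program diffs, serialized as text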
+
+
+
+
+ + ☆ CriSPO: Multi-Aspect Critique-Suggestion-guided Automatic Prompt + Optimization for Text Generation + + +
+ Large language models (LLMs) can generate fluent summaries across domains +using prompting techniques, reducing the need to train models for summarization +applications. However, crafting effective prompts that guide LLMs to generate +summaries with the appropriate level of detail and writing style remains a +challenge. In this paper, we explore the use of salient information extracted +from the source document to enhance summarization prompts. We show that adding +keyphrases in prompts can improve ROUGE F1 and recall, making the generated +summaries more similar to the reference and more complete. The number of +keyphrases can control the precision-recall trade-off. Furthermore, our +analysis reveals that incorporating phrase-level salient information is +superior to word- or sentence-level. However, the impact on hallucination is +not universally positive across LLMs. To conduct this analysis, we introduce +Keyphrase Signal Extractor (CriSPO), a lightweight model that can be finetuned +to extract salient keyphrases. By using CriSPO, we achieve consistent ROUGE +improvements across datasets and open-weight and proprietary LLMs without any +LLM customization. Our findings provide insights into leveraging salient +information in building prompt-based summarization systems. + +
+
+
+
+
+ + ☆ Neutral residues: revisiting adapters for model extension + + +
+ We address the problem of extending a pretrained large language model to a +new domain that was not seen at training time, like adding a language for which +the original model has seen no or little training data. Popular solutions like +fine-tuning or low-rank adaptation are successful at domain adaptation, but +formally they do not add any extra capacity and degrade the performance in the +original domain. + Our paper analyzes this extension problem under three angles: data, +architecture and training procedure, which are advantageously considered +jointly. In particular, we improve adapters and make it possible to learn an +entire new language while ensuring that the output of the neural network is +almost unchanged in the original domain. For this purpose, we modify the new +residual blocks in a way that leads each new residual block to output +near-zeros in the original domain. + This solution of neutral residues, which borrows architectural components +from mixture of experts, is effective: with only 20% extra learnable weights +compared to an original model trained on English, we get results that are +significantly better than concurrent approaches (fine-tuning, low-rank or +vanilla adapters) in terms of the trade-off between learning a new language and +not forgetting English. + +
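The core idea, new residual blocks that output near-zero activations on original-domain inputs, can be pictured as a gated bottleneck adapter with an auxiliary penalty that drives the gate to zero on original-domain data. The module below is an illustrative sketch under those assumptions, not the paper's exact architecture or mixture-of-experts routing.

    import torch
    import torch.nn as nn

    class NeutralResidualAdapter(nn.Module):
        """Adapter whose residual contribution is pushed toward zero on the original domain."""
        def __init__(self, d_model, d_hidden):
            super().__init__()
            self.down = nn.Linear(d_model, d_hidden)
            self.up = nn.Linear(d_hidden, d_model)
            self.gate = nn.Linear(d_model, 1)

        def forward(self, x):
            gate = torch.sigmoid(self.gate(x))         # per-token gate in (0, 1)
            delta = self.up(torch.relu(self.down(x)))  # standard bottleneck adapter
            return x + gate * delta, gate

    def neutrality_penalty(gates_on_original_domain):
        # Auxiliary loss term, applied to original-domain batches only:
        # driving the gate to zero makes the block a near-exact identity there.
        return gates_on_original_domain.mean()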
+
+
+
+
+ + ☆ MA-RLHF: Reinforcement Learning from Human Feedback with Macro Actions + + +
+ Reinforcement learning from human feedback (RLHF) has demonstrated +effectiveness in aligning large language models (LLMs) with human preferences. +However, token-level RLHF suffers from the credit assignment problem over long +sequences, where delayed rewards make it challenging for the model to discern +which actions contributed to successful outcomes. This hinders learning +efficiency and slows convergence. In this paper, we propose MA-RLHF, a simple +yet effective RLHF framework that incorporates macro actions -- sequences of +tokens or higher-level language constructs -- into the learning process. By +operating at this higher level of abstraction, our approach reduces the +temporal distance between actions and rewards, facilitating faster and more +accurate credit assignment. This results in more stable policy gradient +estimates and enhances learning efficiency within each episode, all without +increasing computational complexity during training or inference. We validate +our approach through extensive experiments across various model sizes and +tasks, including text summarization, dialogue generation, question answering, +and program synthesis. Our method achieves substantial performance improvements +over standard RLHF, with performance gains of up to 30% in text summarization +and code generation, 18% in dialogue, and 8% in question answering tasks. +Notably, our approach reaches parity with vanilla RLHF 1.7x to 2x faster in +terms of training time and continues to outperform it with further training. We +will make our code and data publicly available at +https://github.com/ernie-research/MA-RLHF . + +
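One simple way to realize macro actions is to chunk the sampled tokens into fixed-length groups and assign credit per chunk rather than per token. The sketch below sums per-token log-probabilities within each chunk and applies a single advantage estimate to the sum; it is a minimal reading of the idea, not the released MA-RLHF code, and fixed-size chunking is only one of the groupings the abstract mentions.

    import torch

    def macro_policy_gradient_loss(token_logprobs, advantages, macro_size=4):
        """token_logprobs: (seq_len,) log-probs of the sampled tokens.
        advantages: (num_macros,) one advantage estimate per macro action."""
        seq_len = token_logprobs.shape[0]
        num_macros = (seq_len + macro_size - 1) // macro_size
        assert advantages.shape[0] == num_macros
        loss = token_logprobs.new_zeros(())
        for m in range(num_macros):
            chunk = token_logprobs[m * macro_size:(m + 1) * macro_size]
            # One credit-assignment step per macro action instead of per token.
            loss = loss - advantages[m] * chunk.sum()
        return loss / num_macros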
+
+
+
+
+ + ☆ Grounding Large Language Models In Embodied Environment With Imperfect + World Models + + +
+ Despite widespread success in various applications, large language models +(LLMs) often stumble when tackling basic physical reasoning or executing +robotics tasks, due to a lack of direct experience with the physical nuances of +the real world. To address these issues, we propose a Grounding Large language +model with Imperfect world MOdel (GLIMO), which utilizes proxy world models +such as simulators to collect and synthesize training data. GLIMO incorporates +an LLM agent-based data generator to automatically create high-quality and +diverse instruction datasets. The generator includes an iterative self-refining +module for temporally consistent experience sampling, a diverse set of +question-answering instruction seeds, and a retrieval-augmented generation +module for reflecting on prior experiences. Comprehensive experiments show that +our approach improves the performance of strong open-source LLMs like LLaMA-3 +with a performance boost of 2.04 $\times$, 1.54 $\times$, and 1.82 $\times$ +across three different benchmarks, respectively. The resulting performance is able to +compete with or surpass that of larger counterparts such as GPT-4. +
+
+
+
+
+ + ☆ Salient Information Prompting to Steer Content in Prompt-based + Abstractive Summarization EMNLP 2024 + + +
+ Large language models (LLMs) can generate fluent summaries across domains +using prompting techniques, reducing the need to train models for summarization +applications. However, crafting effective prompts that guide LLMs to generate +summaries with the appropriate level of detail and writing style remains a +challenge. In this paper, we explore the use of salient information extracted +from the source document to enhance summarization prompts. We show that adding +keyphrases in prompts can improve ROUGE F1 and recall, making the generated +summaries more similar to the reference and more complete. The number of +keyphrases can control the precision-recall trade-off. Furthermore, our +analysis reveals that incorporating phrase-level salient information is +superior to word- or sentence-level. However, the impact on hallucination is +not universally positive across LLMs. To conduct this analysis, we introduce +Keyphrase Signal Extractor (SigExt), a lightweight model that can be finetuned +to extract salient keyphrases. By using SigExt, we achieve consistent ROUGE +improvements across datasets and open-weight and proprietary LLMs without any +LLM customization. Our findings provide insights into leveraging salient +information in building prompt-based summarization systems. + +
+
+ comment: Accepted to EMNLP 2024 Industry Track +
+
+
+
+
+ + ☆ Justice or Prejudice? Quantifying Biases in LLM-as-a-Judge + + +
+ LLM-as-a-Judge has been widely utilized as an evaluation method in various +benchmarks and served as supervised rewards in model training. However, despite +their excellence in many domains, potential issues are under-explored, +undermining their reliability and the scope of their utility. Therefore, we +identify 12 key potential biases and propose a new automated bias +quantification framework, CALM, which systematically quantifies and analyzes each +type of bias in LLM-as-a-Judge by using automated and principle-guided +modification. Our experiments cover multiple popular language models, and the +results indicate that while advanced models have achieved commendable overall +performance, significant biases persist in certain specific tasks. Empirical +results suggest that there remains room for improvement in the reliability of +LLM-as-a-Judge. Moreover, we also discuss the explicit and implicit influence +of these biases and give some suggestions for the reliable application of +LLM-as-a-Judge. Our work highlights the need for stakeholders to address these +issues and reminds users to exercise caution in LLM-as-a-Judge applications. +
+
+
+
+
+ + ☆ DivScene: Benchmarking LVLMs for Object Navigation with Diverse Scenes + and Objects + + +
+ Object navigation in unknown environments is crucial for deploying embodied +agents in real-world applications. While we have witnessed huge progress due to +large-scale scene datasets, faster simulators, and stronger models, previous +studies mainly focus on limited scene types and target objects. In this paper, +we study a new task of navigating to diverse target objects in a large number +of scene types. To benchmark the problem, we present a large-scale scene +dataset, DivScene, which contains 4,614 scenes across 81 different types. With +the dataset, we build an end-to-end embodied agent, NatVLM, by fine-tuning a +Large Vision Language Model (LVLM) through imitation learning. The LVLM is +trained to take previous observations from the environment and generate the +next actions. We also introduce CoT explanation traces of the action prediction +for better performance when tuning LVLMs. Our extensive experiments find that +we can build a performant LVLM-based agent through imitation learning on the +shortest paths constructed by a BFS planner without any human supervision. Our +agent achieves a success rate that surpasses GPT-4o by over 20%. Meanwhile, we +carry out various analyses showing the generalization ability of our agent. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ Unified Multi-Modal Interleaved Document Representation for Information + Retrieval + + +
+ Information Retrieval (IR) methods aim to identify relevant documents in +response to a given query, which have gained remarkable attention due to their +successful application in various natural language tasks. However, existing +approaches typically consider only the textual information within the +documents, which overlooks the fact that documents can contain multiple +modalities, including texts, images, and tables. Further, they often segment +each long document into multiple discrete passages for embedding, preventing +them from capturing the overall document context and interactions between +paragraphs. We argue that these two limitations lead to suboptimal document +representations for retrieval. In this work, to address them, we aim to produce +more comprehensive and nuanced document representations by holistically +embedding documents interleaved with different modalities. Specifically, we +achieve this by leveraging the capability of recent vision-language models that +enable the processing and integration of text, images, and tables into a +unified format and representation. Moreover, to mitigate the information loss +from segmenting documents into passages, instead of representing and retrieving +passages individually, we further merge the representations of segmented +passages into one single document representation, while we additionally +introduce a reranking strategy to decouple and identify the relevant passage +within the document if necessary. Then, through extensive experiments on +diverse information retrieval scenarios considering both the textual and +multimodal queries, we show that our approach substantially outperforms +relevant baselines, thanks to the consideration of the multimodal information +interleaved within the documents in a unified way. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Adaptive Inference-Time Compute: LLMs Can Predict if They Can Do Better, + Even Mid-Generation + + +
+ Inference-time computation is a powerful paradigm to enhance the performance +of large language models (LLMs), with Best-of-N sampling being a widely used +technique. However, this method is computationally expensive, requiring both +(1) an external reward model and (2) the generation of multiple samples. In +this work, we introduce a new generative self-evaluation scheme designed to +adaptively reduce the number of generated samples while maintaining or even +improving performance. We use a generative reward model formulation, allowing +the LLM to predict mid-generation the probability that restarting the +generation will yield a better response. These predictions are obtained without +an external reward model and can be used to decide whether or not to generate +more samples, prune unpromising samples early on, or to pick the best sample. +This capability is very inexpensive as it involves generating a single +predefined token. Trained using a dataset constructed with real unfiltered +LMSYS user prompts, Llama 3.1 8B's win rate against GPT-4 on AlpacaEval +increases from 21% to 34% with 16 samples and math performance on GSM8K +improves from 84% to 91%. By sampling only when the LLM determines that it is +beneficial to do so and adaptively adjusting temperature annealing, we +demonstrate that 74% of the improvement from using 16 samples can be achieved +with only 1.2 samples on average. We further demonstrate that 50-75% of samples +can be pruned early in generation with minimal degradation in performance. +Overall, our methods enable more efficient and scalable compute utilization +during inference for LLMs. + +
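The adaptive sampling loop can be pictured as: generate a sample, ask the same model (via a single predefined token) for the probability that restarting would yield something better, and keep sampling only while that probability stays high. The helper names below (`generate`, `prob_restart_better`) are placeholders for whatever generation and scoring calls one actually has; this is a sketch of the control flow, not the paper's implementation.

    def adaptive_best_of_n(prompt, generate, prob_restart_better,
                           max_samples=16, restart_threshold=0.3):
        """Keep sampling only while the model itself predicts a restart is worthwhile."""
        best, best_score = None, float("-inf")
        for _ in range(max_samples):
            sample = generate(prompt)
            p_better = prob_restart_better(prompt, sample)  # one extra token of compute
            score = 1.0 - p_better                          # higher = model judges this sample good
            if score > best_score:
                best, best_score = sample, score
            if p_better < restart_threshold:                # model predicts more samples won't help
                break
        return best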
+
+
+
+
+ + ☆ Large Language Models as Markov Chains + + +
+ Large language models (LLMs) have proven to be remarkably efficient, both +across a wide range of natural language processing tasks and well beyond them. +However, a comprehensive theoretical analysis of the origins of their +impressive performance remains elusive. In this paper, we approach this +challenging task by drawing an equivalence between generic autoregressive +language models with vocabulary of size $T$ and context window of size $K$ and +Markov chains defined on a finite state space of size $\mathcal{O}(T^K)$. We +derive several surprising findings related to the existence of a stationary +distribution of Markov chains that capture the inference power of LLMs, their +speed of convergence to it, and the influence of the temperature on the latter. +We then prove pre-training and in-context generalization bounds and show how +the drawn equivalence allows us to enrich their interpretation. Finally, we +illustrate our theoretical guarantees with experiments on several recent LLMs +to highlight how they capture the behavior observed in practice. + +
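For concreteness, the state space in this equivalence can be taken as all token sequences of length at most the context size; the back-of-the-envelope count below is a restatement of the abstract's setup in assumed notation (and it ignores the sliding-window truncation when the context is full), not a quote of the paper's exact construction.

    \[
    |\mathcal{S}| \;=\; \sum_{k=0}^{K} T^{k} \;=\; \frac{T^{K+1}-1}{T-1} \;=\; \mathcal{O}\!\left(T^{K}\right),
    \qquad
    P\!\left(s' \mid s\right) \;=\;
    \begin{cases}
    p_{\theta}\!\left(x \mid s\right) & \text{if } s' \text{ is } s \text{ extended by token } x,\\
    0 & \text{otherwise.}
    \end{cases}
    \]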
+
+ comment: 49 pages, 17 figures +
+
+
+
+
+ + ☆ Domain-Specific Retrieval-Augmented Generation Using Vector Stores, + Knowledge Graphs, and Tensor Factorization ICML + + +
+ Large Language Models (LLMs) are pre-trained on large-scale corpora and excel +in numerous general natural language processing (NLP) tasks, such as question +answering (QA). Despite their advanced language capabilities, when it comes to +domain-specific and knowledge-intensive tasks, LLMs suffer from hallucinations, +knowledge cut-offs, and lack of knowledge attributions. Additionally, +fine-tuning LLMs' intrinsic knowledge to highly specific domains is an expensive and +time-consuming process. The retrieval-augmented generation (RAG) process has +recently emerged as a method capable of optimizing LLM responses by +referencing them against a predetermined ontology. It was shown that using a +Knowledge Graph (KG) ontology for RAG improves the QA accuracy, by taking into +account relevant sub-graphs that preserve the information in a structured +manner. In this paper, we introduce SMART-SLIC, a highly domain-specific LLM +framework that integrates RAG with a KG and a vector store (VS) that stores +factual domain-specific information. Importantly, to avoid hallucinations in +the KG, we build these highly domain-specific KGs and VSs without the use of +LLMs, but via NLP, data mining, and nonnegative tensor factorization with +automatic model selection. Pairing our RAG with a domain-specific (i) KG +(containing structured information) and (ii) VS (containing unstructured +information) enables the development of domain-specific chat-bots that +attribute the source of information, mitigate hallucinations, lessen the need +for fine-tuning, and excel in highly domain-specific question answering tasks. +We pair SMART-SLIC with chain-of-thought prompting agents. The framework is +designed to be generalizable to adapt to any specific or specialized domain. In +this paper, we demonstrate the question answering capabilities of our framework +on a corpus of scientific publications on malware analysis and anomaly +detection. +
+
+ comment: 9 pages, 7 figures, 1 table, 1 cypher code. Accepted to ICMLA 2024 +
+
+
+
+
+ + ☆ UncertaintyRAG: Span-Level Uncertainty Enhanced Long-Context Modeling + for Retrieval-Augmented Generation + + +
+ We present UncertaintyRAG, a novel approach for long-context +Retrieval-Augmented Generation (RAG) that utilizes Signal-to-Noise Ratio +(SNR)-based span uncertainty to estimate similarity between text chunks. This +span uncertainty enhances model calibration, improving robustness and +mitigating semantic inconsistencies introduced by random chunking. Leveraging +this insight, we propose an efficient unsupervised learning technique to train +the retrieval model, alongside an effective data sampling and scaling strategy. +UncertaintyRAG outperforms baselines by 2.03% on LLaMA-2-7B, achieving +state-of-the-art results while using only 4% of the training data compared to +other advanced open-source retrieval models under distribution shift settings. +Our method demonstrates strong calibration through span uncertainty, leading to +improved generalization and robustness in long-context RAG tasks. Additionally, +UncertaintyRAG provides a lightweight retrieval model that can be integrated +into any large language model with varying context window lengths, without the +need for fine-tuning, showcasing the flexibility of our approach. + +
+
+
+
+
+ + ☆ Video Instruction Tuning With Synthetic Data + + +
+ The development of video large multimodal models (LMMs) has been hindered by +the difficulty of curating large amounts of high-quality raw data from the web. +To address this, we propose an alternative approach by creating a high-quality +synthetic dataset specifically for video instruction-following, namely +LLaVA-Video-178K. This dataset includes key tasks such as detailed captioning, +open-ended question-answering (QA), and multiple-choice QA. By training on this +dataset, in combination with existing visual instruction tuning data, we +introduce LLaVA-Video, a new video LMM. Our experiments demonstrate that +LLaVA-Video achieves strong performance across various video benchmarks, +highlighting the effectiveness of our dataset. We plan to release the dataset, +its generation pipeline, and the model checkpoints. + +
+
+ comment: Project page: https://llava-vl.github.io/blog/2024-09-30-llava-video/ +
+
+
+
+
+ + ☆ LLaVA-Critic: Learning to Evaluate Multimodal Models + + +
+ We introduce LLaVA-Critic, the first open-source large multimodal model (LMM) +designed as a generalist evaluator to assess performance across a wide range of +multimodal tasks. LLaVA-Critic is trained using a high-quality critic +instruction-following dataset that incorporates diverse evaluation criteria and +scenarios. Our experiments demonstrate the model's effectiveness in two key +areas: (1) LMM-as-a-Judge, where LLaVA-Critic provides reliable evaluation +scores, performing on par with or surpassing GPT models on multiple evaluation +benchmarks; and (2) Preference Learning, where it generates reward signals for +preference learning, enhancing model alignment capabilities. This work +underscores the potential of open-source LMMs in self-critique and evaluation, +setting the stage for future research into scalable, superhuman alignment +feedback mechanisms for LMMs. + +
+
+ comment: Project Page: https://llava-vl.github.io/blog/2024-10-03-llava-critic +
+
+
+
+
+ + ☆ LLMs Know More Than They Show: On the Intrinsic Representation of LLM + Hallucinations + + +
+ Large language models (LLMs) often produce errors, including factual +inaccuracies, biases, and reasoning failures, collectively referred to as +"hallucinations". Recent studies have demonstrated that LLMs' internal states +encode information regarding the truthfulness of their outputs, and that this +information can be utilized to detect errors. In this work, we show that the +internal representations of LLMs encode much more information about +truthfulness than previously recognized. We first discover that the +truthfulness information is concentrated in specific tokens, and leveraging +this property significantly enhances error detection performance. Yet, we show +that such error detectors fail to generalize across datasets, implying that -- +contrary to prior claims -- truthfulness encoding is not universal but rather +multifaceted. Next, we show that internal representations can also be used for +predicting the types of errors the model is likely to make, facilitating the +development of tailored mitigation strategies. Lastly, we reveal a discrepancy +between LLMs' internal encoding and external behavior: they may encode the +correct answer, yet consistently generate an incorrect one. Taken together, +these insights deepen our understanding of LLM errors from the model's internal +perspective, which can guide future research on enhancing error analysis and +mitigation. + +
+
+
+
+
+ + ☆ Selective Attention Improves Transformer + + +
+ Unneeded elements in the attention's context degrade performance. We +introduce Selective Attention, a simple parameter-free change to the standard +attention mechanism which reduces attention to unneeded elements. Selective +attention improves language modeling performance across a variety of model sizes +and context lengths. For example, a range of transformers trained with the +language modeling objective on C4 with selective attention perform equivalently +to standard transformers with ~2X more heads and parameters in their attention +modules. Selective attention also allows decreasing the size of the attention's +context buffer, leading to meaningful reductions in the memory and compute +requirements during inference. For example, transformers with 100M parameters +trained on C4 with context sizes of 512, 1,024, and 2,048 need 16X, 25X, and +47X less memory for their attention module, respectively, when equipped with +selective attention, than those without selective attention, at the same +validation perplexity. +
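A hedged sketch of what "reduce attention to unneeded elements" might look like mechanically: accumulate a non-negative penalty for each context position and subtract it from the attention logits before the softmax, so demoted positions receive little probability mass. This illustrates the general idea under my own assumptions; it is not the paper's exact parameter-free formulation of how the penalty is computed.

    import torch
    import torch.nn.functional as F

    def selective_attention(q, k, v, selection_penalty):
        """q, k, v: (seq, d). selection_penalty: (seq,) >= 0, one value per context position."""
        d = q.shape[-1]
        logits = q @ k.transpose(-2, -1) / d ** 0.5       # standard scaled dot-product logits
        logits = logits - selection_penalty.unsqueeze(0)  # demote positions marked as unneeded
        causal = torch.ones_like(logits, dtype=torch.bool).tril()
        logits = logits.masked_fill(~causal, float("-inf"))
        return F.softmax(logits, dim=-1) @ v

Positions whose penalty grows large contribute almost nothing to the output, which is also what would allow them to be evicted from the context buffer to save memory.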
+
+
+
+
+ + ☆ HELMET: How to Evaluate Long-Context Language Models Effectively and + Thoroughly + + +
+ There have been many benchmarks for evaluating long-context language models +(LCLMs), but developers often rely on synthetic tasks like needle-in-a-haystack +(NIAH) or arbitrary subsets of tasks. It remains unclear whether they translate +to the diverse downstream applications of LCLMs, and the inconsistency further +complicates model comparison. We investigate the underlying reasons behind +current practices and find that existing benchmarks often provide noisy signals +due to low coverage of applications, insufficient lengths, unreliable metrics, +and incompatibility with base models. In this work, we present HELMET (How to +Evaluate Long-context Models Effectively and Thoroughly), a comprehensive +benchmark encompassing seven diverse, application-centric categories. We also +address many issues in previous benchmarks by adding controllable lengths up to +128k tokens, model-based evaluation for reliable metrics, and few-shot +prompting for robustly evaluating base models. Consequently, we demonstrate +that HELMET offers more reliable and consistent rankings of frontier LCLMs. +Through a comprehensive study of 51 LCLMs, we find that (1) synthetic tasks +like NIAH are not good predictors of downstream performance; (2) the diverse +categories in HELMET exhibit distinct trends and low correlation with each +other; and (3) while most LCLMs achieve perfect NIAH scores, open-source models +significantly lag behind closed ones when the task requires full-context +reasoning or following complex instructions -- the gap widens with increased +lengths. Finally, we recommend using our RAG tasks for fast model development, +as they are easy to run and more predictive of other downstream performance; +ultimately, we advocate for a holistic evaluation across diverse tasks. + +
+
+ comment: Code and data are available here: + https://github.com/princeton-nlp/HELMET +
+
+
+
+
+ + ☆ On the Proper Treatment of Tokenization in Psycholinguistics EMNLP 2024 + + +
+ Language models are widely used in computational psycholinguistics to test +theories that relate the negative log probability (the surprisal) of a region +of interest (a substring of characters) under a language model to its cognitive +cost experienced by readers, as operationalized, for example, by gaze duration +on the region. However, the application of modern language models to +psycholinguistic studies is complicated by the practice of using tokenization +as an intermediate step in training a model. Doing so results in a language +model over token strings rather than one over character strings. Vexingly, +regions of interest are generally misaligned with these token strings. The +paper argues that token-level language models should be (approximately) +marginalized into character-level language models before they are used in +psycholinguistic studies to compute the surprisal of a region of interest; +then, the marginalized character-level language model can be used to compute +the surprisal of an arbitrary character substring, which we term a focal area, +that the experimenter may wish to use as a predictor. Our proposal of +marginalizing a token-level model into a character-level one solves this +misalignment issue independently of the tokenization scheme. Empirically, we +discover various focal areas whose surprisal is a better psychometric predictor +than the surprisal of the region of interest itself. + +
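The marginalization being argued for can be written down compactly. In assumed notation (mine, for illustration): the character-level probability of a prefix is the total token-level probability of all token strings that decode to a character string with that prefix, and the surprisal of a focal area is the resulting conditional log probability.

    \[
    p_{\mathrm{char}}(\mathbf{c}) \;=\; \sum_{\mathbf{t} \,:\, \mathrm{decode}(\mathbf{t}) \text{ has prefix } \mathbf{c}} p_{\mathrm{tok}}(\mathbf{t}),
    \qquad
    s(\text{focal area } \mathbf{f} \mid \text{context } \mathbf{c}) \;=\; -\log \frac{p_{\mathrm{char}}(\mathbf{c}\,\mathbf{f})}{p_{\mathrm{char}}(\mathbf{c})} .
    \]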
+
+ comment: Main conference long paper at EMNLP 2024 +
+
+
+
+
+ + ☆ HiddenGuard: Fine-Grained Safe Generation with Specialized + Representation Router + + +
+ As Large Language Models (LLMs) grow increasingly powerful, ensuring their +safety and alignment with human values remains a critical challenge. Ideally, +LLMs should provide informative responses while avoiding the disclosure of +harmful or sensitive information. However, current alignment approaches, which +rely heavily on refusal strategies, such as training models to completely +reject harmful prompts or applying coarse filters are limited by their binary +nature. These methods either fully deny access to information or grant it +without sufficient nuance, leading to overly cautious responses or failures to +detect subtle harmful content. For example, LLMs may refuse to provide basic, +public information about medication due to misuse concerns. Moreover, these +refusal-based methods struggle to handle mixed-content scenarios and lack the +ability to adapt to context-dependent sensitivities, which can result in +over-censorship of benign content. To overcome these challenges, we introduce +HiddenGuard, a novel framework for fine-grained, safe generation in LLMs. +HiddenGuard incorporates Prism (rePresentation Router for In-Stream +Moderation), which operates alongside the LLM to enable real-time, token-level +detection and redaction of harmful content by leveraging intermediate hidden +states. This fine-grained approach allows for more nuanced, context-aware +moderation, enabling the model to generate informative responses while +selectively redacting or replacing sensitive information, rather than outright +refusal. We also contribute a comprehensive dataset with token-level +fine-grained annotations of potentially harmful information across diverse +contexts. Our experiments demonstrate that HiddenGuard achieves over 90% in F1 +score for detecting and redacting harmful content while preserving the overall +utility and informativeness of the model's responses. + +
+
+
+
+
+ + ☆ DailyDilemmas: Revealing Value Preferences of LLMs with Quandaries of + Daily Life + + +
+ As we increasingly seek guidance from LLMs for decision-making in daily life, +many of these decisions are not clear-cut and depend significantly on the +personal values and ethical standards of the users. We present DailyDilemmas, a +dataset of 1,360 moral dilemmas encountered in everyday life. Each dilemma +includes two possible actions and, for each action, the affected parties and +human values invoked. Based on these dilemmas, we consolidated a set of human +values across everyday topics, e.g., interpersonal relationships, workplace, and +environmental issues. We evaluated LLMs on these dilemmas to determine what +action they will take and the values represented by these actions. Then, we +analyzed these values through the lens of five popular theories inspired by +sociology, psychology and philosophy. These theories are: World Value Survey, +Moral Foundation Theory, Maslow's Hierarchy of Needs, Aristotle's Virtues, and +Plutchik's Wheel of Emotions. We find that LLMs are most aligned with +self-expression over survival values in terms of the World Value Survey, and with care over +loyalty in Moral Foundation Theory. Interestingly, we find large preference +differences across models for some core values such as truthfulness, e.g., the +Mixtral-8x7B model tends to neglect it by 9.7% while the GPT-4-turbo model tends to +select it by 9.4%. We also study the recent guidance released by OpenAI +(ModelSpec) and Anthropic (Constitutional AI) to understand how their released +principles reflect their actual value prioritization when facing nuanced moral +reasoning in daily-life settings. We find that end users cannot effectively +steer such prioritization using system prompts. +
+
+ comment: Preprint. Under Review +
+
+
+
+
+ + ☆ Distilling an End-to-End Voice Assistant Without Instruction Training + Data + + +
+ Voice assistants, such as Siri and Google Assistant, typically model audio +and text separately, resulting in lost speech information and increased +complexity. Recent efforts to address this with end-to-end Speech Large +Language Models (LLMs) trained with supervised finetuning (SFT) +have led to models "forgetting" capabilities from text-only LLMs. Our work +proposes an alternative paradigm for training Speech LLMs without instruction +data, using the response of a text-only LLM to transcripts as self-supervision. +Importantly, this process can be performed without annotated responses. We show +that our Distilled Voice Assistant (DiVA) generalizes to Spoken Question +Answering, Classification, and Translation. Furthermore, we show that DiVA +better meets user preferences, achieving a 72% win rate compared with +state-of-the-art models like Qwen 2 Audio, despite using >100x less training +compute. +
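The self-supervision recipe (no annotated responses; the text-only LLM's reaction to the transcript is the target) can be sketched as a distillation loss between the speech model's next-token distribution and the text model's distribution over the same response. `speech_llm` and `text_llm` below are hypothetical callables assumed to return log-probabilities; this is a sketch of the training signal, not the DiVA codebase.

    import torch
    import torch.nn.functional as F

    def response_distillation_loss(audio, transcript, speech_llm, text_llm):
        """Distill a text-only LLM's response distribution into a speech LLM.

        speech_llm(audio) and text_llm(transcript) are assumed to return
        log-probabilities of shape (response_len, vocab) over the same response.
        """
        with torch.no_grad():
            teacher_logprobs = text_llm(transcript)   # text-only LLM reacting to the transcript
        student_logprobs = speech_llm(audio)          # speech LLM reading raw audio
        # KL(teacher || student), averaged over response positions.
        return F.kl_div(student_logprobs, teacher_logprobs,
                        log_target=True, reduction="batchmean")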
+
+
+
+
+ + ☆ CulturalBench: a Robust, Diverse and Challenging Benchmark on Measuring + the (Lack of) Cultural Knowledge of LLMs + + +
+ To make large language models (LLMs) more helpful across diverse cultures, it +is essential to have effective cultural knowledge benchmarks to measure and +track our progress. Effective benchmarks need to be robust, diverse, and +challenging. We introduce CulturalBench: a set of 1,227 human-written and +human-verified questions for effectively assessing LLMs' cultural knowledge, +covering 45 global regions including underrepresented ones like Bangladesh, +Zimbabwe, and Peru. Questions - each verified by five independent annotators - +span 17 diverse topics ranging from food preferences to greeting etiquette. We +evaluate models on two setups: CulturalBench-Easy and CulturalBench-Hard, which +share the same questions but ask them differently. We find that LLMs are sensitive +to such differences in setup (e.g., GPT-4o with a 27.3% difference). Compared to +human performance (92.6% accuracy), CulturalBench-Hard is more challenging for +frontier LLMs, with the best performing model (GPT-4o) at only 61.5% and the +worst (Llama3-8b) at 21.4%. Moreover, we find that LLMs often struggle with +tricky questions that have multiple correct answers (e.g., What utensils do the +Chinese usually use?), revealing a tendency to converge to a single answer. Our +results also indicate that OpenAI GPT-4o substantially outperforms other +proprietary and open-source models on questions related to all but one region +(Oceania). Nonetheless, all models consistently underperform on questions +related to South America and the Middle East. +
+
+ comment: Preprint. Under review +
+
+
+
+
+ + ☆ FAN: Fourier Analysis Networks + + +
+ Despite the remarkable success achieved by neural networks, particularly +those represented by MLP and Transformer, we reveal that they exhibit potential +flaws in the modeling and reasoning of periodicity, i.e., they tend to memorize +the periodic data rather than genuinely understanding the underlying principles +of periodicity. However, periodicity is a crucial trait in various forms of +reasoning and generalization, underpinning predictability across natural and +engineered systems through recurring patterns in observations. In this paper, +we propose FAN, a novel network architecture based on Fourier Analysis, which +empowers the ability to efficiently model and reason about periodic phenomena. +By introducing Fourier Series, the periodicity is naturally integrated into the +structure and computational processes of the neural network, thus achieving a +more accurate expression and prediction of periodic patterns. As a promising +substitute to multi-layer perceptron (MLP), FAN can seamlessly replace MLP in +various models with fewer parameters and FLOPs. Through extensive experiments, +we demonstrate the effectiveness of FAN in modeling and reasoning about +periodic functions, and the superiority and generalizability of FAN across a +range of real-world tasks, including symbolic formula representation, time +series forecasting, and language modeling. + +
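As a rough sketch of a Fourier-style layer in the spirit described above (my own simplified version, not the published FAN architecture): project part of the input through sin/cos so periodic structure is representable in closed form, and keep a conventional nonlinear branch alongside.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class FourierFeatureLayer(nn.Module):
        """Simplified periodic layer: output = [cos(W x), sin(W x), gelu(V x)]."""
        def __init__(self, d_in, d_periodic, d_plain):
            super().__init__()
            self.freq = nn.Linear(d_in, d_periodic, bias=False)  # learned frequencies
            self.plain = nn.Linear(d_in, d_plain)                # ordinary MLP-style branch

        def forward(self, x):
            h = self.freq(x)
            return torch.cat([torch.cos(h), torch.sin(h), F.gelu(self.plain(x))], dim=-1)

The output width is 2 * d_periodic + d_plain, so such a layer can be sized to drop into places where an MLP block would otherwise sit.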
+
+
+
+
+ + ☆ Examining Language Modeling Assumptions Using an Annotated Literary + Dialect Corpus EMNLP2024 + + +
+ We present a dataset of 19th-century American literary orthovariant tokens +with a novel layer of human-annotated dialect group tags designed to serve as +the basis for computational experiments exploring literarily meaningful +orthographic variation. We perform an initial broad set of experiments over +this dataset using both token-level (BERT) and character-level (CANINE) contextual +language models. We find indications that the "dialect effect" produced by +intentional orthographic variation employs multiple linguistic channels, and +that these channels can be surfaced to varied degrees given particular +language modelling assumptions. Specifically, we find evidence showing that the +choice of tokenization scheme meaningfully impacts the type of orthographic +information a model is able to surface. +
+
+ comment: Accepted to NLP4DH@EMNLP2024 +
+
+
+
+
+ + ☆ How to Train Long-Context Language Models (Effectively) + + +
+ We study continued training and supervised fine-tuning (SFT) of a language +model (LM) to make effective use of long-context information. We first +establish a reliable evaluation protocol to guide model development: instead +of perplexity or simple needle-in-a-haystack (NIAH) tests, we use a broad set +of long-context tasks, and we evaluate models after SFT with instruction data, +as this better reveals long-context abilities. Supported by our robust +evaluations, we run thorough experiments to decide the data mix for continued +pre-training, the instruction tuning dataset, and many other design choices. We +find that (1) code repositories and books are excellent sources of long data, +but it is crucial to combine them with high-quality short data; (2) training +with a sequence length beyond the evaluation length boosts long-context +performance; (3) for SFT, using only short instruction datasets yields strong +performance on long-context tasks. Our final model, ProLong-8B, which is +initialized from Llama-3 and trained on 40B tokens, demonstrates +state-of-the-art long-context performance among similarly sized models at a +length of 128K. ProLong outperforms Llama-3.1-8B-Instruct on the majority of +long-context tasks despite having seen only 5% as many tokens during +long-context training. Additionally, ProLong can effectively process up to 512K +tokens, one of the longest context windows of publicly available LMs. +
+
+ comment: Our code, data, and models are available at + https://github.com/princeton-nlp/ProLong +
+
+
+
+
+ + ☆ Hate Personified: Investigating the role of LLMs in content moderation EMNLP'24 + + +
+ For subjective tasks such as hate detection, where people perceive hate +differently, the Large Language Model's (LLM) ability to represent diverse +groups is unclear. By including additional context in prompts, we +comprehensively analyze LLM's sensitivity to geographical priming, persona +attributes, and numerical information to assess how well the needs of various +groups are reflected. Our findings on two LLMs, five languages, and six +datasets reveal that mimicking persona-based attributes leads to annotation +variability. Meanwhile, incorporating geographical signals leads to better +regional alignment. We also find that the LLMs are sensitive to numerical +anchors, indicating the ability to leverage community-based flagging efforts +and exposure to adversaries. Our work provides preliminary guidelines and +highlights the nuances of applying LLMs in culturally sensitive cases. + +
+
+ comment: 17 pages, 6 Figures, 13 Tables, EMNLP'24 Mains +
+
+
+
+
+ + ☆ Measuring and Improving Persuasiveness of Generative Models + + +
+ LLMs are increasingly being used in workflows involving generating content to +be consumed by humans (e.g., marketing) and also in directly interacting with +humans (e.g., through chatbots). The development of such systems that are +capable of generating verifiably persuasive messages presents both +opportunities and challenges for society. On the one hand, such systems could +positively impact domains like advertising and social good, such as addressing +drug addiction, and on the other, they could be misused for spreading +misinformation and shaping political opinions. To channel LLMs' impact on +society, we need to develop systems to measure and benchmark their +persuasiveness. With this motivation, we introduce PersuasionBench and +PersuasionArena, the first large-scale benchmark and arena containing a battery +of tasks to measure the persuasion ability of generative models automatically. +We investigate to what extent LLMs know and leverage linguistic patterns that +can help them generate more persuasive language. Our findings indicate that the +persuasiveness of LLMs correlates positively with model size, but smaller +models can also be made to have a higher persuasiveness than much larger +models. Notably, targeted training using synthetic and natural datasets +significantly enhances smaller models' persuasive capabilities, challenging +scale-dependent assumptions. Our findings carry key implications for both model +developers and policymakers. For instance, while the EU AI Act and California's +SB-1047 aim to regulate AI models based on the number of floating point +operations, we demonstrate that simple metrics like this alone fail to capture +the full scope of AI's societal impact. We invite the community to explore and +contribute to PersuasionArena and PersuasionBench, available at +https://bit.ly/measure-persuasion, to advance our understanding of AI-driven +persuasion and its societal implications. + +
+
+
+
+
+ + ☆ Undesirable Memorization in Large Language Models: A Survey + + +
+ While recent research increasingly showcases the remarkable capabilities of +Large Language Models (LLMs), it's vital to confront their hidden pitfalls. +Among these challenges, the issue of memorization stands out, posing +significant ethical and legal risks. In this paper, we present a +Systematization of Knowledge (SoK) on the topic of memorization in LLMs. +Memorization is the tendency of a model to store and reproduce phrases or +passages from its training data, and it has been shown to be fundamental +to various privacy and security attacks against LLMs. + We begin by providing an overview of the literature on memorization, +exploring it across five key dimensions: intentionality, degree, +retrievability, abstraction, and transparency. Next, we discuss the metrics and +methods used to measure memorization, followed by an analysis of the factors +that contribute to the memorization phenomenon. We then examine how memorization +manifests itself in specific model architectures and explore strategies for +mitigating these effects. We conclude our overview by identifying potential +research topics for the near future: developing methods for balancing +performance and privacy in LLMs, and analyzing memorization in specific +contexts, including conversational agents, retrieval-augmented generation, +multilingual language models, and diffusion language models. +
+
+
+
+
+ + ☆ Immunogenicity Prediction with Dual Attention Enables Vaccine Target + Selection + + +
+ Immunogenicity prediction is a central topic in reverse vaccinology for +finding candidate vaccines that can trigger protective immune responses. +Existing approaches typically rely on highly compressed features and simple +model architectures, leading to limited prediction accuracy and poor +generalizability. To address these challenges, we introduce ProVaccine, a novel +deep learning solution with a dual attention mechanism that integrates +pre-trained latent vector representations of protein sequences and structures. +We also compile the most comprehensive immunogenicity dataset to date, +encompassing over 9,500 antigen sequences, structures, and immunogenicity +labels from bacteria, viruses, and tumors. Extensive experiments demonstrate +that ProVaccine outperforms existing methods across a wide range of evaluation +metrics. Furthermore, we establish a post-hoc validation protocol to assess the +practical significance of deep learning models in tackling vaccine design +challenges. Our work provides an effective tool for vaccine design and sets +valuable benchmarks for future research. + +
+
+ comment: 18 pages, 11 tables, 5 figures +
+
+
+
+
+ + ☆ Attention in Large Language Models Yields Efficient Zero-Shot Re-Rankers + + +
+ Information retrieval (IR) systems have played a vital role in modern digital +life and have cemented their continued usefulness in this new era of generative +AI via retrieval-augmented generation. With strong language processing +capabilities and remarkable versatility, large language models (LLMs) have +become popular choices for zero-shot re-ranking in IR systems. So far, +LLM-based re-ranking methods rely on strong generative capabilities, which +restricts their use to either specialized or powerful proprietary models. Given +these restrictions, we ask: is autoregressive generation necessary and optimal +for LLMs to perform re-ranking? We hypothesize that there are abundant signals +relevant to re-ranking within LLMs that might not be used to their full +potential via generation. To more directly leverage such signals, we propose +in-context re-ranking (ICR), a novel method that leverages the change in +attention pattern caused by the search query for accurate and efficient +re-ranking. To mitigate the intrinsic biases in LLMs, we propose a calibration +method using a content-free query. Due to the absence of generation, ICR only +requires two ($O(1)$) forward passes to re-rank $N$ documents, making it +substantially more efficient than generative re-ranking methods that require at +least $O(N)$ forward passes. Our novel design also enables ICR to be applied to +any LLM without specialized training while guaranteeing a well-formed ranking. +Extensive experiments with two popular open-weight LLMs on standard single-hop +and multi-hop information retrieval benchmarks show that ICR outperforms +RankGPT while cutting the latency by more than 60% in practice. Through +detailed analyses, we show that ICR's performance is especially strong on tasks +that require more complex re-ranking signals. Our findings call for further +exploration of novel ways of utilizing open-weight LLMs beyond text generation. +
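The two-forward-pass recipe can be sketched as: run the LLM once on the documents with the real query and once with a content-free query, aggregate attention mass from the query tokens onto each document's token span, and rank by the calibrated difference. `attention_to_spans` below is a hypothetical helper standing in for whatever attention extraction one's LLM stack provides; the sketch captures the control flow, not ICR's exact aggregation.

    def in_context_rerank(docs, query, content_free_query, attention_to_spans):
        """attention_to_spans(docs, query) is assumed to return one score per document:
        the attention mass flowing from the query tokens onto that document's span."""
        raw = attention_to_spans(docs, query)                       # forward pass 1
        background = attention_to_spans(docs, content_free_query)   # forward pass 2 (calibration)
        calibrated = [r - b for r, b in zip(raw, background)]
        order = sorted(range(len(docs)), key=lambda i: calibrated[i], reverse=True)
        return [docs[i] for i in order]

Because scores come from sorting real-valued attention aggregates rather than from generated text, the ranking is well-formed by construction.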
+
+
+
+
+ + ☆ Large Language Model for Multi-Domain Translation: Benchmarking and + Domain CoT Fine-tuning + + +
+ Achieving consistent high-quality machine translation (MT) across diverse +domains remains a significant challenge, primarily due to the limited and +imbalanced parallel training data available in various domains. While large +language models (LLMs) have demonstrated impressive general understanding and +generation abilities, their potential in multi-domain MT is under-explored. We +establish a comprehensive benchmark for multi-domain translation, featuring 25 +German$\Leftrightarrow$English and 22 Chinese$\Leftrightarrow$English test sets +respectively covering 15 domains. Our evaluation of prominent LLMs reveals a +discernible performance gap against traditional MT systems, highlighting domain +overfitting and catastrophic forgetting issues after fine-tuning on +domain-limited corpora. To mitigate this, we propose a domain Chain of Thought +(CoT) fine-tuning technique that utilizes the intrinsic multi-domain +intelligence of LLMs to improve translation performance. This method inspires +the LLM to perceive domain information from the source text, which then serves +as a helpful hint to guide the translation process. Despite being trained on a +small dataset of four domains, our CoT fine-tuning approach achieves notable +improvements in translation accuracy and domain robustness over traditional +fine-tuning, as evidenced by an average 1.53 BLEU score increase across over 20 +distinct German$\rightarrow$English out-of-domain tests. +
+
+
+
+
+ + ☆ NL-Eye: Abductive NLI for Images + + +
+ Will a Visual Language Model (VLM)-based bot warn us about slipping if it +detects a wet floor? Recent VLMs have demonstrated impressive capabilities, yet +their ability to infer outcomes and causes remains underexplored. To address +this, we introduce NL-Eye, a benchmark designed to assess VLMs' visual +abductive reasoning skills. NL-Eye adapts the abductive Natural Language +Inference (NLI) task to the visual domain, requiring models to evaluate the +plausibility of hypothesis images based on a premise image and explain their +decisions. NL-Eye consists of 350 carefully curated triplet examples (1,050 +images) spanning diverse reasoning categories: physical, functional, logical, +emotional, cultural, and social. The data curation process involved two steps - +writing textual descriptions and generating images using text-to-image models, +both requiring substantial human involvement to ensure high-quality and +challenging scenes. Our experiments show that VLMs struggle significantly on +NL-Eye, often performing at random baseline levels, while humans excel in both +plausibility prediction and explanation quality. This demonstrates a deficiency +in the abductive reasoning capabilities of modern VLMs. NL-Eye represents a +crucial step toward developing VLMs capable of robust multimodal reasoning for +real-world applications, including accident-prevention bots and generated video +verification. + +
+
+
+
+
+ + ☆ IndicSentEval: How Effectively do Multilingual Transformer Models encode + Linguistic Properties for Indic Languages? + + +
+ Transformer-based models have revolutionized the field of natural language +processing. To understand why they perform so well and to assess their +reliability, several studies have focused on questions such as: Which +linguistic properties are encoded by these models, and to what extent? How +robust are these models in encoding linguistic properties when faced with +perturbations in the input text? However, these studies have mainly focused on +BERT and the English language. In this paper, we investigate similar questions +regarding encoding capability and robustness for 8 linguistic properties across +13 different perturbations in 6 Indic languages, using 9 multilingual +Transformer models (7 universal and 2 Indic-specific). To conduct this study, +we introduce a novel multilingual benchmark dataset, IndicSentEval, containing +approximately 47K sentences. Surprisingly, our probing analysis of +surface, syntactic, and semantic properties reveals that while almost all +multilingual models demonstrate consistent encoding performance for English, +they show mixed results for Indic languages. As expected, Indic-specific +multilingual models capture linguistic properties in Indic languages better +than universal models. Intriguingly, universal models broadly exhibit better +robustness compared to Indic-specific models, particularly under perturbations +such as dropping both nouns and verbs, dropping only verbs, or keeping only +nouns. Overall, this study provides valuable insights into probing and +perturbation-specific strengths and weaknesses of popular multilingual +Transformer-based models for different Indic languages. We make our code and +dataset publicly available [https://tinyurl.com/IndicSentEval]. +
+
+ comment: 23 pages, 11 figures +
+
+
+
+
+ + ☆ Ethio-Fake: Cutting-Edge Approaches to Combat Fake News in + Under-Resourced Languages Using Explainable AI + + +
+ The proliferation of fake news has emerged as a significant threat to the +integrity of information dissemination, particularly on social media platforms. +Misinformation can spread quickly due to the ease of creating and disseminating +content, affecting public opinion and sociopolitical events. Identifying false +information is therefore essential to reducing its negative consequences and +maintaining the reliability of online news sources. Traditional approaches to +fake news detection often rely solely on content-based features, overlooking +the crucial role of social context in shaping the perception and propagation of +news articles. In this paper, we propose a comprehensive approach that +integrates social context-based features with news content features to enhance +the accuracy of fake news detection in under-resourced languages. We perform +several experiments utilizing a variety of methodologies, including traditional +machine learning, neural networks, ensemble learning, and transfer learning. +Assessment of the outcomes of the experiments shows that the ensemble learning +approach has the highest accuracy, achieving a 0.99 F1 score. Additionally, +when compared with monolingual models, the fine-tuned model with the target +language outperformed others, achieving a 0.94 F1 score. We analyze the +functioning of the models, considering the important features that contribute +to model performance, using explainable AI techniques. + +
+
+
+
+
+ + ☆ Agents' Room: Narrative Generation through Multi-step Collaboration ICLR 2025 + + +
+ Writing compelling fiction is a multifaceted process combining elements such +as crafting a plot, developing interesting characters, and using evocative +language. While large language models (LLMs) show promise for story writing, +they currently rely heavily on intricate prompting, which limits their use. We +propose Agents' Room, a generation framework inspired by narrative theory, that +decomposes narrative writing into subtasks tackled by specialized agents. To +illustrate our method, we introduce Tell Me A Story, a high-quality dataset of +complex writing prompts and human-written stories, and a novel evaluation +framework designed specifically for assessing long narratives. We show that +Agents' Room generates stories that are preferred by expert evaluators over +those produced by baseline systems by leveraging collaboration and +specialization to decompose the complex story writing task into tractable +components. We provide extensive analysis with automated and human-based +metrics of the generated output. + +
+
+ comment: Under review as a conference paper at ICLR 2025 +
+
+
+
+
+ + ☆ Towards Implicit Bias Detection and Mitigation in Multi-Agent LLM + Interactions EMNLP + + +
+ As Large Language Models (LLMs) continue to evolve, they are increasingly +being employed in numerous studies to simulate societies and execute diverse +social tasks. However, LLMs are susceptible to societal biases due to their +exposure to human-generated data. Given that LLMs are being used to gain +insights into various societal aspects, it is essential to mitigate these +biases. To that end, our study investigates the presence of implicit gender +biases in multi-agent LLM interactions and proposes two strategies to mitigate +these biases. We begin by creating a dataset of scenarios where implicit gender +biases might arise, and subsequently develop a metric to assess the presence of +biases. Our empirical analysis reveals that LLMs generate outputs characterized +by strong implicit bias associations (>= 50\% of the time). Furthermore, these +biases tend to escalate following multi-agent interactions. To mitigate them, +we propose two strategies: self-reflection with in-context examples (ICE); and +supervised fine-tuning. Our research demonstrates that both methods effectively +mitigate implicit biases, with the ensemble of fine-tuning and self-reflection +proving to be the most successful. + +
+
+ comment: Accepted to EMNLP Findings 2024 +
+
+
+
+
+ + ☆ Convolutional Variational Autoencoders for Spectrogram Compression in + Automatic Speech Recognition + + +
+ For many Automatic Speech Recognition (ASR) tasks, audio features such as +spectrograms show better results than Mel-frequency Cepstral Coefficients +(MFCC), but in practice they are hard to use due to the high dimensionality of +the feature space. This paper presents an alternative approach to +generating a compressed spectrogram representation, based on Convolutional +Variational Autoencoders (VAE). A Convolutional VAE model was trained on a +subsample of the LibriSpeech dataset to reconstruct short fragments of audio +spectrograms (25 ms) from a 13-dimensional embedding. A model trained for a +40-dimensional (300 ms) embedding was then used to generate features for a corpus of +spoken commands from the GoogleSpeechCommands dataset. Using the generated +features, an ASR system was built and compared to the model with MFCC features. +
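A minimal convolutional VAE for short spectrogram patches, in the spirit described above; the patch shape, channel counts, and layer sizes are my own illustrative choices rather than the paper's configuration.

    import torch
    import torch.nn as nn

    class SpectrogramVAE(nn.Module):
        """Toy convolutional VAE compressing a (1, 80, 4) spectrogram patch to a small embedding."""
        def __init__(self, latent_dim=13):
            super().__init__()
            self.encoder = nn.Sequential(
                nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1), nn.ReLU(),
                nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1), nn.ReLU(),
                nn.Flatten())
            self.to_mu = nn.Linear(32 * 20 * 1, latent_dim)
            self.to_logvar = nn.Linear(32 * 20 * 1, latent_dim)
            self.decoder = nn.Sequential(
                nn.Linear(latent_dim, 32 * 20 * 1), nn.Unflatten(1, (32, 20, 1)),
                nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, padding=1, output_padding=(1, 1)), nn.ReLU(),
                nn.ConvTranspose2d(16, 1, kernel_size=3, stride=2, padding=1, output_padding=(1, 1)))

        def forward(self, x):
            h = self.encoder(x)
            mu, logvar = self.to_mu(h), self.to_logvar(h)
            z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)  # reparameterization trick
            return self.decoder(z), mu, logvar

After training with the usual reconstruction plus KL objective, the mean vector mu serves as the compressed feature that replaces MFCCs in the downstream ASR system.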
+
+ comment: Theory and Practice of Natural Computing 9th International + Conference, TPNC 2020, Taoyuan, Taiwan, 2020, Proceedings 9 +
+
+
+
+
+ + ☆ Improving Unsupervised Constituency Parsing via Maximizing Semantic + Information + + +
+ Unsupervised constituency parsers organize phrases within a sentence into a +tree-shaped syntactic constituent structure that reflects the organization of +sentence semantics. However, the traditional objective of maximizing sentence +log-likelihood (LL) does not explicitly account for the close relationship +between the constituent structure and the semantics, resulting in a weak +correlation between LL values and parsing accuracy. In this paper, we introduce +a novel objective for training unsupervised parsers: maximizing the information +between constituent structures and sentence semantics (SemInfo). We introduce a +bag-of-substrings model to represent the semantics and apply the +probability-weighted information metric to estimate the SemInfo. Additionally, +we develop a Tree Conditional Random Field (TreeCRF)-based model to apply the +SemInfo maximization objective to Probabilistic Context-Free Grammar (PCFG) +induction, the state-of-the-art method for unsupervised constituency parsing. +Experiments demonstrate that SemInfo correlates more strongly with parsing +accuracy than LL. Our algorithm significantly enhances parsing accuracy by an +average of 7.85 points across five PCFG variants and in four languages, +achieving new state-of-the-art results in three of the four languages. + +
+
+
+
+
+ + ☆ ColaCare: Enhancing Electronic Health Record Modeling through Large + Language Model-Driven Multi-Agent Collaboration + + +
+ We introduce ColaCare, a framework that enhances Electronic Health Record +(EHR) modeling through multi-agent collaboration driven by Large Language +Models (LLMs). Our approach seamlessly integrates domain-specific expert models +with LLMs to bridge the gap between structured EHR data and text-based +reasoning. Inspired by clinical consultations, ColaCare employs two types of +agents: DoctorAgent and MetaAgent, which collaboratively analyze patient data. +Expert models process and generate predictions from numerical EHR data, while +LLM agents produce reasoning references and decision-making reports within the +collaborative consultation framework. We additionally incorporate the Merck +Manual of Diagnosis and Therapy (MSD) medical guideline within a +retrieval-augmented generation (RAG) module for authoritative evidence support. +Extensive experiments conducted on four distinct EHR datasets demonstrate +ColaCare's superior performance in mortality prediction tasks, underscoring its +potential to revolutionize clinical decision support systems and advance +personalized precision medicine. The code, complete prompt templates, more case +studies, etc. are publicly available at the anonymous link: +https://colacare.netlify.app. + +
+
+
+
+
+ + ☆ MedVisionLlama: Leveraging Pre-Trained Large Language Model Layers to + Enhance Medical Image Segmentation WACV + + +
+ Large Language Models (LLMs), known for their versatility in textual data, +are increasingly being explored for their potential to enhance medical image +segmentation, a crucial task for accurate diagnostic imaging. This study +explores enhancing Vision Transformers (ViTs) for medical image segmentation by +integrating pre-trained LLM transformer blocks. Our approach, which +incorporates a frozen LLM transformer block into the encoder of a ViT-based +model, leads to substantial improvements in segmentation performance across +various medical imaging modalities. We propose a Hybrid Attention Mechanism +that combines global and local feature learning with a Multi-Scale Fusion Block +for aggregating features across different scales. The enhanced model shows +significant performance gains, including an average Dice score increase from +0.74 to 0.79 and improvements in accuracy, precision, and the Jaccard Index. +These results demonstrate the effectiveness of LLM-based transformers in +refining medical image segmentation, highlighting their potential to +significantly boost model accuracy and robustness. The source code and our +implementation are available at: https://bit.ly/3zf2CVs + +
+
+ comment: Submitted to IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2025 +
+
+
+
+
+ + ☆ Algorithms For Automatic Accentuation And Transcription Of Russian Texts + In Speech Recognition Systems SP + + +
+ This paper presents an overview of a rule-based system for automatic
+accentuation and phonemic transcription of Russian texts for speech-related
+tasks, such as Automatic Speech Recognition (ASR). The two parts of the
+developed system, accentuation and transcription, use different approaches to
+achieve correct phonemic representations of input phrases. Accentuation is
+based on the "Grammatical Dictionary of the Russian Language" by A. A.
+Zaliznyak and a Wiktionary corpus. To distinguish homographs, the accentuation
+system also utilises morphological information about the sentences obtained
+with Recurrent Neural Networks (RNN). The transcription algorithms apply the
+rules presented in the monograph "Computer Synthesis and Voice Cloning" by
+B. M. Lobanov and L. I. Tsirulnik. The rules described in the present paper are
+implemented in an open-source module, which can be of use in any scientific
+study connected to ASR or Speech-To-Text (STT) tasks. Automatically marked-up
+text annotations of the Russian Voxforge database were used as training data
+for an acoustic model in CMU Sphinx. The resulting acoustic model was evaluated
+with cross-validation, yielding a mean Word Accuracy of 71.2%. The developed
+toolkit is written in Python and is accessible on GitHub to any interested
+researcher.
+
+
+
+ comment: Speech and Computer 20th International Conference, SPECOM 2018, + Leipzig, Germany, Proceedings 20 +
+
+
+
+
+ + ☆ Contextual Document Embeddings + + +
+ Dense document embeddings are central to neural retrieval. The dominant +paradigm is to train and construct embeddings by running encoders directly on +individual documents. In this work, we argue that these embeddings, while +effective, are implicitly out-of-context for targeted use cases of retrieval, +and that a contextualized document embedding should take into account both the +document and neighboring documents in context - analogous to contextualized +word embeddings. We propose two complementary methods for contextualized +document embeddings: first, an alternative contrastive learning objective that +explicitly incorporates the document neighbors into the intra-batch contextual +loss; second, a new contextual architecture that explicitly encodes neighbor +document information into the encoded representation. Results show that both +methods achieve better performance than biencoders in several settings, with +differences especially pronounced out-of-domain. We achieve state-of-the-art +results on the MTEB benchmark with no hard negative mining, score distillation, +dataset-specific instructions, intra-GPU example-sharing, or extremely large +batch sizes. Our method can be applied to improve performance on any +contrastive learning dataset and any biencoder. + +
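+ As a rough sketch of the first ingredient above (the neighbor-aware
+contrastive objective), the toy example below computes an InfoNCE-style loss in
+which the candidate set for a query is the positive document plus its corpus
+neighbors. Dimensions, temperature, and the random data are assumptions for
+illustration, not the paper's setup.
+
+```python
+# Schematic neighbor-aware contrastive loss (illustration, not the authors' code).
+import numpy as np
+
+def neighbor_contrastive_loss(query_emb, candidate_embs, positive_idx, temperature=0.05):
+    """InfoNCE-style loss where the candidates are the positive document plus its
+    neighboring documents, so the encoder must separate a document from its context."""
+    sims = candidate_embs @ query_emb / temperature        # similarity to each candidate
+    log_probs = sims - np.log(np.sum(np.exp(sims)))        # log-softmax over candidates
+    return -log_probs[positive_idx]
+
+rng = np.random.default_rng(0)
+q = rng.normal(size=64); q /= np.linalg.norm(q)
+docs = rng.normal(size=(8, 64))                            # positive + 7 neighbor documents
+docs /= np.linalg.norm(docs, axis=1, keepdims=True)
+docs[0] = q                                                # make document 0 the true match
+print(neighbor_contrastive_loss(q, docs, positive_idx=0))  # small loss for the aligned pair
+```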
+
+
+
+
+ + ☆ Methods for Automatic Matrix Language Determination of Code-Switched + Speech EMNLP + + +
+ Code-switching (CS) is the process of speakers alternating between two or
+more languages, which is becoming increasingly common in the modern world. To
+better describe CS speech, the Matrix Language Frame (MLF) theory introduces
+the concept of a Matrix Language, the language that provides the grammatical
+structure of a CS utterance. In this work, the MLF theory was used to develop
+systems for Matrix Language Identity (MLID) determination. The MLID of
+English/Mandarin and English/Spanish CS text and speech was compared to
+acoustic language identity (LID), which is a typical way to identify a language
+in monolingual utterances. MLID predictors from audio show higher correlation
+with the textual principles than LID in all cases, while also outperforming LID
+in an MLID recognition task in terms of macro F1 (60%) and correlation score
+(0.38). This novel approach finds that the non-English languages (Mandarin and
+Spanish) are preferred over English as the Matrix Language, contrary to the
+monolingual choice made by LID.
+
+
+
+ comment: Accepted at EMNLP +
+
+
+
+
+ + ☆ Can Large Language Models Grasp Legal Theories? Enhance Legal Reasoning + with Insights from Multi-Agent Collaboration + + +
+ Large Language Models (LLMs) could struggle to fully understand legal
+theories and perform complex legal reasoning tasks. In this study, we introduce
+a challenging task (confusing charge prediction) to better evaluate LLMs'
+understanding of legal theories and reasoning capabilities. We also propose a
+novel framework, the Multi-Agent framework for improving complex Legal
+Reasoning capability (MALR). MALR employs non-parametric learning, encouraging
+LLMs to automatically decompose complex legal tasks and to mimic the human
+learning process to extract insights from legal rules, helping LLMs better
+understand legal theories and enhance their legal reasoning abilities.
+Extensive experiments on multiple real-world datasets demonstrate that the
+proposed framework effectively addresses complex reasoning issues in practical
+scenarios, paving the way for more reliable applications in the legal domain.
+
+
+
+
+
+
+ + ☆ Mixed-Session Conversation with Egocentric Memory EMNLP + + +
+ Recently introduced dialogue systems have demonstrated high usability.
+However, they still fall short of reflecting real-world conversation scenarios.
+Current dialogue systems exhibit an inability to replicate the dynamic,
+continuous, long-term interactions involving multiple partners. This shortfall
+arises because there have been limited efforts to account for both aspects of
+real-world dialogues: deeply layered interactions over the long-term dialogue
+and widely expanded conversation networks involving multiple participants. To
+incorporate both of these aspects, we introduce Mixed-Session Conversation, a
+dialogue system designed to construct conversations with various partners in a
+multi-session dialogue setup. We propose a new dataset called MiSC to implement
+this system. The dialogue episodes of MiSC consist of 6 consecutive sessions,
+with four speakers (one main speaker and three partners) appearing in each
+episode. Also, we propose a new dialogue model with a novel memory management
+mechanism, called Egocentric Memory Enhanced Mixed-Session Conversation Agent
+(EMMA). EMMA collects and retains memories from the main speaker's perspective
+during conversations with partners, enabling seamless continuity in subsequent
+interactions. Extensive human evaluations validate that the dialogues in MiSC
+demonstrate a seamless conversational flow, even when conversation partners
+change in each session. Evaluations also show that EMMA, when trained with
+MiSC, maintains high memorability without contradiction throughout the entire
+conversation.
+
+
+
+ comment: EMNLP Findings 2024 (30 pages); Project website: + https://mixed-session.github.io/ +
+
+
+
+
+ + ☆ Defining Knowledge: Bridging Epistemology and Large Language Models EMNLP 2024 + + +
+ Knowledge claims are abundant in the literature on large language models
+(LLMs), but can we say that GPT-4 truly "knows" the Earth is round? To address
+this question, we review standard definitions of knowledge in epistemology and
+formalize interpretations applicable to LLMs. In doing so, we identify
+inconsistencies and gaps in how current NLP research conceptualizes knowledge
+with respect to epistemological frameworks. Additionally, we conduct a survey
+of 100 professional philosophers and computer scientists to compare their
+preferences in knowledge definitions and their views on whether LLMs can really
+be said to know. Finally, we suggest evaluation protocols for testing knowledge
+in accordance with the most relevant definitions.
+
+
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ☆ Dynamic Gradient Alignment for Online Data Mixing + + +
+ The composition of training data mixtures is critical for effectively +training large language models (LLMs), as it directly impacts their performance +on downstream tasks. Our goal is to identify an optimal data mixture to +specialize an LLM for a specific task with access to only a few examples. +Traditional approaches to this problem include ad-hoc reweighting methods, +importance sampling, and gradient alignment techniques. This paper focuses on +gradient alignment and introduces Dynamic Gradient Alignment (DGA), a scalable +online gradient alignment algorithm. DGA dynamically estimates the pre-training +data mixture on which the models' gradients align as well as possible with +those of the model on the specific task. DGA is the first gradient alignment +approach that incurs minimal overhead compared to standard pre-training and +outputs a competitive model, eliminating the need for retraining the model. +Experimentally, we demonstrate significant improvements over importance +sampling in two key scenarios: (i) when the pre-training set is small and +importance sampling overfits due to limited data; and (ii) when there is +insufficient specialized data, trapping importance sampling on narrow pockets +of data. Our findings underscore the effectiveness of gradient alignment +methods in optimizing training data mixtures, particularly in data-constrained +environments, and offer a practical solution for enhancing LLM performance on +specific tasks with limited data availability. + +
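+ For intuition, here is a generic gradient-alignment reweighting step in the
+spirit of the approach described above; it is an illustration of the idea, not
+DGA's exact update rule. Domains whose gradients align with the target task's
+gradient receive more weight in the next mixture.
+
+```python
+# Generic gradient-alignment reweighting sketch (not the exact DGA algorithm).
+import numpy as np
+
+def mixture_weights(domain_grads, task_grad, temperature=1.0):
+    """Weight each pre-training domain by the cosine alignment between its gradient
+    and the target-task gradient, normalized with a softmax."""
+    sims = np.array([
+        np.dot(g, task_grad) / (np.linalg.norm(g) * np.linalg.norm(task_grad) + 1e-12)
+        for g in domain_grads
+    ])
+    w = np.exp(sims / temperature)
+    return w / w.sum()
+
+rng = np.random.default_rng(0)
+task_grad = rng.normal(size=1000)
+domain_grads = [rng.normal(size=1000) + (0.5 * task_grad if i == 2 else 0.0)
+                for i in range(4)]
+print(mixture_weights(domain_grads, task_grad))   # domain 2 gets up-weighted
+```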
+
+
+
+
+ + ☆ DTVLT: A Multi-modal Diverse Text Benchmark for Visual Language Tracking + Based on LLM + + +
+ Visual language tracking (VLT) has emerged as a cutting-edge research area, +harnessing linguistic data to enhance algorithms with multi-modal inputs and +broadening the scope of traditional single object tracking (SOT) to encompass +video understanding applications. Despite this, most VLT benchmarks still +depend on succinct, human-annotated text descriptions for each video. These +descriptions often fall short in capturing the nuances of video content +dynamics and lack stylistic variety in language, constrained by their uniform +level of detail and a fixed annotation frequency. As a result, algorithms tend +to default to a "memorize the answer" strategy, diverging from the core +objective of achieving a deeper understanding of video content. Fortunately, +the emergence of large language models (LLMs) has enabled the generation of +diverse text. This work utilizes LLMs to generate varied semantic annotations +(in terms of text lengths and granularities) for representative SOT benchmarks, +thereby establishing a novel multi-modal benchmark. Specifically, we (1) +propose a new visual language tracking benchmark with diverse texts, named +DTVLT, based on five prominent VLT and SOT benchmarks, including three +sub-tasks: short-term tracking, long-term tracking, and global instance +tracking. (2) We offer four granularity texts in our benchmark, considering the +extent and density of semantic information. We expect this multi-granular +generation strategy to foster a favorable environment for VLT and video +understanding research. (3) We conduct comprehensive experimental analyses on +DTVLT, evaluating the impact of diverse text on tracking performance and hope +the identified performance bottlenecks of existing algorithms can support +further research in VLT and video understanding. The proposed benchmark, +experimental results and toolkit will be released gradually on +http://videocube.aitestunion.com/. + +
+
+ comment: Preprint, Under Review +
+
+
+
+
+ + ☆ Response Tuning: Aligning Large Language Models without Instruction + + +
+ Instruction tuning, i.e., supervised fine-tuning using instruction-response
+pairs, is a foundational step in transitioning pre-trained Large Language
+Models (LLMs) into helpful and safe chat assistants. Our hypothesis is that
+establishing an adequate output space can enable such a transition given the
+capabilities inherent in pre-trained LLMs. To verify this, we propose Response
+Tuning (RT), which eliminates the instruction-conditioning step in instruction
+tuning and solely focuses on response space supervision. Our experiments
+demonstrate that RT models, trained only using responses, can effectively
+respond to a wide range of instructions and exhibit helpfulness comparable to
+that of their instruction-tuned counterparts. Furthermore, we observe that
+controlling the training response distribution can significantly improve their
+user preference or elicit target behaviors such as refusing assistance for
+unsafe queries. Our findings illuminate the role of establishing an adequate
+output space in alignment, highlighting the potential of the extensive inherent
+capabilities of pre-trained LLMs.
+
+
+
+ comment: 34 pages +
+
+
+
+
+ + ☆ Embedded Topic Models Enhanced by Wikification EMNLP 2024 + + +
+ Topic modeling analyzes a collection of documents to learn meaningful
+patterns of words. However, previous topic models consider only the spelling of
+words and do not account for the homography of words. In this study, we
+incorporate Wikipedia knowledge into a neural topic model to make it aware of
+named entities. We evaluate our method on two datasets: 1) news articles from
+the New York Times and 2) the AIDA-CoNLL dataset. Our experiments show that our
+method improves the generalizability of neural topic models. Moreover, we
+analyze frequent terms in each topic and the temporal dependencies between
+topics to demonstrate that our entity-aware topic models can capture the
+time-series development of topics well.
+
+
+
+ comment: Accepted at EMNLP 2024 Workshop NLP for Wikipedia +
+
+
+
+
+ + ☆ Better Call SAUL: Fluent and Consistent Language Model Editing with + Generation Regularization + + +
+ To ensure large language models contain up-to-date knowledge, they need to be +updated regularly. However, model editing is challenging as it might also +affect knowledge that is unrelated to the new data. State-of-the-art methods +identify parameters associated with specific knowledge and then modify them via +direct weight updates. However, these locate-and-edit methods suffer from heavy +computational overhead and lack theoretical validation. In contrast, directly +fine-tuning the model on requested edits affects the model's behavior on +unrelated knowledge, and significantly damages the model's generation fluency +and consistency. To address these challenges, we propose SAUL, a streamlined +model editing method that uses sentence concatenation with augmented random +facts for generation regularization. Evaluations on three model editing +benchmarks show that SAUL is a practical and reliable solution for model +editing outperforming state-of-the-art methods while maintaining generation +quality and reducing computational overhead. + +
+
+
+
+
+ + ☆ IoT-LLM: Enhancing Real-World IoT Task Reasoning with Large Language + Models ICLR 2025 + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities across
+textual and visual domains but often generate outputs that violate physical
+laws, revealing a gap in their understanding of the physical world. Inspired by
+human cognition, where perception is fundamental to reasoning, we explore
+augmenting LLMs with enhanced perception abilities using Internet of Things
+(IoT) sensor data and pertinent knowledge for IoT task reasoning in the
+physical world. In this work, we systematically study LLMs' capability to
+address real-world IoT tasks by augmenting their perception and knowledge base,
+and then propose a unified framework, IoT-LLM, to enhance such capability. In
+IoT-LLM, we customize three steps for LLMs: preprocessing IoT data into formats
+amenable to LLMs, activating their commonsense knowledge through
+chain-of-thought prompting and specialized role definitions, and expanding
+their understanding via IoT-oriented retrieval-augmented generation based on
+in-context learning. To evaluate performance, we design a new benchmark with
+five real-world IoT tasks covering different data types and reasoning
+difficulties, and provide benchmarking results for six open-source and
+closed-source LLMs. Experimental results demonstrate the limitations of
+existing LLMs given naive textual inputs: they cannot perform these tasks
+effectively. We show that IoT-LLM significantly enhances the IoT task reasoning
+performance of LLMs such as GPT-4, achieving an average improvement of 65%
+across various tasks against previous methods. The results also showcase the
+ability of LLMs to comprehend IoT data and the physical laws behind the data by
+providing a reasoning process. We also discuss the limitations of our work to
+inspire future research in this new area.
+
+
+
+ comment: 21 pages, 10 figures, submitted to ICLR 2025 Conference +
+
+
+
+
+ + ☆ Collective Critics for Creative Story Generation EMNLP 2024 + + +
+ Generating a long story of several thousand words with narrative coherence
+using Large Language Models (LLMs) has been a challenging task. Previous
+research has addressed this challenge by proposing different frameworks that
+create a story plan and generate a long story based on that plan. However,
+these frameworks have mainly focused on maintaining narrative coherence in
+stories, often overlooking creativity in story planning and the expressiveness
+of the stories generated from those plans, which are desirable properties to
+captivate readers' interest. In this paper, we propose the Collective Critics
+for Creative Story Generation framework (CritiCS), which is composed of a plan
+refining stage (CrPlan) and a story generation stage (CrText), and integrates a
+collective revision mechanism that promotes those properties into the long-form
+story generation process. Specifically, in each stage, a group of LLM critics
+and one leader collaborate to incrementally refine drafts of the plan and the
+story over multiple rounds. Extensive human evaluation shows that CritiCS can
+significantly enhance story creativity and reader engagement, while also
+maintaining narrative coherence. Furthermore, the design of the framework
+allows active participation from human writers in any role within the critique
+process, enabling interactive human-machine collaboration in story writing.
+
+
+
+ comment: EMNLP 2024 (36 pages) +
+
+
+
+
+ + ☆ Learning the Latent Rules of a Game from Data: A Chess Story + + +
+ We demonstrate that small pretrained foundational generative language models +with millions of parameters can learn the latent rules of a process from data +associated with the process. Inspired by Stefan Zweig's novella +"Schachnovelle," also known as "The Royal Game" in English, we show that 28M +and 125M parameter pretrained foundational small language models (SLMs) can be +instruction fine-tuned with 1,000-to-1,000,000 examples to learn the rules of +chess, propose legal moves, and accurately solve chess problems. We also +explore the impact of successive language model fine-tuning epochs on improved +outcomes and demonstrate reductions in model hallucinations by increasing the +number of instruction fine-tuning examples. + +
+
+
+
+
+ + ☆ LLM-Pilot: Characterize and Optimize Performance of your LLM Inference + Services SC '24 + + +
+ As Large Language Models (LLMs) are rapidly growing in popularity, LLM +inference services must be able to serve requests from thousands of users while +satisfying performance requirements. The performance of an LLM inference +service is largely determined by the hardware onto which it is deployed, but +understanding of which hardware will deliver on performance requirements +remains challenging. In this work we present LLM-Pilot - a first-of-its-kind +system for characterizing and predicting performance of LLM inference services. +LLM-Pilot performs benchmarking of LLM inference services, under a realistic +workload, across a variety of GPUs, and optimizes the service configuration for +each considered GPU to maximize performance. Finally, using this +characterization data, LLM-Pilot learns a predictive model, which can be used +to recommend the most cost-effective hardware for a previously unseen LLM. +Compared to existing methods, LLM-Pilot can deliver on performance requirements +33% more frequently, whilst reducing costs by 60% on average. + +
+
+ comment: Accepted to the International Conference for High Performance + Computing, Networking, Storage and Analysis (SC '24) +
+
+
+
+
+ + ☆ MenakBERT -- Hebrew Diacriticizer SC + + +
+ Diacritical marks in the Hebrew language give words their vocalized form. The
+task of adding diacritical marks to plain Hebrew text is still dominated by a
+system that relies heavily on human-curated resources. Recent models trained on
+diacritized Hebrew texts still show a performance gap. We use a recently
+developed character-based PLM to narrow this gap, presenting MenakBERT, a
+character-level transformer pretrained on Hebrew text and fine-tuned to produce
+diacritical marks for Hebrew sentences. We further show how fine-tuning a model
+for diacritization transfers to a task such as part-of-speech tagging.
+
+
+
+ comment: Published at ISCOL2022 as a poster +
+
+
+
+
+ + ☆ Parameter Competition Balancing for Model Merging NeurIPS2024 + + +
+ While fine-tuning pretrained models has become common practice, these models +often underperform outside their specific domains. Recently developed model +merging techniques enable the direct integration of multiple models, each +fine-tuned for distinct tasks, into a single model. This strategy promotes +multitasking capabilities without requiring retraining on the original +datasets. However, existing methods fall short in addressing potential +conflicts and complex correlations between tasks, especially in parameter-level +adjustments, posing a challenge in effectively balancing parameter competition +across various tasks. This paper introduces an innovative technique named +PCB-Merging (Parameter Competition Balancing), a lightweight and training-free +technique that adjusts the coefficients of each parameter for effective model +merging. PCB-Merging employs intra-balancing to gauge parameter significance +within individual tasks and inter-balancing to assess parameter similarities +across different tasks. Parameters with low importance scores are dropped, and +the remaining ones are rescaled to form the final merged model. We assessed our +approach in diverse merging scenarios, including cross-task, cross-domain, and +cross-training configurations, as well as out-of-domain generalization. The +experimental results reveal that our approach achieves substantial performance +enhancements across multiple modalities, domains, model sizes, number of tasks, +fine-tuning forms, and large language models, outperforming existing model +merging methods. The code is publicly available at: +\url{https://github.com/duguodong7/pcb-merging}. + +
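+ To make the score-drop-rescale idea above concrete, here is a much-simplified
+task-vector merge in the same spirit; it scores parameters only by the
+magnitude of their per-task change and is not PCB-Merging itself, whose intra-
+and inter-balancing terms are more elaborate.
+
+```python
+# Simplified "score, drop, rescale" model merge (illustration, not PCB-Merging).
+import numpy as np
+
+def merge_task_vectors(base, finetuned_list, keep_ratio=0.2):
+    """Score each parameter by its per-task change magnitude, drop low-scoring
+    entries, and average the surviving deltas back onto the base model."""
+    merged_delta = np.zeros_like(base)
+    counts = np.zeros_like(base)
+    for ft in finetuned_list:
+        delta = ft - base                          # task vector for this model
+        k = max(1, int(keep_ratio * delta.size))
+        thresh = np.sort(np.abs(delta))[-k]        # keep the k largest changes
+        mask = np.abs(delta) >= thresh
+        merged_delta += delta * mask
+        counts += mask
+    counts[counts == 0] = 1                        # avoid division by zero
+    return base + merged_delta / counts
+
+base = np.zeros(10)
+ft_a = base + np.array([1, 0, 0, 0, 2, 0, 0, 0, 0, 0], dtype=float)
+ft_b = base + np.array([0, 0, 3, 0, -2, 0, 0, 0, 0, 0], dtype=float)
+print(merge_task_vectors(base, [ft_a, ft_b], keep_ratio=0.2))
+```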
+
+ comment: Accepted by NeurIPS2024 +
+
+
+
+
+ + ☆ MetaMetrics: Calibrating Metrics For Generation Tasks Using Human + Preferences + + +
+ Understanding the quality of a performance evaluation metric is crucial for +ensuring that model outputs align with human preferences. However, it remains +unclear how well each metric captures the diverse aspects of these preferences, +as metrics often excel in one particular area but not across all dimensions. To +address this, it is essential to systematically calibrate metrics to specific +aspects of human preference, catering to the unique characteristics of each +aspect. We introduce MetaMetrics, a calibrated meta-metric designed to evaluate +generation tasks across different modalities in a supervised manner. +MetaMetrics optimizes the combination of existing metrics to enhance their +alignment with human preferences. Our metric demonstrates flexibility and +effectiveness in both language and vision downstream tasks, showing significant +benefits across various multilingual and multi-domain scenarios. MetaMetrics +aligns closely with human preferences and is highly extendable and easily +integrable into any application. This makes MetaMetrics a powerful tool for +improving the evaluation of generation tasks, ensuring that metrics are more +representative of human judgment across diverse contexts. + +
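+ A toy version of the calibration step described above: learn weights that
+combine several existing metric scores so the combination tracks human
+judgments. A plain least-squares fit is used here as a stand-in for the paper's
+supervised calibration; the numbers are invented for illustration.
+
+```python
+# Least-squares stand-in for calibrating a combination of metrics to human scores.
+import numpy as np
+
+def calibrate_metric_weights(metric_scores, human_scores):
+    """Fit a weighted combination of existing metrics against human judgments."""
+    X = np.asarray(metric_scores, dtype=float)   # [n_examples, n_metrics]
+    y = np.asarray(human_scores, dtype=float)    # [n_examples]
+    w, *_ = np.linalg.lstsq(X, y, rcond=None)
+    return w
+
+X = np.array([[0.8, 0.3], [0.6, 0.9], [0.2, 0.4], [0.9, 0.8]])  # two metrics, four outputs
+y = np.array([0.7, 0.8, 0.3, 0.9])                              # human preference scores
+w = calibrate_metric_weights(X, y)
+print(w, X @ w)   # learned weights and the calibrated meta-metric scores
+```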
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Towards Comprehensive Detection of Chinese Harmful Memes + + +
+ This paper has been accepted in the NeurIPS 2024 D & B Track. Harmful memes +have proliferated on the Chinese Internet, while research on detecting Chinese +harmful memes significantly lags behind due to the absence of reliable datasets +and effective detectors. To this end, we focus on the comprehensive detection +of Chinese harmful memes. We construct ToxiCN MM, the first Chinese harmful +meme dataset, which consists of 12,000 samples with fine-grained annotations +for various meme types. Additionally, we propose a baseline detector, +Multimodal Knowledge Enhancement (MKE), incorporating contextual information of +meme content generated by the LLM to enhance the understanding of Chinese +memes. During the evaluation phase, we conduct extensive quantitative +experiments and qualitative analyses on multiple baselines, including LLMs and +our MKE. The experimental results indicate that detecting Chinese harmful memes +is challenging for existing models while demonstrating the effectiveness of +MKE. The resources for this paper are available at +https://github.com/DUT-lujunyu/ToxiCN_MM. + +
+
+
+
+
+ + ☆ From Concrete to Abstract: A Multimodal Generative Approach to Abstract + Concept Learning + + +
+ Understanding and manipulating concrete and abstract concepts is fundamental
+to human intelligence. Yet, such concepts remain challenging for artificial
+agents. This paper introduces a multimodal generative approach to high-order
+abstract concept learning, which integrates visual and categorical linguistic
+information from concrete concepts. Our model initially grounds
+subordinate-level concrete concepts, combines them to form basic-level
+concepts, and finally abstracts to superordinate-level concepts via the
+grounding of basic-level concepts. We evaluate the model's language learning
+ability through language-to-visual and visual-to-language tests with high-order
+abstract concepts. Experimental results demonstrate the proficiency of the
+model in both language understanding and language naming tasks.
+
+
+
+
+
+
+ + ☆ AlphaEdit: Null-Space Constrained Knowledge Editing for Language Models + + +
+ Large language models (LLMs) often exhibit hallucinations due to incorrect or
+outdated knowledge. Hence, model editing methods have emerged to enable
+targeted knowledge updates. To achieve this, a prevailing paradigm is the
+locating-then-editing approach, which first locates influential parameters and
+then edits them by introducing a perturbation. While effective, current studies
+have demonstrated that this perturbation inevitably disrupts the originally
+preserved knowledge within LLMs, especially in sequential editing scenarios. To
+address this, we introduce AlphaEdit, a novel solution that projects the
+perturbation onto the null space of the preserved knowledge before applying it
+to the parameters. We theoretically prove that this projection ensures the
+output of post-edited LLMs remains unchanged when queried about the preserved
+knowledge, thereby mitigating the issue of disruption. Extensive experiments on
+various LLMs, including LLaMA3, GPT2-XL, and GPT-J, show that AlphaEdit boosts
+the performance of most locating-then-editing methods by an average of 36.4%
+with only a single additional line of code for the projection. Our code is
+available at: https://github.com/jianghoucheng/AlphaEdit.
+
+
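+ The projection itself is ordinary linear algebra. The sketch below, offered
+only as an illustration of the stated idea, projects a candidate weight update
+onto the null space of a matrix of preserved-knowledge keys, so the preserved
+keys see no change; shapes and data are made up for the example.
+
+```python
+# Null-space projection of a weight update (illustrative, not the authors' code).
+import numpy as np
+
+def project_to_null_space(delta, K, tol=1e-8):
+    """Project update `delta` so that (W + delta_proj) @ k equals W @ k for every
+    preserved key k stored as a row of K."""
+    _, s, vt = np.linalg.svd(K, full_matrices=True)
+    rank = int(np.sum(s > tol))
+    null_basis = vt[rank:].T              # columns span {d : K @ d = 0}
+    P = null_basis @ null_basis.T         # orthogonal projector onto that null space
+    return delta @ P                      # right-project the update
+
+K = np.array([[1.0, 0.0, 0.0],
+              [0.0, 1.0, 0.0]])           # two preserved keys in R^3
+delta = np.random.randn(4, 3)             # candidate weight update (4 output dims)
+delta_proj = project_to_null_space(delta, K)
+print(np.allclose(K @ delta_proj.T, 0))   # preserved keys are untouched -> True
+```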
+
+
+
+
+ + ☆ Listening to the Wise Few: Select-and-Copy Attention Heads for + Multiple-Choice QA + + +
+ A standard way to evaluate the abilities of LLMs involves presenting a
+multiple-choice question and selecting the option with the highest logit as the
+model's predicted answer. However, such a format for evaluating LLMs has
+limitations, since even if the model knows the correct answer, it may struggle
+to select the corresponding letter simply due to difficulties in following this
+rigid format. To address this, we introduce new scores that better capture and
+reveal a model's underlying knowledge: the Query-Key Score (QK-score), derived
+from the interaction between query and key representations in attention heads,
+and the Attention Score, based on attention weights. These scores are extracted
+from specific select-and-copy heads, which show consistent performance across
+popular Multi-Choice Question Answering (MCQA) datasets. Based on these scores,
+our method improves knowledge extraction, yielding up to a 16% gain for
+LLaMA2-7B and up to 10% for larger models on popular MCQA benchmarks. At the
+same time, the accuracy on a simple synthetic dataset, where the model
+explicitly knows the right answer, increases by almost 60%, achieving nearly
+perfect accuracy, therefore demonstrating the method's efficiency in mitigating
+MCQA format limitations. To support our claims, we conduct experiments on
+models ranging from 7 billion to 70 billion parameters in both zero- and
+few-shot setups.
+
+
+
+
+
+
+ + ☆ How Much Can RAG Help the Reasoning of LLM? + + +
+ Retrieval-Augmented Generation (RAG) has gained significant popularity in
+modern Large Language Models (LLMs) due to its effectiveness in introducing new
+knowledge and reducing hallucinations. However, a deep understanding of RAG
+remains limited: how does RAG help the reasoning process, and can RAG improve
+reasoning capability at all? While external documents are typically considered
+a means of incorporating domain-specific information, they also contain
+intermediate reasoning results related to the query; this suggests that
+documents could enhance the reasoning capability of LLMs, which has not been
+previously explored. In this paper, we investigate this issue in depth and find
+that while RAG can assist with reasoning, the help is limited. If we
+conceptualize the reasoning process as a tree with fixed depth, then RAG
+struggles to assist LLMs in performing deeper reasoning. Additionally, the
+information in the documents requires preprocessing to filter out noise. We
+demonstrate that this preprocessing is difficult to achieve by simply
+fine-tuning the LLM; it often necessitates numerous additional transformer
+layers to solve the problem. To simplify the problem, we propose DPrompt
+tuning, which effectively resolves the issue within just a limited number of
+transformer layers, leading to improved performance.
+
+
+
+
+
+
+ + ☆ Llama SLayer 8B: Shallow Layers Hold the Key to Knowledge Injection + + +
+ As a way to augment pre-trained large language models (LLMs), knowledge
+injection is critical for developing vertical-domain large models and has been
+widely studied. Although most current approaches, including parameter-efficient
+fine-tuning (PEFT) and block expansion methods, uniformly apply knowledge
+across all LLM layers, this raises the question: are all layers equally crucial
+for knowledge injection? We begin by evaluating the importance of each layer to
+find the optimal layer range for knowledge injection. Intuitively, more
+important layers should play a more critical role in knowledge injection and
+deserve denser injection. We observe performance dips in question-answering
+benchmarks after the removal or expansion of the shallow layers, and the
+degradation shrinks as the layer gets deeper, indicating that the shallow
+layers hold the key to knowledge injection. This insight leads us to propose
+the S strategy, a post-pretraining strategy that selectively enhances shallow
+layers while pruning the less effective deep ones. Based on this strategy, we
+introduce Llama Slayer-8B and Llama Slayer-8B-Instruct. We experimented on a
+corpus of code & math and demonstrated the effectiveness of our strategy.
+Further experiments on a different LLM, Mistral-7B, and on a legal corpus
+confirmed the general applicability of the approach, underscoring its
+wide-ranging efficacy. Our code is available at:
+https://github.com/txchen-USTC/Llama-Slayer
+
+
+
+
+
+
+ + ☆ Post-edits Are Preferences Too + + +
+ Preference Optimization (PO) is currently one of the state-of-the-art
+techniques for fine-tuning large language models (LLMs) on pairwise preference
+feedback from human annotators. However, in machine translation, this sort of
+feedback can be difficult to solicit. Additionally, Kreutzer et al. (2018) have
+shown that, for machine translation, pairwise preferences are less reliable
+than other forms of human feedback, such as 5-point ratings.
+ We examine post-edits to see if they can be a source of reliable human
+preferences by construction. In PO, a human annotator is shown sequences $s_1$
+and $s_2$ and asked for a preference judgment $s_1 > s_2$, while for
+post-editing, editors create $s_1$ and know that it should be better than
+$s_2$. We attempt to use these implicit preferences for PO and show that doing
+so helps the model move towards post-edit-like hypotheses and away from machine
+translation-like hypotheses. Furthermore, we show that the best results are
+obtained by pre-training the model with supervised fine-tuning (SFT) on
+post-edits in order to promote post-edit-like hypotheses to the top output
+ranks.
+
+
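+ The "preferences by construction" idea above maps directly onto the pair
+format used by preference-optimization trainers. A small sketch, with invented
+field names, of how one might turn (source, machine translation, post-edit)
+triples into such pairs:
+
+```python
+# Build preference pairs from post-edits (field names are illustrative).
+def build_preference_pairs(examples):
+    """The post-edit is preferred over the raw MT output by construction."""
+    pairs = []
+    for ex in examples:
+        pairs.append({
+            "prompt": ex["source"],
+            "chosen": ex["post_edit"],     # s1: the editor's corrected translation
+            "rejected": ex["mt_output"],   # s2: the original machine translation
+        })
+    return pairs
+
+data = [{"source": "Das Haus ist rot.",
+         "mt_output": "The house is red color.",
+         "post_edit": "The house is red."}]
+print(build_preference_pairs(data))
+```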
+
+ comment: To appear at the Ninth Conference on Machine Translation (WMT24) +
+
+
+
+
+ + ☆ Traffic Light or Light Traffic? Investigating Phrasal Semantics in Large + Language Models EMNLP 2024 + + +
+ Phrases are fundamental linguistic units through which humans convey +semantics. This study critically examines the capacity of API-based large +language models (LLMs) to comprehend phrase semantics, utilizing three +human-annotated datasets. We assess the performance of LLMs in executing phrase +semantic reasoning tasks guided by natural language instructions and explore +the impact of common prompting techniques, including few-shot demonstrations +and Chain-of-Thought reasoning. Our findings reveal that LLMs greatly +outperform traditional embedding methods across the datasets; however, they do +not show a significant advantage over fine-tuned methods. The effectiveness of +advanced prompting strategies shows variability. We conduct detailed error +analyses to interpret the limitations faced by LLMs in comprehending phrase +semantics. Code and data can be found at +https://github.com/memray/llm_phrase_semantics. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ☆ Jailbreak Antidote: Runtime Safety-Utility Balance via Sparse + Representation Adjustment in Large Language Models + + +
+ As large language models (LLMs) become integral to various applications, +ensuring both their safety and utility is paramount. Jailbreak attacks, which +manipulate LLMs into generating harmful content, pose significant challenges to +this balance. Existing defenses, such as prompt engineering and safety +fine-tuning, often introduce computational overhead, increase inference +latency, and lack runtime flexibility. Moreover, overly restrictive safety +measures can degrade model utility by causing refusals of benign queries. In +this paper, we introduce Jailbreak Antidote, a method that enables real-time +adjustment of LLM safety preferences by manipulating a sparse subset of the +model's internal states during inference. By shifting the model's hidden +representations along a safety direction with varying strengths, we achieve +flexible control over the safety-utility balance without additional token +overhead or inference delays. Our analysis reveals that safety-related +information in LLMs is sparsely distributed; adjusting approximately 5% of the +internal state is as effective as modifying the entire state. Extensive +experiments on nine LLMs (ranging from 2 billion to 72 billion parameters), +evaluated against ten jailbreak attack methods and compared with six defense +strategies, validate the effectiveness and efficiency of our approach. By +directly manipulating internal states during reasoning, Jailbreak Antidote +offers a lightweight, scalable solution that enhances LLM safety while +preserving utility, opening new possibilities for real-time safety mechanisms +in widely-deployed AI systems. + +
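+ The core operation described above (shifting hidden states along a safety
+direction while touching only a sparse subset of dimensions) can be pictured
+with a few lines of array code. The sketch below is illustrative only: the
+direction, layer choice, and scaling are assumptions, not the paper's recipe.
+
+```python
+# Sparse safety-direction shift of a hidden state (illustrative sketch).
+import numpy as np
+
+def apply_safety_shift(hidden, safety_direction, alpha=1.0, sparsity=0.05):
+    """Shift `hidden` along `safety_direction`, modifying only the fraction of
+    dimensions where the direction has the largest magnitude."""
+    d = hidden.shape[-1]
+    k = max(1, int(sparsity * d))
+    top = np.argsort(np.abs(safety_direction))[-k:]   # most safety-relevant dims
+    mask = np.zeros(d)
+    mask[top] = 1.0
+    return hidden + alpha * safety_direction * mask
+
+rng = np.random.default_rng(0)
+hidden = rng.normal(size=4096)                  # one token's hidden state
+safety_direction = rng.normal(size=4096)        # e.g. mean(safe) - mean(unsafe) activations
+adjusted = apply_safety_shift(hidden, safety_direction, alpha=0.8)
+print(np.count_nonzero(adjusted - hidden))      # roughly 5% of dimensions changed
+```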
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Make Compound Sentences Simple to Analyze: Learning to Split Sentences + for Aspect-based Sentiment Analysis EMNLP 2024 + + +
+ In the domain of Aspect-Based Sentiment Analysis (ABSA), generative methods
+have shown promising results and achieved substantial advancements. However,
+despite these advancements, the task of extracting sentiment quadruplets, which
+capture the nuanced sentiment expressions within a sentence, remains a
+significant challenge. In particular, compound sentences can potentially
+contain multiple quadruplets, making the extraction task increasingly difficult
+as sentence complexity grows. To address this issue, we focus on simplifying
+sentence structures to facilitate the recognition of these elements and on
+crafting a model that integrates seamlessly with various ABSA tasks. In this
+paper, we propose the Aspect Term Oriented Sentence Splitter (ATOSS), which
+simplifies compound sentences into simpler and clearer forms, thereby
+clarifying their structure and intent. As a plug-and-play module, this approach
+retains the parameters of the ABSA model while making it easier to identify
+essential intent within input sentences. Extensive experimental results show
+that utilizing ATOSS outperforms existing methods in both ASQP and ACOS tasks,
+which are the primary tasks for extracting sentiment quadruplets.
+
+
+
+ comment: Accepted at EMNLP 2024 (Findings, long paper) +
+
+
+
+
+ + ☆ Language Models are Graph Learners + + +
+ Language Models (LMs) are increasingly challenging the dominance of +domain-specific models, including Graph Neural Networks (GNNs) and Graph +Transformers (GTs), in graph learning tasks. Following this trend, we propose a +novel approach that empowers off-the-shelf LMs to achieve performance +comparable to state-of-the-art GNNs on node classification tasks, without +requiring any architectural modification. By preserving the LM's original +architecture, our approach retains a key benefit of LM instruction tuning: the +ability to jointly train on diverse datasets, fostering greater flexibility and +efficiency. To achieve this, we introduce two key augmentation strategies: (1) +Enriching LMs' input using topological and semantic retrieval methods, which +provide richer contextual information, and (2) guiding the LMs' classification +process through a lightweight GNN classifier that effectively prunes class +candidates. Our experiments on real-world datasets show that backbone Flan-T5 +models equipped with these augmentation strategies outperform state-of-the-art +text-output node classifiers and are comparable to top-performing vector-output +node classifiers. By bridging the gap between specialized task-specific node +classifiers and general LMs, this work paves the way for more versatile and +widely applicable graph learning models. We will open-source the code upon +publication. + +
+
+
+
+
+ + ☆ Efficient Second-Order Neural Network Optimization via Adaptive Trust + Region Methods + + +
+ Second-order optimization methods offer notable advantages in training deep +neural networks by utilizing curvature information to achieve faster +convergence. However, traditional second-order techniques are computationally +prohibitive, primarily due to the large matrix inversions and high memory +demands they require. While adaptive trust-region methods have been developed +to mitigate these issues, their performance is often hindered by conservative +estimates of key parameters, such as the Lipschitz constant of the Hessian, +resulting in suboptimal outcomes. In this paper, we introduce +SecondOrderAdaptiveAdam (SOAA), a novel optimization algorithm designed to +overcome these limitations. SOAA approximates the Fisher information matrix +using a diagonal representation, reducing computational complexity from +\(O(n^{2})\) to \(O(n)\), thereby making it suitable for large-scale deep +learning models, including large language models (LLMs). Additionally, the +algorithm integrates an adaptive trust-region mechanism that dynamically +adjusts the trust region size based on observed loss reduction, ensuring both +robust convergence and computational efficiency. We empirically demonstrate +that SOAA achieves faster and more stable convergence compared to first-order +optimizers, such as Adam, under similar computational constraints. However, the +diagonal approximation of the Fisher information matrix may be less effective +in capturing higher-order interactions between gradients, suggesting potential +areas for further refinement and future research. + +
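+ To ground the description above, here is a schematic single step that
+preconditions the gradient with a diagonal Fisher approximation and then grows
+or shrinks a trust region based on the observed loss reduction. It is a generic
+adaptive trust-region sketch under assumed update rules, not the authors'
+algorithm.
+
+```python
+# Diagonal-preconditioned step with an adaptive trust region (generic sketch).
+import numpy as np
+
+def adaptive_trust_region_step(params, grad, fisher_diag, trust_radius,
+                               loss_before, loss_fn):
+    step = grad / (fisher_diag + 1e-8)          # diagonal natural-gradient direction
+    norm = np.linalg.norm(step)
+    if norm > trust_radius:                     # keep the step inside the trust region
+        step *= trust_radius / norm
+    new_params = params - step
+    actual = loss_before - loss_fn(new_params)  # observed loss reduction
+    predicted = np.dot(grad, step)              # first-order predicted reduction
+    ratio = actual / (predicted + 1e-12)
+    trust_radius *= 2.0 if ratio > 0.75 else (0.5 if ratio < 0.25 else 1.0)
+    return new_params, trust_radius
+
+loss = lambda w: 0.5 * np.sum(w ** 2)           # toy quadratic loss
+w = np.ones(5)
+w, r = adaptive_trust_region_step(w, w.copy(), np.ones(5), 0.5, loss(w), loss)
+print(w, r)                                     # parameters move toward zero, radius adapts
+```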
+
+
+
+
+ + ☆ Correlation and Navigation in the Vocabulary Key Representation Space of + Language Models + + +
+ Language model (LM) decoding is based on the next-token prediction (NTP)
+probability distribution. For neural LMs (e.g., Transformer-based), the NTP
+distribution is essentially a softmax-regularized dot product between an
+encoded input context (query) and fixed vocabulary representations (keys). In
+this paper, we study the effect of the key distribution on the NTP
+distribution, with a focus on whether the similarity between keys will trigger
+spurious correlations in NTP. Through knowledge-probing tasks, we show that in
+the NTP distribution, the few top-ranked tokens are typically accurate.
+However, the middle-ranked predictions are highly biased towards tokens that
+are distributionally (not necessarily semantically) similar to these top ones.
+For instance, if "P" is predicted as the top-1 token, "A"-"Z" will all be
+ranked high in NTP, no matter whether they can lead to correct decoding
+results. This hurts sampling diversity and makes the sampling of correct,
+long-tail results hopeless and noisy. We attempt to alleviate this issue via a
+novel in-context navigation (ICN) method that iteratively pushes the query
+representation away from explored regions. Specifically, we include the
+explored decoding results in the context and prompt the LM to generate
+something else, which encourages the LM to produce a query representation that
+has small dot products with explored keys. Experiments on knowledge-probing
+tasks show that our method leads to efficient navigation away from explored
+keys to correct new keys. We further extend our method to open-ended and
+chain-of-thought (for reasoning) generation. Experimental results show that ICN
+contributes to better generation diversity and improved self-consistency voting
+performance. Finally, we discuss potential training issues caused by the fixed
+key space together with the challenges and possible ways to address them in
+future research.
+
+
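+ The navigation loop described above can be implemented as plain prompting, as
+in the sketch below. The prompt wording and the stand-in `generate` callable
+are assumptions for illustration; the paper's exact prompts may differ.
+
+```python
+# Plain-prompting sketch of iterative in-context navigation.
+def in_context_navigation(prompt, generate, rounds=3):
+    """Repeatedly ask for an answer not yet produced, by appending explored
+    answers to the context (`generate` is any text-completion callable)."""
+    explored = []
+    for _ in range(rounds):
+        context = prompt
+        if explored:
+            context += ("\nAlready given answers: " + ", ".join(explored) +
+                        "\nGive a different correct answer.")
+        explored.append(generate(context).strip())
+    return explored
+
+# Toy stand-in for an LLM call, just to make the sketch runnable.
+canned = iter(["Paris", "Lyon", "Marseille"])
+print(in_context_navigation("Name a city in France.", lambda _ctx: next(canned)))
+```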
+
+
+
+
+ + ☆ Morphological evaluation of subwords vocabulary used by BETO language + model + + +
+ Subword tokenization algorithms used by Large Language Models are +significantly more efficient and can independently build the necessary +vocabulary of words and subwords without human intervention. However, those +subwords do not always align with real morphemes, potentially impacting the +models' performance, though it remains uncertain when this might occur. In +previous research, we proposed a method to assess the morphological quality of +vocabularies, focusing on the overlap between these vocabularies and the +morphemes of a given language. Our evaluation method was built on three quality +measures, relevance, cohesion, and morphological accuracy, and a procedure for +their assessment. By applying this method to vocabularies created by three +subword tokenization algorithms, BPE, Wordpiece, and Unigram, we concluded that +these vocabularies generally exhibit very low morphological quality. In this +article, we apply this evaluation to the tokenizer of BETO, a BERT language +model trained on large Spanish corpora. This evaluation, along with our +previous results, helped us conclude that its vocabulary has a low +morphological quality, and we also found that training the tokenizer in a +larger corpus does not improve the morphological quality of the generated +vocabulary. Additionally, this evaluation helps clarify the algorithm used by +the tokenizer, that is, Wordpiece, given the inconsistencies between the +authors' claims and the model's configuration. + +
+
+ comment: in Spanish language +
+
+
+
+
+ + ☆ Annotation Guidelines for Corpus Novelties: Part 1 -- Named Entity + Recognition + + +
+ The Novelties corpus is a collection of novels (and parts of novels) +annotated for Named Entity Recognition (NER) among other tasks. This document +describes the guidelines applied during its annotation. It contains the +instructions used by the annotators, as well as a number of examples retrieved +from the annotated novels, and illustrating expressions that should be marked +as entities as well as expressions that should not. + +
+
+
+
+
+ + ♻ ☆ Which questions should I answer? Salience Prediction of Inquisitive + Questions EMNLP 2024 + + +
+ Inquisitive questions -- open-ended, curiosity-driven questions people ask as +they read -- are an integral part of discourse processing (Kehler and Rohde, +2017; Onea, 2016) and comprehension (Prince, 2004). Recent work in NLP has +taken advantage of question generation capabilities of LLMs to enhance a wide +range of applications. But the space of inquisitive questions is vast: many +questions can be evoked from a given context. So which of those should be +prioritized to find answers? Linguistic theories, unfortunately, have not yet +provided an answer to this question. This paper presents QSALIENCE, a salience +predictor of inquisitive questions. QSALIENCE is instruction-tuned over our +dataset of linguist-annotated salience scores of 1,766 (context, question) +pairs. A question scores high on salience if answering it would greatly enhance +the understanding of the text (Van Rooy, 2003). We show that highly salient +questions are empirically more likely to be answered in the same article, +bridging potential questions (Onea, 2016) with Questions Under Discussion +(Roberts, 2012). We further validate our findings by showing that answering +salient questions is an indicator of summarization quality in news. + +
+
+ comment: Camera Ready for EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ LML-DAP: Language Model Learning a Dataset for Data-Augmented Prediction + + +
+ Classification tasks are typically handled using Machine Learning (ML) +models, which lack a balance between accuracy and interpretability. This paper +introduces a new approach to using Large Language Models (LLMs) for +classification tasks in an explainable way. Unlike ML models that rely heavily +on data cleaning and feature engineering, this method streamlines the process +using LLMs. This paper proposes a new concept called "Language Model Learning +(LML)" powered by a new method called "Data-Augmented Prediction (DAP)". The +classification is performed by LLMs using a method similar to humans manually +exploring and understanding the data and deciding classifications using data as +a reference. In the LML process, a dataset is summarized and evaluated to +determine the features that lead to the classification of each label the most. +In the process of DAP, the system uses the data summary and a row of the +testing dataset to automatically generate a query, which is used to retrieve +relevant rows from the dataset. A classification is generated by the LLM using +data summary and relevant rows, ensuring satisfactory accuracy even with +complex data using context-aware decision-making. LML and DAP unlock the +possibilities of new applications. The proposed method uses the words "Act as +an Explainable Machine Learning Model" in the prompt to enhance the +interpretability of the predictions by allowing users to review the logic +behind each prediction. In some test cases, the system scored an accuracy above +90%, proving the effectiveness of the system and its potential to outperform +conventional ML models in various scenarios. The code is available at +https://github.com/Pro-GenAI/LML-DAP + +
+
+ comment: Updated title, abstract, and images +
+
+
+
+
+ + ♻ ☆ Tokenization Falling Short: The Curse of Tokenization EMNLP 2024 + + +
+ Language models typically tokenize raw text into sequences of subword +identifiers from a predefined vocabulary, a process inherently sensitive to +typographical errors, length variations, and largely oblivious to the internal +structure of tokens--issues we term the curse of tokenization. In this study, +we delve into these drawbacks and demonstrate that large language models (LLMs) +remain susceptible to these problems. This study systematically investigates +these challenges and their impact on LLMs through three critical research +questions: (1) complex problem solving, (2) token structure probing, and (3) +resilience to typographical variation. Our findings reveal that scaling model +parameters can mitigate the issue of tokenization; however, LLMs still suffer +from biases induced by typos and other text format variations. Our experiments +show that subword regularization such as BPE-dropout can mitigate this issue. +We release our evaluation code and data at https://github.com/FloatAI/TKEval. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ On Training Data Influence of GPT Models EMNLP 2024 + + +
+ Amidst the rapid advancements in generative language models, the +investigation of how training data shapes the performance of GPT models is +still emerging. This paper presents GPTfluence, a novel approach that leverages +a featurized simulation to assess the impact of training examples on the +training dynamics of GPT models. Our approach not only traces the influence of +individual training instances on performance trajectories, such as loss and +other key metrics, on targeted test points but also enables a comprehensive +comparison with existing methods across various training scenarios in GPT +models, ranging from 14 million to 2.8 billion parameters, across a range of +downstream tasks. Contrary to earlier methods that struggle with generalization +to new data, GPTfluence introduces a parameterized simulation of training +dynamics, demonstrating robust generalization capabilities to unseen training +data. This adaptability is evident across both fine-tuning and +instruction-tuning scenarios, spanning tasks in natural language understanding +and generation. We make our code and data publicly available at +https://github.com/ernie-research/gptfluence. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ PharmacyGPT: The AI Pharmacist + + +
+ In this study, we introduce PharmacyGPT, a novel framework to assess the +capabilities of large language models (LLMs) such as ChatGPT and GPT-4 in +emulating the role of clinical pharmacists. Our methodology encompasses the +utilization of LLMs to generate comprehensible patient clusters, formulate +medication plans, and forecast patient outcomes. We conduct our investigation +using real data acquired from the intensive care unit (ICU) at the University +of North Carolina Chapel Hill (UNC) Hospital. Our analysis offers valuable +insights into the potential applications and limitations of LLMs in the field +of clinical pharmacy, with implications for both patient care and the +development of future AI-driven healthcare solutions. By evaluating the +performance of PharmacyGPT, we aim to contribute to the ongoing discourse +surrounding the integration of artificial intelligence in healthcare settings, +ultimately promoting the responsible and efficacious use of such technologies. + +
+
+
+
+
+ + ♻ ☆ Autoregressive Pre-Training on Pixels and Texts EMNLP 2024 + + +
+ The integration of visual and textual information represents a promising +direction in the advancement of language models. In this paper, we explore the +dual modality of language--both visual and textual--within an autoregressive +framework, pre-trained on both document images and texts. Our method employs a +multimodal training strategy, utilizing visual data through next patch +prediction with a regression head and/or textual data through next token +prediction with a classification head. We focus on understanding the +interaction between these two modalities and their combined impact on model +performance. Our extensive evaluation across a wide range of benchmarks shows +that incorporating both visual and textual data significantly improves the +performance of pixel-based language models. Remarkably, we find that a +unidirectional pixel-based model trained solely on visual data can achieve +comparable results to state-of-the-art bidirectional models on several language +understanding tasks. This work uncovers the untapped potential of integrating +visual and textual modalities for more effective language modeling. We release +our code, data, and model checkpoints at +\url{https://github.com/ernie-research/pixelgpt}. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Turning English-centric LLMs Into Polyglots: How Much Multilinguality Is + Needed? EMNLP 2024 + + +
+ The vast majority of today's large language models (LLMs) are +English-centric, having been pretrained predominantly on English text. Yet, in +order to meet user expectations, models need to be able to respond +appropriately in multiple languages once deployed in downstream applications. +This requires strong cross-lingual transfer abilities. In this work, we +investigate the minimal amount of multilinguality required during finetuning to +elicit cross-lingual generalisation in English-centric LLMs. In experiments +across four LLMs, we find that multilingual instruction tuning with as few as +two to three languages is both necessary and sufficient to elicit effective +cross-lingual generalisation, with the limiting factor being the degree to +which a target language is seen during pretraining. Evaluations on five +different tasks further reveal that multilingual instruction tuning is most +beneficial for generative tasks that assume input/output language agreement, +such as in chat settings, while being of less importance for highly structured +classification-style tasks. Our code and data is available at +https://github.com/ZurichNLP/multilingual-instruction-tuning. + +
+
+ comment: Accepted at Findings of EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Lookback Lens: Detecting and Mitigating Contextual Hallucinations in + Large Language Models Using Only Attention Maps EMNLP 2024 + + +
+ When asked to summarize articles or answer questions given a passage, large +language models (LLMs) can hallucinate details and respond with unsubstantiated +answers that are inaccurate with respect to the input context. This paper +describes a simple approach for detecting such contextual hallucinations. We +hypothesize that contextual hallucinations are related to the extent to which +an LLM attends to information in the provided context versus its own +generations. Based on this intuition, we propose a simple hallucination +detection model whose input features are given by the ratio of attention +weights on the context versus newly generated tokens (for each attention head). +We find that a linear classifier based on these lookback ratio features is as +effective as a richer detector that utilizes the entire hidden states of an LLM +or a text-based entailment model. The lookback ratio-based detector -- Lookback +Lens -- is found to transfer across tasks and even models, allowing a detector +that is trained on a 7B model to be applied (without retraining) to a larger +13B model. We further apply this detector to mitigate contextual +hallucinations, and find that a simple classifier-guided decoding approach is +able to reduce the amount of hallucination, for example by 9.6% in the XSum +summarization task. + +
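+
+ A minimal sketch of the lookback-ratio feature described above, using synthetic data
+ (function and variable names are illustrative, not the authors' released code):
+
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+
+ rng = np.random.default_rng(0)
+
+ def lookback_ratio(attn, n_context):
+     # attn: [n_heads, n_new, seq_len] attention weights for one generated span;
+     # the first n_context positions of seq_len are the provided context.
+     ctx = attn[:, :, :n_context].sum(-1)    # attention mass on context tokens
+     new = attn[:, :, n_context:].sum(-1)    # attention mass on newly generated tokens
+     ratio = ctx / (ctx + new + 1e-9)        # per-head, per-step lookback ratio
+     return ratio.mean(axis=1)               # average over steps: one feature per head
+
+ # Stand-in data: 200 spans, 32 heads, 10 new tokens each, 50 context tokens.
+ X = np.stack([lookback_ratio(rng.random((32, 10, 60)), 50) for _ in range(200)])
+ y = rng.integers(0, 2, size=200)            # 1 = hallucinated span, 0 = grounded
+ detector = LogisticRegression(max_iter=1000).fit(X, y)
+ print(detector.score(X, y))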
+
+ comment: EMNLP 2024 main conference long paper. The source code is available + at https://github.com/voidism/Lookback-Lens +
+
+
+
+
+ + ♻ ☆ The Mystery of In-Context Learning: A Comprehensive Survey on + Interpretation and Analysis EMNLP 2024 + + +
+ Understanding in-context learning (ICL) capability that enables large +language models (LLMs) to excel in proficiency through demonstration examples +is of utmost importance. This importance stems not only from the better +utilization of this capability across various tasks, but also from the +proactive identification and mitigation of potential risks, including concerns +regarding truthfulness, bias, and toxicity, that may arise alongside the +capability. In this paper, we present a thorough survey on the interpretation +and analysis of in-context learning. First, we provide a concise introduction +to the background and definition of in-context learning. Then, we give an +overview of advancements from two perspectives: 1) a theoretical perspective, +emphasizing studies on mechanistic interpretability and delving into the +mathematical foundations behind ICL; and 2) an empirical perspective, +concerning studies that empirically analyze factors associated with ICL. We +conclude by highlighting the challenges encountered and suggesting potential +avenues for future research. We believe that our work establishes the basis for +further exploration into the interpretation of in-context learning. +Additionally, we have created a repository containing the resources referenced +in our survey. + +
+
+ comment: Accepted to the main conference of EMNLP 2024. Resources are + available at https://github.com/zyxnlp/ICL-Interpretation-Analysis-Resources +
+
+
+
+
+ + ♻ ☆ Enhanced Automated Code Vulnerability Repair using Large Language Models + + +
+ This research addresses the complex challenge of automated repair of code +vulnerabilities, vital for enhancing digital security in an increasingly +technology-driven world. The study introduces a novel and efficient format for +the representation of code modification, using advanced Large Language Models +(LLMs) such as Code Llama and Mistral. These models, fine-tuned on datasets +featuring C code vulnerabilities, significantly improve the accuracy and +adaptability of automated code repair techniques. A key finding is the enhanced +repair accuracy of these models when compared to previous methods such as +VulRepair, which underscores their practical utility and efficiency. The +research also offers a critical assessment of current evaluation metrics, such +as perfect predictions, and their limitations in reflecting the true +capabilities of automated repair models in real-world scenarios. Following +this, it underscores the importance of using test datasets devoid of training +samples, emphasizing the need for dataset integrity to enhance the +effectiveness of LLMs in code repair tasks. The significance of this work is +its contribution to digital security, setting new standards for automated code +vulnerability repair and paving the way for future advancements in the fields +of cybersecurity and artificial intelligence. The study not only highlights +the potential of LLMs in enhancing code security but also fosters further +exploration and research in these crucial areas. + +
+
+
+
+
+ + ♻ ☆ On the Limited Generalization Capability of the Implicit Reward Model + Induced by Direct Preference Optimization EMNLP + + +
+ Reinforcement Learning from Human Feedback (RLHF) is an effective approach +for aligning language models to human preferences. Central to RLHF is learning +a reward function for scoring human preferences. Two main approaches for +learning a reward model are 1) training an EXplicit Reward Model (EXRM) as in +RLHF, and 2) using an implicit reward learned from preference data through +methods such as Direct Preference Optimization (DPO). Prior work has shown that +the implicit reward model of DPO (denoted as DPORM) can approximate an EXRM in +the limit. DPORM's effectiveness directly implies the optimality of the learned +policy, and also has practical implication for LLM alignment methods including +iterative DPO. However, it is unclear how well DPORM empirically matches the +performance of EXRM. This work studies the accuracy at distinguishing preferred +and rejected answers for both DPORM and EXRM. Our findings indicate that even +though DPORM fits the training dataset comparably, it generalizes less +effectively than EXRM, especially when the validation datasets contain +distribution shifts. Across five out-of-distribution settings, DPORM has a mean +drop in accuracy of 3% and a maximum drop of 7%. These findings highlight that +DPORM has limited generalization ability and substantiates the integration of +an explicit reward model in iterative DPO approaches. + +
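+
+ For reference, the implicit reward that DPO induces (the standard form from the DPO
+ derivation, not notation taken from this paper) is, up to a constant in y,
+
+     r_\theta(x, y) = \beta \log \frac{\pi_\theta(y \mid x)}{\pi_{\mathrm{ref}}(y \mid x)},
+
+ where \pi_\theta is the DPO-trained policy, \pi_{\mathrm{ref}} is the frozen reference
+ model, and \beta is the DPO temperature; DPORM scores a preference pair by comparing
+ r_\theta(x, y_w) with r_\theta(x, y_l).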
+
+ comment: 12 pages, 8 tables, 3 figures; Paper Accepted at EMNLP Findings 2024 +
+
+
+
+
+ + ♻ ☆ Jailbreaking LLMs with Arabic Transliteration and Arabizi EMNLP 2024 + + +
+ This study identifies the potential vulnerabilities of Large Language Models +(LLMs) to 'jailbreak' attacks, specifically focusing on the Arabic language and +its various forms. While most research has concentrated on English-based prompt +manipulation, our investigation broadens the scope to investigate the Arabic +language. We initially tested the AdvBench benchmark in Standardized Arabic, +finding that even with prompt manipulation techniques like prefix injection, it +was insufficient to provoke LLMs into generating unsafe content. However, when +using Arabic transliteration and chatspeak (or arabizi), we found that unsafe +content could be produced on platforms like OpenAI GPT-4 and Anthropic Claude 3 +Sonnet. Our findings suggest that using Arabic and its various forms could +expose information that might remain hidden, potentially increasing the risk of +jailbreak attacks. We hypothesize that this exposure could be due to the +model's learned connection to specific words, highlighting the need for more +comprehensive safety training across all language forms. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ StorySparkQA: Expert-Annotated QA Pairs with Real-World Knowledge for + Children's Story-Based Learning EMNLP 2024 + + +
+ Interactive story reading is a common parent-child activity, where parents +expect to teach both language skills and real-world knowledge beyond the story. +While increasing storytelling and reading systems have been developed for this +activity, they often fail to infuse real-world knowledge into the conversation. +This limitation can be attributed to the existing question-answering (QA) +datasets used for children's education, upon which the systems are built, +failing to capture the nuances of how education experts think when conducting +interactive story reading activities. To bridge this gap, we design an +annotation framework, empowered by existing knowledge graph to capture experts' +annotations and thinking process, and leverage this framework to construct +StorySparkQA dataset, which comprises 5,868 expert-annotated QA pairs with +real-world knowledge. We conduct automated and human expert evaluations across +various QA pair generation settings to demonstrate that our StorySparkQA can +effectively support models in generating QA pairs that target real-world +knowledge beyond story content. StorySparkQA is available at +https://huggingface.co/datasets/NEU-HAI/StorySparkQA. + +
+
+ comment: Accepted at EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Rel-A.I.: An Interaction-Centered Approach To Measuring Human-LM + Reliance + + +
+ The ability to communicate uncertainty, risk, and limitation is crucial for +the safety of large language models. However, current evaluations of these +abilities rely on simple calibration, asking whether the language generated by +the model matches appropriate probabilities. Instead, evaluation of this aspect +of LLM communication should focus on the behaviors of their human +interlocutors: how much do they rely on what the LLM says? Here we introduce an +interaction-centered evaluation framework called Rel-A.I. (pronounced "rely") +that measures whether humans rely on LLM generations. We use this framework to +study how reliance is affected by contextual features of the interaction (e.g., +the knowledge domain that is being discussed), or the use of greetings +communicating warmth or competence (e.g., "I'm happy to help!"). We find that +contextual characteristics significantly affect human reliance behavior. For +example, people rely 10% more on LMs when responding to questions involving +calculations and rely 30% more on LMs that are perceived as more competent. Our +results show that calibration and language quality alone are insufficient in +evaluating the risks of human-LM interactions, and illustrate the need to +consider features of the interactional context. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ On Leakage of Code Generation Evaluation Datasets EMNLP 2024 + + +
+ In this paper, we consider contamination by code generation test sets, in +particular in their use in modern large language models. We discuss three +possible sources of such contamination and show findings supporting each of +them: (i) direct data leakage, (ii) indirect data leakage through the use of +synthetic data and (iii) overfitting to evaluation sets during model selection. +To address this, we release Less Basic Python Problems (LBPP): an +uncontaminated new benchmark of 161 prompts with their associated Python +solutions. LBPP is released at https://huggingface.co/datasets/CohereForAI/lbpp . + +
+
+ comment: EMNLP 2024 Findings. 5 main pages, 9 in total +
+
+
+
+
+ + ♻ ☆ Does Refusal Training in LLMs Generalize to the Past Tense? + + +
+ Refusal training is widely used to prevent LLMs from generating harmful, +undesirable, or illegal outputs. We reveal a curious generalization gap in the +current refusal training approaches: simply reformulating a harmful request in +the past tense (e.g., "How to make a Molotov cocktail?" to "How did people make +a Molotov cocktail?") is often sufficient to jailbreak many state-of-the-art +LLMs. We systematically evaluate this method on Llama-3 8B, Claude-3.5 Sonnet, +GPT-3.5 Turbo, Gemma-2 9B, Phi-3-Mini, GPT-4o mini, GPT-4o, o1-mini, +o1-preview, and R2D2 models using GPT-3.5 Turbo as a reformulation model. For +example, the success rate of this simple attack on GPT-4o increases from 1% +using direct requests to 88% using 20 past tense reformulation attempts on +harmful requests from JailbreakBench with GPT-4 as a jailbreak judge. +Interestingly, we also find that reformulations in the future tense are less +effective, suggesting that refusal guardrails tend to consider past historical +questions more benign than hypothetical future questions. Moreover, our +experiments on fine-tuning GPT-3.5 Turbo show that defending against past +reformulations is feasible when past tense examples are explicitly included in +the fine-tuning data. Overall, our findings highlight that the widely used +alignment techniques -- such as SFT, RLHF, and adversarial training -- employed +to align the studied models can be brittle and do not always generalize as +intended. We provide code and jailbreak artifacts at +https://github.com/tml-epfl/llm-past-tense. + +
+
+ comment: Update in v3: o1-mini and o1-preview results (on top of GPT-4o and + Claude 3.5 Sonnet added in v2). We provide code and jailbreak artifacts at + https://github.com/tml-epfl/llm-past-tense +
+
+
+
+
+ + ♻ ☆ Immunization against harmful fine-tuning attacks EMNLP 2024 + + +
+ Large Language Models (LLMs) are often trained with safety guards intended to +prevent harmful text generation. However, such safety training can be removed +by fine-tuning the LLM on harmful datasets. While this emerging threat (harmful +fine-tuning attacks) has been characterized by previous work, there is little +understanding of how we should proceed in constructing and validating defenses +against these attacks, especially in the case where defenders would not have +control of the fine-tuning process. We introduce a formal framework based on +the training budget of an attacker, which we call "Immunization" conditions. +Using a formal characterisation of the harmful fine-tuning problem, we provide +a thorough description of what a successful defense must comprise and +establish a set of guidelines for how rigorous defense research should proceed to +give us confidence. + +
+
+ comment: Published in EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Foundations of Large Language Model Compression -- Part 1: Weight + Quantization + + +
+ In recent years, compression of large language models (LLMs) has emerged as +an important problem to enable language model deployment on +resource-constrained devices, reduce computational costs, and mitigate the +environmental footprint of large-scale AI infrastructure. In this paper, we lay +down the foundation for LLM quantization from a convex optimization perspective +and propose a quantization technique that builds on this foundation for optimum +quantization outcomes. Our quantization framework, CVXQ, scales to models +containing hundreds of billions of weight parameters and provides users with +the flexibility to compress models to any specified model size, post-training. +A reference implementation of CVXQ can be obtained from github.com/seannz/cvxq. + +
+
+ comment: Preprint. 17 pages, 4 figures, 5 appendices +
+
+
+
+
+ + ♻ ☆ EIA: Environmental Injection Attack on Generalist Web Agents for Privacy + Leakage + + +
+ Generalist web agents have demonstrated remarkable potential in autonomously +completing a wide range of tasks on real websites, significantly boosting human +productivity. However, web tasks, such as booking flights, usually involve +users' PII, which may be exposed to potential privacy risks if web agents +accidentally interact with compromised websites, a scenario that remains +largely unexplored in the literature. In this work, we narrow this gap by +conducting the first study on the privacy risks of generalist web agents in +adversarial environments. First, we present a realistic threat model for +attacks on the website, where we consider two adversarial targets: stealing +users' specific PII or the entire user request. Then, we propose a novel attack +method, termed Environmental Injection Attack (EIA). EIA injects malicious +content designed to adapt well to environments where the agents operate, and our +work instantiates EIA specifically for privacy scenarios in web environments. +We collect 177 action steps that involve diverse PII categories on realistic +websites from Mind2Web, and conduct experiments using one of the most +capable generalist web agent frameworks to date. The results demonstrate that +EIA achieves up to 70% ASR in stealing specific PII and 16% ASR for the full user +request. Additionally, by assessing the stealthiness and experimenting with a +defensive system prompt, we indicate that EIA is hard to detect and mitigate. +Notably, attacks that are not well adapted for a webpage can be detected via +human inspection, leading to our discussion about the trade-off between +security and autonomy. However, extra attacker effort can make EIA +seamlessly adapted, rendering such supervision ineffective. Thus, we further +discuss the defenses at the pre- and post-deployment stages of the websites +without relying on human supervision and call for more advanced defense +strategies. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ♻ ☆ PRompt Optimization in Multi-Step Tasks (PROMST): Integrating Human + Feedback and Heuristic-based Sampling EMNLP 2024 + + +
+ Prompt optimization aims to find the best prompt to a large language model +(LLM) for a given task. LLMs have been successfully used to help find and +improve prompt candidates for single-step tasks. However, realistic tasks for +agents are multi-step and introduce new challenges: (1) Prompt content is +likely to be more extensive and complex, making it more difficult for LLMs to +analyze errors, (2) the impact of an individual step is difficult to evaluate, +and (3) different people may have varied preferences about task execution. +While humans struggle to optimize prompts, they are good at providing feedback +about LLM outputs; we therefore introduce a new LLM-driven discrete prompt +optimization framework PRompt Optimization in Multi-Step Tasks (PROMST) that +incorporates human-designed feedback rules to automatically offer direct +suggestions for improvement. We also use an extra learned heuristic model that +predicts prompt performance to efficiently sample from prompt candidates. This +approach significantly outperforms both human-engineered prompts and several +other prompt optimization methods across 11 representative multi-step tasks (an +average 10.6%-29.3% improvement over the current best methods on five LLMs, +respectively). We believe our work can serve as a benchmark for automatic +prompt optimization for LLM-driven multi-step tasks. Datasets and code are +available at https://github.com/yongchao98/PROMST. The project page is available at +https://yongchao98.github.io/MIT-REALM-PROMST. + +
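+
+ A rough sketch of the heuristic-guided sampling idea described above (the scorer and all
+ names are illustrative placeholders, not the released PROMST code):
+
+ import math
+ import random
+
+ def sample_prompts(candidates, predict_score, k=4, temperature=0.5):
+     # Favour candidates that a learned heuristic model predicts will perform well,
+     # instead of evaluating every candidate on the expensive multi-step task.
+     weights = [math.exp(predict_score(p) / temperature) for p in candidates]
+     return random.choices(candidates, weights=weights, k=k)
+
+ # Toy usage with a stand-in scorer; a real system would call a trained regressor.
+ prompts = ["You are a careful planner...", "Think step by step...", "Answer with JSON only..."]
+ print(sample_prompts(prompts, predict_score=lambda p: len(p) / 40.0, k=2))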
+
+ comment: 62 pages, 14 figures, Published in EMNLP 2024 Main +
+
+
+
+
+ + ♻ ☆ PARAMANU-AYN: Pretrain from scratch or Continual Pretraining of LLMs for + Legal Domain Adaptation? + + +
+ In this paper, we present Paramanu-Ayn, a collection of legal language models +trained exclusively on Indian legal case documents. This 97-million-parameter +Auto-Regressive (AR) decoder-only model was pretrained from scratch with a +context size of 8192 on a single GPU for just 185 hours, achieving an efficient +MFU of 41.35. We also developed a legal domain specialized BPE tokenizer. We +evaluated our model using perplexity and zero-shot tasks: case judgment +prediction with explanation and abstractive case summarization. Paramanu-Ayn +outperformed Llama-2 7B and Gemini-Pro in the case judgment prediction with +explanation task on test accuracy by nearly 2 percentage points, despite being +72 times smaller. In zero-shot abstractive summarization, it surpassed +decoder-only LLMs generating fixed-length summaries (5000 tokens) by over 10 +percentage points in BLEU and METEOR metrics, and by nearly 4 percentage points +in BERTScore. Further evaluations on zero-shot commonsense and mathematical +benchmarks showed that Paramanu-Ayn excelled despite being trained exclusively +on legal documents, outperforming Llama-1, Llama-2, and Falcon on +AGIEVAL-AQuA-RAT and AGIEVAL-SAT-Math tasks. We also instruction-tuned our +model on 10,763 diverse legal tasks, including legal clause generation, legal +drafting, case summarization, etc. The Paramanu-Ayn-instruct model scored above +8 out of 10 in clarity, relevance, completeness, and legal reasoning metrics by +GPT-3.5-Turbo. We found that our models were able to learn drafting knowledge +and generalize to draft legal contracts and legal clauses with limited +instruction-tuning. Hence, we conclude that for a strong domain-specialized +generative language model (such as legal), domain-specialized pretraining from +scratch is more cost-effective, environmentally friendly, and remains +competitive with larger models, or is even better than adapting LLMs, for legal +domain tasks. + +
+
+
+
+
+ + ♻ ☆ Leopard: A Vision Language Model For Text-Rich Multi-Image Tasks + + +
+ Text-rich images, where text serves as the central visual element guiding the +overall understanding, are prevalent in real-world applications, such as +presentation slides, scanned documents, and webpage snapshots. Tasks involving +multiple text-rich images are especially challenging, as they require not only +understanding the content of individual images but reasoning about +inter-relationships and logical flows across multiple visual inputs. Despite +the importance of these scenarios, current multimodal large language models +(MLLMs) struggle to handle such tasks due to two key challenges: (1) the +scarcity of high-quality instruction tuning datasets for text-rich multi-image +scenarios, and (2) the difficulty in balancing image resolution with visual +feature sequence length. To address these challenges, we propose Leopard, a +MLLM designed specifically for handling vision-language tasks involving +multiple text-rich images. First, we curated about one million high-quality +multimodal instruction-tuning data, tailored to text-rich, multi-image +scenarios. Second, we developed an adaptive high-resolution multi-image +encoding module to dynamically optimize the allocation of visual sequence +length based on the original aspect ratios and resolutions of the input images. +Experiments across a wide range of benchmarks demonstrate our model's superior +capabilities in text-rich, multi-image evaluations and competitive performance +in general domain evaluations. + +
+
+ comment: Our code is available at https://github.com/Jill0001/Leopard +
+
+
+
+
+ + ♻ ☆ Seemingly Plausible Distractors in Multi-Hop Reasoning: Are Large + Language Models Attentive Readers? + + +
+ State-of-the-art Large Language Models (LLMs) are credited with an +increasing number of different capabilities, ranging from reading +comprehension, through advanced mathematical and reasoning skills, to +scientific knowledge. In this paper, we focus on their multi-hop reasoning +capability: the ability to identify and integrate information from multiple +textual sources. + Given the concerns about the presence of simplifying cues in existing +multi-hop reasoning benchmarks, which allow models to circumvent the reasoning +requirement, we set out to investigate whether LLMs are prone to exploiting +such simplifying cues. We find evidence that they indeed circumvent the +requirement to perform multi-hop reasoning, but they do so in more subtle ways +than what was reported about their fine-tuned pre-trained language model (PLM) +predecessors. Motivated by this finding, we propose a challenging multi-hop +reasoning benchmark by generating seemingly plausible multi-hop reasoning +chains, which ultimately lead to incorrect answers. We evaluate multiple open +and proprietary state-of-the-art LLMs, and find that their ability to +perform multi-hop reasoning is affected, as indicated by up to a 45% relative +decrease in F1 score when presented with such seemingly plausible alternatives. +We conduct a deeper analysis and find evidence that while LLMs tend to ignore +misleading lexical cues, misleading reasoning paths indeed present a +significant challenge. + +
+
+ comment: 15 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at + Any Resolution + + +
+ We present the Qwen2-VL Series, an advanced upgrade of the previous Qwen-VL +models that redefines the conventional predetermined-resolution approach in +visual processing. Qwen2-VL introduces the Naive Dynamic Resolution mechanism, +which enables the model to dynamically process images of varying resolutions +into different numbers of visual tokens. This approach allows the model to +generate more efficient and accurate visual representations, closely aligning +with human perceptual processes. The model also integrates Multimodal Rotary +Position Embedding (M-RoPE), facilitating the effective fusion of positional +information across text, images, and videos. We employ a unified paradigm for +processing both images and videos, enhancing the model's visual perception +capabilities. To explore the potential of large multimodal models, Qwen2-VL +investigates the scaling laws for large vision-language models (LVLMs). By +scaling both the model size (with versions at 2B, 8B, and 72B parameters) and the +amount of training data, the Qwen2-VL Series achieves highly competitive +performance. Notably, the Qwen2-VL-72B model achieves results comparable to +leading models such as GPT-4o and Claude3.5-Sonnet across various multimodal +benchmarks, outperforming other generalist models. Code is available at +https://github.com/QwenLM/Qwen2-VL. + +
+
+ comment: Code is available at https://github.com/QwenLM/Qwen2-VL. arXiv admin + note: text overlap with arXiv:2408.15262 by other authors +
+
+
+
+
+ + ♻ ☆ Fast Matrix Multiplications for Lookup Table-Quantized LLMs EMNLP 2024 + + +
+ The deployment of large language models (LLMs) is often constrained by memory +bandwidth, where the primary bottleneck is the cost of transferring model +parameters from the GPU's global memory to its registers. When coupled with +custom kernels that fuse the dequantization and matmul operations, weight-only +quantization can thus enable faster inference by reducing the amount of memory +movement. However, developing high-performance kernels for weight-quantized +LLMs presents substantial challenges, especially when the weights are +compressed to non-evenly-divisible bit widths (e.g., 3 bits) with non-uniform, +lookup table (LUT) quantization. This paper describes FLUTE, a flexible lookup +table engine for LUT-quantized LLMs, which uses offline restructuring of the +quantized weight matrix to minimize bit manipulations associated with +unpacking, and vectorization and duplication of the lookup table to mitigate +shared memory bandwidth constraints. At batch sizes < 32 and quantization group +size of 128 (typical in LLM inference), the FLUTE kernel can be 2-4x faster +than existing GEMM kernels. As an application of FLUTE, we explore a simple +extension to lookup table-based NormalFloat quantization and apply it to +quantize LLaMA3 to various configurations, obtaining competitive quantization +performance against strong baselines while obtaining an end-to-end throughput +increase of 1.5 to 2 times. + +
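+
+ A simplified NumPy illustration of what lookup-table dequantization computes (reference
+ logic only; the actual FLUTE kernel is a fused, restructured GPU implementation):
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ bits, group_size, n = 3, 128, 4096
+ lut = np.sort(rng.normal(size=2**bits)).astype(np.float32)    # 8-entry lookup table
+
+ codes = rng.integers(0, 2**bits, size=n, dtype=np.uint8)      # one 3-bit index per weight
+ scales = rng.random(size=n // group_size).astype(np.float32)  # one scale per group of 128
+
+ # Dequantize: table lookup, then per-group rescaling. A fused kernel does this on the
+ # fly while streaming the packed weights, immediately before the matmul.
+ w = (lut[codes].reshape(-1, group_size) * scales[:, None]).reshape(-1)
+ print(w.shape)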
+
+ comment: EMNLP 2024 (Findings) +
+
+
+
+
+ + ♻ ☆ Llamipa: An Incremental Discourse Parser EMNLP 2024 + + +
+ This paper provides the first discourse parsing experiments with a large +language model (LLM) finetuned on corpora annotated in the style of SDRT +(Segmented Discourse Representation Theory; Asher, 1993; Asher and Lascarides, +2003). The result is a discourse parser, Llamipa (Llama Incremental Parser), +that leverages discourse context, leading to substantial performance gains over +approaches that use encoder-only models to provide local, context-sensitive +representations of discourse units. Furthermore, it can process discourse data +incrementally, which is essential for the eventual use of discourse information +in downstream tasks. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Nebula: A discourse aware Minecraft Builder EMNLP 2024 + + +
+ When engaging in collaborative tasks, humans efficiently exploit the semantic +structure of a conversation to optimize verbal and nonverbal interactions. But +in recent "language to code" or "language to action" models, this information +is lacking. We show how incorporating the prior discourse and nonlinguistic +context of a conversation situated in a nonlinguistic environment can improve +the "language to action" component of such interactions. We finetune an LLM to +predict actions based on prior context; our model, Nebula, doubles the +net-action F1 score over the baseline on this task of Jayannavar et al. (2020). +We also investigate our model's ability to construct shapes and understand +location descriptions using a synthetic dataset. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ LongForm: Effective Instruction Tuning with Reverse Instructions EMNLP 2024 + + +
+ Instruction tuning enables language models to more effectively generalize and +better follow user intent. However, obtaining instruction data is costly and +challenging. Prior work employs methods such as expensive human annotation, +crowd-sourced datasets with alignment issues, and generating noisy examples via +LLMs. We introduce the LongForm-C dataset, which is created by reverse +instructions. We generate instructions via LLMs for human-written corpus +examples using reverse instructions. First we select a diverse set of +human-written documents from corpora such as C4 and Wikipedia; then we generate +instructions for these documents via LLMs. This approach provides a cheaper and +cleaner instruction-tuning dataset with natural output and one suitable for +long text generation. Our models outperform 10x larger language models without +instruction tuning on tasks such as story/recipe generation and long-form +question answering. Moreover, LongForm models outperform prior +instruction-tuned models such as FLAN-T5 and Alpaca by a large margin, and +improve language understanding capabilities further. We publicly release our +data and models: https://github.com/akoksal/LongForm. + +
+
+ comment: EMNLP 2024 Findings. This version extends the training with recent + LLMs, evaluation with new metrics, and NLU tasks +
+
+
+
+
+ + ♻ ☆ TurkishMMLU: Measuring Massive Multitask Language Understanding in + Turkish EMNLP 2024 + + +
+ Multiple choice question answering tasks evaluate the reasoning, +comprehension, and mathematical abilities of Large Language Models (LLMs). +While existing benchmarks employ automatic translation for multilingual +evaluation, this approach is error-prone and potentially introduces culturally +biased questions, especially in social sciences. We introduce the first +multitask, multiple-choice Turkish QA benchmark, TurkishMMLU, to evaluate LLMs' +understanding of the Turkish language. TurkishMMLU includes over 10,000 +questions, covering 9 different subjects from Turkish high-school education +curricula. These questions are written by curriculum experts, suitable for the +high-school curricula in Turkey, covering subjects ranging from natural +sciences and math questions to more culturally representative topics such as +Turkish Literature and the history of the Turkish Republic. We evaluate over 20 +LLMs, including multilingual open-source (e.g., Gemma, Llama, MT5), +closed-source (GPT 4o, Claude, Gemini), and Turkish-adapted (e.g., Trendyol) +models. We provide an extensive evaluation, including zero-shot and few-shot +evaluation of LLMs, chain-of-thought reasoning, and question difficulty +analysis along with model performance. We provide an in-depth analysis of the +Turkish capabilities and limitations of current LLMs to provide insights for +future LLMs for the Turkish language. We publicly release our code for the +dataset and evaluation: https://github.com/ArdaYueksel/TurkishMMLU. + +
+
+ comment: EMNLP 2024 - Findings +
+
+
+
+
+ + ♻ ☆ Generate-on-Graph: Treat LLM as both Agent and KG in Incomplete + Knowledge Graph Question Answering EMNLP 2024 + + +
+ To address the issues of insufficient knowledge and hallucination in Large +Language Models (LLMs), numerous studies have explored integrating LLMs with +Knowledge Graphs (KGs). However, these methods are typically evaluated on +conventional Knowledge Graph Question Answering (KGQA) with complete KGs, where +all factual triples required for each question are entirely covered by the +given KG. In such cases, LLMs primarily act as an agent to find answer entities +within the KG, rather than effectively integrating the internal knowledge of +LLMs and external knowledge sources such as KGs. In fact, KGs are often +incomplete to cover all the knowledge required to answer questions. To simulate +these real-world scenarios and evaluate the ability of LLMs to integrate +internal and external knowledge, we propose leveraging LLMs for QA under +Incomplete Knowledge Graph (IKGQA), where the provided KG lacks some of the +factual triples for each question, and construct corresponding datasets. To +handle IKGQA, we propose a training-free method called Generate-on-Graph (GoG), +which can generate new factual triples while exploring KGs. Specifically, GoG +performs reasoning through a Thinking-Searching-Generating framework, which +treats LLM as both Agent and KG in IKGQA. Experimental results on two datasets +demonstrate that our GoG outperforms all previous methods. + +
+
+ comment: Accepted by EMNLP 2024 Main +
+
+
+
+
+ + ♻ ☆ Quantifying Generalization Complexity for Large Language Models + + +
+ While large language models (LLMs) have shown exceptional capabilities in +understanding complex queries and performing sophisticated tasks, their +generalization abilities are often deeply entangled with memorization, +necessitating more precise evaluation. To address this challenge, we introduce +Scylla, a dynamic evaluation framework that quantitatively measures the +generalization abilities of LLMs. Scylla disentangles generalization from +memorization by assessing model performance on both in-distribution (ID) and +out-of-distribution (OOD) data through 20 tasks across 5 levels of complexity. +Through extensive experiments, we uncover a non-monotonic relationship between +task complexity and the performance gap between ID and OOD data, which we term +the generalization valley. Specifically, this phenomenon reveals a critical +threshold - referred to as critical complexity - where reliance on +non-generalizable behavior peaks, indicating the upper bound of LLMs' +generalization capabilities. As model size increases, the critical complexity +shifts toward higher levels of task complexity, suggesting that larger models +can handle more complex reasoning tasks before over-relying on memorization. +Leveraging Scylla and the concept of critical complexity, we benchmark 28 LLMs, +including both open-source models such as the LLaMA and Qwen families and +closed-source models like Claude and GPT, providing a more robust evaluation +and establishing a clearer understanding of LLMs' generalization capabilities. + +
+
+
+
+
+ + ♻ ☆ Ada-Instruct: Adapting Instruction Generators for Complex Reasoning + + +
+ Instruction augmentation is a crucial step for unleashing the full potential +of large language models (LLMs) in downstream tasks. Existing Self-Instruct +methods primarily simulate new instructions from a few initial instructions +with in-context learning. However, our study identifies a critical flaw in this +approach: even with GPT-4o, Self-Instruct cannot generate complex instructions +of length ≥ 100, which are necessary in complex tasks such as code +completion. + To address this issue, our key insight is that fine-tuning open-source LLMs +with only ten examples can produce complex instructions that maintain +distributional consistency for complex reasoning tasks. We introduce +Ada-Instruct, an adaptive instruction generator developed through fine-tuning. +We empirically validated Ada-Instruct's efficacy across different applications. +The results highlight Ada-Instruct's capacity to generate long, intricate, and +distributionally consistent instructions. + +
+
+
+
+
+ + ♻ ☆ In-Context Editing: Learning Knowledge from Self-Induced Distributions + + +
+ In scenarios where language models must incorporate new information +efficiently without extensive retraining, traditional fine-tuning methods are +prone to overfitting, degraded generalization, and unnatural language +generation. To address these limitations, we introduce Consistent In-Context +Editing (ICE), a novel approach leveraging the model's in-context learning +capability to optimize toward a contextual distribution rather than a one-hot +target. ICE introduces a simple yet effective optimization framework for the +model to internalize new knowledge by aligning its output distributions with +and without additional context. This method enhances the robustness and +effectiveness of gradient-based tuning methods, preventing overfitting and +preserving the model's integrity. We analyze ICE across four critical aspects +of knowledge editing: accuracy, locality, generalization, and linguistic +quality, demonstrating its advantages. Experimental results confirm the +effectiveness of ICE and demonstrate its potential for continual editing, +ensuring that the integrity of the model is preserved while updating +information. + +
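+
+ One plausible reading of the objective sketched above (an illustrative form, not the
+ paper's exact definition): pull the model's no-context distribution toward its own
+ in-context distribution rather than toward a one-hot target,
+
+     \mathcal{L}(\theta) = \mathrm{KL}\left( p_\theta(\cdot \mid c, x) \,\|\, p_\theta(\cdot \mid x) \right),
+
+ where x is the query, c is the context carrying the new knowledge, and the in-context
+ distribution acts as a soft, self-induced target in place of a one-hot label.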
+
+
+
+
+ + ♻ ☆ Fighting Randomness with Randomness: Mitigating Optimisation Instability + of Fine-Tuning using Delayed Ensemble and Noisy Interpolation EMNLP'24 + + +
+ While fine-tuning of pre-trained language models generally helps to overcome +the lack of labelled training samples, it also displays model performance +instability. This instability mainly originates from randomness in +initialisation or data shuffling. To address this, researchers either modify +the training process or augment the available samples, which typically results +in increased computational costs. We propose a new mitigation strategy, called +Delayed Ensemble with Noisy Interpolation (DENI), that leverages the strengths +of ensembling, noise regularisation and model interpolation, while retaining +computational efficiency. We compare DENI with 9 representative mitigation +strategies across 3 models, 4 tuning strategies and 7 text classification +datasets. We show that: 1) DENI outperforms the best performing mitigation +strategy (Ensemble), while using only a fraction of its cost; 2) the mitigation +strategies are beneficial for parameter-efficient fine-tuning (PEFT) methods, +outperforming full fine-tuning in specific cases; and 3) combining DENI with +data augmentation often leads to even more effective instability mitigation. + +
+
+ comment: Accepted to the Findings of the EMNLP'24 Conference +
+
+
+
+
+ + ♻ ☆ On Sensitivity of Learning with Limited Labelled Data to the Effects of + Randomness: Impact of Interactions and Systematic Choices EMNLP'24 + + +
+ While learning with limited labelled data can improve performance when the +labels are lacking, it is also sensitive to the effects of uncontrolled +randomness introduced by so-called randomness factors (e.g., varying order of +data). We propose a method to systematically investigate the effects of +randomness factors while taking the interactions between them into +consideration. To measure the true effects of an individual randomness factor, +our method mitigates the effects of other factors and observes how the +performance varies across multiple runs. Applying our method to multiple +randomness factors across in-context learning and fine-tuning approaches on 7 +representative text classification tasks and meta-learning on 3 tasks, we show +that: 1) disregarding interactions between randomness factors in existing works +caused inconsistent findings due to incorrect attribution of the effects of +randomness factors, such as disproving the consistent sensitivity of in-context +learning to sample order even with random sample selection; and 2) besides +mutual interactions, the effects of randomness factors, especially sample +order, are also dependent on more systematic choices unexplored in existing +works, such as number of classes, samples per class or choice of prompt format. + +
+
+ comment: Accepted to the EMNLP'24 Main Conference +
+
+
+
+
+ + ♻ ☆ 2D-TPE: Two-Dimensional Positional Encoding Enhances Table Understanding + for Large Language Models + + +
+ Tables are ubiquitous across various domains for concisely representing +structured information. Empowering large language models (LLMs) to reason over +tabular data represents an actively explored direction. However, since typical +LLMs only support one-dimensional (1D) inputs, existing methods often flatten +the two-dimensional (2D) table structure into a sequence of tokens, which can +severely disrupt the spatial relationships and result in an inevitable loss of +vital contextual information. In this paper, we first empirically demonstrate +the detrimental impact of such flattening operations on the performance of LLMs +in capturing the spatial information of tables through two elaborate proxy +tasks. Subsequently, we introduce a simple yet effective positional encoding +method, termed "2D-TPE" (Two-Dimensional Table Positional Encoding), to +address this challenge. 2D-TPE enables each attention head to dynamically +select a permutation order of tokens within the context for attending to them, +where each permutation represents a distinct traversal mode for the table, such +as column-wise or row-wise traversal. 2D-TPE effectively mitigates the risk of +losing essential spatial information while preserving computational efficiency, +thus better preserving the table structure. Extensive experiments across five +benchmarks demonstrate that 2D-TPE outperforms strong baselines, underscoring +the importance of preserving the table structure for accurate table +comprehension. Comprehensive analysis further reveals the substantially better +scalability of 2D-TPE to large tables than baselines. + +
+
+
+
+
+ + ♻ ☆ TCSinger: Zero-Shot Singing Voice Synthesis with Style Transfer and + Multi-Level Style Control EMNLP 2024 + + +
+ Zero-shot singing voice synthesis (SVS) with style transfer and style control +aims to generate high-quality singing voices with unseen timbres and styles +(including singing method, emotion, rhythm, technique, and pronunciation) from +audio and text prompts. However, the multifaceted nature of singing styles +poses a significant challenge for effective modeling, transfer, and control. +Furthermore, current SVS models often fail to generate singing voices rich in +stylistic nuances for unseen singers. To address these challenges, we introduce +TCSinger, the first zero-shot SVS model for style transfer across cross-lingual +speech and singing styles, along with multi-level style control. Specifically, +TCSinger proposes three primary modules: 1) the clustering style encoder +employs a clustering vector quantization model to stably condense style +information into a compact latent space; 2) the Style and Duration Language +Model (S&D-LM) concurrently predicts style information and phoneme duration, +which benefits both; 3) the style adaptive decoder uses a novel mel-style +adaptive normalization method to generate singing voices with enhanced details. +Experimental results show that TCSinger outperforms all baseline models in +synthesis quality, singer similarity, and style controllability across various +tasks, including zero-shot style transfer, multi-level style control, +cross-lingual style transfer, and speech-to-singing style transfer. Singing +voice samples can be accessed at https://tcsinger.github.io/. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Multi-FAct: Assessing Factuality of Multilingual LLMs using FActScore + + +
+ Evaluating the factuality of long-form large language model (LLM)-generated +text is an important challenge. Recently there has been a surge of interest in +factuality evaluation for English, but little is known about the factuality +evaluation of multilingual LLMs, especially when it comes to long-form +generation. We introduce a simple +pipeline for multilingual factuality evaluation by applying FActScore (Min et +al., 2023) to diverse languages. In addition to evaluating multilingual +factual generation, we evaluate the factual accuracy of long-form text +generation in topics that reflect regional diversity. We also examine the +feasibility of running the FActScore pipeline using non-English Wikipedia and +provide comprehensive guidelines on multilingual factual evaluation for +regionally diverse topics. + +
+
+
+
+
+ + ♻ ☆ Latte: Latent Attention for Linear Time Transformers + + +
+ The time complexity of the standard attention mechanism in transformers +scales quadratically with sequence length. We propose a probabilistic framework +for attention, enabling us to derive a novel low-rank linear +re-parameterisation of both bidirectional and causal cases, based on defining a +latent variable model. Our method can be seamlessly integrated as a drop-in +replacement for the standard attention mechanism. Additionally, this framework +provides a natural extension for combining local standard attention with our +global linear attention. This approach allows us to extend the context length +of existing large pre-trained models with only a few additional training steps. +The resulting "Latte Transformer" achieves performance comparable to standard +attention and other state-of-the-art models, while maintaining linear time and +memory complexity, along with constant-time next-token prediction during +inference. + +
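+
+ Schematically, a latent-variable factorisation of attention replaces the T x T weight
+ matrix with mixing through H << T latent states (a compressed reading of the abstract,
+ not the paper's exact derivation):
+
+     \alpha_{ij} \approx \sum_{h=1}^{H} p(h \mid x_i)\, p(j \mid h),
+
+ so each output mixes H latent summaries of the sequence, which can be accumulated in
+ O(TH) time and memory rather than O(T^2), and kept as a running state for constant-time
+ next-token prediction in the causal case.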
+
+
+
+
+ + ♻ ☆ Twists, Humps, and Pebbles: Multilingual Speech Recognition Models + Exhibit Gender Performance Gaps EMNLP 2024 + + +
+ Current automatic speech recognition (ASR) models are designed to be used +across many languages and tasks without substantial changes. However, this +broad language coverage hides performance gaps within languages, for example, +across genders. Our study systematically evaluates the performance of two +widely used multilingual ASR models on three datasets, encompassing 19 +languages from eight language families and two speaking conditions. Our +findings reveal clear gender disparities, with the advantaged group varying +across languages and models. Surprisingly, those gaps are not explained by +acoustic or lexical properties. However, probing internal model states reveals +a correlation with gendered performance gap. That is, the easier it is to +distinguish speaker gender in a language using probes, the more the gap +reduces, favoring female speakers. Our results show that gender disparities +persist even in state-of-the-art models. Our findings have implications for the +improvement of multilingual ASR systems, underscoring the importance of +accessibility to training data and nuanced evaluation to predict and mitigate +gender gaps. We release all code and artifacts at +https://github.com/g8a9/multilingual-asr-gender-gap. + +
+
+ comment: Accepted at EMNLP 2024. Code and artifacts at + https://github.com/g8a9/multilingual-asr-gender-gap +
+
+
+
+
+ + ♻ ☆ Conversational Feedback in Scripted versus Spontaneous Dialogues: A + Comparative Analysis + + +
+ Scripted dialogues such as movie and TV subtitles constitute a widespread +source of training data for conversational NLP models. However, there are +notable linguistic differences between these dialogues and spontaneous +interactions, especially regarding the occurrence of communicative feedback +such as backchannels, acknowledgments, or clarification requests. This paper +presents a quantitative analysis of such feedback phenomena in both subtitles +and spontaneous conversations. Based on conversational data spanning eight +languages and multiple genres, we extract lexical statistics, classifications +from a dialogue act tagger, expert annotations and labels derived from a +fine-tuned Large Language Model (LLM). Our main empirical findings are that (1) +communicative feedback is markedly less frequent in subtitles than in +spontaneous dialogues and (2) subtitles contain a higher proportion of negative +feedback. We also show that dialogues generated by standard LLMs lie much +closer to scripted dialogues than spontaneous interactions in terms of +communicative feedback. + +
+
+ comment: Updated version for SIGdial 2024 +
+
+
+
+
+ + ♻ ☆ miniCTX: Neural Theorem Proving with (Long-)Contexts + + +
+ Real-world formal theorem proving often depends on a wealth of context, +including definitions, lemmas, comments, file structure, and other information. +We introduce miniCTX, which tests a model's ability to prove formal +mathematical theorems that depend on new context that is not seen during +training. miniCTX contains theorems sourced from real Lean projects and +textbooks, each associated with a context that can span tens of thousands of +tokens. Models are tasked with proving a theorem given access to code from the +theorem's repository, which contains context that is needed for the proof. As a +baseline for miniCTX, we tested fine-tuning and prompting methods that +condition theorem proving on preceding context. Both approaches substantially +outperform traditional methods that rely solely on state information. We found +that this ability to use context is not captured by previous benchmarks such as +miniF2F. Alongside miniCTX, we offer ntp-toolkit for automatically extracting +and annotating theorem proving data, making it easy to add new projects into +miniCTX to ensure that contexts are not seen during training. miniCTX offers a +challenging and realistic evaluation of neural theorem provers. + +
+
+
+
+
+ + ♻ ☆ Compositional Hardness of Code in Large Language Models -- A + Probabilistic Perspective + + +
+ A common practice in large language model (LLM) usage for complex analytical +tasks such as code generation is to sample a solution for the entire task +within the model's context window. Previous works have shown that subtask +decomposition within the model's context (chain of thought) is beneficial for +solving such tasks. In this work, we point out a limitation of LLMs' ability to +perform several sub-tasks within the same context window - an in-context +hardness of composition - pointing to an advantage of distributing a decomposed +problem across a multi-agent system of LLMs. The hardness of composition is +quantified by a generation complexity metric, i.e., the number of LLM +generations required to sample at least one correct solution. We find a gap +between the generation complexity of solving a compositional problem within the +same context and that of distributing it among multiple agents, a gap that increases +exponentially with the solution's length. We prove our results theoretically +and demonstrate them empirically. + +
+
+
+
+
+ + ♻ ☆ Evaluating Automatic Metrics with Incremental Machine Translation + Systems + + +
+ We introduce a dataset comprising commercial machine translations, gathered +weekly over six years across 12 translation directions. Since human A/B testing +is commonly used, we assume commercial systems improve over time, which enables +us to evaluate machine translation (MT) metrics based on their preference for +more recent translations. Our study not only confirms several prior findings, +such as the advantage of neural metrics over non-neural ones, but also explores +the debated issue of how MT quality affects metric reliability--an +investigation that smaller datasets in previous research could not sufficiently +explore. Overall, our research demonstrates the dataset's value as a testbed +for metric evaluation. We release our code at https://github.com/gjwubyron/Evo + +
+
+
+
+
+ + ♻ ☆ Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on + Graphs + + +
+ Large language models (LLMs), while exhibiting exceptional performance, +suffer from hallucinations, especially on knowledge-intensive tasks. Existing +works propose to augment LLMs with individual text units retrieved from +external knowledge corpora to alleviate the issue. However, in many domains, +texts are interconnected (e.g., academic papers in a bibliographic graph are +linked by citations and co-authorships) which form a (text-attributed) graph. +The knowledge in such graphs is encoded not only in single texts/nodes but also +in their associated connections. To facilitate the research of augmenting LLMs +with graphs, we manually construct a Graph Reasoning Benchmark dataset called +GRBench, containing 1,740 questions that can be answered with the knowledge +from 10 domain graphs. Then, we propose a simple and effective framework called +Graph Chain-of-thought (Graph-CoT) to augment LLMs with graphs by encouraging +LLMs to reason on the graph iteratively. Each Graph-CoT iteration consists of +three sub-steps: LLM reasoning, LLM-graph interaction, and graph execution. We +conduct systematic experiments with three LLM backbones on GRBench, where +Graph-CoT outperforms the baselines consistently. The code is available at +https://github.com/PeterGriffinJin/Graph-CoT. + +
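+
+ A skeletal version of the iteration described above, with the three sub-steps marked;
+ llm and graph are assumed callables/objects and the prompts are placeholders (see the
+ linked repository for the actual implementation):
+
+ def graph_cot(question, graph, llm, max_iters=5):
+     trace = [f"Question: {question}"]
+     for _ in range(max_iters):
+         thought = llm("\n".join(trace) + "\nThought:")                          # 1) LLM reasoning
+         action = llm("\n".join(trace) + f"\nThought: {thought}\nGraph call:")   # 2) LLM-graph interaction
+         if action.strip().startswith("FINISH"):
+             return action.strip()[len("FINISH"):].strip()
+         result = graph.execute(action)                                          # 3) graph execution
+         trace += [f"Thought: {thought}", f"Graph call: {action}", f"Result: {result}"]
+     return "No answer within the iteration budget."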
+
+ comment: 21 pages. Code: https://github.com/PeterGriffinJin/Graph-CoT +
+
+
+
+
+ + ♻ ☆ Distilling Instruction-following Abilities of Large Language Models with + Task-aware Curriculum Planning + + +
+ Instruction tuning aims to align large language models (LLMs) with +open-domain instructions and human-preferred responses. While several studies +have explored autonomous approaches to distilling and annotating instructions +from powerful proprietary LLMs, such as ChatGPT, they often neglect the impact +of the distributions and characteristics of tasks, together with the varying +difficulty of instructions in training sets. This oversight can lead to +imbalanced knowledge capabilities and poor generalization powers of student +LLMs. To address these challenges, we introduce Task-Aware Curriculum Planning +for Instruction Refinement (TAPIR), a multi-round distillation framework that +utilizes an oracle LLM to select instructions that are difficult for a student +LLM to follow. To balance the student's capabilities, task distributions in +training sets are adjusted with responses automatically refined according to +their corresponding tasks. In addition, by incorporating curriculum planning, +our approach systematically escalates the difficulty levels of tasks, +progressively enhancing the student LLM's capabilities. We rigorously evaluate +TAPIR using several widely recognized benchmarks (such as AlpacaEval 2.0, +MT-Bench, etc.) and multiple student LLMs. Empirical results demonstrate that +student LLMs, trained with our method and less training data, outperform larger +instruction-tuned models and strong distillation baselines. + +
+
+ comment: EMNLP 2024 Findings
+
+
+
+
+ + ♻ ☆ A Systematic Survey and Critical Review on Evaluating Large Language + Models: Challenges, Limitations, and Recommendations EMNLP 2024 + + +
+ Large Language Models (LLMs) have recently gained significant attention due +to their remarkable capabilities in performing diverse tasks across various +domains. However, a thorough evaluation of these models is crucial before +deploying them in real-world applications to ensure they produce reliable +performance. Despite the well-established importance of evaluating LLMs in the +community, the complexity of the evaluation process has led to varied +evaluation setups, causing inconsistencies in findings and interpretations. To +address this, we systematically review the primary challenges and limitations +causing these inconsistencies and unreliable evaluations in various steps of +LLM evaluation. Based on our critical review, we present our perspectives and +recommendations to ensure LLM evaluations are reproducible, reliable, and +robust. + +
+
+ comment: Accepted at EMNLP 2024 (Main Conference) +
+
+
+
+
+ + ♻ ☆ Large Language Models on Graphs: A Comprehensive Survey + + +
+ Large language models (LLMs), such as GPT4 and LLaMA, are creating +significant advancements in natural language processing, due to their strong +text encoding/decoding ability and newly found emergent capability (e.g., +reasoning). While LLMs are mainly designed to process pure texts, there are +many real-world scenarios where text data is associated with rich structure +information in the form of graphs (e.g., academic networks, and e-commerce +networks) or scenarios where graph data is paired with rich textual information +(e.g., molecules with descriptions). Besides, although LLMs have shown their +pure text-based reasoning ability, it is underexplored whether such ability can +be generalized to graphs (i.e., graph-based reasoning). In this paper, we +provide a systematic review of scenarios and techniques related to large +language models on graphs. We first summarize potential scenarios of adopting +LLMs on graphs into three categories, namely pure graphs, text-attributed +graphs, and text-paired graphs. We then discuss detailed techniques for +utilizing LLMs on graphs, including LLM as Predictor, LLM as Encoder, and LLM +as Aligner, and compare the advantages and disadvantages of different schools +of models. Furthermore, we discuss the real-world applications of such methods +and summarize open-source codes and benchmark datasets. Finally, we conclude +with potential future research directions in this fast-growing field. The +related source can be found at +https://github.com/PeterGriffinJin/Awesome-Language-Model-on-Graphs. + +
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ Tradeoffs Between Alignment and Helpfulness in Language Models with + Representation Engineering + + +
+ Language model alignment has become an important component of AI safety, +allowing safe interactions between humans and language models, by enhancing +desired behaviors and inhibiting undesired ones. It is often done by tuning the +model or inserting preset aligning prompts. Recently, representation +engineering, a method which alters the model's behavior via changing its +representations post-training, was shown to be effective in aligning LLMs (Zou +et al., 2023a). Representation engineering yields gains in alignment oriented +tasks such as resistance to adversarial attacks and reduction of social biases, +but was also shown to cause a decrease in the ability of the model to perform +basic tasks. In this paper we study the tradeoff between the increase in +alignment and decrease in helpfulness of the model. We propose a theoretical +framework which provides bounds for these two quantities, and demonstrate their +relevance empirically. First, we find that under the conditions of our +framework, alignment can be guaranteed with representation engineering, and at +the same time that helpfulness is harmed in the process. Second, we show that +helpfulness is harmed quadratically with the norm of the representation +engineering vector, while the alignment increases linearly with it, indicating +a regime in which it is efficient to use representation engineering. We +validate our findings empirically, and chart the boundaries to the usefulness +of representation engineering for alignment. + +
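Read loosely, the reported scaling can be summarized as follows (a paraphrase of the stated trends with assumed constants $a, b > 0$, not the paper's exact bounds): for a representation-engineering vector $v$,

$\Delta_{\text{alignment}}(\|v\|) \approx a\,\|v\|, \qquad \Delta_{\text{helpfulness}}(\|v\|) \approx -\,b\,\|v\|^{2}.$

Under this reading, the net effect $a\|v\| - b\|v\|^{2}$ is positive only for $\|v\| < a/b$, which is one way to interpret the "regime in which it is efficient to use representation engineering."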
+
+
+
+
+ + ♻ ☆ Eliciting In-Context Learning in Vision-Language Models for Videos + Through Curated Data Distributional Properties EMNLP 2024 + + +
+ A major reason behind the recent success of large language models (LLMs) is +their \textit{in-context learning} capability, which makes it possible to +rapidly adapt them to downstream text-based tasks by prompting them with a +small number of relevant demonstrations. While large vision-language models +(VLMs) have recently been developed for tasks requiring both text and images, +they largely lack in-context learning over visual information, especially in +understanding and generating text about videos. In this work, we implement +\textbf{E}mergent \textbf{I}n-context \textbf{Le}arning on \textbf{V}ideos +(\eilev{}), a novel training paradigm that induces in-context learning over +video and text by capturing key properties of pre-training data found by prior +work to be essential for in-context learning in transformers. In our +experiments, we show that \eilev-trained models outperform other off-the-shelf +VLMs in few-shot video narration for novel, rare actions. Furthermore, we +demonstrate that these key properties of bursty distributions, skewed marginal +distributions, and dynamic meaning each contribute to varying degrees to VLMs' +in-context learning capability in narrating procedural videos. Our results, +analysis, and \eilev{}-trained models yield numerous insights about the +emergence of in-context learning over video and text, creating a foundation for +future work to optimize and scale VLMs for open-domain video understanding and +reasoning. Our code and demo are available at +\url{https://github.com/yukw777/EILEV}. + +
+
+ comment: 16 pages, LaTeX; Accepted to EMNLP 2024 Main +
+
+
+
+
+ + ♻ ☆ Does Instruction Tuning Make LLMs More Consistent? + + +
+ The purpose of instruction tuning is enabling zero-shot performance, but +instruction tuning has also been shown to improve chain-of-thought reasoning +and value alignment (Si et al., 2023). Here we consider the impact on +$\textit{consistency}$, i.e., the sensitivity of language models to small +perturbations in the input. We compare 10 instruction-tuned LLaMA models to the +original LLaMA-7b model and show that almost across-the-board they become more +consistent, both in terms of their representations and their predictions in +zero-shot and downstream tasks. We explain these improvements through +mechanistic analyses of factual recall. + +
+
+ comment: We need to run extra experiments to ensure some of the claims in the + paper are fully correct +
+
+
+
+
+ + ♻ ☆ RGD: Multi-LLM Based Agent Debugger via Refinement and Generation + Guidance + + +
+ Large Language Models (LLMs) have shown incredible potential in code
+generation tasks, and recent research in prompt engineering has enhanced LLMs'
+understanding of textual information. However, ensuring the accuracy of
+generated code often requires extensive testing and validation by programmers.
+While LLMs can typically generate code based on task descriptions, their
+accuracy remains limited, especially for complex tasks that require a deeper
+understanding of both the problem statement and the code generation process.
+This limitation is primarily due to the LLMs' need to simultaneously comprehend
+text and generate syntactically and semantically correct code, without having
+the capability to automatically refine the code. In real-world software
+development, programmers rarely produce flawless code in a single attempt based
+on the task description alone; instead, they rely on iterative feedback and
+debugging to refine their programs. Inspired by this process, we introduce a
+novel architecture of LLM-based agents for code generation and automatic
+debugging: Refinement and Guidance Debugging (RGD). The RGD framework is a
+multi-LLM-based agent debugger that leverages three distinct LLM agents: a
+Guide Agent, a Debug Agent, and a Feedback Agent. RGD decomposes the code
+generation task into multiple steps, ensuring a clearer workflow and enabling
+iterative code refinement based on self-reflection and feedback. Experimental
+results demonstrate that RGD exhibits remarkable code generation capabilities,
+achieving state-of-the-art performance with a 9.8% improvement on the HumanEval
+dataset and a 16.2% improvement on the MBPP dataset compared to
+state-of-the-art approaches and traditional direct prompting approaches. We
+highlight the effectiveness of the RGD framework in enhancing LLMs' ability to
+generate and refine code autonomously.
+
&#13;
+
+
+
+
+ + ♻ ☆ Synthetic continued pretraining + + +
+ Pretraining on large-scale, unstructured internet text enables language +models to acquire a significant amount of world knowledge. However, this +knowledge acquisition is data-inefficient--to learn a given fact, models must +be trained on hundreds to thousands of diverse representations of it. This +poses a challenge when adapting a pretrained model to a small corpus of +domain-specific documents, where each fact may appear rarely or only once. We +propose to bridge this gap with synthetic continued pretraining: using the +small domain-specific corpus to synthesize a large corpus more amenable to +learning, and then performing continued pretraining on the synthesized corpus. +We instantiate this proposal with EntiGraph, a synthetic data augmentation +algorithm that extracts salient entities from the source documents and then +generates diverse text by drawing connections between the sampled entities. +Synthetic continued pretraining with EntiGraph enables a language model to +answer questions and follow generic instructions related to the source +documents without access to them. If, instead, the source documents are +available at inference time, we show that the knowledge acquired through our +approach compounds with retrieval-augmented generation. To better understand +these results, we build a simple mathematical model of EntiGraph, and show how +synthetic data augmentation can "rearrange" knowledge to enable more +data-efficient learning. + +
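To make the entity-graph idea concrete, a stripped-down sketch of the augmentation loop (the prompts and the llm completion function are hypothetical placeholders; the paper's actual procedure is more involved):

# Sketch of EntiGraph-style synthetic augmentation: extract salient entities
# from each source document, then have an LLM write new passages connecting
# sampled entity pairs. `llm` is a hypothetical completion function.
import itertools, random

def entigraph_corpus(documents, llm, passages_per_doc=20, seed=0):
    rng = random.Random(seed)
    synthetic = []
    for doc in documents:
        entities = llm(f"List the salient entities in this text, one per line:\n{doc}").splitlines()
        pairs = list(itertools.combinations(entities, 2))
        rng.shuffle(pairs)
        for a, b in pairs[:passages_per_doc]:
            synthetic.append(llm(
                f"Using only the source text below, explain how '{a}' relates to '{b}'.\n"
                f"Source:\n{doc}"))
    return synthetic  # concatenated with the original corpus for continued pretraining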
+
+ comment: Updated organization of experimental results and methods + introduction. Released the dataset and model weights artifact +
+
+
+
+
+ + ♻ ☆ The SIFo Benchmark: Investigating the Sequential Instruction Following + Ability of Large Language Models EMNLP 2024 + + +
+ Following multiple instructions is a crucial ability for large language +models (LLMs). Evaluating this ability comes with significant challenges: (i) +limited coherence between multiple instructions, (ii) positional bias where the +order of instructions affects model performance, and (iii) a lack of +objectively verifiable tasks. To address these issues, we introduce a benchmark +designed to evaluate models' abilities to follow multiple instructions through +sequential instruction following (SIFo) tasks. In SIFo, the successful +completion of multiple instructions is verifiable by examining only the final +instruction. Our benchmark evaluates instruction following using four tasks +(text modification, question answering, mathematics, and security rules), each +assessing different aspects of sequential instruction following. Our evaluation +of popular LLMs, both closed-source and open-source, shows that more recent and +larger models significantly outperform their older and smaller counterparts on +the SIFo tasks, validating the benchmark's effectiveness. All models struggle +with following sequences of instructions, hinting at an important lack of +robustness of today's language models. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Multilingual Synopses of Movie Narratives: A Dataset for Vision-Language + Story Understanding + + +
+ Story video-text alignment, a core task in computational story understanding, +aims to align video clips with corresponding sentences in their descriptions. +However, progress on the task has been held back by the scarcity of manually +annotated video-text correspondence and the heavy concentration on English +narrations of Hollywood movies. To address these issues, in this paper, we +construct a large-scale multilingual video story dataset named Multilingual +Synopses of Movie Narratives (M-SYMON), containing 13,166 movie summary videos +from 7 languages, as well as manual annotation of fine-grained video-text +correspondences for 101.5 hours of video. Training on the human annotated data +from SyMoN outperforms the SOTA methods by 15.7 and 16.2 percentage points on +Clip Accuracy and Sentence IoU scores, respectively, demonstrating the +effectiveness of the annotations. As benchmarks for future research, we create +6 baseline approaches with different multilingual training strategies, compare +their performance in both intra-lingual and cross-lingual setups, exemplifying +the challenges of multilingual video-text alignment. The dataset is released +at: https://github.com/insundaycathy/M-SyMoN + +
+
+ comment: 17 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ The Potential and Challenges of Evaluating Attitudes, Opinions, and + Values in Large Language Models EMNLP 2024 + + +
+ Recent advances in Large Language Models (LLMs) have sparked wide interest in
+validating and comprehending the human-like cognitive-behavioral traits LLMs
+may capture and convey. These cognitive-behavioral traits typically include
+Attitudes, Opinions, and Values (AOVs). However, measuring AOVs embedded within
+LLMs remains opaque, and different evaluation methods may yield different
+results. This has led to a lack of clarity on how different studies are related
+to each other and how they can be interpreted. This paper aims to bridge this
+gap by providing a comprehensive overview of recent works on the evaluation of
+AOVs in LLMs. Moreover, we survey related approaches in different stages of the
+evaluation pipeline in these works. By doing so, we address the potential and
+challenges with respect to understanding the model, human-AI alignment, and
+downstream applications in the social sciences. Finally, we provide practical
+insights into evaluation methods, model enhancement, and interdisciplinary
+collaboration, thereby contributing to the evolving landscape of evaluating
+AOVs in LLMs.
+
&#13;
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Language models and brains align due to more than next-word prediction + and word-level information EMNLP 2024 + + +
+ Pretrained language models have been shown to significantly predict brain +recordings of people comprehending language. Recent work suggests that the +prediction of the next word is a key mechanism that contributes to this +alignment. What is not yet understood is whether prediction of the next word is +necessary for this observed alignment or simply sufficient, and whether there +are other shared mechanisms or information that are similarly important. In +this work, we take a step towards understanding the reasons for brain alignment +via two simple perturbations in popular pretrained language models. These +perturbations help us design contrasts that can control for different types of +information. By contrasting the brain alignment of these differently perturbed +models, we show that improvements in alignment with brain recordings are due to +more than improvements in next-word prediction and word-level information. + +
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Lexically Grounded Subword Segmentation EMNLP + + +
+ We present three innovations in tokenization and subword segmentation. First, +we propose to use unsupervised morphological analysis with Morfessor as +pre-tokenization. Second, we present an algebraic method for obtaining subword +embeddings grounded in a word embedding space. Based on that, we design a novel +subword segmentation algorithm that uses the embeddings, ensuring that the +procedure considers lexical meaning. Third, we introduce an efficient +segmentation algorithm based on a subword bigram model that can be initialized +with the lexically aware segmentation method to avoid using Morfessor and large +embedding tables at inference time. We evaluate the proposed approaches using +two intrinsic metrics and measure their performance on two downstream tasks: +part-of-speech tagging and machine translation. Our experiments show +significant improvements in the morphological plausibility of the segmentation +when evaluated using segmentation precision on morpheme boundaries and improved +R\'enyi efficiency in 8 languages. Although the proposed tokenization methods +do not have a large impact on automatic translation quality, we observe +consistent performance gains in the arguably more morphological task of +part-of-speech tagging. + +
+
+ comment: Camera-ready, EMNLP Main conf +
+
+
+
+
+ + ♻ ☆ Model Internals-based Answer Attribution for Trustworthy + Retrieval-Augmented Generation EMNLP 2024 + + +
+ Ensuring the verifiability of model answers is a fundamental challenge for +retrieval-augmented generation (RAG) in the question answering (QA) domain. +Recently, self-citation prompting was proposed to make large language models +(LLMs) generate citations to supporting documents along with their answers. +However, self-citing LLMs often struggle to match the required format, refer to +non-existent sources, and fail to faithfully reflect LLMs' context usage +throughout the generation. In this work, we present MIRAGE --Model +Internals-based RAG Explanations -- a plug-and-play approach using model +internals for faithful answer attribution in RAG applications. MIRAGE detects +context-sensitive answer tokens and pairs them with retrieved documents +contributing to their prediction via saliency methods. We evaluate our proposed +approach on a multilingual extractive QA dataset, finding high agreement with +human answer attribution. On open-ended QA, MIRAGE achieves citation quality +and efficiency comparable to self-citation while also allowing for a +finer-grained control of attribution parameters. Our qualitative evaluation +highlights the faithfulness of MIRAGE's attributions and underscores the +promising application of model internals for RAG answer attribution. + +
+
+ comment: Accepted by EMNLP 2024 Main Conference. Code and data released at + https://github.com/Betswish/MIRAGE +
+
+
+
+
+ + ♻ ☆ LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via a + Hybrid Architecture + + +
+ Expanding the long-context capabilities of Multi-modal Large Language +Models~(MLLMs) is crucial for video understanding, high-resolution image +understanding, and multi-modal agents. This involves a series of systematic +optimizations, including model architecture, data construction and training +strategy, particularly addressing challenges such as \textit{degraded +performance with more images} and \textit{high computational costs}. In this +paper, we adapt the model architecture to a hybrid of Mamba and Transformer +blocks, approach data construction with both temporal and spatial dependencies +among multiple images and employ a progressive training strategy. The released +model \textbf{LongLLaVA}~(\textbf{Long}-Context \textbf{L}arge +\textbf{L}anguage \textbf{a}nd \textbf{V}ision \textbf{A}ssistant) is the first +hybrid MLLM, which achieved a better balance between efficiency and +effectiveness. LongLLaVA not only achieves competitive results across various +benchmarks, but also maintains high throughput and low memory consumption. +Especially, it could process nearly a thousand images on a single A100 80GB +GPU, showing promising application prospects for a wide range of tasks. + +
+
+ comment: 20 pages, 9 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ PromptWizard: Task-Aware Prompt Optimization Framework + + +
+ Large language models (LLMs) have transformed AI across diverse domains, with
+prompting being central to their success in guiding model outputs. However,
+manual prompt engineering is both labor-intensive and domain-specific,
+necessitating automated solutions. We introduce PromptWizard, a
+novel, fully automated framework for discrete prompt optimization, utilizing a
+self-evolving, self-adapting mechanism. Through a feedback-driven critique and
+synthesis process, PromptWizard achieves an effective balance between
+exploration and exploitation, iteratively refining both prompt instructions and
+in-context examples to generate human-readable, task-specific prompts. This
+guided approach systematically improves prompt quality, resulting in superior
+performance across 45 tasks. PromptWizard excels even with limited training
+data, smaller LLMs, and various LLM architectures. Additionally, our cost
+analysis reveals a substantial reduction in API calls, token usage, and overall
+cost, demonstrating PromptWizard's efficiency, scalability, and advantages over
+existing prompt optimization strategies.
+
&#13;
+
+
+
+
+ + ♻ ☆ On the Adversarial Vulnerability of Pairwise Evaluation Using Large + Language Models + + +
+ Pairwise evaluation using large language models (LLMs) is widely adopted for +evaluating generated outputs. However, the reliability of LLM evaluators is +often compromised by their biased preferences, such as favoring verbosity and +an authoritative tone. In this work, we find that the evaluation setup itself +can significantly amplify these biases, where pairwise evaluators exhibit more +undesirable tendencies than pointwise evaluators. Our analysis further reveals +that even when pairwise evaluators make incorrect judgments, they can still +accurately identify shortcomings in low-quality outputs. As a simple remedy, we +also propose incorporating pointwise reasoning into pairwise evaluation. +Experimental results show that our method improves the performance of pairwise +evaluators on adversarial samples across various models. We hope our findings +encourage further exploration into the reliability of LLM evaluators. + +
+
+
+
+
+ + ♻ ☆ Generalists vs. Specialists: Evaluating Large Language Models for Urdu + + +
+ In this paper, we compare general-purpose models, GPT-4-Turbo and Llama-3-8b,
+with special-purpose models--XLM-Roberta-large, mT5-large, and Llama-3-8b--that
+have been fine-tuned on specific tasks. We focus on seven classification and
+seven generation tasks to evaluate the performance of these models on the Urdu
+language. Urdu has 70 million native speakers, yet it remains underrepresented
+in Natural Language Processing (NLP). Despite the frequent advancements in
+Large Language Models (LLMs), their performance in low-resource languages,
+including Urdu, still needs to be explored. We also conduct a human evaluation
+for the generation tasks and compare the results with the evaluations performed
+by GPT-4-Turbo, Llama-3-8b and Claude 3.5 Sonnet. We find that special-purpose
+models consistently outperform general-purpose models across various tasks. We
+also find that the evaluation done by GPT-4-Turbo for generation tasks aligns
+more closely with human evaluation compared to the evaluation
+done by Llama-3-8b. This paper contributes to the NLP community by providing
+insights into the effectiveness of general-purpose and special-purpose LLMs for
+low-resource languages.
+
&#13;
+
+
+
+
+ + ♻ ☆ Efficient Temporal Extrapolation of Multimodal Large Language Models + with Temporal Grounding Bridge EMNLP 2024 + + +
+ Despite progress in multimodal large language models (MLLMs), the challenge
+of interpreting long-form videos in response to linguistic queries persists,
+largely due to the inefficiency in temporal grounding and limited pre-trained
+context window size. In this work, we introduce Temporal Grounding Bridge
+(TGB), a novel framework that bootstraps MLLMs with advanced temporal grounding
+capabilities and broadens their contextual scope. Our framework significantly
+enhances the temporal capabilities of current MLLMs through three key
+innovations: an efficient multi-span temporal grounding algorithm applied to
+low-dimension temporal features projected from flow; a multimodal length
+extrapolation training paradigm that utilizes low-dimension temporal features
+to extend the training context window size; and a bootstrapping framework that
+bridges our model with pluggable MLLMs without requiring annotation. We
+validate TGB across seven video benchmarks and demonstrate substantial
+performance improvements compared with prior MLLMs. Notably, our model,
+initially trained on sequences of four frames, effectively handles sequences up
+to 16 times longer without sacrificing performance, highlighting its
+scalability and effectiveness in real-world applications. Our code is publicly
+available at https://github.com/bigai-nlco/VideoTGB
+
&#13;
+
+ comment: To appear at EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ WaveletGPT: Wavelets Meet Large Language Models + + +
+ Large Language Models (LLMs) have ushered in a new wave of artificial +intelligence advancements impacting every scientific field and discipline. They +are trained on a simple objective: to predict the next token given the previous +context. We live in a world where most of the data around us, e.g., text, +audio, and music, has a multi-scale structure associated with it. This paper +infuses LLMs with traditional signal processing ideas, namely wavelets, during +pre-training to take advantage of the structure. Without adding \textbf{any +extra parameters} to a GPT-style LLM architecture, we achieve the same +pre-training performance almost twice as fast in text, raw audio, and symbolic +music. This is achieved by imposing a structure on intermediate embeddings. +When trained for the same number of training steps, we achieve significant +gains in performance, which is comparable to pre-training a larger neural +architecture. Our architecture allows every next token prediction access to +intermediate embeddings at different temporal resolutions in every Transformer +decoder block. This work will hopefully pave the way for incorporating +multi-rate signal processing ideas into traditional LLM pre-training. Further, +we showcase pushing model performance by improving internal structure instead +of just going after scale. + +
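One way to picture "imposing a structure on intermediate embeddings" is a causal, Haar-like multi-resolution view of the embedding sequence; the sketch below is only an interpretation of that idea, not the paper's architecture:

# Sketch: causal moving averages of a token-embedding sequence at dyadic
# window sizes (1, 2, 4, ...), a Haar-wavelet-flavoured multi-scale view.
import numpy as np

def causal_multiscale(embeddings, num_scales=4):
    """embeddings: (seq_len, dim) array. Returns one (seq_len, dim) array per
    scale, where scale k averages each position with the previous 2**k - 1."""
    seq_len, _ = embeddings.shape
    outputs = []
    for k in range(num_scales):
        window = 2 ** k
        scale_out = np.empty_like(embeddings)
        for t in range(seq_len):
            start = max(0, t - window + 1)      # causal: only past positions
            scale_out[t] = embeddings[start:t + 1].mean(axis=0)
        outputs.append(scale_out)
    return outputs

coarse_views = causal_multiscale(np.random.randn(16, 8))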
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey of Hallucination in Large Language, Image, Video + and Audio Foundation Models EMNLP 2024 + + +
+ The rapid advancement of foundation models (FMs) across language, image, +audio, and video domains has shown remarkable capabilities in diverse tasks. +However, the proliferation of FMs brings forth a critical challenge: the +potential to generate hallucinated outputs, particularly in high-stakes +applications. The tendency of foundation models to produce hallucinated content +arguably represents the biggest hindrance to their widespread adoption in +real-world scenarios, especially in domains where reliability and accuracy are +paramount. This survey paper presents a comprehensive overview of recent +developments that aim to identify and mitigate the problem of hallucination in +FMs, spanning text, image, video, and audio modalities. By synthesizing recent +advancements in detecting and mitigating hallucination across various +modalities, the paper aims to provide valuable insights for researchers, +developers, and practitioners. Essentially, it establishes a clear framework +encompassing definition, taxonomy, and detection strategies for addressing +hallucination in multimodal foundation models, laying the foundation for future +research in this pivotal area. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ PyramidKV: Dynamic KV Cache Compression based on Pyramidal Information + Funneling + + +
+ In this study, we investigate whether attention-based information flow inside +large language models (LLMs) is aggregated through noticeable patterns for long +context processing. Our observations reveal that LLMs aggregate information +through Pyramidal Information Funneling where attention is scattering widely in +lower layers, progressively consolidating within specific contexts, and +ultimately focusing on critical tokens (a.k.a massive activation or attention +sink) in higher layers. Motivated by these insights, we developed PyramidKV, a +novel and effective KV cache compression method. This approach dynamically +adjusts the KV cache size across different layers, allocating more cache in +lower layers and less in higher ones, diverging from traditional methods that +maintain a uniform KV cache size. Our experimental evaluations, utilizing the +LongBench benchmark, show that PyramidKV matches the performance of models with +a full KV cache while retaining only 12% of the KV cache, thus significantly +reducing memory usage. In scenarios emphasizing memory efficiency, where only +0.7% of the KV cache is maintained, PyramidKV surpasses other KV cache +compression techniques, achieving up to a 20.5 absolute accuracy improvement on +TREC dataset. In the Needle-in-a-Haystack experiment, PyramidKV outperforms +competing methods in maintaining long-context comprehension in LLMs; notably, +retaining just 128 KV cache entries enables the LLAMA-3-70B model to achieve +100% Acc. performance, matching that of a full KV cache. + +
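The core allocation idea, a per-layer cache budget that shrinks with depth while summing to a fixed total, can be sketched as follows (the linear schedule and the per-layer minimum are assumptions; the paper's exact allocation rule may differ):

# Sketch: split a total KV-cache budget across layers so lower layers keep
# more entries and higher layers keep fewer (pyramid-shaped allocation).
def pyramid_budgets(num_layers, total_budget, min_per_layer=8):
    weights = [num_layers - i for i in range(num_layers)]   # linearly decreasing with depth
    scale = (total_budget - min_per_layer * num_layers) / sum(weights)
    budgets = [min_per_layer + int(round(w * scale)) for w in weights]
    budgets[0] += total_budget - sum(budgets)               # absorb rounding drift
    return budgets

print(pyramid_budgets(num_layers=32, total_budget=4096))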
+
+
+
+
+ + ♻ ☆ Self-Constructed Context Decompilation with Fined-grained Alignment + Enhancement EMNLP 2024 + + +
+ Decompilation transforms compiled code back into a high-level programming +language for analysis when source code is unavailable. Previous work has +primarily focused on enhancing decompilation performance by increasing the +scale of model parameters or training data for pre-training. Based on the +characteristics of the decompilation task, we propose two methods: (1) Without +fine-tuning, the Self-Constructed Context Decompilation (sc$^2$dec) method +recompiles the LLM's decompilation results to construct pairs for in-context +learning, helping the model improve decompilation performance. (2) Fine-grained +Alignment Enhancement (FAE), which meticulously aligns assembly code with +source code at the statement level by leveraging debugging information, is +employed during the fine-tuning phase to achieve further improvements in +decompilation. By integrating these two methods, we achieved a Re-Executability +performance improvement of approximately 3.90% on the Decompile-Eval benchmark, +establishing a new state-of-the-art performance of 52.41%. The code, data, and +models are available at https://github.com/AlongWY/sccdec. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Exploring language relations through syntactic distances and geographic + proximity + + +
+ Languages are grouped into families that share common linguistic traits. +While this approach has been successful in understanding genetic relations +between diverse languages, more analyses are needed to accurately quantify +their relatedness, especially in less studied linguistic levels such as syntax. +Here, we explore linguistic distances using series of parts of speech (POS) +extracted from the Universal Dependencies dataset. Within an +information-theoretic framework, we show that employing POS trigrams maximizes +the possibility of capturing syntactic variations while being at the same time +compatible with the amount of available data. Linguistic connections are then +established by assessing pairwise distances based on the POS distributions. +Intriguingly, our analysis reveals definite clusters that correspond to well +known language families and groups, with exceptions explained by distinct +morphological typologies. Furthermore, we obtain a significant correlation +between language similarity and geographic distance, which underscores the +influence of spatial proximity on language kinships. + +
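As a toy version of the distance computation, one can compare POS-trigram frequency distributions with Jensen-Shannon divergence (the divergence choice here is an assumption; the paper works within a broader information-theoretic framework):

# Sketch: pairwise distance between languages from POS-trigram distributions.
import math
from collections import Counter

def trigram_dist(pos_tags):
    grams = Counter(zip(pos_tags, pos_tags[1:], pos_tags[2:]))
    total = sum(grams.values())
    return {g: c / total for g, c in grams.items()}

def js_divergence(p, q):
    keys = set(p) | set(q)
    m = {k: 0.5 * (p.get(k, 0.0) + q.get(k, 0.0)) for k in keys}
    def kl(a, b):
        return sum(a.get(k, 0.0) * math.log(a[k] / b[k]) for k in keys if a.get(k, 0.0) > 0)
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

lang_a = trigram_dist("DET NOUN VERB DET NOUN ADP DET NOUN".split())
lang_b = trigram_dist("NOUN VERB DET NOUN NOUN VERB ADP NOUN".split())
print(js_divergence(lang_a, lang_b))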
+
+ comment: 39 pages +
+
+
+
+
+ + ♻ ☆ A Systematic Analysis of Large Language Models as Soft Reasoners: The + Case of Syllogistic Inferences + + +
+ The reasoning abilities of Large Language Models (LLMs) are becoming a +central focus of study in NLP. In this paper, we consider the case of +syllogistic reasoning, an area of deductive reasoning studied extensively in +logic and cognitive psychology. Previous research has shown that pre-trained +LLMs exhibit reasoning biases, such as $\textit{content effects}$, avoid +answering that $\textit{no conclusion follows}$, display human-like +difficulties, and struggle with multi-step reasoning. We contribute to this +research line by systematically investigating the effects of chain-of-thought +reasoning, in-context learning (ICL), and supervised fine-tuning (SFT) on +syllogistic reasoning, considering syllogisms with conclusions that support or +violate world knowledge, as well as ones with multiple premises. Crucially, we +go beyond the standard focus on accuracy, with an in-depth analysis of the +conclusions generated by the models. Our results suggest that the behavior of +pre-trained LLMs can be explained by heuristics studied in cognitive science +and that both ICL and SFT improve model performance on valid inferences, +although only the latter mitigates most reasoning biases without harming model +consistency. + +
+
+
+
+
+ + ♻ ☆ Optimized Speculative Sampling for GPU Hardware Accelerators EMNLP 2024 + + +
+ In this work, we optimize speculative sampling for parallel hardware +accelerators to improve sampling speed. We notice that substantial portions of +the intermediate matrices necessary for speculative sampling can be computed +concurrently. This allows us to distribute the workload across multiple GPU +threads, enabling simultaneous operations on matrix segments within thread +blocks. This results in profiling time improvements ranging from 6% to 13% +relative to the baseline implementation, without compromising accuracy. To +further accelerate speculative sampling, probability distributions +parameterized by softmax are approximated by sigmoid. This approximation +approach results in significantly greater relative improvements in profiling +time, ranging from 37% to 94%, with a minor decline in accuracy. We conduct +extensive experiments on both automatic speech recognition and summarization +tasks to validate the effectiveness of our optimization methods. + +
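A simplified picture of the softmax-to-sigmoid swap on a single speculative-sampling acceptance test (this only illustrates the approximation; it is not the optimized GPU kernel described in the paper):

# Sketch: speculative-sampling acceptance ratio computed with exact softmax
# versus an element-wise sigmoid approximation of the token probability.
import numpy as np

def softmax_prob(logits, token):
    z = np.exp(logits - logits.max())
    return z[token] / z.sum()

def sigmoid_prob(logits, token):
    # Element-wise approximation: cheaper because it needs no reduction over
    # the vocabulary, at the cost of an unnormalized "probability".
    return 1.0 / (1.0 + np.exp(-logits[token]))

rng = np.random.default_rng(0)
target_logits, draft_logits = rng.normal(size=50), rng.normal(size=50)
token = int(rng.integers(50))
for prob in (softmax_prob, sigmoid_prob):
    accept = min(1.0, prob(target_logits, token) / prob(draft_logits, token))
    print(prob.__name__, round(accept, 3))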
+
+ comment: Accepted at EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ StablePT: Towards Stable Prompting for Few-shot Learning via Input + Separation EMNLP 2024 + + +
+ Large language models have shown their ability to become effective few-shot
+learners with prompting, revolutionizing the paradigm of learning with data
+scarcity. However, this approach largely depends on the quality of prompt
+initialization, and always exhibits large variability among different runs.
+This property makes prompt tuning highly unreliable and vulnerable to poorly
+constructed prompts, which limits its extension to more real-world
+applications. To tackle this issue, we propose to treat the hard prompt and
+soft prompt as separate inputs to mitigate noise brought by the prompt
+initialization. Furthermore, we optimize soft prompts with contrastive learning
+for utilizing class-aware information in the training process to maintain model
+performance. Experimental results demonstrate that StablePT outperforms
+state-of-the-art methods by 6.97% in accuracy and reduces the standard
+deviation by 1.92 on average. Furthermore, extensive experiments underscore its
+robustness and stability across 8 datasets covering various tasks. Codes are
+available at https://github.com/lccc0528/Stable/tree/main.
+
&#13;
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 131 + +
+
+
+ + ☆ Flash-Splat: 3D Reflection Removal with Flash Cues and Gaussian Splats + + +
+ We introduce a simple yet effective approach for separating transmitted and +reflected light. Our key insight is that the powerful novel view synthesis +capabilities provided by modern inverse rendering methods (e.g.,~3D Gaussian +splatting) allow one to perform flash/no-flash reflection separation using +unpaired measurements -- this relaxation dramatically simplifies image +acquisition over conventional paired flash/no-flash reflection separation +methods. Through extensive real-world experiments, we demonstrate our method, +Flash-Splat, accurately reconstructs both transmitted and reflected scenes in +3D. Our method outperforms existing 3D reflection separation methods, which do +not leverage illumination control, by a large margin. Our project webpage is at +https://flash-splat.github.io/. + +
+
+
+
+
+ + ☆ Vinoground: Scrutinizing LMMs over Dense Temporal Reasoning with Short + Videos + + +
+ There has been growing sentiment recently that modern large multimodal models +(LMMs) have addressed most of the key challenges related to short video +comprehension. As a result, both academia and industry are gradually shifting +their attention towards the more complex challenges posed by understanding +long-form videos. However, is this really the case? Our studies indicate that +LMMs still lack many fundamental reasoning capabilities even when dealing with +short videos. We introduce Vinoground, a temporal counterfactual LMM evaluation +benchmark encompassing 1000 short and natural video-caption pairs. We +demonstrate that existing LMMs severely struggle to distinguish temporal +differences between different actions and object transformations. For example, +the best model GPT-4o only obtains ~50% on our text and video scores, showing a +large gap compared to the human baseline of ~90%. All open-source multimodal +models and CLIP-based models perform much worse, producing mostly random chance +performance. Through this work, we shed light onto the fact that temporal +reasoning in short videos is a problem yet to be fully solved. The dataset and +evaluation code are available at https://vinoground.github.io. + +
+
+ comment: Project Page: https://vinoground.github.io +
+
+
+
+
+ + ☆ Interpreting and Editing Vision-Language Representations to Mitigate + Hallucinations + + +
+ We investigate the internal representations of vision-language models (VLMs) +to address hallucinations, a persistent challenge despite advances in model +size and training. We project VLMs' internal image representations to their +language vocabulary and observe more confident output probabilities on real +objects than hallucinated objects. We additionally use these output +probabilities to spatially localize real objects. Building on this approach, we +introduce a knowledge erasure algorithm that removes hallucinations by linearly +orthogonalizing image features with respect to hallucinated object features. We +show that targeted edits to a model's latent representations can reduce +hallucinations by up to 25.7% on the COCO2014 dataset while preserving +performance. Our findings demonstrate how a deeper understanding of VLMs' +latent representations can enhance reliability and enable novel capabilities, +such as zero-shot segmentation. + +
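The "linearly orthogonalizing image features" step amounts to removing the projection of an image feature onto a hallucinated object's feature direction; a minimal sketch with random stand-in vectors in place of the model's internal representations:

# Sketch: subtract the component of an image feature that lies along a
# hallucinated object's feature direction.
import numpy as np

def orthogonalize(image_feature, hallucinated_direction):
    v = hallucinated_direction / np.linalg.norm(hallucinated_direction)
    return image_feature - np.dot(image_feature, v) * v

rng = np.random.default_rng(0)
feat, direction = rng.normal(size=256), rng.normal(size=256)
edited = orthogonalize(feat, direction)
print(np.dot(edited, direction))   # ~0: the hallucinated component is removed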
+
+ comment: Project page and code: http://anishk23733.github.io/vl-interp/ +
+
+
+
+
+ + ☆ FakeShield: Explainable Image Forgery Detection and Localization via + Multi-modal Large Language Models + + +
+ The rapid development of generative AI is a double-edged sword, which not +only facilitates content creation but also makes image manipulation easier and +more difficult to detect. Although current image forgery detection and +localization (IFDL) methods are generally effective, they tend to face two +challenges: \textbf{1)} black-box nature with unknown detection principle, +\textbf{2)} limited generalization across diverse tampering methods (e.g., +Photoshop, DeepFake, AIGC-Editing). To address these issues, we propose the +explainable IFDL task and design FakeShield, a multi-modal framework capable of +evaluating image authenticity, generating tampered region masks, and providing +a judgment basis based on pixel-level and image-level tampering clues. +Additionally, we leverage GPT-4o to enhance existing IFDL datasets, creating +the Multi-Modal Tamper Description dataSet (MMTD-Set) for training FakeShield's +tampering analysis capabilities. Meanwhile, we incorporate a Domain Tag-guided +Explainable Forgery Detection Module (DTE-FDM) and a Multi-modal Forgery +Localization Module (MFLM) to address various types of tamper detection +interpretation and achieve forgery localization guided by detailed textual +descriptions. Extensive experiments demonstrate that FakeShield effectively +detects and localizes various tampering techniques, offering an explainable and +superior solution compared to previous IFDL methods. + +
+
+
+
+
+ + ☆ Loong: Generating Minute-level Long Videos with Autoregressive Language + Models + + +
+ It is desirable but challenging to generate content-rich long videos in the +scale of minutes. Autoregressive large language models (LLMs) have achieved +great success in generating coherent and long sequences of tokens in the domain +of natural language processing, while the exploration of autoregressive LLMs +for video generation is limited to generating short videos of several seconds. +In this work, we conduct a deep analysis of the challenges that prevent +autoregressive LLM-based video generators from generating long videos. Based on +the observations and analysis, we propose Loong, a new autoregressive LLM-based +video generator that can generate minute-long videos. Specifically, we model +the text tokens and video tokens as a unified sequence for autoregressive LLMs +and train the model from scratch. We propose progressive short-to-long training +with a loss re-weighting scheme to mitigate the loss imbalance problem for long +video training. We further investigate inference strategies, including video +token re-encoding and sampling strategies, to diminish error accumulation +during inference. Our proposed Loong can be trained on 10-second videos and be +extended to generate minute-level long videos conditioned on text prompts, as +demonstrated by the results. More samples are available at: +https://epiphqny.github.io/Loong-video. + +
+
+ comment: Project page: https://epiphqny.github.io/Loong-video/ +
+
+
+
+
+ + ☆ Contrastive Localized Language-Image Pre-Training + + +
+ Contrastive Language-Image Pre-training (CLIP) has been a celebrated method +for training vision encoders to generate image/text representations +facilitating various applications. Recently, CLIP has been widely adopted as +the vision backbone of multimodal large language models (MLLMs) to connect +image inputs for language interactions. The success of CLIP as a +vision-language foundation model relies on aligning web-crawled noisy text +annotations at image levels. Nevertheless, such criteria may become +insufficient for downstream tasks in need of fine-grained vision +representations, especially when region-level understanding is demanding for +MLLMs. In this paper, we improve the localization capability of CLIP with +several advances. We propose a pre-training method called Contrastive Localized +Language-Image Pre-training (CLOC) by complementing CLIP with region-text +contrastive loss and modules. We formulate a new concept, promptable +embeddings, of which the encoder produces image embeddings easy to transform +into region representations given spatial hints. To support large-scale +pre-training, we design a visually-enriched and spatially-localized captioning +framework to effectively generate region-text pseudo-labels at scale. By +scaling up to billions of annotated images, CLOC enables high-quality regional +embeddings for image region recognition and retrieval tasks, and can be a +drop-in replacement of CLIP to enhance MLLMs, especially on referring and +grounding tasks. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Revisit Large-Scale Image-Caption Data in Pre-training Multimodal + Foundation Models + + +
+ Recent advancements in multimodal models highlight the value of rewritten +captions for improving performance, yet key challenges remain. For example, +while synthetic captions often provide superior quality and image-text +alignment, it is not clear whether they can fully replace AltTexts: the role of +synthetic captions and their interaction with original web-crawled AltTexts in +pre-training is still not well understood. Moreover, different multimodal +foundation models may have unique preferences for specific caption formats, but +efforts to identify the optimal captions for each model remain limited. In this +work, we propose a novel, controllable, and scalable captioning pipeline +designed to generate diverse caption formats tailored to various multimodal +models. By examining Short Synthetic Captions (SSC) towards Dense Synthetic +Captions (DSC+) as case studies, we systematically explore their effects and +interactions with AltTexts across models such as CLIP, multimodal LLMs, and +diffusion models. Our findings reveal that a hybrid approach that keeps both +synthetic captions and AltTexts can outperform the use of synthetic captions +alone, improving both alignment and performance, with each model demonstrating +preferences for particular caption formats. This comprehensive analysis +provides valuable insights into optimizing captioning strategies, thereby +advancing the pre-training of multimodal foundation models. + +
+
+ comment: CV/ML +
+
+
+
+
+ + ☆ DivScene: Benchmarking LVLMs for Object Navigation with Diverse Scenes + and Objects + + +
+ Object navigation in unknown environments is crucial for deploying embodied +agents in real-world applications. While we have witnessed huge progress due to +large-scale scene datasets, faster simulators, and stronger models, previous +studies mainly focus on limited scene types and target objects. In this paper, +we study a new task of navigating to diverse target objects in a large number +of scene types. To benchmark the problem, we present a large-scale scene +dataset, DivScene, which contains 4,614 scenes across 81 different types. With +the dataset, we build an end-to-end embodied agent, NatVLM, by fine-tuning a +Large Vision Language Model (LVLM) through imitation learning. The LVLM is +trained to take previous observations from the environment and generate the +next actions. We also introduce CoT explanation traces of the action prediction +for better performance when tuning LVLMs. Our extensive experiments find that +we can build a performant LVLM-based agent through imitation learning on the +shortest paths constructed by a BFS planner without any human supervision. Our +agent achieves a success rate that surpasses GPT-4o by over 20%. Meanwhile, we +carry out various analyses showing the generalization ability of our agent. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ Curvature Diversity-Driven Deformation and Domain Alignment for Point + Cloud + + +
+ Unsupervised Domain Adaptation (UDA) is crucial for reducing the need for +extensive manual data annotation when training deep networks on point cloud +data. A significant challenge of UDA lies in effectively bridging the domain +gap. To tackle this challenge, we propose \textbf{C}urvature +\textbf{D}iversity-Driven \textbf{N}uclear-Norm Wasserstein \textbf{D}omain +Alignment (CDND). Our approach first introduces a \textit{\textbf{Curv}ature +Diversity-driven Deformation \textbf{Rec}onstruction (CurvRec)} task, which +effectively mitigates the gap between the source and target domains by enabling +the model to extract salient features from semantically rich regions of a given +point cloud. We then propose \textit{\textbf{D}eformation-based +\textbf{N}uclear-norm \textbf{W}asserstein \textbf{D}iscrepancy (D-NWD)}, which +applies the Nuclear-norm Wasserstein Discrepancy to both \textit{deformed and +original} data samples to align the source and target domains. Furthermore, we +contribute a theoretical justification for the effectiveness of D-NWD in +distribution alignment and demonstrate that it is \textit{generic} enough to be +applied to \textbf{any} deformations. To validate our method, we conduct +extensive experiments on two public domain adaptation datasets for point cloud +classification and segmentation tasks. Empirical experiment results show that +our CDND achieves state-of-the-art performance by a noticeable margin over +existing approaches. + +
+
+
+
+
+ + ☆ AlzhiNet: Traversing from 2DCNN to 3DCNN, Towards Early Detection and + Diagnosis of Alzheimer's Disease + + +
+ Alzheimer's disease (AD) is a progressive neurodegenerative disorder with +increasing prevalence among the aging population, necessitating early and +accurate diagnosis for effective disease management. In this study, we present +a novel hybrid deep learning framework that integrates both 2D Convolutional +Neural Networks (2D-CNN) and 3D Convolutional Neural Networks (3D-CNN), along +with a custom loss function and volumetric data augmentation, to enhance +feature extraction and improve classification performance in AD diagnosis. +According to extensive experiments, AlzhiNet outperforms standalone 2D and 3D +models, highlighting the importance of combining these complementary +representations of data. The depth and quality of 3D volumes derived from the +augmented 2D slices also significantly influence the model's performance. The +results indicate that carefully selecting weighting factors in hybrid +predictions is imperative for achieving optimal results. Our framework has been +validated on the Magnetic Resonance Imaging (MRI) from Kaggle and MIRIAD +datasets, obtaining accuracies of 98.9% and 99.99%, respectively, with an AUC +of 100%. Furthermore, AlzhiNet was studied under a variety of perturbation +scenarios on the Alzheimer's Kaggle dataset, including Gaussian noise, +brightness, contrast, salt and pepper noise, color jitter, and occlusion. The +results obtained show that AlzhiNet is more robust to perturbations than +ResNet-18, making it an excellent choice for real-world applications. This +approach represents a promising advancement in the early diagnosis and +treatment planning for Alzheimer's disease. + +
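The point about "carefully selecting weighting factors in hybrid predictions" can be illustrated with a convex combination of the two branches' class probabilities (the additive weighting below is an assumption for illustration, not necessarily the paper's exact fusion rule):

# Sketch: fuse class probabilities from a 2D-CNN branch and a 3D-CNN branch
# with a tunable weight alpha, then pick the class with the highest score.
import numpy as np

def hybrid_predict(probs_2d, probs_3d, alpha=0.6):
    """probs_*: (num_classes,) probability vectors; alpha weights the 3D branch."""
    fused = alpha * probs_3d + (1.0 - alpha) * probs_2d
    return int(np.argmax(fused)), fused

p2d = np.array([0.70, 0.20, 0.10])   # 2D branch favours class 0
p3d = np.array([0.30, 0.60, 0.10])   # 3D branch favours class 1
print(hybrid_predict(p2d, p3d, alpha=0.6))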
+
+
+
+
+ + ☆ Video Instruction Tuning With Synthetic Data + + +
+ The development of video large multimodal models (LMMs) has been hindered by +the difficulty of curating large amounts of high-quality raw data from the web. +To address this, we propose an alternative approach by creating a high-quality +synthetic dataset specifically for video instruction-following, namely +LLaVA-Video-178K. This dataset includes key tasks such as detailed captioning, +open-ended question-answering (QA), and multiple-choice QA. By training on this +dataset, in combination with existing visual instruction tuning data, we +introduce LLaVA-Video, a new video LMM. Our experiments demonstrate that +LLaVA-Video achieves strong performance across various video benchmarks, +highlighting the effectiveness of our dataset. We plan to release the dataset, +its generation pipeline, and the model checkpoints. + +
+
+ comment: Project page: https://llava-vl.github.io/blog/2024-09-30-llava-video/ +
+
+
+
+
+ + ☆ LLaVA-Critic: Learning to Evaluate Multimodal Models + + +
+ We introduce LLaVA-Critic, the first open-source large multimodal model (LMM) +designed as a generalist evaluator to assess performance across a wide range of +multimodal tasks. LLaVA-Critic is trained using a high-quality critic +instruction-following dataset that incorporates diverse evaluation criteria and +scenarios. Our experiments demonstrate the model's effectiveness in two key +areas: (1) LMM-as-a-Judge, where LLaVA-Critic provides reliable evaluation +scores, performing on par with or surpassing GPT models on multiple evaluation +benchmarks; and (2) Preference Learning, where it generates reward signals for +preference learning, enhancing model alignment capabilities. This work +underscores the potential of open-source LMMs in self-critique and evaluation, +setting the stage for future research into scalable, superhuman alignment +feedback mechanisms for LMMs. + +
+
+ comment: Project Page: https://llava-vl.github.io/blog/2024-10-03-llava-critic +
+
+
+
+
+ + ☆ SteerDiff: Steering towards Safe Text-to-Image Diffusion Models + + +
+ Text-to-image (T2I) diffusion models have drawn attention for their ability
+to generate high-quality images with precise text alignment. However, these
+models can also be misused to produce inappropriate content. Existing safety
+measures, which typically rely on text classifiers or ControlNet-like
+approaches, are often insufficient. Traditional text classifiers rely on
+large-scale labeled datasets and can be easily bypassed by rephrasing. As
+diffusion models continue to scale, fine-tuning these safeguards becomes
+increasingly challenging and lacks flexibility. Recent red-teaming attack
+research further underscores the need for a new paradigm to prevent the
+generation of inappropriate content. In this paper, we introduce SteerDiff, a
+lightweight adaptor module designed to act as an intermediary between user
+input and the diffusion model, ensuring that generated images adhere to ethical
+and safety standards with little to no impact on usability. SteerDiff
+identifies and manipulates inappropriate concepts within the text embedding
+space to guide the model away from harmful outputs. We conduct extensive
+experiments across various concept unlearning tasks to evaluate the
+effectiveness of our approach. Furthermore, we benchmark SteerDiff against
+multiple red-teaming strategies to assess its robustness. Finally, we explore
+the potential of SteerDiff for concept forgetting tasks, demonstrating its
+versatility in text-conditioned image generation.
+
&#13;
+
+
+
+
+ + ☆ ControlAR: Controllable Image Generation with Autoregressive Models + + +
+ Autoregressive (AR) models have reformulated image generation as next-token +prediction, demonstrating remarkable potential and emerging as strong +competitors to diffusion models. However, control-to-image generation, akin to +ControlNet, remains largely unexplored within AR models. Although a natural +approach, inspired by advancements in Large Language Models, is to tokenize +control images into tokens and prefill them into the autoregressive model +before decoding image tokens, it still falls short in generation quality +compared to ControlNet and suffers from inefficiency. To this end, we introduce +ControlAR, an efficient and effective framework for integrating spatial +controls into autoregressive image generation models. Firstly, we explore +control encoding for AR models and propose a lightweight control encoder to +transform spatial inputs (e.g., canny edges or depth maps) into control tokens. +Then ControlAR exploits the conditional decoding method to generate the next +image token conditioned on the per-token fusion between control and image +tokens, similar to positional encodings. Compared to prefilling tokens, using +conditional decoding significantly strengthens the control capability of AR +models but also maintains the model's efficiency. Furthermore, the proposed +ControlAR surprisingly empowers AR models with arbitrary-resolution image +generation via conditional decoding and specific controls. Extensive +experiments can demonstrate the controllability of the proposed ControlAR for +the autoregressive control-to-image generation across diverse inputs, including +edges, depths, and segmentation masks. Furthermore, both quantitative and +qualitative results indicate that ControlAR surpasses previous state-of-the-art +controllable diffusion models, e.g., ControlNet++. Code, models, and demo will +soon be available at https://github.com/hustvl/ControlAR. + +
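The "per-token fusion between control and image tokens" can be pictured as merging the aligned control-token embedding into each image-token embedding before predicting the next token, analogous to injecting positional encodings; the sketch below uses plain addition and random matrices as placeholders, which is an assumption rather than the paper's exact operator:

# Sketch: conditional decoding where each image-token embedding is fused with
# the control-token embedding at the same position before predicting the next
# token. The output head is a stand-in random linear layer.
import numpy as np

rng = np.random.default_rng(0)
seq_len, dim, vocab = 16, 64, 1024
image_tokens = rng.normal(size=(seq_len, dim))      # embeddings of generated image tokens
control_tokens = rng.normal(size=(seq_len, dim))    # embeddings from the control encoder
output_head = rng.normal(size=(dim, vocab))

fused = image_tokens + control_tokens                # per-token fusion (assumed: addition)
logits = fused @ output_head                         # next-token logits per position
next_token = int(np.argmax(logits[-1]))
print(next_token)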
+
+ comment: Preprint. Work in progress +
+
+
+
+
+ + ☆ Lie Algebra Canonicalization: Equivariant Neural Operators under + arbitrary Lie Groups + + +
+ The quest for robust and generalizable machine learning models has driven +recent interest in exploiting symmetries through equivariant neural networks. +In the context of PDE solvers, recent works have shown that Lie point +symmetries can be a useful inductive bias for Physics-Informed Neural Networks +(PINNs) through data and loss augmentation. Despite this, directly enforcing +equivariance within the model architecture for these problems remains elusive. +This is because many PDEs admit non-compact symmetry groups, oftentimes not +studied beyond their infinitesimal generators, making them incompatible with +most existing equivariant architectures. In this work, we propose Lie aLgebrA +Canonicalization (LieLAC), a novel approach that exploits only the action of +infinitesimal generators of the symmetry group, circumventing the need for +knowledge of the full group structure. To achieve this, we address existing +theoretical issues in the canonicalization literature, establishing connections +with frame averaging in the case of continuous non-compact groups. Operating +within the framework of canonicalization, LieLAC can easily be integrated with +unconstrained pre-trained models, transforming inputs to a canonical form +before feeding them into the existing model, effectively aligning the input for +model inference according to allowed symmetries. LieLAC utilizes standard Lie +group descent schemes, achieving equivariance in pre-trained models. Finally, +we showcase LieLAC's efficacy on tasks of invariant image classification and +Lie point symmetry equivariant neural PDE solvers using pre-trained models. + +
+
+ comment: 40 pages; preprint +
+
+
+
+
+ + ☆ Unsupervised Point Cloud Completion through Unbalanced Optimal Transport + + +
+ Unpaired point cloud completion explores methods for learning a completion +map from unpaired incomplete and complete point cloud data. In this paper, we +propose a novel approach for unpaired point cloud completion using the +unbalanced optimal transport map, called Unbalanced Optimal Transport Map for +Unpaired Point Cloud Completion (UOT-UPC). We demonstrate that the unpaired +point cloud completion can be naturally interpreted as the Optimal Transport +(OT) problem and introduce the Unbalanced Optimal Transport (UOT) approach to +address the class imbalance problem, which is prevalent in unpaired point cloud +completion datasets. Moreover, we analyze the appropriate cost function for +unpaired completion tasks. This analysis shows that the InfoCD cost function is +particularly well-suited for this task. Our model is the first attempt to +leverage UOT for unpaired point cloud completion, achieving competitive or +superior results on both single-category and multi-category datasets. In +particular, our model is especially effective in scenarios with class +imbalance, where the proportions of categories are different between the +incomplete and complete point cloud datasets. + +
+
+ comment: 20 pages, 10 figures +
+
+
+
+
+ + ☆ Measuring and Improving Persuasiveness of Generative Models + + +
+ LLMs are increasingly being used in workflows involving generating content to +be consumed by humans (e.g., marketing) and also in directly interacting with +humans (e.g., through chatbots). The development of such systems that are +capable of generating verifiably persuasive messages presents both +opportunities and challenges for society. On the one hand, such systems could +positively impact domains like advertising and social good, such as addressing +drug addiction, and on the other, they could be misused for spreading +misinformation and shaping political opinions. To channel LLMs' impact on +society, we need to develop systems to measure and benchmark their +persuasiveness. With this motivation, we introduce PersuasionBench and +PersuasionArena, the first large-scale benchmark and arena containing a battery +of tasks to measure the persuasion ability of generative models automatically. +We investigate to what extent LLMs know and leverage linguistic patterns that +can help them generate more persuasive language. Our findings indicate that the +persuasiveness of LLMs correlates positively with model size, but smaller +models can also be made to have a higher persuasiveness than much larger +models. Notably, targeted training using synthetic and natural datasets +significantly enhances smaller models' persuasive capabilities, challenging +scale-dependent assumptions. Our findings carry key implications for both model +developers and policymakers. For instance, while the EU AI Act and California's +SB-1047 aim to regulate AI models based on the number of floating point +operations, we demonstrate that simple metrics like this alone fail to capture +the full scope of AI's societal impact. We invite the community to explore and +contribute to PersuasionArena and PersuasionBench, available at +https://bit.ly/measure-persuasion, to advance our understanding of AI-driven +persuasion and its societal implications. + +
+
+
+
+
+ + ☆ Learning 3D Perception from Others' Predictions + + +
+ Accurate 3D object detection in real-world environments requires a huge +amount of annotated data with high quality. Acquiring such data is tedious and +expensive, and often needs repeated effort when a new sensor is adopted or when +the detector is deployed in a new environment. We investigate a new scenario to +construct 3D object detectors: learning from the predictions of a nearby unit +that is equipped with an accurate detector. For example, when a self-driving +car enters a new area, it may learn from other traffic participants whose +detectors have been optimized for that area. This setting is label-efficient, +sensor-agnostic, and communication-efficient: nearby units only need to share +the predictions with the ego agent (e.g., car). Naively using the received +predictions as ground-truths to train the detector for the ego car, however, +leads to inferior performance. We systematically study the problem and identify +viewpoint mismatches and mislocalization (due to synchronization and GPS +errors) as the main causes, which unavoidably result in false positives, false +negatives, and inaccurate pseudo labels. We propose a distance-based +curriculum, first learning from closer units with similar viewpoints and +subsequently improving the quality of other units' predictions via +self-training. We further demonstrate that an effective pseudo label refinement +module can be trained with a handful of annotated data, largely reducing the +data quantity necessary to train an object detector. We validate our approach +on the recently released real-world collaborative driving dataset, using +reference cars' predictions as pseudo labels for the ego car. Extensive +experiments including several scenarios (e.g., different sensors, detectors, +and domains) demonstrate the effectiveness of our approach toward +label-efficient learning of 3D perception from other units' predictions. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Why Sample Space Matters: Keyframe Sampling Optimization for LiDAR-based + Place Recognition + + +
+ Recent advances in robotics are pushing real-world autonomy, enabling robots +to perform long-term and large-scale missions. A crucial component for +successful missions is the incorporation of loop closures through place +recognition, which effectively mitigates accumulated pose estimation drift. +Despite computational advancements, optimizing performance for real-time +deployment remains challenging, especially in resource-constrained mobile +robots and multi-robot systems, since conventional keyframe sampling practices +in place recognition often result in retaining redundant information or +overlooking relevant data, as they rely on fixed sampling intervals or work +directly in the 3D space instead of the feature space. To address these +concerns, we introduce the concept of sample space in place recognition and +demonstrate how different sampling techniques affect the query process and +overall performance. We then present a novel keyframe sampling approach for +LiDAR-based place recognition, which focuses on redundancy minimization and +information preservation in the hyper-dimensional descriptor space. This +approach is applicable to both learning-based and handcrafted descriptors, and +through experimental validation across multiple datasets and descriptor +frameworks, we demonstrate the effectiveness of our proposed method, showing it +can jointly minimize redundancy and preserve essential information in +real-time. The proposed approach maintains robust performance across various +datasets without requiring parameter tuning, contributing to more efficient and +reliable place recognition for a wide range of robotic applications. + +
+
+ comment: 20 pages, 15 figures. Submitted +
+
+
+
+
+ + ☆ Diffusion-based Extreme Image Compression with Compressed Feature + Initialization + + +
+ Diffusion-based extreme image compression methods have achieved impressive +performance at extremely low bitrates. However, constrained by the iterative +denoising process that starts from pure noise, these methods are limited in +both fidelity and efficiency. To address these two issues, we present Relay +Residual Diffusion Extreme Image Compression (RDEIC), which leverages +compressed feature initialization and residual diffusion. Specifically, we +first use the compressed latent features of the image with added noise, instead +of pure noise, as the starting point to eliminate the unnecessary initial +stages of the denoising process. Second, we design a novel relay residual +diffusion that reconstructs the raw image by iteratively removing the added +noise and the residual between the compressed and target latent features. +Notably, our relay residual diffusion network seamlessly integrates pre-trained +stable diffusion to leverage its robust generative capability for high-quality +reconstruction. Third, we propose a fixed-step fine-tuning strategy to +eliminate the discrepancy between the training and inference phases, further +improving the reconstruction quality. Extensive experiments demonstrate that +the proposed RDEIC achieves state-of-the-art visual quality and outperforms +existing diffusion-based extreme image compression methods in both fidelity and +efficiency. The source code will be provided in +https://github.com/huai-chang/RDEIC. + +
+
+
+
+
+ + ☆ Spatial-Temporal Multi-Cuts for Online Multiple-Camera Vehicle Tracking + + +
+ Accurate online multiple-camera vehicle tracking is essential for intelligent +transportation systems, autonomous driving, and smart city applications. Like +single-camera multiple-object tracking, it is commonly formulated as a graph +problem of tracking-by-detection. Within this framework, existing online +methods usually consist of two-stage procedures that cluster temporally first, +then spatially, or vice versa. This is computationally expensive and prone to +error accumulation. We introduce a graph representation that allows +spatial-temporal clustering in a single, combined step: New detections are +spatially and temporally connected with existing clusters. By keeping sparse +appearance and positional cues of all detections in a cluster, our method can +compare clusters based on the strongest available evidence. The final tracks +are obtained online using a simple multicut assignment procedure. Our method +does not require any training on the target scene, pre-extraction of +single-camera tracks, or additional annotations. Notably, we outperform the +online state-of-the-art on the CityFlow dataset in terms of IDF1 by more than +14%, and on the Synthehicle dataset by more than 25%, respectively. The code is +publicly available. + +
+
+
+
+
+ + ☆ Plots Unlock Time-Series Understanding in Multimodal Models + + +
+ While multimodal foundation models can now natively work with data beyond +text, they remain underutilized in analyzing the considerable amounts of +multi-dimensional time-series data in fields like healthcare, finance, and +social sciences, representing a missed opportunity for richer, data-driven +insights. This paper proposes a simple but effective method that leverages the +existing vision encoders of these models to "see" time-series data via plots, +avoiding the need for additional, potentially costly, model training. Our +empirical evaluations show that this approach outperforms providing the raw +time-series data as text, with the additional benefit that visual time-series +representations demonstrate up to a 90% reduction in model API costs. We +validate our hypothesis through synthetic data tasks of increasing complexity, +progressing from simple functional form identification on clean data, to +extracting trends from noisy scatter plots. To demonstrate generalizability +from synthetic tasks with clear reasoning steps to more complex, real-world +scenarios, we apply our approach to consumer health tasks - specifically fall +detection, activity recognition, and readiness assessment - which involve +heterogeneous, noisy data and multi-step reasoning. The overall advantage of +plot performance over text performance (up to a 120% performance increase on +zero-shot synthetic tasks, and up to a 150% performance increase on real-world +tasks), across both GPT and Gemini model families, highlights our approach's +potential for making the best use of the native capabilities of foundation +models. + +
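+ The basic recipe this describes can be sketched in a few lines: render the series as a plot, encode it as an image, and attach it to the multimodal prompt instead of pasting raw numbers as text. The snippet below is an illustrative assumption about the pipeline, not the paper's code; the downstream API call is deliberately left as a comment.
+
+```python
+# Render a time series as a line plot and return it as base64 PNG bytes,
+# ready to attach to a multimodal model request.
+import io
+import base64
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import numpy as np
+
+def series_to_png_b64(t: np.ndarray, x: np.ndarray) -> str:
+    fig, ax = plt.subplots(figsize=(6, 3))
+    ax.plot(t, x, linewidth=1.0)
+    ax.set_xlabel("time")
+    ax.set_ylabel("value")
+    buf = io.BytesIO()
+    fig.savefig(buf, format="png", dpi=150)
+    plt.close(fig)
+    return base64.b64encode(buf.getvalue()).decode()
+
+t = np.linspace(0, 10, 500)
+x = np.sin(2 * np.pi * 0.5 * t) + 0.1 * np.random.randn(t.size)
+image_b64 = series_to_png_b64(t, x)
+# image_b64 is then attached to the prompt, e.g. "What trend does this signal show?"
+```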
+
+ comment: 49 pages +
+
+
+
+
+ + ☆ Metrics Revolutions: Groundbreaking Insights into the Implementation of + Metrics for Biomedical Image Segmentation + + +
+ The evaluation of segmentation performance is a common task in biomedical +image analysis, with its importance emphasized in the recently released metrics +selection guidelines and computing frameworks. To quantitatively evaluate the +alignment of two segmentations, researchers commonly resort to counting +metrics, such as the Dice similarity coefficient, or distance-based metrics, +such as the Hausdorff distance, which are usually computed by publicly +available open-source tools with an inherent assumption that these tools +provide consistent results. In this study we questioned this assumption, and +performed a systematic implementation analysis along with quantitative +experiments on real-world clinical data to compare 11 open-source tools for +distance-based metrics computation against our highly accurate mesh-based +reference implementation. The results revealed statistically significant +differences among all open-source tools, a finding that is both surprising and +concerning, since it calls into question the validity of existing studies. +Besides identifying the main sources of variation, we also provide +recommendations for distance-based metrics computation. + +
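+ For readers unfamiliar with the metric at the center of this comparison, the example below computes the symmetric Hausdorff distance between two boundary point sets with one of the many open-source options (SciPy). It is purely illustrative, and the study's point is precisely that different tools can return different values for such distance-based metrics.
+
+```python
+# Symmetric Hausdorff distance between two masks' boundary point sets (SciPy).
+import numpy as np
+from scipy.spatial.distance import directed_hausdorff
+
+a = np.argwhere(np.random.rand(64, 64) > 0.9)   # boundary/surface points of mask A
+b = np.argwhere(np.random.rand(64, 64) > 0.9)   # boundary/surface points of mask B
+
+hd = max(directed_hausdorff(a, b)[0], directed_hausdorff(b, a)[0])
+print(f"symmetric Hausdorff distance: {hd:.2f} pixels")
+```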
+
+
+
+
+ + ☆ GI-GS: Global Illumination Decomposition on Gaussian Splatting for + Inverse Rendering + + +
+ We present GI-GS, a novel inverse rendering framework that leverages 3D +Gaussian Splatting (3DGS) and deferred shading to achieve photo-realistic novel +view synthesis and relighting. In inverse rendering, accurately modeling the +shading processes of objects is essential for achieving high-fidelity results. +Therefore, it is critical to incorporate global illumination to account for +indirect lighting that reaches an object after multiple bounces across the +scene. Previous 3DGS-based methods have attempted to model indirect lighting by +characterizing indirect illumination as learnable lighting volumes or +additional attributes of each Gaussian, while using baked occlusion to +represent shadow effects. These methods, however, fail to accurately model the +complex physical interactions between light and objects, making it impossible +to construct realistic indirect illumination during relighting. To address this +limitation, we propose to calculate indirect lighting using efficient path +tracing with deferred shading. In our framework, we first render a G-buffer to +capture the detailed geometry and material properties of the scene. Then, we +perform physically-based rendering (PBR) only for direct lighting. With the +G-buffer and previous rendering results, the indirect lighting can be +calculated through a lightweight path tracing. Our method effectively models +indirect lighting under any given lighting conditions, thereby achieving better +novel view synthesis and relighting. Quantitative and qualitative results show +that our GI-GS outperforms existing baselines in both rendering quality and +efficiency. + +
+
+
+
+
+ + ☆ NL-Eye: Abductive NLI for Images + + +
+ Will a Visual Language Model (VLM)-based bot warn us about slipping if it +detects a wet floor? Recent VLMs have demonstrated impressive capabilities, yet +their ability to infer outcomes and causes remains underexplored. To address +this, we introduce NL-Eye, a benchmark designed to assess VLMs' visual +abductive reasoning skills. NL-Eye adapts the abductive Natural Language +Inference (NLI) task to the visual domain, requiring models to evaluate the +plausibility of hypothesis images based on a premise image and explain their +decisions. NL-Eye consists of 350 carefully curated triplet examples (1,050 +images) spanning diverse reasoning categories: physical, functional, logical, +emotional, cultural, and social. The data curation process involved two steps - +writing textual descriptions and generating images using text-to-image models, +both requiring substantial human involvement to ensure high-quality and +challenging scenes. Our experiments show that VLMs struggle significantly on +NL-Eye, often performing at random baseline levels, while humans excel in both +plausibility prediction and explanation quality. This demonstrates a deficiency +in the abductive reasoning capabilities of modern VLMs. NL-Eye represents a +crucial step toward developing VLMs capable of robust multimodal reasoning for +real-world applications, including accident-prevention bots and generated video +verification. + +
+
+
+
+
+ + ☆ High-Efficiency Neural Video Compression via Hierarchical Predictive + Learning + + +
+ We introduce DHVC 2.0, an enhanced Deep Hierarchical Video Compression codec. +This single-model neural video codec operates across a broad range of bitrates, +delivering not only superior compression performance to representative methods +but also impressive complexity efficiency, enabling real-time processing with a +significantly smaller memory footprint on standard GPUs. These remarkable +advancements stem from the use of hierarchical predictive coding. Each video +frame is uniformly transformed into multiscale representations through +hierarchical variational autoencoders. For a specific scale's feature +representation of a frame, its corresponding latent residual variables are +generated by referencing lower-scale spatial features from the same frame and +then conditionally entropy-encoded using a probabilistic model whose parameters +are predicted using same-scale temporal reference from previous frames and +lower-scale spatial reference of the current frame. This feature-space +processing operates from the lowest to the highest scale of each frame, +completely eliminating the need for the complexity-intensive motion estimation +and compensation techniques that have been standard in video codecs for +decades. The hierarchical approach facilitates parallel processing, +accelerating both encoding and decoding, and supports transmission-friendly +progressive decoding, making it particularly advantageous for networked video +applications in the presence of packet loss. Source codes will be made +available. + +
+
+
+
+
+ + ☆ IC3M: In-Car Multimodal Multi-object Monitoring for Abnormal Status of + Both Driver and Passengers + + +
+ Recently, in-car monitoring has emerged as a promising technology for +detecting early-stage abnormal status of the driver and providing timely alerts +to prevent traffic accidents. Although training models with multimodal data +enhances the reliability of abnormal status detection, the scarcity of labeled +data and the imbalance of class distribution impede the extraction of critical +abnormal state features, significantly deteriorating training performance. +Furthermore, missing modalities due to environment and hardware limitations +further exacerbate the challenge of abnormal status identification. More +importantly, monitoring abnormal health conditions of passengers, particularly +in elderly care, is of paramount importance but remains underexplored. To +address these challenges, we introduce our IC3M, an efficient +camera-rotation-based multimodal framework for monitoring both driver and +passengers in a car. Our IC3M comprises two key modules: an adaptive threshold +pseudo-labeling strategy and a missing modality reconstruction. The former +customizes pseudo-labeling thresholds for different classes based on the class +distribution, generating class-balanced pseudo labels to guide model training +effectively, while the latter leverages crossmodality relationships learned +from limited labels to accurately recover missing modalities by distribution +transferring from available modalities. Extensive experimental results +demonstrate that IC3M outperforms state-of-the-art benchmarks in accuracy, +precision, and recall while exhibiting superior robustness under limited +labeled data and severe missing modality. + +
+
+ comment: 16 pages, 17 figures +
+
+
+
+
+ + ☆ An Improved Variational Method for Image Denoising + + +
+ The total variation (TV) method is an image denoising technique that aims to +reduce noise by minimizing the total variation of the image, which measures the +variation in pixel intensities. The TV method has been widely applied in image +processing and computer vision for its ability to preserve edges and enhance +image quality. In this paper, we propose an improved TV model for image +denoising and the associated numerical algorithm to carry out the procedure, +which is particularly effective in removing several types of noise and their +combinations. Our improved model admits a unique solution and the associated +numerical algorithm guarantees convergence. Numerical experiments demonstrate +improved effectiveness and denoising quality compared to other TV models. Such +encouraging results further enhance the utility of the TV method in image +processing. + +
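+ For context, the classical ROF-style baseline that this work improves upon can be sketched as gradient descent on a smoothed TV energy, E(u) = 0.5||u - f||^2 + lambda * sum sqrt(|grad u|^2 + eps). The snippet below is that baseline only, not the paper's improved model or algorithm.
+
+```python
+# Baseline smoothed-TV (ROF-style) denoiser via plain gradient descent.
+import numpy as np
+
+def tv_denoise(f: np.ndarray, lam: float = 0.1, step: float = 0.2,
+               iters: int = 200, eps: float = 1e-6) -> np.ndarray:
+    u = f.copy()
+    for _ in range(iters):
+        ux = np.diff(u, axis=1, append=u[:, -1:])        # forward differences
+        uy = np.diff(u, axis=0, append=u[-1:, :])
+        mag = np.sqrt(ux**2 + uy**2 + eps)
+        px, py = ux / mag, uy / mag
+        # divergence via backward differences (wrap-around boundaries, sketch only)
+        div = (px - np.roll(px, 1, axis=1)) + (py - np.roll(py, 1, axis=0))
+        u -= step * ((u - f) - lam * div)                # gradient descent step
+    return u
+
+noisy = np.clip(np.eye(64) + 0.2 * np.random.randn(64, 64), 0, 1)
+clean = tv_denoise(noisy)
+```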
+
+
+
+
+ + ☆ Combining Pre- and Post-Demosaicking Noise Removal for RAW Video + + +
+ Denoising is one of the fundamental steps of the processing pipeline that +converts data captured by a camera sensor into a display-ready image or video. +It is generally performed early in the pipeline, usually before demosaicking, +although studies swapping their order or even conducting them jointly have been +proposed. With the advent of deep learning, the quality of denoising algorithms +has steadily increased. Even so, modern neural networks still have a hard time +adapting to new noise levels and scenes, which is indispensable for real-world +applications. With those in mind, we propose a self-similarity-based denoising +scheme that weights both a pre- and a post-demosaicking denoiser for +Bayer-patterned CFA video data. We show that a balance between the two leads to +better image quality, and we empirically find that higher noise levels benefit +from a higher influence pre-demosaicking. We also integrate temporal trajectory +prefiltering steps before each denoiser, which further improve texture +reconstruction. The proposed method only requires an estimation of the noise +model at the sensor, accurately adapts to any noise level, and is competitive +with the state of the art, making it suitable for real-world videography. + +
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ☆ SuperGS: Super-Resolution 3D Gaussian Splatting via Latent Feature Field + and Gradient-guided Splitting + + +
+ Recently, 3D Gaussian Splatting (3DGS) has excelled in novel view synthesis +with its real-time rendering capabilities and superior quality. However, it +faces challenges for high-resolution novel view synthesis (HRNVS) due to the +coarse nature of primitives derived from low-resolution input views. To address +this issue, we propose Super-Resolution 3DGS (SuperGS), which is an expansion +of 3DGS designed with a two-stage coarse-to-fine training framework, utilizing +a pretrained low-resolution scene representation as an initialization for +super-resolution optimization. Moreover, we introduce Multi-resolution Feature +Gaussian Splatting (MFGS) to incorporate a latent feature field for flexible +feature sampling and Gradient-guided Selective Splitting (GSS) for effective +Gaussian upsampling. Integrating these strategies within the coarse-to-fine +framework ensures both high fidelity and memory efficiency. Extensive +experiments demonstrate that SuperGS surpasses state-of-the-art HRNVS methods +on challenging real-world datasets using only low-resolution inputs. + +
+
+
+
+
+ + ☆ NestedMorph: Enhancing Deformable Medical Image Registration with Nested + Attention Mechanisms WACV + + +
+ Deformable image registration is crucial for aligning medical images in a +non-linear fashion across different modalities, allowing for precise spatial +correspondence between varying anatomical structures. This paper presents +NestedMorph, a novel network utilizing a Nested Attention Fusion approach to +improve intra-subject deformable registration between T1-weighted (T1w) MRI and +diffusion MRI (dMRI) data. NestedMorph integrates high-resolution spatial +details from an encoder with semantic information from a decoder using a +multi-scale framework, enhancing both local and global feature extraction. Our +model notably outperforms existing methods, including CNN-based approaches like +VoxelMorph, MIDIR, and CycleMorph, as well as Transformer-based models such as +TransMorph and ViT-V-Net, and traditional techniques like NiftyReg and SyN. +Evaluations on the HCP dataset demonstrate that NestedMorph achieves superior +performance across key metrics, including SSIM, HD95, and SDlogJ, with the +highest SSIM of 0.89, and the lowest HD95 of 2.5 and SDlogJ of 0.22. These +results highlight NestedMorph's ability to capture both local and global image +features effectively, leading to superior registration performance. The +promising outcomes of this study underscore NestedMorph's potential to +significantly advance deformable medical image registration, providing a robust +framework for future research and clinical applications. The source code and +our implementation are available at: https://bit.ly/3zdVqcg + +
+
+ comment: Submitted to IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2025 +
+
+
+
+
+ + ☆ MedVisionLlama: Leveraging Pre-Trained Large Language Model Layers to + Enhance Medical Image Segmentation WACV + + +
+ Large Language Models (LLMs), known for their versatility in textual data, +are increasingly being explored for their potential to enhance medical image +segmentation, a crucial task for accurate diagnostic imaging. This study +explores enhancing Vision Transformers (ViTs) for medical image segmentation by +integrating pre-trained LLM transformer blocks. Our approach, which +incorporates a frozen LLM transformer block into the encoder of a ViT-based +model, leads to substantial improvements in segmentation performance across +various medical imaging modalities. We propose a Hybrid Attention Mechanism +that combines global and local feature learning with a Multi-Scale Fusion Block +for aggregating features across different scales. The enhanced model shows +significant performance gains, including an average Dice score increase from +0.74 to 0.79 and improvements in accuracy, precision, and the Jaccard Index. +These results demonstrate the effectiveness of LLM-based transformers in +refining medical image segmentation, highlighting their potential to +significantly boost model accuracy and robustness. The source code and our +implementation are available at: https://bit.ly/3zf2CVs + +
+
+ comment: Submitted to IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2025 +
+
+
+
+
+ + ☆ Pseudo-Stereo Inputs: A Solution to the Occlusion Challenge in + Self-Supervised Stereo Matching + + +
+ Self-supervised stereo matching holds great promise for application and +research due to its independence from expensive labeled data. However, direct +self-supervised stereo matching paradigms based on photometric loss functions +have consistently struggled with performance issues due to the occlusion +challenge. The crux of the occlusion challenge lies in the fact that the +positions of occluded pixels consistently align with the epipolar search +direction defined by the input stereo images, leading to persistent information +loss and erroneous feedback at fixed locations during self-supervised training. +In this work, we propose a simple yet highly effective pseudo-stereo inputs +strategy to address the core occlusion challenge. This strategy decouples the +input and feedback images, compelling the network to probabilistically sample +information from both sides of the occluding objects. As a result, the +persistent lack of information in the aforementioned fixed occlusion areas is +mitigated. Building upon this, we further address feedback conflicts and +overfitting issues arising from the strategy. By integrating these components, +our method achieves stable and significant performance improvements compared to +existing methods. Quantitative experiments are conducted to evaluate the +performance. Qualitative experiments further demonstrate accurate disparity +inference even at occluded regions. These results demonstrate a significant +advancement over previous methods in the field of direct self-supervised stereo +matching based on photometric loss. The proposed pseudo-stereo inputs strategy, +due to its simplicity and effectiveness, has the potential to serve as a new +paradigm for direct self-supervised stereo matching. Code is available at +https://github.com/qrzyang/Pseudo-Stereo. + +
+
+ comment: Submitted to IEEE Transactions on Image Processing (TIP) +
+
+
+
+
+ + ☆ A Foundation Model for the Solar Dynamics Observatory + + +
+ SDO-FM is a foundation model using data from NASA's Solar Dynamics +Observatory (SDO) spacecraft; integrating three separate instruments to +encapsulate the Sun's complex physical interactions into a multi-modal +embedding space. This model can be used to streamline scientific investigations +involving SDO by making the enormous datasets more computationally accessible +for heliophysics research and enable investigations that require instrument +fusion. We discuss four key components: an ingestion pipeline to create machine +learning ready datasets, the model architecture and training approach, +resultant embeddings and fine-tunable models, and finally downstream fine-tuned +applications. A key component of this effort has been to include subject matter +specialists at each stage of development; reviewing the scientific value and +providing guidance for model architecture, dataset, and training paradigm +decisions. This paper marks release of our pretrained models and embedding +datasets, available to the community on Hugging Face and sdofm.org. + +
+
+
+
+
+ + ☆ HiFiSeg: High-Frequency Information Enhanced Polyp Segmentation with + Global-Local Vision Transformer + + +
+ Numerous studies have demonstrated the strong performance of Vision +Transformer (ViT)-based methods across various computer vision tasks. However, +ViT models often struggle to effectively capture high-frequency components in +images, which are crucial for detecting small targets and preserving edge +details, especially in complex scenarios. This limitation is particularly +challenging in colon polyp segmentation, where polyps exhibit significant +variability in structure, texture, and shape. High-frequency information, such +as boundary details, is essential for achieving precise semantic segmentation +in this context. To address these challenges, we propose HiFiSeg, a novel +network for colon polyp segmentation that enhances high-frequency information +processing through a global-local vision transformer framework. HiFiSeg +leverages the pyramid vision transformer (PVT) as its encoder and introduces +two key modules: the global-local interaction module (GLIM) and the selective +aggregation module (SAM). GLIM employs a parallel structure to fuse global and +local information at multiple scales, effectively capturing fine-grained +features. SAM selectively integrates boundary details from low-level features +with semantic information from high-level features, significantly improving the +model's ability to accurately detect and segment polyps. Extensive experiments +on five widely recognized benchmark datasets demonstrate the effectiveness of +HiFiSeg for polyp segmentation. Notably, the mDice scores on the challenging +CVC-ColonDB and ETIS datasets reached 0.826 and 0.822, respectively, +underscoring the superior performance of HiFiSeg in handling the specific +complexities of this task. + +
+
+
+
+
+ + ☆ Learning from Offline Foundation Features with Tensor Augmentations NeurIPS 2024 + + +
+ We introduce Learning from Offline Foundation Features with Tensor +Augmentations (LOFF-TA), an efficient training scheme designed to harness the +capabilities of foundation models in limited resource settings where their +direct development is not feasible. LOFF-TA involves training a compact +classifier on cached feature embeddings from a frozen foundation model, +resulting in up to $37\times$ faster training and up to $26\times$ reduced GPU +memory usage. Because the embeddings of augmented images would be too numerous +to store, yet the augmentation process is essential for training, we propose to +apply tensor augmentations to the cached embeddings of the original +non-augmented images. LOFF-TA makes it possible to leverage the power of +foundation models, regardless of their size, in settings with limited +computational capacity. Moreover, LOFF-TA can be used to apply foundation +models to high-resolution images without increasing compute. In certain +scenarios, we find that training with LOFF-TA yields better results than +directly fine-tuning the foundation model. + +
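+ The caching idea described above can be illustrated with a short sketch: embed each image once with a frozen foundation model, then train a small head on the cached tensors while applying cheap embedding-level augmentations. The specific augmentations below (noise and feature dropout) are illustrative stand-ins, not the paper's exact tensor augmentations.
+
+```python
+# Sketch of cached-feature training with embedding-level augmentations.
+import torch
+import torch.nn as nn
+
+@torch.no_grad()
+def cache_embeddings(backbone: nn.Module, loader):
+    backbone.eval()
+    feats, labels = [], []
+    for x, y in loader:                      # one pass over the data, features cached once
+        feats.append(backbone(x))
+        labels.append(y)
+    return torch.cat(feats), torch.cat(labels)
+
+def tensor_augment(z: torch.Tensor) -> torch.Tensor:
+    z = z + 0.05 * torch.randn_like(z)       # additive noise in embedding space
+    return nn.functional.dropout(z, p=0.1)   # random feature dropout
+
+def train_head(feats, labels, dim, n_classes, epochs=10):
+    head = nn.Linear(dim, n_classes)
+    opt = torch.optim.AdamW(head.parameters(), lr=1e-3)
+    for _ in range(epochs):
+        logits = head(tensor_augment(feats))
+        loss = nn.functional.cross_entropy(logits, labels)
+        opt.zero_grad(); loss.backward(); opt.step()
+    return head
+
+feats = torch.randn(512, 768)                # toy stand-in for cached embeddings
+labels = torch.randint(0, 10, (512,))
+head = train_head(feats, labels, dim=768, n_classes=10)
+```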
+
+ comment: Accepted to the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) +
+
+
+
+
+ + ☆ Med-TTT: Vision Test-Time Training model for Medical Image Segmentation + + +
+ Medical image segmentation plays a crucial role in clinical diagnosis and +treatment planning. Although models based on convolutional neural networks +(CNNs) and Transformers have achieved remarkable success in medical image +segmentation tasks, they still face challenges such as high computational +complexity and the loss of local features when capturing long-range +dependencies. To address these limitations, we propose Med-TTT, a visual +backbone network integrated with Test-Time Training (TTT) layers, which +incorporates dynamic adjustment capabilities. Med-TTT introduces the Vision-TTT +layer, which enables effective modeling of long-range dependencies with linear +computational complexity and adaptive parameter adjustment during inference. +Furthermore, we design a multi-resolution fusion mechanism to combine image +features at different scales, facilitating the identification of subtle lesion +characteristics in complex backgrounds. At the same time, we adopt a +frequency-domain feature enhancement strategy based on high-pass filtering, +which can better capture texture and fine-grained details in images. +Experimental results demonstrate that Med-TTT significantly outperforms +existing methods on multiple medical image datasets, exhibiting strong +segmentation capabilities, particularly in complex image backgrounds. The model +achieves leading performance in terms of accuracy, sensitivity, and Dice +coefficient, providing an efficient and robust solution for the field of +medical image segmentation. The code is available at +https://github.com/Jiashu-Xu/Med-TTT. + +
+
+
+
+
+ + ☆ Dog-IQA: Standard-guided Zero-shot MLLM for Mix-grained Image Quality + Assessment + + +
+ Image quality assessment (IQA) serves as the gold standard for evaluating +model performance in nearly all computer vision fields. However, it still +suffers from poor out-of-distribution generalization ability and expensive +training costs. To address these problems, we propose Dog-IQA, a +standard-guided zero-shot mix-grained IQA method, which is training-free and +utilizes the exceptional prior knowledge of multimodal large language models +(MLLMs). To obtain accurate IQA scores, namely scores consistent with human +judgment, we design an MLLM-based inference pipeline that imitates human +experts. In detail, Dog-IQA applies two techniques. First, Dog-IQA objectively +scores with specific standards that utilize the MLLM's behavior pattern and +minimize the influence of subjective factors. Second, Dog-IQA comprehensively +takes local semantic objects and the whole image as input and aggregates their +scores, leveraging local and global information. Our proposed Dog-IQA achieves +state-of-the-art (SOTA) performance compared with training-free methods, and +competitive performance compared with training-based methods in cross-dataset +scenarios. Our code and models will be available at +https://github.com/Kai-Liu001/Dog-IQA. + +
+
+ comment: 10 pages, 5 figures. The code and models will be available at + https://github.com/Kai-Liu001/Dog-IQA +
+
+
+
+
+ + ☆ DTVLT: A Multi-modal Diverse Text Benchmark for Visual Language Tracking + Based on LLM + + +
+ Visual language tracking (VLT) has emerged as a cutting-edge research area, +harnessing linguistic data to enhance algorithms with multi-modal inputs and +broadening the scope of traditional single object tracking (SOT) to encompass +video understanding applications. Despite this, most VLT benchmarks still +depend on succinct, human-annotated text descriptions for each video. These +descriptions often fall short in capturing the nuances of video content +dynamics and lack stylistic variety in language, constrained by their uniform +level of detail and a fixed annotation frequency. As a result, algorithms tend +to default to a "memorize the answer" strategy, diverging from the core +objective of achieving a deeper understanding of video content. Fortunately, +the emergence of large language models (LLMs) has enabled the generation of +diverse text. This work utilizes LLMs to generate varied semantic annotations +(in terms of text lengths and granularities) for representative SOT benchmarks, +thereby establishing a novel multi-modal benchmark. Specifically, we (1) +propose a new visual language tracking benchmark with diverse texts, named +DTVLT, based on five prominent VLT and SOT benchmarks, including three +sub-tasks: short-term tracking, long-term tracking, and global instance +tracking. (2) We offer four granularity texts in our benchmark, considering the +extent and density of semantic information. We expect this multi-granular +generation strategy to foster a favorable environment for VLT and video +understanding research. (3) We conduct comprehensive experimental analyses on +DTVLT, evaluating the impact of diverse text on tracking performance and hope +the identified performance bottlenecks of existing algorithms can support +further research in VLT and video understanding. The proposed benchmark, +experimental results and toolkit will be released gradually on +http://videocube.aitestunion.com/. + +
+
+ comment: Preprint, Under Review +
+
+
+
+
+ + ☆ Event-Customized Image Generation + + +
+ Customized Image Generation, generating customized images with user-specified +concepts, has attracted significant attention due to its creativity and +novelty. With impressive progress achieved in subject customization, some +pioneer works further explored the customization of action and interaction +beyond entity (i.e., human, animal, and object) appearance. However, these +approaches only focus on basic actions and interactions between two entities, +and their effects are limited by insufficient ''exactly same'' reference +images. To extend customized image generation to more complex scenes for +general real-world applications, we propose a new task: event-customized image +generation. Given a single reference image, we define the ''event'' as all +specific actions, poses, relations, or interactions between different entities +in the scene. This task aims at accurately capturing the complex event and +generating customized images with various target entities. To solve this task, +we propose a novel training-free event customization method: FreeEvent. +Specifically, FreeEvent introduces two extra paths alongside the general +diffusion denoising process: 1) Entity switching path: it applies +cross-attention guidance and regulation for target entity generation. 2) Event +transferring path: it injects the spatial feature and self-attention maps from +the reference image to the target image for event generation. To further +facilitate this new task, we collected two evaluation benchmarks: SWiG-Event +and Real-Event. Extensive experiments and ablations have demonstrated the +effectiveness of FreeEvent. + +
+
+
+
+
+ + ☆ Towards a Theoretical Understanding of Memorization in Diffusion Models + + +
+ As diffusion probabilistic models (DPMs) are being employed as mainstream +models for Generative Artificial Intelligence (GenAI), the study of their +memorization of training data has attracted growing attention. Existing works +in this direction aim to establish an understanding of whether or to what +extent DPMs learn via memorization. Such an understanding is crucial for +identifying potential risks of data leakage and copyright infringement in +diffusion models and, more importantly, for trustworthy application of GenAI. +Existing works revealed that conditional DPMs are more prone to training data +memorization than unconditional DPMs, and the motivated data extraction methods +are mostly for conditional DPMs. However, these understandings are primarily +empirical, and extracting training data from unconditional models has been +found to be extremely challenging. In this work, we provide a theoretical +understanding of memorization in both conditional and unconditional DPMs under +the assumption of model convergence. Our theoretical analysis indicates that +extracting data from unconditional models can also be effective by constructing +a proper surrogate condition. Based on this result, we propose a novel data +extraction method named \textbf{Surrogate condItional Data Extraction (SIDE)} +that leverages a time-dependent classifier trained on the generated data as a +surrogate condition to extract training data from unconditional DPMs. Empirical +results demonstrate that our SIDE can extract training data in challenging +scenarios where previous methods fail, and it is, on average, over 50\% more +effective across different scales of the CelebA dataset. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2406.12752 +
+
+
+
+
+ + ☆ Recurrent Few-Shot model for Document Verification + + +
+ General-purpose image- and video-based verification systems for ID and travel +documents have yet to achieve good enough performance to be considered a solved +problem. There are several factors that negatively impact their performance, +including low-resolution images and videos and a lack of sufficient data to +train the models. This task is particularly challenging when dealing with +unseen classes of ID or travel documents. In this paper we address this task by +proposing a recurrent-based model able to detect forged documents in a few-shot +scenario. The recurrent architecture makes the model robust to document +resolution variability. Moreover, the few-shot approach allows the model to +perform well even for unseen classes of documents. Preliminary results on the +SIDTD and Findit datasets show good performance of this model for this task. + +
+
+
+
+
+ + ☆ Clinnova Federated Learning Proof of Concept: Key Takeaways from a + Cross-border Collaboration + + +
+ Clinnova, a collaborative initiative involving France, Germany, Switzerland, +and Luxembourg, is dedicated to unlocking the power of precision medicine +through data federation, standardization, and interoperability. This European +Greater Region initiative seeks to create an interoperable European standard +using artificial intelligence (AI) and data science to enhance healthcare +outcomes and efficiency. Key components include multidisciplinary research +centers, a federated biobanking strategy, a digital health innovation platform, +and a federated AI strategy. It targets inflammatory bowel disease, rheumatoid +diseases, and multiple sclerosis (MS), emphasizing data quality to develop AI +algorithms for personalized treatment and translational research. + The IHU Strasbourg (Institute of Minimal-invasive Surgery) has the lead in +this initiative to develop the federated learning (FL) proof of concept (POC) +that will serve as a foundation for advancing AI in healthcare. At its core, +Clinnova-MS aims to enhance MS patient care by using FL to develop more +accurate models that detect disease progression, guide interventions, and +validate digital biomarkers across multiple sites. This technical report +presents insights and key takeaways from the first cross-border federated POC +on MS segmentation of MRI images within the Clinnova framework. While our work +marks a significant milestone in advancing MS segmentation through cross-border +collaboration, it also underscores the importance of addressing technical, +logistical, and ethical considerations to realize the full potential of FL in +healthcare settings. + +
+
+
+
+
+ + ☆ Predictive Attractor Models NeurIPS 2024 + + +
+ Sequential memory, the ability to form and accurately recall a sequence of +events or stimuli in the correct order, is a fundamental prerequisite for +biological and artificial intelligence as it underpins numerous cognitive +functions (e.g., language comprehension, planning, episodic memory formation, +etc.) However, existing methods of sequential memory suffer from catastrophic +forgetting, limited capacity, slow iterative learning procedures, low-order +Markov memory, and, most importantly, the inability to represent and generate +multiple valid future possibilities stemming from the same context. Inspired by +biologically plausible neuroscience theories of cognition, we propose +\textit{Predictive Attractor Models (PAM)}, a novel sequence memory +architecture with desirable generative properties. PAM is a streaming model +that learns a sequence in an online, continuous manner by observing each input +\textit{only once}. Additionally, we find that PAM avoids catastrophic +forgetting by uniquely representing past context through lateral inhibition in +cortical minicolumns, which prevents new memories from overwriting previously +learned knowledge. PAM generates future predictions by sampling from a union +set of predicted possibilities; this generative ability is realized through an +attractor model trained alongside the predictor. We show that PAM is trained +with local computations through Hebbian plasticity rules in a biologically +plausible framework. Other desirable traits (e.g., noise tolerance, CPU-based +learning, capacity scaling) are discussed throughout the paper. Our findings +suggest that PAM represents a significant step forward in the pursuit of +biologically plausible and computationally efficient sequential memory models, +with broad implications for cognitive science and artificial intelligence +research. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ PnP-Flow: Plug-and-Play Image Restoration with Flow Matching + + +
+ In this paper, we introduce Plug-and-Play (PnP) Flow Matching, an algorithm +for solving imaging inverse problems. PnP methods leverage the strength of +pre-trained denoisers, often deep neural networks, by integrating them in +optimization schemes. While they achieve state-of-the-art performance on +various inverse problems in imaging, PnP approaches face inherent limitations +on more generative tasks like inpainting. On the other hand, generative models +such as Flow Matching pushed the boundary in image sampling yet lack a clear +method for efficient use in image restoration. We propose to combine the PnP +framework with Flow Matching (FM) by defining a time-dependent denoiser using a +pre-trained FM model. Our algorithm alternates between gradient descent steps +on the data-fidelity term, reprojections onto the learned FM path, and +denoising. Notably, our method is computationally efficient and +memory-friendly, as it avoids backpropagation through ODEs and trace +computations. We evaluate its performance on denoising, super-resolution, +deblurring, and inpainting tasks, demonstrating superior results compared to +existing PnP algorithms and Flow Matching based state-of-the-art methods. + +
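+ The alternation described above (data-fidelity gradient step, reprojection onto the flow-matching path, denoising) can be sketched structurally as follows. The time-dependent denoiser `fm_denoise` and the re-noising convention are placeholders and assumptions; the exact constructions follow the paper and are not reproduced here.
+
+```python
+# Structural sketch of a PnP loop with a flow-matching denoiser (placeholders).
+import torch
+
+def pnp_flow_restore(y, forward_op, fm_denoise, steps=50, lr=1.0):
+    # y: measurements; forward_op: callable A(x); fm_denoise(z, t): hypothetical
+    # time-dependent denoiser built from a pretrained flow-matching model.
+    x = torch.zeros_like(y)                    # crude init; assumes A keeps the image shape
+    for t in torch.linspace(0.0, 1.0, steps):
+        # 1) gradient step on the data-fidelity term 0.5 * ||A(x) - y||^2
+        x = x.detach().requires_grad_(True)
+        loss = 0.5 * ((forward_op(x) - y) ** 2).sum()
+        grad, = torch.autograd.grad(loss, x)
+        z = x - lr * grad
+        # 2) reprojection onto the flow-matching path at time t (re-noising;
+        #    the interpolation convention here is an assumption)
+        z_t = t * z + (1.0 - t) * torch.randn_like(z)
+        # 3) denoising with the time-dependent FM denoiser
+        x = fm_denoise(z_t, t).detach()
+    return x
+```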
+
+
+
+
+ + ☆ LoGDesc: Local geometric features aggregation for robust point cloud + registration + + +
+ This paper introduces a new hybrid descriptor for 3D point matching and point +cloud registration, combining local geometrical properties and learning-based +feature propagation for each point's neighborhood structure description. The +proposed architecture first extracts prior geometrical information by computing +each point's planarity, anisotropy, and omnivariance using a Principal +Components Analysis (PCA). This prior information is complemented by a +descriptor based on normal vectors estimated from a triangle-based neighborhood +construction. The final geometrical descriptor is propagated between the points +using local graph convolutions and attention mechanisms. The new feature +extractor is evaluated on ModelNet40, the Stanford Bunny dataset, KITTI and MVP +(Multi-View Partial)-RG for point cloud registration and shows interesting +results, particularly on noisy and low-overlapping point clouds. + +
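+ The PCA-based prior features named above have standard eigenvalue-based definitions in the point-cloud literature, which the sketch below computes per point from a k-nearest-neighbor covariance (the neighborhood size and exact normalization are illustrative choices, not necessarily the paper's).
+
+```python
+# Per-point planarity, anisotropy, and omnivariance from local PCA eigenvalues.
+import numpy as np
+from scipy.spatial import cKDTree
+
+def local_geometric_features(points: np.ndarray, k: int = 16) -> np.ndarray:
+    tree = cKDTree(points)
+    _, idx = tree.query(points, k=k)                 # k nearest neighbors per point
+    feats = np.zeros((len(points), 3))
+    for i, nbrs in enumerate(idx):
+        nbhd = points[nbrs] - points[nbrs].mean(axis=0)
+        cov = nbhd.T @ nbhd / k
+        l = np.sort(np.linalg.eigvalsh(cov))[::-1] + 1e-12   # l1 >= l2 >= l3 > 0
+        planarity = (l[1] - l[2]) / l[0]
+        anisotropy = (l[0] - l[2]) / l[0]
+        omnivariance = np.cbrt(l[0] * l[1] * l[2])
+        feats[i] = (planarity, anisotropy, omnivariance)
+    return feats
+
+cloud = np.random.rand(1000, 3)
+geo = local_geometric_features(cloud)                # (N, 3) prior features per point
+```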
+
+
+
+
+ + ☆ Eliminating Oversaturation and Artifacts of High Guidance Scales in + Diffusion Models + + +
+ Classifier-free guidance (CFG) is crucial for improving both generation +quality and alignment between the input condition and final output in diffusion +models. While a high guidance scale is generally required to enhance these +aspects, it also causes oversaturation and unrealistic artifacts. In this +paper, we revisit the CFG update rule and introduce modifications to address +this issue. We first decompose the update term in CFG into parallel and +orthogonal components with respect to the conditional model prediction and +observe that the parallel component primarily causes oversaturation, while the +orthogonal component enhances image quality. Accordingly, we propose +down-weighting the parallel component to achieve high-quality generations +without oversaturation. Additionally, we draw a connection between CFG and +gradient ascent and introduce a new rescaling and momentum method for the CFG +update rule based on this insight. Our approach, termed adaptive projected +guidance (APG), retains the quality-boosting advantages of CFG while enabling +the use of higher guidance scales without oversaturation. APG is easy to +implement and introduces practically no additional computational overhead to +the sampling process. Through extensive experiments, we demonstrate that APG is +compatible with various conditional diffusion models and samplers, leading to +improved FID, recall, and saturation scores while maintaining precision +comparable to CFG, making our method a superior plug-and-play alternative to +standard classifier-free guidance. + +
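+ The core decomposition is easy to state in code: split the guidance term (conditional minus unconditional prediction) into components parallel and orthogonal to the conditional prediction, and down-weight the parallel part. The sketch below shows only that decomposition; the paper's rescaling and momentum terms are omitted, and shapes are illustrative.
+
+```python
+# Sketch of projected guidance: down-weight the component of the CFG update
+# that is parallel to the conditional prediction.
+import torch
+
+def projected_guidance(cond: torch.Tensor, uncond: torch.Tensor,
+                       scale: float = 7.5, parallel_weight: float = 0.0) -> torch.Tensor:
+    diff = cond - uncond                              # standard CFG update term
+    flat_c = cond.flatten(1)
+    flat_d = diff.flatten(1)
+    coef = (flat_d * flat_c).sum(-1, keepdim=True) / flat_c.pow(2).sum(-1, keepdim=True).clamp_min(1e-8)
+    parallel = (coef * flat_c).view_as(diff)          # component along the conditional prediction
+    orthogonal = diff - parallel
+    guided_diff = parallel_weight * parallel + orthogonal
+    return cond + (scale - 1.0) * guided_diff         # CFG-style combination
+
+cond = torch.randn(2, 4, 64, 64)
+uncond = torch.randn(2, 4, 64, 64)
+pred = projected_guidance(cond, uncond)
+```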
+
+
+
+
+ + ☆ SynCo: Synthetic Hard Negatives in Contrastive Learning for Better + Unsupervised Visual Representations + + +
+ Contrastive learning has become a dominant approach in self-supervised visual +representation learning, with hard negatives, samples that closely resemble the +anchor, being key to enhancing the discriminative power of learned +representations. However, efficiently leveraging hard negatives remains a +challenge due to the difficulty in identifying and incorporating them without +significantly increasing computational costs. To address this, we introduce +SynCo (Synthetic Negatives in Contrastive learning), a novel contrastive +learning approach that improves model performance by generating synthetic hard +negatives. Built on the MoCo framework, SynCo introduces six novel strategies +for creating diverse synthetic hard negatives that can be generated on-the-fly +with minimal computational overhead. SynCo achieves faster training and better +representation learning, reaching a top-1 accuracy of 68.1% in ImageNet linear +evaluation after only 200 epochs of pretraining, surpassing MoCo's 67.5% with +the same ResNet-50 encoder. Additionally, it transfers more effectively to +detection tasks: on PASCAL VOC, it outperforms both the supervised baseline +and MoCo, achieving an AP of 82.5%; on the COCO dataset, it sets a new +benchmark with 40.4% AP for bounding box detection and 35.4% AP for instance +segmentation. Our synthetic hard negative generation procedure significantly +enhances the quality of visual representations learned through self-supervised +contrastive learning. Code is available at +https://github.com/giakoumoglou/synco. + +
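+ As a rough illustration of what "synthetic hard negatives generated on-the-fly" can look like in a MoCo-style setup, the sketch below interpolates the query with its most similar queue negatives and renormalizes. The paper proposes six distinct strategies; this resembles only the simplest of them and is not the authors' implementation.
+
+```python
+# Illustrative on-the-fly synthesis of hard negatives from a MoCo-style queue.
+import torch
+import torch.nn.functional as F
+
+def synthetic_hard_negatives(q: torch.Tensor, queue: torch.Tensor,
+                             n_hard: int = 64, alpha: float = 0.5) -> torch.Tensor:
+    # q: (B, D) normalized queries; queue: (K, D) normalized memory-bank features
+    sims = q @ queue.t()                              # (B, K) cosine similarities
+    hard_idx = sims.topk(n_hard, dim=1).indices       # hardest = most similar negatives
+    hard = queue[hard_idx]                            # (B, n_hard, D)
+    synth = alpha * q.unsqueeze(1) + (1 - alpha) * hard
+    return F.normalize(synth, dim=-1)                 # back onto the unit sphere
+
+q = F.normalize(torch.randn(8, 128), dim=-1)
+queue = F.normalize(torch.randn(4096, 128), dim=-1)
+negs = synthetic_hard_negatives(q, queue)             # appended to the contrastive denominator
+```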
+
+ comment: 10 pages, 6 figures, 4 tables. arXiv admin note: text overlap with + arXiv:2010.01028 by other authors +
+
+
+
+
+ + ☆ Parameter Competition Balancing for Model Merging NeurIPS2024 + + +
+ While fine-tuning pretrained models has become common practice, these models +often underperform outside their specific domains. Recently developed model +merging techniques enable the direct integration of multiple models, each +fine-tuned for distinct tasks, into a single model. This strategy promotes +multitasking capabilities without requiring retraining on the original +datasets. However, existing methods fall short in addressing potential +conflicts and complex correlations between tasks, especially in parameter-level +adjustments, posing a challenge in effectively balancing parameter competition +across various tasks. This paper introduces an innovative technique named +PCB-Merging (Parameter Competition Balancing), a lightweight and training-free +technique that adjusts the coefficients of each parameter for effective model +merging. PCB-Merging employs intra-balancing to gauge parameter significance +within individual tasks and inter-balancing to assess parameter similarities +across different tasks. Parameters with low importance scores are dropped, and +the remaining ones are rescaled to form the final merged model. We assessed our +approach in diverse merging scenarios, including cross-task, cross-domain, and +cross-training configurations, as well as out-of-domain generalization. The +experimental results reveal that our approach achieves substantial performance +enhancements across multiple modalities, domains, model sizes, number of tasks, +fine-tuning forms, and large language models, outperforming existing model +merging methods. The code is publicly available at: +\url{https://github.com/duguodong7/pcb-merging}. + +
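+ A heavily simplified caricature of importance-weighted merging may help fix ideas: score each parameter of every task vector, drop low-importance entries, rescale the survivors, and add them back to the base model. The magnitude-based score below stands in for the paper's intra-/inter-balancing formulas, which are its actual contribution and are not reproduced here.
+
+```python
+# Simplified importance-masked merging of fine-tuned checkpoints into a base model.
+import torch
+
+def merge_models(base, finetuned, keep_ratio=0.2):
+    merged = {k: v.clone() for k, v in base.items()}
+    for name, base_w in base.items():
+        deltas = torch.stack([ft[name] - base_w for ft in finetuned])   # (T, ...) task vectors
+        scores = deltas.abs()                                           # stand-in importance score
+        thresh = torch.quantile(scores.flatten(), 1.0 - keep_ratio)
+        mask = (scores >= thresh).float()                               # drop low-importance entries
+        weights = mask / mask.sum(dim=0).clamp_min(1.0)                 # rescale surviving entries
+        merged[name] = base_w + (weights * deltas * mask).sum(dim=0)
+    return merged
+
+base = {"w": torch.zeros(4, 4)}
+fts = [{"w": torch.randn(4, 4)} for _ in range(3)]
+merged = merge_models(base, fts)
+```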
+
+ comment: Accepted by NeurIPS2024 +
+
+
+
+
+ + ☆ MetaMetrics: Calibrating Metrics For Generation Tasks Using Human + Preferences + + +
+ Understanding the quality of a performance evaluation metric is crucial for +ensuring that model outputs align with human preferences. However, it remains +unclear how well each metric captures the diverse aspects of these preferences, +as metrics often excel in one particular area but not across all dimensions. To +address this, it is essential to systematically calibrate metrics to specific +aspects of human preference, catering to the unique characteristics of each +aspect. We introduce MetaMetrics, a calibrated meta-metric designed to evaluate +generation tasks across different modalities in a supervised manner. +MetaMetrics optimizes the combination of existing metrics to enhance their +alignment with human preferences. Our metric demonstrates flexibility and +effectiveness in both language and vision downstream tasks, showing significant +benefits across various multilingual and multi-domain scenarios. MetaMetrics +aligns closely with human preferences and is highly extendable and easily +integrable into any application. This makes MetaMetrics a powerful tool for +improving the evaluation of generation tasks, ensuring that metrics are more +representative of human judgment across diverse contexts. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Unleashing the Potential of the Diffusion Model in Few-shot Semantic + Segmentation NeurIPS + + +
+ The Diffusion Model has not only garnered noteworthy achievements in the +realm of image generation but has also demonstrated its potential as an +effective pretraining method utilizing unlabeled data. Drawing from the +extensive potential unveiled by the Diffusion Model in both semantic +correspondence and open vocabulary segmentation, our work initiates an +investigation into employing the Latent Diffusion Model for Few-shot Semantic +Segmentation. Recently, inspired by the in-context learning ability of large +language models, Few-shot Semantic Segmentation has evolved into In-context +Segmentation tasks, morphing into a crucial element in assessing generalist +segmentation models. In this context, we concentrate on Few-shot Semantic +Segmentation, establishing a solid foundation for the future development of a +Diffusion-based generalist model for segmentation. Our initial focus lies in +understanding how to facilitate interaction between the query image and the +support image, resulting in the proposal of a KV fusion method within the +self-attention framework. Subsequently, we delve deeper into optimizing the +infusion of information from the support mask and simultaneously re-evaluating +how to provide reasonable supervision from the query mask. Based on our +analysis, we establish a simple and effective framework named DiffewS, +maximally retaining the original Latent Diffusion Model's generative framework +and effectively utilizing the pre-training prior. Experimental results +demonstrate that our method significantly outperforms the previous SOTA models +in multiple settings. + +
+
+ comment: Accepted to Proc. Annual Conference on Neural Information Processing + Systems (NeurIPS) 2024 +
+
+
+
+
+ + ☆ A Comprehensive Survey of Mamba Architectures for Medical Image + Analysis: Classification, Segmentation, Restoration and Beyond + + +
+ Mamba, a special case of the State Space Model, is gaining popularity as an +alternative to template-based deep learning approaches in medical image +analysis. While transformers are powerful architectures, they have drawbacks, +including quadratic computational complexity and an inability to address +long-range dependencies efficiently. This limitation affects the analysis of +large and complex datasets in medical imaging, where there are many spatial and +temporal relationships. In contrast, Mamba offers benefits that make it +well-suited for medical image analysis. It has linear time complexity, which is +a significant improvement over transformers. Mamba processes longer sequences +without attention mechanisms, enabling faster inference and requiring less +memory. Mamba also demonstrates strong performance in merging multimodal data, +improving diagnosis accuracy and patient outcomes. The organization of this +paper allows readers to appreciate the capabilities of Mamba in medical imaging +step by step. We begin by defining core concepts of SSMs and models, including +S4, S5, and S6, followed by an exploration of Mamba architectures such as pure +Mamba, U-Net variants, and hybrid models with convolutional neural networks, +transformers, and Graph Neural Networks. We also cover Mamba optimizations, +techniques and adaptations, scanning, datasets, applications, experimental +results, and conclude with its challenges and future directions in medical +imaging. This review aims to demonstrate the transformative potential of Mamba +in overcoming existing barriers within medical imaging while paving the way for +innovative advancements in the field. A comprehensive list of Mamba +architectures applied in the medical field, reviewed in this work, is available +at Github. + +
+
+
+
+
+ + ☆ ProtoSeg: A Prototype-Based Point Cloud Instance Segmentation Method + + +
+ 3D instance segmentation is crucial for obtaining an understanding of a point
+cloud scene. This paper presents a novel neural network architecture for
+performing instance segmentation on 3D point clouds. We propose to jointly
+learn coefficients and prototypes in parallel which can be combined to obtain
+the instance predictions. The coefficients are computed using an overcomplete
+set of sampled points with a novel multi-scale module, dubbed dilated point
+inception. As the set of obtained instance mask predictions is overcomplete, we
+employ a non-maximum suppression algorithm to retrieve the final predictions.
+This approach allows us to omit the time-expensive clustering step and leads to
+a more stable inference time. The proposed method is not only 28% faster than
+the state-of-the-art, but it also exhibits the lowest standard deviation. Our
+experiments have shown that the standard deviation of the inference time is
+only 1.0% of the total time while it ranges between 10.8% and 53.1% for the
+state-of-the-art methods. Lastly, our method outperforms the state-of-the-art
+both on S3DIS-blocks (4.9% in mRec on Fold-5) and PartNet (2.0% on average in
+mAP).
+
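+ A small numpy sketch of the coefficient-prototype assembly and the mask-level
+non-maximum suppression described above (shapes and thresholds are
+illustrative; this is not the authors' implementation):
+
+```python
+import numpy as np
+
+def assemble_instances(prototypes, coefficients, thresh=0.5):
+    """prototypes: (num_points, num_protos); coefficients: (num_candidates,
+    num_protos). Returns boolean instance masks over the point cloud."""
+    logits = coefficients @ prototypes.T          # (num_candidates, num_points)
+    return 1.0 / (1.0 + np.exp(-logits)) > thresh
+
+def mask_nms(masks, scores, iou_thresh=0.5):
+    """Greedy NMS on boolean point masks, keeping higher-scoring candidates."""
+    keep = []
+    for i in np.argsort(-scores):
+        ok = True
+        for j in keep:
+            inter = np.logical_and(masks[i], masks[j]).sum()
+            union = np.logical_or(masks[i], masks[j]).sum() + 1e-9
+            if inter / union > iou_thresh:
+                ok = False
+                break
+        if ok:
+            keep.append(i)
+    return keep
+
+protos = np.random.randn(1000, 16)   # 1000 points, 16 prototypes
+coefs = np.random.randn(50, 16)      # 50 overcomplete candidates
+kept = mask_nms(assemble_instances(protos, coefs), scores=np.random.rand(50))
+print(len(kept))
+```
+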
+
+
+
+
+ + ☆ Self-eXplainable AI for Medical Image Analysis: A Survey and New + Outlooks + + +
+ The increasing demand for transparent and reliable models, particularly in
+high-stakes decision-making areas such as medical image analysis, has led to
+the emergence of eXplainable Artificial Intelligence (XAI). Post-hoc XAI
+techniques, which aim to explain black-box models after training, have been
+controversial in recent works concerning their fidelity to the models'
+predictions. In contrast, Self-eXplainable AI (S-XAI) offers a compelling
+alternative by incorporating explainability directly into the training process
+of deep learning models. This approach allows models to generate inherent
+explanations that are closely aligned with their internal decision-making
+processes. Such enhanced transparency significantly supports the
+trustworthiness, robustness, and accountability of AI systems in real-world
+medical applications. To facilitate the development of S-XAI methods for
+medical image analysis, this survey presents a comprehensive review across
+various image modalities and clinical applications. It covers more than 200
+papers from three key perspectives: 1) input explainability through the
+integration of explainable feature engineering and knowledge graphs, 2) model
+explainability via attention-based learning, concept-based learning, and
+prototype-based learning, and 3) output explainability by providing
+counterfactual explanation and textual explanation. Additionally, this paper
+outlines the desired characteristics of explainability and existing evaluation
+methods for assessing explanation quality. Finally, it discusses the major
+challenges and future research directions in developing S-XAI for medical image
+analysis.
+
+
+
+
+
+ + ☆ RESSCAL3D++: Joint Acquisition and Semantic Segmentation of 3D Point + Clouds ICIP + + +
+ 3D scene understanding is crucial for facilitating seamless interaction +between digital devices and the physical world. Real-time capturing and +processing of the 3D scene are essential for achieving this seamless +integration. While existing approaches typically separate acquisition and +processing for each frame, the advent of resolution-scalable 3D sensors offers +an opportunity to overcome this paradigm and fully leverage the otherwise +wasted acquisition time to initiate processing. In this study, we introduce +VX-S3DIS, a novel point cloud dataset accurately simulating the behavior of a +resolution-scalable 3D sensor. Additionally, we present RESSCAL3D++, an +important improvement over our prior work, RESSCAL3D, by incorporating an +update module and processing strategy. By applying our method to the new +dataset, we practically demonstrate the potential of joint acquisition and +semantic segmentation of 3D point clouds. Our resolution-scalable approach +significantly reduces scalability costs from 2% to just 0.2% in mIoU while +achieving impressive speed-ups of 15.6 to 63.9% compared to the non-scalable +baseline. Furthermore, our scalable approach enables early predictions, with +the first one occurring after only 7% of the total inference time of the +baseline. The new VX-S3DIS dataset is available at +https://github.com/remcoroyen/vx-s3dis. + +
+
+ comment: 2024 IEEE International Conference on Image Processing (ICIP). IEEE, + 2024 +
+
+
+
+
+ + ☆ CTARR: A fast and robust method for identifying anatomical regions on CT + images via atlas registration + + +
+ Medical image analysis tasks often focus on regions or structures in a
+particular location within the patient's body. Often large parts of the image
+may not be of interest for the image analysis task. When using deep-learning
+based approaches, this causes an unnecessary increase in the computational
+burden during inference and raises the chance of errors. In this paper, we
+introduce CTARR, a novel generic method for CT Anatomical Region Recognition.
+The method serves as a pre-processing step for any deep learning-based CT image
+analysis pipeline by automatically identifying the pre-defined anatomical
+region that is relevant for the follow-up task and removing the rest. It can be
+used in (i) image segmentation to prevent false positives in anatomically
+implausible regions and speed up the inference, (ii) image classification to
+produce image crops that are consistent in their anatomical context, and (iii)
+image registration by serving as a fast pre-registration step. Our proposed
+method is based on atlas registration and provides a fast and robust way to
+crop any anatomical region encoded as one or multiple bounding box(es) from any
+unlabeled CT scan of the brain, chest, abdomen and/or pelvis. We demonstrate
+the utility and robustness of the proposed method in the context of medical
+image segmentation by evaluating it on six datasets of public segmentation
+challenges. The foreground voxels in the regions of interest are preserved in
+the vast majority of cases and tasks (97.45-100%) while taking only fractions
+of a second to compute (0.1-0.21s) on a deep learning workstation and greatly
+reducing the segmentation runtime (2.0-12.7x). Our code is available at
+https://github.com/ThomasBudd/ctarr.
+
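+ A toy numpy sketch of the atlas-based cropping idea: a bounding box defined in
+atlas space is mapped through a registration transform and used to crop the
+scan. The affine interface and margin are assumptions; CTARR's actual pipeline
+is more involved:
+
+```python
+import numpy as np
+
+def crop_region_via_atlas(volume, affine_atlas_to_image, atlas_bbox, margin=5):
+    """affine_atlas_to_image: 4x4 matrix mapping atlas voxel coords to image
+    voxel coords (e.g. from a fast affine registration).
+    atlas_bbox: ((z0, y0, x0), (z1, y1, x1)) in atlas space."""
+    (z0, y0, x0), (z1, y1, x1) = atlas_bbox
+    corners = np.array([[z, y, x, 1.0] for z in (z0, z1)
+                        for y in (y0, y1) for x in (x0, x1)])
+    mapped = corners @ affine_atlas_to_image.T
+    lo = np.clip(np.floor(mapped[:, :3].min(axis=0)).astype(int) - margin, 0, None)
+    hi = np.minimum(np.ceil(mapped[:, :3].max(axis=0)).astype(int) + margin,
+                    np.array(volume.shape))
+    return volume[lo[0]:hi[0], lo[1]:hi[1], lo[2]:hi[2]]
+
+ct = np.zeros((200, 512, 512), dtype=np.float32)
+crop = crop_region_via_atlas(ct, np.eye(4), ((50, 100, 100), (120, 400, 400)))
+print(crop.shape)
+```
+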
+
+
+
+
+ + ☆ Decoupling Layout from Glyph in Online Chinese Handwriting Generation + + +
+ Text plays a crucial role in the transmission of human civilization, and
+teaching machines to generate online handwritten text in various styles
+presents an interesting and significant challenge. However, most prior work has
+concentrated on generating individual Chinese fonts, leaving complete text line
+generation largely unexplored. In this paper, we identify that text lines can
+naturally be divided into two components: layout and glyphs. Based on this
+division, we designed a text line layout generator coupled with a
+diffusion-based stylized font synthesizer to address this challenge
+hierarchically. More concretely, the layout generator performs in-context-like
+learning based on the text content and the provided style references to
+generate positions for each glyph autoregressively. Meanwhile, the font
+synthesizer which consists of a character embedding dictionary, a multi-scale
+calligraphy style encoder, and a 1D U-Net based diffusion denoiser will
+generate each font on its position while imitating the calligraphy style
+extracted from the given style references. Qualitative and quantitative
+experiments on the CASIA-OLHWDB demonstrate that our method is capable of
+generating structurally correct and indistinguishable imitation samples.
+
+
+
+
+
+ + ☆ The Comparison of Individual Cat Recognition Using Neural Networks + + +
+ Facial recognition using deep learning has been widely used in social life for
+applications such as authentication, smart door locks, and photo grouping. An
+increasing number of networks, such as ResNet, DenseNet, EfficientNet,
+ConvNeXt, and Siamese networks, have been developed to facilitate computer
+vision tasks. However, few studies have systematically compared the advantages
+and disadvantages of such neural networks in identifying individuals from
+images, especially for pet animals like cats. In the present study, by
+systematically comparing the efficacy of different neural networks in cat
+recognition, we found that traditional CNNs trained with transfer learning have
+better performance than models trained with the fine-tuning method or Siamese
+networks in individual cat recognition. In addition, ConvNeXt and DenseNet
+yield significant results that could be further optimized for individual cat
+recognition in pet stores and in the wild. These results provide a method to
+improve cat management in pet stores and monitoring of cats in the wild.
+
+
+ comment: 13 pages, 7 figures
+
+
+
+
+ + ☆ A Novel Method for Accurate & Real-time Food Classification: The + Synergistic Integration of EfficientNetB7, CBAM, Transfer Learning, and Data + Augmentation + + +
+ Integrating artificial intelligence into modern society is profoundly +transformative, significantly enhancing productivity by streamlining various +daily tasks. AI-driven recognition systems provide notable advantages in the +food sector, including improved nutrient tracking, tackling food waste, and +boosting food production and consumption efficiency. Accurate food +classification is a crucial initial step in utilizing advanced AI models, as +the effectiveness of this process directly influences the success of subsequent +operations; therefore, achieving high accuracy at a reasonable speed is +essential. Despite existing research efforts, a gap persists in improving +performance while ensuring rapid processing times, prompting researchers to +pursue cost-effective and precise models. This study addresses this gap by +employing the state-of-the-art EfficientNetB7 architecture, enhanced through +transfer learning, data augmentation, and the CBAM attention module. This +methodology results in a robust model that surpasses previous studies in +accuracy while maintaining rapid processing suitable for real-world +applications. The Food11 dataset from Kaggle was utilized, comprising 16643 +imbalanced images across 11 diverse classes with significant intra-category +diversities and inter-category similarities. Furthermore, the proposed +methodology, bolstered by various deep learning techniques, consistently +achieves an impressive average accuracy of 96.40%. Notably, it can classify +over 60 images within one second during inference on unseen data, demonstrating +its ability to deliver high accuracy promptly. This underscores its potential +for practical applications in accurate food classification and enhancing +efficiency in subsequent processes. + +
+
+ comment: 20 pages, six figures, two tables +
+
+
+
+
+ + ☆ Computer-aided Colorization State-of-the-science: A Survey + + +
+ This paper reviews published research in the field of computer-aided
+colorization technology. We argue that the colorization task originates in
+computer graphics, prospers with the introduction of computer vision, and is
+now trending toward the fusion of vision and graphics, so we put forward our
+taxonomy and organize the whole paper chronologically. We extend the existing
+reconstruction-based colorization evaluation techniques, considering that
+aesthetic assessment of colored images should be introduced to ensure that
+colorization satisfies human visual-related requirements and emotions more
+closely. We perform the colorization aesthetic assessment on seven
+representative unconditional colorization models and discuss the difference
+between our assessment and the existing reconstruction-based metrics. Finally,
+this paper identifies unresolved issues and proposes fruitful areas for future
+research and development. Access to the project associated with this survey can
+be obtained at https://github.com/DanielCho-HK/Colorization.
+
+
+
+
+
+ + ☆ Structural-Entropy-Based Sample Selection for Efficient and Effective + Learning ICLR 2025 + + +
+ Sample selection improves the efficiency and effectiveness of machine +learning models by providing informative and representative samples. Typically, +samples can be modeled as a sample graph, where nodes are samples and edges +represent their similarities. Most existing methods are based on local +information, such as the training difficulty of samples, thereby overlooking +global information, such as connectivity patterns. This oversight can result in +suboptimal selection because global information is crucial for ensuring that +the selected samples well represent the structural properties of the graph. To +address this issue, we employ structural entropy to quantify global information +and losslessly decompose it from the whole graph to individual nodes using the +Shapley value. Based on the decomposition, we present +$\textbf{S}$tructural-$\textbf{E}$ntropy-based sample $\textbf{S}$election +($\textbf{SES}$), a method that integrates both global and local information to +select informative and representative samples. SES begins by constructing a +$k$NN-graph among samples based on their similarities. It then measures sample +importance by combining structural entropy (global metric) with training +difficulty (local metric). Finally, SES applies importance-biased blue noise +sampling to select a set of diverse and representative samples. Comprehensive +experiments on three learning scenarios -- supervised learning, active +learning, and continual learning -- clearly demonstrate the effectiveness of +our method. + +
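+ The sketch below only illustrates the spirit of combining a global graph-based
+score with a local difficulty score on a kNN graph; the per-node entropy proxy
+and top-k selection stand in for the paper's structural-entropy decomposition
+and blue-noise sampling, which are not reproduced here:
+
+```python
+import numpy as np
+
+def select_samples(features, losses, k=10, budget=100, alpha=0.5):
+    n = features.shape[0]
+    d2 = ((features[:, None, :] - features[None, :, :]) ** 2).sum(-1)
+    np.fill_diagonal(d2, np.inf)
+    knn = np.argsort(d2, axis=1)[:, :k]          # kNN graph
+    deg = np.full(n, float(k))
+    for i in range(n):
+        deg[knn[i]] += 1.0                       # count incoming edges too
+    p = deg / deg.sum()
+    global_score = -p * np.log(p + 1e-12)        # per-node entropy share (proxy)
+    g = (global_score - global_score.min()) / (np.ptp(global_score) + 1e-12)
+    l = (losses - losses.min()) / (np.ptp(losses) + 1e-12)
+    importance = alpha * g + (1.0 - alpha) * l   # global + local
+    return np.argsort(-importance)[:budget]
+
+feats = np.random.randn(500, 8)
+print(select_samples(feats, losses=np.random.rand(500))[:5])
+```
+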
+
+ comment: Submitted to ICLR 2025 +
+
+
+
+
+ + ☆ Probabilistic road classification in historical maps using synthetic + data and deep learning + + +
+ Historical maps are invaluable for analyzing long-term changes in +transportation and spatial development, offering a rich source of data for +evolutionary studies. However, digitizing and classifying road networks from +these maps is often expensive and time-consuming, limiting their widespread +use. Recent advancements in deep learning have made automatic road extraction +from historical maps feasible, yet these methods typically require large +amounts of labeled training data. To address this challenge, we introduce a +novel framework that integrates deep learning with geoinformation, +computer-based painting, and image processing methodologies. This framework +enables the extraction and classification of roads from historical maps using +only road geometries without needing road class labels for training. The +process begins with training of a binary segmentation model to extract road +geometries, followed by morphological operations, skeletonization, +vectorization, and filtering algorithms. Synthetic training data is then +generated by a painting function that artificially re-paints road segments +using predefined symbology for road classes. Using this synthetic data, a deep +ensemble is trained to generate pixel-wise probabilities for road classes to +mitigate distribution shift. These predictions are then discretized along the +extracted road geometries. Subsequently, further processing is employed to +classify entire roads, enabling the identification of potential changes in road +classes and resulting in a labeled road class dataset. Our method achieved +completeness and correctness scores of over 94% and 92%, respectively, for road +class 2, the most prevalent class in the two Siegfried Map sheets from +Switzerland used for testing. This research offers a powerful tool for urban +planning and transportation decision-making by efficiently extracting and +classifying roads from historical maps. + +
+
+
+
+
+ + ☆ Spiking Neural Network as Adaptive Event Stream Slicer NeurIPS 2024 + + +
+ Event-based cameras are attracting significant interest as they provide rich
+edge information, high dynamic range, and high temporal resolution. Many
+state-of-the-art event-based algorithms rely on splitting the events into fixed
+groups, resulting in the omission of crucial temporal information, particularly
+when dealing with diverse motion scenarios (e.g., high/low speed). In this
+work, we propose SpikeSlicer, a newly designed plug-and-play event processing
+method capable of splitting event streams adaptively. SpikeSlicer utilizes a
+lightweight (0.41M) and low-energy spiking neural network (SNN) to trigger
+event slicing. To guide the SNN to fire spikes at optimal time steps, we
+propose the Spiking Position-aware Loss (SPA-Loss) to modulate the neuron's
+state. Additionally, we develop a Feedback-Update training strategy that
+refines the slicing decisions using feedback from the downstream artificial
+neural network (ANN). Extensive experiments demonstrate that our method yields
+significant performance improvements in event-based object tracking and
+recognition. Notably, SpikeSlicer provides a brand-new SNN-ANN cooperation
+paradigm, where the SNN acts as an efficient, low-energy data processor to
+assist the ANN in improving downstream performance, injecting new perspectives
+and potential avenues of exploration.
+
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Visual Prompting in LLMs for Enhancing Emotion Recognition EMNLP2024 + + +
+ Vision Large Language Models (VLLMs) are transforming the intersection of
+computer vision and natural language processing. Nonetheless, the potential of
+using visual prompts for emotion recognition in these models remains largely
+unexplored. Traditional methods in VLLMs struggle with spatial localization and
+often discard valuable global context. To address this problem, we propose a
+Set-of-Vision prompting (SoV) approach that enhances zero-shot emotion
+recognition by using spatial information, such as bounding boxes and facial
+landmarks, to mark targets precisely. SoV improves accuracy in face count and
+emotion categorization while preserving the enriched image context. Through
+extensive experiments and analysis of recent commercial and open-source VLLMs,
+we evaluate the SoV model's ability to comprehend facial expressions in natural
+environments. Our findings demonstrate the effectiveness of integrating spatial
+visual prompts into VLLMs for improving emotion recognition performance.
+
+
+ comment: Accepted by EMNLP2024 (Main, Long paper) +
+
+
+
+
+ + ☆ SCA: Highly Efficient Semantic-Consistent Unrestricted Adversarial + Attack + + +
+ Unrestricted adversarial attacks typically manipulate the semantic content of
+an image (e.g., color or texture) to create adversarial examples that are both
+effective and photorealistic. Recent works have utilized the diffusion
+inversion process to map images into a latent space, where high-level semantics
+are manipulated by introducing perturbations. However, they often result in
+substantial semantic distortions in the denoised output and suffer from low
+efficiency. In this study, we propose a novel framework called
+Semantic-Consistent Unrestricted Adversarial Attacks (SCA), which employs an
+inversion method to extract edit-friendly noise maps and utilizes a Multimodal
+Large Language Model (MLLM) to provide semantic guidance throughout the
+process. With the rich semantic information provided by the MLLM, we perform
+the DDPM denoising process of each step using a series of edit-friendly noise
+maps, and leverage DPM Solver++ to accelerate this process, enabling efficient
+sampling with semantic consistency. Compared to existing methods, our framework
+enables the efficient generation of adversarial examples that exhibit minimal
+discernible semantic changes. Consequently, we for the first time introduce
+Semantic-Consistent Adversarial Examples (SCAE). Extensive experiments and
+visualizations have demonstrated the high efficiency of SCA, particularly in
+being on average 12 times faster than the state-of-the-art attacks. Our code
+can be found at https://github.com/Pan-Zihao/SCA.
+
+
+
+
+
+ + ☆ Key-Grid: Unsupervised 3D Keypoints Detection using Grid Heatmap + Features + + +
+ Detecting 3D keypoints with semantic consistency is widely used in many
+scenarios such as pose estimation, shape registration and robotics. Currently,
+most unsupervised 3D keypoint detection methods focus on rigid-body objects.
+However, when faced with deformable objects, the keypoints they identify do not
+preserve semantic consistency well. In this paper, we introduce Key-Grid, an
+innovative unsupervised keypoint detector for both rigid-body and deformable
+objects, which is an autoencoder framework. The encoder predicts keypoints and
+the decoder utilizes the generated keypoints to reconstruct the objects. Unlike
+previous work, we leverage the identified keypoint information to form a 3D
+grid feature heatmap called grid heatmap, which is used in the decoder section.
+Grid heatmap is a novel concept that represents the latent variables for grid
+points sampled uniformly in the 3D cubic space, where these variables are the
+shortest distance between the grid points and the skeleton connected by
+keypoint pairs. Meanwhile, we incorporate the information from each layer of
+the encoder into the decoder section. We conduct an extensive evaluation of
+Key-Grid on a list of benchmark datasets. Key-Grid achieves state-of-the-art
+performance in the semantic consistency and position accuracy of keypoints.
+Moreover, we demonstrate the robustness of Key-Grid to noise and downsampling.
+In addition, we achieve SE(3) invariance of keypoints through generalizing
+Key-Grid to an SE(3)-invariant backbone.
+
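+ The grid-heatmap construction lends itself to a short sketch: the value at
+each uniformly sampled grid point is its shortest distance to the skeleton
+segments connecting keypoint pairs. The cube bounds, resolution, and the
+all-pairs skeleton are assumptions for illustration:
+
+```python
+import numpy as np
+
+def point_to_segment_dist(points, a, b):
+    """Shortest distance from each 3D point to the segment a-b."""
+    ab = b - a
+    t = np.clip(((points - a) @ ab) / (ab @ ab + 1e-12), 0.0, 1.0)
+    return np.linalg.norm(points - (a + t[:, None] * ab), axis=1)
+
+def grid_heatmap(keypoints, grid_res=16):
+    lin = np.linspace(-1.0, 1.0, grid_res)
+    gz, gy, gx = np.meshgrid(lin, lin, lin, indexing="ij")
+    grid = np.stack([gx, gy, gz], axis=-1).reshape(-1, 3)
+    dmin = np.full(grid.shape[0], np.inf)
+    for i in range(len(keypoints)):
+        for j in range(i + 1, len(keypoints)):
+            dmin = np.minimum(dmin,
+                              point_to_segment_dist(grid, keypoints[i], keypoints[j]))
+    return dmin.reshape(grid_res, grid_res, grid_res)
+
+kps = np.random.uniform(-1, 1, size=(8, 3))
+print(grid_heatmap(kps).shape)  # (16, 16, 16)
+```
+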
+
+
+
+
+ + ☆ Efficient Semantic Segmentation via Lightweight Multiple-Information + Interaction Network + + +
+ Recently, the integration of the local modeling capabilities of Convolutional +Neural Networks (CNNs) with the global dependency strengths of Transformers has +created a sensation in the semantic segmentation community. However, +substantial computational workloads and high hardware memory demands remain +major obstacles to their further application in real-time scenarios. In this +work, we propose a lightweight multiple-information interaction network for +real-time semantic segmentation, called LMIINet, which effectively combines +CNNs and Transformers while reducing redundant computations and memory +footprint. It features Lightweight Feature Interaction Bottleneck (LFIB) +modules comprising efficient convolutions that enhance context integration. +Additionally, improvements are made to the Flatten Transformer by enhancing +local and global feature interaction to capture detailed semantic information. +The incorporation of a combination coefficient learning scheme in both LFIB and +Transformer blocks facilitates improved feature interaction. Extensive +experiments demonstrate that LMIINet excels in balancing accuracy and +efficiency. With only 0.72M parameters and 11.74G FLOPs, LMIINet achieves 72.0% +mIoU at 100 FPS on the Cityscapes test set and 69.94% mIoU at 160 FPS on the +CamVid test dataset using a single RTX2080Ti GPU. + +
+
+ comment: 10 pages, 6 figures, 9 tables +
+
+
+
+
+ + ☆ Capturing complex hand movements and object interactions using machine + learning-powered stretchable smart textile gloves + + +
+ Accurate real-time tracking of dexterous hand movements and interactions has
+numerous applications in human-computer interaction, metaverse, robotics, and
+tele-health. Capturing realistic hand movements is challenging because of the
+large number of articulations and degrees of freedom. Here, we report accurate
+and dynamic tracking of articulated hand and finger movements using
+stretchable, washable smart gloves with embedded helical sensor yarns and
+inertial measurement units. The sensor yarns have a high dynamic range,
+responding to strains from as low as 0.005% to as high as 155%, and show
+stability during extensive use and washing cycles. We use multi-stage machine
+learning to report average joint angle estimation root mean square errors of
+1.21 and 1.45 degrees for intra- and inter-subject cross-validation,
+respectively, matching the accuracy of costly motion capture cameras without
+occlusion or field of view limitations. We report a data augmentation technique
+that enhances robustness to noise and sensor variations. We demonstrate
+accurate tracking of dexterous hand movements during object interactions,
+opening new avenues of applications including accurate typing on a mock paper
+keyboard, recognition of complex dynamic and static gestures adapted from
+American Sign Language, and object identification.
+
+
+
+
+
+ + ☆ Stochastic Sampling from Deterministic Flow Models ICLR 2025 + + +
+ Deterministic flow models, such as rectified flows, offer a general framework +for learning a deterministic transport map between two distributions, realized +as the vector field for an ordinary differential equation (ODE). However, they +are sensitive to model estimation and discretization errors and do not permit +different samples conditioned on an intermediate state, limiting their +application. We present a general method to turn the underlying ODE of such +flow models into a family of stochastic differential equations (SDEs) that have +the same marginal distributions. This method permits us to derive families of +\emph{stochastic samplers}, for fixed (e.g., previously trained) +\emph{deterministic} flow models, that continuously span the spectrum of +deterministic and stochastic sampling, given access to the flow field and the +score function. Our method provides additional degrees of freedom that help +alleviate the issues with the deterministic samplers and empirically +outperforms them. We empirically demonstrate advantages of our method on a toy +Gaussian setup and on the large scale ImageNet generation task. Further, our +family of stochastic samplers provide an additional knob for controlling the +diversity of generation, which we qualitatively demonstrate in our experiments. + +
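+ One standard construction with this property, given the flow field v and the
+score of the marginals, is dx = [v(x,t) + 0.5*g(t)^2 * score(x,t)] dt + g(t) dW;
+the Euler-Maruyama sketch below uses toy velocity/score fields and is only an
+illustration of the idea, not the paper's samplers:
+
+```python
+import numpy as np
+
+def stochastic_flow_sampler(x0, velocity, score, g, n_steps=200):
+    """Euler-Maruyama for dx = [v + 0.5*g^2*score] dt + g dW on t in [0, 1]."""
+    x = np.array(x0, dtype=np.float64)
+    dt = 1.0 / n_steps
+    for i in range(n_steps):
+        t = i * dt
+        drift = velocity(x, t) + 0.5 * g(t) ** 2 * score(x, t)
+        x = x + drift * dt + g(t) * np.sqrt(dt) * np.random.randn(*x.shape)
+    return x
+
+# Toy flow transporting N(0, I) to a point mass at mu: x_t ~ N(t*mu, (1-t)^2 I).
+mu = np.array([2.0, -1.0])
+v = lambda x, t: (mu - x) / max(1.0 - t, 1e-3)
+s = lambda x, t: -(x - t * mu) / max((1.0 - t) ** 2, 1e-3)
+samples = np.stack([stochastic_flow_sampler(np.random.randn(2), v, s,
+                                             g=lambda t: 0.5) for _ in range(16)])
+print(samples.mean(axis=0))  # close to mu
+```
+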
+
+ comment: Submitted to ICLR 2025 +
+
+
+
+
+ + ☆ Hard Negative Sample Mining for Whole Slide Image Classification MICCAI 2024 + + +
+ Weakly supervised whole slide image (WSI) classification is challenging due to
+the lack of patch-level labels and high computational costs. State-of-the-art
+methods use self-supervised patch-wise feature representations for multiple
+instance learning (MIL). Recently, methods have been proposed to fine-tune the
+feature representation on the downstream task using pseudo labeling, but they
+mostly focus on selecting high-quality positive patches. In this paper, we
+propose to mine hard negative samples during fine-tuning. This allows us to
+obtain better feature representations and reduce the training cost.
+Furthermore, we propose a novel patch-wise ranking loss in MIL to better
+exploit these hard negative samples. Experiments on two public datasets
+demonstrate the efficacy of these proposed ideas. Our code is available at
+https://github.com/winston52/HNM-WSI
+
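+ A small numpy sketch of the mining step described above: from each negative
+slide, take the patches the current model scores highest as positive and treat
+them as hard negatives for fine-tuning (data layout and top_k are assumptions):
+
+```python
+import numpy as np
+
+def mine_hard_negatives(patch_scores, bag_labels, bag_ids, top_k=8):
+    """patch_scores: predicted positive probabilities per patch;
+    bag_labels: dict bag_id -> 0/1 slide label; bag_ids: bag id per patch."""
+    hard = []
+    for b, label in bag_labels.items():
+        if label != 0:
+            continue                      # only mine from negative slides
+        idx = np.where(bag_ids == b)[0]
+        ranked = idx[np.argsort(-patch_scores[idx])]
+        hard.extend(ranked[:top_k].tolist())
+    return hard
+
+scores = np.random.rand(1000)
+bags = np.repeat(np.arange(10), 100)
+labels = {b: int(b < 5) for b in range(10)}   # bags 5..9 are negative slides
+print(len(mine_hard_negatives(scores, labels, bags)))   # 5 * 8 = 40
+```
+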
+
+ comment: 13 pages, 4 figures, accepted by MICCAI 2024 +
+
+
+
+
+ + ☆ Adapting Segment Anything Model to Melanoma Segmentation in Microscopy + Slide Images + + +
+ Melanoma segmentation in Whole Slide Images (WSIs) is useful for prognosis +and the measurement of crucial prognostic factors such as Breslow depth and +primary invasive tumor size. In this paper, we present a novel approach that +uses the Segment Anything Model (SAM) for automatic melanoma segmentation in +microscopy slide images. Our method employs an initial semantic segmentation +model to generate preliminary segmentation masks that are then used to prompt +SAM. We design a dynamic prompting strategy that uses a combination of centroid +and grid prompts to achieve optimal coverage of the super high-resolution slide +images while maintaining the quality of generated prompts. To optimize for +invasive melanoma segmentation, we further refine the prompt generation process +by implementing in-situ melanoma detection and low-confidence region filtering. +We select Segformer as the initial segmentation model and EfficientSAM as the +segment anything model for parameter-efficient fine-tuning. Our experimental +results demonstrate that this approach not only surpasses other +state-of-the-art melanoma segmentation methods but also significantly +outperforms the baseline Segformer by 9.1% in terms of IoU. + +
+
+
+
+
+ + ☆ Remember and Recall: Associative-Memory-based Trajectory Prediction + + +
+ Trajectory prediction is a pivotal component of autonomous driving systems, +enabling the application of accumulated movement experience to current +scenarios. Although most existing methods concentrate on learning continuous +representations to gain valuable experience, they often suffer from +computational inefficiencies and struggle with unfamiliar situations. To +address this issue, we propose the Fragmented-Memory-based Trajectory +Prediction (FMTP) model, inspired by the remarkable learning capabilities of +humans, particularly their ability to leverage accumulated experience and +recall relevant memories in unfamiliar situations. The FMTP model employs +discrete representations to enhance computational efficiency by reducing +information redundancy while maintaining the flexibility to utilize past +experiences. Specifically, we design a learnable memory array by consolidating +continuous trajectory representations from the training set using defined +quantization operations during the training phase. This approach further +eliminates redundant information while preserving essential features in +discrete form. Additionally, we develop an advanced reasoning engine based on +language models to deeply learn the associative rules among these discrete +representations. Our method has been evaluated on various public datasets, +including ETH-UCY, inD, SDD, nuScenes, Waymo, and VTL-TP. The extensive +experimental results demonstrate that our approach achieves significant +performance and extracts more valuable experience from past trajectories to +inform the current state. + +
+
+
+
+
+ + ☆ BadCM: Invisible Backdoor Attack Against Cross-Modal Learning + + +
+ Despite remarkable successes in unimodal learning tasks, backdoor attacks +against cross-modal learning are still underexplored due to the limited +generalization and inferior stealthiness when involving multiple modalities. +Notably, since works in this area mainly inherit ideas from unimodal visual +attacks, they struggle with dealing with diverse cross-modal attack +circumstances and manipulating imperceptible trigger samples, which hinders +their practicability in real-world applications. In this paper, we introduce a +novel bilateral backdoor to fill in the missing pieces of the puzzle in the +cross-modal backdoor and propose a generalized invisible backdoor framework +against cross-modal learning (BadCM). Specifically, a cross-modal mining scheme +is developed to capture the modality-invariant components as target poisoning +areas, where well-designed trigger patterns injected into these regions can be +efficiently recognized by the victim models. This strategy is adapted to +different image-text cross-modal models, making our framework available to +various attack scenarios. Furthermore, for generating poisoned samples of high +stealthiness, we conceive modality-specific generators for visual and +linguistic modalities that facilitate hiding explicit trigger patterns in +modality-invariant regions. To the best of our knowledge, BadCM is the first +invisible backdoor method deliberately designed for diverse cross-modal attacks +within one unified framework. Comprehensive experimental evaluations on two +typical applications, i.e., cross-modal retrieval and VQA, demonstrate the +effectiveness and generalization of our method under multiple kinds of attack +scenarios. Moreover, we show that BadCM can robustly evade existing backdoor +defenses. Our code is available at https://github.com/xandery-geek/BadCM. + +
+
+
+
+
+ + ☆ HATFormer: Historic Handwritten Arabic Text Recognition with + Transformers + + +
+ Arabic handwritten text recognition (HTR) is challenging, especially for +historical texts, due to diverse writing styles and the intrinsic features of +Arabic script. Additionally, Arabic handwriting datasets are smaller compared +to English ones, making it difficult to train generalizable Arabic HTR models. +To address these challenges, we propose HATFormer, a transformer-based +encoder-decoder architecture that builds on a state-of-the-art English HTR +model. By leveraging the transformer's attention mechanism, HATFormer captures +spatial contextual information to address the intrinsic challenges of Arabic +script through differentiating cursive characters, decomposing visual +representations, and identifying diacritics. Our customization to historical +handwritten Arabic includes an image processor for effective ViT information +preprocessing, a text tokenizer for compact Arabic text representation, and a +training pipeline that accounts for a limited amount of historic Arabic +handwriting data. HATFormer achieves a character error rate (CER) of 8.6% on +the largest public historical handwritten Arabic dataset, with a 51% +improvement over the best baseline in the literature. HATFormer also attains a +comparable CER of 4.2% on the largest private non-historical dataset. Our work +demonstrates the feasibility of adapting an English HTR method to a +low-resource language with complex, language-specific challenges, contributing +to advancements in document digitization, information retrieval, and cultural +preservation. + +
+
+
+
+
+ + ☆ From Pixels to Tokens: Byte-Pair Encoding on Quantized Visual Modalities + + +
+ Multimodal Large Language Models have made significant strides in integrating +visual and textual information, yet they often struggle with effectively +aligning these modalities. We introduce a novel image tokenizer that bridges +this gap by applying the principle of Byte-Pair Encoding (BPE) to visual data. +Unlike conventional approaches that rely on separate visual encoders, our +method directly incorporates structural prior information into image tokens, +mirroring the successful tokenization strategies used in text-only Large +Language Models. This innovative approach enables Transformer models to more +effectively learn and reason across modalities. Through theoretical analysis +and extensive experiments, we demonstrate that our BPE Image Tokenizer +significantly enhances MLLMs' multimodal understanding capabilities, even with +limited training data. Our method not only improves performance across various +benchmarks but also shows promising scalability, potentially paving the way for +more efficient and capable multimodal foundation models. + +
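+ The core idea can be illustrated with a plain BPE merge loop over sequences of
+discrete visual codes (e.g. indices from a vector-quantized codebook); the
+paper's tokenizer likely differs in detail:
+
+```python
+from collections import Counter
+
+def learn_bpe_merges(sequences, num_merges=100):
+    """Greedily fuse the most frequent adjacent pair of codes into a new id."""
+    seqs = [list(s) for s in sequences]
+    next_id = max(t for s in seqs for t in s) + 1
+    merges = {}
+    for _ in range(num_merges):
+        pairs = Counter()
+        for s in seqs:
+            pairs.update(zip(s, s[1:]))
+        if not pairs:
+            break
+        (a, b), _ = pairs.most_common(1)[0]
+        merges[(a, b)] = next_id
+        new_seqs = []
+        for s in seqs:
+            out, i = [], 0
+            while i < len(s):
+                if i + 1 < len(s) and s[i] == a and s[i + 1] == b:
+                    out.append(next_id)
+                    i += 2
+                else:
+                    out.append(s[i])
+                    i += 1
+            new_seqs.append(out)
+        seqs, next_id = new_seqs, next_id + 1
+    return merges, seqs
+
+codes = [[1, 2, 1, 2, 3], [1, 2, 3, 1, 2]]   # toy rows of quantized image codes
+print(learn_bpe_merges(codes, num_merges=2))
+```
+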
+
+
+
+
+ + ☆ An Evaluation of Large Pre-Trained Models for Gesture Recognition using + Synthetic Videos SP + + +
+ In this work, we explore the possibility of using synthetically generated +data for video-based gesture recognition with large pre-trained models. We +consider whether these models have sufficiently robust and expressive +representation spaces to enable "training-free" classification. Specifically, +we utilize various state-of-the-art video encoders to extract features for use +in k-nearest neighbors classification, where the training data points are +derived from synthetic videos only. We compare these results with another +training-free approach -- zero-shot classification using text descriptions of +each gesture. In our experiments with the RoCoG-v2 dataset, we find that using +synthetic training videos yields significantly lower classification accuracy on +real test videos compared to using a relatively small number of real training +videos. We also observe that video backbones that were fine-tuned on +classification tasks serve as superior feature extractors, and that the choice +of fine-tuning data has a substantial impact on k-nearest neighbors +performance. Lastly, we find that zero-shot text-based classification performs +poorly on the gesture recognition task, as gestures are not easily described +through natural language. + +
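+ The "training-free" protocol above reduces to a simple nearest-neighbour
+classifier over frozen-encoder features; the cosine-similarity kNN below is a
+generic sketch with made-up shapes, not the paper's exact setup:
+
+```python
+import numpy as np
+
+def knn_classify(train_feats, train_labels, test_feats, k=5):
+    """Cosine-similarity kNN: synthetic-video features act as the 'training'
+    set, real-video features are the queries."""
+    def norm(x):
+        return x / (np.linalg.norm(x, axis=1, keepdims=True) + 1e-12)
+    sims = norm(test_feats) @ norm(train_feats).T
+    nn = np.argsort(-sims, axis=1)[:, :k]
+    return np.array([int(np.argmax(np.bincount(train_labels[row]))) for row in nn])
+
+synthetic = np.random.randn(200, 512)        # features of synthetic clips
+labels = np.random.randint(0, 7, size=200)   # 7 gesture classes
+real = np.random.randn(20, 512)              # features of real test clips
+print(knn_classify(synthetic, labels, real))
+```
+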
+
+ comment: Synthetic Data for Artificial Intelligence and Machine Learning: + Tools, Techniques, and Applications II (SPIE Defense + Commercial Sensing, + 2024) +
+
+
+
+
+ + ☆ MDSGen: Fast and Efficient Masked Diffusion Temporal-Aware Transformers + for Open-Domain Sound Generation + + +
+ We introduce MDSGen, a novel framework for vision-guided open-domain sound +generation optimized for model parameter size, memory consumption, and +inference speed. This framework incorporates two key innovations: (1) a +redundant video feature removal module that filters out unnecessary visual +information, and (2) a temporal-aware masking strategy that leverages temporal +context for enhanced audio generation accuracy. In contrast to existing +resource-heavy Unet-based models, MDSGen employs denoising masked diffusion +transformers, facilitating efficient generation without reliance on pre-trained +diffusion models. Evaluated on the benchmark VGGSound dataset, our smallest +model (5M parameters) achieves 97.9% alignment accuracy, using 172x fewer +parameters, 371% less memory, and offering 36x faster inference than the +current 860M-parameter state-of-the-art model (93.9% accuracy). The larger +model (131M parameters) reaches nearly 99% accuracy while requiring 6.5x fewer +parameters. These results highlight the scalability and effectiveness of our +approach. + +
+
+ comment: 21 pages, 16 figures +
+
+
+
+
+ + ☆ DMC-Net: Lightweight Dynamic Multi-Scale and Multi-Resolution + Convolution Network for Pancreas Segmentation in CT Images + + +
+ Convolutional neural networks (CNNs) have shown great effectiveness in +medical image segmentation. However, they may be limited in modeling large +inter-subject variations in organ shapes and sizes and exploiting global +long-range contextual information. This is because CNNs typically employ +convolutions with fixed-sized local receptive fields and lack the mechanisms to +utilize global information. To address these limitations, we developed Dynamic +Multi-Resolution Convolution (DMRC) and Dynamic Multi-Scale Convolution (DMSC) +modules. Both modules enhance the representation capabilities of single +convolutions to capture varying scaled features and global contextual +information. This is achieved in the DMRC module by employing a convolutional +filter on images with different resolutions and subsequently utilizing dynamic +mechanisms to model global inter-dependencies between features. In contrast, +the DMSC module extracts features at different scales by employing convolutions +with different kernel sizes and utilizing dynamic mechanisms to extract global +contextual information. The utilization of convolutions with different kernel +sizes in the DMSC module may increase computational complexity. To lessen this +burden, we propose to use a lightweight design for convolution layers with a +large kernel size. Thus, DMSC and DMRC modules are designed as lightweight +drop-in replacements for single convolutions, and they can be easily integrated +into general CNN architectures for end-to-end training. The segmentation +network was proposed by incorporating our DMSC and DMRC modules into a standard +U-Net architecture, termed Dynamic Multi-scale and Multi-resolution Convolution +network (DMC-Net). The results demonstrate that our proposed DMSC and DMRC can +enhance the representation capabilities of single convolutions and improve +segmentation accuracy. + +
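+ A compact PyTorch stand-in for the multi-kernel, dynamically weighted
+convolution idea (depthwise branches fused by channel-squeeze gating); the
+module name and details are illustrative, not the authors' DMSC/DMRC design:
+
+```python
+import torch
+import torch.nn as nn
+
+class MultiScaleDynamicConv(nn.Module):
+    def __init__(self, channels, kernel_sizes=(3, 5, 7)):
+        super().__init__()
+        # Depthwise branches keep the large-kernel paths lightweight.
+        self.branches = nn.ModuleList(
+            nn.Conv2d(channels, channels, k, padding=k // 2, groups=channels)
+            for k in kernel_sizes)
+        self.gate = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(channels, len(kernel_sizes), kernel_size=1),
+            nn.Softmax(dim=1))
+
+    def forward(self, x):
+        feats = torch.stack([b(x) for b in self.branches], dim=1)  # (B,S,C,H,W)
+        w = self.gate(x).unsqueeze(2)                              # (B,S,1,1,1)
+        return (w * feats).sum(dim=1)
+
+x = torch.randn(2, 32, 64, 64)
+print(MultiScaleDynamicConv(32)(x).shape)  # torch.Size([2, 32, 64, 64])
+```
+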
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ CMP: Cooperative Motion Prediction with Multi-Agent Communication + + +
+ The confluence of the advancement of Autonomous Vehicles (AVs) and the +maturity of Vehicle-to-Everything (V2X) communication has enabled the +capability of cooperative connected and automated vehicles (CAVs). Building on +top of cooperative perception, this paper explores the feasibility and +effectiveness of cooperative motion prediction. Our method, CMP, takes LiDAR +signals as model input to enhance tracking and prediction capabilities. Unlike +previous work that focuses separately on either cooperative perception or +motion prediction, our framework, to the best of our knowledge, is the first to +address the unified problem where CAVs share information in both perception and +prediction modules. Incorporated into our design is the unique capability to +tolerate realistic V2X bandwidth limitations and transmission delays, while +dealing with bulky perception representations. We also propose a prediction +aggregation module, which unifies the predictions obtained by different CAVs +and generates the final prediction. Through extensive experiments and ablation +studies on the OPV2V and V2V4Real datasets, we demonstrate the effectiveness of +our method in cooperative perception, tracking, and motion prediction. In +particular, CMP reduces the average prediction error by 16.4\% with fewer +missing detections compared with the no cooperation setting and by 12.3\% +compared with the strongest baseline. Our work marks a significant step forward +in the cooperative capabilities of CAVs, showcasing enhanced performance in +complex scenarios. The code can be found on the project website: +https://cmp-cooperative-prediction.github.io/. + +
+
+ comment: Project website: https://cmp-cooperative-prediction.github.io/ +
+
+
+
+
+ + ♻ ☆ NVDS+: Towards Efficient and Versatile Neural Stabilizer for Video Depth + Estimation ICCV 2023 + + +
+ Video depth estimation aims to infer temporally consistent depth. One +approach is to finetune a single-image model on each video with geometry +constraints, which proves inefficient and lacks robustness. An alternative is +learning to enforce consistency from data, which requires well-designed models +and sufficient video depth data. To address both challenges, we introduce NVDS+ +that stabilizes inconsistent depth estimated by various single-image models in +a plug-and-play manner. We also elaborate a large-scale Video Depth in the Wild +(VDW) dataset, which contains 14,203 videos with over two million frames, +making it the largest natural-scene video depth dataset. Additionally, a +bidirectional inference strategy is designed to improve consistency by +adaptively fusing forward and backward predictions. We instantiate a model +family ranging from small to large scales for different applications. The +method is evaluated on VDW dataset and three public benchmarks. To further +prove the versatility, we extend NVDS+ to video semantic segmentation and +several downstream applications like bokeh rendering, novel view synthesis, and +3D reconstruction. Experimental results show that our method achieves +significant improvements in consistency, accuracy, and efficiency. Our work +serves as a solid baseline and data foundation for learning-based video depth +estimation. Code and dataset are available at: +https://github.com/RaymondWang987/NVDS + +
+
+ comment: V1/V2: ICCV 2023 accepted; V3: the journal extension accepted by IEEE + TPAMI 2024 +
+
+
+
+
+ + ♻ ☆ SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual + Editing ECCV 2024 + + +
+ Effective editing of personal content plays a pivotal role in enabling
+individuals to express their creativity, weave captivating narratives within
+their visual stories, and elevate the overall quality and impact of their
+visual content. Therefore, in this work, we introduce SwapAnything, a novel
+framework that can swap any objects in an image with personalized concepts
+given by the reference, while keeping the context unchanged. Compared with
+existing methods for personalized subject swapping, SwapAnything has three
+unique advantages: (1) precise control of arbitrary objects and parts rather
+than the main subject, (2) more faithful preservation of context pixels, and
+(3) better adaptation of the personalized concept to the image. First, we
+propose targeted variable swapping to apply region control over latent feature
+maps and swap masked variables for faithful context preservation and initial
+semantic concept swapping. Then, we introduce appearance adaptation to
+seamlessly adapt the semantic concept into the original image in terms of
+target location, shape, style, and content during the image generation process.
+Extensive results on both human and automatic evaluation demonstrate
+significant improvements of our approach over baseline methods on personalized
+swapping. Furthermore, SwapAnything shows its precise and faithful swapping
+abilities across single object, multiple objects, partial object, and
+cross-domain swapping tasks. SwapAnything also achieves great performance on
+text-based swapping and tasks beyond swapping such as object insertion.
+
+
+ comment: ECCV 2024, 23 pages, 14 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Towards Foundation Models and Few-Shot Parameter-Efficient Fine-Tuning + for Volumetric Organ Segmentation MICCAI + + +
+ The recent popularity of foundation models and the pre-train-and-adapt
+paradigm, where a large-scale model is transferred to downstream tasks, is
+gaining attention for volumetric medical image segmentation. However, current
+transfer learning strategies, devoted to full fine-tuning, may require
+significant resources and yield sub-optimal results when the labeled data of
+the target task is scarce. This makes their applicability in real clinical
+settings challenging, since these institutions are usually constrained in data
+and computational resources to develop proprietary solutions. To address this
+challenge, we formalize Few-Shot Efficient Fine-Tuning (FSEFT), a novel and
+realistic scenario for adapting medical image segmentation foundation models.
+This setting considers the key role of both data- and parameter-efficiency
+during adaptation. Building on a foundation model pre-trained on open-access CT
+organ segmentation sources, we propose leveraging Parameter-Efficient
+Fine-Tuning and black-box Adapters to address such challenges. Furthermore,
+novel efficient adaptation methodologies are introduced in this work, which
+include Spatial black-box Adapters that are more appropriate for dense
+prediction tasks and constrained transductive inference, leveraging
+task-specific prior knowledge. Our comprehensive transfer learning experiments
+confirm the suitability of foundation models in medical image segmentation and
+unveil the limitations of popular fine-tuning strategies in few-shot scenarios.
+
+
+ comment: Journal Extension of MICCAI - MedAGI Workshop 2023. Code in + https://github.com/jusiro/fewshot-finetuning +
+
+
+
+
+ + ♻ ☆ Autoregressive Pre-Training on Pixels and Texts EMNLP 2024 + + +
+ The integration of visual and textual information represents a promising +direction in the advancement of language models. In this paper, we explore the +dual modality of language--both visual and textual--within an autoregressive +framework, pre-trained on both document images and texts. Our method employs a +multimodal training strategy, utilizing visual data through next patch +prediction with a regression head and/or textual data through next token +prediction with a classification head. We focus on understanding the +interaction between these two modalities and their combined impact on model +performance. Our extensive evaluation across a wide range of benchmarks shows +that incorporating both visual and textual data significantly improves the +performance of pixel-based language models. Remarkably, we find that a +unidirectional pixel-based model trained solely on visual data can achieve +comparable results to state-of-the-art bidirectional models on several language +understanding tasks. This work uncovers the untapped potential of integrating +visual and textual modalities for more effective language modeling. We release +our code, data, and model checkpoints at +\url{https://github.com/ernie-research/pixelgpt}. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ VideoPhy: Evaluating Physical Commonsense for Video Generation + + +
+ Recent advances in internet-scale video data pretraining have led to the
+development of text-to-video generative models that can create high-quality
+videos across a broad range of visual concepts, synthesize realistic motions,
+and render complex objects. Hence, these generative models have the potential
+to become general-purpose simulators of the physical world. However, it is
+unclear how far we are from this goal with the existing text-to-video
+generative models. To this end, we present VideoPhy, a benchmark designed to
+assess whether the generated videos follow physical commonsense for real-world
+activities (e.g. marbles will roll down when placed on a slanted surface).
+Specifically, we curate diverse prompts that involve interactions between
+various material types in the physical world (e.g., solid-solid, solid-fluid,
+fluid-fluid). We then generate videos conditioned on these captions from
+diverse state-of-the-art text-to-video generative models, including open models
+(e.g., CogVideoX) and closed models (e.g., Lumiere, Dream Machine). Our human
+evaluation reveals that the existing models severely lack the ability to
+generate videos adhering to the given text prompts, while also lacking physical
+commonsense. Specifically, the best performing model, CogVideoX-5B, generates
+videos that adhere to the caption and physical laws for 39.6% of the instances.
+VideoPhy thus highlights that the video generative models are far from
+accurately simulating the physical world. Finally, we propose an
+auto-evaluator, VideoCon-Physics, to assess the performance reliably for the
+newly released models.
+
+
+ comment: 43 pages, 29 figures, 12 tables. Added CogVideo and Dream Machine in + v2 +
+
+
+
+
+ + ♻ ☆ Generalizing Medical Image Representations via Quaternion Wavelet + Networks + + +
+ Neural network generalizability is becoming a broad research field due to the +increasing availability of datasets from different sources and for various +tasks. This issue is even wider when processing medical data, where a lack of +methodological standards causes large variations being provided by different +imaging centers or acquired with various devices and cofactors. To overcome +these limitations, we introduce a novel, generalizable, data- and task-agnostic +framework able to extract salient features from medical images. The proposed +quaternion wavelet network (QUAVE) can be easily integrated with any +pre-existing medical image analysis or synthesis task, and it can be involved +with real, quaternion, or hypercomplex-valued models, generalizing their +adoption to single-channel data. QUAVE first extracts different sub-bands +through the quaternion wavelet transform, resulting in both +low-frequency/approximation bands and high-frequency/fine-grained features. +Then, it weighs the most representative set of sub-bands to be involved as +input to any other neural model for image processing, replacing standard data +samples. We conduct an extensive experimental evaluation comprising different +datasets, diverse image analysis, and synthesis tasks including reconstruction, +segmentation, and modality translation. We also evaluate QUAVE in combination +with both real and quaternion-valued models. Results demonstrate the +effectiveness and the generalizability of the proposed framework that improves +network performance while being flexible to be adopted in manifold scenarios +and robust to domain shifts. The full code is available at: +https://github.com/ispamm/QWT. + +
+
+ comment: This paper is currently under review +
+
+
+
+
+ + ♻ ☆ Evaluating Perceptual Distance Models by Fitting Binomial Distributions + to Two-Alternative Forced Choice Data + + +
+ The two-alternative forced choice (2AFC) experimental method is popular in +the visual perception literature, where practitioners aim to understand how +human observers perceive distances within triplets made of a reference image +and two distorted versions. In the past, this had been conducted in controlled +environments, with triplets sharing images, so it was possible to rank the +perceived quality. This ranking would then be used to evaluate perceptual +distance models against the experimental data. Recently, crowd-sourced +perceptual datasets have emerged, with no images shared between triplets, +making ranking infeasible. Evaluating perceptual distance models using this +data reduces the judgements on a triplet to a binary decision, namely, whether +the distance model agrees with the human decision - which is suboptimal and +prone to misleading conclusions. Instead, we statistically model the underlying +decision-making process during 2AFC experiments using a binomial distribution. +Having enough empirical data, we estimate a smooth and consistent distribution +of the judgements on the reference-distorted distance plane, according to each +distance model. By applying maximum likelihood, we estimate the parameter of +the local binomial distribution, and a global measurement of the expected +log-likelihood of the measured responses. We calculate meaningful and +well-founded metrics for the distance model, beyond the mere prediction +accuracy as percentage agreement, even with variable numbers of judgements per +triplet -- key advantages over both classical and neural network methods. + +
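+ A small numpy sketch of the likelihood-based evaluation: model the per-triplet
+votes as binomial with a choice probability given by a logistic link on the
+distance difference, then fit the link's slope by maximum likelihood (the link
+and grid search are assumptions, not necessarily the paper's decision model):
+
+```python
+import numpy as np
+
+def twoafc_log_likelihood(d1, d2, n_choose_1, n_trials, beta):
+    """d1, d2: model distances of the two distortions from the reference;
+    n_choose_1: votes for image 1 being closer; n_trials: total votes."""
+    p1 = 1.0 / (1.0 + np.exp(-beta * (d2 - d1)))   # larger d2 -> pick image 1
+    p1 = np.clip(p1, 1e-6, 1 - 1e-6)
+    return np.sum(n_choose_1 * np.log(p1) + (n_trials - n_choose_1) * np.log(1 - p1))
+
+def fit_beta(d1, d2, n_choose_1, n_trials, grid=np.linspace(0.01, 20, 400)):
+    lls = [twoafc_log_likelihood(d1, d2, n_choose_1, n_trials, b) for b in grid]
+    return grid[int(np.argmax(lls))], float(max(lls))
+
+d1, d2 = np.random.rand(300), np.random.rand(300)
+votes = np.random.binomial(n=10, p=0.5, size=300)     # toy judgement counts
+print(fit_beta(d1, d2, votes, np.full(300, 10)))
+```
+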
+
+
+
+
+ + ♻ ☆ Context and Geometry Aware Voxel Transformer for Semantic Scene + Completion NIPS 2024 + + +
+ Vision-based Semantic Scene Completion (SSC) has gained much attention due to
+its widespread applications in various 3D perception tasks. Existing
+sparse-to-dense approaches typically employ shared context-independent queries
+across various input images, which fails to capture distinctions among them as
+the focal regions of different inputs vary and may result in undirected feature
+aggregation of cross-attention. Additionally, the absence of depth information
+may lead to points projected onto the image plane sharing the same 2D position
+or similar sampling points in the feature map, resulting in depth ambiguity. In
+this paper, we present a novel context and geometry aware voxel transformer. It
+utilizes a context-aware query generator to initialize context-dependent
+queries tailored to individual input images, effectively capturing their unique
+characteristics and aggregating information within the region of interest.
+Furthermore, it extends deformable cross-attention from 2D to 3D pixel space,
+enabling the differentiation of points with similar image coordinates based on
+their depth coordinates. Building upon this module, we introduce a neural
+network named CGFormer to achieve semantic scene completion. Simultaneously,
+CGFormer leverages multiple 3D representations (i.e., voxel and TPV) to boost
+the semantic and geometric representation abilities of the transformed 3D
+volume from both local and global perspectives. Experimental results
+demonstrate that CGFormer achieves state-of-the-art performance on the
+SemanticKITTI and SSCBench-KITTI-360 benchmarks, attaining an mIoU of 16.87 and
+20.05, as well as an IoU of 45.99 and 48.07, respectively. Remarkably, CGFormer
+even outperforms approaches employing temporal images as inputs or much larger
+image backbone networks.
+
+
+ comment: NIPS 2024 Spotlight +
+
+
+
+
+ + ♻ ☆ Leopard: A Vision Language Model For Text-Rich Multi-Image Tasks + + +
+ Text-rich images, where text serves as the central visual element guiding the overall understanding, are prevalent in real-world applications, such as presentation slides, scanned documents, and webpage snapshots.
+Tasks involving multiple text-rich images are especially challenging, as they require not only understanding the content of individual images but also reasoning about the inter-relationships and logical flows across multiple visual inputs.
+Despite the importance of these scenarios, current multimodal large language models (MLLMs) struggle to handle such tasks due to two key challenges: (1) the scarcity of high-quality instruction tuning datasets for text-rich multi-image scenarios, and (2) the difficulty in balancing image resolution with visual feature sequence length.
+To address these challenges, we propose Leopard, an MLLM designed specifically for handling vision-language tasks involving multiple text-rich images.
+First, we curated about one million high-quality multimodal instruction-tuning examples tailored to text-rich, multi-image scenarios.
+Second, we developed an adaptive high-resolution multi-image encoding module to dynamically optimize the allocation of visual sequence length based on the original aspect ratios and resolutions of the input images.
+Experiments across a wide range of benchmarks demonstrate our model's superior capabilities in text-rich, multi-image evaluations and competitive performance in general domain evaluations.
+
+
+ comment: Our code is available at https://github.com/Jill0001/Leopard +
+
+
+
+
+ + ♻ ☆ Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at + Any Resolution + + +
+ We present the Qwen2-VL Series, an advanced upgrade of the previous Qwen-VL models that redefines the conventional predetermined-resolution approach in visual processing.
+Qwen2-VL introduces the Naive Dynamic Resolution mechanism, which enables the model to dynamically process images of varying resolutions into different numbers of visual tokens.
+This approach allows the model to generate more efficient and accurate visual representations, closely aligning with human perceptual processes.
+The model also integrates Multimodal Rotary Position Embedding (M-RoPE), facilitating the effective fusion of positional information across text, images, and videos.
+We employ a unified paradigm for processing both images and videos, enhancing the model's visual perception capabilities.
+To explore the potential of large multimodal models, Qwen2-VL investigates the scaling laws for large vision-language models (LVLMs).
+By scaling both the model size (with versions at 2B, 8B, and 72B parameters) and the amount of training data, the Qwen2-VL Series achieves highly competitive performance.
+Notably, the Qwen2-VL-72B model achieves results comparable to leading models such as GPT-4o and Claude3.5-Sonnet across various multimodal benchmarks, outperforming other generalist models.
+Code is available at https://github.com/QwenLM/Qwen2-VL.
+
+
+ comment: Code is available at https://github.com/QwenLM/Qwen2-VL. arXiv admin + note: text overlap with arXiv:2408.15262 by other authors +
+
+
+
+
+ + ♻ ☆ BinaryDM: Accurate Weight Binarization for Efficient Diffusion Models + + +
+ With the advancement of diffusion models (DMs) and the substantially +increased computational requirements, quantization emerges as a practical +solution to obtain compact and efficient low-bit DMs. However, the highly +discrete representation leads to severe accuracy degradation, hindering the +quantization of diffusion models to ultra-low bit-widths. This paper proposes a +novel weight binarization approach for DMs, namely BinaryDM, pushing binarized +DMs to be accurate and efficient by improving the representation and +optimization. From the representation perspective, we present an +Evolvable-Basis Binarizer (EBB) to enable a smooth evolution of DMs from +full-precision to accurately binarized. EBB enhances information representation +in the initial stage through the flexible combination of multiple binary bases +and applies regularization to evolve into efficient single-basis binarization. +The evolution only occurs in the head and tail of the DM architecture to retain +the stability of training. From the optimization perspective, a Low-rank +Representation Mimicking (LRM) is applied to assist the optimization of +binarized DMs. The LRM mimics the representations of full-precision DMs in +low-rank space, alleviating the direction ambiguity of the optimization process +caused by fine-grained alignment. Comprehensive experiments demonstrate that +BinaryDM achieves significant accuracy and efficiency gains compared to SOTA +quantization methods of DMs under ultra-low bit-widths. With 1-bit weight and +4-bit activation (W1A4), BinaryDM achieves as low as 7.74 FID and saves the +performance from collapse (baseline FID 10.87). As the first binarization +method for diffusion models, W1A4 BinaryDM achieves impressive 15.2x OPs and +29.2x model size savings, showcasing its substantial potential for edge +deployment. + +
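+
+ For readers unfamiliar with weight binarization, the sketch below shows a generic 1-bit weight binarizer with a per-channel scale and a straight-through estimator. It is not BinaryDM's Evolvable-Basis Binarizer (which starts from several binary bases and regularizes toward a single one), and the layer sizes are arbitrary.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class BinarizeSTE(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, w):
+        scale = w.abs().mean(dim=(1, 2, 3), keepdim=True)   # per-output-channel scale
+        return torch.sign(w) * scale                         # 1-bit weights with scale
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        return grad_out                                       # straight-through gradient
+
+class BinaryConv2d(nn.Conv2d):
+    def forward(self, x):
+        w_bin = BinarizeSTE.apply(self.weight)
+        return F.conv2d(x, w_bin, self.bias, self.stride,
+                        self.padding, self.dilation, self.groups)
+
+layer = BinaryConv2d(16, 32, kernel_size=3, padding=1)
+y = layer(torch.randn(2, 16, 8, 8))
+print(y.shape)   # torch.Size([2, 32, 8, 8])
+```
+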
+
+ comment: The code is available at https://github.com/Xingyu-Zheng/BinaryDM +
+
+
+
+
+ + ♻ ☆ Learning an Actionable Discrete Diffusion Policy via Large-Scale + Actionless Video Pre-Training NeurIPS 2024 + + +
+ Learning a generalist embodied agent capable of completing multiple tasks poses challenges, primarily stemming from the scarcity of action-labeled robotic datasets.
+In contrast, a vast amount of human videos exist, capturing intricate tasks and interactions with the physical world.
+Promising prospects arise for utilizing actionless human videos for pre-training and transferring the knowledge to facilitate robot policy learning through limited robot demonstrations.
+However, this remains a challenge due to the domain gap between humans and robots.
+Moreover, it is difficult to extract useful information representing the dynamic world from human videos, because of their noisy and multimodal data structure.
+In this paper, we introduce a novel framework to tackle these challenges, which leverages a unified discrete diffusion to combine generative pre-training on human videos and policy fine-tuning on a small number of action-labeled robot videos.
+We start by compressing both human and robot videos into unified video tokens.
+In the pre-training stage, we employ a discrete diffusion model with a mask-and-replace diffusion strategy to predict future video tokens in the latent space.
+In the fine-tuning stage, we harness the imagined future videos to guide low-level action learning with a limited set of robot data.
+Experiments demonstrate that our method generates high-fidelity future videos for planning and enhances the fine-tuned policies, outperforming previous state-of-the-art approaches.
+Our project website is available at https://video-diff.github.io/.
+
+
+ comment: Accepted by NeurIPS 2024. 24 pages +
+
+
+
+
+ + ♻ ☆ MARVIS: Motion & Geometry Aware Real and Virtual Image Segmentation + + +
+ Tasks such as autonomous navigation, 3D reconstruction, and object recognition near the water surface are crucial in marine robotics applications.
+However, challenges arise due to dynamic disturbances, e.g., light reflections and refraction from the random air-water interface, irregular liquid flow, and similar factors, which can lead to potential failures in perception and navigation systems.
+Traditional computer vision algorithms struggle to differentiate between real and virtual image regions, significantly complicating these tasks.
+A virtual image region is an apparent representation formed by the redirection of light rays, typically through reflection or refraction, creating the illusion of an object's presence without its actual physical location.
+This work proposes a novel approach for segmenting real and virtual image regions, exploiting synthetic images combined with domain-invariant information, a Motion Entropy Kernel, and Epipolar Geometric Consistency.
+Our segmentation network does not need to be re-trained if the domain changes. We show this by deploying the same segmentation network in two different domains: simulation and the real world.
+By creating realistic synthetic images that mimic the complexities of the water surface, we provide fine-grained training data for our network (MARVIS) to discern between real and virtual images effectively.
+Through motion- and geometry-aware design choices and comprehensive experimental analysis, we achieve state-of-the-art real-virtual image segmentation performance in an unseen real-world domain, achieving an IoU over 78% and an F1-Score over 86% while ensuring a small computational footprint.
+MARVIS offers over 43 FPS (8 FPS) inference rates on a single GPU (CPU core).
+Our code and dataset are available at https://github.com/jiayi-wu-umd/MARVIS.
+
+
+
+
+
+ + ♻ ☆ Physics-Regularized Multi-Modal Image Assimilation for Brain Tumor + Localization NeurIPS 2024 + + +
+ Physical models in the form of partial differential equations represent an +important prior for many under-constrained problems. One example is tumor +treatment planning, which heavily depends on accurate estimates of the spatial +distribution of tumor cells in a patient's anatomy. Medical imaging scans can +identify the bulk of the tumor, but they cannot reveal its full spatial +distribution. Tumor cells at low concentrations remain undetectable, for +example, in the most frequent type of primary brain tumors, glioblastoma. +Deep-learning-based approaches fail to estimate the complete tumor cell +distribution due to a lack of reliable training data. Most existing works +therefore rely on physics-based simulations to match observed tumors, providing +anatomically and physiologically plausible estimations. However, these +approaches struggle with complex and unknown initial conditions and are limited +by overly rigid physical models. In this work, we present a novel method that +balances data-driven and physics-based cost functions. In particular, we +propose a unique discretization scheme that quantifies the adherence of our +learned spatiotemporal tumor and brain tissue distributions to their +corresponding growth and elasticity equations. This quantification, serving as +a regularization term rather than a hard constraint, enables greater +flexibility and proficiency in assimilating patient data than existing models. +We demonstrate improved coverage of tumor recurrence areas compared to existing +techniques on real-world data from a cohort of patients. The method holds the +potential to enhance clinical adoption of model-driven treatment planning for +glioblastoma. + +
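+
+ The core idea of penalising a growth equation rather than enforcing it exactly can be sketched as follows. The Fisher-KPP-style residual, the tiny MLP for the tumour-cell density, and all constants are illustrative placeholders, not the paper's model or its discretization scheme.
+
+```python
+import torch
+import torch.nn as nn
+
+# learned density u(x, t) in [0, 1]; 1D space is used only to keep the sketch short
+net = nn.Sequential(nn.Linear(2, 64), nn.Tanh(), nn.Linear(64, 64), nn.Tanh(),
+                    nn.Linear(64, 1), nn.Sigmoid())
+D, rho, lam = 0.1, 1.0, 0.5   # diffusion, proliferation, regularisation weight
+
+def pde_residual(x, t):
+    # residual of u_t = D * u_xx + rho * u * (1 - u), computed with autograd
+    x = x.requires_grad_(True); t = t.requires_grad_(True)
+    u = net(torch.cat([x, t], dim=1))
+    u_t = torch.autograd.grad(u.sum(), t, create_graph=True)[0]
+    u_x = torch.autograd.grad(u.sum(), x, create_graph=True)[0]
+    u_xx = torch.autograd.grad(u_x.sum(), x, create_graph=True)[0]
+    return u_t - D * u_xx - rho * u * (1 - u)
+
+x, t = torch.rand(256, 1), torch.rand(256, 1)                       # collocation points
+x_obs, t_obs, u_obs = torch.rand(64, 1), torch.rand(64, 1), torch.rand(64, 1)  # imaging data
+
+data_term = ((net(torch.cat([x_obs, t_obs], dim=1)) - u_obs) ** 2).mean()
+physics_term = (pde_residual(x, t) ** 2).mean()
+loss = data_term + lam * physics_term    # soft regulariser, not a hard constraint
+loss.backward()
+print(float(loss))
+```
+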
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Source-Free Domain Adaptation Guided by Vision and Vision-Language + Pre-Training ICCV + + +
+ Source-free domain adaptation (SFDA) aims to adapt a source model trained on +a fully-labeled source domain to a related but unlabeled target domain. While +the source model is a key avenue for acquiring target pseudolabels, the +generated pseudolabels may exhibit source bias. In the conventional SFDA +pipeline, a large data (e.g. ImageNet) pre-trained feature extractor is used to +initialize the source model at the start of source training, and subsequently +discarded. Despite having diverse features important for generalization, the +pre-trained feature extractor can overfit to the source data distribution +during source training and forget relevant target domain knowledge. Rather than +discarding this valuable knowledge, we introduce an integrated framework to +incorporate pre-trained networks into the target adaptation process. The +proposed framework is flexible and allows us to plug modern pre-trained +networks into the adaptation process to leverage their stronger representation +learning capabilities. For adaptation, we propose the Co-learn algorithm to +improve target pseudolabel quality collaboratively through the source model and +a pre-trained feature extractor. Building on the recent success of the +vision-language model CLIP in zero-shot image recognition, we present an +extension Co-learn++ to further incorporate CLIP's zero-shot classification +decisions. We evaluate on 4 benchmark datasets and include more challenging +scenarios such as open-set, partial-set and open-partial SFDA. Experimental +results demonstrate that our proposed strategy improves adaptation performance +and can be successfully integrated with existing SFDA methods. Project code is +available at https://github.com/zwenyu/colearn-plus. + +
+
+ comment: Extension of ICCV paper arXiv:2212.07585; Published at IJCV +
+
+
+
+
+ + ♻ ☆ SurgPointTransformer: Vertebrae Shape Completion with RGB-D Data + + +
+ State-of-the-art computer- and robot-assisted surgery systems heavily depend on intraoperative imaging technologies such as CT and fluoroscopy to generate detailed 3D visualization of the patient's anatomy.
+While these imaging techniques are highly accurate, they rely on ionizing radiation and expose patients and clinicians to it.
+This study introduces an alternative, radiation-free approach for reconstructing the 3D spine anatomy using RGB-D data.
+Drawing inspiration from the 3D "mental map" that surgeons form during surgeries, we introduce SurgPointTransformer, a shape completion approach for surgical applications that can accurately reconstruct the unexposed spine regions from sparse observations of the exposed surface.
+ Our method involves two main steps: segmentation and shape completion. The segmentation step includes spinal column localization and segmentation, followed by vertebra-wise segmentation.
+The segmented vertebra point clouds are then subjected to SurgPointTransformer, which leverages an attention mechanism to learn patterns between visible surface features and the underlying anatomy.
+For evaluation, we utilize an ex-vivo dataset of nine specimens. Their CT data are used to establish the ground truth against which the outputs of our method are compared.
+Our method significantly outperforms the state-of-the-art baselines, achieving an average Chamfer Distance of 5.39, an F-Score of 0.85, an Earth Mover's Distance of 0.011, and a Signal-to-Noise Ratio of 22.90 dB.
+ This study demonstrates the potential of our reconstruction method for 3D vertebral shape completion. It enables 3D reconstruction of the entire lumbar spine and surgical guidance without ionizing radiation or invasive imaging.
+Our work contributes to computer-aided and robot-assisted surgery, advancing the perception and intelligence of these systems.
+
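+
+ For reference, the reported Chamfer Distance and F-score can be computed roughly as in the sketch below; the threshold and point clouds are synthetic, and the units depend on the dataset scale.
+
+```python
+import numpy as np
+from scipy.spatial import cKDTree
+
+def chamfer_and_fscore(pred, gt, tau=1.0):
+    d_pred_to_gt, _ = cKDTree(gt).query(pred)     # nearest GT point for each prediction
+    d_gt_to_pred, _ = cKDTree(pred).query(gt)     # nearest prediction for each GT point
+    chamfer = d_pred_to_gt.mean() + d_gt_to_pred.mean()
+    precision = (d_pred_to_gt < tau).mean()
+    recall = (d_gt_to_pred < tau).mean()
+    f_score = 2 * precision * recall / max(precision + recall, 1e-8)
+    return chamfer, f_score
+
+rng = np.random.default_rng(1)
+gt = rng.normal(size=(2048, 3))                   # ground-truth vertebra surface points
+pred = gt + rng.normal(scale=0.05, size=gt.shape) # noisy completed shape
+print(chamfer_and_fscore(pred, gt, tau=0.1))
+```
+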
+
+
+
+
+ + ♻ ☆ TransRef: Multi-Scale Reference Embedding Transformer for + Reference-Guided Image Inpainting + + +
+ Image inpainting for completing complicated semantic environments and diverse +hole patterns of corrupted images is challenging even for state-of-the-art +learning-based inpainting methods trained on large-scale data. A reference +image capturing the same scene of a corrupted image offers informative guidance +for completing the corrupted image as it shares similar texture and structure +priors to that of the holes of the corrupted image. In this work, we propose a +transformer-based encoder-decoder network, named TransRef, for reference-guided +image inpainting. Specifically, the guidance is conducted progressively through +a reference embedding procedure, in which the referencing features are +subsequently aligned and fused with the features of the corrupted image. For +precise utilization of the reference features for guidance, a reference-patch +alignment (Ref-PA) module is proposed to align the patch features of the +reference and corrupted images and harmonize their style differences, while a +reference-patch transformer (Ref-PT) module is proposed to refine the embedded +reference feature. Moreover, to facilitate the research of reference-guided +image restoration tasks, we construct a publicly accessible benchmark dataset +containing 50K pairs of input and reference images. Both quantitative and +qualitative evaluations demonstrate the efficacy of the reference information +and the proposed method over the state-of-the-art methods in completing complex +holes. Code and dataset can be accessed at https://github.com/Cameltr/TransRef. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Conditional Image Synthesis with Diffusion Models: A Survey + + +
+ Conditional image synthesis based on user-specified requirements is a key +component in creating complex visual content. In recent years, diffusion-based +generative modeling has become a highly effective way for conditional image +synthesis, leading to exponential growth in the literature. However, the +complexity of diffusion-based modeling, the wide range of image synthesis +tasks, and the diversity of conditioning mechanisms present significant +challenges for researchers to keep up with rapid developments and understand +the core concepts on this topic. In this survey, we categorize existing works +based on how conditions are integrated into the two fundamental components of +diffusion-based modeling, i.e., the denoising network and the sampling process. +We specifically highlight the underlying principles, advantages, and potential +challenges of various conditioning approaches in the training, re-purposing, +and specialization stages to construct a desired denoising network. We also +summarize six mainstream conditioning mechanisms in the essential sampling +process. All discussions are centered around popular applications. Finally, we +pinpoint some critical yet still open problems to be solved in the future and +suggest some possible solutions. Our reviewed works are itemized at +https://github.com/zju-pi/Awesome-Conditional-Diffusion-Models. + +
+
+
+
+
+ + ♻ ☆ Forecasting Disease Progression with Parallel Hyperplanes in + Longitudinal Retinal OCT MICCAI 2024 + + +
+ Predicting future disease progression risk from medical images is challenging due to patient heterogeneity and subtle or unknown imaging biomarkers.
+Moreover, deep learning (DL) methods for survival analysis are susceptible to image domain shifts across scanners.
+We tackle these issues in the task of predicting late dry Age-related Macular Degeneration (dAMD) onset from retinal OCT scans.
+We propose a novel DL method for survival prediction to jointly predict from the current scan a risk score, inversely related to time-to-conversion, and the probability of conversion within a time interval $t$.
+It uses a family of parallel hyperplanes generated by parameterizing the bias term as a function of $t$.
+In addition, we develop unsupervised losses based on intra-subject image pairs to ensure that risk scores increase over time and that future conversion predictions are consistent with AMD stage prediction using actual scans of future visits.
+Such losses enable data-efficient fine-tuning of the trained model on new unlabeled datasets acquired with a different scanner.
+Extensive evaluation on two large datasets acquired with different scanners resulted in mean AUROCs of 0.82 for Dataset-1 and 0.83 for Dataset-2 across prediction intervals of 6, 12, and 24 months.
+
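+
+ A minimal sketch of the parallel-hyperplane idea follows, assuming a shared direction w that yields the risk score and a learned, horizon-dependent bias b(t); the layer sizes and the form of b(t) are illustrative, not the paper's exact prediction head.
+
+```python
+import torch
+import torch.nn as nn
+
+class ParallelHyperplaneHead(nn.Module):
+    def __init__(self, feat_dim=128):
+        super().__init__()
+        self.w = nn.Linear(feat_dim, 1, bias=False)   # shared normal vector -> risk score
+        self.bias_net = nn.Sequential(nn.Linear(1, 16), nn.Softplus(), nn.Linear(16, 1))
+
+    def forward(self, features, t):
+        risk = self.w(features)            # higher risk ~ shorter time-to-conversion
+        b_t = self.bias_net(t)             # horizon-dependent bias term b(t)
+        prob = torch.sigmoid(risk + b_t)   # P(conversion within t months)
+        return risk.squeeze(-1), prob.squeeze(-1)
+
+head = ParallelHyperplaneHead()
+feats = torch.randn(4, 128)                              # e.g. OCT scan embeddings
+horizons = torch.tensor([[6.0], [12.0], [24.0], [12.0]]) # query intervals in months
+risk, prob = head(feats, horizons)
+print(risk.shape, prob.shape)
+```
+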
+
+ comment: accepted in MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Eliciting In-Context Learning in Vision-Language Models for Videos + Through Curated Data Distributional Properties EMNLP 2024 + + +
+ A major reason behind the recent success of large language models (LLMs) is their in-context learning capability, which makes it possible to rapidly adapt them to downstream text-based tasks by prompting them with a small number of relevant demonstrations.
+While large vision-language models (VLMs) have recently been developed for tasks requiring both text and images, they largely lack in-context learning over visual information, especially in understanding and generating text about videos.
+In this work, we implement Emergent In-context Learning on Videos (EILEV), a novel training paradigm that induces in-context learning over video and text by capturing key properties of pre-training data found by prior work to be essential for in-context learning in transformers.
+In our experiments, we show that EILEV-trained models outperform other off-the-shelf VLMs in few-shot video narration for novel, rare actions.
+Furthermore, we demonstrate that these key properties of bursty distributions, skewed marginal distributions, and dynamic meaning each contribute to varying degrees to VLMs' in-context learning capability in narrating procedural videos.
+Our results, analysis, and EILEV-trained models yield numerous insights about the emergence of in-context learning over video and text, creating a foundation for future work to optimize and scale VLMs for open-domain video understanding and reasoning.
+Our code and demo are available at https://github.com/yukw777/EILEV.
+
+
+ comment: 16 pages, LaTeX; Accepted to EMNLP 2024 Main +
+
+
+
+
+ + ♻ ☆ Multimodal Self-Instruct: Synthetic Abstract Image and Visual Reasoning + Instruction Using Language Model EMNLP-24 + + +
+ Although most current large multimodal models (LMMs) can already understand photos of natural scenes and portraits, their understanding of abstract images, e.g., charts, maps, or layouts, and their visual reasoning capabilities remain quite rudimentary.
+They often struggle with simple daily tasks, such as reading time from a clock, understanding a flowchart, or planning a route using a road map.
+In light of this, we design a multi-modal self-instruct pipeline, utilizing large language models and their code capabilities to synthesize massive abstract images and visual reasoning instructions across daily scenarios.
+Our strategy effortlessly creates a multimodal benchmark with 11,193 instructions for eight visual scenarios: charts, tables, simulated maps, dashboards, flowcharts, relation graphs, floor plans, and visual puzzles.
+This benchmark, constructed with simple lines and geometric elements, exposes the shortcomings of most advanced LMMs like Claude-3.5-Sonnet and GPT-4o in abstract image understanding, spatial relations reasoning, and visual element induction.
+Besides, to verify the quality of our synthetic data, we fine-tune an LMM using 62,476 synthetic chart, table and road map instructions.
+The results demonstrate improved chart understanding and map navigation performance, and also demonstrate potential benefits for other visual reasoning tasks.
+Our code is available at: https://github.com/zwq2018/Multi-modal-Self-instruct.
+
+
+ comment: The paper is accepted by EMNLP-24. Code: + https://github.com/zwq2018/Multi-modal-Self-instruct dataset: + https://huggingface.co/datasets/zwq2018/Multi-modal-Self-instruct + Leaderboard: https://multi-modal-self-instruct.github.io/ +
+
+
+
+
+ + ♻ ☆ SRIF: Semantic Shape Registration Empowered by Diffusion-based Image + Morphing and Flow Estimation SIGGRAPH + + +
+ In this paper, we propose SRIF, a novel Semantic shape Registration framework based on diffusion-based Image morphing and Flow estimation.
+More concretely, given a pair of extrinsically aligned shapes, we first render them from multiple views, and then utilize an image interpolation framework based on diffusion models to generate sequences of intermediate images between them.
+The images are later fed into a dynamic 3D Gaussian splatting framework, with which we reconstruct and post-process intermediate point clouds that respect the image morphing process.
+In the end, tailored to the above, we propose a novel registration module to estimate a continuous normalizing flow, which deforms the source shape consistently towards the target, with the intermediate point clouds as weak guidance.
+Our key insight is to leverage large vision models (LVMs) to associate shapes and therefore obtain much richer semantic information on the relationship between shapes than ad-hoc feature extraction and alignment.
+As a consequence, SRIF not only achieves high-quality dense correspondences on challenging shape pairs, but also delivers smooth, semantically meaningful interpolation in between.
+Empirical evidence justifies the effectiveness and superiority of our method as well as specific design choices.
+The code is released at https://github.com/rqhuang88/SRIF.
+
+
+ comment: Accepted as a conference paper of SIGGRAPH Asia 2024 +
+
+
+
+
+ + ♻ ☆ Generalizable Human Gaussians from Single-View Image + + +
+ In this work, we tackle the task of learning generalizable 3D human Gaussians from a single image.
+The main challenge for this task is to recover detailed geometry and appearance, especially for the unobserved regions.
+To this end, we propose the single-view generalizable Human Gaussian Model (HGM), a diffusion-guided framework for 3D human modeling from a single image.
+We design a diffusion-based coarse-to-fine pipeline, where the diffusion model is adapted to refine novel-view images rendered from a coarse human Gaussian model.
+The refined images are then used together with the input image to learn a refined human Gaussian model.
+Although effective in hallucinating the unobserved views, the approach may generate unrealistic human poses and shapes due to the lack of supervision.
+We circumvent this problem by further encoding geometric priors from the SMPL model.
+Specifically, we propagate geometric features from the SMPL volume to the predicted Gaussians via sparse convolution and an attention mechanism.
+We validate our approach on publicly available datasets and demonstrate that it significantly surpasses state-of-the-art methods in terms of PSNR and SSIM.
+Additionally, our method exhibits strong generalization for in-the-wild images.
+
+
+ comment: https://jinnan-chen.github.io/projects/HGM/ +
+
+
+
+
+ + ♻ ☆ Releasing the Parameter Latency of Neural Representation for + High-Efficiency Video Compression + + +
+ For decades, video compression technology has been a prominent research area.
+Traditional hybrid video compression frameworks and end-to-end frameworks continue to explore various intra- and inter-frame reference and prediction strategies based on discrete transforms and deep learning techniques.
+However, the emerging implicit neural representation (INR) technique models entire videos as basic units, automatically capturing intra-frame and inter-frame correlations and obtaining promising performance.
+INR uses a compact neural network to store video information in network parameters, effectively eliminating spatial and temporal redundancy in the original video.
+However, in this paper, our exploration and verification reveal that current INR video compression methods do not fully exploit their potential to preserve information.
+We investigate the potential of enhancing network parameter storage through parameter reuse.
+By deepening the network, we design a feasible INR parameter reuse scheme to further improve compression performance.
+Extensive experimental results show that our method significantly enhances the rate-distortion performance of INR video compression.
+
+
+
+
+
+ + ♻ ☆ LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via a + Hybrid Architecture + + +
+ Expanding the long-context capabilities of Multi-modal Large Language Models (MLLMs) is crucial for video understanding, high-resolution image understanding, and multi-modal agents.
+This involves a series of systematic optimizations, including model architecture, data construction and training strategy, particularly addressing challenges such as degraded performance with more images and high computational costs.
+In this paper, we adapt the model architecture to a hybrid of Mamba and Transformer blocks, approach data construction with both temporal and spatial dependencies among multiple images, and employ a progressive training strategy.
+The released model, LongLLaVA (Long-Context Large Language and Vision Assistant), is the first hybrid MLLM and achieves a better balance between efficiency and effectiveness.
+LongLLaVA not only achieves competitive results across various benchmarks, but also maintains high throughput and low memory consumption.
+Especially, it can process nearly a thousand images on a single A100 80GB GPU, showing promising application prospects for a wide range of tasks.
+
+
+ comment: 20 pages, 9 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ EUFCC-CIR: a Composed Image Retrieval Dataset for GLAM Collections ECCV + + +
+ The intersection of Artificial Intelligence and Digital Humanities enables +researchers to explore cultural heritage collections with greater depth and +scale. In this paper, we present EUFCC-CIR, a dataset designed for Composed +Image Retrieval (CIR) within Galleries, Libraries, Archives, and Museums (GLAM) +collections. Our dataset is built on top of the EUFCC-340K image labeling +dataset and contains over 180K annotated CIR triplets. Each triplet is composed +of a multi-modal query (an input image plus a short text describing the desired +attribute manipulations) and a set of relevant target images. The EUFCC-CIR +dataset fills an existing gap in CIR-specific resources for Digital Humanities. +We demonstrate the value of the EUFCC-CIR dataset by highlighting its unique +qualities in comparison to other existing CIR datasets and evaluating the +performance of several zero-shot CIR baselines. + +
+
+ comment: ECCV Workshop (AI4DH2024) +
+
+
+
+
+ + ♻ ☆ Fake It Until You Break It: On the Adversarial Robustness of + AI-generated Image Detectors + + +
+ While generative AI (GenAI) offers countless possibilities for creative and +productive tasks, artificially generated media can be misused for fraud, +manipulation, scams, misinformation campaigns, and more. To mitigate the risks +associated with maliciously generated media, forensic classifiers are employed +to identify AI-generated content. However, current forensic classifiers are +often not evaluated in practically relevant scenarios, such as the presence of +an attacker or when real-world artifacts like social media degradations affect +images. In this paper, we evaluate state-of-the-art AI-generated image (AIGI) +detectors under different attack scenarios. We demonstrate that forensic +classifiers can be effectively attacked in realistic settings, even when the +attacker does not have access to the target model and post-processing occurs +after the adversarial examples are created, which is standard on social media +platforms. These attacks can significantly reduce detection accuracy to the +extent that the risks of relying on detectors outweigh their benefits. Finally, +we propose a simple defense mechanism to make CLIP-based detectors, which are +currently the best-performing detectors, robust against these attacks. + +
+
+
+
+
+ + ♻ ☆ Optimal Projections for Discriminative Dictionary Learning using the + JL-lemma + + +
+ Dimensionality reduction-based dictionary learning methods in the literature +have often used iterative random projections. The dimensionality of such a +random projection matrix is a random number that might not lead to a separable +subspace structure in the transformed space. The convergence of such methods +highly depends on the initial seed values used. Also, gradient descent-based +updates might result in local minima. This paper proposes a constructive +approach to derandomize the projection matrix using the Johnson-Lindenstrauss +lemma. Rather than reducing dimensionality via random projections, a projection +matrix derived from the proposed Modified Supervised PC analysis is used. A +heuristic is proposed to decide the data perturbation levels and the dictionary +atom's corresponding suitable description length. The projection matrix is +derived in a single step, provides maximum feature-label consistency of the +transformed space, and preserves the geometry of the original data. The +projection matrix thus constructed is proved to be a JL-embedding. Despite +confusing classes in the OCR datasets, the dictionary trained in the +transformed space generates discriminative sparse coefficients with reduced +complexity. Empirical study demonstrates that the proposed method performs well +even when the number of classes and dimensionality increase. Experimentation on +OCR and face recognition datasets shows better classification performance than +other algorithms. + +
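+
+ The Johnson-Lindenstrauss guarantee that motivates this construction can be checked numerically as below; a Gaussian random projection is used only as a stand-in, whereas the paper derives a single deterministic, label-aware projection matrix.
+
+```python
+import numpy as np
+from sklearn.random_projection import johnson_lindenstrauss_min_dim
+
+rng = np.random.default_rng(0)
+n, d, eps = 200, 1024, 0.3
+X = rng.normal(size=(n, d))
+
+# JL bound: with k >= O(log n / eps^2), squared pairwise distances are preserved
+# within a (1 +/- eps) factor with high probability.
+k = johnson_lindenstrauss_min_dim(n_samples=n, eps=eps)
+P = rng.normal(size=(d, k)) / np.sqrt(k)     # random stand-in for the derived matrix
+Y = X @ P
+
+def pdist2(Z):
+    sq = (Z ** 2).sum(1)
+    return sq[:, None] + sq[None, :] - 2 * Z @ Z.T
+
+orig, proj = pdist2(X), pdist2(Y)
+mask = ~np.eye(n, dtype=bool)
+ratio = proj[mask] / orig[mask]
+print(f"k={k}, distance ratios in [{ratio.min():.2f}, {ratio.max():.2f}] (target 1±{eps})")
+```
+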
+
+
+
+
+ + ♻ ☆ MOREL: Enhancing Adversarial Robustness through Multi-Objective + Representation Learning + + +
+ Extensive research has shown that deep neural networks (DNNs) are vulnerable to slight adversarial perturbations: small changes to the input data that appear insignificant but cause the model to produce drastically different outputs.
+In addition to augmenting training data with adversarial examples generated from a specific attack method, most of the current defense strategies necessitate modifying the original model architecture components to improve robustness or performing test-time data purification to handle adversarial attacks.
+In this work, we demonstrate that strong feature representation learning during training can significantly enhance the original model's robustness.
+We propose MOREL, a multi-objective feature representation learning approach, encouraging classification models to produce similar features for inputs within the same class, despite perturbations.
+Our training method involves an embedding space where cosine similarity loss and multi-positive contrastive loss are used to align natural and adversarial features from the model encoder and ensure tight clustering.
+Concurrently, the classifier is motivated to achieve accurate predictions.
+Through extensive experiments, we demonstrate that our approach significantly enhances the robustness of DNNs against white-box and black-box adversarial attacks, outperforming other methods that similarly require no architectural changes or test-time data purification.
+Our code is available at https://github.com/salomonhotegni/MOREL
+
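+
+ A rough sketch of such a multi-objective training loss is given below, combining a cosine alignment term between natural and adversarial features, a multi-positive contrastive term, and cross-entropy. The weights, temperature, and tensor shapes are assumptions, not the released MOREL implementation.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def multi_objective_loss(z_nat, z_adv, logits_nat, logits_adv, labels,
+                         tau=0.1, w_cos=1.0, w_con=1.0):
+    # align natural and adversarial embeddings of the same sample
+    cos_loss = (1 - F.cosine_similarity(z_nat, z_adv, dim=1)).mean()
+
+    # multi-positive contrastive term: same-class embeddings attract each other
+    z = F.normalize(torch.cat([z_nat, z_adv], dim=0), dim=1)
+    y = torch.cat([labels, labels], dim=0)
+    sim = z @ z.t() / tau
+    sim.fill_diagonal_(-1e9)                       # exclude self-pairs
+    pos = (y[:, None] == y[None, :]).float()
+    pos.fill_diagonal_(0)
+    log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
+    con_loss = (-(pos * log_prob).sum(1) / pos.sum(1).clamp(min=1)).mean()
+
+    # keep the classifier accurate on both views
+    ce = F.cross_entropy(logits_nat, labels) + F.cross_entropy(logits_adv, labels)
+    return ce + w_cos * cos_loss + w_con * con_loss
+
+z_nat, z_adv = torch.randn(8, 64), torch.randn(8, 64)
+logits_nat, logits_adv = torch.randn(8, 10), torch.randn(8, 10)
+labels = torch.randint(0, 10, (8,))
+print(float(multi_objective_loss(z_nat, z_adv, logits_nat, logits_adv, labels)))
+```
+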
+
+
+
+
+ + ♻ ☆ Efficient Temporal Extrapolation of Multimodal Large Language Models + with Temporal Grounding Bridge EMNLP 2024 + + +
+ Despite progress in multimodal large language models (MLLMs), the challenge of interpreting long-form videos in response to linguistic queries persists, largely due to the inefficiency in temporal grounding and the limited pre-trained context window size.
+In this work, we introduce the Temporal Grounding Bridge (TGB), a novel framework that bootstraps MLLMs with advanced temporal grounding capabilities and broadens their contextual scope.
+Our framework significantly enhances the temporal capabilities of current MLLMs through three key innovations: an efficient multi-span temporal grounding algorithm applied to low-dimension temporal features projected from flow; a multimodal length extrapolation training paradigm that utilizes low-dimension temporal features to extend the training context window size; and a bootstrapping framework that bridges our model with pluggable MLLMs without requiring annotation.
+We validate TGB across seven video benchmarks and demonstrate substantial performance improvements compared with prior MLLMs.
+Notably, our model, initially trained on sequences of four frames, effectively handles sequences up to 16 times longer without sacrificing performance, highlighting its scalability and effectiveness in real-world applications.
+Our code is publicly available at https://github.com/bigai-nlco/VideoTGB
+
+
+ comment: To appear at EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Text-to-Sticker: Style Tailoring Latent Diffusion Models for Human + Expression + + +
+ We introduce Style Tailoring, a recipe to finetune Latent Diffusion Models +(LDMs) in a distinct domain with high visual quality, prompt alignment and +scene diversity. We choose sticker image generation as the target domain, as +the images significantly differ from photorealistic samples typically generated +by large-scale LDMs. We start with a competent text-to-image model, like Emu, +and show that relying on prompt engineering with a photorealistic model to +generate stickers leads to poor prompt alignment and scene diversity. To +overcome these drawbacks, we first finetune Emu on millions of sticker-like +images collected using weak supervision to elicit diversity. Next, we curate +human-in-the-loop (HITL) Alignment and Style datasets from model generations, +and finetune to improve prompt alignment and style alignment respectively. +Sequential finetuning on these datasets poses a tradeoff between better style +alignment and prompt alignment gains. To address this tradeoff, we propose a +novel fine-tuning method called Style Tailoring, which jointly fits the content +and style distribution and achieves best tradeoff. Evaluation results show our +method improves visual quality by 14%, prompt alignment by 16.2% and scene +diversity by 15.3%, compared to prompt engineering the base Emu model for +stickers generation. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ CTSpine1K: A Large-Scale Dataset for Spinal Vertebrae Segmentation in + Computed Tomography MICCAI2024 + + +
+ Spine-related diseases have high morbidity and cause a huge burden of social +cost. Spine imaging is an essential tool for noninvasively visualizing and +assessing spinal pathology. Segmenting vertebrae in computed tomography (CT) +images is the basis of quantitative medical image analysis for clinical +diagnosis and surgery planning of spine diseases. Current publicly available +annotated datasets on spinal vertebrae are small in size. Due to the lack of a +large-scale annotated spine image dataset, the mainstream deep learning-based +segmentation methods, which are data-driven, are heavily restricted. In this +paper, we introduce a large-scale spine CT dataset, called CTSpine1K, curated +from multiple sources for vertebra segmentation, which contains 1,005 CT +volumes with over 11,100 labeled vertebrae belonging to different spinal +conditions. Based on this dataset, we conduct several spinal vertebrae +segmentation experiments to set the first benchmark. We believe that this +large-scale dataset will facilitate further research in many spine-related +image analysis tasks, including but not limited to vertebrae segmentation, +labeling, 3D spine reconstruction from biplanar radiographs, image +super-resolution, and enhancement. + +
+
+ comment: Accepted by MICCAI2024 Open Data for oral presentation and will be + published as a part of the journal MELBA special issue +
+
+
+
+
+ + ♻ ☆ A Comprehensive Survey of Hallucination in Large Language, Image, Video + and Audio Foundation Models EMNLP 2024 + + +
+ The rapid advancement of foundation models (FMs) across language, image, +audio, and video domains has shown remarkable capabilities in diverse tasks. +However, the proliferation of FMs brings forth a critical challenge: the +potential to generate hallucinated outputs, particularly in high-stakes +applications. The tendency of foundation models to produce hallucinated content +arguably represents the biggest hindrance to their widespread adoption in +real-world scenarios, especially in domains where reliability and accuracy are +paramount. This survey paper presents a comprehensive overview of recent +developments that aim to identify and mitigate the problem of hallucination in +FMs, spanning text, image, video, and audio modalities. By synthesizing recent +advancements in detecting and mitigating hallucination across various +modalities, the paper aims to provide valuable insights for researchers, +developers, and practitioners. Essentially, it establishes a clear framework +encompassing definition, taxonomy, and detection strategies for addressing +hallucination in multimodal foundation models, laying the foundation for future +research in this pivotal area. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ Identifying and Solving Conditional Image Leakage in Image-to-Video + Diffusion Model NeurIPS 2024 + + +
+ Diffusion models have obtained substantial progress in image-to-video +generation. However, in this paper, we find that these models tend to generate +videos with less motion than expected. We attribute this to the issue called +conditional image leakage, where the image-to-video diffusion models (I2V-DMs) +tend to over-rely on the conditional image at large time steps. We further +address this challenge from both inference and training aspects. First, we +propose to start the generation process from an earlier time step to avoid the +unreliable large-time steps of I2V-DMs, as well as an initial noise +distribution with optimal analytic expressions (Analytic-Init) by minimizing +the KL divergence between it and the actual marginal distribution to bridge the +training-inference gap. Second, we design a time-dependent noise distribution +(TimeNoise) for the conditional image during training, applying higher noise +levels at larger time steps to disrupt it and reduce the model's dependency on +it. We validate these general strategies on various I2V-DMs on our collected +open-domain image benchmark and the UCF101 dataset. Extensive results show that +our methods outperform baselines by producing higher motion scores with lower +errors while maintaining image alignment and temporal consistency, thereby +yielding superior overall performance and enabling more accurate motion +control. The project page: \url{https://cond-image-leak.github.io/}. + +
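+
+ The training-side idea of disrupting the conditional image more strongly at larger time steps can be sketched as follows; the quadratic ramp below is only a placeholder for the paper's TimeNoise distribution, and all constants are made up.
+
+```python
+import torch
+
+def noisy_condition(cond_image, t, T=1000, max_sigma=0.8):
+    # noise level grows with the diffusion timestep, so the model cannot simply
+    # copy the conditional image at unreliable large time steps
+    sigma = max_sigma * (t.float() / T) ** 2
+    sigma = sigma.view(-1, 1, 1, 1)
+    return cond_image + sigma * torch.randn_like(cond_image)
+
+imgs = torch.rand(4, 3, 64, 64)          # conditional first frames
+t = torch.randint(0, 1000, (4,))         # sampled diffusion timesteps
+print(noisy_condition(imgs, t).shape)
+```
+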
+
+ comment: NeurIPS 2024. Project page: https://cond-image-leak.github.io/ +
+
+
+
+
+ + ♻ ☆ Bootstrap3D: Improving Multi-view Diffusion Model with Synthetic Data + + +
+ Recent years have witnessed remarkable progress in multi-view diffusion +models for 3D content creation. However, there remains a significant gap in +image quality and prompt-following ability compared to 2D diffusion models. A +critical bottleneck is the scarcity of high-quality 3D objects with detailed +captions. To address this challenge, we propose Bootstrap3D, a novel framework +that automatically generates an arbitrary quantity of multi-view images to +assist in training multi-view diffusion models. Specifically, we introduce a +data generation pipeline that employs (1) 2D and video diffusion models to +generate multi-view images based on constructed text prompts, and (2) our +fine-tuned 3D-aware MV-LLaVA for filtering high-quality data and rewriting +inaccurate captions. Leveraging this pipeline, we have generated 1 million +high-quality synthetic multi-view images with dense descriptive captions to +address the shortage of high-quality 3D data. Furthermore, we present a +Training Timestep Reschedule (TTR) strategy that leverages the denoising +process to learn multi-view consistency while maintaining the original 2D +diffusion prior. Extensive experiments demonstrate that Bootstrap3D can +generate high-quality multi-view images with superior aesthetic quality, +image-text alignment, and maintained view consistency. + +
+
+ comment: Project Page: https://sunzey.github.io/Bootstrap3D/ +
+
+
+
+
+ + ♻ ☆ AttackBench: Evaluating Gradient-based Attacks for Adversarial Examples + + +
+ Adversarial examples are typically optimized with gradient-based attacks. +While novel attacks are continuously proposed, each is shown to outperform its +predecessors using different experimental setups, hyperparameter settings, and +number of forward and backward calls to the target models. This provides +overly-optimistic and even biased evaluations that may unfairly favor one +particular attack over the others. In this work, we aim to overcome these +limitations by proposing AttackBench, i.e., the first evaluation framework that +enables a fair comparison among different attacks. To this end, we first +propose a categorization of gradient-based attacks, identifying their main +components and differences. We then introduce our framework, which evaluates +their effectiveness and efficiency. We measure these characteristics by (i) +defining an optimality metric that quantifies how close an attack is to the +optimal solution, and (ii) limiting the number of forward and backward queries +to the model, such that all attacks are compared within a given maximum query +budget. Our extensive experimental analysis compares more than $100$ attack +implementations with a total of over $800$ different configurations against +CIFAR-10 and ImageNet models, highlighting that only very few attacks +outperform all the competing approaches. Within this analysis, we shed light on +several implementation issues that prevent many attacks from finding better +solutions or running at all. We release AttackBench as a publicly-available +benchmark, aiming to continuously update it to include and evaluate novel +gradient-based attacks for optimizing adversarial examples. + +
+
+ comment: https://attackbench.github.io +
+
+
+
+
+ + ♻ ☆ I4VGen: Image as Free Stepping Stone for Text-to-Video Generation + + +
+ Text-to-video generation has trailed behind text-to-image generation in terms +of quality and diversity, primarily due to the inherent complexities of +spatio-temporal modeling and the limited availability of video-text datasets. +Recent text-to-video diffusion models employ the image as an intermediate step, +significantly enhancing overall performance but incurring high training costs. +In this paper, we present I4VGen, a novel video diffusion inference pipeline to +leverage advanced image techniques to enhance pre-trained text-to-video +diffusion models, which requires no additional training. Instead of the vanilla +text-to-video inference pipeline, I4VGen consists of two stages: anchor image +synthesis and anchor image-augmented text-to-video synthesis. Correspondingly, +a simple yet effective generation-selection strategy is employed to achieve +visually-realistic and semantically-faithful anchor image, and an innovative +noise-invariant video score distillation sampling (NI-VSDS) is developed to +animate the image to a dynamic video by distilling motion knowledge from video +diffusion models, followed by a video regeneration process to refine the video. +Extensive experiments show that the proposed method produces videos with higher +visual realism and textual fidelity. Furthermore, I4VGen also supports being +seamlessly integrated into existing image-to-video diffusion models, thereby +improving overall video quality. + +
+
+ comment: Project page: https://xiefan-guo.github.io/i4vgen +
+
+
+
+
+ + ♻ ☆ SSP-RACL: Classification of Noisy Fundus Images with Self-Supervised + Pretraining and Robust Adaptive Credal Loss + + +
+ Fundus image classification is crucial in computer-aided diagnosis tasks, but label noise significantly impairs the performance of deep neural networks.
+To address this challenge, we propose a robust framework, Self-Supervised Pre-training with Robust Adaptive Credal Loss (SSP-RACL), for handling label noise in fundus image datasets.
+First, we use Masked Autoencoders (MAE) for pre-training to extract features that are unaffected by label noise.
+Subsequently, RACL employs a superset learning framework, setting confidence thresholds and an adaptive label relaxation parameter to construct possibility distributions and provide more reliable ground-truth estimates, thus effectively suppressing the memorization effect.
+Additionally, we introduce clinical knowledge-based asymmetric noise generation to simulate real-world noisy fundus image datasets.
+Experimental results demonstrate that our proposed method outperforms existing approaches in handling label noise, showing superior performance.
+
+
+ comment: IEEE BioCAS 2024 +
+
+
+
+
+ + ♻ ☆ 4K4DGen: Panoramic 4D Generation at 4K Resolution + + +
+ The blooming of virtual reality and augmented reality (VR/AR) technologies has driven an increasing demand for the creation of high-quality, immersive, and dynamic environments.
+However, existing generative techniques either focus solely on dynamic objects or perform outpainting from a single perspective image, failing to meet the requirements of VR/AR applications that need free-viewpoint, 360° virtual views where users can move in all directions.
+In this work, we tackle the challenging task of elevating a single panorama to an immersive 4D experience.
+For the first time, we demonstrate the capability to generate omnidirectional dynamic scenes with 360° views at 4K (4096 x 2048) resolution, thereby providing an immersive user experience.
+Our method introduces a pipeline that facilitates natural scene animations and optimizes a set of dynamic Gaussians using efficient splatting techniques for real-time exploration.
+To overcome the lack of scene-scale annotated 4D data and models, especially in panoramic formats, we propose a novel Panoramic Denoiser that adapts generic 2D diffusion priors to animate consistently in 360° images, transforming them into panoramic videos with dynamic scenes at targeted regions.
+Subsequently, we propose Dynamic Panoramic Lifting to elevate the panoramic video into a 4D immersive environment while preserving spatial and temporal consistency.
+By transferring prior knowledge from 2D models in the perspective domain to the panoramic domain and the 4D lifting with spatial appearance and geometry regularization, we achieve high-quality Panorama-to-4D generation at a resolution of 4K for the first time.
+
+
+
+
+
+ + ♻ ☆ LEGO: Learnable Expansion of Graph Operators for Multi-Modal Feature + Fusion + + +
+ In computer vision tasks, features often come from diverse representations, +domains, and modalities, such as text, images, and videos. Effectively fusing +these features is essential for robust performance, especially with the +availability of powerful pre-trained models like vision-language models. +However, common fusion methods, such as concatenation, element-wise operations, +and non-linear techniques, often fail to capture structural relationships, deep +feature interactions, and suffer from inefficiency or misalignment of features +across domains. In this paper, we shift from high-dimensional feature space to +a lower-dimensional, interpretable graph space by constructing similarity +graphs that encode feature relationships at different levels, e.g., clip, +frame, patch, token, etc. To capture deeper interactions, we use graph power +expansions and introduce a learnable graph fusion operator to combine these +graph powers for more effective fusion. Our approach is relationship-centric, +operates in a homogeneous space, and is mathematically principled, resembling +element-wise similarity score aggregation via multilinear polynomials. We +demonstrate the effectiveness of our graph-based fusion method on video anomaly +detection, showing strong performance across multi-representational, +multi-modal, and multi-domain feature fusion tasks. + +
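+
+ A minimal sketch of fusing modalities in graph space is shown below: each modality's features become a node-by-node similarity graph, and learnable coefficients mix the graph powers into one fused graph. The dimensions, normalisation, and downstream use of the fused graph are assumptions, not the paper's exact operator.
+
+```python
+import torch
+import torch.nn as nn
+
+class LearnableGraphFusion(nn.Module):
+    def __init__(self, num_modalities=2, max_power=3):
+        super().__init__()
+        self.max_power = max_power
+        self.coeffs = nn.Parameter(torch.zeros(num_modalities, max_power))
+
+    @staticmethod
+    def similarity_graph(feats):
+        z = nn.functional.normalize(feats, dim=1)
+        return (z @ z.t()).clamp(min=0)            # non-negative cosine similarity graph
+
+    def forward(self, feature_sets):
+        n = feature_sets[0].shape[0]
+        fused = torch.zeros(n, n, device=feature_sets[0].device)
+        weights = torch.softmax(self.coeffs.view(-1), dim=0).view_as(self.coeffs)
+        for m, feats in enumerate(feature_sets):
+            A = self.similarity_graph(feats)
+            Ap = A
+            for p in range(self.max_power):
+                fused = fused + weights[m, p] * Ap # weighted sum of graph powers A, A^2, ...
+                Ap = Ap @ A
+        return fused
+
+fusion = LearnableGraphFusion()
+video_feats, text_feats = torch.randn(16, 512), torch.randn(16, 256)
+print(fusion([video_feats, text_feats]).shape)     # torch.Size([16, 16])
+```
+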
+
+ comment: Research paper +
+
+
+
+
+ + ♻ ☆ Pick of the Bunch: Detecting Infrared Small Targets Beyond Hit-Miss + Trade-Offs via Selective Rank-Aware Attention + + +
+ Infrared small target detection faces the inherent challenge of precisely localizing dim targets amidst complex background clutter.
+Traditional approaches struggle to balance detection precision and false alarm rates.
+To break this dilemma, we propose SeRankDet, a deep network that achieves high accuracy beyond the conventional hit-miss trade-off, by following the "Pick of the Bunch" principle.
+At its core lies our Selective Rank-Aware Attention (SeRank) module, employing a non-linear Top-K selection process that preserves the most salient responses, preventing target signal dilution while maintaining constant complexity.
+Furthermore, we replace the static concatenation typical in U-Net structures with our Large Selective Feature Fusion (LSFF) module, a dynamic fusion strategy that empowers SeRankDet with adaptive feature integration, enhancing its ability to discriminate true targets from false alarms.
+The network's discernment is further refined by our Dilated Difference Convolution (DDC) module, which merges differential convolution aimed at amplifying subtle target characteristics with dilated convolution to expand the receptive field, thereby substantially improving target-background separation.
+Despite its lightweight architecture, the proposed SeRankDet sets new benchmarks in state-of-the-art performance across multiple public datasets.
+The code is available at https://github.com/GrokCV/SeRankDet.
+
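+
+ The sketch below illustrates a Top-K selective attention in the same spirit: only the K most salient spatial responses per channel contribute to the channel statistics, so a dim target is not averaged away by background. K and the gating layout are illustrative, and this is not the released SeRank module.
+
+```python
+import torch
+import torch.nn as nn
+
+class TopKSelectiveAttention(nn.Module):
+    def __init__(self, channels, k=32):
+        super().__init__()
+        self.k = k
+        self.gate = nn.Sequential(nn.Linear(channels, channels), nn.Sigmoid())
+
+    def forward(self, x):                          # x: (B, C, H, W)
+        b, c, h, w = x.shape
+        flat = x.view(b, c, h * w)
+        topk_vals, _ = flat.topk(min(self.k, h * w), dim=2)
+        stats = topk_vals.mean(dim=2)              # per-channel salient response only
+        weights = self.gate(stats).view(b, c, 1, 1)
+        return x * weights                         # reweight the feature map
+
+attn = TopKSelectiveAttention(channels=64, k=16)
+print(attn(torch.randn(2, 64, 32, 32)).shape)
+```
+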
+
+ comment: IEEE TGRS 2024 +
+
+
+
+
+ + ♻ ☆ Representation Synthesis by Probabilistic Many-Valued Logic Operation in + Self-Supervised Learning ICIP2024 + + +
+ In this paper, we propose a new self-supervised learning (SSL) method for +representations that enable logic operations. Representation learning has been +applied to various tasks, such as image generation and retrieval. The logical +controllability of representations is important for these tasks. Although some +methods have been shown to enable the intuitive control of representations +using natural languages as the inputs, representation control via logic +operations between representations has not been demonstrated. Some SSL methods +using representation synthesis (e.g., elementwise mean and maximum operations) +have been proposed, but the operations performed in these methods do not +incorporate logic operations. In this work, we propose a logic-operable +self-supervised representation learning method by replacing the existing +representation synthesis with the OR operation on the probabilistic extension +of many-valued logic. The representations comprise a set of feature-possession +degrees, which are truth values indicating the presence or absence of each +feature in the image, and realize the logic operations (e.g., OR and AND). Our +method can generate a representation that has the features of both +representations or only those features common to both representations. In +addition, the expression of the ambiguous presence of a feature is realized by +indicating the feature-possession degree by the probability distribution of +truth values of the many-valued logic. We showed that our method performs +competitively in single and multi-label classification tasks compared with +prior SSL methods using synthetic representations. Moreover, experiments on +image retrieval using MNIST and PascalVOC showed that the representations of +our method can be operated by OR and AND operations. + +
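+
+ Under the probabilistic reading of many-valued logic used here, OR and AND on feature-possession degrees reduce to the algebraic sum and product, as in the toy sketch below; the encoder that would actually produce such representations is omitted.
+
+```python
+import torch
+
+def prob_or(a, b):
+    return a + b - a * b    # feature present in either representation
+
+def prob_and(a, b):
+    return a * b            # feature present in both representations
+
+z1 = torch.rand(8)          # e.g. degrees for "has stripes", "is round", ...
+z2 = torch.rand(8)
+print(prob_or(z1, z2))
+print(prob_and(z1, z2))
+```
+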
+
+ comment: Accepted to the IEEE Open Journal of Signal Processing (ICIP2024 + track) +
+
+
+
+
+ + ♻ ☆ Semantic-Aware Adversarial Training for Reliable Deep Hashing Retrieval + + +
+ Deep hashing has been intensively studied and successfully applied in large-scale image retrieval systems due to its efficiency and effectiveness.
+Recent studies have recognized that the existence of adversarial examples poses a security threat to deep hashing models, that is, adversarial vulnerability.
+Notably, it is challenging to efficiently distill reliable semantic representatives for deep hashing to guide adversarial learning, which hinders the enhancement of the adversarial robustness of deep hashing-based retrieval models.
+Moreover, current research on adversarial training for deep hashing is hard to formalize into a unified minimax structure.
+In this paper, we explore Semantic-Aware Adversarial Training (SAAT) for improving the adversarial robustness of deep hashing models.
+Specifically, we conceive a discriminative mainstay features learning (DMFL) scheme to construct semantic representatives for guiding adversarial learning in deep hashing.
+Particularly, our DMFL, with a strict theoretical guarantee, is adaptively optimized in a discriminative learning manner, where both discriminative and semantic properties are jointly considered.
+Moreover, adversarial examples are fabricated by maximizing the Hamming distance between the hash codes of adversarial samples and mainstay features, the efficacy of which is validated in adversarial attack trials.
+Further, we, for the first time, formulate the formalized adversarial training of deep hashing into a unified minimax optimization under the guidance of the generated mainstay codes.
+Extensive experiments on benchmark datasets show superb attack performance against state-of-the-art algorithms; meanwhile, the proposed adversarial training can effectively eliminate adversarial perturbations for trustworthy deep hashing-based retrieval.
+Our code is available at https://github.com/xandery-geek/SAAT.
+
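+
+ The attack side of this idea can be sketched as a few PGD steps that push a query's tanh-relaxed hash code away from a target mainstay code, i.e. maximise a differentiable surrogate of the Hamming distance. The tiny model, epsilon, and step sizes below are placeholders, not the SAAT attack code.
+
+```python
+import torch
+import torch.nn as nn
+
+hash_net = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 48), nn.Tanh())
+
+def attack(x, mainstay_code, eps=8 / 255, steps=10, alpha=2 / 255):
+    x_adv = x.clone().detach()
+    for _ in range(steps):
+        x_adv.requires_grad_(True)
+        code = hash_net(x_adv)                      # relaxed codes in (-1, 1)
+        # a high inner product with the mainstay code means a low Hamming distance,
+        # so we minimise it to push the code away from the mainstay
+        loss = (code * mainstay_code).sum()
+        grad = torch.autograd.grad(loss, x_adv)[0]
+        x_adv = x_adv.detach() - alpha * grad.sign()
+        x_adv = x + (x_adv - x).clamp(-eps, eps)    # project back into the eps-ball
+        x_adv = x_adv.clamp(0, 1)
+    return x_adv.detach()
+
+x = torch.rand(1, 3, 32, 32)
+mainstay = torch.sign(torch.randn(1, 48))
+x_adv = attack(x, mainstay)
+print((hash_net(x) * mainstay).sum().item(), (hash_net(x_adv) * mainstay).sum().item())
+```
+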
+
+
+
+
+ + ♻ ☆ A boundary-aware point clustering approach in Euclidean and embedding + spaces for roof plane segmentation + + +
+ Roof plane segmentation from airborne LiDAR point clouds is an important
+technology for 3D building model reconstruction. One of the key issues of plane
+segmentation is how to design powerful features that can exactly distinguish
+adjacent planar patches. The quality of point features directly determines the
+accuracy of roof plane segmentation. Most existing approaches use handcrafted
+features to extract roof planes. However, the discriminative power of these
+features is relatively low, especially in boundary areas. To solve this
+problem, we propose a boundary-aware point clustering approach in Euclidean and
+embedding spaces constructed by a multi-task deep network for roof plane
+segmentation. We design a three-branch network to predict semantic labels,
+predict point offsets, and extract deep embedding features. In the first
+branch, we classify the input data as non-roof, boundary and plane points. In
+the second branch, we predict point offsets for shifting each point toward its
+respective instance center. In the third branch, we constrain points of the
+same plane instance to have similar embeddings. We aim to ensure that points of
+the same plane instance are as close as possible in both the Euclidean and
+embedding spaces. However, although deep networks have strong feature
+representation abilities, it is still hard to accurately distinguish points
+near plane instance boundaries. Therefore, we first group plane points into
+many clusters in the two spaces, and then we assign the remaining boundary
+points to their closest clusters to generate the final complete roof planes. In
+this way, we can effectively reduce the influence of unreliable boundary
+points. In addition, we prepare a synthetic dataset and two real datasets to
+train and evaluate our approach. The experimental results show that the
+proposed approach significantly outperforms the existing state-of-the-art
+approaches.
+
&#10;
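+ A minimal sketch of the three-branch design described above, with a toy per-point MLP backbone
+standing in for the paper's actual network; layer sizes and dimensions are assumptions.
+
+import torch
+import torch.nn as nn
+
+class ThreeBranchHead(nn.Module):
+    # Per-point heads: semantic label (non-roof / boundary / plane), offset toward
+    # the instance center, and an embedding feature used for clustering.
+    def __init__(self, in_dim=3, feat_dim=64, embed_dim=8):
+        super().__init__()
+        self.backbone = nn.Sequential(
+            nn.Linear(in_dim, feat_dim), nn.ReLU(),
+            nn.Linear(feat_dim, feat_dim), nn.ReLU(),
+        )
+        self.sem_head = nn.Linear(feat_dim, 3)       # non-roof / boundary / plane
+        self.offset_head = nn.Linear(feat_dim, 3)    # shift toward instance center
+        self.embed_head = nn.Linear(feat_dim, embed_dim)
+
+    def forward(self, points):                        # points: (N, 3)
+        feats = self.backbone(points)
+        return self.sem_head(feats), self.offset_head(feats), self.embed_head(feats)
+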
+
+
+
+
+ + ♻ ☆ The Instinctive Bias: Spurious Images lead to Illusion in MLLMs + + +
+ Large language models (LLMs) have recently experienced remarkable progress, +where the advent of multi-modal large language models (MLLMs) has endowed LLMs +with visual capabilities, leading to impressive performances in various +multi-modal tasks. However, those powerful MLLMs such as GPT-4V still fail +spectacularly when presented with certain image and text inputs. In this paper, +we identify a typical class of inputs that baffles MLLMs, which consist of +images that are highly relevant but inconsistent with answers, causing MLLMs to +suffer from visual illusion. To quantify the effect, we propose CorrelationQA, +the first benchmark that assesses the visual illusion level given spurious +images. This benchmark contains 7,308 text-image pairs across 13 categories. +Based on the proposed CorrelationQA, we conduct a thorough analysis on 9 +mainstream MLLMs, illustrating that they universally suffer from this +instinctive bias to varying degrees. We hope that our curated benchmark and +evaluation results aid in better assessments of the MLLMs' robustness in the +presence of misleading images. The code and datasets are available at +https://github.com/MasaiahHan/CorrelationQA. + +
+
+
+
+
+ + ♻ ☆ Rethinking and Defending Protective Perturbation in Personalized + Diffusion Models + + +
+ Personalized diffusion models (PDMs) have become prominent for adapting +pretrained text-to-image models to generate images of specific subjects using +minimal training data. However, PDMs are susceptible to minor adversarial +perturbations, leading to significant degradation when fine-tuned on corrupted +datasets. These vulnerabilities are exploited to create protective +perturbations that prevent unauthorized image generation. Existing purification +methods attempt to mitigate this issue but often over-purify images, resulting +in information loss. In this work, we conduct an in-depth analysis of the +fine-tuning process of PDMs through the lens of shortcut learning. We +hypothesize and empirically demonstrate that adversarial perturbations induce a +latent-space misalignment between images and their text prompts in the CLIP +embedding space. This misalignment causes the model to erroneously associate +noisy patterns with unique identifiers during fine-tuning, resulting in poor +generalization. Based on these insights, we propose a systematic defense +framework that includes data purification and contrastive decoupling learning. +We first employ off-the-shelf image restoration techniques to realign images +with their original semantic meanings in latent space. Then, we introduce +contrastive decoupling learning with noise tokens to decouple the learning of +personalized concepts from spurious noise patterns. Our study not only uncovers +fundamental shortcut learning vulnerabilities in PDMs but also provides a +comprehensive evaluation framework for developing stronger protection. Our +extensive evaluation demonstrates its superiority over existing purification +methods and stronger robustness against adaptive perturbation. + +
+
+ comment: Our code is available at + https://github.com/liuyixin-louis/DiffShortcut +
+
+
+
+
+ + ♻ ☆ LMOD: A Large Multimodal Ophthalmology Dataset and Benchmark for Large + Vision-Language Models + + +
+ Ophthalmology relies heavily on detailed image analysis for diagnosis and
+treatment planning. While large vision-language models (LVLMs) have shown
+promise in understanding complex visual information, their performance on
+ophthalmology images remains underexplored. We introduce LMOD, a dataset and
+benchmark for evaluating LVLMs on ophthalmology images, covering anatomical
+understanding, diagnostic analysis, and demographic extraction. LMOD includes
+21,993 images spanning optical coherence tomography, scanning laser
+ophthalmoscopy, eye photos, surgical scenes, and color fundus photographs. We
+benchmark 13 state-of-the-art LVLMs and find that they are far from perfect at
+comprehending ophthalmology images. Models struggle with diagnostic analysis
+and demographic extraction, and reveal weaknesses in spatial reasoning,
+handling out-of-domain queries, and safeguards for handling biomarkers of
+ophthalmology images.
+
&#10;
+
+
+
+
+ + ♻ ☆ Integrating Large Language Models into a Tri-Modal Architecture for + Automated Depression Classification + + +
+ Major Depressive Disorder (MDD) is a pervasive mental health condition that +affects 300 million people worldwide. This work presents a novel, BiLSTM-based +tri-modal model-level fusion architecture for the binary classification of +depression from clinical interview recordings. The proposed architecture +incorporates Mel Frequency Cepstral Coefficients, Facial Action Units, and uses +a two-shot learning based GPT-4 model to process text data. This is the first +work to incorporate large language models into a multi-modal architecture for +this task. It achieves impressive results on the DAIC-WOZ AVEC 2016 Challenge +cross-validation split and Leave-One-Subject-Out cross-validation split, +surpassing all baseline models and multiple state-of-the-art models. In +Leave-One-Subject-Out testing, it achieves an accuracy of 91.01%, an F1-Score +of 85.95%, a precision of 80%, and a recall of 92.86%. + +
+
+ comment: Keywords: Multi-Modal Neural Networks, Deep Learning, Large Language + Models, Depression Diagnosis, Biomedical Informatics, DAIC-WOZ +
+
+
+
+
+ + ♻ ☆ Towards Data-and Knowledge-Driven Artificial Intelligence: A Survey on + Neuro-Symbolic Computing + + +
+ Neural-symbolic computing (NeSy), which pursues the integration of the
+symbolic and statistical paradigms of cognition, has been an active research
+area of Artificial Intelligence (AI) for many years. As NeSy shows promise of
+reconciling the advantages of reasoning and interpretability of symbolic
+representation and robust learning in neural networks, it may serve as a
+catalyst for the next generation of AI. In the present paper, we provide a
+systematic overview of the recent developments and important contributions of
+NeSy research. Firstly, we introduce the study history of this area, covering
+early work and foundations. We further discuss background concepts and identify
+key driving factors behind the development of NeSy. Afterward, we categorize
+recent landmark approaches along several main characteristics that underline
+this research paradigm, including neural-symbolic integration, knowledge
+representation, knowledge embedding, and functionality. Next, we briefly
+discuss the successful application of modern NeSy approaches in several
+domains. Then, we benchmark several NeSy methods on three representative
+application tasks. Finally, we identify the open problems together with
+potential future research directions. This survey is expected to help new
+researchers enter this rapidly evolving field and accelerate progress towards
+data- and knowledge-driven AI.
+
&#10;
+
+ comment: PAMI 2024 +
+
+
+
+
+ + ♻ ☆ Color Equivariant Network CVPR 2024 + + +
+ Group equivariant convolutional neural networks have been designed for a +variety of geometric transformations from 2D and 3D rotation groups, to +semi-groups such as scale. Despite the improved interpretability, accuracy and +generalizability afforded by these architectures, group equivariant networks +have seen limited application in the context of perceptual quantities such as +hue and saturation, even though their variation can lead to significant +reductions in classification performance. In this paper, we introduce +convolutional neural networks equivariant to variations in hue and saturation +by design. To achieve this, we leverage the observation that hue and saturation +transformations can be identified with the 2D rotation and 1D translation +groups respectively. Our hue-, saturation-, and fully color-equivariant +networks achieve equivariance to these perceptual transformations without an +increase in network parameters. We demonstrate the utility of our networks on +synthetic and real world datasets where color and lighting variations are +commonplace. + +
+
+ comment: Accepted at CVPR 2024 Equivariant Vision Workshop +
+
+
+
+
+ + ♻ ☆ Explainable Concept Generation through Vision-Language Preference + Learning + + +
+ Concept-based explanations have become a popular choice for explaining deep
+neural networks post-hoc because, unlike most other explainable AI techniques,
+they can be used to test high-level visual "concepts" that are not directly
+related to feature attributes. For instance, the concept of "stripes" is
+important to classify an image as a zebra. Concept-based explanation methods,
+however, require practitioners to guess and collect multiple candidate concept
+image sets, which can often be imprecise and labor-intensive. Addressing this
+limitation, in this paper, we frame concept image set creation as an image
+generation problem. However, since naively using a generative model does not
+result in meaningful concepts, we devise a reinforcement learning-based
+preference optimization (RLPO) algorithm that fine-tunes the vision-language
+generative model from approximate textual descriptions of concepts. Through a
+series of experiments, we demonstrate the capability of our method to
+articulate complex and abstract concepts that align with the test class and
+are otherwise challenging to craft manually. In addition to showing the
+efficacy and reliability of our method, we show how our method can be used as
+a diagnostic tool for analyzing neural networks.
+
&#10;
+
+ comment: 25 pages, 27 figures +
+
+
+
+
+ + ♻ ☆ EDADepth: Enhanced Data Augmentation for Monocular Depth Estimation + + +
+ Due to their text-to-image synthesis feature, diffusion models have recently
+seen increasing use in visual perception tasks, such as depth estimation. The
+lack of good-quality datasets makes the extraction of a fine-grained semantic
+context challenging for the diffusion models. The semantic context with fewer
+details further worsens the process of creating effective text embeddings that
+will be used as input for diffusion models. In this paper, we propose EDADepth,
+a novel enhanced data augmentation method to estimate monocular depth without
+using additional training data. We use Swin2SR, a super-resolution model, to
+enhance the quality of input images. We employ the BEiT pre-trained semantic
+segmentation model for better extraction of text embeddings. We use the BLIP-2
+tokenizer to generate tokens from these text embeddings. The novelty of our
+approach is the introduction of Swin2SR, the BEiT model, and the BLIP-2
+tokenizer in the diffusion-based pipeline for monocular depth estimation. Our
+model achieves state-of-the-art (SOTA) results on the delta3 metric on the
+NYUv2 and KITTI datasets. It also achieves results comparable to those of the
+SOTA models in the RMSE and REL metrics. Finally, we also show improvements in
+the visualization of the estimated depth compared to the SOTA diffusion-based
+monocular depth estimation models. Code:
+https://github.com/edadepthmde/EDADepth_ICMLA.
+
&#10;
+
+
+
+
+
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ Unified Multi-Modal Interleaved Document Representation for Information + Retrieval + + +
+ Information Retrieval (IR) methods aim to identify relevant documents in +response to a given query, which have gained remarkable attention due to their +successful application in various natural language tasks. However, existing +approaches typically consider only the textual information within the +documents, which overlooks the fact that documents can contain multiple +modalities, including texts, images, and tables. Further, they often segment +each long document into multiple discrete passages for embedding, preventing +them from capturing the overall document context and interactions between +paragraphs. We argue that these two limitations lead to suboptimal document +representations for retrieval. In this work, to address them, we aim to produce +more comprehensive and nuanced document representations by holistically +embedding documents interleaved with different modalities. Specifically, we +achieve this by leveraging the capability of recent vision-language models that +enable the processing and integration of text, images, and tables into a +unified format and representation. Moreover, to mitigate the information loss +from segmenting documents into passages, instead of representing and retrieving +passages individually, we further merge the representations of segmented +passages into one single document representation, while we additionally +introduce a reranking strategy to decouple and identify the relevant passage +within the document if necessary. Then, through extensive experiments on +diverse information retrieval scenarios considering both the textual and +multimodal queries, we show that our approach substantially outperforms +relevant baselines, thanks to the consideration of the multimodal information +interleaved within the documents in a unified way. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Domain-Specific Retrieval-Augmented Generation Using Vector Stores, + Knowledge Graphs, and Tensor Factorization ICML + + +
+ Large Language Models (LLMs) are pre-trained on large-scale corpora and excel
+in numerous general natural language processing (NLP) tasks, such as question
+answering (QA). Despite their advanced language capabilities, when it comes to
+domain-specific and knowledge-intensive tasks, LLMs suffer from hallucinations,
+knowledge cut-offs, and lack of knowledge attributions. Additionally,
+fine-tuning LLMs' intrinsic knowledge to highly specific domains is an
+expensive and time-consuming process. Retrieval-augmented generation (RAG) has
+recently emerged as a method for optimizing LLM responses by grounding them in
+a predetermined ontology. It was shown that using a Knowledge Graph (KG)
+ontology for RAG improves the QA accuracy, by taking into account relevant
+sub-graphs that preserve the information in a structured manner. In this paper,
+we introduce SMART-SLIC, a highly domain-specific LLM framework that integrates
+RAG with a KG and a vector store (VS) that stores factual domain-specific
+information. Importantly, to avoid hallucinations in the KG, we build these
+highly domain-specific KGs and VSs without the use of LLMs, but via NLP, data
+mining, and nonnegative tensor factorization with automatic model selection.
+Pairing our RAG with a domain-specific (i) KG (containing structured
+information) and (ii) VS (containing unstructured information) enables the
+development of domain-specific chat-bots that attribute the source of
+information, mitigate hallucinations, lessen the need for fine-tuning, and
+excel in highly domain-specific question answering tasks. We pair SMART-SLIC
+with chain-of-thought prompting agents. The framework is designed to be
+generalizable to adapt to any specific or specialized domain. In this paper, we
+demonstrate the question answering capabilities of our framework on a corpus of
+scientific publications on malware analysis and anomaly detection.
+
&#10;
+
+ comment: 9 pages 7 figures, 1 table, 1 cypher code Accepted to ICMLA 2024 +
+
+
+
+
+ + ☆ Attention in Large Language Models Yields Efficient Zero-Shot Re-Rankers + + +
+ Information retrieval (IR) systems have played a vital role in modern digital
+life and have cemented their continued usefulness in this new era of generative
+AI via retrieval-augmented generation. With strong language processing
+capabilities and remarkable versatility, large language models (LLMs) have
+become popular choices for zero-shot re-ranking in IR systems. So far,
+LLM-based re-ranking methods rely on strong generative capabilities, which
+restricts their use to either specialized or powerful proprietary models. Given
+these restrictions, we ask: is autoregressive generation necessary and optimal
+for LLMs to perform re-ranking? We hypothesize that there are abundant signals
+relevant to re-ranking within LLMs that might not be used to their full
+potential via generation. To more directly leverage such signals, we propose
+in-context re-ranking (ICR), a novel method that leverages the change in
+attention pattern caused by the search query for accurate and efficient
+re-ranking. To mitigate the intrinsic biases in LLMs, we propose a calibration
+method using a content-free query. Due to the absence of generation, ICR only
+requires two ($O(1)$) forward passes to re-rank $N$ documents, making it
+substantially more efficient than generative re-ranking methods that require at
+least $O(N)$ forward passes. Our novel design also enables ICR to be applied to
+any LLM without specialized training while guaranteeing a well-formed ranking.
+Extensive experiments with two popular open-weight LLMs on standard single-hop
+and multi-hop information retrieval benchmarks show that ICR outperforms
+RankGPT while cutting the latency by more than 60% in practice. Through
+detailed analyses, we show that ICR's performance is especially strong on tasks
+that require more complex re-ranking signals. Our findings call for further
+exploration of novel ways of utilizing open-weight LLMs beyond text generation.
+
&#10;
+
+
+
+
+ + ☆ Long-Sequence Recommendation Models Need Decoupled Embeddings + + +
+ Lifelong user behavior sequences, comprising up to tens of thousands of +history behaviors, are crucial for capturing user interests and predicting user +responses in modern recommendation systems. A two-stage paradigm is typically +adopted to handle these long sequences: a few relevant behaviors are first +searched from the original long sequences via an attention mechanism in the +first stage and then aggregated with the target item to construct a +discriminative representation for prediction in the second stage. In this work, +we identify and characterize, for the first time, a neglected deficiency in +existing long-sequence recommendation models: a single set of embeddings +struggles with learning both attention and representation, leading to +interference between these two processes. Initial attempts to address this +issue using linear projections -- a technique borrowed from language processing +-- proved ineffective, shedding light on the unique challenges of +recommendation models. To overcome this, we propose the Decoupled Attention and +Representation Embeddings (DARE) model, where two distinct embedding tables are +initialized and learned separately to fully decouple attention and +representation. Extensive experiments and analysis demonstrate that DARE +provides more accurate search of correlated behaviors and outperforms baselines +with AUC gains up to 0.9% on public datasets and notable online system +improvements. Furthermore, decoupling embedding spaces allows us to reduce the +attention embedding dimension and accelerate the search procedure by 50% +without significant performance impact, enabling more efficient, +high-performance online serving. + +
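+ A minimal sketch of the decoupling idea described above: one embedding table is used only to
+search (attend over) the long history, the other only to build the representation that is
+aggregated for prediction. Dimensions and the scoring rule are illustrative, not the paper's
+exact design.
+
+import torch
+import torch.nn as nn
+
+class DecoupledEmbeddings(nn.Module):
+    def __init__(self, n_items, attn_dim=16, repr_dim=64):
+        super().__init__()
+        self.attn_emb = nn.Embedding(n_items, attn_dim)   # used only for attention / search
+        self.repr_emb = nn.Embedding(n_items, repr_dim)   # used only for representation
+
+    def forward(self, history, target):                    # history: (B, L), target: (B,)
+        q = self.attn_emb(target).unsqueeze(1)              # (B, 1, attn_dim)
+        k = self.attn_emb(history)                           # (B, L, attn_dim)
+        scores = torch.softmax((q * k).sum(-1), dim=-1)       # (B, L) relevance of each behavior
+        v = self.repr_emb(history)                             # (B, L, repr_dim)
+        pooled = (scores.unsqueeze(-1) * v).sum(1)              # (B, repr_dim)
+        return torch.cat([pooled, self.repr_emb(target)], dim=-1)
+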
+
+ comment: First three authors contributed equally +
+
+
+
+
+ + ☆ Quantifying User Coherence: A Unified Framework for Cross-Domain + Recommendation Analysis + + +
+ The effectiveness of Recommender Systems (RS) is closely tied to the quality +and distinctiveness of user profiles, yet despite many advancements in raw +performance, the sensitivity of RS to user profile quality remains +under-researched. This paper introduces novel information-theoretic measures +for understanding recommender systems: a "surprise" measure quantifying users' +deviations from popular choices, and a "conditional surprise" measure capturing +user interaction coherence. We evaluate 7 recommendation algorithms across 9 +datasets, revealing the relationships between our measures and standard +performance metrics. Using a rigorous statistical framework, our analysis +quantifies how much user profile density and information measures impact +algorithm performance across domains. By segmenting users based on these +measures, we achieve improved performance with reduced data and show that +simpler algorithms can match complex ones for low-coherence users. +Additionally, we employ our measures to analyze how well different +recommendation algorithms maintain the coherence and diversity of user +preferences in their predictions, providing insights into algorithm behavior. +This work advances the theoretical understanding of user behavior and practical +heuristics for personalized recommendation systems, promoting more efficient +and adaptive architectures. + +
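+ A rough sketch of a popularity-based "surprise" score, read here as the self-information of each
+interacted item averaged per user; the paper's exact estimator and its conditional variant may
+differ.
+
+import numpy as np
+from collections import Counter
+
+def surprise_scores(interactions):
+    # -log2 p(item) for every interacted item, averaged per user: users who favor
+    # rare items deviate more from popular choices and score higher.
+    all_items = [i for items in interactions.values() for i in items]
+    counts = Counter(all_items)
+    total = sum(counts.values())
+    item_info = {i: -np.log2(c / total) for i, c in counts.items()}
+    return {u: float(np.mean([item_info[i] for i in items]))
+            for u, items in interactions.items()}
+
+# Toy usage: user "b" interacts with rarer items, so receives a higher surprise score.
+logs = {"a": ["pop1", "pop1", "pop2"], "b": ["rare1", "pop2", "rare2"]}
+print(surprise_scores(logs))
+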
+
+
+
+
+ + ☆ Multi-modal clothing recommendation model based on large model and VAE + enhancement + + +
+ Accurately recommending products has long been a subject requiring in-depth +research. This study proposes a multimodal paradigm for clothing +recommendations. Specifically, it designs a multimodal analysis method that +integrates clothing description texts and images, utilizing a pre-trained large +language model to deeply explore the hidden meanings of users and products. +Additionally, a variational encoder is employed to learn the relationship +between user information and products to address the cold start problem in +recommendation systems. This study also validates the significant performance +advantages of this method over various recommendation system methods through +extensive ablation experiments, providing crucial practical guidance for the +comprehensive optimization of recommendation systems. + +
+
+
+
+
+ + ☆ A Survey on Point-of-Interest Recommendation: Models, Architectures, and + Security + + +
+ The widespread adoption of smartphones and Location-Based Social Networks has +led to a massive influx of spatio-temporal data, creating unparalleled +opportunities for enhancing Point-of-Interest (POI) recommendation systems. +These advanced POI systems are crucial for enriching user experiences, enabling +personalized interactions, and optimizing decision-making processes in the +digital landscape. However, existing surveys tend to focus on traditional +approaches and few of them delve into cutting-edge developments, emerging +architectures, as well as security considerations in POI recommendations. To +address this gap, our survey stands out by offering a comprehensive, up-to-date +review of POI recommendation systems, covering advancements in models, +architectures, and security aspects. We systematically examine the transition +from traditional models to advanced techniques such as large language models. +Additionally, we explore the architectural evolution from centralized to +decentralized and federated learning systems, highlighting the improvements in +scalability and privacy. Furthermore, we address the increasing importance of +security, examining potential vulnerabilities and privacy-preserving +approaches. Our taxonomy provides a structured overview of the current state of +POI recommendation, while we also identify promising directions for future +research in this rapidly advancing field. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ BayesCNS: A Unified Bayesian Approach to Address Cold Start and + Non-Stationarity in Search Systems at Scale + + +
+ Information Retrieval (IR) systems used in search and recommendation +platforms frequently employ Learning-to-Rank (LTR) models to rank items in +response to user queries. These models heavily rely on features derived from +user interactions, such as clicks and engagement data. This dependence +introduces cold start issues for items lacking user engagement and poses +challenges in adapting to non-stationary shifts in user behavior over time. We +address both challenges holistically as an online learning problem and propose +BayesCNS, a Bayesian approach designed to handle cold start and non-stationary +distribution shifts in search systems at scale. BayesCNS achieves this by +estimating prior distributions for user-item interactions, which are +continuously updated with new user interactions gathered online. This online +learning procedure is guided by a ranker model, enabling efficient exploration +of relevant items using contextual information provided by the ranker. We +successfully deployed BayesCNS in a large-scale search system and demonstrated +its efficacy through comprehensive offline and online experiments. Notably, an +online A/B experiment showed a 10.60% increase in new item interactions and a +1.05% improvement in overall success metrics over the existing production +baseline. + +
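+ One common way to realize "prior distributions updated online with new user interactions" is a
+Beta-Bernoulli model with posterior (Thompson) sampling; the sketch below shows that generic
+construction and is not necessarily the exact model behind BayesCNS.
+
+import numpy as np
+
+class BetaArm:
+    # Start a cold item from a prior over its engagement rate and update the
+    # posterior with every impression; sampling from the posterior trades off
+    # exploring cold items against exploiting well-known ones.
+    def __init__(self, prior_alpha=1.0, prior_beta=20.0):
+        self.alpha, self.beta = prior_alpha, prior_beta   # prior pseudo-counts
+
+    def sample(self, rng):
+        return rng.beta(self.alpha, self.beta)            # posterior draw of the rate
+
+    def update(self, clicked: bool):
+        self.alpha += clicked
+        self.beta += not clicked
+
+rng = np.random.default_rng(0)
+arm = BetaArm()
+for clicked in [0, 1, 0, 0, 1]:
+    arm.update(bool(clicked))
+print(arm.sample(rng))
+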
+
+
+
+
+ + ♻ ☆ LML-DAP: Language Model Learning a Dataset for Data-Augmented Prediction + + +
+ Classification tasks are typically handled using Machine Learning (ML) +models, which lack a balance between accuracy and interpretability. This paper +introduces a new approach to using Large Language Models (LLMs) for +classification tasks in an explainable way. Unlike ML models that rely heavily +on data cleaning and feature engineering, this method streamlines the process +using LLMs. This paper proposes a new concept called "Language Model Learning +(LML)" powered by a new method called "Data-Augmented Prediction (DAP)". The +classification is performed by LLMs using a method similar to humans manually +exploring and understanding the data and deciding classifications using data as +a reference. In the LML process, a dataset is summarized and evaluated to +determine the features that lead to the classification of each label the most. +In the process of DAP, the system uses the data summary and a row of the +testing dataset to automatically generate a query, which is used to retrieve +relevant rows from the dataset. A classification is generated by the LLM using +data summary and relevant rows, ensuring satisfactory accuracy even with +complex data using context-aware decision-making. LML and DAP unlock the +possibilities of new applications. The proposed method uses the words "Act as +an Explainable Machine Learning Model" in the prompt to enhance the +interpretability of the predictions by allowing users to review the logic +behind each prediction. In some test cases, the system scored an accuracy above +90%, proving the effectiveness of the system and its potential to outperform +conventional ML models in various scenarios. The code is available at +https://github.com/Pro-GenAI/LML-DAP + +
+
+ comment: Updated title, abstract, and images +
+
+
+
+
+ + ♻ ☆ Exploring the Practicality of Generative Retrieval on Dynamic Corpora + + +
+ Benchmarking the performance of information retrieval (IR) is mostly +conducted with a fixed set of documents (static corpora). However, in realistic +scenarios, this is rarely the case and the documents to be retrieved are +constantly updated and added. In this paper, we focus on Generative Retrievals +(GR), which apply autoregressive language models to IR problems, and explore +their adaptability and robustness in dynamic scenarios. We also conduct an +extensive evaluation of computational and memory efficiency, crucial factors +for real-world deployment of IR systems handling vast and ever-changing +document collections. Our results on the StreamingQA benchmark demonstrate that +GR is more adaptable to evolving knowledge (4 -- 11%), robust in learning +knowledge with temporal information, and efficient in terms of inference FLOPs +(x 2), indexing time (x 6), and storage footprint (x 4) compared to Dual +Encoders (DE), which are commonly used in retrieval systems. Our paper +highlights the potential of GR for future use in practical IR systems within +dynamic environments. + +
+
+
+
+
+ + ♻ ☆ Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on + Graphs + + +
+ Large language models (LLMs), while exhibiting exceptional performance, +suffer from hallucinations, especially on knowledge-intensive tasks. Existing +works propose to augment LLMs with individual text units retrieved from +external knowledge corpora to alleviate the issue. However, in many domains, +texts are interconnected (e.g., academic papers in a bibliographic graph are +linked by citations and co-authorships) which form a (text-attributed) graph. +The knowledge in such graphs is encoded not only in single texts/nodes but also +in their associated connections. To facilitate the research of augmenting LLMs +with graphs, we manually construct a Graph Reasoning Benchmark dataset called +GRBench, containing 1,740 questions that can be answered with the knowledge +from 10 domain graphs. Then, we propose a simple and effective framework called +Graph Chain-of-thought (Graph-CoT) to augment LLMs with graphs by encouraging +LLMs to reason on the graph iteratively. Each Graph-CoT iteration consists of +three sub-steps: LLM reasoning, LLM-graph interaction, and graph execution. We +conduct systematic experiments with three LLM backbones on GRBench, where +Graph-CoT outperforms the baselines consistently. The code is available at +https://github.com/PeterGriffinJin/Graph-CoT. + +
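+ The iteration described above (LLM reasoning, LLM-graph interaction, graph execution) can be
+sketched as a loop over placeholder `llm` and `graph` handles; this is an assumption-laden
+illustration, not the released Graph-CoT code.
+
+def graph_cot(question, llm, graph, max_iters=5):
+    # Each iteration: the LLM states what it still needs, proposes one graph call,
+    # the call is executed, and the observation is appended to the context.
+    context = f"Question: {question}\n"
+    for _ in range(max_iters):
+        thought = llm(context + "What information is needed next, or ANSWER if done?")
+        if thought.strip().upper().startswith("ANSWER"):
+            return thought.split(":", 1)[-1].strip()
+        action = llm(context + f"Thought: {thought}\nPropose one graph call, "
+                               "e.g. Neighbors(node) or Feature(node, attr).")
+        observation = graph.execute(action)        # graph execution step
+        context += f"Thought: {thought}\nAction: {action}\nObservation: {observation}\n"
+    return llm(context + "Give the final answer.")
+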
+
+ comment: 21 pages. Code: https://github.com/PeterGriffinJin/Graph-CoT +
+
+
+
+
+ + ♻ ☆ Stalactite: Toolbox for Fast Prototyping of Vertical Federated Learning + Systems + + +
+ Machine learning (ML) models trained on datasets owned by different
+organizations and physically located in remote databases offer benefits in many
+real-world use cases. State regulations or business requirements often prevent
+data transfer to a central location, making it difficult to utilize standard
+machine learning algorithms. Federated Learning (FL) is a technique that
+enables models to learn from distributed datasets without revealing the
+original data. Vertical Federated Learning (VFL) is a type of FL where data
+samples are divided by features across several data owners. For instance, in a
+recommendation task, a user can interact with various sets of items, and the
+logs of these interactions are stored by different organizations. In this demo
+paper, we present \emph{Stalactite} - an open-source framework for VFL that
+provides the necessary functionality for building prototypes of VFL systems. It
+has several advantages over the existing frameworks. In particular, it allows
+researchers to focus on the algorithmic side rather than engineering and to
+easily deploy learning in a distributed environment. It implements several VFL
+algorithms and has a built-in homomorphic encryption layer. We demonstrate its
+use on a real-world recommendation dataset.
+
&#10;
+
+
+
+
+ + ♻ ☆ Language Representations Can be What Recommenders Need: Findings and + Potentials + + +
+ Recent studies empirically indicate that language models (LMs) encode rich +world knowledge beyond mere semantics, attracting significant attention across +various fields. However, in the recommendation domain, it remains uncertain +whether LMs implicitly encode user preference information. Contrary to +prevailing understanding that LMs and traditional recommenders learn two +distinct representation spaces due to the huge gap in language and behavior +modeling objectives, this work re-examines such understanding and explores +extracting a recommendation space directly from the language representation +space. Surprisingly, our findings demonstrate that item representations, when +linearly mapped from advanced LM representations, yield superior recommendation +performance. This outcome suggests the possible homomorphism between the +advanced language representation space and an effective item representation +space for recommendation, implying that collaborative signals may be implicitly +encoded within LMs. Motivated by these findings, we explore the possibility of +designing advanced collaborative filtering (CF) models purely based on language +representations without ID-based embeddings. To be specific, we incorporate +several crucial components to build a simple yet effective model, with item +titles as the input. Empirical results show that such a simple model can +outperform leading ID-based CF models, which sheds light on using language +representations for better recommendation. Moreover, we systematically analyze +this simple model and find several key features for using advanced language +representations: a good initialization for item representations, zero-shot +recommendation abilities, and being aware of user intention. Our findings +highlight the connection between language modeling and behavior modeling, which +can inspire both natural language processing and recommender system +communities. + +
+
+ comment: Codes are available at https://github.com/LehengTHU/AlphaRec +
+
+
+
+
+ + ♻ ☆ BlueTempNet: A Temporal Multi-network Dataset of Social Interactions in + Bluesky Social + + +
+ Decentralized social media platforms like Bluesky Social (Bluesky) have made +it possible to publicly disclose some user behaviors with millisecond-level +precision. Embracing Bluesky's principles of open-source and open-data, we +present the first collection of the temporal dynamics of user-driven social +interactions. BlueTempNet integrates multiple types of networks into a single +multi-network, including user-to-user interactions (following and blocking +users) and user-to-community interactions (creating and joining communities). +Communities are user-formed groups in custom Feeds, where users subscribe to +posts aligned with their interests. Following Bluesky's public data policy, we +collect existing Bluesky Feeds, including the users who liked and generated +these Feeds, and provide tools to gather users' social interactions within a +date range. This data-collection strategy captures past user behaviors and +supports the future data collection of user behavior. + +
+
+ comment: accepted to IEEE Data Descriptions 24 +
+
+
+
+
+ + ♻ ☆ Do We Need Domain-Specific Embedding Models? An Empirical Investigation + + +
+ Embedding models play a crucial role in representing and retrieving +information across various NLP applications. Recent advancements in Large +Language Models (LLMs) have further enhanced the performance of embedding +models, which are trained on massive amounts of text covering almost every +domain. These models are often benchmarked on general-purpose datasets like +Massive Text Embedding Benchmark (MTEB), where they demonstrate superior +performance. However, a critical question arises: Is the development of +domain-specific embedding models necessary when general-purpose models are +trained on vast corpora that already include specialized domain texts? In this +paper, we empirically investigate this question, choosing the finance domain as +an example. We introduce the Finance Massive Text Embedding Benchmark +(FinMTEB), a counterpart to MTEB that consists of financial domain-specific +text datasets. We evaluate the performance of seven state-of-the-art embedding +models on FinMTEB and observe a significant performance drop compared to their +performance on MTEB. To account for the possibility that this drop is driven by +FinMTEB's higher complexity, we propose four measures to quantify dataset +complexity and control for this factor in our analysis. Our analysis provides +compelling evidence that state-of-the-art embedding models struggle to capture +domain-specific linguistic and semantic patterns. Moreover, we find that the +performance of general-purpose embedding models on MTEB is not correlated with +their performance on FinMTEB, indicating the need for domain-specific embedding +benchmarks for domain-specific embedding models. This study sheds light on +developing domain-specific embedding models in the LLM era. FinMTEB comes with +open-source code at https://github.com/yixuantt/FinMTEB + +
+
+ comment: https://github.com/yixuantt/FinMTEB +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Flash-Splat: 3D Reflection Removal with Flash Cues and Gaussian Splats + + +
+ We introduce a simple yet effective approach for separating transmitted and +reflected light. Our key insight is that the powerful novel view synthesis +capabilities provided by modern inverse rendering methods (e.g.,~3D Gaussian +splatting) allow one to perform flash/no-flash reflection separation using +unpaired measurements -- this relaxation dramatically simplifies image +acquisition over conventional paired flash/no-flash reflection separation +methods. Through extensive real-world experiments, we demonstrate our method, +Flash-Splat, accurately reconstructs both transmitted and reflected scenes in +3D. Our method outperforms existing 3D reflection separation methods, which do +not leverage illumination control, by a large margin. Our project webpage is at +https://flash-splat.github.io/. + +
+
+
+
+
+ + ☆ Vinoground: Scrutinizing LMMs over Dense Temporal Reasoning with Short + Videos + + +
+ There has been growing sentiment recently that modern large multimodal models +(LMMs) have addressed most of the key challenges related to short video +comprehension. As a result, both academia and industry are gradually shifting +their attention towards the more complex challenges posed by understanding +long-form videos. However, is this really the case? Our studies indicate that +LMMs still lack many fundamental reasoning capabilities even when dealing with +short videos. We introduce Vinoground, a temporal counterfactual LMM evaluation +benchmark encompassing 1000 short and natural video-caption pairs. We +demonstrate that existing LMMs severely struggle to distinguish temporal +differences between different actions and object transformations. For example, +the best model GPT-4o only obtains ~50% on our text and video scores, showing a +large gap compared to the human baseline of ~90%. All open-source multimodal +models and CLIP-based models perform much worse, producing mostly random chance +performance. Through this work, we shed light onto the fact that temporal +reasoning in short videos is a problem yet to be fully solved. The dataset and +evaluation code are available at https://vinoground.github.io. + +
+
+ comment: Project Page: https://vinoground.github.io +
+
+
+
+
+ + ☆ Interpreting and Editing Vision-Language Representations to Mitigate + Hallucinations + + +
+ We investigate the internal representations of vision-language models (VLMs) +to address hallucinations, a persistent challenge despite advances in model +size and training. We project VLMs' internal image representations to their +language vocabulary and observe more confident output probabilities on real +objects than hallucinated objects. We additionally use these output +probabilities to spatially localize real objects. Building on this approach, we +introduce a knowledge erasure algorithm that removes hallucinations by linearly +orthogonalizing image features with respect to hallucinated object features. We +show that targeted edits to a model's latent representations can reduce +hallucinations by up to 25.7% on the COCO2014 dataset while preserving +performance. Our findings demonstrate how a deeper understanding of VLMs' +latent representations can enhance reliability and enable novel capabilities, +such as zero-shot segmentation. + +
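+ The linear edit described above amounts to removing the component of the image features that
+lies along a hallucinated-object direction, leaving the features orthogonal to it. A minimal
+sketch follows; how that direction is estimated in the paper is not shown here.
+
+import torch
+
+def remove_direction(image_feats: torch.Tensor, halluc_dir: torch.Tensor) -> torch.Tensor:
+    # Project the features onto the (unit-normalized) hallucinated-object direction
+    # and subtract that component.
+    u = halluc_dir / halluc_dir.norm()
+    return image_feats - (image_feats @ u).unsqueeze(-1) * u
+
+feats = torch.randn(5, 768)            # toy image-token features
+direction = torch.randn(768)           # toy hallucinated-object feature direction
+edited = remove_direction(feats, direction)
+print((edited @ direction).abs().max())   # ~0: edited features are orthogonal to the direction
+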
+
+ comment: Project page and code: http://anishk23733.github.io/vl-interp/ +
+
+
+
+
+ + ☆ Erasing Conceptual Knowledge from Language Models + + +
+ Concept erasure in language models has traditionally lacked a comprehensive +evaluation framework, leading to incomplete assessments of effectiveness of +erasure methods. We propose an evaluation paradigm centered on three critical +criteria: innocence (complete knowledge removal), seamlessness (maintaining +conditional fluent generation), and specificity (preserving unrelated task +performance). Our evaluation metrics naturally motivate the development of +Erasure of Language Memory (ELM), a new method designed to address all three +dimensions. ELM employs targeted low-rank updates to alter output distributions +for erased concepts while preserving overall model capabilities including +fluency when prompted for an erased concept. We demonstrate ELM's efficacy on +biosecurity, cybersecurity, and literary domain erasure tasks. Comparative +analysis shows that ELM achieves superior performance across our proposed +metrics, including near-random scores on erased topic assessments, generation +fluency, maintained accuracy on unrelated benchmarks, and robustness under +adversarial attacks. Our code, data, and trained models are available at +https://elm.baulab.info + +
+
+ comment: Project Page: https://elm.baulab.info +
+
+
+
+
+ + ☆ Forecasting Smog Clouds With Deep Learning + + +
+ In this proof-of-concept study, we conduct multivariate timeseries +forecasting for the concentrations of nitrogen dioxide (NO2), ozone (O3), and +(fine) particulate matter (PM10 & PM2.5) with meteorological covariates between +two locations using various deep learning models, with a focus on long +short-term memory (LSTM) and gated recurrent unit (GRU) architectures. In +particular, we propose an integrated, hierarchical model architecture inspired +by air pollution dynamics and atmospheric science that employs multi-task +learning and is benchmarked by unidirectional and fully-connected models. +Results demonstrate that, above all, the hierarchical GRU proves itself as a +competitive and efficient method for forecasting the concentration of +smog-related pollutants. + +
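+ A minimal sketch of a multi-task recurrent forecaster in this spirit: a shared GRU over the
+covariate history with one output head per pollutant. Layer sizes, the forecast horizon, and the
+exact hierarchy used in the paper are assumptions.
+
+import torch
+import torch.nn as nn
+
+class HierarchicalGRU(nn.Module):
+    # Shared GRU encoder over meteorological and pollutant history, with one small
+    # linear head per pollutant (e.g. NO2, O3, PM10, PM2.5).
+    def __init__(self, n_features=8, hidden=64, horizon=24, pollutants=4):
+        super().__init__()
+        self.gru = nn.GRU(n_features, hidden, batch_first=True)
+        self.heads = nn.ModuleList([nn.Linear(hidden, horizon) for _ in range(pollutants)])
+
+    def forward(self, x):                      # x: (batch, time, n_features)
+        _, h = self.gru(x)                     # h: (1, batch, hidden)
+        h = h.squeeze(0)
+        return torch.stack([head(h) for head in self.heads], dim=1)  # (batch, pollutants, horizon)
+
+model = HierarchicalGRU()
+out = model(torch.randn(2, 48, 8))
+print(out.shape)                               # torch.Size([2, 4, 24])
+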
+
+
+
+
+ + ☆ SIEVE: General Purpose Data Filtering System Matching GPT-4o Accuracy at + 1% the Cost + + +
+ Creating specialized large language models requires vast amounts of clean, +special purpose data for training and fine-tuning. With only a handful of +existing large-scale, domain-specific datasets, creation of new datasets is +required in most applications. This requires the development of new +application-specific filtering of web-scale data. Filtering with a +high-performance, general-purpose LLM such as GPT-4o can be highly effective, +but this is extremely expensive at web-scale. This paper proposes SIEVE, a +lightweight alternative that matches GPT-4o accuracy at a fraction of the cost. +SIEVE can perform up to 500 filtering operations for the cost of one GPT-4o +filtering call. The key to SIEVE is a seamless integration of GPT-4o and +lightweight T5 models, using active learning to fine-tune T5 in the background +with a small number of calls to GPT-4o. Once trained, it performs as well as +GPT-4o at a tiny fraction of the cost. We experimentally validate SIEVE on the +OpenWebText dataset, using five highly customized filter tasks targeting high +quality and domain-specific content. Our results demonstrate the effectiveness +and efficiency of our method in curating large, high-quality datasets for +language model training at a substantially lower cost (1%) than existing +techniques. To further validate SIEVE, experiments show that SIEVE and GPT-4o +achieve similar accuracy, with human evaluators preferring SIEVE's filtering +results to those of GPT-4o. + +
+
+
+
+
+ + ☆ ReLIC: A Recipe for 64k Steps of In-Context Reinforcement Learning for + Embodied AI + + +
+ Intelligent embodied agents need to quickly adapt to new scenarios by +integrating long histories of experience into decision-making. For instance, a +robot in an unfamiliar house initially wouldn't know the locations of objects +needed for tasks and might perform inefficiently. However, as it gathers more +experience, it should learn the layout of its environment and remember where +objects are, allowing it to complete new tasks more efficiently. To enable such +rapid adaptation to new tasks, we present ReLIC, a new approach for in-context +reinforcement learning (RL) for embodied agents. With ReLIC, agents are capable +of adapting to new environments using 64,000 steps of in-context experience +with full attention while being trained through self-generated experience via +RL. We achieve this by proposing a novel policy update scheme for on-policy RL +called "partial updates'' as well as a Sink-KV mechanism that enables effective +utilization of a long observation history for embodied agents. Our method +outperforms a variety of meta-RL baselines in adapting to unseen houses in an +embodied multi-object navigation task. In addition, we find that ReLIC is +capable of few-shot imitation learning despite never being trained with expert +demonstrations. We also provide a comprehensive analysis of ReLIC, highlighting +that the combination of large-scale RL training, the proposed partial updates +scheme, and the Sink-KV are essential for effective in-context learning. The +code for ReLIC and all our experiments is at https://github.com/aielawady/relic + +
+
+
+
+
+ + ☆ An Online Automatic Modulation Classification Scheme Based on Isolation + Distributional Kernel + + +
+ Automatic Modulation Classification (AMC), as a crucial technique in modern +non-cooperative communication networks, plays a key role in various civil and +military applications. However, existing AMC methods usually are complicated +and can work in batch mode only due to their high computational complexity. +This paper introduces a new online AMC scheme based on Isolation Distributional +Kernel. Our method stands out in two aspects. Firstly, it is the first proposal +to represent baseband signals using a distributional kernel. Secondly, it +introduces a pioneering AMC technique that works well in online settings under +realistic time-varying channel conditions. Through extensive experiments in +online settings, we demonstrate the effectiveness of the proposed classifier. +Our results indicate that the proposed approach outperforms existing baseline +models, including two state-of-the-art deep learning classifiers. Moreover, it +distinguishes itself as the first online classifier for AMC with linear time +complexity, which marks a significant efficiency boost for real-time +applications. + +
+
+
+
+
+ + ☆ Training Language Models on Synthetic Edit Sequences Improves Code + Synthesis + + +
+ Software engineers mainly write code by editing existing programs. In +contrast, large language models (LLMs) autoregressively synthesize programs in +a single pass. One explanation for this is the scarcity of open-sourced edit +data. While high-quality instruction data for code synthesis is already scarce, +high-quality edit data is even scarcer. To fill this gap, we develop a +synthetic data generation algorithm called LintSeq. This algorithm refactors +existing code into a sequence of code edits by using a linter to procedurally +sample across the error-free insertions that can be used to sequentially write +programs. It outputs edit sequences as text strings consisting of consecutive +program diffs. To test LintSeq, we use it to refactor a dataset of instruction ++ program pairs into instruction + program-diff-sequence tuples. Then, we +instruction finetune a series of smaller LLMs ranging from 2.6B to 14B +parameters on both the re-factored and original versions of this dataset, +comparing zero-shot performance on code synthesis benchmarks. We show that +during repeated sampling, edit sequence finetuned models produce more diverse +programs than baselines. This results in better inference-time scaling for +benchmark coverage as a function of samples, i.e. the fraction of problems +"pass@k" solved by any attempt given "k" tries. For example, on HumanEval +pass@50, small LLMs finetuned on synthetic edit sequences are competitive with +GPT-4 and outperform models finetuned on the baseline dataset by +20% (+/-3%) +in absolute score. Finally, we also pretrain our own tiny LMs for code +understanding. We show that finetuning tiny models on synthetic code edits +results in state-of-the-art code synthesis for the on-device model class. Our +150M parameter edit sequence LM matches or outperforms code models with twice +as many parameters, both with and without repeated sampling, including Codex +and AlphaCode. + +
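+ The output format described above (sequences of consecutive program diffs) can be sketched with
+difflib; the linter-guided sampling of intermediate program states, which is the core of LintSeq,
+is not reproduced here, so the versions are supplied by the caller.
+
+import difflib
+
+def edit_sequence(versions):
+    # Turn successive program states into a sequence of unified diffs, i.e. the
+    # text form of an edit sequence.
+    diffs = []
+    for old, new in zip(versions, versions[1:]):
+        diff = difflib.unified_diff(old.splitlines(keepends=True),
+                                    new.splitlines(keepends=True),
+                                    fromfile="before", tofile="after")
+        diffs.append("".join(diff))
+    return diffs
+
+states = [
+    "",
+    "def add(a, b):\n    return a + b\n",
+    "def add(a, b):\n    return a + b\n\nprint(add(1, 2))\n",
+]
+for d in edit_sequence(states):
+    print(d)
+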
+
+
+
+
+ + ☆ CriSPO: Multi-Aspect Critique-Suggestion-guided Automatic Prompt + Optimization for Text Generation + + +
+ Large language models (LLMs) can generate fluent summaries across domains +using prompting techniques, reducing the need to train models for summarization +applications. However, crafting effective prompts that guide LLMs to generate +summaries with the appropriate level of detail and writing style remains a +challenge. In this paper, we explore the use of salient information extracted +from the source document to enhance summarization prompts. We show that adding +keyphrases in prompts can improve ROUGE F1 and recall, making the generated +summaries more similar to the reference and more complete. The number of +keyphrases can control the precision-recall trade-off. Furthermore, our +analysis reveals that incorporating phrase-level salient information is +superior to word- or sentence-level. However, the impact on hallucination is +not universally positive across LLMs. To conduct this analysis, we introduce +Keyphrase Signal Extractor (CriSPO), a lightweight model that can be finetuned +to extract salient keyphrases. By using CriSPO, we achieve consistent ROUGE +improvements across datasets and open-weight and proprietary LLMs without any +LLM customization. Our findings provide insights into leveraging salient +information in building prompt-based summarization systems. + +
+
+
+
+
+ + ☆ Contrastive Localized Language-Image Pre-Training + + +
+ Contrastive Language-Image Pre-training (CLIP) has been a celebrated method +for training vision encoders to generate image/text representations +facilitating various applications. Recently, CLIP has been widely adopted as +the vision backbone of multimodal large language models (MLLMs) to connect +image inputs for language interactions. The success of CLIP as a +vision-language foundation model relies on aligning web-crawled noisy text +annotations at image levels. Nevertheless, such criteria may become +insufficient for downstream tasks in need of fine-grained vision +representations, especially when region-level understanding is demanding for +MLLMs. In this paper, we improve the localization capability of CLIP with +several advances. We propose a pre-training method called Contrastive Localized +Language-Image Pre-training (CLOC) by complementing CLIP with region-text +contrastive loss and modules. We formulate a new concept, promptable +embeddings, of which the encoder produces image embeddings easy to transform +into region representations given spatial hints. To support large-scale +pre-training, we design a visually-enriched and spatially-localized captioning +framework to effectively generate region-text pseudo-labels at scale. By +scaling up to billions of annotated images, CLOC enables high-quality regional +embeddings for image region recognition and retrieval tasks, and can be a +drop-in replacement of CLIP to enhance MLLMs, especially on referring and +grounding tasks. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Neutral residues: revisiting adapters for model extension + + +
+ We address the problem of extending a pretrained large language model to a +new domain that was not seen at training time, like adding a language for which +the original model has seen no or little training data. Popular solutions like +fine-tuning or low-rank adaptation are successful at domain adaptation, but +formally they do not add any extra capacity and degrade the performance in the +original domain. + Our paper analyzes this extension problem under three angles: data, +architecture and training procedure, which are advantageously considered +jointly. In particular, we improve adapters and make it possible to learn an +entire new language while ensuring that the output of the neural network is +almost unchanged in the original domain. For this purpose, we modify the new +residual blocks in a way that leads each new residual block to output +near-zeros in the original domain. + This solution of neutral residues, which borrows architectural components +from mixture of experts, is effective: with only 20% extra learnable weights +compared to an original model trained on English, we get results that are +significantly better than concurrent approaches (fine-tuning, low-rank or +vanilla adapters) in terms of the trade-off between learning a new language and +not forgetting English. + +
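+ A minimal sketch of a "neutral" residual block: a gated adapter branch initialized so that its
+output is near zero, leaving the original computation almost unchanged until the gate learns to
+fire on new-domain inputs. The gate parameterization is illustrative, not the paper's exact
+design.
+
+import torch
+import torch.nn as nn
+
+class NeutralResidual(nn.Module):
+    def __init__(self, dim, hidden):
+        super().__init__()
+        self.adapter = nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
+        self.gate = nn.Linear(dim, 1)
+        nn.init.zeros_(self.gate.weight)            # start neutral: gate ~ sigmoid(-4) ~ 0.018
+        nn.init.constant_(self.gate.bias, -4.0)
+
+    def forward(self, x):
+        g = torch.sigmoid(self.gate(x))             # near zero at init => near-zero residue
+        return x + g * self.adapter(x)
+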
+
+
+
+
+ + ☆ Grounding Large Language Models In Embodied Environment With Imperfect + World Models + + +
+ Despite widespread success in various applications, large language models
+(LLMs) often stumble when tackling basic physical reasoning or executing
+robotics tasks, due to a lack of direct experience with the physical nuances of
+the real world. To address these issues, we propose a Grounding Large language
+model with Imperfect world MOdel (GLIMO), which utilizes proxy world models
+such as simulators to collect and synthesize training data. GLIMO incorporates
+an LLM agent-based data generator to automatically create high-quality and
+diverse instruction datasets. The generator includes an iterative self-refining
+module for temporally consistent experience sampling, a diverse set of
+question-answering instruction seeds, and a retrieval-augmented generation
+module for reflecting on prior experiences. Comprehensive experiments show that
+our approach improves the performance of strong open-source LLMs like LLaMA-3,
+with performance boosts of 2.04 $\times$, 1.54 $\times$, and 1.82 $\times$
+across three different benchmarks, respectively. The performance is able to
+compete with or surpass that of larger counterparts such as GPT-4.
+
&#10;
+
+
+
+
+ + ☆ Salient Information Prompting to Steer Content in Prompt-based + Abstractive Summarization EMNLP 2024 + + +
+ Large language models (LLMs) can generate fluent summaries across domains +using prompting techniques, reducing the need to train models for summarization +applications. However, crafting effective prompts that guide LLMs to generate +summaries with the appropriate level of detail and writing style remains a +challenge. In this paper, we explore the use of salient information extracted +from the source document to enhance summarization prompts. We show that adding +keyphrases in prompts can improve ROUGE F1 and recall, making the generated +summaries more similar to the reference and more complete. The number of +keyphrases can control the precision-recall trade-off. Furthermore, our +analysis reveals that incorporating phrase-level salient information is +superior to word- or sentence-level. However, the impact on hallucination is +not universally positive across LLMs. To conduct this analysis, we introduce +Keyphrase Signal Extractor (SigExt), a lightweight model that can be finetuned +to extract salient keyphrases. By using SigExt, we achieve consistent ROUGE +improvements across datasets and open-weight and proprietary LLMs without any +LLM customization. Our findings provide insights into leveraging salient +information in building prompt-based summarization systems. + +
+
+ comment: Accepted to EMNLP 2024 Industry Track +
+
+
+
+
+ + ☆ Revisit Large-Scale Image-Caption Data in Pre-training Multimodal + Foundation Models + + +
+ Recent advancements in multimodal models highlight the value of rewritten +captions for improving performance, yet key challenges remain. For example, +while synthetic captions often provide superior quality and image-text +alignment, it is not clear whether they can fully replace AltTexts: the role of +synthetic captions and their interaction with original web-crawled AltTexts in +pre-training is still not well understood. Moreover, different multimodal +foundation models may have unique preferences for specific caption formats, but +efforts to identify the optimal captions for each model remain limited. In this +work, we propose a novel, controllable, and scalable captioning pipeline +designed to generate diverse caption formats tailored to various multimodal +models. By examining Short Synthetic Captions (SSC) towards Dense Synthetic +Captions (DSC+) as case studies, we systematically explore their effects and +interactions with AltTexts across models such as CLIP, multimodal LLMs, and +diffusion models. Our findings reveal that a hybrid approach that keeps both +synthetic captions and AltTexts can outperform the use of synthetic captions +alone, improving both alignment and performance, with each model demonstrating +preferences for particular caption formats. This comprehensive analysis +provides valuable insights into optimizing captioning strategies, thereby +advancing the pre-training of multimodal foundation models. + +
+
+ comment: CV/ML +
+
+
+
+
+ + ☆ OOD-Chameleon: Is Algorithm Selection for OOD Generalization Learnable? + + +
+ Out-of-distribution (OOD) generalization is challenging because distribution
+shifts come in many forms. A multitude of learning algorithms exist and each
+can improve performance in specific OOD situations. We posit that much of the
+challenge of OOD generalization lies in choosing the right algorithm for the
+right dataset. However, such algorithm selection is often elusive under complex
+real-world shifts. In this work, we formalize the task of algorithm selection
+for OOD generalization and investigate whether it could be approached by
+learning. We propose a solution, dubbed OOD-Chameleon, which treats the task as
+supervised classification over candidate algorithms. We construct a dataset of
+datasets to learn from, which represents diverse types, magnitudes and
+combinations of shifts (covariate shift, label shift, spurious correlations).
+We train the model to predict the relative performance of algorithms given a
+dataset's characteristics. This enables a priori selection of the best learning
+strategy, i.e., without training the various models as required by traditional
+model selection. Our experiments show that the adaptive selection outperforms
+any individual algorithm and simple selection heuristics, on unseen datasets of
+controllable and realistic image data. Inspecting the model shows that it
+learns non-trivial data/algorithm interactions, and reveals the conditions for
+any one algorithm to surpass another. This opens new avenues for (1) enhancing
+OOD generalization with existing algorithms instead of designing new ones, and
+(2) gaining insights into the applicability of existing algorithms with respect
+to datasets' properties.
+
+
+
+
+
+ + ☆ Data Similarity-Based One-Shot Clustering for Multi-Task Hierarchical + Federated Learning + + +
+ We address the problem of cluster identity estimation in a hierarchical +federated learning setting in which users work toward learning different tasks. +To overcome the challenge of task heterogeneity, users need to be grouped in a +way such that users with the same task are in the same group, conducting +training together, while sharing the weights of feature extraction layers with +the other groups. Toward that end, we propose a one-shot clustering algorithm +that can effectively identify and group users based on their data similarity. +This enables more efficient collaboration and sharing of a common layer +representation within the federated learning system. Our proposed algorithm not +only enhances the clustering process, but also overcomes challenges related to +privacy concerns, communication overhead, and the need for prior knowledge +about learning models or loss function behaviors. We validate our proposed +algorithm using various datasets such as CIFAR-10 and Fashion MNIST, and show +that it outperforms the baseline in terms of accuracy and variance reduction. + +
+
+ comment: To appear in Asilomar 2024 +
+
+
+
+
+ + ☆ Adaptive Inference-Time Compute: LLMs Can Predict if They Can Do Better, + Even Mid-Generation + + +
+ Inference-time computation is a powerful paradigm to enhance the performance +of large language models (LLMs), with Best-of-N sampling being a widely used +technique. However, this method is computationally expensive, requiring both +(1) an external reward model and (2) the generation of multiple samples. In +this work, we introduce a new generative self-evaluation scheme designed to +adaptively reduce the number of generated samples while maintaining or even +improving performance. We use a generative reward model formulation, allowing +the LLM to predict mid-generation the probability that restarting the +generation will yield a better response. These predictions are obtained without +an external reward model and can be used to decide whether or not to generate +more samples, prune unpromising samples early on, or to pick the best sample. +This capability is very inexpensive as it involves generating a single +predefined token. Trained using a dataset constructed with real unfiltered +LMSYS user prompts, Llama 3.1 8B's win rate against GPT-4 on AlpacaEval +increases from 21% to 34% with 16 samples and math performance on GSM8K +improves from 84% to 91%. By sampling only when the LLM determines that it is +beneficial to do so and adaptively adjusting temperature annealing, we +demonstrate that 74% of the improvement from using 16 samples can be achieved +with only 1.2 samples on average. We further demonstrate that 50-75% of samples +can be pruned early in generation with minimal degradation in performance. +Overall, our methods enable more efficient and scalable compute utilization +during inference for LLMs. + +
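+ A minimal sketch of how such adaptive sampling could be wired up, assuming a
+sampler and a self-evaluation call that returns the predicted probability that
+a restart would beat the current response; both calls below are random
+stand-ins, and the stopping threshold is an illustrative choice.
+
+import random
+
+def generate(prompt: str) -> str:
+    """Stand-in for one sampled LLM response."""
+    return "response-" + str(random.randint(0, 9999))
+
+def p_better_if_restart(prompt: str, response: str) -> float:
+    """Stand-in for the generative self-evaluation (a single predefined token)."""
+    return random.random()
+
+def adaptive_best_of_n(prompt: str, max_samples: int = 16, stop_below: float = 0.3) -> str:
+    best, best_score = None, float("inf")
+    for _ in range(max_samples):
+        candidate = generate(prompt)
+        score = p_better_if_restart(prompt, candidate)   # lower = candidate looks good
+        if score < best_score:
+            best, best_score = candidate, score
+        if score < stop_below:    # model predicts little gain from further samples
+            break
+    return best
+
+print(adaptive_best_of_n("Solve: 17 * 24 = ?"))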
+
+
+
+
+ + ☆ Large Language Models as Markov Chains + + +
+ Large language models (LLMs) have proven to be remarkably efficient, both +across a wide range of natural language processing tasks and well beyond them. +However, a comprehensive theoretical analysis of the origins of their +impressive performance remains elusive. In this paper, we approach this +challenging task by drawing an equivalence between generic autoregressive +language models with vocabulary of size $T$ and context window of size $K$ and +Markov chains defined on a finite state space of size $\mathcal{O}(T^K)$. We +derive several surprising findings related to the existence of a stationary +distribution of Markov chains that capture the inference power of LLMs, their +speed of convergence to it, and the influence of the temperature on the latter. +We then prove pre-training and in-context generalization bounds and show how +the drawn equivalence allows us to enrich their interpretation. Finally, we +illustrate our theoretical guarantees with experiments on several recent LLMs +to highlight how they capture the behavior observed in practice. + +
+
+ comment: 49 pages, 17 figures +
+
+
+
+
+ + ☆ SynthFormer: Equivariant Pharmacophore-based Generation of Molecules for + Ligand-Based Drug Design + + +
+ Drug discovery is a complex and resource-intensive process, with significant
+time and cost investments required to bring new medicines to patients. Recent
+advancements in generative machine learning (ML) methods offer promising
+avenues to accelerate early-stage drug discovery by efficiently exploring
+chemical space. This paper addresses the gap between in silico generative
+approaches and practical in vitro methodologies, highlighting the need for
+their integration to optimize molecule discovery. We introduce SynthFormer, a
+novel ML model that utilizes a 3D equivariant encoder for pharmacophores to
+generate fully synthesizable molecules, constructed as synthetic trees. Unlike
+previous methods, SynthFormer incorporates 3D information and provides
+synthetic paths, enhancing its ability to produce molecules with good docking
+scores across various proteins. Our contributions include a new methodology for
+efficient chemical space exploration using 3D information, a novel architecture
+called SynthFormer for translating 3D pharmacophore representations into
+molecules, and a meaningful embedding space that organizes reagents for drug
+discovery optimization. SynthFormer generates molecules that dock well and
+enables effective late-stage optimization constrained by synthesis paths.
+
+
+
+
+
+ + ☆ Measurements with Noise: Bayesian Optimization for Co-optimizing Noise + and Property Discovery in Automated Experiments + + +
+ We have developed a Bayesian optimization (BO) workflow that integrates +intra-step noise optimization into automated experimental cycles. Traditional +BO approaches in automated experiments focus on optimizing experimental +trajectories but often overlook the impact of measurement noise on data quality +and cost. Our proposed framework simultaneously optimizes both the target +property and the associated measurement noise by introducing time as an +additional input parameter, thereby balancing the signal-to-noise ratio and +experimental duration. Two approaches are explored: a reward-driven noise +optimization and a double-optimization acquisition function, both enhancing the +efficiency of automated workflows by considering noise and cost within the +optimization process. We validate our method through simulations and real-world +experiments using Piezoresponse Force Microscopy (PFM), demonstrating the +successful optimization of measurement duration and property exploration. Our +approach offers a scalable solution for optimizing multiple variables in +automated experimental workflows, improving data quality, and reducing resource +expenditure in materials science and beyond. + +
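+ A toy sketch of treating measurement duration as an extra Bayesian
+optimization input with a cost-aware acquisition; the Gaussian-process setup,
+the noise model, and the acquisition constants are illustrative assumptions,
+not the paper's reward-driven or double-optimization acquisition functions.
+
+import numpy as np
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import Matern
+
+rng = np.random.default_rng(0)
+
+def measure(x: float, t: float) -> float:
+    """Toy experiment: a signal whose noise shrinks with measurement time t."""
+    return np.sin(3 * x) + rng.normal(scale=0.5 / np.sqrt(t))
+
+# Initial design over (x, t): measurement duration is an input dimension.
+X = rng.uniform([0.0, 0.5], [2.0, 5.0], size=(8, 2))
+y = np.array([measure(x, t) for x, t in X])
+
+gp = GaussianProcessRegressor(kernel=Matern(nu=2.5), normalize_y=True)
+for _ in range(20):
+    gp.fit(X, y)
+    cand = rng.uniform([0.0, 0.5], [2.0, 5.0], size=(256, 2))
+    mu, sd = gp.predict(cand, return_std=True)
+    acq = mu + 1.0 * sd - 0.05 * cand[:, 1]     # value + exploration - time cost
+    x_next = cand[np.argmax(acq)]
+    X = np.vstack([X, x_next])
+    y = np.append(y, measure(*x_next))
+
+print("best observed (x, t) and value:", X[np.argmax(y)], y.max())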
+
+ comment: 22 pages, 9 figures +
+
+
+
+
+ + ☆ AlzhiNet: Traversing from 2DCNN to 3DCNN, Towards Early Detection and + Diagnosis of Alzheimer's Disease + + +
+ Alzheimer's disease (AD) is a progressive neurodegenerative disorder with
+increasing prevalence among the aging population, necessitating early and
+accurate diagnosis for effective disease management. In this study, we present
+a novel hybrid deep learning framework that integrates both 2D Convolutional
+Neural Networks (2D-CNN) and 3D Convolutional Neural Networks (3D-CNN), along
+with a custom loss function and volumetric data augmentation, to enhance
+feature extraction and improve classification performance in AD diagnosis.
+According to extensive experiments, AlzhiNet outperforms standalone 2D and 3D
+models, highlighting the importance of combining these complementary
+representations of data. The depth and quality of 3D volumes derived from the
+augmented 2D slices also significantly influence the model's performance. The
+results indicate that carefully selecting weighting factors in hybrid
+predictions is imperative for achieving optimal results. Our framework has been
+validated on Magnetic Resonance Imaging (MRI) data from the Kaggle and MIRIAD
+datasets, obtaining accuracies of 98.9% and 99.99%, respectively, with an AUC
+of 100%. Furthermore, AlzhiNet was studied under a variety of perturbation
+scenarios on the Alzheimer's Kaggle dataset, including Gaussian noise,
+brightness, contrast, salt and pepper noise, color jitter, and occlusion. The
+results obtained show that AlzhiNet is more robust to perturbations than
+ResNet-18, making it an excellent choice for real-world applications. This
+approach represents a promising advancement in the early diagnosis and
+treatment planning for Alzheimer's disease.
+
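+ The weighting of hybrid predictions mentioned above amounts to a late fusion
+of the two branches' class probabilities; a generic sketch follows, where the
+weighting factor w and the example probabilities are purely illustrative.
+
+import numpy as np
+
+def hybrid_prediction(p2d: np.ndarray, p3d: np.ndarray, w: float = 0.6) -> np.ndarray:
+    """Blend class probabilities from the 2D-CNN and 3D-CNN branches; w is the
+    weighting factor that has to be chosen carefully."""
+    assert 0.0 <= w <= 1.0
+    return w * p3d + (1.0 - w) * p2d
+
+p2d = np.array([0.2, 0.7, 0.1])    # softmax over classes from the 2D slices
+p3d = np.array([0.1, 0.8, 0.1])    # softmax from the reconstructed 3D volume
+blended = hybrid_prediction(p2d, p3d)
+print(blended, "predicted class:", blended.argmax())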
+
+
+
+
+ + ☆ NETS: A Non-Equilibrium Transport Sampler + + +
+ We propose an algorithm, termed the Non-Equilibrium Transport Sampler (NETS),
+to sample from unnormalized probability distributions. NETS can be viewed as a
+variant of annealed importance sampling (AIS) based on Jarzynski's equality, in
+which the stochastic differential equation used to perform the non-equilibrium
+sampling is augmented with an additional learned drift term that lowers the
+impact of the unbiasing weights used in AIS. We show that this drift is the
+minimizer of a variety of objective functions, which can all be estimated in an
+unbiased fashion without backpropagating through solutions of the stochastic
+differential equations governing the sampling. We also prove that some of these
+objectives control the Kullback-Leibler divergence of the estimated
+distribution from its target. NETS is shown to be unbiased and, in addition,
+has a tunable diffusion coefficient which can be adjusted post-training to
+maximize the effective sample size. We demonstrate the efficacy of the method
+on standard benchmarks, high-dimensional Gaussian mixture distributions, and a
+model from statistical lattice field theory, for which it surpasses the
+performance of related work and existing baselines.
+
+
+
+
+
+ + ☆ Selective Attention Improves Transformer + + +
+ Unneeded elements in the attention's context degrade performance. We
+introduce Selective Attention, a simple parameter-free change to the standard
+attention mechanism which reduces attention to unneeded elements. Selective
+attention improves language modeling performance across a variety of model
+sizes and context lengths. For example, a range of transformers trained with
+the language modeling objective on C4 with selective attention perform
+equivalently to standard transformers with ~2X more heads and parameters in
+their attention modules. Selective attention also allows decreasing the size of
+the attention's context buffer, leading to meaningful reductions in the memory
+and compute requirements during inference. For example, transformers with 100M
+parameters trained on C4 with context sizes of 512, 1,024, and 2,048 need 16X,
+25X, and 47X less memory for their attention module, respectively, when
+equipped with selective attention than those without it, at the same validation
+perplexity.
+
+
+
+
+
+ + ☆ Lie Algebra Canonicalization: Equivariant Neural Operators under + arbitrary Lie Groups + + +
+ The quest for robust and generalizable machine learning models has driven +recent interest in exploiting symmetries through equivariant neural networks. +In the context of PDE solvers, recent works have shown that Lie point +symmetries can be a useful inductive bias for Physics-Informed Neural Networks +(PINNs) through data and loss augmentation. Despite this, directly enforcing +equivariance within the model architecture for these problems remains elusive. +This is because many PDEs admit non-compact symmetry groups, oftentimes not +studied beyond their infinitesimal generators, making them incompatible with +most existing equivariant architectures. In this work, we propose Lie aLgebrA +Canonicalization (LieLAC), a novel approach that exploits only the action of +infinitesimal generators of the symmetry group, circumventing the need for +knowledge of the full group structure. To achieve this, we address existing +theoretical issues in the canonicalization literature, establishing connections +with frame averaging in the case of continuous non-compact groups. Operating +within the framework of canonicalization, LieLAC can easily be integrated with +unconstrained pre-trained models, transforming inputs to a canonical form +before feeding them into the existing model, effectively aligning the input for +model inference according to allowed symmetries. LieLAC utilizes standard Lie +group descent schemes, achieving equivariance in pre-trained models. Finally, +we showcase LieLAC's efficacy on tasks of invariant image classification and +Lie point symmetry equivariant neural PDE solvers using pre-trained models. + +
+
+ comment: 40 pages; preprint +
+
+
+
+
+ + ☆ Discovering Clues of Spoofed LM Watermarks + + +
+ LLM watermarks stand out as a promising way to attribute ownership of +LLM-generated text. One threat to watermark credibility comes from spoofing +attacks, where an unauthorized third party forges the watermark, enabling it to +falsely attribute arbitrary texts to a particular LLM. While recent works have +demonstrated that state-of-the-art schemes are in fact vulnerable to spoofing, +they lack deeper qualitative analysis of the texts produced by spoofing +methods. In this work, we for the first time reveal that there are observable +differences between genuine and spoofed watermark texts. Namely, we show that +regardless of their underlying approach, all current spoofing methods +consistently leave observable artifacts in spoofed texts, indicative of +watermark forgery. We build upon these findings to propose rigorous statistical +tests that reliably reveal the presence of such artifacts, effectively +discovering that a watermark was spoofed. Our experimental evaluation shows +high test power across all current spoofing methods, providing insights into +their fundamental limitations, and suggesting a way to mitigate this threat. + +
+
+
+
+
+ + ☆ DailyDilemmas: Revealing Value Preferences of LLMs with Quandaries of + Daily Life + + +
+ As we increasingly seek guidance from LLMs for decision-making in daily life,
+many of these decisions are not clear-cut and depend significantly on the
+personal values and ethical standards of the users. We present DailyDilemmas, a
+dataset of 1,360 moral dilemmas encountered in everyday life. Each dilemma
+includes two possible actions, along with the affected parties and the human
+values invoked by each action. Based on these dilemmas, we consolidated a set
+of human values across everyday topics, e.g., interpersonal relationships,
+workplace, and environmental issues. We evaluated LLMs on these dilemmas to
+determine which action they would take and the values represented by these
+actions. Then, we analyzed these values through the lens of five popular
+theories inspired by sociology, psychology and philosophy. These theories are:
+World Value Survey, Moral Foundation Theory, Maslow's Hierarchy of Needs,
+Aristotle's Virtues, and Plutchik Wheel of Emotion. We find that LLMs are most
+aligned with self-expression over survival values in terms of the World Value
+Survey, and with care over loyalty in Moral Foundation Theory. Interestingly,
+we find large preference differences across models for some core values such as
+truthfulness: e.g., the Mixtral-8x7B model tends to neglect it by 9.7% while
+the GPT-4-turbo model tends to select it by 9.4%. We also study the recent
+guidance released by OpenAI (ModelSpec) and Anthropic (Constitutional AI) to
+understand how their released principles reflect their actual value
+prioritization when facing nuanced moral reasoning in daily-life settings. We
+find that end users cannot effectively steer such prioritization using system
+prompts.
+
+
+ comment: Preprint. Under Review +
+
+
+
+
+ + ☆ Understanding and Mitigating Miscalibration in Prompt Tuning for + Vision-Language Models + + +
+ Confidence calibration is critical for the safe deployment of machine
+learning models in the real world. However, this issue in vision-language
+models like CLIP, particularly after fine-tuning, has not been fully addressed.
+In this work, we demonstrate that existing prompt tuning methods usually lead
+to a trade-off of calibration between base and new classes: the cross-entropy
+loss in CoOp causes overconfidence in new classes by increasing textual label
+divergence, whereas the regularization of KgCoOp maintains the confidence level
+but results in underconfidence in base classes due to the improved accuracy.
+Inspired by these observations, we introduce Dynamic Outlier Regularization
+(DOR) to ensure confidence calibration on both base and new classes after
+fine-tuning. In particular, we propose to minimize the feature deviation of
+novel textual labels (instead of base classes) sampled from a large vocabulary.
+In effect, DOR prevents the increase in textual divergence for new labels while
+easing restrictions on base classes. Extensive experiments demonstrate that DOR
+can enhance the calibration performance of current fine-tuning methods on base
+and new classes.
+
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Highly Adaptive Ridge + + +
+ In this paper we propose the Highly Adaptive Ridge (HAR): a regression method +that achieves a $n^{-1/3}$ dimension-free L2 convergence rate in the class of +right-continuous functions with square-integrable sectional derivatives. This +is a large nonparametric function class that is particularly appropriate for +tabular data. HAR is exactly kernel ridge regression with a specific +data-adaptive kernel based on a saturated zero-order tensor-product spline +basis expansion. We use simulation and real data to confirm our theory. We +demonstrate empirical performance better than state-of-the-art algorithms for +small datasets in particular. + +
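+ To make the kernel construction concrete, here is a simplified sketch of
+kernel ridge regression with a data-adaptive kernel built from zero-order
+spline (indicator) features; unlike the saturated tensor-product basis
+described above, this sketch only uses per-coordinate indicators, so it is an
+illustration under that assumption rather than a faithful implementation of
+HAR.
+
+import numpy as np
+
+def zero_order_spline_features(X: np.ndarray, knots: np.ndarray) -> np.ndarray:
+    """Indicator features 1{x_j >= knot_j} for every coordinate j, with knots
+    taken from the training points (hence data-adaptive)."""
+    n, d = X.shape
+    cols = [(X[:, j:j + 1] >= knots[:, j][None, :]).astype(float) for j in range(d)]
+    return np.hstack(cols)
+
+def kernel_ridge_fit_predict(X_tr, y_tr, X_te, lam=1e-2):
+    """Kernel ridge regression with K = Phi Phi^T over the indicator basis."""
+    Phi_tr = zero_order_spline_features(X_tr, knots=X_tr)
+    Phi_te = zero_order_spline_features(X_te, knots=X_tr)
+    K = Phi_tr @ Phi_tr.T
+    alpha = np.linalg.solve(K + lam * np.eye(len(y_tr)), y_tr)
+    return (Phi_te @ Phi_tr.T) @ alpha
+
+rng = np.random.default_rng(0)
+X = rng.uniform(size=(100, 2))
+y = (X[:, 0] > 0.5).astype(float) + 0.1 * rng.normal(size=100)
+print(kernel_ridge_fit_predict(X, y, rng.uniform(size=(5, 2))))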
+
+
+
+
+ + ☆ CulturalBench: a Robust, Diverse and Challenging Benchmark on Measuring + the (Lack of) Cultural Knowledge of LLMs + + +
+ To make large language models (LLMs) more helpful across diverse cultures, it
+is essential to have effective cultural knowledge benchmarks to measure and
+track our progress. Effective benchmarks need to be robust, diverse, and
+challenging. We introduce CulturalBench: a set of 1,227 human-written and
+human-verified questions for effectively assessing LLMs' cultural knowledge,
+covering 45 global regions including underrepresented ones like Bangladesh,
+Zimbabwe, and Peru. Questions - each verified by five independent annotators -
+span 17 diverse topics ranging from food preferences to greeting etiquette. We
+evaluate models on two setups: CulturalBench-Easy and CulturalBench-Hard, which
+share the same questions but ask them differently. We find that LLMs are
+sensitive to such differences in setup (e.g., GPT-4o with a 27.3% difference).
+Compared to human performance (92.6% accuracy), CulturalBench-Hard is more
+challenging for frontier LLMs, with the best performing model (GPT-4o) at only
+61.5% and the worst (Llama3-8b) at 21.4%. Moreover, we find that LLMs often
+struggle with tricky questions that have multiple correct answers (e.g., What
+utensils do the Chinese usually use?), revealing a tendency to converge to a
+single answer. Our results also indicate that OpenAI GPT-4o substantially
+outperforms other proprietary and open-source models on questions related to
+all but one region (Oceania). Nonetheless, all models consistently underperform
+on questions related to South America and the Middle East.
+
+
+ comment: Preprint. Under review +
+
+
+
+
+ + ☆ FAN: Fourier Analysis Networks + + +
+ Despite the remarkable success achieved by neural networks, particularly
+those represented by MLP and Transformer, we reveal that they exhibit potential
+flaws in the modeling and reasoning of periodicity, i.e., they tend to memorize
+the periodic data rather than genuinely understanding the underlying principles
+of periodicity. However, periodicity is a crucial trait in various forms of
+reasoning and generalization, underpinning predictability across natural and
+engineered systems through recurring patterns in observations. In this paper,
+we propose FAN, a novel network architecture based on Fourier Analysis, which
+makes it possible to efficiently model and reason about periodic phenomena. By
+introducing the Fourier series, periodicity is naturally integrated into the
+structure and computational processes of the neural network, thus achieving a
+more accurate expression and prediction of periodic patterns. As a promising
+substitute for the multi-layer perceptron (MLP), FAN can seamlessly replace MLP
+in various models with fewer parameters and FLOPs. Through extensive
+experiments, we demonstrate the effectiveness of FAN in modeling and reasoning
+about periodic functions, and the superiority and generalizability of FAN
+across a range of real-world tasks, including symbolic formula representation,
+time series forecasting, and language modeling.
+
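+ One plausible way to bake a Fourier series into a layer is to concatenate the
+sine and cosine of a learned linear projection with an ordinary nonlinear
+branch, as in the sketch below; the dimensions and the exact split between
+periodic and non-periodic units are assumptions, not necessarily the FAN layer
+as published.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class FourierFeatureLayer(nn.Module):
+    """Layer whose output mixes periodic (sin/cos) and standard MLP features."""
+    def __init__(self, d_in: int, d_periodic: int = 16, d_plain: int = 32):
+        super().__init__()
+        self.periodic = nn.Linear(d_in, d_periodic)
+        self.plain = nn.Linear(d_in, d_plain)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        p = self.periodic(x)
+        return torch.cat([torch.sin(p), torch.cos(p), F.gelu(self.plain(x))], dim=-1)
+
+layer = FourierFeatureLayer(d_in=1)
+x = torch.linspace(0, 10, 5).unsqueeze(-1)
+print(layer(x).shape)    # (5, 16 + 16 + 32)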
+
+
+
+
+ + ☆ GUD: Generation with Unified Diffusion + + +
+ Diffusion generative models transform noise into data by inverting a process +that progressively adds noise to data samples. Inspired by concepts from the +renormalization group in physics, which analyzes systems across different +scales, we revisit diffusion models by exploring three key design aspects: 1) +the choice of representation in which the diffusion process operates (e.g. +pixel-, PCA-, Fourier-, or wavelet-basis), 2) the prior distribution that data +is transformed into during diffusion (e.g. Gaussian with covariance $\Sigma$), +and 3) the scheduling of noise levels applied separately to different parts of +the data, captured by a component-wise noise schedule. Incorporating the +flexibility in these choices, we develop a unified framework for diffusion +generative models with greatly enhanced design freedom. In particular, we +introduce soft-conditioning models that smoothly interpolate between standard +diffusion models and autoregressive models (in any basis), conceptually +bridging these two approaches. Our framework opens up a wide design space which +may lead to more efficient training and data generation, and paves the way to +novel architectures integrating different generative approaches and generation +tasks. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ AlphaIntegrator: Transformer Action Search for Symbolic Integration + Proofs + + +
+ We present the first correct-by-construction learning-based system for +step-by-step mathematical integration. The key idea is to learn a policy, +represented by a GPT transformer model, which guides the search for the right +mathematical integration rule, to be carried out by a symbolic solver. +Concretely, we introduce a symbolic engine with axiomatically correct actions +on mathematical expressions, as well as the first dataset for step-by-step +integration. Our GPT-style transformer model, trained on this synthetic data, +demonstrates strong generalization by surpassing its own data generator in +accuracy and efficiency, using 50% fewer search steps. Our experimental results +with SoTA LLMs also demonstrate that the standard approach of fine-tuning LLMs +on a set of question-answer pairs is insufficient for solving this mathematical +task. This motivates the importance of discovering creative methods for +combining LLMs with symbolic reasoning engines, of which our work is an +instance. + +
+
+
+
+
+ + ☆ How to Train Long-Context Language Models (Effectively) + + +
+ We study continued training and supervised fine-tuning (SFT) of a language
+model (LM) to make effective use of long-context information. We first
+establish a reliable evaluation protocol to guide model development -- instead
+of perplexity or simple needle-in-a-haystack (NIAH) tests, we use a broad set
+of long-context tasks, and we evaluate models after SFT with instruction data
+as this better reveals long-context abilities. Supported by our robust
+evaluations, we run thorough experiments to decide the data mix for continued
+pre-training, the instruction tuning dataset, and many other design choices. We
+find that (1) code repositories and books are excellent sources of long data,
+but it is crucial to combine them with high-quality short data; (2) training
+with a sequence length beyond the evaluation length boosts long-context
+performance; (3) for SFT, using only short instruction datasets yields strong
+performance on long-context tasks. Our final model, ProLong-8B, which is
+initialized from Llama-3 and trained on 40B tokens, demonstrates
+state-of-the-art long-context performance among similarly sized models at a
+length of 128K. ProLong outperforms Llama-3.1-8B-Instruct on the majority of
+long-context tasks despite having seen only 5% as many tokens during
+long-context training. Additionally, ProLong can effectively process up to 512K
+tokens, one of the longest context windows of publicly available LMs.
+
+
+ comment: Our code, data, and models are available at + https://github.com/princeton-nlp/ProLong +
+
+
+
+
+ + ☆ Scalable Simulation-free Entropic Unbalanced Optimal Transport + + +
+ The Optimal Transport (OT) problem investigates a transport map that connects
+two distributions while minimizing a given cost function. Finding such a
+transport map has diverse applications in machine learning, such as generative
+modeling and image-to-image translation. In this paper, we introduce a scalable
+and simulation-free approach for solving the Entropic Unbalanced Optimal
+Transport (EUOT) problem. We derive the dynamical form of this EUOT problem,
+which is a generalization of the Schr\"odinger bridges (SB) problem. Based on
+this, we derive the dual formulation and optimality conditions of the EUOT
+problem from the stochastic optimal control interpretation. By leveraging these
+properties, we propose a simulation-free algorithm to solve EUOT, called
+Simulation-free EUOT (SF-EUOT). While existing SB models require expensive
+simulation costs during training and evaluation, our model achieves
+simulation-free training and one-step generation by utilizing the reciprocal
+property. Our model demonstrates significantly improved scalability in
+generative modeling and image-to-image translation tasks compared to previous
+SB methods.
+
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ Deconstructing Recurrence, Attention, and Gating: Investigating the + transferability of Transformers and Gated Recurrent Neural Networks in + forecasting of dynamical systems + + +
+ Machine learning architectures, including transformers and recurrent neural
+networks (RNNs), have revolutionized forecasting in applications ranging from
+text processing to extreme weather. Notably, advanced network architectures,
+tuned for applications such as natural language processing, are transferable to
+other tasks such as spatiotemporal forecasting. However, there is a scarcity of
+ablation studies to illustrate the key components that enable this forecasting
+accuracy. The absence of such studies, although explainable due to the
+associated computational cost, intensifies the belief that these models ought
+to be considered black boxes. In this work, we decompose the key architectural
+components of the most powerful neural architectures, namely gating and
+recurrence in RNNs, and attention mechanisms in transformers. Then, we
+synthesize and build novel hybrid architectures from the standard blocks,
+performing ablation studies to identify which mechanisms are effective for each
+task. The importance of considering these components as hyper-parameters that
+can augment the standard architectures is exhibited on various forecasting
+datasets, from the spatiotemporal chaotic dynamics of the multiscale Lorenz 96
+system and the Kuramoto-Sivashinsky equation to standard real-world time-series
+benchmarks. A key finding is that neural gating and attention improve the
+performance of all standard RNNs in most tasks, while the addition of a notion
+of recurrence in transformers is detrimental. Furthermore, our study reveals
+that a novel, sparsely used architecture that integrates Recurrent Highway
+Networks with neural gating and attention mechanisms emerges as the
+best-performing architecture in high-dimensional spatiotemporal forecasting of
+dynamical systems.
+
+
+
+
+
+ + ☆ CAX: Cellular Automata Accelerated in JAX + + +
+ Cellular automata have become a cornerstone for investigating emergence and +self-organization across diverse scientific disciplines, spanning neuroscience, +artificial life, and theoretical physics. However, the absence of a +hardware-accelerated cellular automata library limits the exploration of new +research directions, hinders collaboration, and impedes reproducibility. In +this work, we introduce CAX (Cellular Automata Accelerated in JAX), a +high-performance and flexible open-source library designed to accelerate +cellular automata research. CAX offers cutting-edge performance and a modular +design through a user-friendly interface, and can support both discrete and +continuous cellular automata with any number of dimensions. We demonstrate +CAX's performance and flexibility through a wide range of benchmarks and +applications. From classic models like elementary cellular automata and +Conway's Game of Life to advanced applications such as growing neural cellular +automata and self-classifying MNIST digits, CAX speeds up simulations up to +2,000 times faster. Furthermore, we demonstrate CAX's potential to accelerate +research by presenting a collection of three novel cellular automata +experiments, each implemented in just a few lines of code thanks to the +library's modular architecture. Notably, we show that a simple one-dimensional +cellular automaton can outperform GPT-4 on the 1D-ARC challenge. + +
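+ For reference, a plain NumPy implementation of one of the classic models
+mentioned above (Conway's Game of Life on a toroidal grid) is sketched below;
+it does not use the CAX API and is only meant to show the kind of update rule
+such a hardware-accelerated library speeds up.
+
+import numpy as np
+
+def life_step(grid: np.ndarray) -> np.ndarray:
+    """One synchronous Game of Life update with periodic (toroidal) boundaries."""
+    neighbors = sum(
+        np.roll(np.roll(grid, dy, axis=0), dx, axis=1)
+        for dy in (-1, 0, 1) for dx in (-1, 0, 1)
+        if (dy, dx) != (0, 0)
+    )
+    return ((neighbors == 3) | ((grid == 1) & (neighbors == 2))).astype(np.uint8)
+
+rng = np.random.default_rng(0)
+grid = rng.integers(0, 2, size=(32, 32), dtype=np.uint8)
+for _ in range(10):
+    grid = life_step(grid)
+print(int(grid.sum()), "cells alive after 10 steps")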
+
+
+
+
+ + ☆ Immunogenicity Prediction with Dual Attention Enables Vaccine Target + Selection + + +
+ Immunogenicity prediction is a central topic in reverse vaccinology for +finding candidate vaccines that can trigger protective immune responses. +Existing approaches typically rely on highly compressed features and simple +model architectures, leading to limited prediction accuracy and poor +generalizability. To address these challenges, we introduce ProVaccine, a novel +deep learning solution with a dual attention mechanism that integrates +pre-trained latent vector representations of protein sequences and structures. +We also compile the most comprehensive immunogenicity dataset to date, +encompassing over 9,500 antigen sequences, structures, and immunogenicity +labels from bacteria, viruses, and tumors. Extensive experiments demonstrate +that ProVaccine outperforms existing methods across a wide range of evaluation +metrics. Furthermore, we establish a post-hoc validation protocol to assess the +practical significance of deep learning models in tackling vaccine design +challenges. Our work provides an effective tool for vaccine design and sets +valuable benchmarks for future research. + +
+
+ comment: 18 pages, 11 tables, 5 figures +
+
+
+
+
+ + ☆ Labor Migration Modeling through Large-scale Job Query Data + + +
+ Accurate and timely modeling of labor migration is crucial for various urban
+governance and commercial tasks, such as local policy-making and business site
+selection. However, existing studies on labor migration largely rely on limited
+survey data with statistical methods, which fail to deliver timely and
+fine-grained insights for time-varying regional trends. To this end, we propose
+a deep learning-based spatial-temporal labor migration analysis framework,
+DHG-SIL, by leveraging large-scale job query data. Specifically, we first
+acquire labor migration intention as a proxy of labor migration via job queries
+from one of the world's largest search engines. Then, a Discrepant Homophily
+co-preserved Graph Convolutional Network (DH-GCN) and an interpretable temporal
+module are proposed to capture cross-city and sequential labor migration
+dependencies, respectively. Besides, we introduce four interpretable variables
+to quantify city migration properties, which are co-optimized with city
+representations via tailor-designed contrastive losses. Extensive experiments
+on three real-world datasets demonstrate the superiority of our DHG-SIL.
+Notably, DHG-SIL has been deployed as a core component of a cooperative
+partner's intelligent human resource system, and the system supported a series
+of city talent attraction reports.
+
+
+
+
+
+ + ☆ Estimating Generalization Performance Along the Trajectory of Proximal + SGD in Robust Regression + + +
+ This paper studies the generalization performance of iterates obtained by +Gradient Descent (GD), Stochastic Gradient Descent (SGD) and their proximal +variants in high-dimensional robust regression problems. The number of features +is comparable to the sample size and errors may be heavy-tailed. We introduce +estimators that precisely track the generalization error of the iterates along +the trajectory of the iterative algorithm. These estimators are provably +consistent under suitable conditions. The results are illustrated through +several examples, including Huber regression, pseudo-Huber regression, and +their penalized variants with non-smooth regularizer. We provide explicit +generalization error estimates for iterates generated from GD and SGD, or from +proximal SGD in the presence of a non-smooth regularizer. The proposed risk +estimates serve as effective proxies for the actual generalization error, +allowing us to determine the optimal stopping iteration that minimizes the +generalization error. Extensive simulations confirm the effectiveness of the +proposed generalization error estimates. + +
+
+
+
+
+ + ☆ Inverse Entropic Optimal Transport Solves Semi-supervised Learning via + Data Likelihood Maximization + + +
+ Learning conditional distributions $\pi^*(\cdot|x)$ is a central problem in +machine learning, which is typically approached via supervised methods with +paired data $(x,y) \sim \pi^*$. However, acquiring paired data samples is often +challenging, especially in problems such as domain translation. This +necessitates the development of $\textit{semi-supervised}$ models that utilize +both limited paired data and additional unpaired i.i.d. samples $x \sim +\pi^*_x$ and $y \sim \pi^*_y$ from the marginal distributions. The usage of +such combined data is complex and often relies on heuristic approaches. To +tackle this issue, we propose a new learning paradigm that integrates both +paired and unpaired data $\textbf{seamlessly}$ through the data likelihood +maximization techniques. We demonstrate that our approach also connects +intriguingly with inverse entropic optimal transport (OT). This finding allows +us to apply recent advances in computational OT to establish a $\textbf{light}$ +learning algorithm to get $\pi^*(\cdot|x)$. Furthermore, we demonstrate through +empirical tests that our method effectively learns conditional distributions +using paired and unpaired data simultaneously. + +
+
+
+
+
+ + ☆ Online Learning Guided Quasi-Newton Methods with Global Non-Asymptotic + Convergence + + +
+ In this paper, we propose a quasi-Newton method for solving smooth and +monotone nonlinear equations, including unconstrained minimization and minimax +optimization as special cases. For the strongly monotone setting, we establish +two global convergence bounds: (i) a linear convergence rate that matches the +rate of the celebrated extragradient method, and (ii) an explicit global +superlinear convergence rate that provably surpasses the linear convergence +rate after at most ${O}(d)$ iterations, where $d$ is the problem's dimension. +In addition, for the case where the operator is only monotone, we prove a +global convergence rate of ${O}(\min\{{1}/{k},{\sqrt{d}}/{k^{1.25}}\})$ in +terms of the duality gap. This matches the rate of the extragradient method +when $k = {O}(d^2)$ and is faster when $k = \Omega(d^2)$. These results are the +first global convergence results to demonstrate a provable advantage of a +quasi-Newton method over the extragradient method, without querying the +Jacobian of the operator. Unlike classical quasi-Newton methods, we achieve +this by using the hybrid proximal extragradient framework and a novel online +learning approach for updating the Jacobian approximation matrices. +Specifically, guided by the convergence analysis, we formulate the Jacobian +approximation update as an online convex optimization problem over +non-symmetric matrices, relating the regret of the online problem to the +convergence rate of our method. To facilitate efficient implementation, we +further develop a tailored online learning algorithm based on an approximate +separation oracle, which preserves structures such as symmetry and sparsity in +the Jacobian matrices. + +
+
+ comment: 54 pages +
+
+
+
+
+ + ☆ Diss-l-ECT: Dissecting Graph Data with local Euler Characteristic + Transforms + + +
+ The Euler Characteristic Transform (ECT) is an efficiently-computable
+geometrical-topological invariant that characterizes the global shape of data.
+In this paper, we introduce the Local Euler Characteristic Transform
+($\ell$-ECT), a novel extension of the ECT particularly designed to enhance
+expressivity and interpretability in graph representation learning. Unlike
+traditional Graph Neural Networks (GNNs), which may lose critical local details
+through aggregation, the $\ell$-ECT provides a lossless representation of local
+neighborhoods. This approach addresses key limitations in GNNs by preserving
+nuanced local structures while maintaining global interpretability. Moreover,
+we construct a rotation-invariant metric based on $\ell$-ECTs for spatial
+alignment of data spaces. Our method exhibits superior performance to standard
+GNNs on a variety of node classification tasks, particularly in graphs with
+high heterophily.
+
+
+
+
+
+ + ☆ Achieving Fairness in Predictive Process Analytics via Adversarial + Learning (Extended Version) + + +
+ Predictive business process analytics has become important for organizations,
+offering real-time operational support for their processes. However, these
+algorithms often produce unfair predictions because they are based on biased
+variables (e.g., gender or nationality), namely variables embodying
+discrimination. This paper addresses the challenge of integrating a debiasing
+phase into predictive business process analytics to ensure that predictions are
+not influenced by biased variables. Our framework, which leverages adversarial
+debiasing, is evaluated on four case studies, showing a significant reduction
+in the contribution of biased variables to the predicted value. The proposed
+technique is also compared with the state of the art in fairness in process
+mining, illustrating that our framework achieves a higher level of fairness
+while retaining better prediction quality.
+
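+ A generic sketch of the kind of adversarial debiasing loop such a framework
+could build on is given below: a predictor fits the outcome while an adversary
+tries to recover the protected attribute from its output, and the predictor is
+penalized when the adversary succeeds. The toy data, network sizes, and the 0.5
+trade-off weight are assumptions, not the paper's setup.
+
+import torch
+import torch.nn as nn
+
+torch.manual_seed(0)
+
+# Toy event features, outcome labels, and a protected attribute (e.g. gender).
+n, d = 512, 8
+X = torch.randn(n, d)
+protected = (torch.rand(n) > 0.5).float()
+y = ((X[:, 0] + 0.5 * protected + 0.1 * torch.randn(n)) > 0).float()
+
+predictor = nn.Sequential(nn.Linear(d, 16), nn.ReLU(), nn.Linear(16, 1))
+adversary = nn.Sequential(nn.Linear(1, 8), nn.ReLU(), nn.Linear(8, 1))
+opt_p = torch.optim.Adam(predictor.parameters(), lr=1e-2)
+opt_a = torch.optim.Adam(adversary.parameters(), lr=1e-2)
+bce = nn.BCEWithLogitsLoss()
+
+for step in range(200):
+    logits = predictor(X).squeeze(-1)
+    # The adversary tries to recover the protected attribute from the prediction.
+    loss_a = bce(adversary(logits.detach().unsqueeze(-1)).squeeze(-1), protected)
+    opt_a.zero_grad(); loss_a.backward(); opt_a.step()
+    # The predictor fits the outcome while making the adversary fail.
+    loss_p = bce(logits, y) - 0.5 * bce(adversary(logits.unsqueeze(-1)).squeeze(-1), protected)
+    opt_p.zero_grad(); loss_p.backward(); opt_p.step()
+
+print("final prediction loss:", float(bce(predictor(X).squeeze(-1), y)))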
+
+ comment: 17 pages, 5 figures +
+
+
+
+
+ + ☆ LoGra-Med: Long Context Multi-Graph Alignment for Medical + Vision-Language Model + + +
+ State-of-the-art medical multi-modal large language models (med-MLLM), like +LLaVA-Med or BioMedGPT, leverage instruction-following data in pre-training. +However, those models primarily focus on scaling the model size and data volume +to boost performance while mainly relying on the autoregressive learning +objectives. Surprisingly, we reveal that such learning schemes might result in +a weak alignment between vision and language modalities, making these models +highly reliant on extensive pre-training datasets - a significant challenge in +medical domains due to the expensive and time-consuming nature of curating +high-quality instruction-following instances. We address this with LoGra-Med, a +new multi-graph alignment algorithm that enforces triplet correlations across +image modalities, conversation-based descriptions, and extended captions. This +helps the model capture contextual meaning, handle linguistic variability, and +build cross-modal associations between visuals and text. To scale our approach, +we designed an efficient end-to-end learning scheme using black-box gradient +estimation, enabling faster LLaMa 7B training. Our results show LoGra-Med +matches LLAVA-Med performance on 600K image-text pairs for Medical VQA and +significantly outperforms it when trained on 10% of the data. For example, on +VQA-RAD, we exceed LLAVA-Med by 20.13% and nearly match the 100% pre-training +score (72.52% vs. 72.64%). We also surpass SOTA methods like BiomedGPT on +visual chatbots and RadFM on zero-shot image classification with VQA, +highlighting the effectiveness of multi-graph alignment. + +
+
+ comment: First version +
+
+
+
+
+ + ☆ IndicSentEval: How Effectively do Multilingual Transformer Models encode + Linguistic Properties for Indic Languages? + + +
+ Transformer-based models have revolutionized the field of natural language
+processing. To understand why they perform so well and to assess their
+reliability, several studies have focused on questions such as: Which
+linguistic properties are encoded by these models, and to what extent? How
+robust are these models in encoding linguistic properties when faced with
+perturbations in the input text? However, these studies have mainly focused on
+BERT and the English language. In this paper, we investigate similar questions
+regarding encoding capability and robustness for 8 linguistic properties across
+13 different perturbations in 6 Indic languages, using 9 multilingual
+Transformer models (7 universal and 2 Indic-specific). To conduct this study,
+we introduce a novel multilingual benchmark dataset, IndicSentEval, containing
+approximately 47K sentences. Surprisingly, our probing analysis of surface,
+syntactic, and semantic properties reveals that while almost all multilingual
+models demonstrate consistent encoding performance for English, they show mixed
+results for Indic languages. As expected, Indic-specific multilingual models
+capture linguistic properties in Indic languages better than universal models.
+Intriguingly, universal models broadly exhibit better robustness compared to
+Indic-specific models, particularly under perturbations such as dropping both
+nouns and verbs, dropping only verbs, or keeping only nouns. Overall, this
+study provides valuable insights into probing and perturbation-specific
+strengths and weaknesses of popular multilingual Transformer-based models for
+different Indic languages. We make our code and dataset publicly available at
+https://tinyurl.com/IndicSentEval.
+
+
+ comment: 23 pages, 11 figures +
+
+
+
+
+ + ☆ Beyond Expected Returns: A Policy Gradient Algorithm for Cumulative + Prospect Theoretic Reinforcement Learning + + +
+ The widely used expected utility theory has been shown to be empirically
+inconsistent with human preferences in the psychology and behavioral economics
+literatures. Cumulative Prospect Theory (CPT) has been developed to fill in
+this gap and provide a better model for human-based decision-making supported
+by empirical evidence. It allows one to express a wide range of attitudes and
+perceptions towards risk, gains and losses. A few years ago, CPT was combined
+with Reinforcement Learning (RL) to formulate a CPT policy optimization problem
+where the goal of the agent is to search for a policy generating long-term
+returns which are aligned with their preferences. In this work, we revisit this
+policy optimization problem and provide new insights on optimal policies and
+their nature depending on the utility function under consideration. We further
+derive a novel policy gradient theorem for the CPT policy optimization
+objective generalizing the seminal corresponding result in standard RL. This
+result enables us to design a model-free policy gradient algorithm to solve the
+CPT-RL problem. We illustrate the performance of our algorithm in simple
+examples motivated by traffic control and electricity management applications.
+We also demonstrate that our policy gradient algorithm scales better to larger
+state spaces compared to the existing zeroth order algorithm for solving the
+same problem.
+
+
+ comment: 33 pages, 19 figures +
+
+
+
+
+ + ☆ Long-Sequence Recommendation Models Need Decoupled Embeddings + + +
+ Lifelong user behavior sequences, comprising up to tens of thousands of +history behaviors, are crucial for capturing user interests and predicting user +responses in modern recommendation systems. A two-stage paradigm is typically +adopted to handle these long sequences: a few relevant behaviors are first +searched from the original long sequences via an attention mechanism in the +first stage and then aggregated with the target item to construct a +discriminative representation for prediction in the second stage. In this work, +we identify and characterize, for the first time, a neglected deficiency in +existing long-sequence recommendation models: a single set of embeddings +struggles with learning both attention and representation, leading to +interference between these two processes. Initial attempts to address this +issue using linear projections -- a technique borrowed from language processing +-- proved ineffective, shedding light on the unique challenges of +recommendation models. To overcome this, we propose the Decoupled Attention and +Representation Embeddings (DARE) model, where two distinct embedding tables are +initialized and learned separately to fully decouple attention and +representation. Extensive experiments and analysis demonstrate that DARE +provides more accurate search of correlated behaviors and outperforms baselines +with AUC gains up to 0.9% on public datasets and notable online system +improvements. Furthermore, decoupling embedding spaces allows us to reduce the +attention embedding dimension and accelerate the search procedure by 50% +without significant performance impact, enabling more efficient, +high-performance online serving. + +
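+ The decoupling described above can be pictured as two embedding tables
+feeding a single target-attention module: a small table used only to score
+behaviors against the target, and a larger one used only for aggregation. A
+minimal sketch follows; the dimensions and the concatenated output are
+illustrative assumptions, not the DARE implementation.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class DecoupledTargetAttention(nn.Module):
+    """Target attention over a behavior sequence with two embedding tables:
+    one used only for attention (search), one only for representation."""
+    def __init__(self, n_items: int, d_attn: int = 16, d_repr: int = 64):
+        super().__init__()
+        self.attn_emb = nn.Embedding(n_items, d_attn)   # small: scoring only
+        self.repr_emb = nn.Embedding(n_items, d_repr)   # large: aggregation only
+
+    def forward(self, behavior_ids: torch.Tensor, target_id: torch.Tensor) -> torch.Tensor:
+        # behavior_ids: (batch, seq_len); target_id: (batch,)
+        scores = (self.attn_emb(behavior_ids) * self.attn_emb(target_id).unsqueeze(1)).sum(-1)
+        weights = F.softmax(scores, dim=-1)                       # (batch, seq_len)
+        pooled = (weights.unsqueeze(-1) * self.repr_emb(behavior_ids)).sum(1)
+        return torch.cat([pooled, self.repr_emb(target_id)], dim=-1)
+
+m = DecoupledTargetAttention(n_items=1000)
+out = m(torch.randint(0, 1000, (4, 128)), torch.randint(0, 1000, (4,)))
+print(out.shape)  # (4, 128)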
+
+ comment: First three authors contributed equally +
+
+
+
+
+ + ☆ Agents' Room: Narrative Generation through Multi-step Collaboration ICLR 2025 + + +
+ Writing compelling fiction is a multifaceted process combining elements such +as crafting a plot, developing interesting characters, and using evocative +language. While large language models (LLMs) show promise for story writing, +they currently rely heavily on intricate prompting, which limits their use. We +propose Agents' Room, a generation framework inspired by narrative theory, that +decomposes narrative writing into subtasks tackled by specialized agents. To +illustrate our method, we introduce Tell Me A Story, a high-quality dataset of +complex writing prompts and human-written stories, and a novel evaluation +framework designed specifically for assessing long narratives. We show that +Agents' Room generates stories that are preferred by expert evaluators over +those produced by baseline systems by leveraging collaboration and +specialization to decompose the complex story writing task into tractable +components. We provide extensive analysis with automated and human-based +metrics of the generated output. + +
+
+ comment: Under review as a conference paper at ICLR 2025 +
+
+
+
+
+ + ☆ Diffusion & Adversarial Schrödinger Bridges via Iterative Proportional + Markovian Fitting + + +
+ The Iterative Markovian Fitting (IMF) procedure based on iterative reciprocal +and Markovian projections has recently been proposed as a powerful method for +solving the Schr\"odinger Bridge problem. However, it has been observed that +for the practical implementation of this procedure, it is crucial to alternate +between fitting a forward and backward time diffusion at each iteration. Such +implementation is thought to be a practical heuristic, which is required to +stabilize training and obtain good results in applications such as unpaired +domain translation. In our work, we show that this heuristic closely connects +with the pioneer approaches for the Schr\"odinger Bridge based on the Iterative +Proportional Fitting (IPF) procedure. Namely, we find that the practical +implementation of IMF is, in fact, a combination of IMF and IPF procedures, and +we call this combination the Iterative Proportional Markovian Fitting (IPMF) +procedure. We show both theoretically and practically that this combined IPMF +procedure can converge under more general settings, thus, showing that the IPMF +procedure opens a door towards developing a unified framework for solving +Schr\"odinger Bridge problems. + +
+
+
+
+
+ + ☆ Three-in-One: Fast and Accurate Transducer for Hybrid-Autoregressive ASR + + +
+ We present \textbf{H}ybrid-\textbf{A}utoregressive \textbf{IN}ference +Tr\textbf{AN}sducers (HAINAN), a novel architecture for speech recognition that +extends the Token-and-Duration Transducer (TDT) model. Trained with randomly +masked predictor network outputs, HAINAN supports both autoregressive inference +with all network components and non-autoregressive inference without the +predictor. Additionally, we propose a novel semi-autoregressive inference +paradigm that first generates an initial hypothesis using non-autoregressive +inference, followed by refinement steps where each token prediction is +regenerated using parallelized autoregression on the initial hypothesis. +Experiments on multiple datasets across different languages demonstrate that +HAINAN achieves efficiency parity with CTC in non-autoregressive mode and with +TDT in autoregressive mode. In terms of accuracy, autoregressive HAINAN +outperforms TDT and RNN-T, while non-autoregressive HAINAN significantly +outperforms CTC. Semi-autoregressive inference further enhances the model's +accuracy with minimal computational overhead, and even outperforms TDT results +in some cases. These results highlight HAINAN's flexibility in balancing +accuracy and speed, positioning it as a strong candidate for real-world speech +recognition applications. + +
+
+
+
+
+ + ☆ Beyond Squared Error: Exploring Loss Design for Enhanced Training of + Generative Flow Networks + + +
+ Generative Flow Networks (GFlowNets) are a novel class of generative models +designed to sample from unnormalized distributions and have found applications +in various important tasks, attracting great research interest in their +training algorithms. In general, GFlowNets are trained by fitting the forward +flow to the backward flow on sampled training objects. Prior work focused on +the choice of training objects, parameterizations, sampling and resampling +strategies, and backward policies, aiming to enhance credit assignment, +exploration, or exploitation of the training process. However, the choice of +regression loss, which can highly influence the exploration and exploitation +behavior of the under-training policy, has been overlooked. Due to the lack of +theoretical understanding for choosing an appropriate regression loss, most +existing algorithms train the flow network by minimizing the squared error of +the forward and backward flows in log-space, i.e., using the quadratic +regression loss. In this work, we rigorously prove that distinct regression +losses correspond to specific divergence measures, enabling us to design and +analyze regression losses according to the desired properties of the +corresponding divergence measures. Specifically, we examine two key properties: +zero-forcing and zero-avoiding, where the former promotes exploitation and +higher rewards, and the latter encourages exploration and enhances diversity. +Based on our theoretical framework, we propose three novel regression losses, +namely, Shifted-Cosh, Linex(1/2), and Linex(1). We evaluate them across three +benchmarks: hyper-grid, bit-sequence generation, and molecule generation. Our +proposed losses are compatible with most existing training algorithms, and +significantly improve the performances of the algorithms concerning convergence +speed, sample diversity, and robustness. + +
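+ To make the loss design concrete, the snippet below contrasts the usual
+squared error in log-space with linex- and cosh-style alternatives applied to
+the log-flow mismatch delta; these are textbook forms given for illustration
+only, and the exact parameterizations used in the paper may differ.
+
+import numpy as np
+
+def squared_loss(delta):
+    """Quadratic loss on delta = log(forward flow) - log(backward flow)."""
+    return 0.5 * delta ** 2
+
+def linex_loss(delta, a=1.0):
+    """Asymmetric linex loss exp(a*delta) - a*delta - 1: one sign of the error
+    is penalized exponentially, the other only linearly."""
+    return np.exp(a * delta) - a * delta - 1.0
+
+def shifted_cosh_loss(delta):
+    """cosh(delta) - 1: symmetric but exponential in both tails."""
+    return np.cosh(delta) - 1.0
+
+deltas = np.linspace(-2.0, 2.0, 5)
+print(squared_loss(deltas))
+print(linex_loss(deltas, a=1.0))
+print(shifted_cosh_loss(deltas))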
+
+
+
+
+ + ☆ IC3M: In-Car Multimodal Multi-object Monitoring for Abnormal Status of + Both Driver and Passengers + + +
+ Recently, in-car monitoring has emerged as a promising technology for
+detecting early-stage abnormal status of the driver and providing timely alerts
+to prevent traffic accidents. Although training models with multimodal data
+enhances the reliability of abnormal status detection, the scarcity of labeled
+data and the imbalance of class distribution impede the extraction of critical
+abnormal state features, significantly deteriorating training performance.
+Furthermore, missing modalities due to environment and hardware limitations
+further exacerbate the challenge of abnormal status identification. More
+importantly, monitoring abnormal health conditions of passengers, particularly
+in elderly care, is of paramount importance but remains underexplored. To
+address these challenges, we introduce IC3M, an efficient
+camera-rotation-based multimodal framework for monitoring both driver and
+passengers in a car. IC3M comprises two key modules: an adaptive threshold
+pseudo-labeling strategy and a missing modality reconstruction module. The
+former customizes pseudo-labeling thresholds for different classes based on the
+class distribution, generating class-balanced pseudo labels to guide model
+training effectively, while the latter leverages cross-modality relationships
+learned from limited labels to accurately recover missing modalities by
+transferring distributions from available modalities. Extensive experimental
+results demonstrate that IC3M outperforms state-of-the-art benchmarks in
+accuracy, precision, and recall while exhibiting superior robustness under
+limited labeled data and severe missing modalities.
+
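+ The adaptive-threshold pseudo-labeling component can be illustrated with a
+generic class-balanced rule: rarer classes get lower confidence thresholds so
+that minority (abnormal-status) samples still receive pseudo labels. The
+threshold values and the linear interpolation below are assumptions for
+illustration, not IC3M's exact strategy.
+
+import numpy as np
+
+def class_adaptive_thresholds(class_counts, base=0.95, floor=0.6):
+    """Interpolate thresholds between `floor` (rarest class) and `base`
+    (most frequent class) according to the labeled class distribution."""
+    counts = np.asarray(class_counts, dtype=float)
+    freq = counts / counts.sum()
+    return floor + (base - floor) * (freq / freq.max())
+
+def select_pseudo_labels(probs, thresholds):
+    """Keep unlabeled samples whose top-class probability clears the
+    class-specific threshold; return their indices and pseudo labels."""
+    labels = probs.argmax(axis=1)
+    keep = probs.max(axis=1) >= thresholds[labels]
+    return np.where(keep)[0], labels[keep]
+
+thr = class_adaptive_thresholds([900, 80, 20])      # imbalanced labeled set
+probs = np.random.default_rng(0).dirichlet(np.ones(3), size=10)
+idx, pseudo = select_pseudo_labels(probs, thr)
+print(thr, idx, pseudo)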
+
+ comment: 16 pages, 17 figures +
+
+
+
+
+ + ☆ Generalization emerges from local optimization in a self-organized + learning network + + +
+ We design and analyze a new paradigm for building supervised learning +networks, driven only by local optimization rules without relying on a global +error function. Traditional neural networks with a fixed topology are made up +of identical nodes and derive their expressiveness from an appropriate +adjustment of connection weights. In contrast, our network stores new knowledge +in the nodes accurately and instantaneously, in the form of a lookup table. +Only then is some of this information structured and incorporated into the +network geometry. The training error is initially zero by construction and +remains so throughout the network topology transformation phase. The latter +involves a small number of local topological transformations, such as splitting +or merging of nodes and adding binary connections between them. The choice of +operations to be carried out is only driven by optimization of expressivity at +the local scale. What we are primarily looking for in a learning network is its +ability to generalize, i.e. its capacity to correctly answer questions for +which it has never learned the answers. We show on numerous examples of +classification tasks that the networks generated by our algorithm +systematically reach such a state of perfect generalization when the number of +learned examples becomes sufficiently large. We report on the dynamics of the +change of state and show that it is abrupt and has the distinctive +characteristics of a first order phase transition, a phenomenon already +observed for traditional learning networks and known as grokking. In addition +to proposing a non-potential approach for the construction of learning +networks, our algorithm makes it possible to rethink the grokking transition in +a new light, under which acquisition of training data and topological +structuring of data are completely decoupled phenomena. + +
+
+ comment: This paper is submitted to Phys. Rev. X. It's a physicist's study that focuses on a new paradigm for deep learning networks. We would have liked to choose other keywords for arXiv to reach a wider community, but don't have the rights to do so +
+
+
+
+
+ + ☆ Boosting Sample Efficiency and Generalization in Multi-agent + Reinforcement Learning via Equivariance NeurIPS 2024 + + +
+ Multi-Agent Reinforcement Learning (MARL) struggles with sample inefficiency and poor generalization [1]. These challenges are partially due to a lack of structure or inductive bias in the neural networks typically used in learning the policy. One such form of structure that is commonly observed in multi-agent scenarios is symmetry. The field of Geometric Deep Learning has developed Equivariant Graph Neural Networks (EGNN) that are equivariant (or symmetric) to rotations, translations, and reflections of nodes. Incorporating equivariance has been shown to improve learning efficiency and decrease error [2]. In this paper, we demonstrate that EGNNs improve the sample efficiency and generalization in MARL. However, we also show that a naive application of EGNNs to MARL results in poor early exploration due to a bias in the EGNN structure. To mitigate this bias, we present Exploration-enhanced Equivariant Graph Neural Networks (E2GN2). We compare E2GN2 to other common function approximators on the common MARL benchmarks MPE and SMACv2. E2GN2 demonstrates a significant improvement in sample efficiency, greater final reward convergence, and a 2x-5x gain over standard GNNs in our generalization tests. These results pave the way for more reliable and effective solutions in complex multi-agent systems. +
+
+ comment: accepted as a poster at NeurIPS 2024 +
+
+
+
+
+ + ☆ Deep Learning-Based Prediction of Suspension Dynamics Performance in + Multi-Axle Vehicles + + +
+ This paper presents a deep learning-based framework for predicting the +dynamic performance of suspension systems in multi-axle vehicles, emphasizing +the integration of machine learning with traditional vehicle dynamics modeling. +A Multi-Task Deep Belief Network Deep Neural Network (MTL-DBN-DNN) was +developed to capture the relationships between key vehicle parameters and +suspension performance metrics. The model was trained on data generated from +numerical simulations and demonstrated superior prediction accuracy compared to +conventional DNN models. A comprehensive sensitivity analysis was conducted to +assess the impact of various vehicle and suspension parameters on dynamic +suspension performance. Additionally, the Suspension Dynamic Performance Index +(SDPI) was introduced as a holistic measure to quantify overall suspension +performance, accounting for the combined effects of multiple parameters. The +findings highlight the effectiveness of multitask learning in improving +predictive models for complex vehicle systems. + +
+
+
+
+
+ + ☆ The Benefit of Being Bayesian in Online Conformal Prediction + + +
+ Based on the framework of Conformal Prediction (CP), we study the online +construction of valid confidence sets given a black-box machine learning model. +By converting the target confidence levels into quantile levels, the problem +can be reduced to predicting the quantiles (in hindsight) of a sequentially +revealed data sequence. Two very different approaches have been studied +previously. (i) Direct approach: Assuming the data sequence is iid or +exchangeable, one could maintain the empirical distribution of the observed +data as an algorithmic belief, and directly predict its quantiles. (ii) +Indirect approach: As statistical assumptions often do not hold in practice, a +recent trend is to consider the adversarial setting and apply first-order +online optimization to moving quantile losses (Gibbs & Cand\`es, 2021). It +requires knowing the target quantile level beforehand, and suffers from certain +validity issues on the obtained confidence sets, due to the associated loss +linearization. + This paper presents a novel Bayesian CP framework that combines their +strengths. Without any statistical assumption, it is able to both: (i) answer +multiple arbitrary confidence level queries online, with provably low regret; +and (ii) overcome the validity issues suffered by first-order optimization +baselines, due to being "data-centric" rather than "iterate-centric". + From a technical perspective, our key idea is to regularize the algorithmic +belief of the above direct approach by a Bayesian prior, which "robustifies" it +by simulating a non-linearized Follow the Regularized Leader (FTRL) algorithm +on the output. For statisticians, this can be regarded as an online adversarial +view of Bayesian inference. Importantly, the proposed belief update backbone is +shared by prediction heads targeting different confidence levels, bringing +practical benefits analogous to U-calibration (Kleinberg et al., 2023). + +
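For context on the two baselines the abstract contrasts, here is a minimal sketch (with an illustrative score stream and step size) of (i) the direct approach, which tracks the empirical quantile of past nonconformity scores, and (ii) the indirect approach, which runs a first-order update on the pinball loss in the spirit of Gibbs & Candès (2021). The paper's Bayesian FTRL method itself is not reproduced here.

```python
import numpy as np

rng = np.random.default_rng(0)
scores = rng.standard_normal(1000)   # stand-in for sequentially revealed nonconformity scores
alpha = 0.1                          # target miscoverage, i.e. track the 0.9-quantile
eta = 0.05                           # step size for the indirect (first-order) approach

q_indirect, hist = 0.0, []
for s in scores:
    # direct: empirical (1 - alpha)-quantile of everything seen so far
    hist.append(s)
    q_direct = np.quantile(hist, 1 - alpha)
    # indirect: subgradient step on the pinball loss at level 1 - alpha
    covered = float(s <= q_indirect)
    q_indirect += eta * ((1 - alpha) - covered)   # grow when uncovered, shrink when covered

print("final direct quantile  :", round(q_direct, 3))
print("final indirect quantile:", round(q_indirect, 3))
print("empirical 0.9-quantile :", round(float(np.quantile(scores, 0.9)), 3))
```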
+
+
+
+
+ + ☆ Obtaining Lower Query Complexities through Lightweight Zeroth-Order + Proximal Gradient Algorithms + + +
+ Zeroth-order (ZO) optimization is one key technique for machine learning +problems where gradient calculation is expensive or impossible. Several +variance reduced ZO proximal algorithms have been proposed to speed up ZO +optimization for non-smooth problems, and all of them opted for the coordinated +ZO estimator against the random ZO estimator when approximating the true +gradient, since the former is more accurate. While the random ZO estimator +introduces bigger error and makes convergence analysis more challenging +compared to coordinated ZO estimator, it requires only $\mathcal{O}(1)$ +computation, which is significantly less than $\mathcal{O}(d)$ computation of +the coordinated ZO estimator, with $d$ being dimension of the problem space. To +take advantage of the computationally efficient nature of the random ZO +estimator, we first propose a ZO objective decrease (ZOOD) property which can +incorporate two different types of errors in the upper bound of convergence +rate. Next, we propose two generic reduction frameworks for ZO optimization +which can automatically derive the convergence results for convex and +non-convex problems respectively, as long as the convergence rate for the inner +solver satisfies the ZOOD property. With the application of two reduction +frameworks on our proposed ZOR-ProxSVRG and ZOR-ProxSAGA, two variance reduced +ZO proximal algorithms with fully random ZO estimators, we improve the +state-of-the-art function query complexities from +$\mathcal{O}\left(\min\{\frac{dn^{1/2}}{\epsilon^2}, +\frac{d}{\epsilon^3}\}\right)$ to +$\tilde{\mathcal{O}}\left(\frac{n+d}{\epsilon^2}\right)$ under $d > +n^{\frac{1}{2}}$ for non-convex problems, and from +$\mathcal{O}\left(\frac{d}{\epsilon^2}\right)$ to +$\tilde{\mathcal{O}}\left(n\log\frac{1}{\epsilon}+\frac{d}{\epsilon}\right)$ +for convex problems. + +
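To make the query-count contrast concrete, the sketch below implements the two gradient estimators on a toy objective: the coordinated estimator does a finite difference per coordinate (O(d) function queries), while the random estimator perturbs along a single Gaussian direction (O(1) queries) at the price of much higher variance. The smoothing parameter and objective are illustrative choices.

```python
import numpy as np

def f(x):
    return 0.5 * np.sum(x ** 2)  # toy smooth objective; the true gradient is x

def coordinated_zo_grad(f, x, mu=1e-4):
    d = x.size
    g = np.zeros(d)
    for i in range(d):
        e = np.zeros(d); e[i] = 1.0
        g[i] = (f(x + mu * e) - f(x - mu * e)) / (2 * mu)  # one coordinate at a time
    return g                                               # O(d) function queries

def random_zo_grad(f, x, mu=1e-4, rng=np.random.default_rng(0)):
    u = rng.standard_normal(x.size)                        # single random direction
    return (f(x + mu * u) - f(x)) / mu * u                 # O(1) function queries, higher variance

x = np.arange(1.0, 6.0)
print("true gradient  :", x)
print("coordinated ZO :", np.round(coordinated_zo_grad(f, x), 3))
print("random ZO      :", np.round(random_zo_grad(f, x), 3))
```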
+
+ comment: Neural Computation 36 (5), 897-935 +
+
+
+
+
+ + ☆ ColaCare: Enhancing Electronic Health Record Modeling through Large + Language Model-Driven Multi-Agent Collaboration + + +
+ We introduce ColaCare, a framework that enhances Electronic Health Record +(EHR) modeling through multi-agent collaboration driven by Large Language +Models (LLMs). Our approach seamlessly integrates domain-specific expert models +with LLMs to bridge the gap between structured EHR data and text-based +reasoning. Inspired by clinical consultations, ColaCare employs two types of +agents: DoctorAgent and MetaAgent, which collaboratively analyze patient data. +Expert models process and generate predictions from numerical EHR data, while +LLM agents produce reasoning references and decision-making reports within the +collaborative consultation framework. We additionally incorporate the Merck +Manual of Diagnosis and Therapy (MSD) medical guideline within a +retrieval-augmented generation (RAG) module for authoritative evidence support. +Extensive experiments conducted on four distinct EHR datasets demonstrate +ColaCare's superior performance in mortality prediction tasks, underscoring its +potential to revolutionize clinical decision support systems and advance +personalized precision medicine. The code, complete prompt templates, more case +studies, etc. are publicly available at the anonymous link: +https://colacare.netlify.app. + +
+
+
+
+
+ + ☆ Local Flow Matching Generative Models + + +
+ Flow Matching (FM) is a simulation-free method for learning a continuous and +invertible flow to interpolate between two distributions, and in particular to +generate data from noise in generative modeling. In this paper, we introduce +Local Flow Matching (LFM), which learns a sequence of FM sub-models and each +matches a diffusion process up to the time of the step size in the +data-to-noise direction. In each step, the two distributions to be interpolated +by the sub-model are closer to each other than data vs. noise, and this enables +the use of smaller models with faster training. The stepwise structure of LFM +is natural to be distilled and different distillation techniques can be adopted +to speed up generation. Theoretically, we prove a generation guarantee of the +proposed flow model in terms of the $\chi^2$-divergence between the generated +and true data distributions. In experiments, we demonstrate the improved +training efficiency and competitive generative performance of LFM compared to +FM on the unconditional generation of tabular data and image datasets, and also +on the conditional generation of robotic manipulation policies. + +
+
+
+
+
+ + ☆ Diffusion Models are Evolutionary Algorithms + + +
+ In a convergence of machine learning and biology, we reveal that diffusion +models are evolutionary algorithms. By considering evolution as a denoising +process and reversed evolution as diffusion, we mathematically demonstrate that +diffusion models inherently perform evolutionary algorithms, naturally +encompassing selection, mutation, and reproductive isolation. Building on this +equivalence, we propose the Diffusion Evolution method: an evolutionary +algorithm utilizing iterative denoising -- as originally introduced in the +context of diffusion models -- to heuristically refine solutions in parameter +spaces. Unlike traditional approaches, Diffusion Evolution efficiently +identifies multiple optimal solutions and outperforms prominent mainstream +evolutionary algorithms. Furthermore, leveraging advanced concepts from +diffusion models, namely latent space diffusion and accelerated sampling, we +introduce Latent Space Diffusion Evolution, which finds solutions for +evolutionary tasks in high-dimensional complex parameter space while +significantly reducing computational steps. This parallel between diffusion and +evolution not only bridges two different fields but also opens new avenues for +mutual enhancement, raising questions about open-ended evolution and +potentially utilizing non-Gaussian or discrete diffusion models in the context +of Diffusion Evolution. + +
+
+
+
+
+ + ☆ Fair Decentralized Learning + + +
+ Decentralized learning (DL) is an emerging approach that enables nodes to collaboratively train a machine learning model without sharing raw data. In many application domains, such as healthcare, this approach faces challenges due to the high level of heterogeneity in the training data's feature space. Such feature heterogeneity lowers model utility and negatively impacts fairness, particularly for nodes with under-represented training data. In this paper, we introduce \textsc{Facade}, a clustering-based DL algorithm specifically designed for fair model training when the training data exhibits several distinct features. The challenge of \textsc{Facade} is to assign nodes to clusters, one for each feature, based on the similarity in the features of their local data, without requiring individual nodes to know a priori which cluster they belong to. \textsc{Facade} (1) dynamically assigns nodes to their appropriate clusters over time, and (2) enables nodes to collaboratively train a specialized model for each cluster in a fully decentralized manner. We theoretically prove the convergence of \textsc{Facade}, implement our algorithm, and compare it against three state-of-the-art baselines. Our experimental results on three datasets demonstrate the superiority of our approach in terms of model accuracy and fairness compared to all three competitors. Compared to the best-performing baseline, \textsc{Facade} on the CIFAR-10 dataset also reduces communication costs by 32.3\% to reach a target accuracy when cluster sizes are imbalanced. +
+
+
+
+
+ + ☆ Semantic-Guided RL for Interpretable Feature Engineering + + +
+ The quality of Machine Learning (ML) models strongly depends on the input +data, as such generating high-quality features is often required to improve the +predictive accuracy. This process is referred to as Feature Engineering (FE). +However, since manual feature engineering is time-consuming and requires +case-by-case domain knowledge, Automated Feature Engineering (AutoFE) is +crucial. A major challenge that remains is to generate interpretable features. +To tackle this problem, we introduce SMART, a hybrid approach that uses +semantic technologies to guide the generation of interpretable features through +a two-step process: Exploitation and Exploration. The former uses Description +Logics (DL) to reason on the semantics embedded in Knowledge Graphs (KG) to +infer domain-specific features, while the latter exploits the knowledge graph +to conduct a guided exploration of the search space through Deep Reinforcement +Learning (DRL). Our experiments on public datasets demonstrate that SMART +significantly improves prediction accuracy while ensuring a high level of +interpretability. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2406.00544 +
+
+
+
+
+ + ☆ Learning Emergence of Interaction Patterns across Independent RL Agents + in Multi-Agent Environments + + +
+ Many real-world problems, such as controlling swarms of drones and urban +traffic, naturally lend themselves to modeling as multi-agent reinforcement +learning (RL) problems. However, existing multi-agent RL methods often suffer +from scalability challenges, primarily due to the introduction of communication +among agents. Consequently, a key challenge lies in adapting the success of +deep learning in single-agent RL to the multi-agent setting. In response to +this challenge, we propose an approach that fundamentally reimagines +multi-agent environments. Unlike conventional methods that model each agent +individually with separate networks, our approach, the Bottom Up Network (BUN), +adopts a unique perspective. BUN treats the collective of multi-agents as a +unified entity while employing a specialized weight initialization strategy +that promotes independent learning. Furthermore, we dynamically establish +connections among agents using gradient information, enabling coordination when +necessary while maintaining these connections as limited and sparse to +effectively manage the computational budget. Our extensive empirical +evaluations across a variety of cooperative multi-agent scenarios, including +tasks such as cooperative navigation and traffic control, consistently +demonstrate BUN's superiority over baseline methods with substantially reduced +computational costs. + +
+
+ comment: 13 pages, 24 figures +
+
+
+
+
+ + ☆ Minimax Group Fairness in Strategic Classification + + +
+ In strategic classification, agents manipulate their features, at a cost, to +receive a positive classification outcome from the learner's classifier. The +goal of the learner in such settings is to learn a classifier that is robust to +strategic manipulations. While the majority of works in this domain consider +accuracy as the primary objective of the learner, in this work, we consider +learning objectives that have group fairness guarantees in addition to accuracy +guarantees. We work with the minimax group fairness notion that asks for +minimizing the maximal group error rate across population groups. + We formalize a fairness-aware Stackelberg game between a population of agents +consisting of several groups, with each group having its own cost function, and +a learner in the agnostic PAC setting in which the learner is working with a +hypothesis class H. When the cost functions of the agents are separable, we +show the existence of an efficient algorithm that finds an approximately +optimal deterministic classifier for the learner when the number of groups is +small. This algorithm remains efficient, both statistically and +computationally, even when H is the set of all classifiers. We then consider +cost functions that are not necessarily separable and show the existence of +oracle-efficient algorithms that find approximately optimal randomized +classifiers for the learner when H has finite strategic VC dimension. These +algorithms work under the assumption that the learner is fully transparent: the +learner draws a classifier from its distribution (randomized classifier) before +the agents respond by manipulating their feature vectors. We highlight the +effectiveness of such transparency in developing oracle-efficient algorithms. +We conclude with verifying the efficacy of our algorithms on real data by +conducting an experimental analysis. + +
+
+
+
+
+ + ☆ SAFLEX: Self-Adaptive Augmentation via Feature Label Extrapolation ICLR 2024 + + +
+ Data augmentation, a cornerstone technique in deep learning, is crucial in +enhancing model performance, especially with scarce labeled data. While +traditional techniques are effective, their reliance on hand-crafted methods +limits their applicability across diverse data types and tasks. Although modern +learnable augmentation methods offer increased adaptability, they are +computationally expensive and challenging to incorporate within prevalent +augmentation workflows. In this work, we present a novel, efficient method for +data augmentation, effectively bridging the gap between existing augmentation +strategies and emerging datasets and learning tasks. We introduce SAFLEX +(Self-Adaptive Augmentation via Feature Label EXtrapolation), which learns the +sample weights and soft labels of augmented samples provided by any given +upstream augmentation pipeline, using a specifically designed efficient bilevel +optimization algorithm. Remarkably, SAFLEX effectively reduces the noise and +label errors of the upstream augmentation pipeline with a marginal +computational cost. As a versatile module, SAFLEX excels across diverse +datasets, including natural and medical images and tabular data, showcasing its +prowess in few-shot learning and out-of-distribution generalization. SAFLEX +seamlessly integrates with common augmentation strategies like RandAug, CutMix, +and those from large pre-trained generative models like stable diffusion and is +also compatible with frameworks such as CLIP's fine-tuning. Our findings +highlight the potential to adapt existing augmentation pipelines for new data +types and tasks, signaling a move towards more adaptable and resilient training +frameworks. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Cut the Crap: An Economical Communication Pipeline for LLM-based + Multi-Agent Systems + + +
+ Recent advancements in large language model (LLM)-powered agents have shown that collective intelligence can significantly outperform individual capabilities, largely attributed to the meticulously designed inter-agent communication topologies. Though impressive in performance, existing multi-agent pipelines inherently introduce substantial token overhead, as well as increased economic costs, which pose challenges for their large-scale deployments. In response to this challenge, we propose an economical, simple, and robust multi-agent communication framework, termed $\texttt{AgentPrune}$, which can seamlessly integrate into mainstream multi-agent systems and prunes redundant or even malicious communication messages. Technically, $\texttt{AgentPrune}$ is the first to identify and formally define the \textit{communication redundancy} issue present in current LLM-based multi-agent pipelines, and efficiently performs one-shot pruning on the spatial-temporal message-passing graph, yielding a token-economic and high-performing communication topology. Extensive experiments across six benchmarks demonstrate that $\texttt{AgentPrune}$ \textbf{(I)} achieves results comparable to state-of-the-art topologies at merely $\$5.6$ cost compared to their $\$43.7$, \textbf{(II)} integrates seamlessly into existing multi-agent frameworks with $28.1\%\sim72.8\%\downarrow$ token reduction, and \textbf{(III)} successfully defends against two types of agent-based adversarial attacks with a $3.5\%\sim10.8\%\uparrow$ performance boost. +
+
+
+
+
+ + ☆ Dual Active Learning for Reinforcement Learning from Human Feedback + + +
+ Aligning large language models (LLMs) with human preferences is critical to +recent advances in generative artificial intelligence. Reinforcement learning +from human feedback (RLHF) is widely applied to achieve this objective. A key +step in RLHF is to learn the reward function from human feedback. However, +human feedback is costly and time-consuming, making it essential to collect +high-quality conversation data for human teachers to label. Additionally, +different human teachers have different levels of expertise. It is thus +critical to query the most appropriate teacher for their opinions. In this +paper, we use offline reinforcement learning (RL) to formulate the alignment +problem. Motivated by the idea of $D$-optimal design, we first propose a dual +active reward learning algorithm for the simultaneous selection of +conversations and teachers. Next, we apply pessimistic RL to solve the +alignment problem, based on the learned reward estimator. Theoretically, we +show that the reward estimator obtained through our proposed adaptive selection +strategy achieves minimal generalized variance asymptotically, and prove that +the sub-optimality of our pessimistic policy scales as $O(1/\sqrt{T})$ with a +given sample budget $T$. Through simulations and experiments on LLMs, we +demonstrate the effectiveness of our algorithm and its superiority over +state-of-the-arts. + +
+
+
+
+
+ + ☆ Dynamic Gradient Alignment for Online Data Mixing + + +
+ The composition of training data mixtures is critical for effectively +training large language models (LLMs), as it directly impacts their performance +on downstream tasks. Our goal is to identify an optimal data mixture to +specialize an LLM for a specific task with access to only a few examples. +Traditional approaches to this problem include ad-hoc reweighting methods, +importance sampling, and gradient alignment techniques. This paper focuses on +gradient alignment and introduces Dynamic Gradient Alignment (DGA), a scalable +online gradient alignment algorithm. DGA dynamically estimates the pre-training +data mixture on which the models' gradients align as well as possible with +those of the model on the specific task. DGA is the first gradient alignment +approach that incurs minimal overhead compared to standard pre-training and +outputs a competitive model, eliminating the need for retraining the model. +Experimentally, we demonstrate significant improvements over importance +sampling in two key scenarios: (i) when the pre-training set is small and +importance sampling overfits due to limited data; and (ii) when there is +insufficient specialized data, trapping importance sampling on narrow pockets +of data. Our findings underscore the effectiveness of gradient alignment +methods in optimizing training data mixtures, particularly in data-constrained +environments, and offer a practical solution for enhancing LLM performance on +specific tasks with limited data availability. + +
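As a schematic of gradient-alignment-based reweighting (not the paper's exact DGA updates), the sketch below scores each pre-training domain by the inner product between its average gradient and the gradient on the few task examples, then turns the scores into mixture weights with a softmax. The toy model, domains, and temperature are assumptions.

```python
import torch

torch.manual_seed(0)
model = torch.nn.Linear(16, 1)
loss_fn = torch.nn.MSELoss()

def avg_grad(batch_x, batch_y):
    # flattened gradient of the loss on one batch
    model.zero_grad()
    loss_fn(model(batch_x), batch_y).backward()
    return torch.cat([p.grad.flatten() for p in model.parameters()])

domains = {name: (torch.randn(64, 16), torch.randn(64, 1)) for name in ["web", "code", "books"]}
task_x, task_y = torch.randn(8, 16), torch.randn(8, 1)   # the few examples defining the target task

g_task = avg_grad(task_x, task_y)
alignments = torch.stack([torch.dot(avg_grad(x, y), g_task) for x, y in domains.values()])
weights = torch.softmax(alignments / alignments.abs().max(), dim=0)  # temperature is a guess

for name, w in zip(domains, weights):
    print(f"{name}: mixing weight {w.item():.3f}")
```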
+
+
+
+
+ + ☆ Efficient learning of differential network in multi-source + non-paranormal graphical models + + +
+ This paper addresses learning of sparse structural changes, or the differential network, between two classes of non-paranormal graphical models. We assume a multi-source and heterogeneous dataset is available for each class, where the covariance matrices are identical for all non-paranormal graphical models. The differential network, which is encoded by the difference of precision matrices, can then be decoded by optimizing a lasso-penalized D-trace loss function. To this aim, an efficient approach is proposed that outputs the exact solution path, outperforming previous methods that only sample the solution path at pre-selected regularization parameters. Notably, our proposed method has low computational complexity, especially when the differential network is sparse. Our simulations on synthetic data demonstrate a superior performance for our strategy in terms of speed and accuracy compared to an existing method. Moreover, our strategy of combining datasets from multiple sources is shown to be very effective in inferring the differential network in real-world problems. This is backed by our experimental results on drug resistance in tumor cancers. In the latter case, our strategy outputs important genes for drug resistance which have already been confirmed by various independent studies. +
+
+
+
+
+ + ☆ Stochastic variance-reduced Gaussian variational inference on the + Bures-Wasserstein manifold + + +
+ Optimization in the Bures-Wasserstein space has been gaining popularity in +the machine learning community since it draws connections between variational +inference and Wasserstein gradient flows. The variational inference objective +function of Kullback-Leibler divergence can be written as the sum of the +negative entropy and the potential energy, making forward-backward Euler the +method of choice. Notably, the backward step admits a closed-form solution in +this case, facilitating the practicality of the scheme. However, the forward +step is no longer exact since the Bures-Wasserstein gradient of the potential +energy involves "intractable" expectations. Recent approaches propose using the +Monte Carlo method -- in practice a single-sample estimator -- to approximate +these terms, resulting in high variance and poor performance. We propose a +novel variance-reduced estimator based on the principle of control variates. We +theoretically show that this estimator has a smaller variance than the +Monte-Carlo estimator in scenarios of interest. We also prove that variance +reduction helps improve the optimization bounds of the current analysis. We +demonstrate that the proposed estimator gains order-of-magnitude improvements +over the previous Bures-Wasserstein methods. + +
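The core trick, independent of the Bures-Wasserstein specifics, is the classical control-variate construction: subtract from the single-sample estimator a correlated quantity whose expectation is known, scaled by a (near-)optimal coefficient. The toy integrand and Gaussian below are illustrative; the paper builds its control variate specifically for the gradient of the potential energy.

```python
import numpy as np

rng = np.random.default_rng(0)
mu, sigma = 1.0, 2.0
true_mean = mu**3 + 3 * mu * sigma**2            # E[X^3] for X ~ N(mu, sigma^2)

f = lambda x: x**3                               # toy integrand whose mean we want
h = lambda x: x                                  # control variate with known mean E[h(X)] = mu

x = rng.normal(mu, sigma, size=100_000)
c = np.cov(f(x), h(x))[0, 1] / np.var(h(x))      # near-optimal coefficient estimated from samples

plain = f(x)                                     # plain single-sample Monte Carlo estimates
cv = f(x) - c * (h(x) - mu)                      # variance-reduced estimates
print("true mean      :", true_mean)
print("MC  mean / std :", plain.mean().round(3), plain.std().round(3))
print("CV  mean / std :", cv.mean().round(3), cv.std().round(3))
```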
+
+
+
+
+ + ☆ Encryption-Friendly LLM Architecture + + +
+ Large language models (LLMs) offer personalized responses based on user +interactions, but this use case raises serious privacy concerns. Homomorphic +encryption (HE) is a cryptographic protocol supporting arithmetic computations +in encrypted states and provides a potential solution for privacy-preserving +machine learning (PPML). However, the computational intensity of transformers +poses challenges for applying HE to LLMs. In this work, we propose a modified +HE-friendly transformer architecture with an emphasis on inference following +personalized (private) fine-tuning. Utilizing LoRA fine-tuning and Gaussian +kernels, we achieve significant computational speedups -- 6.94x for fine-tuning +and 2.3x for inference -- while maintaining performance comparable to plaintext +models. Our findings provide a viable proof of concept for offering +privacy-preserving LLM services in areas where data protection is crucial. + +
+
+ comment: 27 pages +
+
+
+
+
+ + ☆ Cross-Embodiment Dexterous Grasping with Reinforcement Learning + + +
+ Dexterous hands exhibit significant potential for complex real-world grasping +tasks. While recent studies have primarily focused on learning policies for +specific robotic hands, the development of a universal policy that controls +diverse dexterous hands remains largely unexplored. In this work, we study the +learning of cross-embodiment dexterous grasping policies using reinforcement +learning (RL). Inspired by the capability of human hands to control various +dexterous hands through teleoperation, we propose a universal action space +based on the human hand's eigengrasps. The policy outputs eigengrasp actions +that are then converted into specific joint actions for each robot hand through +a retargeting mapping. We simplify the robot hand's proprioception to include +only the positions of fingertips and the palm, offering a unified observation +space across different robot hands. Our approach demonstrates an 80% success +rate in grasping objects from the YCB dataset across four distinct embodiments +using a single vision-based policy. Additionally, our policy exhibits zero-shot +generalization to two previously unseen embodiments and significant improvement +in efficient finetuning. For further details and videos, visit our project page +https://sites.google.com/view/crossdex. + +
+
+
+
+
+ + ☆ Temporal Predictive Coding for Gradient Compression in Distributed + Learning + + +
+ This paper proposes a prediction-based gradient compression method for distributed learning with event-triggered communication. Our goal is to reduce the amount of information transmitted from the distributed agents to the parameter server by exploiting temporal correlation in the local gradients. We use a linear predictor that \textit{combines past gradients to form a prediction of the current gradient}, with coefficients that are optimized by solving a least-squares problem. In each iteration, every agent transmits the predictor coefficients to the server such that the predicted local gradient can be computed. The difference between the true local gradient and the predicted one, termed the \textit{prediction residual}, is only transmitted when its norm is above some threshold. When this additional communication step is omitted, the server uses the prediction as the estimated gradient. This proposed design shows notable performance gains compared to existing methods in the literature, achieving convergence with reduced communication costs. +
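A minimal sketch of the mechanism just described, with an illustrative window size and threshold: each round the agent fits the predictor coefficients by least squares over its last few gradients, sends the coefficients, and additionally sends the prediction residual only when its norm exceeds the threshold.

```python
import numpy as np

def predictor_coeffs(past, current):
    # past: (m, d) matrix of the m most recent gradients; solve min_a ||past^T a - current||
    a, *_ = np.linalg.lstsq(past.T, current, rcond=None)
    return a

rng = np.random.default_rng(0)
d, m, tau = 32, 3, 0.5
grads = [rng.standard_normal(d)]
for _ in range(19):
    grads.append(0.9 * grads[-1] + 0.1 * rng.standard_normal(d))  # temporally correlated stream

sent_residual, coeffs_only = 0, 0
for t in range(m, len(grads)):
    past = np.stack(grads[t - m:t])            # (m, d) window of past gradients
    a = predictor_coeffs(past, grads[t])       # m coefficients sent to the server
    residual = grads[t] - past.T @ a
    if np.linalg.norm(residual) > tau:
        sent_residual += 1                     # event triggered: transmit the residual too
    else:
        coeffs_only += 1                       # server falls back on the prediction alone
print("rounds with residual:", sent_residual, "| coefficient-only rounds:", coeffs_only)
```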
+
+ comment: 8 pages, 3 figures, presented at the 60th Allerton conference on + Communication, Control, and Computing +
+
+
+
+
+ + ☆ Learning Diverse Bimanual Dexterous Manipulation Skills from Human + Demonstrations + + +
+ Bimanual dexterous manipulation is a critical yet underexplored area in +robotics. Its high-dimensional action space and inherent task complexity +present significant challenges for policy learning, and the limited task +diversity in existing benchmarks hinders general-purpose skill development. +Existing approaches largely depend on reinforcement learning, often constrained +by intricately designed reward functions tailored to a narrow set of tasks. In +this work, we present a novel approach for efficiently learning diverse +bimanual dexterous skills from abundant human demonstrations. Specifically, we +introduce BiDexHD, a framework that unifies task construction from existing +bimanual datasets and employs teacher-student policy learning to address all +tasks. The teacher learns state-based policies using a general two-stage reward +function across tasks with shared behaviors, while the student distills the +learned multi-task policies into a vision-based policy. With BiDexHD, scalable +learning of numerous bimanual dexterous skills from auto-constructed tasks +becomes feasible, offering promising advances toward universal bimanual +dexterous manipulation. Our empirical evaluation on the TACO dataset, spanning +141 tasks across six categories, demonstrates a task fulfillment rate of 74.59% +on trained tasks and 51.07% on unseen tasks, showcasing the effectiveness and +competitive zero-shot generalization capabilities of BiDexHD. For videos and +more information, visit our project page https://sites.google.com/view/bidexhd. + +
+
+
+
+
+ + ☆ Online Convex Optimization with a Separation Oracle + + +
+ In this paper, we introduce a new projection-free algorithm for Online Convex +Optimization (OCO) with a state-of-the-art regret guarantee among +separation-based algorithms. Existing projection-free methods based on the +classical Frank-Wolfe algorithm achieve a suboptimal regret bound of +$O(T^{3/4})$, while more recent separation-based approaches guarantee a regret +bound of $O(\kappa \sqrt{T})$, where $\kappa$ denotes the asphericity of the +feasible set, defined as the ratio of the radii of the containing and contained +balls. However, for ill-conditioned sets, $\kappa$ can be arbitrarily large, +potentially leading to poor performance. Our algorithm achieves a regret bound +of $\tilde{O}(\sqrt{dT} + \kappa d)$, while requiring only $\tilde{O}(1)$ calls +to a separation oracle per round. Crucially, the main term in the bound, +$\tilde{O}(\sqrt{d T})$, is independent of $\kappa$, addressing the limitations +of previous methods. Additionally, as a by-product of our analysis, we recover +the $O(\kappa \sqrt{T})$ regret bound of existing OCO algorithms with a more +straightforward analysis and improve the regret bound for projection-free +online exp-concave optimization. Finally, for constrained stochastic convex +optimization, we achieve a state-of-the-art convergence rate of +$\tilde{O}(\sigma/\sqrt{T} + \kappa d/T)$, where $\sigma$ represents the noise +in the stochastic gradients, while requiring only $\tilde{O}(1)$ calls to a +separation oracle per iteration. + +
+
+
+
+
+ + ☆ Efficient Residual Learning with Mixture-of-Experts for Universal + Dexterous Grasping + + +
+ Universal dexterous grasping across diverse objects presents a fundamental +yet formidable challenge in robot learning. Existing approaches using +reinforcement learning (RL) to develop policies on extensive object datasets +face critical limitations, including complex curriculum design for multi-task +learning and limited generalization to unseen objects. To overcome these +challenges, we introduce ResDex, a novel approach that integrates residual +policy learning with a mixture-of-experts (MoE) framework. ResDex is +distinguished by its use of geometry-unaware base policies that are efficiently +acquired on individual objects and capable of generalizing across a wide range +of unseen objects. Our MoE framework incorporates several base policies to +facilitate diverse grasping styles suitable for various objects. By learning +residual actions alongside weights that combine these base policies, ResDex +enables efficient multi-task RL for universal dexterous grasping. ResDex +achieves state-of-the-art performance on the DexGraspNet dataset comprising +3,200 objects with an 88.8% success rate. It exhibits no generalization gap +with unseen objects and demonstrates superior training efficiency, mastering +all tasks within only 12 hours on a single GPU. + +
+
+
+
+
+ + ☆ Meta-Models: An Architecture for Decoding LLM Behaviors Through + Interpreted Embeddings and Natural Language + + +
+ As Large Language Models (LLMs) become increasingly integrated into our daily lives, the potential harms from deceptive behavior underscore the need for faithfully interpreting their decision-making. While traditional probing methods have shown some effectiveness, they remain best suited for narrowly scoped tasks, while more comprehensive explanations are still necessary. To this end, we investigate meta-models: an architecture in which a "meta-model" takes activations from an "input-model" and answers natural language questions about the input-model's behaviors. We evaluate meta-models' ability to generalize by training them on selected task types and assessing their out-of-distribution performance in deceptive scenarios. Our findings show that meta-models generalize well to out-of-distribution tasks and point towards opportunities for future research in this area. +
+
+ comment: 11 pages, 2 figures +
+
+
+
+
+ + ☆ Towards a Theoretical Understanding of Memorization in Diffusion Models + + +
+ As diffusion probabilistic models (DPMs) are being employed as mainstream models for Generative Artificial Intelligence (GenAI), the study of their memorization of training data has attracted growing attention. Existing works in this direction aim to establish an understanding of whether or to what extent DPMs learn via memorization. Such an understanding is crucial for identifying potential risks of data leakage and copyright infringement in diffusion models and, more importantly, for trustworthy application of GenAI. Existing works revealed that conditional DPMs are more prone to training data memorization than unconditional DPMs, and the data extraction methods motivated by these findings are mostly designed for conditional DPMs. However, these understandings are primarily empirical, and extracting training data from unconditional models has been found to be extremely challenging. In this work, we provide a theoretical understanding of memorization in both conditional and unconditional DPMs under the assumption of model convergence. Our theoretical analysis indicates that extracting data from unconditional models can also be effective by constructing a proper surrogate condition. Based on this result, we propose a novel data extraction method named \textbf{Surrogate condItional Data Extraction (SIDE)} that leverages a time-dependent classifier trained on the generated data as a surrogate condition to extract training data from unconditional DPMs. Empirical results demonstrate that our SIDE can extract training data in challenging scenarios where previous methods fail, and it is, on average, over 50\% more effective across different scales of the CelebA dataset. +
+
+ comment: arXiv admin note: text overlap with arXiv:2406.12752 +
+
+
+
+
+ + ☆ Quantifying User Coherence: A Unified Framework for Cross-Domain + Recommendation Analysis + + +
+ The effectiveness of Recommender Systems (RS) is closely tied to the quality +and distinctiveness of user profiles, yet despite many advancements in raw +performance, the sensitivity of RS to user profile quality remains +under-researched. This paper introduces novel information-theoretic measures +for understanding recommender systems: a "surprise" measure quantifying users' +deviations from popular choices, and a "conditional surprise" measure capturing +user interaction coherence. We evaluate 7 recommendation algorithms across 9 +datasets, revealing the relationships between our measures and standard +performance metrics. Using a rigorous statistical framework, our analysis +quantifies how much user profile density and information measures impact +algorithm performance across domains. By segmenting users based on these +measures, we achieve improved performance with reduced data and show that +simpler algorithms can match complex ones for low-coherence users. +Additionally, we employ our measures to analyze how well different +recommendation algorithms maintain the coherence and diversity of user +preferences in their predictions, providing insights into algorithm behavior. +This work advances the theoretical understanding of user behavior and practical +heuristics for personalized recommendation systems, promoting more efficient +and adaptive architectures. + +
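A plausible way to read the two measures named above (the paper's exact formulations may differ) is information-theoretic: a user's "surprise" as the average self-information -log p(item) under global item popularity, and "conditional surprise" as the average -log p(item | previous item) estimated from consecutive interactions. The sketch below computes both on toy interaction data under these assumed definitions.

```python
import numpy as np
from collections import Counter

interactions = {                      # user -> ordered list of item ids (toy data)
    "u1": [0, 1, 2, 1, 3],
    "u2": [4, 4, 4, 5],
    "u3": [0, 2, 5, 6],
}
all_items = [i for seq in interactions.values() for i in seq]
p_item = {i: c / len(all_items) for i, c in Counter(all_items).items()}

pairs = Counter()
for seq in interactions.values():
    pairs.update(zip(seq[:-1], seq[1:]))
pred_counts = Counter(a for (a, b) in pairs.elements())
p_next = {(a, b): c / pred_counts[a] for (a, b), c in pairs.items()}

for user, seq in interactions.items():
    surprise = np.mean([-np.log(p_item[i]) for i in seq])
    cond = np.mean([-np.log(p_next[(a, b)]) for a, b in zip(seq[:-1], seq[1:])])
    print(f"{user}: surprise={surprise:.2f} nats, conditional surprise={cond:.2f} nats")
```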
+
+
+
+
+ + ☆ Personalized Federated Learning for Generative AI-Assisted Semantic + Communications + + +
+ Semantic Communication (SC) focuses on transmitting only the semantic +information rather than the raw data. This approach offers an efficient +solution to the issue of spectrum resource utilization caused by the various +intelligent applications on Mobile Users (MUs). Generative Artificial +Intelligence (GAI) models have recently exhibited remarkable content generation +and signal processing capabilities, presenting new opportunities for enhancing +SC. Therefore, we propose a GAI-assisted SC (GSC) model deployed between MUs +and the Base Station (BS). Then, to train the GSC model using the local data of +MUs while ensuring privacy and accommodating heterogeneous requirements of MUs, +we introduce Personalized Semantic Federated Learning (PSFL). This approach +incorporates a novel Personalized Local Distillation (PLD) and Adaptive Global +Pruning (AGP). In PLD, each MU selects a personalized GSC model as a mentor +tailored to its local resources and a unified Convolutional Neural Networks +(CNN)-based SC (CSC) model as a student. This mentor model is then distilled +into the student model for global aggregation. In AGP, we perform network +pruning on the aggregated global model according to real-time communication +environments, reducing communication energy. Finally, numerical results +demonstrate the feasibility and efficiency of the proposed PSFL scheme. + +
+
+
+
+
+ + ☆ Clinnova Federated Learning Proof of Concept: Key Takeaways from a + Cross-border Collaboration + + +
+ Clinnova, a collaborative initiative involving France, Germany, Switzerland, +and Luxembourg, is dedicated to unlocking the power of precision medicine +through data federation, standardization, and interoperability. This European +Greater Region initiative seeks to create an interoperable European standard +using artificial intelligence (AI) and data science to enhance healthcare +outcomes and efficiency. Key components include multidisciplinary research +centers, a federated biobanking strategy, a digital health innovation platform, +and a federated AI strategy. It targets inflammatory bowel disease, rheumatoid +diseases, and multiple sclerosis (MS), emphasizing data quality to develop AI +algorithms for personalized treatment and translational research. + The IHU Strasbourg (Institute of Minimal-invasive Surgery) has the lead in +this initiative to develop the federated learning (FL) proof of concept (POC) +that will serve as a foundation for advancing AI in healthcare. At its core, +Clinnova-MS aims to enhance MS patient care by using FL to develop more +accurate models that detect disease progression, guide interventions, and +validate digital biomarkers across multiple sites. This technical report +presents insights and key takeaways from the first cross-border federated POC +on MS segmentation of MRI images within the Clinnova framework. While our work +marks a significant milestone in advancing MS segmentation through cross-border +collaboration, it also underscores the importance of addressing technical, +logistical, and ethical considerations to realize the full potential of FL in +healthcare settings. + +
+
+
+
+
+ + ☆ Learning K-U-Net with constant complexity: An Application to time series + forecasting + + +
+ Training deep models for time series forecasting is a critical task with an +inherent challenge of time complexity. While current methods generally ensure +linear time complexity, our observations on temporal redundancy show that +high-level features are learned 98.44\% slower than low-level features. To +address this issue, we introduce a new exponentially weighted stochastic +gradient descent algorithm designed to achieve constant time complexity in deep +learning models. We prove that the theoretical complexity of this learning +method is constant. Evaluation of this method on Kernel U-Net (K-U-Net) on +synthetic datasets shows a significant reduction in complexity while improving +the accuracy of the test set. + +
+
+
+
+
+ + ☆ Better Call SAUL: Fluent and Consistent Language Model Editing with + Generation Regularization + + +
+ To ensure large language models contain up-to-date knowledge, they need to be +updated regularly. However, model editing is challenging as it might also +affect knowledge that is unrelated to the new data. State-of-the-art methods +identify parameters associated with specific knowledge and then modify them via +direct weight updates. However, these locate-and-edit methods suffer from heavy +computational overhead and lack theoretical validation. In contrast, directly +fine-tuning the model on requested edits affects the model's behavior on +unrelated knowledge, and significantly damages the model's generation fluency +and consistency. To address these challenges, we propose SAUL, a streamlined +model editing method that uses sentence concatenation with augmented random +facts for generation regularization. Evaluations on three model editing +benchmarks show that SAUL is a practical and reliable solution for model +editing outperforming state-of-the-art methods while maintaining generation +quality and reducing computational overhead. + +
+
+
+
+
+ + ☆ Predictive Attractor Models NeurIPS 2024 + + +
+ Sequential memory, the ability to form and accurately recall a sequence of +events or stimuli in the correct order, is a fundamental prerequisite for +biological and artificial intelligence as it underpins numerous cognitive +functions (e.g., language comprehension, planning, episodic memory formation, +etc.) However, existing methods of sequential memory suffer from catastrophic +forgetting, limited capacity, slow iterative learning procedures, low-order +Markov memory, and, most importantly, the inability to represent and generate +multiple valid future possibilities stemming from the same context. Inspired by +biologically plausible neuroscience theories of cognition, we propose +\textit{Predictive Attractor Models (PAM)}, a novel sequence memory +architecture with desirable generative properties. PAM is a streaming model +that learns a sequence in an online, continuous manner by observing each input +\textit{only once}. Additionally, we find that PAM avoids catastrophic +forgetting by uniquely representing past context through lateral inhibition in +cortical minicolumns, which prevents new memories from overwriting previously +learned knowledge. PAM generates future predictions by sampling from a union +set of predicted possibilities; this generative ability is realized through an +attractor model trained alongside the predictor. We show that PAM is trained +with local computations through Hebbian plasticity rules in a biologically +plausible framework. Other desirable traits (e.g., noise tolerance, CPU-based +learning, capacity scaling) are discussed throughout the paper. Our findings +suggest that PAM represents a significant step forward in the pursuit of +biologically plausible and computationally efficient sequential memory models, +with broad implications for cognitive science and artificial intelligence +research. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ LLM-Pilot: Characterize and Optimize Performance of your LLM Inference + Services SC '24 + + +
+ As Large Language Models (LLMs) are rapidly growing in popularity, LLM +inference services must be able to serve requests from thousands of users while +satisfying performance requirements. The performance of an LLM inference +service is largely determined by the hardware onto which it is deployed, but +understanding of which hardware will deliver on performance requirements +remains challenging. In this work we present LLM-Pilot - a first-of-its-kind +system for characterizing and predicting performance of LLM inference services. +LLM-Pilot performs benchmarking of LLM inference services, under a realistic +workload, across a variety of GPUs, and optimizes the service configuration for +each considered GPU to maximize performance. Finally, using this +characterization data, LLM-Pilot learns a predictive model, which can be used +to recommend the most cost-effective hardware for a previously unseen LLM. +Compared to existing methods, LLM-Pilot can deliver on performance requirements +33% more frequently, whilst reducing costs by 60% on average. + +
+
+ comment: Accepted to the International Conference for High Performance + Computing, Networking, Storage and Analysis (SC '24) +
+
+
+
+
+ + ☆ PnP-Flow: Plug-and-Play Image Restoration with Flow Matching + + +
+ In this paper, we introduce Plug-and-Play (PnP) Flow Matching, an algorithm +for solving imaging inverse problems. PnP methods leverage the strength of +pre-trained denoisers, often deep neural networks, by integrating them in +optimization schemes. While they achieve state-of-the-art performance on +various inverse problems in imaging, PnP approaches face inherent limitations +on more generative tasks like inpainting. On the other hand, generative models +such as Flow Matching pushed the boundary in image sampling yet lack a clear +method for efficient use in image restoration. We propose to combine the PnP +framework with Flow Matching (FM) by defining a time-dependent denoiser using a +pre-trained FM model. Our algorithm alternates between gradient descent steps +on the data-fidelity term, reprojections onto the learned FM path, and +denoising. Notably, our method is computationally efficient and +memory-friendly, as it avoids backpropagation through ODEs and trace +computations. We evaluate its performance on denoising, super-resolution, +deblurring, and inpainting tasks, demonstrating superior results compared to +existing PnP algorithms and Flow Matching based state-of-the-art methods. + +
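Only to situate the alternating structure (a gradient step on the data-fidelity term followed by a step through a time-dependent denoiser), here is a toy sketch with a linear degradation and a placeholder denoiser. The interpolation used as the "reprojection", the time schedule, and the step size are guesses; the paper's actual algorithm uses a pre-trained flow-matching model and is not reproduced here.

```python
import numpy as np

rng = np.random.default_rng(0)
d, m = 64, 32
A = rng.standard_normal((m, d)) / np.sqrt(d)        # toy degradation operator
x_true = rng.standard_normal(d)
y = A @ x_true + 0.05 * rng.standard_normal(m)       # noisy measurements

def fm_denoiser(x, t):
    # placeholder for a denoiser built from a pre-trained FM velocity field;
    # here it simply shrinks towards zero, more strongly at larger (noisier) times
    return x / (1.0 + t)

x = A.T @ y                                           # crude initialization
eta = 0.5
for t in np.linspace(1.0, 0.0, 20):
    x = x - eta * A.T @ (A @ x - y)                   # gradient step on the data-fidelity term
    x = (1 - t) * fm_denoiser(x, t) + t * x           # assumed "reprojection" along the path
print("relative error:", np.linalg.norm(x - x_true) / np.linalg.norm(x_true))
```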
+
+
+
+
+ + ☆ MenakBERT -- Hebrew Diacriticizer SC + + +
+ Diacritical marks in the Hebrew language give words their vocalized form. The task of adding diacritical marks to plain Hebrew text is still dominated by a system that relies heavily on human-curated resources. Recent models trained on diacritized Hebrew texts still present a performance gap. We use a recently developed char-based PLM to narrow this gap, presenting MenakBERT, a character-level transformer pretrained on Hebrew text and fine-tuned to produce diacritical marks for Hebrew sentences. We further show how fine-tuning a model for diacritization transfers to a task such as part-of-speech tagging. +
+
+ comment: Published at ISCOL2022 as a poster +
+
+
+
+
+ + ☆ Eliminating Oversaturation and Artifacts of High Guidance Scales in + Diffusion Models + + +
+ Classifier-free guidance (CFG) is crucial for improving both generation +quality and alignment between the input condition and final output in diffusion +models. While a high guidance scale is generally required to enhance these +aspects, it also causes oversaturation and unrealistic artifacts. In this +paper, we revisit the CFG update rule and introduce modifications to address +this issue. We first decompose the update term in CFG into parallel and +orthogonal components with respect to the conditional model prediction and +observe that the parallel component primarily causes oversaturation, while the +orthogonal component enhances image quality. Accordingly, we propose +down-weighting the parallel component to achieve high-quality generations +without oversaturation. Additionally, we draw a connection between CFG and +gradient ascent and introduce a new rescaling and momentum method for the CFG +update rule based on this insight. Our approach, termed adaptive projected +guidance (APG), retains the quality-boosting advantages of CFG while enabling +the use of higher guidance scales without oversaturation. APG is easy to +implement and introduces practically no additional computational overhead to +the sampling process. Through extensive experiments, we demonstrate that APG is +compatible with various conditional diffusion models and samplers, leading to +improved FID, recall, and saturation scores while maintaining precision +comparable to CFG, making our method a superior plug-and-play alternative to +standard classifier-free guidance. + +
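The decomposition described above is straightforward to write down: project the guidance term (cond - uncond) onto the conditional prediction, down-weight that parallel part, and keep the orthogonal part. The sketch below does exactly that; the guidance scale and the choice to zero out the parallel component are illustrative, and the paper's APG additionally introduces rescaling and momentum, which are omitted here.

```python
import torch

def apg_style_update(cond, uncond, guidance_scale=7.5, parallel_weight=0.0):
    diff = cond - uncond
    dims = tuple(range(1, cond.ndim))
    # per-sample projection of the guidance term onto the conditional prediction
    coef = (diff * cond).sum(dim=dims, keepdim=True) / cond.pow(2).sum(dim=dims, keepdim=True)
    parallel = coef * cond
    orthogonal = diff - parallel
    return cond + (guidance_scale - 1) * (parallel_weight * parallel + orthogonal)

cond, uncond = torch.randn(2, 4, 8, 8), torch.randn(2, 4, 8, 8)
out_cfg = cond + (7.5 - 1) * (cond - uncond)        # standard classifier-free guidance
out_apg = apg_style_update(cond, uncond)            # parallel component suppressed
print("CFG norm:", out_cfg.norm().item(), "| APG-style norm:", out_apg.norm().item())
```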
+
+
+
+
+ + ☆ An Online Feasible Point Method for Benign Generalized Nash Equilibrium + Problems + + +
+ We consider a repeatedly played generalized Nash equilibrium game. This +induces a multi-agent online learning problem with joint constraints. An +important challenge in this setting is that the feasible set for each agent +depends on the simultaneous moves of the other agents and, therefore, varies +over time. As a consequence, the agents face time-varying constraints, which +are not adversarial but rather endogenous to the system. Prior work in this +setting focused on convergence to a feasible solution in the limit via +integrating the constraints in the objective as a penalty function. However, no +existing work can guarantee that the constraints are satisfied for all +iterations while simultaneously guaranteeing convergence to a generalized Nash +equilibrium. This is a problem of fundamental theoretical interest and +practical relevance. In this work, we introduce a new online feasible point +method. Under the assumption that limited communication between the agents is +allowed, this method guarantees feasibility. We identify the class of benign +generalized Nash equilibrium problems, for which the convergence of our method +to the equilibrium is guaranteed. We set this class of benign generalized Nash +equilibrium games in context with existing definitions and illustrate our +method with examples. + +
+
+
+
+
+ + ☆ Parameter Competition Balancing for Model Merging NeurIPS2024 + + +
+ While fine-tuning pretrained models has become common practice, these models +often underperform outside their specific domains. Recently developed model +merging techniques enable the direct integration of multiple models, each +fine-tuned for distinct tasks, into a single model. This strategy promotes +multitasking capabilities without requiring retraining on the original +datasets. However, existing methods fall short in addressing potential +conflicts and complex correlations between tasks, especially in parameter-level +adjustments, posing a challenge in effectively balancing parameter competition +across various tasks. This paper introduces an innovative technique named +PCB-Merging (Parameter Competition Balancing), a lightweight and training-free +technique that adjusts the coefficients of each parameter for effective model +merging. PCB-Merging employs intra-balancing to gauge parameter significance +within individual tasks and inter-balancing to assess parameter similarities +across different tasks. Parameters with low importance scores are dropped, and +the remaining ones are rescaled to form the final merged model. We assessed our +approach in diverse merging scenarios, including cross-task, cross-domain, and +cross-training configurations, as well as out-of-domain generalization. The +experimental results reveal that our approach achieves substantial performance +enhancements across multiple modalities, domains, model sizes, number of tasks, +fine-tuning forms, and large language models, outperforming existing model +merging methods. The code is publicly available at: +\url{https://github.com/duguodong7/pcb-merging}. + +
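A schematic of the overall pipeline on flattened "task vectors" (fine-tuned weights minus the pretrained base): score each parameter within its task (intra-balancing) and across tasks (inter-balancing), drop the lowest-scoring entries, and rescale the rest before summing into the merged model. The magnitude- and sign-agreement-based scores below are stand-ins; the paper defines its own balancing formulas.

```python
import torch

torch.manual_seed(0)
base = torch.randn(1000)                                     # pretrained parameters (flattened)
task_vectors = [torch.randn(1000) * 0.1 for _ in range(3)]   # fine-tuned minus base, per task

intra = torch.stack([tv.abs() / tv.abs().max() for tv in task_vectors])   # within-task importance
inter = torch.stack([
    torch.stack([(torch.sign(tv) == torch.sign(other)).float() for other in task_vectors]).mean(0)
    for tv in task_vectors
])                                                           # cross-task sign agreement
score = intra * inter

threshold = score.flatten().kthvalue(int(0.2 * score.numel())).values     # drop lowest 20%
keep = score >= threshold
merged = base + (score * keep * torch.stack(task_vectors)).sum(0) / (score * keep).sum(0).clamp_min(1e-8)
print("merged shape:", merged.shape, "| fraction kept:", keep.float().mean().item())
```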
+
+ comment: Accepted by NeurIPS2024 +
+
+
+
+
+ + ☆ Online Multi-Label Classification under Noisy and Changing Label + Distribution + + +
+ Multi-label data streams usually contain noisy labels in real-world
+applications, occurring in both relevant and irrelevant labels. However,
+existing online multi-label classification methods are mostly limited in
+terms of label quality and fail to deal with the case of noisy labels. On
+the other hand, the ground-truth label distribution may vary over time,
+which is hidden in the observed noisy label distribution and difficult to
+track, posing a major challenge for concept drift adaptation. Motivated by
+this, we propose an online multi-label classification algorithm under Noisy
+and Changing Label Distribution (NCLD). The convex objective is designed to
+simultaneously model the label scoring and the label ranking for high
+accuracy, whose robustness to NCLD benefits from three novel components:
+1) The local feature graph is used to reconstruct the label scores jointly
+with the observed labels, and an unbiased ranking loss is derived and
+applied to learn reliable ranking information. 2) By detecting the
+difference between two adjacent chunks with the unbiased label cardinality,
+we identify the change in the ground-truth label distribution and reset the
+ranking or all information learned from the past to match the new
+distribution. 3) Efficient and accurate updating is achieved based on the
+updating rule derived from the closed-form optimal model solution. Finally,
+empirical experimental results validate the effectiveness of our method in
+classifying instances under NCLD.
+
+
+
+
+
+ + ☆ MANTRA: The Manifold Triangulations Assemblage + + +
+ The rising interest in leveraging higher-order interactions present in
+complex systems has led to a surge in more expressive models exploiting
+high-order structures in the data, especially in topological deep learning
+(TDL), which designs neural networks on high-order domains such as
+simplicial complexes. However, progress in this field is hindered by the
+scarcity of datasets for benchmarking these architectures. To address this
+gap, we introduce MANTRA, the first large-scale, diverse, and intrinsically
+high-order dataset for benchmarking high-order models, comprising over
+43,000 and 249,000 triangulations of surfaces and three-dimensional
+manifolds, respectively. With MANTRA, we assess several graph- and
+simplicial complex-based models on three topological classification tasks.
+We demonstrate that while simplicial complex-based neural networks
+generally outperform their graph-based counterparts in capturing simple
+topological invariants, they also struggle, suggesting a need to rethink
+TDL. Thus, MANTRA serves as a benchmark for assessing and advancing
+topological methods, leading the way for more effective high-order models.
+
+
+ comment: 26 pages, 2 figures, 22 tables +
+
+
+
+
+ + ☆ Diffusion Meets Options: Hierarchical Generative Skill Composition for + Temporally-Extended Tasks + + +
+ Safe and successful deployment of robots requires not only the ability to
+generate complex plans but also the capacity to frequently replan and
+correct execution errors. This paper addresses the challenge of
+long-horizon trajectory planning under temporally extended objectives in a
+receding horizon manner. To this end, we propose DOPPLER, a data-driven
+hierarchical framework that generates and updates plans based on
+instructions specified in linear temporal logic (LTL). Our method
+decomposes temporal tasks into a chain of options with hierarchical
+reinforcement learning from offline non-expert datasets. It leverages
+diffusion models to generate options with low-level actions. We devise a
+determinantal-guided posterior sampling technique during batch generation,
+which improves the speed and diversity of diffusion-generated options,
+leading to more efficient querying. Experiments on robot navigation and
+manipulation tasks demonstrate that DOPPLER can generate sequences of
+trajectories that progressively satisfy the specified formulae for obstacle
+avoidance and sequential visitation. Demonstration videos are available
+online at: https://philiptheother.github.io/doppler/.
+
+
+
+
+
+ + ☆ BiSSL: Bilevel Optimization for Self-Supervised Pre-Training and + Fine-Tuning + + +
+ In this work, we present BiSSL, a first-of-its-kind training framework that +introduces bilevel optimization to enhance the alignment between the pretext +pre-training and downstream fine-tuning stages in self-supervised learning. +BiSSL formulates the pretext and downstream task objectives as the lower- and +upper-level objectives in a bilevel optimization problem and serves as an +intermediate training stage within the self-supervised learning pipeline. By +more explicitly modeling the interdependence of these training stages, BiSSL +facilitates enhanced information sharing between them, ultimately leading to a +backbone parameter initialization that is better suited for the downstream +task. We propose a training algorithm that alternates between optimizing the +two objectives defined in BiSSL. Using a ResNet-18 backbone pre-trained with +SimCLR on the STL10 dataset, we demonstrate that our proposed framework +consistently achieves improved or competitive classification accuracies across +various downstream image classification datasets compared to the conventional +self-supervised learning pipeline. Qualitative analyses of the backbone +features further suggest that BiSSL enhances the alignment of downstream +features in the backbone prior to fine-tuning. + +
+
+
+
+
+ + ♻ ☆ CMP: Cooperative Motion Prediction with Multi-Agent Communication + + +
+ The confluence of the advancement of Autonomous Vehicles (AVs) and the +maturity of Vehicle-to-Everything (V2X) communication has enabled the +capability of cooperative connected and automated vehicles (CAVs). Building on +top of cooperative perception, this paper explores the feasibility and +effectiveness of cooperative motion prediction. Our method, CMP, takes LiDAR +signals as model input to enhance tracking and prediction capabilities. Unlike +previous work that focuses separately on either cooperative perception or +motion prediction, our framework, to the best of our knowledge, is the first to +address the unified problem where CAVs share information in both perception and +prediction modules. Incorporated into our design is the unique capability to +tolerate realistic V2X bandwidth limitations and transmission delays, while +dealing with bulky perception representations. We also propose a prediction +aggregation module, which unifies the predictions obtained by different CAVs +and generates the final prediction. Through extensive experiments and ablation +studies on the OPV2V and V2V4Real datasets, we demonstrate the effectiveness of +our method in cooperative perception, tracking, and motion prediction. In +particular, CMP reduces the average prediction error by 16.4\% with fewer +missing detections compared with the no cooperation setting and by 12.3\% +compared with the strongest baseline. Our work marks a significant step forward +in the cooperative capabilities of CAVs, showcasing enhanced performance in +complex scenarios. The code can be found on the project website: +https://cmp-cooperative-prediction.github.io/. + +
+
+ comment: Project website: https://cmp-cooperative-prediction.github.io/ +
+
+
+
+
+ + ♻ ☆ Accelerating Training with Neuron Interaction and Nowcasting Networks + + +
+ Neural network training can be accelerated when a learnable update rule is +used in lieu of classic adaptive optimizers (e.g. Adam). However, learnable +update rules can be costly and unstable to train and use. Recently, Jang et al. +(2023) proposed a simpler approach to accelerate training based on weight +nowcaster networks (WNNs). In their approach, Adam is used for most of the +optimization steps and periodically, only every few steps, a WNN nowcasts +(predicts near future) parameters. We improve WNNs by proposing neuron +interaction and nowcasting (NiNo) networks. In contrast to WNNs, NiNo leverages +neuron connectivity and graph neural networks to more accurately nowcast +parameters. We further show that in some networks, such as Transformers, +modeling neuron connectivity accurately is challenging. We address this and +other limitations, which allows NiNo to accelerate Adam training by up to 50% +in vision and language tasks. + +
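+
+ The training loop described above, in schematic form: Adam for most steps,
+with a periodic jump to parameters predicted by a nowcaster network. The
+nowcaster callable, the history cadence and the hyperparameters are
+placeholders, not the released implementation.
+
+```python
+import torch
+
+def train_with_nowcasting(model, loss_fn, data_iter, nowcaster,
+                          period=1000, steps=10000):
+    """Run Adam and periodically replace parameters with nowcast (near-future) values."""
+    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
+    history = []                                    # recent parameter snapshots
+    for step in range(steps):
+        x, y = next(data_iter)
+        opt.zero_grad()
+        loss_fn(model(x), y).backward()
+        opt.step()
+        if step % 100 == 0:
+            history.append([p.detach().clone() for p in model.parameters()])
+        if (step + 1) % period == 0 and len(history) >= 2:
+            predicted = nowcaster(history)          # jump ahead along the trajectory
+            with torch.no_grad():
+                for p, p_new in zip(model.parameters(), predicted):
+                    p.copy_(p_new)
+    return model
+```
+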
+
+ comment: added Llama3-based results and other updates, code is + https://github.com/SamsungSAILMontreal/nino +
+
+
+
+
+ + ♻ ☆ LML-DAP: Language Model Learning a Dataset for Data-Augmented Prediction + + +
+ Classification tasks are typically handled using Machine Learning (ML)
+models, which often struggle to balance accuracy and interpretability. This
+paper introduces a new approach to using Large Language Models (LLMs) for
+classification tasks in an explainable way. Unlike ML models that rely
+heavily on data cleaning and feature engineering, this method streamlines
+the process using LLMs. This paper proposes a new concept called "Language
+Model Learning (LML)" powered by a new method called "Data-Augmented
+Prediction (DAP)". The classification is performed by LLMs in a way similar
+to how humans manually explore and understand the data and decide
+classifications using the data as a reference. In the LML process, a
+dataset is summarized and evaluated to determine the features that
+contribute most to the classification of each label. In the process of DAP,
+the system uses the data summary and a row of the testing dataset to
+automatically generate a query, which is used to retrieve relevant rows
+from the dataset. A classification is generated by the LLM using the data
+summary and the relevant rows, ensuring satisfactory accuracy even with
+complex data through context-aware decision-making. LML and DAP unlock the
+possibilities of new applications. The proposed method uses the words "Act
+as an Explainable Machine Learning Model" in the prompt to enhance the
+interpretability of the predictions by allowing users to review the logic
+behind each prediction. In some test cases, the system scored an accuracy
+above 90%, demonstrating the effectiveness of the system and its potential
+to outperform conventional ML models in various scenarios. The code is
+available at https://github.com/Pro-GenAI/LML-DAP
+
+
+ comment: Updated title, abstract, and images +
+
+
+
+
+ + ♻ ☆ On Training Data Influence of GPT Models EMNLP 2024 + + +
+ Amidst the rapid advancements in generative language models, the +investigation of how training data shapes the performance of GPT models is +still emerging. This paper presents GPTfluence, a novel approach that leverages +a featurized simulation to assess the impact of training examples on the +training dynamics of GPT models. Our approach not only traces the influence of +individual training instances on performance trajectories, such as loss and +other key metrics, on targeted test points but also enables a comprehensive +comparison with existing methods across various training scenarios in GPT +models, ranging from 14 million to 2.8 billion parameters, across a range of +downstream tasks. Contrary to earlier methods that struggle with generalization +to new data, GPTfluence introduces a parameterized simulation of training +dynamics, demonstrating robust generalization capabilities to unseen training +data. This adaptability is evident across both fine-tuning and +instruction-tuning scenarios, spanning tasks in natural language understanding +and generation. We make our code and data publicly available at +https://github.com/ernie-research/gptfluence. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Preble: Efficient Distributed Prompt Scheduling for LLM Serving + + +
+ Prompts to large language models (LLMs) have evolved beyond simple user +questions. For LLMs to solve complex problems, today's practices are to include +domain-specific instructions, illustration of tool usages, and/or long context +such as textbook chapters in prompts. As such, many parts of prompts are +repetitive across requests. Recent works propose to cache and reuse KV state of +prompts. However, they are all confined to a single-GPU optimization, while +production LLM serving systems are distributed by nature. + This paper proposes Preble, the first distributed LLM serving platform that +targets and optimizes for prompt sharing. We designed a distributed scheduling +system that co-optimizes KV state reuse and computation load-balancing with a +new scheduling algorithm and a hierarchical scheduling mechanism. Our +evaluation of Preble with real workloads and request arrival patterns on two +open-source LLMs shows that Preble outperforms the SOTA serving systems by 1.5X +to 14.5X on average latency and 2X to 10X on p99 latency. + +
+
+
+
+
+ + ♻ ☆ E(n) Equivariant Topological Neural Networks + + +
+ Graph neural networks excel at modeling pairwise interactions, but they +cannot flexibly accommodate higher-order interactions and features. Topological +deep learning (TDL) has emerged recently as a promising tool for addressing +this issue. TDL enables the principled modeling of arbitrary multi-way, +hierarchical higher-order interactions by operating on combinatorial +topological spaces, such as simplicial or cell complexes, instead of graphs. +However, little is known about how to leverage geometric features such as +positions and velocities for TDL. This paper introduces E(n)-Equivariant +Topological Neural Networks (ETNNs), which are E(n)-equivariant message-passing +networks operating on combinatorial complexes, formal objects unifying graphs, +hypergraphs, simplicial, path, and cell complexes. ETNNs incorporate geometric +node features while respecting rotation, reflection, and translation +equivariance. Moreover, ETNNs are natively ready for settings with +heterogeneous interactions. We provide a theoretical analysis to show the +improved expressiveness of ETNNs over architectures for geometric graphs. We +also show how E(n)-equivariant variants of TDL models can be directly derived +from our framework. The broad applicability of ETNNs is demonstrated through +two tasks of vastly different scales: i) molecular property prediction on the +QM9 benchmark and ii) land-use regression for hyper-local estimation of air +pollution with multi-resolution irregular geospatial data. The results indicate +that ETNNs are an effective tool for learning from diverse types of richly +structured data, as they match or surpass SotA equivariant TDL models with a +significantly smaller computational burden, thus highlighting the benefits of a +principled geometric inductive bias. + +
+
+ comment: 41 pages, 11 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ Unichain and Aperiodicity are Sufficient for Asymptotic Optimality of + Average-Reward Restless Bandits + + +
+ We consider the infinite-horizon, average-reward restless bandit problem in
+discrete time. We propose a new class of policies that are designed to
+drive a progressively larger subset of arms toward the optimal
+distribution. We show that our policies are asymptotically optimal with an
+$O(1/\sqrt{N})$ optimality gap for an $N$-armed problem, under only a
+unichain and aperiodicity assumption. Our approach departs from most
+existing work that focuses on index or priority policies, which rely on the
+Global Attractor Property (GAP) to guarantee convergence to the optimum, or
+a recently developed simulation-based policy, which requires a
+Synchronization Assumption (SA).
+
+
+ comment: 58 pages, 14 figures. This version includes a restructured main + result section and new experiments +
+
+
+
+
+ + ♻ ☆ Lookback Lens: Detecting and Mitigating Contextual Hallucinations in + Large Language Models Using Only Attention Maps EMNLP 2024 + + +
+ When asked to summarize articles or answer questions given a passage, large +language models (LLMs) can hallucinate details and respond with unsubstantiated +answers that are inaccurate with respect to the input context. This paper +describes a simple approach for detecting such contextual hallucinations. We +hypothesize that contextual hallucinations are related to the extent to which +an LLM attends to information in the provided context versus its own +generations. Based on this intuition, we propose a simple hallucination +detection model whose input features are given by the ratio of attention +weights on the context versus newly generated tokens (for each attention head). +We find that a linear classifier based on these lookback ratio features is as +effective as a richer detector that utilizes the entire hidden states of an LLM +or a text-based entailment model. The lookback ratio-based detector -- Lookback +Lens -- is found to transfer across tasks and even models, allowing a detector +that is trained on a 7B model to be applied (without retraining) to a larger +13B model. We further apply this detector to mitigate contextual +hallucinations, and find that a simple classifier-guided decoding approach is +able to reduce the amount of hallucination, for example by 9.6% in the XSum +summarization task. + +
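+
+ A rough sketch of the lookback-ratio feature described above, assuming
+Hugging Face-style per-layer attention tensors for the newest token of
+shape (batch, heads, 1, seq_len); the paper's exact feature construction
+and classifier may differ.
+
+```python
+import torch
+
+def lookback_ratios(attentions, context_len):
+    """Per-head ratio of attention mass on the context vs. newly generated tokens."""
+    feats = []
+    for layer_attn in attentions:              # one tensor per layer
+        attn = layer_attn[..., 0, :]           # (batch, heads, seq_len)
+        on_context = attn[..., :context_len].sum(-1)
+        on_generated = attn[..., context_len:].sum(-1)
+        feats.append(on_context / (on_context + on_generated + 1e-12))
+    return torch.cat(feats, dim=-1)            # (batch, layers * heads)
+
+# A linear probe (e.g. sklearn's LogisticRegression) over these features then
+# plays the role of the hallucination detector described in the abstract.
+```
+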
+
+ comment: EMNLP 2024 main conference long paper. The source code is available + at https://github.com/voidism/Lookback-Lens +
+
+
+
+
+ + ♻ ☆ VideoPhy: Evaluating Physical Commonsense for Video Generation + + +
+ Recent advances in internet-scale video data pretraining have led to the
+development of text-to-video generative models that can create high-quality
+videos across a broad range of visual concepts, synthesize realistic
+motions and render complex objects. Hence, these generative models have the
+potential to become general-purpose simulators of the physical world.
+However, it is unclear how far we are from this goal with the existing
+text-to-video generative models. To this end, we present VideoPhy, a
+benchmark designed to assess whether the generated videos follow physical
+commonsense for real-world activities (e.g. marbles will roll down when
+placed on a slanted surface). Specifically, we curate diverse prompts that
+involve interactions between various material types in the physical world
+(e.g., solid-solid, solid-fluid, fluid-fluid). We then generate videos
+conditioned on these captions from diverse state-of-the-art text-to-video
+generative models, including open models (e.g., CogVideoX) and closed
+models (e.g., Lumiere, Dream Machine). Our human evaluation reveals that
+the existing models severely lack the ability to generate videos adhering
+to the given text prompts, while also lacking physical commonsense.
+Specifically, the best performing model, CogVideoX-5B, generates videos
+that adhere to the caption and physical laws for 39.6% of the instances.
+VideoPhy thus highlights that the video generative models are far from
+accurately simulating the physical world. Finally, we propose an
+auto-evaluator, VideoCon-Physics, to assess the performance reliably for
+the newly released models.
+
+
+ comment: 43 pages, 29 figures, 12 tables. Added CogVideo and Dream Machine in + v2 +
+
+
+
+
+ + ♻ ☆ Collaborative learning of common latent representations in routinely + collected multivariate ICU physiological signals ICASSP + + +
+ In Intensive Care Units (ICU), the abundance of multivariate time series +presents an opportunity for machine learning (ML) to enhance patient +phenotyping. In contrast to previous research focused on electronic health +records (EHR), here we propose an ML approach for phenotyping using routinely +collected physiological time series data. Our new algorithm integrates Long +Short-Term Memory (LSTM) networks with collaborative filtering concepts to +identify common physiological states across patients. Tested on real-world ICU +clinical data for intracranial hypertension (IH) detection in patients with +brain injury, our method achieved an area under the curve (AUC) of 0.889 and +average precision (AP) of 0.725. Moreover, our algorithm outperforms +autoencoders in learning more structured latent representations of the +physiological signals. These findings highlight the promise of our methodology +for patient phenotyping, leveraging routinely collected multivariate time +series to improve clinical care practices. + +
+
+ comment: Published in 2024 IEEE International Conference on Acoustics, Speech, + and Signal Processing Workshops (ICASSPW) +
+
+
+
+
+ + ♻ ☆ Generalizing Medical Image Representations via Quaternion Wavelet + Networks + + +
+ Neural network generalizability is becoming a broad research field due to
+the increasing availability of datasets from different sources and for
+various tasks. This issue is even more pronounced when processing medical
+data, where a lack of methodological standards causes large variations in
+data provided by different imaging centers or acquired with various devices
+and cofactors. To overcome these limitations, we introduce a novel,
+generalizable, data- and task-agnostic framework able to extract salient
+features from medical images. The proposed quaternion wavelet network
+(QUAVE) can be easily integrated with any pre-existing medical image
+analysis or synthesis task, and it can be used with real, quaternion, or
+hypercomplex-valued models, generalizing their adoption to single-channel
+data. QUAVE first extracts different sub-bands through the quaternion
+wavelet transform, resulting in both low-frequency/approximation bands and
+high-frequency/fine-grained features. Then, it weighs the most
+representative set of sub-bands to be used as input to any other neural
+model for image processing, replacing standard data samples. We conduct an
+extensive experimental evaluation comprising different datasets, diverse
+image analysis, and synthesis tasks including reconstruction, segmentation,
+and modality translation. We also evaluate QUAVE in combination with both
+real and quaternion-valued models. Results demonstrate the effectiveness
+and the generalizability of the proposed framework, which improves network
+performance while being flexible enough to be adopted in manifold scenarios
+and robust to domain shifts. The full code is available at:
+https://github.com/ispamm/QWT.
+
+
+ comment: This paper is currently under review +
+
+
+
+
+ + ♻ ☆ On the Limited Generalization Capability of the Implicit Reward Model + Induced by Direct Preference Optimization EMNLP + + +
+ Reinforcement Learning from Human Feedback (RLHF) is an effective approach
+for aligning language models to human preferences. Central to RLHF is
+learning a reward function for scoring human preferences. Two main
+approaches for learning a reward model are 1) training an EXplicit Reward
+Model (EXRM) as in RLHF, and 2) using an implicit reward learned from
+preference data through methods such as Direct Preference Optimization
+(DPO). Prior work has shown that the implicit reward model of DPO (denoted
+as DPORM) can approximate an EXRM in the limit. DPORM's effectiveness
+directly implies the optimality of the learned policy, and also has
+practical implications for LLM alignment methods including iterative DPO.
+However, it is unclear how well DPORM empirically matches the performance
+of EXRM. This work studies the accuracy in distinguishing preferred and
+rejected answers for both DPORM and EXRM. Our findings indicate that even
+though DPORM fits the training dataset comparably, it generalizes less
+effectively than EXRM, especially when the validation datasets contain
+distribution shifts. Across five out-of-distribution settings, DPORM has a
+mean drop in accuracy of 3% and a maximum drop of 7%. These findings
+highlight that DPORM has limited generalization ability and substantiate
+the integration of an explicit reward model in iterative DPO approaches.
+
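+
+ For reference, the implicit reward induced by DPO (up to a prompt-only
+constant) and the pairwise accuracy used to compare reward models can be
+written as below; beta and the summed token log-probabilities follow the
+standard DPO setup.
+
+```python
+import torch
+
+def dpo_implicit_reward(policy_logprob, ref_logprob, beta=0.1):
+    """r(x, y) = beta * [log pi_theta(y|x) - log pi_ref(y|x)], up to a constant in x."""
+    return beta * (policy_logprob - ref_logprob)
+
+def preference_accuracy(chosen_rewards, rejected_rewards):
+    """Fraction of pairs where the reward model ranks the preferred answer higher."""
+    return (chosen_rewards > rejected_rewards).float().mean().item()
+```
+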
+
+ comment: 12 pages, 8 tables, 3 figures; Paper Accepted at EMNLP Findings 2024 +
+
+
+
+
+ + ♻ ☆ Jailbreaking LLMs with Arabic Transliteration and Arabizi EMNLP 2024 + + +
+ This study identifies the potential vulnerabilities of Large Language Models +(LLMs) to 'jailbreak' attacks, specifically focusing on the Arabic language and +its various forms. While most research has concentrated on English-based prompt +manipulation, our investigation broadens the scope to investigate the Arabic +language. We initially tested the AdvBench benchmark in Standardized Arabic, +finding that even with prompt manipulation techniques like prefix injection, it +was insufficient to provoke LLMs into generating unsafe content. However, when +using Arabic transliteration and chatspeak (or arabizi), we found that unsafe +content could be produced on platforms like OpenAI GPT-4 and Anthropic Claude 3 +Sonnet. Our findings suggest that using Arabic and its various forms could +expose information that might remain hidden, potentially increasing the risk of +jailbreak attacks. We hypothesize that this exposure could be due to the +model's learned connection to specific words, highlighting the need for more +comprehensive safety training across all language forms. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Fair Allocation in Dynamic Mechanism Design NeurIPS + + +
+ We consider a dynamic mechanism design problem where an auctioneer sells an
+indivisible good to groups of buyers in every round, for a total of $T$
+rounds. The auctioneer aims to maximize their discounted overall revenue
+while adhering to a fairness constraint that guarantees a minimum average
+allocation for each group. We begin by studying the static case ($T=1$) and
+establish that the optimal mechanism involves two types of subsidization:
+one that increases the overall probability of allocation to all buyers, and
+another that favors the groups which otherwise have a lower probability of
+winning the item. We then extend our results to the dynamic case by
+characterizing a set of recursive functions that determine the optimal
+allocation and payments in each round. Notably, our results establish that
+in the dynamic case, the seller, on the one hand, commits to a
+participation bonus to incentivize truth-telling, and on the other hand,
+charges an entry fee for every round. Moreover, the optimal allocation once
+more involves subsidization, whose extent depends on the difference in
+future utilities for both the seller and buyers when allocating the item to
+one group versus the others. Finally, we present an approximation scheme to
+solve the recursive equations and determine an approximately optimal and
+fair allocation efficiently.
+
+
+ comment: A shorter conference version has been accepted at the Advances in + Neural Information Processing Systems (NeurIPS) 2024 +
+
+
+
+
+ + ♻ ☆ Signature Isolation Forest + + +
+ Functional Isolation Forest (FIF) is a recent state-of-the-art Anomaly +Detection (AD) algorithm designed for functional data. It relies on a tree +partition procedure where an abnormality score is computed by projecting each +curve observation on a drawn dictionary through a linear inner product. Such +linear inner product and the dictionary are a priori choices that highly +influence the algorithm's performances and might lead to unreliable results, +particularly with complex datasets. This work addresses these challenges by +introducing \textit{Signature Isolation Forest}, a novel AD algorithm class +leveraging the rough path theory's signature transform. Our objective is to +remove the constraints imposed by FIF through the proposition of two algorithms +which specifically target the linearity of the FIF inner product and the choice +of the dictionary. We provide several numerical experiments, including a +real-world applications benchmark showing the relevance of our methods. + +
+
+
+
+
+ + ♻ ☆ DyGPrompt: Learning Feature and Time Prompts on Dynamic Graphs + + +
+ Dynamic graphs capture evolving interactions between entities, such as in +social networks, online learning platforms, and crowdsourcing projects. For +dynamic graph modeling, dynamic graph neural networks (DGNNs) have emerged as a +mainstream technique. However, they are generally pre-trained on the link +prediction task, leaving a significant gap from the objectives of downstream +tasks such as node classification. To bridge the gap, prompt-based learning has +gained traction on graphs, but most existing efforts focus on static graphs, +neglecting the evolution of dynamic graphs. In this paper, we propose +DYGPROMPT, a novel pre-training and prompt learning framework for dynamic graph +modeling. First, we design dual prompts to address the gap in both task +objectives and temporal variations across pre-training and downstream tasks. +Second, we recognize that node and time features mutually characterize each +other, and propose dual condition-nets to model the evolving node-time patterns +in downstream tasks. Finally, we thoroughly evaluate and analyze DYGPROMPT +through extensive experiments on four public datasets. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Does Refusal Training in LLMs Generalize to the Past Tense? + + +
+ Refusal training is widely used to prevent LLMs from generating harmful, +undesirable, or illegal outputs. We reveal a curious generalization gap in the +current refusal training approaches: simply reformulating a harmful request in +the past tense (e.g., "How to make a Molotov cocktail?" to "How did people make +a Molotov cocktail?") is often sufficient to jailbreak many state-of-the-art +LLMs. We systematically evaluate this method on Llama-3 8B, Claude-3.5 Sonnet, +GPT-3.5 Turbo, Gemma-2 9B, Phi-3-Mini, GPT-4o mini, GPT-4o, o1-mini, +o1-preview, and R2D2 models using GPT-3.5 Turbo as a reformulation model. For +example, the success rate of this simple attack on GPT-4o increases from 1% +using direct requests to 88% using 20 past tense reformulation attempts on +harmful requests from JailbreakBench with GPT-4 as a jailbreak judge. +Interestingly, we also find that reformulations in the future tense are less +effective, suggesting that refusal guardrails tend to consider past historical +questions more benign than hypothetical future questions. Moreover, our +experiments on fine-tuning GPT-3.5 Turbo show that defending against past +reformulations is feasible when past tense examples are explicitly included in +the fine-tuning data. Overall, our findings highlight that the widely used +alignment techniques -- such as SFT, RLHF, and adversarial training -- employed +to align the studied models can be brittle and do not always generalize as +intended. We provide code and jailbreak artifacts at +https://github.com/tml-epfl/llm-past-tense. + +
+
+ comment: Update in v3: o1-mini and o1-preview results (on top of GPT-4o and + Claude 3.5 Sonnet added in v2). We provide code and jailbreak artifacts at + https://github.com/tml-epfl/llm-past-tense +
+
+
+
+
+ + ♻ ☆ Scalable Label Distribution Learning for Multi-Label Classification + + +
+ Multi-label classification (MLC) refers to the problem of tagging a given
+instance with a set of relevant labels. Most existing MLC methods are based
+on the assumption that the correlation of two labels in each label pair is
+symmetric, which is violated in many real-world scenarios. Moreover, most
+existing methods design learning processes associated with the number of
+labels, which makes their computational complexity a bottleneck when
+scaling up to large-scale output spaces. To tackle these issues, we propose
+a novel method named Scalable Label Distribution Learning (SLDL) for
+multi-label classification, which describes different labels as
+distributions in a latent space, where the label correlation is asymmetric
+and the dimension is independent of the number of labels. Specifically,
+SLDL first converts labels into continuous distributions within a
+low-dimensional latent space and leverages an asymmetric metric to
+establish the correlation between different labels. Then, it learns the
+mapping from the feature space to the latent space, so that the
+computational complexity is no longer related to the number of labels.
+Finally, SLDL leverages a nearest-neighbor-based strategy to decode the
+latent representations and obtain the final predictions. Extensive
+experiments illustrate that SLDL achieves very competitive classification
+performance with little computational consumption.
+
+
+
+
+
+ + ♻ ☆ Foundations of Large Language Model Compression -- Part 1: Weight + Quantization + + +
+ In recent years, compression of large language models (LLMs) has emerged as +an important problem to enable language model deployment on +resource-constrained devices, reduce computational costs, and mitigate the +environmental footprint of large-scale AI infrastructure. In this paper, we lay +down the foundation for LLM quantization from a convex optimization perspective +and propose a quantization technique that builds on this foundation for optimum +quantization outcomes. Our quantization framework, CVXQ, scales to models +containing hundreds of billions of weight parameters and provides users with +the flexibility to compress models to any specified model size, post-training. +A reference implementation of CVXQ can be obtained from github.com/seannz/cvxq. + +
+
+ comment: Preprint. 17 pages, 4 figures, 5 appendices +
+
+
+
+
+ + ♻ ☆ NECOMIMI: Neural-Cognitive Multimodal EEG-informed Image Generation with + Diffusion Models + + +
+ NECOMIMI (NEural-COgnitive MultImodal EEG-Informed Image Generation with +Diffusion Models) introduces a novel framework for generating images directly +from EEG signals using advanced diffusion models. Unlike previous works that +focused solely on EEG-image classification through contrastive learning, +NECOMIMI extends this task to image generation. The proposed NERV EEG encoder +demonstrates state-of-the-art (SoTA) performance across multiple zero-shot +classification tasks, including 2-way, 4-way, and 200-way, and achieves top +results in our newly proposed Category-based Assessment Table (CAT) Score, +which evaluates the quality of EEG-generated images based on semantic concepts. +A key discovery of this work is that the model tends to generate abstract or +generalized images, such as landscapes, rather than specific objects, +highlighting the inherent challenges of translating noisy and low-resolution +EEG data into detailed visual outputs. Additionally, we introduce the CAT Score +as a new metric tailored for EEG-to-image evaluation and establish a benchmark +on the ThingsEEG dataset. This study underscores the potential of EEG-to-image +generation while revealing the complexities and challenges that remain in +bridging neural activity with visual representation. + +
+
+
+
+
+ + ♻ ☆ EIA: Environmental Injection Attack on Generalist Web Agents for Privacy + Leakage + + +
+ Generalist web agents have demonstrated remarkable potential in
+autonomously completing a wide range of tasks on real websites,
+significantly boosting human productivity. However, web tasks, such as
+booking flights, usually involve users' PII, which may be exposed to
+potential privacy risks if web agents accidentally interact with
+compromised websites, a scenario that remains largely unexplored in the
+literature. In this work, we narrow this gap by conducting the first study
+on the privacy risks of generalist web agents in adversarial environments.
+First, we present a realistic threat model for attacks on the website,
+where we consider two adversarial targets: stealing users' specific PII or
+the entire user request. Then, we propose a novel attack method, termed
+Environmental Injection Attack (EIA). EIA injects malicious content
+designed to adapt well to environments where the agents operate, and our
+work instantiates EIA specifically for privacy scenarios in web
+environments. We collect 177 action steps that involve diverse PII
+categories on realistic websites from the Mind2Web dataset, and conduct
+experiments using one of the most capable generalist web agent frameworks
+to date. The results demonstrate that EIA achieves up to 70% ASR in
+stealing specific PII and 16% ASR in stealing the full user request.
+Additionally, by assessing the stealthiness of the attack and experimenting
+with a defensive system prompt, we show that EIA is hard to detect and
+mitigate. Notably, attacks that are not well adapted for a webpage can be
+detected via human inspection, leading to our discussion about the
+trade-off between security and autonomy. However, with extra effort,
+attackers can make EIA adapt seamlessly, rendering such supervision
+ineffective. Thus, we further discuss defenses at the pre- and
+post-deployment stages of the websites without relying on human
+supervision and call for more advanced defense strategies.
+
+
+ comment: 29 pages +
+
+
+
+
+ + ♻ ☆ Graph Diffusion Transformers for Multi-Conditional Molecular Generation NeurIPS 2024 + + +
+ Inverse molecular design with diffusion models holds great potential for +advancements in material and drug discovery. Despite success in unconditional +molecular generation, integrating multiple properties such as synthetic score +and gas permeability as condition constraints into diffusion models remains +unexplored. We present the Graph Diffusion Transformer (Graph DiT) for +multi-conditional molecular generation. Graph DiT integrates an encoder to +learn numerical and categorical property representations with the +Transformer-based denoiser. Unlike previous graph diffusion models that add +noise separately on the atoms and bonds in the forward diffusion process, Graph +DiT is trained with a novel graph-dependent noise model for accurate estimation +of graph-related noise in molecules. We extensively validate Graph DiT for +multi-conditional polymer and small molecule generation. Results demonstrate +the superiority of Graph DiT across nine metrics from distribution learning to +condition control for molecular properties. A polymer inverse design task for +gas separation with feedback from domain experts further demonstrates its +practical utility. + +
+
+ comment: Accepted by NeurIPS 2024 (Oral). 21 pages, 11 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Sample and Oracle Efficient Reinforcement Learning for MDPs with + Linearly-Realizable Value Functions + + +
+ Designing sample-efficient and computationally feasible reinforcement +learning (RL) algorithms is particularly challenging in environments with large +or infinite state and action spaces. In this paper, we advance this effort by +presenting an efficient algorithm for Markov Decision Processes (MDPs) where +the state-action value function of any policy is linear in a given feature map. +This challenging setting can model environments with infinite states and +actions, strictly generalizes classic linear MDPs, and currently lacks a +computationally efficient algorithm under online access to the MDP. +Specifically, we introduce a new RL algorithm that efficiently finds a +near-optimal policy in this setting, using a number of episodes and calls to a +cost-sensitive classification (CSC) oracle that are both polynomial in the +problem parameters. Notably, our CSC oracle can be efficiently implemented when +the feature dimension is constant, representing a clear improvement over +state-of-the-art methods, which require solving non-convex problems with +horizon-many variables and can incur computational costs that are exponential +in the horizon. + +
+
+
+
+
+ + ♻ ☆ Adjusted Expected Improvement for Cumulative Regret Minimization in + Noisy Bayesian Optimization + + +
+ The expected improvement (EI) is one of the most popular acquisition
+functions for Bayesian optimization (BO) and has demonstrated good
+empirical performance in many applications for the minimization of simple
+regret. However, under the evaluation metric of cumulative regret, the
+performance of EI may not be competitive, and its existing theoretical
+regret upper bound still has room for improvement. To adapt EI for better
+performance under cumulative regret, we introduce a novel quantity called
+the evaluation cost, which is compared against the acquisition function,
+and with this, develop the expected improvement-cost (EIC) algorithm. In
+each iteration of EIC, a new point with the largest acquisition function
+value is sampled, but only if that value exceeds its evaluation cost. If no
+point meets this criterion, the current best point is resampled. This
+evaluation cost quantifies the potential downside of sampling a point,
+which is important under the cumulative regret metric as the objective
+function value in every iteration affects the performance measure. We
+establish in theory a high-probability regret upper bound of EIC based on
+the maximum information gain, which is tighter than the bound of existing
+EI-based algorithms. It is also comparable to the regret bound of other
+popular BO algorithms such as Thompson sampling (GP-TS) and upper
+confidence bound (GP-UCB). We further perform experiments to illustrate the
+improvement of EIC over several popular BO algorithms.
+
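+
+ A minimal sketch of the accept-or-resample rule described above over a
+discrete candidate set; the form of the evaluation cost is a placeholder,
+since the paper defines its own quantity.
+
+```python
+import numpy as np
+from scipy.stats import norm
+
+def expected_improvement(mu, sigma, best_f):
+    """Standard EI for minimization under a GP posterior (mu, sigma)."""
+    sigma = np.maximum(sigma, 1e-12)
+    z = (best_f - mu) / sigma
+    return (best_f - mu) * norm.cdf(z) + sigma * norm.pdf(z)
+
+def eic_select(candidates, mu, sigma, best_f, best_x, evaluation_cost):
+    """Sample the EI maximiser only if its EI exceeds the evaluation cost;
+    otherwise resample the current best point."""
+    ei = expected_improvement(mu, sigma, best_f)
+    i = int(np.argmax(ei))
+    return candidates[i] if ei[i] > evaluation_cost else best_x
+```
+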
+
+
+
+
+ + ♻ ☆ A Methodological Report on Anomaly Detection on Dynamic Knowledge Graphs + + +
+ In this paper, we explore different approaches to anomaly detection on +dynamic knowledge graphs, specifically in a microservices environment for +Kubernetes applications. Our approach explores three dynamic knowledge graph +representations: sequential data, one-hop graph structure, and two-hop graph +structure, with each representation incorporating increasingly complex +structural information. Each phase includes different machine learning and deep +learning models. We empirically analyse their performance and propose an +approach based on ensemble learning of these models. Our approach significantly +outperforms the baseline on the ISWC 2024 Dynamic Knowledge Graph Anomaly +Detection dataset, providing a robust solution for anomaly detection in dynamic +complex data. + +
+
+
+
+
+ + ♻ ☆ A Deep Generative Learning Approach for Two-stage Adaptive Robust + Optimization + + +
+ Two-stage adaptive robust optimization (ARO) is a powerful approach for +planning under uncertainty, balancing first-stage decisions with recourse +decisions made after uncertainty is realized. To account for uncertainty, +modelers typically define a simple uncertainty set over which potential +outcomes are considered. However, classical methods for defining these sets +unintentionally capture a wide range of unrealistic outcomes, resulting in +overly-conservative and costly planning in anticipation of unlikely +contingencies. In this work, we introduce AGRO, a solution algorithm that +performs adversarial generation for two-stage adaptive robust optimization +using a variational autoencoder. AGRO generates high-dimensional contingencies +that are simultaneously adversarial and realistic, improving the robustness of +first-stage decisions at a lower planning cost than standard methods. To ensure +generated contingencies lie in high-density regions of the uncertainty +distribution, AGRO defines a tight uncertainty set as the image of "latent" +uncertainty sets under the VAE decoding transformation. Projected gradient +ascent is then used to maximize recourse costs over the latent uncertainty sets +by leveraging differentiable optimization methods. We demonstrate the +cost-efficiency of AGRO by applying it to both a synthetic +production-distribution problem and a real-world power system expansion +setting. We show that AGRO outperforms the standard column-and-constraint +algorithm by up to 1.8% in production-distribution planning and up to 11.6% in +power system expansion. + +
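+
+ A schematic of the adversarial generation step described above: projected
+gradient ascent in a VAE latent set to find a realistic, high-recourse-cost
+contingency. The ball-shaped latent set and a differentiable recourse_cost
+(e.g. obtained via a differentiable optimization layer) are simplifying
+assumptions.
+
+```python
+import torch
+
+def adversarial_contingency(decoder, recourse_cost, z_init,
+                            radius=1.0, steps=50, lr=0.05):
+    """Maximize the (scalar) recourse cost over a latent ball, then decode."""
+    z = z_init.clone().requires_grad_(True)
+    for _ in range(steps):
+        cost = recourse_cost(decoder(z))
+        (grad,) = torch.autograd.grad(cost, z)
+        with torch.no_grad():
+            z += lr * grad                    # ascent step on the recourse cost
+            norm = z.norm()
+            if norm > radius:                 # project back into the latent set
+                z *= radius / norm
+    return decoder(z).detach()                # realistic worst-case contingency
+```
+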
+
+
+
+
+ + ♻ ☆ PARAMANU-AYN: Pretrain from scratch or Continual Pretraining of LLMs for + Legal Domain Adaptation? + + +
+ In this paper, we present Paramanu-Ayn, a collection of legal language
+models trained exclusively on Indian legal case documents. This
+97-million-parameter Auto-Regressive (AR) decoder-only model was pretrained
+from scratch with a context size of 8192 on a single GPU for just 185
+hours, achieving an efficient MFU of 41.35. We also developed a legal
+domain specialized BPE tokenizer. We evaluated our model using perplexity
+and zero-shot tasks: case judgment prediction with explanation and
+abstractive case summarization. Paramanu-Ayn outperformed Llama-2 7B and
+Gemini-Pro on the case judgment prediction with explanation task by nearly
+2 percentage points in test accuracy, despite being 72 times smaller. In
+zero-shot abstractive summarization, it surpassed decoder-only LLMs
+generating fixed-length summaries (5000 tokens) by over 10 percentage
+points in BLEU and METEOR metrics, and by nearly 4 percentage points in
+BERTScore. Further evaluations on zero-shot commonsense and mathematical
+benchmarks showed that Paramanu-Ayn excelled despite being trained
+exclusively on legal documents, outperforming Llama-1, Llama-2, and Falcon
+on AGIEVAL-AQuA-RAT and AGIEVAL-SAT-Math tasks. We also instruction-tuned
+our model on 10,763 diverse legal tasks, including legal clause generation,
+legal drafting, case summarization, etc. The Paramanu-Ayn-instruct model
+scored above 8 out of 10 in clarity, relevance, completeness, and legal
+reasoning metrics as judged by GPT-3.5-Turbo. We found that our models were
+able to learn drafting knowledge and generalize to drafting legal contracts
+and legal clauses with limited instruction-tuning. Hence, we conclude that
+for a strong domain-specialized generative language model (such as a legal
+one), domain-specialized pretraining from scratch is more cost-effective
+and environmentally friendly, and remains competitive with, or even better
+than, adapting larger LLMs for legal-domain tasks.
+
+
+
+
+
+ + ♻ ☆ Advantage Alignment Algorithms + + +
+ Artificially intelligent agents are increasingly being integrated into human +decision-making: from large language model (LLM) assistants to autonomous +vehicles. These systems often optimize their individual objective, leading to +conflicts, particularly in general-sum games where naive reinforcement learning +agents empirically converge to Pareto-suboptimal Nash equilibria. To address +this issue, opponent shaping has emerged as a paradigm for finding socially +beneficial equilibria in general-sum games. In this work, we introduce +Advantage Alignment, a family of algorithms derived from first principles that +perform opponent shaping efficiently and intuitively. We achieve this by +aligning the advantages of interacting agents, increasing the probability of +mutually beneficial actions when their interaction has been positive. We prove +that existing opponent shaping methods implicitly perform Advantage Alignment. +Compared to these methods, Advantage Alignment simplifies the mathematical +formulation of opponent shaping, reduces the computational burden and extends +to continuous action domains. We demonstrate the effectiveness of our +algorithms across a range of social dilemmas, achieving state-of-the-art +cooperation and robustness against exploitation. + +
+
+ comment: 25 Pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Fast Matrix Multiplications for Lookup Table-Quantized LLMs EMNLP 2024 + + +
+ The deployment of large language models (LLMs) is often constrained by memory +bandwidth, where the primary bottleneck is the cost of transferring model +parameters from the GPU's global memory to its registers. When coupled with +custom kernels that fuse the dequantization and matmul operations, weight-only +quantization can thus enable faster inference by reducing the amount of memory +movement. However, developing high-performance kernels for weight-quantized +LLMs presents substantial challenges, especially when the weights are +compressed to non-evenly-divisible bit widths (e.g., 3 bits) with non-uniform, +lookup table (LUT) quantization. This paper describes FLUTE, a flexible lookup +table engine for LUT-quantized LLMs, which uses offline restructuring of the +quantized weight matrix to minimize bit manipulations associated with +unpacking, and vectorization and duplication of the lookup table to mitigate +shared memory bandwidth constraints. At batch sizes < 32 and quantization group +size of 128 (typical in LLM inference), the FLUTE kernel can be 2-4x faster +than existing GEMM kernels. As an application of FLUTE, we explore a simple +extension to lookup table-based NormalFloat quantization and apply it to +quantize LLaMA3 to various configurations, obtaining competitive quantization +performance against strong baselines while obtaining an end-to-end throughput +increase of 1.5 to 2 times. + +
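+
+ For intuition, a reference (non-fused) version of lookup-table
+dequantization; FLUTE's contribution is the fast fused GPU kernel and the
+offline weight restructuring, which this sketch does not reproduce, and the
+tensor layout here is an assumption.
+
+```python
+import torch
+
+def lut_dequantize(codes, lut, group_size=128):
+    """codes: (rows, cols) integer n-bit indices; lut: (rows, cols // group_size,
+    2**bits) per-group codebooks (e.g. scaled NormalFloat levels)."""
+    rows, cols = codes.shape
+    groups = cols // group_size
+    idx = codes.view(rows, groups, group_size).long()
+    return torch.gather(lut, 2, idx).view(rows, cols)
+
+# A LUT-quantized linear layer is then simply: y = x @ lut_dequantize(codes, lut).T
+```
+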
+
+ comment: EMNLP 2024 (Findings) +
+
+
+
+
+ + ♻ ☆ Nebula: A discourse aware Minecraft Builder EMNLP 2024 + + +
+ When engaging in collaborative tasks, humans efficiently exploit the
+semantic structure of a conversation to optimize verbal and nonverbal
+interactions. But in recent "language to code" or "language to action"
+models, this information is lacking. We show how incorporating the prior
+discourse and nonlinguistic context of a conversation situated in a
+nonlinguistic environment can improve the "language to action" component of
+such interactions. We finetune an LLM to predict actions based on prior
+context; our model, Nebula, doubles the net-action F1 score over the
+baseline on this task from Jayannavar et al. (2020). We also investigate
+our model's ability to construct shapes and understand location
+descriptions using a synthetic dataset.
+
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ LongForm: Effective Instruction Tuning with Reverse Instructions EMNLP 2024 + + +
+ Instruction tuning enables language models to more effectively generalize
+and better follow user intent. However, obtaining instruction data is
+costly and challenging. Prior work employs methods such as expensive human
+annotation, crowd-sourced datasets with alignment issues, and generating
+noisy examples via LLMs. We introduce the LongForm-C dataset, which is
+created by reverse instructions: we generate instructions via LLMs for
+human-written corpus examples. First, we select a diverse set of
+human-written documents from corpora such as C4 and Wikipedia; then we
+generate instructions for these documents via LLMs. This approach provides
+a cheaper and cleaner instruction-tuning dataset with natural output and
+one suitable for long text generation. Our models outperform 10x larger
+language models without instruction tuning on tasks such as story/recipe
+generation and long-form question answering. Moreover, LongForm models
+outperform prior instruction-tuned models such as FLAN-T5 and Alpaca by a
+large margin, and improve language understanding capabilities further. We
+publicly release our data and models: https://github.com/akoksal/LongForm.
+
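+
+ A hypothetical sketch of the reverse-instructions recipe: ask an LLM which
+instruction could have produced a given human-written document, then pair
+the two as an instruction-tuning example. The model name and prompt wording
+are placeholders, not the paper's exact setup.
+
+```python
+from openai import OpenAI
+
+client = OpenAI()
+
+def reverse_instruction(document: str) -> dict:
+    prompt = (
+        "Below is a text written by a person. Write a single instruction that "
+        "could plausibly have produced this text as its answer.\n\n"
+        f"Text: {document}\n\nInstruction:"
+    )
+    resp = client.chat.completions.create(
+        model="gpt-4o-mini",  # placeholder model
+        messages=[{"role": "user", "content": prompt}],
+    )
+    instruction = resp.choices[0].message.content.strip()
+    # the (instruction, document) pair becomes one instruction-tuning example
+    return {"instruction": instruction, "output": document}
+```
+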
+
+ comment: EMNLP 2024 Findings. This version extends the training with recent + LLMs, evaluation with new metrics, and NLU tasks +
+
+
+
+
+ + ♻ ☆ Residual-based Attention Physics-informed Neural Networks for + Spatio-Temporal Ageing Assessment of Transformers Operated in Renewable Power + Plants + + +
+ Transformers are crucial for reliable and efficient power system operations, +particularly in supporting the integration of renewable energy. Effective +monitoring of transformer health is critical to maintain grid stability and +performance. Thermal insulation ageing is a key transformer failure mode, which +is generally tracked by monitoring the hotspot temperature (HST). However, HST +measurement is complex, costly, and often estimated from indirect measurements. +Existing HST models focus on space-agnostic thermal models, providing +worst-case HST estimates. This article introduces a spatio-temporal model for +transformer winding temperature and ageing estimation, which leverages +physics-based partial differential equations (PDEs) with data-driven Neural +Networks (NN) in a Physics Informed Neural Networks (PINNs) configuration to +improve prediction accuracy and acquire spatio-temporal resolution. The +computational accuracy of the PINN model is improved through the implementation +of the Residual-Based Attention (PINN-RBA) scheme that accelerates the PINN +model convergence. The PINN-RBA model is benchmarked against self-adaptive +attention schemes and classical vanilla PINN configurations. For the first +time, PINN based oil temperature predictions are used to estimate +spatio-temporal transformer winding temperature values, validated through PDE +numerical solution and fiber optic sensor measurements. Furthermore, the +spatio-temporal transformer ageing model is inferred, which supports +transformer health management decision-making. Results are validated with a +distribution transformer operating on a floating photovoltaic power plant. + +
+
+ comment: 23 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ Embodied-RAG: General Non-parametric Embodied Memory for Retrieval and + Generation + + +
+ There is no limit to how much a robot might explore and learn, but all of
+that knowledge needs to be searchable and actionable. Within language
+research, retrieval-augmented generation (RAG) has become the workhorse of
+large-scale non-parametric knowledge; however, existing techniques do not
+directly transfer to the embodied domain, where data is multimodal and
+highly correlated, and perception requires abstraction.
+ To address these challenges, we introduce Embodied-RAG, a framework that
+enhances the foundational model of an embodied agent with a non-parametric
+memory system capable of autonomously constructing hierarchical knowledge
+for both navigation and language generation. Embodied-RAG handles a full
+range of spatial and semantic resolutions across diverse environments and
+query types, whether for a specific object or a holistic description of
+ambiance. At its core, Embodied-RAG's memory is structured as a semantic
+forest, storing language descriptions at varying levels of detail. This
+hierarchical organization allows the system to efficiently generate
+context-sensitive outputs across different robotic platforms. We
+demonstrate that Embodied-RAG effectively bridges RAG to the robotics
+domain, successfully handling over 200 explanation and navigation queries
+across 19 environments, highlighting its promise as a general-purpose
+non-parametric system for embodied agents.
+
+
+ comment: Web: https://quanting-xie.github.io/Embodied-RAG-web/ +
+
+
+
+
+ + ♻ ☆ LDMol: Text-to-Molecule Diffusion Model with Structurally Informative + Latent Space + + +
+ With the emergence of diffusion models as the frontline of generative
+models, many researchers have proposed molecule generation techniques with
+conditional diffusion models. However, the unavoidable discreteness of a
+molecule makes it difficult for a diffusion model to connect raw data with
+highly complex conditions like natural language. To address this, we
+present a novel latent diffusion model dubbed LDMol for text-conditioned
+molecule generation. LDMol comprises a molecule autoencoder that produces a
+learnable and structurally informative feature space, and a natural
+language-conditioned latent diffusion model. In particular, recognizing
+that multiple SMILES notations can represent the same molecule, we employ a
+contrastive learning strategy to extract a feature space that is aware of
+the unique characteristics of the molecule structure. LDMol outperforms the
+existing baselines on the text-to-molecule generation benchmark, suggesting
+that diffusion models can outperform autoregressive models in text data
+generation with a better choice of the latent domain. Furthermore, we show
+that LDMol can be applied to downstream tasks such as molecule-to-text
+retrieval and text-guided molecule editing, demonstrating its versatility
+as a diffusion model.
+
+
+
+
+
+ + ♻ ☆ Learning an Actionable Discrete Diffusion Policy via Large-Scale + Actionless Video Pre-Training NeurIPS 2024 + + +
+ Learning a generalist embodied agent capable of completing multiple tasks +poses challenges, primarily stemming from the scarcity of action-labeled +robotic datasets. In contrast, a vast amount of human videos exist, capturing +intricate tasks and interactions with the physical world. Promising prospects +arise for utilizing actionless human videos for pre-training and transferring +the knowledge to facilitate robot policy learning through limited robot +demonstrations. However, it remains a challenge due to the domain gap between +humans and robots. Moreover, it is difficult to extract useful information +representing the dynamic world from human videos, because of its noisy and +multimodal data structure. In this paper, we introduce a novel framework to +tackle these challenges, which leverages a unified discrete diffusion to +combine generative pre-training on human videos and policy fine-tuning on a +small number of action-labeled robot videos. We start by compressing both human +and robot videos into unified video tokens. In the pre-training stage, we +employ a discrete diffusion model with a mask-and-replace diffusion strategy to +predict future video tokens in the latent space. In the fine-tuning stage, we +harness the imagined future videos to guide low-level action learning with a +limited set of robot data. Experiments demonstrate that our method generates +high-fidelity future videos for planning and enhances the fine-tuned policies +compared to previous state-of-the-art approaches with superior performance. Our +project website is available at https://video-diff.github.io/. + +
+
+ comment: Accepted by NeurIPS 2024. 24 pages +
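+
+ The pre-training stage above rests on a mask-and-replace corruption of
+ discrete video tokens. The short Python sketch below shows one plausible
+ forward-corruption step under that strategy; the codebook size, the [MASK]
+ id, and the linear schedules are assumptions for illustration, not the
+ paper's actual transition matrices or model.
+
+import numpy as np
+
+VOCAB, MASK_ID = 1024, 1024          # codebook tokens 0..1023, extra mask id
+rng = np.random.default_rng(0)
+
+def corrupt(tokens: np.ndarray, t: float, gamma_max=0.9, beta_max=0.1):
+    """With prob gamma(t) replace a token by [MASK], with prob beta(t) by a
+    random token, otherwise keep it (t in [0, 1])."""
+    gamma, beta = gamma_max * t, beta_max * t
+    u = rng.random(tokens.shape)
+    out = tokens.copy()
+    out[u < gamma] = MASK_ID
+    replace = (u >= gamma) & (u < gamma + beta)
+    out[replace] = rng.integers(0, VOCAB, size=int(replace.sum()))
+    return out
+
+video_tokens = rng.integers(0, VOCAB, size=(4, 16 * 16))   # 4 frames of tokens
+noisy = corrupt(video_tokens, t=0.5)
+print((noisy == MASK_ID).mean())      # fraction of masked positions
+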
+
+
+
+
+ + ♻ ☆ On Sensitivity of Learning with Limited Labelled Data to the Effects of + Randomness: Impact of Interactions and Systematic Choices EMNLP'24 + + +
+ While learning with limited labelled data can improve performance when the +labels are lacking, it is also sensitive to the effects of uncontrolled +randomness introduced by so-called randomness factors (e.g., varying order of +data). We propose a method to systematically investigate the effects of +randomness factors while taking the interactions between them into +consideration. To measure the true effects of an individual randomness factor, +our method mitigates the effects of other factors and observes how the +performance varies across multiple runs. Applying our method to multiple +randomness factors across in-context learning and fine-tuning approaches on 7 +representative text classification tasks and meta-learning on 3 tasks, we show +that: 1) disregarding interactions between randomness factors in existing works +caused inconsistent findings due to incorrect attribution of the effects of +randomness factors, such as disproving the consistent sensitivity of in-context +learning to sample order even with random sample selection; and 2) besides +mutual interactions, the effects of randomness factors, especially sample +order, are also dependent on more systematic choices unexplored in existing +works, such as number of classes, samples per class or choice of prompt format. + +
+
+ comment: Accepted to the EMNLP'24 Main Conference +
+
+
+
+
+ + ♻ ☆ AtomSurf : Surface Representation for Learning on Protein Structures + + +
+ While there has been significant progress in evaluating and comparing +different representations for learning on protein data, the role of +surface-based learning approaches remains not well-understood. In particular, +there is a lack of direct and fair benchmark comparison between the best +available surface-based learning methods against alternative representations +such as graphs. Moreover, the few existing surface-based approaches either use +surface information in isolation or, at best, perform global pooling between +surface and graph-based architectures. + In this work, we fill this gap by first adapting a state-of-the-art surface +encoder for protein learning tasks. We then perform a direct and fair +comparison of the resulting method against alternative approaches within the +Atom3D benchmark, highlighting the limitations of pure surface-based learning. +Finally, we propose an integrated approach, which allows learned feature +sharing between graphs and surface representations on the level of nodes and +vertices $\textit{across all layers}$. + We demonstrate that the resulting architecture achieves state-of-the-art +results on all tasks in the Atom3D benchmark, while adhering to the strict +benchmark protocol, as well as more broadly on binding site identification and +binding pocket classification. Furthermore, we use coarsened surfaces and +optimize our approach for efficiency, making our tool competitive in training +and inference time with existing techniques. Our code and data can be found +online: $\texttt{github.com/Vincentx15/atomsurf}$ + +
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ PowerPM: Foundation Model for Power Systems + + +
+ The emergence of abundant electricity time series (ETS) data provides ample
+opportunities for various applications in the power systems, including
+demand-side management, grid stability, and consumer behavior analysis. Deep
+learning models have advanced ETS modeling by effectively capturing sequence
+dependence. Nevertheless, learning a generic representation of ETS data for
+various applications remains challenging due to the inherently complex
+hierarchical structure of ETS data. Moreover, ETS data exhibits intricate
+temporal dependencies and is susceptible to the influence of exogenous
+variables. Furthermore, different instances exhibit diverse electricity
+consumption behavior. In this paper, we propose a foundation model PowerPM to
+model ETS data, providing a large-scale, off-the-shelf model for power systems.
+PowerPM consists of a temporal encoder and a hierarchical encoder. The temporal
+encoder captures temporal dependencies in ETS data while accounting for
+exogenous variables. The hierarchical encoder models correlations across the
+hierarchy. Furthermore, PowerPM leverages a novel self-supervised pretraining
+framework consisting of masked ETS modeling and dual-view contrastive learning,
+which enables PowerPM to capture temporal dependencies within ETS windows and
+to be aware of discrepancies across ETS windows, providing two different
+perspectives for learning generic representations. Our experiments involve five
+real-world scenario datasets, comprising private and public data. Through
+pre-training on massive ETS data, PowerPM achieves SOTA performance on diverse
+downstream tasks within the private dataset. Impressively, when transferred to
+the public datasets, PowerPM maintains its superiority, showcasing its
+remarkable generalization ability across various tasks and domains. Moreover,
+ablation studies and few-shot experiments provide additional evidence of the
+effectiveness of our model.
+
+
+ comment: 23 pages, 5 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Solution of the Probabilistic Lambert Problem: Connections with Optimal + Mass Transport, Schrödinger Bridge and Reaction-Diffusion PDEs + + +
+ The Lambert problem originated in orbital mechanics. It concerns determining
+the initial velocity for a boundary value problem involving the dynamical
+constraint due to gravitational potential with additional time horizon and
+endpoint position constraints. Its solution has application in transferring a
+spacecraft from a given initial to a given terminal position within prescribed
+flight time via velocity control. We consider a probabilistic variant of the
+Lambert problem where knowledge of the endpoint position constraints is
+replaced by knowledge of their respective joint probability density functions.
+We show that the Lambert problem with endpoint joint probability density
+constraints is a generalized optimal mass transport (OMT) problem, thereby
+connecting this classical astrodynamics problem with a burgeoning area of
+research in modern stochastic control and stochastic machine learning. This
+newfound connection allows us to rigorously establish the existence and
+uniqueness of solution for the probabilistic Lambert problem. The same
+connection also helps to numerically solve the probabilistic Lambert problem
+via diffusion regularization, i.e., by leveraging further connection of the OMT
+with the Schr\"odinger bridge problem (SBP). This also shows that the
+probabilistic Lambert problem with additive dynamic process noise is a
+generalized SBP, and can be solved numerically using the so-called
+Schr\"odinger factors, as we do in this work. Our analysis leads to solving a
+system of reaction-diffusion PDEs where the gravitational potential appears as
+the reaction rate.
+
+
+
+
+
+ + ♻ ☆ Source-Free Domain Adaptation Guided by Vision and Vision-Language + Pre-Training ICCV + + +
+ Source-free domain adaptation (SFDA) aims to adapt a source model trained on
+a fully-labeled source domain to a related but unlabeled target domain. While
+the source model is a key avenue for acquiring target pseudolabels, the
+generated pseudolabels may exhibit source bias. In the conventional SFDA
+pipeline, a feature extractor pre-trained on a large dataset (e.g., ImageNet)
+is used to initialize the source model at the start of source training and is
+subsequently discarded. Despite having diverse features important for
+generalization, the pre-trained feature extractor can overfit to the source
+data distribution during source training and forget relevant target domain
+knowledge. Rather than discarding this valuable knowledge, we introduce an
+integrated framework to incorporate pre-trained networks into the target
+adaptation process. The proposed framework is flexible and allows us to plug
+modern pre-trained networks into the adaptation process to leverage their
+stronger representation learning capabilities. For adaptation, we propose the
+Co-learn algorithm to improve target pseudolabel quality collaboratively
+through the source model and a pre-trained feature extractor. Building on the
+recent success of the vision-language model CLIP in zero-shot image
+recognition, we present an extension Co-learn++ to further incorporate CLIP's
+zero-shot classification decisions. We evaluate on 4 benchmark datasets and
+include more challenging scenarios such as open-set, partial-set and
+open-partial SFDA. Experimental results demonstrate that our proposed strategy
+improves adaptation performance and can be successfully integrated with
+existing SFDA methods. Project code is available at
+https://github.com/zwenyu/colearn-plus.
+
+
+ comment: Extension of ICCV paper arXiv:2212.07585; Published at IJCV +
+
+
+
+
+ + ♻ ☆ Symbolic State Partitioning for Reinforcement Learning + + +
+ Tabular reinforcement learning methods cannot operate directly on continuous +state spaces. One solution for this problem is to partition the state space. A +good partitioning enables generalization during learning and more efficient +exploitation of prior experiences. Consequently, the learning process becomes +faster and produces more reliable policies. However, partitioning introduces +approximation, which is particularly harmful in the presence of nonlinear +relations between state components. An ideal partition should be as coarse as +possible, while capturing the key structure of the state space for the given +problem. This work extracts partitions from the environment dynamics by +symbolic execution. We show that symbolic partitioning improves state space +coverage with respect to environmental behavior and allows reinforcement +learning to perform better for sparse rewards. We evaluate symbolic state space +partitioning with respect to precision, scalability, learning agent performance +and state space coverage for the learnt policies. + +
+
+
+
+
+ + ♻ ☆ miniCTX: Neural Theorem Proving with (Long-)Contexts + + +
+ Real-world formal theorem proving often depends on a wealth of context, +including definitions, lemmas, comments, file structure, and other information. +We introduce miniCTX, which tests a model's ability to prove formal +mathematical theorems that depend on new context that is not seen during +training. miniCTX contains theorems sourced from real Lean projects and +textbooks, each associated with a context that can span tens of thousands of +tokens. Models are tasked with proving a theorem given access to code from the +theorem's repository, which contains context that is needed for the proof. As a +baseline for miniCTX, we tested fine-tuning and prompting methods that +condition theorem proving on preceding context. Both approaches substantially +outperform traditional methods that rely solely on state information. We found +that this ability to use context is not captured by previous benchmarks such as +miniF2F. Alongside miniCTX, we offer ntp-toolkit for automatically extracting +and annotating theorem proving data, making it easy to add new projects into +miniCTX to ensure that contexts are not seen during training. miniCTX offers a +challenging and realistic evaluation of neural theorem provers. + +
+
+
+
+
+ + ♻ ☆ A Causal Bayesian Network and Probabilistic Programming Based Reasoning + Framework for Robot Manipulation Under Uncertainty ICRA 2025 + + +
+ Robot object manipulation in real-world environments is challenging because +robot operation must be robust to a range of sensing, estimation, and actuation +uncertainties to avoid potentially unsafe and costly mistakes that are a +barrier to their adoption. In this paper, we propose a flexible and +generalisable physics-informed causal Bayesian network (CBN) based framework +for a robot to probabilistically reason about candidate manipulation actions, +to enable robot decision-making robust to arbitrary robot system uncertainties +-- the first of its kind to use a probabilistic programming language +implementation. Using experiments in high-fidelity Gazebo simulation of an +exemplar block stacking task, we demonstrate our framework's ability to: (1) +predict manipulation outcomes with high accuracy (Pred Acc: 88.6%); and, (2) +perform greedy next-best action selection with 94.2% task success rate. We also +demonstrate our framework's suitability for real-world robot systems with a +domestic robot. Thus, we show that by combining probabilistic causal modelling +with physics simulations, we can make robot manipulation more robust to system +uncertainties and hence more feasible for real-world applications. Further, our +generalised reasoning framework can be used and extended for future robotics +and causality research. + +
+
+ comment: 7 pages, 7 figures, submitted to the 2025 IEEE Conference on Robotics + and Automation (ICRA 2025) +
+
+
+
+
+ + ♻ ☆ Regret-Optimal Federated Transfer Learning for Kernel Regression with + Applications in American Option Pricing + + +
+ We propose an optimal iterative scheme for federated transfer learning, where +a central planner has access to datasets ${\cal D}_1,\dots,{\cal D}_N$ for the +same learning model $f_{\theta}$. Our objective is to minimize the cumulative +deviation of the generated parameters $\{\theta_i(t)\}_{t=0}^T$ across all $T$ +iterations from the specialized parameters +$\theta^\star_{1},\ldots,\theta^\star_N$ obtained for each dataset, while +respecting the loss function for the model $f_{\theta(T)}$ produced by the +algorithm upon halting. We only allow for continual communication between each +of the specialized models (nodes/agents) and the central planner (server), at +each iteration (round). For the case where the model $f_{\theta}$ is a +finite-rank kernel regression, we derive explicit updates for the +regret-optimal algorithm. By leveraging symmetries within the regret-optimal +algorithm, we further develop a nearly regret-optimal heuristic that runs with +$\mathcal{O}(Np^2)$ fewer elementary operations, where $p$ is the dimension of +the parameter space. Additionally, we investigate the adversarial robustness of +the regret-optimal algorithm showing that an adversary which perturbs $q$ +training pairs by at-most $\varepsilon>0$, across all training sets, cannot +reduce the regret-optimal algorithm's regret by more than +$\mathcal{O}(\varepsilon q \bar{N}^{1/2})$, where $\bar{N}$ is the aggregate +number of training pairs. To validate our theoretical findings, we conduct +numerical experiments in the context of American option pricing, utilizing a +randomly generated finite-rank kernel. + +
+
+ comment: 51 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on + Graphs + + +
+ Large language models (LLMs), while exhibiting exceptional performance, +suffer from hallucinations, especially on knowledge-intensive tasks. Existing +works propose to augment LLMs with individual text units retrieved from +external knowledge corpora to alleviate the issue. However, in many domains, +texts are interconnected (e.g., academic papers in a bibliographic graph are +linked by citations and co-authorships) which form a (text-attributed) graph. +The knowledge in such graphs is encoded not only in single texts/nodes but also +in their associated connections. To facilitate the research of augmenting LLMs +with graphs, we manually construct a Graph Reasoning Benchmark dataset called +GRBench, containing 1,740 questions that can be answered with the knowledge +from 10 domain graphs. Then, we propose a simple and effective framework called +Graph Chain-of-thought (Graph-CoT) to augment LLMs with graphs by encouraging +LLMs to reason on the graph iteratively. Each Graph-CoT iteration consists of +three sub-steps: LLM reasoning, LLM-graph interaction, and graph execution. We +conduct systematic experiments with three LLM backbones on GRBench, where +Graph-CoT outperforms the baselines consistently. The code is available at +https://github.com/PeterGriffinJin/Graph-CoT. + +
+
+ comment: 21 pages. Code: https://github.com/PeterGriffinJin/Graph-CoT +
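+
+ Since the abstract spells out the three sub-steps of each Graph-CoT
+ iteration, a toy Python loop may help fix the control flow. The llm() stub,
+ the NEIGHBORS/ANSWER command format, and the dictionary graph are
+ placeholders for exposition, not the released implementation.
+
+def llm(prompt: str) -> str:
+    # stand-in for a real LLM call: ask for neighbors once, then answer
+    return "NEIGHBORS paper_1" if "Step 1" in prompt else "ANSWER paper_2"
+
+def execute(graph: dict, command: str) -> str:
+    op, arg = command.split(maxsplit=1)       # graph execution sub-step
+    if op == "NEIGHBORS":
+        return f"neighbors of {arg}: {sorted(graph.get(arg, []))}"
+    return ""
+
+def graph_cot(graph: dict, question: str, max_steps: int = 5) -> str:
+    context = question
+    for step in range(1, max_steps + 1):
+        # LLM reasoning + LLM-graph interaction: the model emits the next action
+        command = llm(f"Step {step}. Context so far:\n{context}\nNext action?")
+        if command.startswith("ANSWER"):
+            return command.removeprefix("ANSWER ").strip()
+        context += "\n" + execute(graph, command)
+    return "no answer found"
+
+g = {"paper_1": ["paper_2", "paper_3"]}
+print(graph_cot(g, "Which papers are linked to paper_1?"))
+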
+
+
+
+
+ + ♻ ☆ A Systematic Survey and Critical Review on Evaluating Large Language + Models: Challenges, Limitations, and Recommendations EMNLP 2024 + + +
+ Large Language Models (LLMs) have recently gained significant attention due +to their remarkable capabilities in performing diverse tasks across various +domains. However, a thorough evaluation of these models is crucial before +deploying them in real-world applications to ensure they produce reliable +performance. Despite the well-established importance of evaluating LLMs in the +community, the complexity of the evaluation process has led to varied +evaluation setups, causing inconsistencies in findings and interpretations. To +address this, we systematically review the primary challenges and limitations +causing these inconsistencies and unreliable evaluations in various steps of +LLM evaluation. Based on our critical review, we present our perspectives and +recommendations to ensure LLM evaluations are reproducible, reliable, and +robust. + +
+
+ comment: Accepted at EMNLP 2024 (Main Conference) +
+
+
+
+
+ + ♻ ☆ Forecasting Disease Progression with Parallel Hyperplanes in + Longitudinal Retinal OCT MICCAI 2024 + + +
+ Predicting future disease progression risk from medical images is challenging
+due to patient heterogeneity and subtle or unknown imaging biomarkers.
+Moreover, deep learning (DL) methods for survival analysis are susceptible to
+image domain shifts across scanners. We tackle these issues in the task of
+predicting late dry Age-related Macular Degeneration (dAMD) onset from retinal
+OCT scans. We propose a novel DL method for survival prediction to jointly
+predict from the current scan a risk score, inversely related to
+time-to-conversion, and the probability of conversion within a time interval
+$t$. It uses a family of parallel hyperplanes generated by parameterizing the
+bias term as a function of $t$. In addition, we develop unsupervised losses
+based on intra-subject image pairs to ensure that risk scores increase over
+time and that future conversion predictions are consistent with AMD stage
+prediction using actual scans of future visits. Such losses enable
+data-efficient fine-tuning of the trained model on new unlabeled datasets
+acquired with a different scanner. Extensive evaluation on two large datasets
+acquired with different scanners resulted in mean AUROCs of 0.82 for Dataset-1
+and 0.83 for Dataset-2, across prediction intervals of 6, 12, and 24 months.
+
+
+ comment: accepted in MICCAI 2024 +
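+
+ The parallel-hyperplane construction can be pictured with a tiny numerical
+ example: a shared direction w yields the risk score, and a monotone bias b(t)
+ turns it into the probability of conversion within t. The feature vector, the
+ linear form of b(t), and all constants below are assumptions for illustration
+ only, not values from the paper.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+w = rng.normal(size=16)                  # shared hyperplane normal (risk direction)
+
+def bias(t_months, a=0.15, c=-3.0):      # assumed monotone parameterization of b(t)
+    return a * t_months + c
+
+def risk_score(features):
+    return float(features @ w)           # higher score ~ shorter time-to-conversion
+
+def p_conversion_within(features, t_months):
+    return 1.0 / (1.0 + np.exp(-(risk_score(features) + bias(t_months))))
+
+x = rng.normal(size=16)                  # stand-in embedding of a retinal OCT scan
+for t in (6, 12, 24):
+    print(t, round(p_conversion_within(x, t), 3))
+# because b(t) increases with t, predicted probabilities are non-decreasing in t
+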
+
+
+
+
+ + ♻ ☆ Large Language Models on Graphs: A Comprehensive Survey + + +
+ Large language models (LLMs), such as GPT4 and LLaMA, are creating +significant advancements in natural language processing, due to their strong +text encoding/decoding ability and newly found emergent capability (e.g., +reasoning). While LLMs are mainly designed to process pure texts, there are +many real-world scenarios where text data is associated with rich structure +information in the form of graphs (e.g., academic networks, and e-commerce +networks) or scenarios where graph data is paired with rich textual information +(e.g., molecules with descriptions). Besides, although LLMs have shown their +pure text-based reasoning ability, it is underexplored whether such ability can +be generalized to graphs (i.e., graph-based reasoning). In this paper, we +provide a systematic review of scenarios and techniques related to large +language models on graphs. We first summarize potential scenarios of adopting +LLMs on graphs into three categories, namely pure graphs, text-attributed +graphs, and text-paired graphs. We then discuss detailed techniques for +utilizing LLMs on graphs, including LLM as Predictor, LLM as Encoder, and LLM +as Aligner, and compare the advantages and disadvantages of different schools +of models. Furthermore, we discuss the real-world applications of such methods +and summarize open-source codes and benchmark datasets. Finally, we conclude +with potential future research directions in this fast-growing field. The +related source can be found at +https://github.com/PeterGriffinJin/Awesome-Language-Model-on-Graphs. + +
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ Synthetic continued pretraining + + +
+ Pretraining on large-scale, unstructured internet text enables language +models to acquire a significant amount of world knowledge. However, this +knowledge acquisition is data-inefficient--to learn a given fact, models must +be trained on hundreds to thousands of diverse representations of it. This +poses a challenge when adapting a pretrained model to a small corpus of +domain-specific documents, where each fact may appear rarely or only once. We +propose to bridge this gap with synthetic continued pretraining: using the +small domain-specific corpus to synthesize a large corpus more amenable to +learning, and then performing continued pretraining on the synthesized corpus. +We instantiate this proposal with EntiGraph, a synthetic data augmentation +algorithm that extracts salient entities from the source documents and then +generates diverse text by drawing connections between the sampled entities. +Synthetic continued pretraining with EntiGraph enables a language model to +answer questions and follow generic instructions related to the source +documents without access to them. If, instead, the source documents are +available at inference time, we show that the knowledge acquired through our +approach compounds with retrieval-augmented generation. To better understand +these results, we build a simple mathematical model of EntiGraph, and show how +synthetic data augmentation can "rearrange" knowledge to enable more +data-efficient learning. + +
+
+ comment: Updated organization of experimental results and methods + introduction. Released the dataset and model weights artifact +
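+
+ A rough Python sketch of the EntiGraph-style augmentation loop described
+ above: pull salient entities from each document, then ask a generator to
+ write passages connecting sampled entity pairs. extract_entities() and
+ generate() are crude placeholders rather than the paper's prompts or models.
+
+import itertools, random
+
+def extract_entities(document: str) -> list:
+    # placeholder: pretend capitalized words are the salient entities
+    return sorted({w.strip(".,") for w in document.split() if w[:1].isupper()})
+
+def generate(prompt: str) -> str:
+    return f"[synthetic passage for: {prompt}]"   # stand-in for an LLM call
+
+def entigraph_corpus(documents, pairs_per_doc=3, seed=0):
+    rng = random.Random(seed)
+    synthetic = []
+    for doc in documents:
+        entities = extract_entities(doc)
+        pairs = list(itertools.combinations(entities, 2))
+        for e1, e2 in rng.sample(pairs, k=min(pairs_per_doc, len(pairs))):
+            synthetic.append(generate(f"Explain how {e1} relates to {e2} in: {doc[:80]}"))
+    return synthetic
+
+docs = ["Ada Lovelace worked with Charles Babbage on the Analytical Engine."]
+print(len(entigraph_corpus(docs)), "synthetic passages")
+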
+
+
+
+
+ + ♻ ☆ Effective Heterogeneous Federated Learning via Efficient + Hypernetwork-based Weight Generation + + +
+ While federated learning leverages distributed client resources, it faces
+challenges due to heterogeneous client capabilities. This necessitates
+allocating models suited to clients' resources and careful parameter
+aggregation to accommodate this heterogeneity. We propose HypeMeFed, a novel
+federated learning framework for supporting client heterogeneity by combining a
+multi-exit network architecture with hypernetwork-based model weight
+generation. This approach aligns the feature spaces of heterogeneous model
+layers and resolves per-layer information disparity during weight aggregation.
+To practically realize HypeMeFed, we also propose a low-rank factorization
+approach to minimize computation and memory overhead associated with
+hypernetworks. Our evaluations on a real-world heterogeneous device testbed
+indicate that HypeMeFed enhances accuracy by 5.12% over FedAvg, reduces the
+hypernetwork memory requirements by 98.22%, and accelerates its operations by
+1.86x compared to a naive hypernetwork approach. These results demonstrate
+HypeMeFed's effectiveness in leveraging and engaging heterogeneous clients for
+federated learning.
+
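+
+ To illustrate the low-rank factorization idea mentioned above, the Python
+ sketch below has a hypernetwork emit two thin factors instead of a full
+ weight matrix for one layer. All sizes, the layer-embedding input, and the
+ architecture are assumed for exposition and are not the HypeMeFed
+ configuration.
+
+import torch
+import torch.nn as nn
+
+class LowRankHyperNet(nn.Module):
+    def __init__(self, embed_dim=32, out_dim=128, in_dim=256, rank=8):
+        super().__init__()
+        self.out_dim, self.in_dim, self.rank = out_dim, in_dim, rank
+        self.to_u = nn.Linear(embed_dim, out_dim * rank)   # left factor
+        self.to_v = nn.Linear(embed_dim, rank * in_dim)    # right factor
+
+    def forward(self, layer_embedding):                    # (embed_dim,)
+        u = self.to_u(layer_embedding).view(self.out_dim, self.rank)
+        v = self.to_v(layer_embedding).view(self.rank, self.in_dim)
+        return u @ v                                       # generated (out, in) weight
+
+hyper = LowRankHyperNet()
+w = hyper(torch.randn(32))
+print(w.shape)     # torch.Size([128, 256]) without a 128*256-wide output head
+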
+
+
+
+
+ + ♻ ☆ Graph Sparsification via Mixture of Graphs + + +
+ Graph Neural Networks (GNNs) have demonstrated superior performance across +various graph learning tasks but face significant computational challenges when +applied to large-scale graphs. One effective approach to mitigate these +challenges is graph sparsification, which involves removing non-essential edges +to reduce computational overhead. However, previous graph sparsification +methods often rely on a single global sparsity setting and uniform pruning +criteria, failing to provide customized sparsification schemes for each node's +complex local context. In this paper, we introduce Mixture-of-Graphs (MoG), +leveraging the concept of Mixture-of-Experts (MoE), to dynamically select +tailored pruning solutions for each node. Specifically, MoG incorporates +multiple sparsifier experts, each characterized by unique sparsity levels and +pruning criteria, and selects the appropriate experts for each node. +Subsequently, MoG performs a mixture of the sparse graphs produced by different +experts on the Grassmann manifold to derive an optimal sparse graph. One +notable property of MoG is its entirely local nature, as it depends on the +specific circumstances of each individual node. Extensive experiments on four +large-scale OGB datasets and two superpixel datasets, equipped with five GNN +backbones, demonstrate that MoG (I) identifies subgraphs at higher sparsity +levels ($8.67\%\sim 50.85\%$), with performance equal to or better than the +dense graph, (II) achieves $1.47-2.62\times$ speedup in GNN inference with +negligible performance drop, and (III) boosts ``top-student'' GNN performance +($1.02\%\uparrow$ on RevGNN+\textsc{ogbn-proteins} and $1.74\%\uparrow$ on +DeeperGCN+\textsc{ogbg-ppa}). + +
+
+
+
+
+ + ♻ ☆ Functional Latent Dynamics for Irregularly Sampled Time Series + Forecasting + + +
+ Irregularly sampled time series with missing values are often observed in
+multiple real-world applications such as healthcare, climate and astronomy.
+They pose a significant challenge to standard deep learning models that operate
+only on fully observed and regularly sampled time series. In order to capture
+the continuous dynamics of the irregular time series, many models rely on
+solving an Ordinary Differential Equation (ODE) in the hidden state. These
+ODE-based models tend to be slow and require large memory due to sequential
+operations and a complex ODE solver. As an alternative to complex ODE-based
+models, we propose a family of models called Functional Latent Dynamics (FLD).
+Instead of solving the ODE, we use simple curves which exist at all time points
+to specify the continuous latent state in the model. The coefficients of these
+curves are learned only from the observed values in the time series ignoring
+the missing values. Through extensive experiments, we demonstrate that FLD
+achieves better performance compared to the best ODE-based model while reducing
+the runtime and memory overhead. Specifically, FLD requires an order of
+magnitude less time to infer the forecasts compared to the best performing
+forecasting model.
+
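+
+ The curve-based latent state can be made concrete with a small numpy sketch:
+ fit low-order polynomial coefficients using only the observed entries of an
+ irregularly sampled series, then evaluate the curve at arbitrary forecast
+ times. The polynomial basis and degree are assumptions, and FLD learns the
+ coefficients with a model rather than fitting them by least squares as here.
+
+import numpy as np
+
+def fit_curve(times, values, mask, degree=2):
+    """Fit polynomial coefficients using only observed (mask==True) entries."""
+    t_obs, v_obs = times[mask], values[mask]
+    basis = np.vander(t_obs, degree + 1)          # columns [t^2, t, 1]
+    coef, *_ = np.linalg.lstsq(basis, v_obs, rcond=None)
+    return coef
+
+def evaluate(coef, query_times):
+    return np.vander(query_times, len(coef)) @ coef
+
+t = np.linspace(0.0, 1.0, 10)
+x = np.sin(2 * np.pi * t)
+observed = np.array([1, 1, 0, 1, 0, 1, 1, 0, 1, 1], dtype=bool)   # irregular sampling
+coef = fit_curve(t, x, observed)
+print(evaluate(coef, np.array([1.1, 1.25])))      # forecasts beyond the last sample
+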
+
+
+
+
+ + ♻ ☆ Hybrid Quantum-inspired Resnet and Densenet for Pattern Recognition + + +
+ In this paper, we propose two hybrid quantum-inspired neural networks with
+residual and dense connections respectively for pattern recognition. We explain
+the concrete frameworks and illustrate the potential advantage of our hybrid
+models in preventing gradient explosion. A group of numerical experiments on
+generalization power shows that our hybrid models possess the same
+generalization power as the pure classical models when different noisy datasets
+are utilized. More importantly, another group of numerical experiments on
+robustness demonstrates that our hybrid models outperform pure classical models
+notably in resistance to parameter attacks with various asymmetric noises.
+Also, an ablation study indicates that the recognition accuracy of our hybrid
+models is 2\%-3\% higher than that of the quantum neural network without
+residual or dense connection. Finally, we discuss the application scenarios
+of our hybrid models by analyzing their computational complexities.
+
+
+ comment: 12 pages for main paper with a hyperlink of a 18-page supplementary + material in the last page of the main paper +
+
+
+
+
+ + ♻ ☆ Analysis of Linear Mode Connectivity via Permutation-Based Weight + Matching + + +
+ Recently, Ainsworth et al. showed that using weight matching (WM) to minimize +the $L_2$ distance in a permutation search of model parameters effectively +identifies permutations that satisfy linear mode connectivity (LMC), where the +loss along a linear path between two independently trained models with +different seeds remains nearly constant. This paper analyzes LMC using WM, +which is useful for understanding stochastic gradient descent's effectiveness +and its application in areas like model merging. We first empirically show that +permutations found by WM do not significantly reduce the $L_2$ distance between +two models, and the occurrence of LMC is not merely due to distance reduction +by WM itself. We then demonstrate that permutations can change the directions +of the singular vectors, but not the singular values, of the weight matrices in +each layer. This finding shows that permutations found by WM primarily align +the directions of singular vectors associated with large singular values across +models. This alignment brings the singular vectors with large singular values, +which determine the model's functionality, closer between the original and +merged models, allowing the merged model to retain functionality similar to the +original models, thereby satisfying LMC. This paper also analyzes activation +matching (AM) in terms of singular vectors and finds that the principle of AM +is the same as that of WM. Finally, we analyze the difference between WM and +the straight-through estimator (STE), a dataset-dependent permutation search +method, and show that WM can be more advantageous than STE in achieving LMC +among three or more models. + +
+
+ comment: 36 pages +
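+
+ The weight-matching step analyzed above can be emulated for a single layer
+ with a linear assignment solve: pick the output-unit permutation of model B
+ that best aligns the layer with model A. Full WM alternates this across
+ layers and propagates permutations; the single-layer scope and the shapes
+ below are simplifying assumptions for illustration.
+
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+rng = np.random.default_rng(0)
+w_a = rng.normal(size=(64, 32))                  # layer weights of model A (out x in)
+perm_true = rng.permutation(64)
+w_b = w_a[perm_true] + 0.01 * rng.normal(size=(64, 32))   # B = permuted, noisy A
+
+similarity = w_a @ w_b.T                         # similarity[i, j] = <row_i(A), row_j(B)>
+row, col = linear_sum_assignment(similarity, maximize=True)
+w_b_aligned = w_b[col]                           # permute B's units back toward A
+
+print(np.linalg.norm(w_a - w_b), "->", np.linalg.norm(w_a - w_b_aligned))
+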
+
+
+
+
+ + ♻ ☆ Model Internals-based Answer Attribution for Trustworthy + Retrieval-Augmented Generation EMNLP 2024 + + +
+ Ensuring the verifiability of model answers is a fundamental challenge for
+retrieval-augmented generation (RAG) in the question answering (QA) domain.
+Recently, self-citation prompting was proposed to make large language models
+(LLMs) generate citations to supporting documents along with their answers.
+However, self-citing LLMs often struggle to match the required format, refer to
+non-existent sources, and fail to faithfully reflect LLMs' context usage
+throughout the generation. In this work, we present MIRAGE -- Model
+Internals-based RAG Explanations -- a plug-and-play approach using model
+internals for faithful answer attribution in RAG applications. MIRAGE detects
+context-sensitive answer tokens and pairs them with retrieved documents
+contributing to their prediction via saliency methods. We evaluate our proposed
+approach on a multilingual extractive QA dataset, finding high agreement with
+human answer attribution. On open-ended QA, MIRAGE achieves citation quality
+and efficiency comparable to self-citation while also allowing for a
+finer-grained control of attribution parameters. Our qualitative evaluation
+highlights the faithfulness of MIRAGE's attributions and underscores the
+promising application of model internals for RAG answer attribution.
+
+
+ comment: Accepted by EMNLP 2024 Main Conference. Code and data released at + https://github.com/Betswish/MIRAGE +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ BadCM: Invisible Backdoor Attack Against Cross-Modal Learning + + +
+ Despite remarkable successes in unimodal learning tasks, backdoor attacks +against cross-modal learning are still underexplored due to the limited +generalization and inferior stealthiness when involving multiple modalities. +Notably, since works in this area mainly inherit ideas from unimodal visual +attacks, they struggle with dealing with diverse cross-modal attack +circumstances and manipulating imperceptible trigger samples, which hinders +their practicability in real-world applications. In this paper, we introduce a +novel bilateral backdoor to fill in the missing pieces of the puzzle in the +cross-modal backdoor and propose a generalized invisible backdoor framework +against cross-modal learning (BadCM). Specifically, a cross-modal mining scheme +is developed to capture the modality-invariant components as target poisoning +areas, where well-designed trigger patterns injected into these regions can be +efficiently recognized by the victim models. This strategy is adapted to +different image-text cross-modal models, making our framework available to +various attack scenarios. Furthermore, for generating poisoned samples of high +stealthiness, we conceive modality-specific generators for visual and +linguistic modalities that facilitate hiding explicit trigger patterns in +modality-invariant regions. To the best of our knowledge, BadCM is the first +invisible backdoor method deliberately designed for diverse cross-modal attacks +within one unified framework. Comprehensive experimental evaluations on two +typical applications, i.e., cross-modal retrieval and VQA, demonstrate the +effectiveness and generalization of our method under multiple kinds of attack +scenarios. Moreover, we show that BadCM can robustly evade existing backdoor +defenses. Our code is available at https://github.com/xandery-geek/BadCM. + +
+
+
+
+
+ + ♻ ☆ SonicSense: Object Perception from In-Hand Acoustic Vibration + + +
+ We introduce SonicSense, a holistic design of hardware and software to enable +rich robot object perception through in-hand acoustic vibration sensing. While +previous studies have shown promising results with acoustic sensing for object +perception, current solutions are constrained to a handful of objects with +simple geometries and homogeneous materials, single-finger sensing, and mixing +training and testing on the same objects. SonicSense enables container +inventory status differentiation, heterogeneous material prediction, 3D shape +reconstruction, and object re-identification from a diverse set of 83 +real-world objects. Our system employs a simple but effective heuristic +exploration policy to interact with the objects as well as end-to-end +learning-based algorithms to fuse vibration signals to infer object properties. +Our framework underscores the significance of in-hand acoustic vibration +sensing in advancing robot tactile perception. + +
+
+ comment: Our project website is at: http://generalroboticslab.com/SonicSense +
+
+
+
+
+ + ♻ ☆ Releasing the Parameter Latency of Neural Representation for + High-Efficiency Video Compression + + +
+ For decades, video compression technology has been a prominent research area.
+Traditional hybrid video compression frameworks and end-to-end frameworks
+continue to explore various intra- and inter-frame reference and prediction
+strategies based on discrete transforms and deep learning techniques. However,
+the emerging implicit neural representation (INR) technique models entire
+videos as basic units, automatically capturing intra-frame and inter-frame
+correlations and obtaining promising performance. INR uses a compact neural
+network to store video information in network parameters, effectively
+eliminating spatial and temporal redundancy in the original video. However, in
+this paper, our exploration and verification reveal that current INR video
+compression methods do not fully exploit their potential to preserve
+information. We investigate the potential of enhancing network parameter
+storage through parameter reuse. By deepening the network, we designed a
+feasible INR parameter reuse scheme to further improve compression performance.
+Extensive experimental results show that our method significantly enhances the
+rate-distortion performance of INR video compression.
+
+
+
+
+
+ + ♻ ☆ LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via a + Hybrid Architecture + + +
+ Expanding the long-context capabilities of Multi-modal Large Language +Models~(MLLMs) is crucial for video understanding, high-resolution image +understanding, and multi-modal agents. This involves a series of systematic +optimizations, including model architecture, data construction and training +strategy, particularly addressing challenges such as \textit{degraded +performance with more images} and \textit{high computational costs}. In this +paper, we adapt the model architecture to a hybrid of Mamba and Transformer +blocks, approach data construction with both temporal and spatial dependencies +among multiple images and employ a progressive training strategy. The released +model \textbf{LongLLaVA}~(\textbf{Long}-Context \textbf{L}arge +\textbf{L}anguage \textbf{a}nd \textbf{V}ision \textbf{A}ssistant) is the first +hybrid MLLM, which achieved a better balance between efficiency and +effectiveness. LongLLaVA not only achieves competitive results across various +benchmarks, but also maintains high throughput and low memory consumption. +Especially, it could process nearly a thousand images on a single A100 80GB +GPU, showing promising application prospects for a wide range of tasks. + +
+
+ comment: 20 pages, 9 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Bootstrap3D: Improving Multi-view Diffusion Model with Synthetic Data + + +
+ Recent years have witnessed remarkable progress in multi-view diffusion +models for 3D content creation. However, there remains a significant gap in +image quality and prompt-following ability compared to 2D diffusion models. A +critical bottleneck is the scarcity of high-quality 3D objects with detailed +captions. To address this challenge, we propose Bootstrap3D, a novel framework +that automatically generates an arbitrary quantity of multi-view images to +assist in training multi-view diffusion models. Specifically, we introduce a +data generation pipeline that employs (1) 2D and video diffusion models to +generate multi-view images based on constructed text prompts, and (2) our +fine-tuned 3D-aware MV-LLaVA for filtering high-quality data and rewriting +inaccurate captions. Leveraging this pipeline, we have generated 1 million +high-quality synthetic multi-view images with dense descriptive captions to +address the shortage of high-quality 3D data. Furthermore, we present a +Training Timestep Reschedule (TTR) strategy that leverages the denoising +process to learn multi-view consistency while maintaining the original 2D +diffusion prior. Extensive experiments demonstrate that Bootstrap3D can +generate high-quality multi-view images with superior aesthetic quality, +image-text alignment, and maintained view consistency. + +
+
+ comment: Project Page: https://sunzey.github.io/Bootstrap3D/ +
+
+
+
+
+ + ♻ ☆ Semantic-Aware Adversarial Training for Reliable Deep Hashing Retrieval + + +
+ Deep hashing has been intensively studied and successfully applied in
+large-scale image retrieval systems due to its efficiency and effectiveness.
+Recent studies have recognized that the existence of adversarial examples poses
+a security threat to deep hashing models, that is, adversarial vulnerability.
+Notably, it is challenging to efficiently distill reliable semantic
+representatives for deep hashing to guide adversarial learning, which hinders
+the enhancement of adversarial robustness of deep hashing-based retrieval
+models. Moreover, current research on adversarial training for deep hashing is
+hard to formalize into a unified minimax structure. In this paper, we explore
+Semantic-Aware Adversarial Training (SAAT) for improving the adversarial
+robustness of deep hashing models. Specifically, we conceive a discriminative
+mainstay features learning (DMFL) scheme to construct semantic representatives
+for guiding adversarial learning in deep hashing. Particularly, our DMFL with
+the strict theoretical guarantee is adaptively optimized in a discriminative
+learning manner, where both discriminative and semantic properties are jointly
+considered. Moreover, adversarial examples are fabricated by maximizing the
+Hamming distance between the hash codes of adversarial samples and mainstay
+features, the efficacy of which is validated in the adversarial attack trials.
+Further, we, for the first time, formulate the formalized adversarial training
+of deep hashing into a unified minimax optimization under the guidance of the
+generated mainstay codes. Extensive experiments on benchmark datasets show
+superb attack performance against the state-of-the-art algorithms; meanwhile,
+the proposed adversarial training can effectively eliminate adversarial
+perturbations for trustworthy deep hashing-based retrieval. Our code is
+available at https://github.com/xandery-geek/SAAT.
+
+
+
+
+
+ + ♻ ☆ Integrating Large Language Models into a Tri-Modal Architecture for + Automated Depression Classification + + +
+ Major Depressive Disorder (MDD) is a pervasive mental health condition that +affects 300 million people worldwide. This work presents a novel, BiLSTM-based +tri-modal model-level fusion architecture for the binary classification of +depression from clinical interview recordings. The proposed architecture +incorporates Mel Frequency Cepstral Coefficients, Facial Action Units, and uses +a two-shot learning based GPT-4 model to process text data. This is the first +work to incorporate large language models into a multi-modal architecture for +this task. It achieves impressive results on the DAIC-WOZ AVEC 2016 Challenge +cross-validation split and Leave-One-Subject-Out cross-validation split, +surpassing all baseline models and multiple state-of-the-art models. In +Leave-One-Subject-Out testing, it achieves an accuracy of 91.01%, an F1-Score +of 85.95%, a precision of 80%, and a recall of 92.86%. + +
+
+ comment: Keywords: Multi-Modal Neural Networks, Deep Learning, Large Language + Models, Depression Diagnosis, Biomedical Informatics, DAIC-WOZ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 19 + +
+
+
+ + ☆ MVGS: Multi-view-regulated Gaussian Splatting for Novel View Synthesis + + +
+ Recent works in volume rendering, \textit{e.g.} NeRF and 3D Gaussian
+Splatting (3DGS), significantly advance the rendering quality and efficiency
+with the help of the learned implicit neural radiance field or 3D Gaussians.
+Rendering on top of an explicit representation, the vanilla 3DGS and its
+variants deliver real-time efficiency by optimizing the parametric model with
+single-view supervision per iteration during training, a scheme adopted from
+NeRF. Consequently, certain views are overfitted, leading to unsatisfying
+appearance in novel-view synthesis and imprecise 3D geometries. To solve the
+aforementioned problems, we propose a new 3DGS optimization method embodying
+four key novel contributions: 1) We transform the conventional single-view
+training paradigm into a multi-view training strategy. With our proposed
+multi-view regulation, 3D Gaussian attributes are further optimized without
+overfitting certain training views. As a general solution, we improve the
+overall accuracy in a variety of scenarios and different Gaussian variants. 2)
+Inspired by the benefit introduced by additional views, we further propose a
+cross-intrinsic guidance scheme, leading to a coarse-to-fine training procedure
+concerning different resolutions. 3) Built on top of our multi-view regulated
+training, we further propose a cross-ray densification strategy, densifying
+more Gaussian kernels in the ray-intersect regions from a selection of views.
+4) By further investigating the densification strategy, we found that the
+effect of densification should be enhanced when certain views are dramatically
+distinct. As a solution, we propose a novel multi-view augmented densification
+strategy, where 3D Gaussians are encouraged to get densified to a sufficient
+number accordingly, resulting in improved reconstruction accuracy.
+
+
+ comment: Project Page:https://xiaobiaodu.github.io/mvgs-project/ +
+
+
+
+
+ + ☆ Orient Anything + + +
+ Orientation estimation is a fundamental task in 3D shape analysis which +consists of estimating a shape's orientation axes: its side-, up-, and +front-axes. Using this data, one can rotate a shape into canonical orientation, +where its orientation axes are aligned with the coordinate axes. Developing an +orientation algorithm that reliably estimates complete orientations of general +shapes remains an open problem. We introduce a two-stage orientation pipeline +that achieves state of the art performance on up-axis estimation and further +demonstrate its efficacy on full-orientation estimation, where one seeks all +three orientation axes. Unlike previous work, we train and evaluate our method +on all of Shapenet rather than a subset of classes. We motivate our engineering +contributions by theory describing fundamental obstacles to orientation +estimation for rotationally-symmetric shapes, and show how our method avoids +these obstacles. + +
+
+
+
+
+ + ☆ EC-DIT: Scaling Diffusion Transformers with Adaptive Expert-Choice + Routing + + +
+ Diffusion transformers have been widely adopted for text-to-image synthesis. +While scaling these models up to billions of parameters shows promise, the +effectiveness of scaling beyond current sizes remains underexplored and +challenging. By explicitly exploiting the computational heterogeneity of image +generations, we develop a new family of Mixture-of-Experts (MoE) models +(EC-DIT) for diffusion transformers with expert-choice routing. EC-DIT learns +to adaptively optimize the compute allocated to understand the input texts and +generate the respective image patches, enabling heterogeneous computation +aligned with varying text-image complexities. This heterogeneity provides an +efficient way of scaling EC-DIT up to 97 billion parameters and achieving +significant improvements in training convergence, text-to-image alignment, and +overall generation quality over dense models and conventional MoE models. +Through extensive ablations, we show that EC-DIT demonstrates superior +scalability and adaptive compute allocation by recognizing varying textual +importance through end-to-end training. Notably, in text-to-image alignment +evaluation, our largest models achieve a state-of-the-art GenEval score of +71.68% and still maintain competitive inference speed with intuitive +interpretability. + +
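+
+ Expert-choice routing, the mechanism named above, can be sketched in a few
+ lines of Python: each expert selects its own top-k tokens from the router
+ scores, so heavily weighted tokens can receive several experts. The
+ dimensions, the router, and the expert MLPs are toy stand-ins, not EC-DIT's
+ actual configuration.
+
+import torch
+import torch.nn as nn
+
+tokens, d_model, n_experts, capacity = 16, 32, 4, 6
+x = torch.randn(tokens, d_model)
+router = nn.Linear(d_model, n_experts)
+experts = nn.ModuleList([nn.Sequential(nn.Linear(d_model, d_model), nn.GELU(),
+                                       nn.Linear(d_model, d_model))
+                         for _ in range(n_experts)])
+
+scores = torch.softmax(router(x), dim=-1)          # (tokens, experts)
+out = torch.zeros_like(x)
+for e, expert in enumerate(experts):
+    weight, idx = scores[:, e].topk(capacity)      # expert e chooses its tokens
+    out[idx] += weight.unsqueeze(-1) * expert(x[idx])
+print(out.shape)           # (16, 32); "busy" tokens can be picked by several experts
+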
+
+
+
+
+ + ☆ Tracking objects that change in appearance with phase synchrony + + +
+ Objects we encounter often change appearance as we interact with them. +Changes in illumination (shadows), object pose, or movement of nonrigid objects +can drastically alter available image features. How do biological visual +systems track objects as they change? It may involve specific attentional +mechanisms for reasoning about the locations of objects independently of their +appearances -- a capability that prominent neuroscientific theories have +associated with computing through neural synchrony. We computationally test the +hypothesis that the implementation of visual attention through neural synchrony +underlies the ability of biological visual systems to track objects that change +in appearance over time. We first introduce a novel deep learning circuit that +can learn to precisely control attention to features separately from their +location in the world through neural synchrony: the complex-valued recurrent +neural network (CV-RNN). Next, we compare object tracking in humans, the +CV-RNN, and other deep neural networks (DNNs), using FeatureTracker: a +large-scale challenge that asks observers to track objects as their locations +and appearances change in precisely controlled ways. While humans effortlessly +solved FeatureTracker, state-of-the-art DNNs did not. In contrast, our CV-RNN +behaved similarly to humans on the challenge, providing a computational +proof-of-concept for the role of phase synchronization as a neural substrate +for tracking appearance-morphing objects as they move about. + +
+
+
+
+
+ + ☆ Anchors Aweigh! Sail for Optimal Unified Multi-Modal Representations + + +
+ Multimodal learning plays a crucial role in enabling machine learning models +to fuse and utilize diverse data sources, such as text, images, and audio, to +support a variety of downstream tasks. A unified representation across various +modalities is particularly important for improving efficiency and performance. +Recent binding methods, such as ImageBind (Girdhar et al., 2023), typically use +a fixed anchor modality to align multimodal data in the anchor modal embedding +space. In this paper, we mathematically analyze the fixed anchor binding +methods and uncover notable limitations: (1) over-reliance on the choice of the +anchor modality, (2) failure to capture intra-modal information, and (3) +failure to account for inter-modal correlation among non-anchored modalities. +To address these limitations, we propose CentroBind, a simple yet powerful +approach that eliminates the need for a fixed anchor; instead, it employs +dynamically adjustable centroid-based anchors generated from all available +modalities, resulting in a balanced and rich representation space. We +theoretically demonstrate that our method captures three crucial properties of +multimodal learning: intra-modal learning, inter-modal learning, and multimodal +alignment, while also constructing a robust unified representation across all +modalities. Our experiments on both synthetic and real-world datasets +demonstrate the superiority of the proposed method, showing that dynamic anchor +methods outperform all fixed anchor binding methods as the former captures more +nuanced multimodal interactions. + +
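+
+ A minimal Python sketch of the dynamic centroid anchor described above:
+ average the normalized per-modality embeddings of each sample and pull every
+ modality toward that centroid with an InfoNCE-style loss. The random
+ features, the temperature, and the exact loss form are assumptions; the
+ paper's objective may differ in detail.
+
+import torch
+import torch.nn.functional as F
+
+def info_nce(queries, keys, temperature=0.07):
+    logits = queries @ keys.t() / temperature            # (batch, batch)
+    targets = torch.arange(queries.size(0))
+    return F.cross_entropy(logits, targets)
+
+batch, dim = 8, 64
+modalities = [F.normalize(torch.randn(batch, dim), dim=-1) for _ in range(3)]  # e.g. image/text/audio
+centroid = F.normalize(torch.stack(modalities).mean(dim=0), dim=-1)            # dynamic anchor
+
+loss = sum(info_nce(z, centroid) for z in modalities) / len(modalities)
+print(float(loss))
+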
+
+
+
+
+ + ☆ EMMA: Efficient Visual Alignment in Multi-Modal LLMs + + +
+ Multi-modal Large Language Models (MLLMs) have recently exhibited impressive +general-purpose capabilities by leveraging vision foundation models to encode +the core concepts of images into representations. These are then combined with +instructions and processed by the language model to generate high-quality +responses. Despite significant progress in enhancing the language component, +challenges persist in optimally fusing visual encodings within the language +model for task-specific adaptability. Recent research has focused on improving +this fusion through modality adaptation modules but at the cost of +significantly increased model complexity and training data needs. In this +paper, we propose EMMA (Efficient Multi-Modal Adaptation), a lightweight +cross-modality module designed to efficiently fuse visual and textual +encodings, generating instruction-aware visual representations for the language +model. Our key contributions include: (1) an efficient early fusion mechanism +that integrates vision and language representations with minimal added +parameters (less than 0.2% increase in model size), (2) an in-depth +interpretability analysis that sheds light on the internal mechanisms of the +proposed method; (3) comprehensive experiments that demonstrate notable +improvements on both specialized and general benchmarks for MLLMs. Empirical +results show that EMMA boosts performance across multiple tasks by up to 9.3% +while significantly improving robustness against hallucinations. Our code is +available at https://github.com/SaraGhazanfari/EMMA + +
+
+
+
+
+ + ☆ Posterior sampling via Langevin dynamics based on generative priors + + +
+ Posterior sampling in high-dimensional spaces using generative models holds +significant promise for various applications, including but not limited to +inverse problems and guided generation tasks. Despite many recent developments, +generating diverse posterior samples remains a challenge, as existing methods +require restarting the entire generative process for each new sample, making +the procedure computationally expensive. In this work, we propose efficient +posterior sampling by simulating Langevin dynamics in the noise space of a +pre-trained generative model. By exploiting the mapping between the noise and +data spaces which can be provided by distilled flows or consistency models, our +method enables seamless exploration of the posterior without the need to re-run +the full sampling chain, drastically reducing computational overhead. +Theoretically, we prove a guarantee for the proposed noise-space Langevin +dynamics to approximate the posterior, assuming that the generative model +sufficiently approximates the prior distribution. Our framework is +experimentally validated on image restoration tasks involving noisy linear and +nonlinear forward operators applied to LSUN-Bedroom (256 x 256) and ImageNet +(64 x 64) datasets. The results demonstrate that our approach generates +high-fidelity samples with enhanced semantic diversity even under a limited +number of function evaluations, offering superior efficiency and performance +compared to existing diffusion-based posterior sampling techniques. + +
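+
+ The core update can be demonstrated with a toy noise-space Langevin loop: run
+ unadjusted Langevin dynamics on the latent z of a generator G, targeting
+ p(z|y) proportional to N(y; A G(z), sigma^2 I) N(z; 0, I). The linear G, the
+ known operator A, and the step size are stand-ins for the distilled flow or
+ consistency model setting used in the paper.
+
+import torch
+
+torch.manual_seed(0)
+d_z, d_x, d_y, sigma = 8, 16, 4, 0.1
+G = torch.nn.Linear(d_z, d_x)                  # stand-in for the pretrained generator
+A = torch.randn(d_y, d_x)                      # known linear forward operator
+x_true = G(torch.randn(d_z)).detach()
+y = A @ x_true + sigma * torch.randn(d_y)      # noisy measurement
+
+z = torch.zeros(d_z, requires_grad=True)
+step = 1e-3
+for _ in range(500):
+    log_post = (-((y - A @ G(z)) ** 2).sum() / (2 * sigma ** 2)
+                - 0.5 * (z ** 2).sum())
+    grad, = torch.autograd.grad(log_post, z)
+    with torch.no_grad():
+        z += 0.5 * step * grad + (step ** 0.5) * torch.randn(d_z)
+print(float(((A @ G(z).detach() - y) ** 2).mean()))   # residual after sampling
+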
+
+
+
+
+ + ☆ Kolmogorov-Arnold Network Autoencoders + + +
+ Deep learning models have revolutionized various domains, with Multi-Layer +Perceptrons (MLPs) being a cornerstone for tasks like data regression and image +classification. However, a recent study has introduced Kolmogorov-Arnold +Networks (KANs) as promising alternatives to MLPs, leveraging activation +functions placed on edges rather than nodes. This structural shift aligns KANs +closely with the Kolmogorov-Arnold representation theorem, potentially +enhancing both model accuracy and interpretability. In this study, we explore +the efficacy of KANs in the context of data representation via autoencoders, +comparing their performance with traditional Convolutional Neural Networks +(CNNs) on the MNIST, SVHN, and CIFAR-10 datasets. Our results demonstrate that +KAN-based autoencoders achieve competitive performance in terms of +reconstruction accuracy, thereby suggesting their viability as effective tools +in data analysis tasks. + +
+
+ comment: 12 pages, 5 figures, 1 table +
+
+
+
+
+ + ☆ Depth Pro: Sharp Monocular Metric Depth in Less Than a Second + + +
+ We present a foundation model for zero-shot metric monocular depth +estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with +unparalleled sharpness and high-frequency details. The predictions are metric, +with absolute scale, without relying on the availability of metadata such as +camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map +in 0.3 seconds on a standard GPU. These characteristics are enabled by a number +of technical contributions, including an efficient multi-scale vision +transformer for dense prediction, a training protocol that combines real and +synthetic datasets to achieve high metric accuracy alongside fine boundary +tracing, dedicated evaluation metrics for boundary accuracy in estimated depth +maps, and state-of-the-art focal length estimation from a single image. +Extensive experiments analyze specific design choices and demonstrate that +Depth Pro outperforms prior work along multiple dimensions. We release code and +weights at https://github.com/apple/ml-depth-pro + +
+
+ comment: Code and weights available at https://github.com/apple/ml-depth-pro +
+
+
+
+
+ + ☆ Learning from the Giants: A Practical Approach to Underwater Depth and + Surface Normals Estimation + + +
+ Monocular Depth and Surface Normals Estimation (MDSNE) is crucial for tasks +such as 3D reconstruction, autonomous navigation, and underwater exploration. +Current methods rely either on discriminative models, which struggle with +transparent or reflective surfaces, or generative models, which, while +accurate, are computationally expensive. This paper presents a novel deep +learning model for MDSNE, specifically tailored for underwater environments, +using a hybrid architecture that integrates Convolutional Neural Networks +(CNNs) with Transformers, leveraging the strengths of both approaches. Training +effective MDSNE models is often hampered by noisy real-world datasets and the +limited generalization of synthetic datasets. To address this, we generate +pseudo-labeled real data using multiple pre-trained MDSNE models. To ensure the +quality of this data, we propose the Depth Normal Evaluation and Selection +Algorithm (DNESA), which evaluates and selects the most reliable pseudo-labeled +samples using domain-specific metrics. A lightweight student model is then +trained on this curated dataset. Our model reduces parameters by 90% and +training costs by 80%, allowing real-time 3D perception on resource-constrained +devices. Key contributions include: a novel and efficient MDSNE model, the +DNESA algorithm, a domain-specific data pipeline, and a focus on real-time +performance and scalability. Designed for real-world underwater applications, +our model facilitates low-cost deployments in underwater robots and autonomous +vehicles, bridging the gap between research and practical implementation. + +
+
+ comment: 18 pages, 6 figures, 8 tables. Submitted to Elsevier +
+
+
+
+
+ + ☆ Semi-Supervised Fine-Tuning of Vision Foundation Models with + Content-Style Decomposition + + +
+ In this paper, we present a semi-supervised fine-tuning approach designed to +improve the performance of foundation models on downstream tasks with limited +labeled data. By leveraging content-style decomposition within an +information-theoretic framework, our method enhances the latent representations +of pre-trained vision foundation models, aligning them more effectively with +specific task objectives and addressing the problem of distribution shift. We +evaluate our approach on multiple datasets, including MNIST, its augmented +variations (with yellow and white stripes), CIFAR-10, SVHN, and GalaxyMNIST. +The experiments show improvements over purely supervised baselines, +particularly in low-labeled data regimes, across both frozen and trainable +backbones for the majority of the tested datasets. + +
+
+
+
+
+ + ☆ DisEnvisioner: Disentangled and Enriched Visual Prompt for Customized + Image Generation + + +
+ In the realm of image generation, creating customized images from a visual
+prompt with additional textual instruction emerges as a promising endeavor.
+However, existing methods, both tuning-based and tuning-free, struggle with
+interpreting the subject-essential attributes from the visual prompt. This
+leads to subject-irrelevant attributes infiltrating the generation process,
+ultimately compromising the personalization quality in both editability and ID
+preservation. In this paper, we present DisEnvisioner, a novel approach for
+effectively extracting and enriching the subject-essential features while
+filtering out subject-irrelevant information, enabling exceptional
+customization performance in a tuning-free manner and using only a single
+image. Specifically, the features of the subject and other irrelevant
+components are effectively separated into distinctive visual tokens, enabling a
+much more accurate customization. To further improve the ID consistency, we
+enrich the disentangled features, sculpting them into more granular
+representations. Experiments demonstrate the superiority of our approach over
+existing methods in instruction response (editability), ID consistency,
+inference speed, and the overall image quality, highlighting the effectiveness
+and efficiency of DisEnvisioner. Project page:
+https://disenvisioner.github.io/.
+
+
+ comment: The first two authors contributed equally. Project page: + https://disenvisioner.github.io/ +
+
+
+
+
+ + ☆ Using Style Ambiguity Loss to Improve Aesthetics of Diffusion Models + + +
+ Teaching text-to-image models to be creative involves using style ambiguity +loss. In this work, we explore using the style ambiguity training objective, +used to approximate creativity, on a diffusion model. We then experiment with +forms of style ambiguity loss that do not require training a classifier or a +labeled dataset, and find that the models trained with style ambiguity loss can +generate better images than the baseline diffusion models and GANs. Code is +available at https://github.com/jamesBaker361/clipcreate. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2407.12009 +
+
+
+
+
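The style ambiguity objective mentioned above (originating in work on creative adversarial networks) rewards images that a style classifier cannot assign to any known style. A minimal sketch of that loss term follows; the classifier, the number of style classes, and how the loss is mixed into the diffusion objective are assumptions, not details given in the abstract.

```python
import torch
import torch.nn.functional as F

def style_ambiguity_loss(style_logits: torch.Tensor) -> torch.Tensor:
    """Cross-entropy between the style classifier's prediction and the uniform
    distribution over style classes: minimized when the generated image is
    maximally ambiguous with respect to the known styles."""
    log_probs = F.log_softmax(style_logits, dim=-1)
    uniform = torch.full_like(log_probs, 1.0 / style_logits.shape[-1])
    return -(uniform * log_probs).sum(dim=-1).mean()

# Example: logits from a (hypothetical) pretrained style classifier applied to
# images decoded from the diffusion model during fine-tuning.
logits = torch.randn(4, 27)          # batch of 4, 27 art styles (illustrative)
loss = style_ambiguity_loss(logits)  # added to the usual generation objective
```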
+ + ☆ Improving Autonomous AI Agents with Reflective Tree Search and + Self-Learning + + +
+ Autonomous agents have demonstrated significant potential in automating +complex multistep decision-making tasks. However, even state-of-the-art +vision-language models (VLMs), such as GPT-4o, still fall short of human-level +performance, particularly in intricate web environments and long-horizon +planning tasks. To address these limitations, we introduce Reflective Monte +Carlo Tree Search (R-MCTS), a novel test-time algorithm designed to enhance the +ability of AI agents, e.g., powered by GPT-4o, to explore decision space on the +fly. R-MCTS extends traditional MCTS by 1) incorporating contrastive +reflection, allowing agents to learn from past interactions and dynamically +improve their search efficiency; and 2) using multi-agent debate to provide +reliable state evaluation. Moreover, we improve the agent's performance by +fine-tuning GPT-4o through self-learning, using R-MCTS generated tree +traversals without any human-provided labels. On the challenging VisualWebArena +benchmark, our GPT-4o-based R-MCTS agent achieves a 6% to 30% relative +improvement across various tasks compared to the previous state-of-the-art. +Additionally, we show that the knowledge gained from test-time search can be +effectively transferred back to GPT-4o via fine-tuning. The fine-tuned GPT-4o +matches 97% of R-MCTS's performance while reducing compute usage by a factor of +four at test time. Furthermore, qualitative results reveal that the fine-tuned +GPT-4o model demonstrates the ability to explore the environment, evaluate a +state, and backtrack to viable ones when it detects that the current state +cannot lead to success. Moreover, our work demonstrates the compute scaling +properties in both training - data collection with R-MCTS - and testing time. +These results suggest a promising research direction to enhance VLMs' reasoning +and planning capabilities for agentic applications via test-time search and +self-learning. + +
+
+
+
+
+ + ♻ ☆ Generative Visual Instruction Tuning + + +
+ We propose to use automatically generated instruction-following data to +improve the zero-shot capabilities of a large multimodal model with additional +support for generative and image editing tasks. We achieve this by curating a +new multimodal instruction-following set using GPT-4V and existing datasets for +image generation and editing. Using this instruction set and the existing +LLaVA-Finetune instruction set for visual understanding tasks, we produce +GenLLaVA, a Generative Large Language and Visual Assistant. GenLLaVA is built +through a strategy that combines three types of large pretrained models through +instruction finetuning: Mistral for language modeling, SigLIP for image-text +matching, and StableDiffusion for text-to-image generation. Our model +demonstrates visual understanding capabilities superior to LLaVA and +additionally demonstrates competitive results with native multimodal models +such as Unified-IO 2, paving the way for building advanced general-purpose +visual assistants by effectively re-using existing multimodal models. We +open-source our dataset, codebase, and model checkpoints to foster further +research and application in this domain. + +
+
+ comment: Add more results using task tokens, expand the introduction and + related work FIX: error in LLM-as-judge evaluation that was over-inflating + the results +
+
+
+
+
+ + ♻ ☆ Law of the Weakest Link: Cross Capabilities of Large Language Models + + +
+ The development and evaluation of Large Language Models (LLMs) have largely +focused on individual capabilities. However, this overlooks the intersection of +multiple abilities across different types of expertise that are often required +for real-world tasks, which we term cross capabilities. To systematically +explore this concept, we first define seven core individual capabilities and +then pair them to form seven common cross capabilities, each supported by a +manually constructed taxonomy. Building on these definitions, we introduce +CrossEval, a benchmark comprising 1,400 human-annotated prompts, with 100 +prompts for each individual and cross capability. To ensure reliable +evaluation, we involve expert annotators to assess 4,200 model responses, +gathering 8,400 human ratings with detailed explanations to serve as reference +examples. Our findings reveal that, in both static evaluations and attempts to +enhance specific abilities, current LLMs consistently exhibit the "Law of the +Weakest Link," where cross-capability performance is significantly constrained +by the weakest component. Specifically, across 58 cross-capability scores from +17 models, 38 scores are lower than all individual capabilities, while 20 fall +between strong and weak, but closer to the weaker ability. These results +highlight the under-performance of LLMs in cross-capability tasks, making the +identification and improvement of the weakest capabilities a critical priority +for future research to optimize performance in complex, multi-dimensional +scenarios. + +
+
+ comment: Data, Code, & Benchmark: www.llm-cross-capabilities.org +
+
+
+
+
+ + ♻ ☆ Lotus: Diffusion-based Visual Foundation Model for High-quality Dense + Prediction + + +
+ Leveraging the visual priors of pre-trained text-to-image diffusion models
+offers a promising solution to enhance zero-shot generalization in dense
+prediction tasks. However, existing methods often uncritically use the original
+diffusion formulation, which may not be optimal due to the fundamental
+differences between dense prediction and image generation. In this paper, we
+provide a systematic analysis of the diffusion formulation for dense
+prediction, focusing on both quality and efficiency. We find that the original
+parameterization type for image generation, which learns to predict noise, is
+harmful for dense prediction; the multi-step noising/denoising diffusion
+process is also unnecessary and challenging to optimize. Based on these
+insights, we introduce Lotus, a diffusion-based visual foundation model with a
+simple yet effective adaptation protocol for dense prediction. Specifically,
+Lotus is trained to directly predict annotations instead of noise, thereby
+avoiding harmful variance. We also reformulate the diffusion process into a
+single-step procedure, simplifying optimization and significantly boosting
+inference speed. Additionally, we introduce a novel tuning strategy called the
+detail preserver, which achieves more accurate and fine-grained predictions.
+Without scaling up the training data or model capacity, Lotus achieves SoTA
+performance in zero-shot depth and normal estimation across various datasets.
+It also enhances efficiency, being significantly faster than most existing
+diffusion-based methods. Lotus' superior quality and efficiency also enable a
+wide range of practical applications, such as joint estimation,
+single/multi-view 3D reconstruction, etc. Project page:
+https://lotus3d.github.io/.
+
+
+ comment: The first two authors contributed equally. Project page: + https://lotus3d.github.io/ +
+
+
+
+
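The two changes the abstract highlights, predicting the annotation directly instead of noise and collapsing the diffusion to a single step, can be caricatured as a training objective. This is a rough sketch under stated assumptions (a toy stand-in denoiser, latent shapes, and the single fixed timestep are all illustrative), not the released Lotus code.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class DummyDenoiser(nn.Module):
    """Stand-in for a conditional network that maps (image latent + noisy
    annotation latent, timestep) to a predicted clean annotation latent."""
    def __init__(self, channels=8):
        super().__init__()
        self.net = nn.Conv2d(2 * channels, channels, kernel_size=3, padding=1)

    def forward(self, x, t):
        return self.net(x)

def single_step_x0_loss(denoiser, image_latent, annotation_latent):
    # x0-parameterization at a single (maximal) noise level: the model takes
    # pure noise in place of the annotation and regresses the clean target,
    # rather than predicting the noise that was added.
    noise = torch.randn_like(annotation_latent)
    t = torch.full((image_latent.shape[0],), 999, device=image_latent.device)
    pred = denoiser(torch.cat([image_latent, noise], dim=1), t)
    return F.mse_loss(pred, annotation_latent)

denoiser = DummyDenoiser()
img, depth = torch.randn(2, 8, 32, 32), torch.randn(2, 8, 32, 32)
print(single_step_x0_loss(denoiser, img, depth))
```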
+ + ♻ ☆ ViC-MAE: Self-Supervised Representation Learning from Images and Video + with Contrastive Masked Autoencoders ECCV 2024 + + +
+ We propose ViC-MAE, a model that combines both Masked AutoEncoders (MAE) and
+contrastive learning. ViC-MAE is trained using a global feature obtained by
+pooling the local representations learned under an MAE reconstruction loss and
+leveraging this representation under a contrastive objective across images and
+video frames. We show that visual representations learned under ViC-MAE
+generalize well to both video and image classification tasks. In particular,
+ViC-MAE obtains state-of-the-art transfer learning performance from video to
+images on ImageNet-1k compared to the recently proposed OmniMAE by achieving a
+top-1 accuracy of 86% (+1.3% absolute improvement) when trained on the same
+data and 87.1% (+2.4% absolute improvement) when training on extra data. At the
+same time, ViC-MAE outperforms most other methods on video benchmarks by
+obtaining 75.9% top-1 accuracy on the challenging Something-Something-v2 video
+benchmark. When training on videos and images from a diverse combination of
+datasets, our method maintains a balanced transfer-learning performance between
+video and image classification benchmarks, coming only as a close second to the
+best supervised method.
+
+
+ comment: Published at ECCV 2024 +
+
+
+
+
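The contrastive half of the recipe described above, a contrastive objective on a global feature pooled from local token representations, can be sketched as an InfoNCE loss over two views. The MAE reconstruction branch and the exact pooling choice are omitted; shapes and the temperature are assumptions for illustration.

```python
import torch
import torch.nn.functional as F

def pooled_infonce(tokens_a, tokens_b, temperature=0.07):
    """Contrastive loss between global features obtained by mean-pooling the
    local token representations of two views (e.g., two frames of a video or
    two augmentations of an image). tokens_*: (batch, num_tokens, dim)."""
    za = F.normalize(tokens_a.mean(dim=1), dim=-1)   # (batch, dim)
    zb = F.normalize(tokens_b.mean(dim=1), dim=-1)
    logits = za @ zb.t() / temperature               # (batch, batch)
    targets = torch.arange(za.shape[0], device=za.device)
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

# tokens_* would come from the encoder applied to the visible patches.
loss = pooled_infonce(torch.randn(8, 49, 256), torch.randn(8, 49, 256))
```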
+ + ♻ ☆ CLIP-MoE: Towards Building Mixture of Experts for CLIP with Diversified + Multiplet Upcycling + + +
+ In recent years, Contrastive Language-Image Pre-training (CLIP) has become a +cornerstone in multimodal intelligence. However, recent studies have identified +that the information loss in the CLIP encoding process is substantial, and CLIP +tends to capture only coarse-grained features from the input. This deficiency +significantly limits the ability of a single CLIP model to handle images rich +in visual detail. In this work, we propose a simple yet effective +model-agnostic strategy, Diversified Multiplet Upcycling (DMU), for CLIP. DMU +efficiently fine-tunes a series of CLIP models that capture different feature +spaces, from a dense pre-trained CLIP checkpoint, sharing parameters except for +the Feed-Forward Network (FFN). These models can then be transformed into a +CLIP-MoE with a larger model capacity, leading to significantly enhanced +performance with minimal computational overhead. To the best of our knowledge, +Diversified Multiplet Upcycling is the first approach to introduce sparsely +activated MoE into CLIP foundation models. Extensive experiments demonstrate +the significant performance of CLIP-MoE across various zero-shot retrieval, +zero-shot image classification tasks, and downstream Multimodal Large Language +Model (MLLM) benchmarks by serving as a vision encoder. Furthermore, +Diversified Multiplet Upcycling enables the conversion of any dense CLIP model +into CLIP-MoEs, which can seamlessly replace CLIP in a plug-and-play manner +without requiring further adaptation in downstream frameworks. Through +Diversified Multiplet Upcycling, we aim to provide valuable insights for future +research on developing more efficient and effective multimodal learning +systems. + +
+
+
+
+
+
+
+
+ + Information Retrieval 20 + +
+
+
+ + ☆ Price-guided user attention in large-scale E-commerce group + recommendation + + +
+ Existing group recommender systems utilize attention mechanisms to identify +critical users who influence group decisions the most. We analyzed user +attention scores from a widely-used group recommendation model on a real-world +E-commerce dataset and found that item price and user interaction history +significantly influence the selection of critical users. When item prices are +low, users with extensive interaction histories are more influential in group +decision-making. Conversely, their influence diminishes with higher item +prices. Based on these observations, we propose a novel group recommendation +approach that incorporates item price as a guiding factor for user aggregation. +Our model employs an adaptive sigmoid function to adjust output logits based on +item prices, enhancing the accuracy of user aggregation. Our model can be +plugged into any attention-based group recommender system if the price +information is available. We evaluate our model's performance on a public +benchmark and a real-world dataset. We compare it with other state-of-the-art +group recommendation methods. Our results demonstrate that our price-guided +user attention approach outperforms the state-of-the-art methods in terms of +hit ratio and mean square error. + +
+
+
+
+
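One plausible reading of the aggregation step described above, attention logits modulated by an adaptive sigmoid of the item price so that history-rich users dominate only for cheap items, is sketched below. The shapes, the learnable gate, and the use of log interaction counts are assumptions made for illustration; the paper's exact formulation is not given in the abstract.

```python
import torch
import torch.nn as nn

class PriceGuidedAttention(nn.Module):
    """Aggregates group members' embeddings with attention whose logits are
    modulated by item price: cheap items let history-rich users dominate,
    expensive items flatten their influence."""

    def __init__(self, dim):
        super().__init__()
        self.score = nn.Linear(dim, 1)
        self.a = nn.Parameter(torch.tensor(1.0))   # learnable gate steepness
        self.b = nn.Parameter(torch.tensor(0.0))   # learnable price offset

    def forward(self, member_emb, history_len, price):
        # member_emb: (groups, members, dim), history_len: (groups, members),
        # price: (groups,) normalized item price.
        logits = self.score(member_emb).squeeze(-1)             # (groups, members)
        gate = torch.sigmoid(self.a * (self.b - price)).unsqueeze(-1)
        logits = logits + gate * torch.log1p(history_len)       # boost only when cheap
        weights = torch.softmax(logits, dim=-1)
        return (weights.unsqueeze(-1) * member_emb).sum(dim=1)  # (groups, dim)

attn = PriceGuidedAttention(dim=32)
group_repr = attn(torch.randn(4, 5, 32),
                  torch.randint(0, 50, (4, 5)).float(),
                  torch.rand(4))
```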
+ + ☆ Financial Sentiment Analysis on News and Reports Using Large Language + Models and FinBERT + + +
+ Financial sentiment analysis (FSA) is crucial for evaluating market sentiment
+and making well-informed financial decisions. The advent of large language
+models (LLMs) such as BERT and its financial variant, FinBERT, has notably
+enhanced sentiment analysis capabilities. This paper investigates the
+application of LLMs and FinBERT for FSA, comparing their performance on news
+articles, financial reports and company announcements. The study emphasizes the
+advantages of prompt engineering with zero-shot and few-shot strategies to
+improve sentiment classification accuracy. Experimental results indicate that
+GPT-4o, with few-shot examples of financial texts, can be as competent as a
+carefully fine-tuned FinBERT in this specialized field.
+
+
+
+
+
+ + ☆ Elaborative Subtopic Query Reformulation for Broad and Indirect Queries + in Travel Destination Recommendation RecSys 2024 + + +
+ In Query-driven Travel Recommender Systems (RSs), it is crucial to understand
+the user intent behind challenging natural language (NL) destination queries
+such as the broadly worded "youth-friendly activities" or the indirect
+description "a high school graduation trip". Such queries are challenging due
+to the wide scope and subtlety of potential user intents that confound the
+ability of retrieval methods to infer relevant destinations from available
+textual descriptions such as WikiVoyage. While query reformulation (QR) has
+proven effective in enhancing retrieval by addressing user intent, existing QR
+methods tend to focus only on expanding the range of potentially matching query
+subtopics (breadth) or elaborating on the potential meaning of a query (depth),
+but not both. In this paper, we introduce Elaborative Subtopic Query
+Reformulation (EQR), a large language model-based QR method that combines both
+breadth and depth by generating potential query subtopics with information-rich
+elaborations. We also release TravelDest, a novel dataset for query-driven
+travel destination RSs. Experiments on TravelDest show that EQR achieves
+significant improvements in recall and precision over existing state-of-the-art
+QR methods.
+
+
+ comment: 9 pages, 7 figures,The 1st Workshop on Risks, Opportunities, and + Evaluation of Generative Models in Recommender Systems (ROEGEN@RecSys 2024), + October 2024, Bari, Italy +
+
+
+
+
+ + ☆ Peeling Back the Layers: An In-Depth Evaluation of Encoder Architectures + in Neural News Recommenders RecSys 2024 + + +
+ Encoder architectures play a pivotal role in neural news recommenders by
+embedding the semantic and contextual information of news and users. Thus,
+research has heavily focused on enhancing the representational capabilities of
+news and user encoders to improve recommender performance. Despite the
+significant impact of encoder architectures on the quality of news and user
+representations, existing analyses of encoder designs focus only on the overall
+downstream recommendation performance. This offers a one-sided assessment of
+the encoders' similarity, ignoring more nuanced differences in their behavior,
+and potentially resulting in sub-optimal model selection. In this work, we
+perform a comprehensive analysis of encoder architectures in neural news
+recommender systems. We systematically evaluate the most prominent news and
+user encoder architectures, focusing on their (i) representational similarity,
+measured with Centered Kernel Alignment, (ii) overlap of generated
+recommendation lists, quantified with the Jaccard similarity, and (iii) the
+overall recommendation performance. Our analysis reveals that the complexity of
+certain encoding techniques is often empirically unjustified, highlighting the
+potential for simpler, more efficient architectures. By isolating the effects
+of individual components, we provide valuable insights for researchers and
+practitioners to make better informed decisions about encoder selection and
+avoid unnecessary complexity in the design of news recommenders.
+
+
+ comment: Accepted at the 12th International Workshop on News Recommendation + and Analytics (INRA 2024) in conjunction with ACM RecSys 2024 +
+
+
+
+
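The two similarity measures named above are both short to compute. Below is a minimal sketch of linear Centered Kernel Alignment between two encoders' outputs on the same items, plus the Jaccard overlap of two recommendation lists; the random data and dimensions are placeholders, not the paper's setup.

```python
import numpy as np

def linear_cka(X: np.ndarray, Y: np.ndarray) -> float:
    """Linear Centered Kernel Alignment between representation matrices
    X (n_items, d1) and Y (n_items, d2) computed on the same items."""
    X = X - X.mean(axis=0, keepdims=True)
    Y = Y - Y.mean(axis=0, keepdims=True)
    hsic = np.linalg.norm(X.T @ Y, ord="fro") ** 2
    return float(hsic / (np.linalg.norm(X.T @ X, ord="fro") *
                         np.linalg.norm(Y.T @ Y, ord="fro")))

def jaccard(list_a, list_b) -> float:
    """Overlap between two top-k recommendation lists."""
    a, b = set(list_a), set(list_b)
    return len(a & b) / len(a | b)

rng = np.random.default_rng(0)
enc_a, enc_b = rng.normal(size=(1000, 128)), rng.normal(size=(1000, 256))
print(linear_cka(enc_a, enc_b))
print(jaccard([3, 7, 9, 11], [3, 9, 20, 41]))
```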
+ + ☆ Analyzing Byte-Pair Encoding on Monophonic and Polyphonic Symbolic + Music: A Focus on Musical Phrase Segmentation + + +
+ Byte-Pair Encoding (BPE) is an algorithm commonly used in Natural Language +Processing to build a vocabulary of subwords, which has been recently applied +to symbolic music. Given that symbolic music can differ significantly from +text, particularly with polyphony, we investigate how BPE behaves with +different types of musical content. This study provides a qualitative analysis +of BPE's behavior across various instrumentations and evaluates its impact on a +musical phrase segmentation task for both monophonic and polyphonic music. Our +findings show that the BPE training process is highly dependent on the +instrumentation and that BPE "supertokens" succeed in capturing abstract +musical content. In a musical phrase segmentation task, BPE notably improves +performance in a polyphonic setting, but enhances performance in monophonic +tunes only within a specific range of BPE merges. + +
+
+ comment: Accepted to 3rd Workshop on NLP for Music and Audio (NLP4MusA, + co-located with ISMIR 2024) +
+
+
+
+
+ + ☆ Can We Delegate Learning to Automation?: A Comparative Study of LLM + Chatbots, Search Engines, and Books + + +
+ Learning is a key motivator behind information search behavior. With the +emergence of LLM-based chatbots, students are increasingly turning to these +tools as their primary resource for acquiring knowledge. However, the +transition from traditional resources like textbooks and web searches raises +concerns among educators. They worry that these fully-automated LLMs might lead +students to delegate critical steps of search as learning. In this paper, we +systematically uncover three main concerns from educators' perspectives. In +response to these concerns, we conducted a mixed-methods study with 92 +university students to compare three learning sources with different automation +levels. Our results show that LLMs support comprehensive understanding of key +concepts without promoting passive learning, though their effectiveness in +knowledge retention was limited. Additionally, we found that academic +performance impacted both learning outcomes and search patterns. Notably, +higher-competence learners engaged more deeply with content through +reading-intensive behaviors rather than relying on search activities. + +
+
+ comment: 21 pages, 14 figures +
+
+
+
+
+ + ☆ PairDistill: Pairwise Relevance Distillation for Dense Retrieval EMNLP 2024 + + +
+ Effective information retrieval (IR) from vast datasets relies on advanced +techniques to extract relevant information in response to queries. Recent +advancements in dense retrieval have showcased remarkable efficacy compared to +traditional sparse retrieval methods. To further enhance retrieval performance, +knowledge distillation techniques, often leveraging robust cross-encoder +rerankers, have been extensively explored. However, existing approaches +primarily distill knowledge from pointwise rerankers, which assign absolute +relevance scores to documents, thus facing challenges related to inconsistent +comparisons. This paper introduces Pairwise Relevance Distillation +(PairDistill) to leverage pairwise reranking, offering fine-grained +distinctions between similarly relevant documents to enrich the training of +dense retrieval models. Our experiments demonstrate that PairDistill +outperforms existing methods, achieving new state-of-the-art results across +multiple benchmarks. This highlights the potential of PairDistill in advancing +dense retrieval techniques effectively. Our source code and trained models are +released at https://github.com/MiuLab/PairDistill + +
+
+ comment: Accepted to EMNLP 2024 Main Conference +
+
+
+
+
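Distilling from a pairwise reranker, as described above, amounts to training the dense retriever to reproduce the teacher's preference on document pairs rather than matching absolute scores. The following is a simplified sketch of such a loss, not the exact PairDistill objective; the dot-product scoring and the binary cross-entropy form are assumptions.

```python
import torch
import torch.nn.functional as F

def pairwise_distill_loss(query_emb, doc_emb_a, doc_emb_b, teacher_pref):
    """query_emb, doc_emb_a, doc_emb_b: (batch, dim); teacher_pref: (batch,)
    probability, from a pairwise cross-encoder reranker, that document A is
    more relevant than document B for the query."""
    score_a = (query_emb * doc_emb_a).sum(-1)          # dot-product relevance
    score_b = (query_emb * doc_emb_b).sum(-1)
    student_pref = torch.sigmoid(score_a - score_b)    # student's P(A > B)
    return F.binary_cross_entropy(student_pref, teacher_pref)

q = F.normalize(torch.randn(16, 768), dim=-1)
da = F.normalize(torch.randn(16, 768), dim=-1)
db = F.normalize(torch.randn(16, 768), dim=-1)
teacher = torch.rand(16)             # would come from the pairwise reranker
loss = pairwise_distill_loss(q, da, db, teacher)
```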
+ + ☆ Integrating Visual and Textual Inputs for Searching Large-Scale Map + Collections with CLIP + + +
+ Despite the prevalence and historical importance of maps in digital
+collections, current methods of navigating and exploring map collections are
+largely restricted to catalog records and structured metadata. In this paper,
+we explore the potential for interactively searching large-scale map
+collections using natural language inputs ("maps with sea monsters"), visual
+inputs (i.e., reverse image search), and multimodal inputs (an example
+map + "more grayscale"). As a case study, we adopt 562,842 images of maps
+publicly accessible via the Library of Congress's API. To accomplish this, we
+use the multimodal Contrastive Language-Image Pre-training (CLIP) machine
+learning model to generate embeddings for these maps, and we develop code to
+implement exploratory search capabilities with these input strategies. We
+present results for example searches created in consultation with staff in the
+Library of Congress's Geography and Map Division and describe the strengths,
+weaknesses, and possibilities for these search queries. Moreover, we introduce
+a fine-tuning dataset of 10,504 map-caption pairs, along with an architecture
+for fine-tuning a CLIP model on this dataset. To facilitate re-use, we provide
+all of our code in documented, interactive Jupyter notebooks and place all code
+into the public domain. Lastly, we discuss the opportunities and challenges for
+applying these approaches across both digitized and born-digital collections
+held by galleries, libraries, archives, and museums.
+
+
+ comment: 18 pages, 7 figures, accepted at the Computational Humanities + Research Conference (CHR 2024) +
+
+
+
+
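The text-to-map search pipeline described above can be prototyped with an off-the-shelf CLIP checkpoint: embed the map images once, then rank them by cosine similarity against an embedded text query. The sketch below uses Hugging Face transformers; the checkpoint name and the toy corpus are placeholders, and this is not the authors' released notebook code.

```python
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def embed_images(paths):
    """Embed and L2-normalize a list of map image files."""
    images = [Image.open(p).convert("RGB") for p in paths]
    inputs = processor(images=images, return_tensors="pt")
    with torch.no_grad():
        feats = model.get_image_features(**inputs)
    return torch.nn.functional.normalize(feats, dim=-1)

def search(text_query, image_feats, paths, k=5):
    """Rank map images by cosine similarity to a natural-language query."""
    inputs = processor(text=[text_query], return_tensors="pt", padding=True)
    with torch.no_grad():
        q = torch.nn.functional.normalize(model.get_text_features(**inputs), dim=-1)
    scores = (image_feats @ q.t()).squeeze(-1)
    top = scores.topk(min(k, len(paths)))
    return [(paths[int(i)], scores[int(i)].item()) for i in top.indices]

# paths = ["map_0001.jpg", "map_0002.jpg"]  # e.g., images fetched from the LoC API
# feats = embed_images(paths)
# print(search("maps with sea monsters", feats, paths))
```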
+ + ☆ GraphRevisedIE: Multimodal Information Extraction with Graph-Revised + Network + + +
+ Key information extraction (KIE) from visually rich documents (VRD) has been +a challenging task in document intelligence because of not only the complicated +and diverse layouts of VRD that make the model hard to generalize but also the +lack of methods to exploit the multimodal features in VRD. In this paper, we +propose a light-weight model named GraphRevisedIE that effectively embeds +multimodal features such as textual, visual, and layout features from VRD and +leverages graph revision and graph convolution to enrich the multimodal +embedding with global context. Extensive experiments on multiple real-world +datasets show that GraphRevisedIE generalizes to documents of varied layouts +and achieves comparable or better performance compared to previous KIE methods. +We also publish a business license dataset that contains both real-life and +synthesized documents to facilitate research of document KIE. + +
+
+
+
+
+ + ☆ Unleashing the Power of Large Language Models in Zero-shot Relation + Extraction via Self-Prompting EMNLP 2024 + + +
+ Recent research in zero-shot Relation Extraction (RE) has focused on using +Large Language Models (LLMs) due to their impressive zero-shot capabilities. +However, current methods often perform suboptimally, mainly due to a lack of +detailed, context-specific prompts needed for understanding various sentences +and relations. To address this, we introduce the Self-Prompting framework, a +novel method designed to fully harness the embedded RE knowledge within LLMs. +Specifically, our framework employs a three-stage diversity approach to prompt +LLMs, generating multiple synthetic samples that encapsulate specific relations +from scratch. These generated samples act as in-context learning samples, +offering explicit and context-specific guidance to efficiently prompt LLMs for +RE. Experimental evaluations on benchmark datasets show our approach +outperforms existing LLM-based zero-shot RE methods. Additionally, our +experiments confirm the effectiveness of our generation pipeline in producing +high-quality synthetic data that enhances performance. + +
+
+ comment: EMNLP 2024 Short +
+
+
+
+
+ + ♻ ☆ Ink and Individuality: Crafting a Personalised Narrative in the Age of + LLMs + + +
+ Individuality and personalization comprise the distinctive characteristics +that make each writer unique and influence their words in order to effectively +engage readers while conveying authenticity. However, our growing reliance on +LLM-based writing assistants risks compromising our creativity and +individuality over time. We often overlook the negative impacts of this trend +on our creativity and uniqueness, despite the possible consequences. This study +investigates these concerns by performing a brief survey to explore different +perspectives and concepts, as well as trying to understand people's viewpoints, +in conjunction with past studies in the area. Addressing these issues is +essential for improving human-computer interaction systems and enhancing +writing assistants for personalization and individuality. + +
+
+ comment: 5 Pages, 4 Figures. Accepted in The Third Workshop on Intelligent and + Interactive Writing Assistants at CHI 2024 +
+
+
+
+
+ + ♻ ☆ Train Once, Deploy Anywhere: Matryoshka Representation Learning for + Multimodal Recommendation EMNLP 2024 + + +
+ Despite recent advancements in language and vision modeling, integrating rich +multimodal knowledge into recommender systems continues to pose significant +challenges. This is primarily due to the need for efficient recommendation, +which requires adaptive and interactive responses. In this study, we focus on +sequential recommendation and introduce a lightweight framework called +full-scale Matryoshka representation learning for multimodal recommendation +(fMRLRec). Our fMRLRec captures item features at different granularities, +learning informative representations for efficient recommendation across +multiple dimensions. To integrate item features from diverse modalities, +fMRLRec employs a simple mapping to project multimodal item features into an +aligned feature space. Additionally, we design an efficient linear +transformation that embeds smaller features into larger ones, substantially +reducing memory requirements for large-scale training on recommendation data. +Combined with improved state space modeling techniques, fMRLRec scales to +different dimensions and only requires one-time training to produce multiple +models tailored to various granularities. We demonstrate the effectiveness and +efficiency of fMRLRec on multiple benchmark datasets, which consistently +achieves superior performance over state-of-the-art baseline methods. We make +our code and data publicly available at https://github.com/yueqirex/fMRLRec. + +
+
+ comment: Accepted to EMNLP 2024 Findings +
+
+
+
+
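Matryoshka-style representation learning, referenced above, trains one embedding so that several of its leading prefixes are each usable on their own, which is what allows a single training run to yield models of multiple widths. A generic sketch of the nested objective follows; the prefix sizes, temperature, and contrastive form are assumptions, and this is not fMRLRec's exact architecture.

```python
import torch
import torch.nn.functional as F

def matryoshka_loss(item_emb, target_emb, dims=(64, 128, 256, 512)):
    """Sum a task loss over nested prefixes of the embedding so every prefix
    size is trained to be a usable representation on its own.
    item_emb, target_emb: (batch, max_dim) with max_dim >= max(dims)."""
    total = 0.0
    for d in dims:
        a = F.normalize(item_emb[:, :d], dim=-1)      # leading d dimensions only
        b = F.normalize(target_emb[:, :d], dim=-1)
        logits = a @ b.t() / 0.05
        labels = torch.arange(a.shape[0], device=a.device)
        total = total + F.cross_entropy(logits, labels)
    return total / len(dims)

loss = matryoshka_loss(torch.randn(32, 512), torch.randn(32, 512))
```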
+ + ♻ ☆ Late Chunking: Contextual Chunk Embeddings Using Long-Context Embedding + Models + + +
+ Many use cases require retrieving smaller portions of text, and dense +vector-based retrieval systems often perform better with shorter text segments, +as the semantics are less likely to be over-compressed in the embeddings. +Consequently, practitioners often split text documents into smaller chunks and +encode them separately. However, chunk embeddings created in this way can lose +contextual information from surrounding chunks, resulting in sub-optimal +representations. In this paper, we introduce a novel method called late +chunking, which leverages long context embedding models to first embed all +tokens of the long text, with chunking applied after the transformer model and +just before mean pooling - hence the term late in its naming. The resulting +chunk embeddings capture the full contextual information, leading to superior +results across various retrieval tasks. The method is generic enough to be +applied to a wide range of long-context embedding models and works without +additional training. To further increase the effectiveness of late chunking, we +propose a dedicated fine-tuning approach for embedding models. + +
+
+ comment: 11 pages, 3rd draft +
+
+
+
+
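The mechanics of late chunking are easy to express: run the embedding model over the whole document once, then mean-pool the token states per chunk instead of encoding each chunk in isolation. Below is a hedged sketch with a generic Hugging Face encoder; the checkpoint is a short-context stand-in used only to show the mechanics (the method targets long-context embedding models), and the chunk spans are placeholders.

```python
import torch
from transformers import AutoModel, AutoTokenizer

name = "sentence-transformers/all-MiniLM-L6-v2"   # stand-in; use a long-context embedder
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModel.from_pretrained(name)

def late_chunk_embeddings(text, chunk_char_spans):
    """Embed the full text once, then pool token states per chunk, so each
    chunk embedding is conditioned on the whole document's context."""
    enc = tokenizer(text, return_tensors="pt",
                    return_offsets_mapping=True, truncation=True)
    offsets = enc.pop("offset_mapping")[0]          # (num_tokens, 2) char spans
    with torch.no_grad():
        states = model(**enc).last_hidden_state[0]  # (num_tokens, dim)
    chunks = []
    for start, end in chunk_char_spans:
        mask = (offsets[:, 0] >= start) & (offsets[:, 1] <= end) & (offsets[:, 1] > 0)
        chunks.append(states[mask].mean(dim=0) if mask.any() else states.mean(dim=0))
    return torch.stack(chunks)

doc = "First topic sentence. " * 20 + "Second topic sentence. " * 20
spans = [(0, len(doc) // 2), (len(doc) // 2, len(doc))]
print(late_chunk_embeddings(doc, spans).shape)
```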
+ + ♻ ☆ Contextual Compression in Retrieval-Augmented Generation for Large + Language Models: A Survey + + +
+ Large Language Models (LLMs) showcase remarkable abilities, yet they struggle
+with limitations such as hallucinations, outdated knowledge, opacity, and
+inexplicable reasoning. To address these challenges, Retrieval-Augmented
+Generation (RAG) has proven to be a viable solution: it leverages external
+databases to improve the consistency and coherence of generated content, is
+especially valuable for complex, knowledge-rich tasks, and facilitates
+continuous improvement by leveraging domain-specific insights. By combining the
+intrinsic knowledge of LLMs with the vast, dynamic repositories of external
+databases, RAG achieves a synergistic effect. However, RAG is not without its
+limitations, including a limited context window, irrelevant information, and
+the high processing overhead for extensive contextual data. In this
+comprehensive work, we explore the evolution of Contextual Compression
+paradigms, providing an in-depth examination of the field. Finally, we outline
+the current challenges and suggest potential research and development
+directions, paving the way for future advancements in this area.
+
+
+ comment: Ongoing Work +
+
+
+
+
+ + ♻ ☆ GPT vs RETRO: Exploring the Intersection of Retrieval and + Parameter-Efficient Fine-Tuning EMNLP 2024 + + +
+ Parameter-Efficient Fine-Tuning (PEFT) and Retrieval-Augmented Generation +(RAG) have become popular methods for adapting large language models while +minimizing compute requirements. In this paper, we apply PEFT methods +(P-tuning, Adapters, and LoRA) to a modified Retrieval-Enhanced Transformer +(RETRO) and a baseline GPT model across several sizes, ranging from 823 million +to 48 billion parameters. We show that RETRO models outperform GPT models in +zero-shot settings due to their unique pre-training process but GPT models have +higher performance potential with PEFT. Additionally, our study indicates that +8B parameter models strike an optimal balance between cost and performance and +P-tuning lags behind other PEFT techniques. We further provide a comparative +analysis between applying PEFT to an Instruction-tuned RETRO model and base +RETRO model. This work presents the first comprehensive comparison of various +PEFT methods integrated with RAG, applied to both GPT and RETRO models, +highlighting their relative performance. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Pre-training Cross-lingual Open Domain Question Answering with + Large-scale Synthetic Supervision EMNLP 2024 + + +
+ Cross-lingual open domain question answering (CLQA) is a complex problem, +comprising cross-lingual retrieval from a multilingual knowledge base, followed +by answer generation in the query language. Both steps are usually tackled by +separate models, requiring substantial annotated datasets, and typically +auxiliary resources, like machine translation systems to bridge between +languages. In this paper, we show that CLQA can be addressed using a single +encoder-decoder model. To effectively train this model, we propose a +self-supervised method based on exploiting the cross-lingual link structure +within Wikipedia. We demonstrate how linked Wikipedia pages can be used to +synthesise supervisory signals for cross-lingual retrieval, through a form of +cloze query, and generate more natural questions to supervise answer +generation. Together, we show our approach, \texttt{CLASS}, outperforms +comparable methods on both supervised and zero-shot language adaptation +settings, including those using machine translation. + +
+
+ comment: EMNLP 2024 Main +
+
+
+
+
+ + ♻ ☆ Advancing Event Causality Identification via Heuristic Semantic + Dependency Inquiry Network EMNLP 2024 + + +
+ Event Causality Identification (ECI) focuses on extracting causal relations +between events in texts. Existing methods for ECI primarily rely on causal +features and external knowledge. However, these approaches fall short in two +dimensions: (1) causal features between events in a text often lack explicit +clues, and (2) external knowledge may introduce bias, while specific problems +require tailored analyses. To address these issues, we propose SemDI - a simple +and effective Semantic Dependency Inquiry Network for ECI. SemDI captures +semantic dependencies within the context using a unified encoder. Then, it +utilizes a Cloze Analyzer to generate a fill-in token based on comprehensive +context understanding. Finally, this fill-in token is used to inquire about the +causal relation between two events. Extensive experiments demonstrate the +effectiveness of SemDI, surpassing state-of-the-art methods on three widely +used benchmarks. Code is available at https://github.com/hrlics/SemDI. + +
+
+ comment: EMNLP 2024 camera-ready version. Code is released at + https://github.com/hrlics/SemDI +
+
+
+
+
+ + ♻ ☆ Towards Scalability and Extensibility of Query Reformulation Modeling in + E-commerce Search + + +
+ Customer behavioral data significantly impacts e-commerce search systems.
+However, in the case of less common queries, the associated behavioral data
+tends to be sparse and noisy, offering inadequate support to the search
+mechanism. To address this challenge, the concept of query reformulation has
+been introduced. It suggests that less common queries could utilize the
+behavior patterns of their popular counterparts with similar meanings. In
+Amazon product search, query reformulation has displayed its effectiveness in
+improving search relevance and bolstering overall revenue. Nonetheless,
+adapting this method for smaller or emerging businesses operating in regions
+with lower traffic and complex multilingual settings poses challenges in terms
+of scalability and extensibility. This study focuses on overcoming this
+challenge by constructing a query reformulation solution that functions
+effectively even with training data that is limited in quality and scale, and
+with relatively complex linguistic characteristics. In this paper, we provide
+an overview of the solution implemented within the Amazon product search
+infrastructure, which encompasses a range of elements, including refining the
+data mining process, redefining model training objectives, and reshaping
+training strategies. The effectiveness of the proposed solution is validated
+through online A/B testing on search ranking and Ads matching. Notably,
+employing the proposed solution in search ranking resulted in 0.14% and 0.29%
+increases in overall revenue in the Japanese and Hindi cases, respectively, and
+a 0.08% incremental gain in the English case compared to the legacy
+implementation, while in search Ads matching it led to a 0.36% increase in Ads
+revenue in the Japanese case.
+
+
+
+
+
+ + ♻ ☆ OneGen: Efficient One-Pass Unified Generation and Retrieval for LLMs EMNLP 2024 + + +
+ Despite the recent advancements in Large Language Models (LLMs), which have +significantly enhanced the generative capabilities for various NLP tasks, LLMs +still face limitations in directly handling retrieval tasks. However, many +practical applications demand the seamless integration of both retrieval and +generation. This paper introduces a novel and efficient One-pass Generation and +retrieval framework (OneGen), designed to improve LLMs' performance on tasks +that require both generation and retrieval. The proposed framework bridges the +traditionally separate training approaches for generation and retrieval by +incorporating retrieval tokens generated autoregressively. This enables a +single LLM to handle both tasks simultaneously in a unified forward pass. We +conduct experiments on two distinct types of composite tasks, RAG and Entity +Linking, to validate the pluggability, effectiveness, and efficiency of OneGen +in training and inference. Furthermore, our results show that integrating +generation and retrieval within the same context preserves the generative +capabilities of LLMs while improving retrieval performance. To the best of our +knowledge, OneGen is the first to enable LLMs to conduct vector retrieval +during the generation. + +
+
+ comment: EMNLP 2024 Findings; code is available at + https://github.com/zjunlp/OneGen +
+
+
+
+
+ + ♻ ☆ Block-Diagonal Orthogonal Relation and Matrix Entity for Knowledge Graph + Embedding EMNLP2024 + + +
+ The primary aim of Knowledge Graph embeddings (KGE) is to learn +low-dimensional representations of entities and relations for predicting +missing facts. While rotation-based methods like RotatE and QuatE perform well +in KGE, they face two challenges: limited model flexibility requiring +proportional increases in relation size with entity dimension, and difficulties +in generalizing the model for higher-dimensional rotations. To address these +issues, we introduce OrthogonalE, a novel KGE model employing matrices for +entities and block-diagonal orthogonal matrices with Riemannian optimization +for relations. This approach enhances the generality and flexibility of KGE +models. The experimental results indicate that our new KGE model, OrthogonalE, +is both general and flexible, significantly outperforming state-of-the-art KGE +models while substantially reducing the number of relation parameters. + +
+
+ comment: EMNLP2024 findings (Long) +
+
+
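A block-diagonal orthogonal relation matrix can be assembled, for instance, from independent 2x2 plane rotations, which keeps the number of relation parameters linear in the embedding dimension. The toy sketch below shows that construction and a simple distance-based score; OrthogonalE additionally represents entities as matrices and optimizes with Riemannian methods, which are omitted here, and all sizes are illustrative.

```python
import torch

def block_diag_rotation(angles: torch.Tensor) -> torch.Tensor:
    """Build a (2k x 2k) block-diagonal orthogonal matrix from k rotation
    angles; each 2x2 block is a plane rotation, so the result is orthogonal
    with only k free parameters."""
    blocks = []
    for a in angles:
        c, s = torch.cos(a), torch.sin(a)
        blocks.append(torch.stack([torch.stack([c, -s]), torch.stack([s, c])]))
    return torch.block_diag(*blocks)

def score(head, rel_angles, tail):
    """Negative distance score in the spirit of rotation-based KGE models."""
    R = block_diag_rotation(rel_angles)
    return -torch.linalg.norm(head @ R.T - tail)

dim = 8
h, t = torch.randn(dim), torch.randn(dim)
angles = torch.randn(dim // 2, requires_grad=True)   # relation parameters
print(score(h, angles, t))
```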
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Social Media Authentication and Combating Deepfakes using Semi-fragile + Invisible Image Watermarking + + +
+ With the significant advances in deep generative models for image and video
+synthesis, Deepfakes and manipulated media have raised severe societal
+concerns. Conventional machine learning classifiers for deepfake detection
+often fail to cope with evolving deepfake generation technology and are
+susceptible to adversarial attacks. Alternatively, invisible image watermarking
+is being researched as a proactive defense technique that allows media
+authentication by verifying an invisible secret message embedded in the image
+pixels. A handful of invisible image watermarking techniques introduced for
+media authentication have proven vulnerable to basic image processing
+operations and watermark removal attacks. In response, we have proposed a
+semi-fragile image watermarking technique that embeds an invisible secret
+message into real images for media authentication. Our proposed watermarking
+framework is designed to be fragile to facial manipulations or tampering while
+being robust to benign image-processing operations and watermark removal
+attacks. This is facilitated through a unique architecture of our proposed
+technique consisting of critic and adversarial networks that enforce high image
+quality and resiliency to watermark removal efforts, respectively, along with
+the backbone encoder-decoder and the discriminator networks. Thorough
+experimental investigations on SOTA facial Deepfake datasets demonstrate that
+our proposed model can embed a $64$-bit secret as an imperceptible image
+watermark that can be recovered with high bit-recovery accuracy when benign
+image processing operations are applied, while being non-recoverable when
+unseen Deepfake manipulations are applied. In addition, our proposed
+watermarking technique demonstrates high resilience to several white-box and
+black-box watermark removal attacks, thus obtaining state-of-the-art
+performance.
+
+
+ comment: ACM Transactions (Digital Threats: Research and Practice) +
+
+
+
+
+ + ☆ RADAR: Robust Two-stage Modality-incomplete Industrial Anomaly Detection + + +
+ Multimodal Industrial Anomaly Detection (MIAD), utilizing 3D point clouds and
+2D RGB images to identify the abnormal region of products, plays a crucial role
+in industrial quality inspection. However, the conventional MIAD setting
+presupposes that all 2D and 3D modalities are paired, overlooking the fact that
+multimodal data collected from the real world is often imperfect due to missing
+modalities. Consequently, MIAD models that demonstrate robustness against
+modal-incomplete data are highly desirable in practice. To address this
+practical challenge, we introduce a first-of-its-kind study that
+comprehensively investigates Modality-Incomplete Industrial Anomaly Detection
+(MIIAD), to consider the imperfect learning environment in which the multimodal
+information may be incomplete. Not surprisingly, we discovered that most
+existing MIAD approaches are inadequate for addressing MIIAD challenges,
+leading to significant performance degradation on the MIIAD benchmark we
+developed. In this paper, we propose a novel two-stage Robust
+modAlity-incomplete fusing and Detecting frAmewoRk, abbreviated as RADAR. Our
+bootstrapping philosophy is to enhance two stages in MIIAD, improving the
+robustness of the Multimodal Transformer: i) In feature fusion, we first
+explore learning modality-incomplete instruction, guiding the pre-trained
+Multimodal Transformer to robustly adapt to various modality-incomplete
+scenarios, and implement adaptive parameter learning based on a HyperNetwork;
+ii) In anomaly detection, we construct a real-pseudo hybrid module to highlight
+the distinctiveness of modality combinations, further enhancing the robustness
+of the MIIAD model. Our experimental results demonstrate that the proposed
+RADAR significantly surpasses conventional MIAD methods in terms of
+effectiveness and robustness on our newly created MIIAD dataset, underscoring
+its practical application value.
+
+
+
+
+
+ + ☆ Harnessing the Latent Diffusion Model for Training-Free Image Style + Transfer + + +
+ Diffusion models have recently shown the ability to generate high-quality
+images. However, controlling their generation process still poses challenges.
+The image style transfer task, which transfers the visual attributes of a style
+image to another content image, is one of those challenges. A typical obstacle
+in this task is the requirement of additional training of a pre-trained model.
+We propose a training-free style transfer algorithm, Style Tracking Reverse
+Diffusion Process (STRDP), for a pretrained Latent Diffusion Model (LDM). Our
+algorithm employs the Adaptive Instance Normalization (AdaIN) function in a
+distinct manner during the reverse diffusion process of an LDM while tracking
+the encoding history of the style image. This algorithm enables style transfer
+in the latent space of the LDM for reduced computational cost and provides
+compatibility with various LDM models. Through a series of experiments and a
+user study, we show that our method can quickly transfer the style of an image
+without additional training. The speed, compatibility, and training-free nature
+of our algorithm facilitate agile experiments with combinations of styles and
+LDMs for extensive application.
+
+
+
+
+
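The core operation reused above, Adaptive Instance Normalization, aligns the channel-wise statistics of a content feature map to those of a style feature map. A standard implementation is shown below; how and where the paper applies it inside the LDM's reverse diffusion is not reproduced here, and the tensor shapes are illustrative.

```python
import torch

def adain(content: torch.Tensor, style: torch.Tensor, eps: float = 1e-5):
    """Adaptive Instance Normalization: re-scale the content feature map so
    its per-channel mean and std match those of the style feature map.
    Both tensors: (batch, channels, height, width)."""
    c_mean = content.mean(dim=(2, 3), keepdim=True)
    c_std = content.std(dim=(2, 3), keepdim=True) + eps
    s_mean = style.mean(dim=(2, 3), keepdim=True)
    s_std = style.std(dim=(2, 3), keepdim=True) + eps
    return s_std * (content - c_mean) / c_std + s_mean

# e.g., applied to latent feature maps of a content and a style image
stylized = adain(torch.randn(1, 4, 64, 64), torch.randn(1, 4, 64, 64))
```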
+ + ♻ ☆ Progressive Frame Patching for FoV-based Point Cloud Video Streaming + + +
+ Many XR applications require the delivery of volumetric video to users with +six degrees of freedom (6-DoF) movements. Point Cloud has become a popular +volumetric video format. A dense point cloud consumes much higher bandwidth +than a 2D/360 degree video frame. User Field of View (FoV) is more dynamic with +6-DoF movement than 3-DoF movement. To save bandwidth, FoV-adaptive streaming +predicts a user's FoV and only downloads point cloud data falling in the +predicted FoV. However, it is vulnerable to FoV prediction errors, which can be +significant when a long buffer is utilized for smoothed streaming. In this +work, we propose a multi-round progressive refinement framework for point cloud +video streaming. Instead of sequentially downloading point cloud frames, our +solution simultaneously downloads/patches multiple frames falling into a +sliding time-window, leveraging the inherent scalability of octree-based +point-cloud coding. The optimal rate allocation among all tiles of active +frames are solved analytically using the heterogeneous tile rate-quality +functions calibrated by the predicted user FoV. Multi-frame +downloading/patching simultaneously takes advantage of the streaming smoothness +resulting from long buffer and the FoV prediction accuracy at short buffer +length. We evaluate our streaming solution using simulations driven by real +point cloud videos, real bandwidth traces, and 6-DoF FoV traces of real users. +Our solution is robust against the bandwidth/FoV prediction errors, and can +deliver high and smooth view quality in the face of bandwidth variations and +dynamic user and point cloud movements. + +
+
+ comment: Transactions on Multimedia (under review) +
+
+
+
+
+ + ♻ ☆ DiffSSD: A Diffusion-Based Dataset For Speech Forensics ICASSP + + +
+ Diffusion-based speech generators are ubiquitous. These methods can generate +very high quality synthetic speech and several recent incidents report their +malicious use. To counter such misuse, synthetic speech detectors have been +developed. Many of these detectors are trained on datasets which do not include +diffusion-based synthesizers. In this paper, we demonstrate that existing +detectors trained on one such dataset, ASVspoof2019, do not perform well in +detecting synthetic speech from recent diffusion-based synthesizers. We propose +the Diffusion-Based Synthetic Speech Dataset (DiffSSD), a dataset consisting of +about 200 hours of labeled speech, including synthetic speech generated by 8 +diffusion-based open-source and 2 commercial generators. We also examine the +performance of existing synthetic speech detectors on DiffSSD in both +closed-set and open-set scenarios. The results highlight the importance of this +dataset in detecting synthetic speech generated from recent open-source and +commercial speech generators. + +
+
+ comment: Submitted to IEEE International Conference on Acoustics, Speech, and + Signal Processing (ICASSP) 2025 +
+
+
+
+
+ + ♻ ☆ Beyond Single-Audio: Advancing Multi-Audio Processing in Audio Large + Language Models EMNLP24 + + +
+ Various audio-LLMs (ALLMs) have been explored recently for tackling different
+audio tasks simultaneously using a single, unified model. While existing
+evaluations of ALLMs primarily focus on single-audio tasks, real-world
+applications often involve processing multiple audio streams simultaneously. To
+bridge this gap, we propose the first multi-audio evaluation (MAE) benchmark
+that consists of 20 datasets from 11 multi-audio tasks encompassing both speech
+and sound scenarios. Comprehensive experiments on MAE demonstrate that existing
+ALLMs, while being powerful in comprehending primary audio elements in
+individual audio inputs, struggle to handle multi-audio scenarios. To this end,
+we propose a novel multi-audio-LLM (MALLM) to capture audio context among
+multiple similar audios using discriminative learning on our proposed synthetic
+data. The results demonstrate that the proposed MALLM outperforms all baselines
+and achieves high data efficiency using synthetic data without requiring human
+annotations. The proposed MALLM opens the door for ALLMs to the multi-audio
+processing era and brings us closer to replicating human auditory capabilities
+in machines.
+
+
+ comment: EMNLP24 Findings +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ Conversational Exploratory Search of Scholarly Publications Using + Knowledge Graphs SP 2024 + + +
+ Traditional search methods primarily depend on string matches, while semantic +search targets concept-based matches by recognizing underlying intents and +contextual meanings of search terms. Semantic search is particularly beneficial +for discovering scholarly publications where differences in vocabulary between +users' search terms and document content are common, often yielding irrelevant +search results. Many scholarly search engines have adopted knowledge graphs to +represent semantic relations between authors, publications, and research +concepts. However, users may face challenges when navigating these graphical +search interfaces due to the complexity and volume of data, which impedes their +ability to discover publications effectively. To address this problem, we +developed a conversational search system for exploring scholarly publications +using a knowledge graph. We outline the methodical approach for designing and +implementing the proposed system, detailing its architecture and functional +components. To assess the system's effectiveness, we employed various +performance metrics and conducted a human evaluation with 40 participants, +demonstrating how the conversational interface compares against a graphical +interface with traditional text search. The findings from our evaluation +provide practical insights for advancing the design of conversational search +systems. + +
+
+ comment: Accepted to ICNLSP 2024 +
+
+
+
+
+ + ☆ TPN: Transferable Proto-Learning Network towards Few-shot Document-Level + Relation Extraction + + +
+ Few-shot document-level relation extraction suffers from poor performance due +to the challenging cross-domain transferability of NOTA (none-of-the-above) +relation representation. In this paper, we introduce a Transferable +Proto-Learning Network (TPN) to address the challenging issue. It comprises +three core components: Hybrid Encoder hierarchically encodes semantic content +of input text combined with attention information to enhance the relation +representations. As a plug-and-play module for Out-of-Domain (OOD) Detection, +Transferable Proto-Learner computes NOTA prototype through an adaptive +learnable block, effectively mitigating NOTA bias across various domains. +Dynamic Weighting Calibrator detects relation-specific classification +confidence, serving as dynamic weights to calibrate the NOTA-dominant loss +function. Finally, to bolster the model's cross-domain performance, we +complement it with virtual adversarial training (VAT). We conduct extensive +experimental analyses on FREDo and ReFREDo, demonstrating the superiority of +TPN. Compared to state-of-the-art methods, our approach achieves competitive +performance with approximately half the parameter size. Data and code are +available at https://github.com/EchoDreamer/TPN. + +
+
+ comment: Few shot document-level relation extraction +
+
+
+
+
+ + ☆ ECORS: An Ensembled Clustering Approach to Eradicate The Local And + Global Outlier In Collaborative Filtering Recommender System + + +
+ Recommender systems are designed to suggest items based on user preferences, +helping users navigate the vast amount of information available on the +internet. Given the overwhelming content, outlier detection has emerged as a +key research area in recommender systems. It involves identifying unusual or +suspicious patterns in user behavior. However, existing studies in this field +face several challenges, including the limited universality of algorithms, +difficulties in selecting users, and a lack of optimization. In this paper, we +propose an approach that addresses these challenges by employing various +clustering algorithms. Specifically, we utilize a user-user matrix-based +clustering technique to detect outliers. By constructing a user-user matrix, we +can identify suspicious users in the system. Both local and global outliers are +detected to ensure comprehensive analysis. Our experimental results demonstrate +that this approach significantly improves the accuracy of outlier detection in +recommender systems. + +
+
+ comment: 6 pages, 5 figures +
+
+
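One way to realize the user-user-matrix idea described above is to cluster the rows of a user similarity matrix and flag users who land in unusually small clusters as candidate outliers. The sketch below uses scikit-learn; the choice of KMeans, the thresholds, and the synthetic ratings are illustrative assumptions rather than the paper's exact ensemble of clustering algorithms.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

def flag_outlier_users(ratings: np.ndarray, n_clusters=5, min_cluster_frac=0.05):
    """ratings: (users, items) matrix. Build a user-user cosine-similarity
    matrix, cluster its rows, and flag users in unusually small clusters as
    candidate outliers."""
    user_sim = cosine_similarity(ratings)               # (users, users)
    labels = KMeans(n_clusters=n_clusters, n_init=10,
                    random_state=0).fit_predict(user_sim)
    sizes = np.bincount(labels, minlength=n_clusters)
    small = {c for c in range(n_clusters)
             if sizes[c] < min_cluster_frac * len(labels)}
    return np.array([lab in small for lab in labels])

rng = np.random.default_rng(0)
ratings = rng.integers(0, 6, size=(200, 50)).astype(float)
ratings[:5] = 5.0                                        # a small block of atypical users
print(np.where(flag_outlier_users(ratings))[0])
```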
+
+
+ + ♻ ☆ RecPrompt: A Self-tuning Prompting Framework for News Recommendation + Using Large Language Models + + +
+ News recommendations heavily rely on Natural Language Processing (NLP) +methods to analyze, understand, and categorize content, enabling personalized +suggestions based on user interests and reading behaviors. Large Language +Models (LLMs) like GPT-4 have shown promising performance in understanding +natural language. However, the extent of their applicability to news +recommendation systems remains to be validated. This paper introduces +RecPrompt, the first self-tuning prompting framework for news recommendation, +leveraging the capabilities of LLMs to perform complex news recommendation +tasks. This framework incorporates a news recommender and a prompt optimizer +that applies an iterative bootstrapping process to enhance recommendations +through automatic prompt engineering. Extensive experimental results with 400 +users show that RecPrompt can achieve an improvement of 3.36% in AUC, 10.49% in +MRR, 9.64% in nDCG@5, and 6.20% in nDCG@10 compared to deep neural models. +Additionally, we introduce TopicScore, a novel metric to assess explainability +by evaluating LLM's ability to summarize topics of interest for users. The +results show LLM's effectiveness in accurately identifying topics of interest +and delivering comprehensive topic-based explanations. + +
+
+ comment: 5 pages, 2 figures, and 2 tables +
+
+
+
+
+ + ♻ ☆ Visual Acuity Prediction on Real-Life Patient Data Using a Machine + Learning Based Multistage System + + +
+ In ophthalmology, intravitreal operative medication therapy (IVOM) is a
+widespread treatment for diseases related to the age-related macular
+degeneration (AMD), the diabetic macular edema (DME), as well as the retinal
+vein occlusion (RVO). However, in real-world settings, patients often suffer
+from loss of vision on time scales of years despite therapy, whereas the
+prediction of the visual acuity (VA) and the earliest possible detection of
+deterioration under real-life conditions is challenging due to heterogeneous
+and incomplete data. In this contribution, we present a workflow for the
+development of a research-compatible data corpus fusing different IT systems of
+the department of ophthalmology of a German maximum care hospital. The
+extensive data corpus allows predictive statements of the expected progression
+of a patient and his or her VA in each of the three diseases. For AMD, we found
+a significant deterioration of visual acuity over time. Within our proposed
+multistage system, we subsequently classify the VA progression into the three
+groups of therapy "winners", "stabilizers", and "losers" (WSL classification
+scheme). Our OCT biomarker classification using an ensemble of deep neural
+networks results in a classification accuracy (F1-score) of over 98%, enabling
+us to complete incomplete OCT documentations while allowing us to exploit them
+for a more precise VA modeling process. Our VA prediction requires at least
+four VA examinations and optionally OCT biomarkers from the same time period to
+predict the VA progression within a forecasted time frame, whereas our
+prediction is currently restricted to IVOM / no therapy. We achieve a final
+prediction accuracy of 69% in macro-average F1-score, which is in the same
+range as the ophthalmologists with 57.8% and 50 ± 10.7% F1-score.
+
+
+ comment: Accepted for: Scientific Reports +
+
+
+
+
+ + ♻ ☆ Automated Peer Reviewing in Paper SEA: Standardization, Evaluation, and + Analysis EMNLP 2024 + + +
+ In recent years, the rapid increase in scientific papers has overwhelmed +traditional review mechanisms, resulting in varying quality of publications. +Although existing methods have explored the capabilities of Large Language +Models (LLMs) for automated scientific reviewing, their generated contents are +often generic or partial. To address the issues above, we introduce an +automated paper reviewing framework SEA. It comprises three modules: +Standardization, Evaluation, and Analysis, which are represented by models +SEA-S, SEA-E, and SEA-A, respectively. Initially, SEA-S distills the data +standardization capabilities of GPT-4 for integrating multiple reviews for a +paper. Then, SEA-E utilizes standardized data for fine-tuning, enabling it to +generate constructive reviews. Finally, SEA-A introduces a new evaluation +metric called the mismatch score to assess the consistency between paper contents +and reviews. Moreover, we design a self-correction strategy to enhance the +consistency. Extensive experimental results on datasets collected from eight +venues show that SEA can generate valuable insights for authors to improve +their papers. +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ FELRec: Efficient Handling of Item Cold-Start With Dynamic + Representation Learning in Recommender Systems + + +
+ Recommender systems suffer from the cold-start problem whenever a new user +joins the platform or a new item is added to the catalog. To address item +cold-start, we propose to replace the embedding layer in sequential +recommenders with a dynamic storage that has no learnable weights and can keep +an arbitrary number of representations. In this paper, we present FELRec, a +large embedding network that refines the existing representations of users and +items in a recursive manner, as new information becomes available. In contrast +to similar approaches, our model represents new users and items without side +information and time-consuming finetuning; instead, it runs a single forward +pass over a sequence of existing representations. During item cold-start, our +method outperforms similar methods by 29.50%-47.45%. Further, our proposed model +generalizes well to previously unseen datasets in zero-shot settings. The +source code is publicly available at https://github.com/kweimann/FELRec . +
+
+
+
+
+ + ♻ ☆ Enhancing High-order Interaction Awareness in LLM-based Recommender + Model EMNLP 2024 + + +
+ Large language models (LLMs) have demonstrated prominent reasoning +capabilities in recommendation tasks by transforming them into text-generation +tasks. However, existing approaches either disregard or ineffectively model the +user-item high-order interactions. To this end, this paper presents an enhanced +LLM-based recommender (ELMRec). We enhance whole-word embeddings to +substantially improve LLMs' interpretation of graph-constructed interactions +for recommendations, without requiring graph pre-training. This finding may +inspire endeavors to incorporate rich knowledge graphs into LLM-based +recommenders via whole-word embeddings. We also found that LLMs often recommend +items based on users' earlier interactions rather than recent ones, and present +a reranking solution. Our ELMRec outperforms state-of-the-art (SOTA) methods in +both direct and sequential recommendations. +
+
+ comment: Long paper accepted to EMNLP 2024 Main. 16 pages +
+
+
+
+
+ + ♻ ☆ Causality-Inspired Fair Representation Learning for Multimodal + Recommendation + + +
+ Recently, multimodal recommendations (MMR) have gained increasing attention +for alleviating the data sparsity problem of traditional recommender systems by +incorporating modality-based representations. Although MMR exhibit notable +improvements in recommendation accuracy, we empirically validate that an +increase in the quantity or variety of modalities leads to a higher degree of +users' sensitive information leakage due to entangled causal relationships, +risking fair representation learning. On the other hand, existing fair +representation learning approaches are mostly based on the assumption that +sensitive information is solely leaked from users' interaction data and do not +explicitly model the causal relationships introduced by multimodal data, which +limits their applicability in multimodal scenarios. To fill this gap, we propose +FMMRec, in which we disentangle biased and filtered modal embeddings, inspired by +causal inference techniques, enabling the mining of modality-based unfair and fair +user-user relations, thereby enhancing the fairness and informativeness of user +representations. By addressing the causal effects of sensitive attributes on +user preferences, our approach aims to achieve counterfactual fairness in +multimodal recommendations. Experiments on two public datasets demonstrate the +superiority of our FMMRec relative to the state-of-the-art baselines. Our +source code is available at https://github.com/WeixinChen98/FMMRec. +
+
+
+
+
+ + ♻ ☆ ClimRetrieve: A Benchmarking Dataset for Information Retrieval from + Corporate Climate Disclosures + + +
+ To handle the vast amounts of qualitative data produced in corporate climate +communication, stakeholders increasingly rely on Retrieval Augmented Generation +(RAG) systems. However, a significant gap remains in evaluating domain-specific +information retrieval - the basis for answer generation. To address this +challenge, this work simulates the typical tasks of a sustainability analyst by +examining 30 sustainability reports with 16 detailed climate-related questions. +As a result, we obtain a dataset with over 8.5K unique question-source-answer +pairs labeled by different levels of relevance. Furthermore, we develop a use +case with the dataset to investigate the integration of expert knowledge into +information retrieval with embeddings. Although we show that incorporating +expert knowledge works, we also outline the critical limitations of embeddings +in knowledge-intensive downstream domains like climate change communication. + +
+
+
+
+
+ + ♻ ☆ FLEX: Expert-level False-Less EXecution Metric for Reliable Text-to-SQL + Benchmark + + +
+ Text-to-SQL technology has become crucial for translating natural language +into SQL queries in various industries, enabling non-technical users to perform +complex data operations. The need for accurate evaluation methods has increased +as these systems have grown more sophisticated. However, we found that the +Execution Accuracy (EX), the most promising evaluation metric, still shows a +substantial portion of false positives and negatives compared to human +evaluation. Thus, this paper introduces FLEX (False-Less EXecution), a novel +approach to evaluating text-to-SQL systems using large language models (LLMs) +to emulate human expert-level evaluation of SQL queries. Our method shows +significantly higher agreement with human expert judgments, improving Cohen's +kappa from 61 to 78.17. Re-evaluating top-performing models on the Spider and +BIRD benchmarks using FLEX reveals substantial shifts in performance rankings, +with an average performance decrease of 3.15 due to false positive corrections +and an increase of 6.07 from addressing false negatives. This work contributes +to a more accurate and nuanced evaluation of text-to-SQL systems, potentially +reshaping our understanding of state-of-the-art performance in this field. + +
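+ Since the comparison above is reported through Cohen's kappa, a small worked
+example of the statistic may help; the confusion counts below are invented for
+illustration and are unrelated to the paper's measurements.
+
+def cohens_kappa(both_correct, judge_only, human_only, both_wrong):
+    # Agreement between an automatic judge and a human expert on binary
+    # correct/incorrect verdicts, corrected for chance agreement.
+    n = both_correct + judge_only + human_only + both_wrong
+    p_observed = (both_correct + both_wrong) / n
+    p_judge = (both_correct + judge_only) / n        # judge says "correct"
+    p_human = (both_correct + human_only) / n        # human says "correct"
+    p_chance = p_judge * p_human + (1 - p_judge) * (1 - p_human)
+    return (p_observed - p_chance) / (1 - p_chance)
+
+print(round(cohens_kappa(70, 10, 5, 15), 3))         # -> 0.571
+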
+
+ comment: preprint, under review +
+
+
+
+
+ + ♻ ☆ Deep Tree-based Retrieval for Efficient Recommendation: Theory and + Method + + +
+ With the development of deep learning techniques, deep recommendation models +also achieve remarkable improvements in terms of recommendation accuracy. +However, due to the large number of candidate items in practice and the high +cost of preference computation, these methods also suffer from low efficiency +of recommendation. The recently proposed tree-based deep recommendation models +alleviate the problem by directly learning tree structure and representations +under the guidance of recommendation objectives. However, such models have +shortcomings. The max-heap assumption in the hierarchical tree, in which the +preference for a parent node should be the maximum between the preferences for +its children, is difficult to satisfy in their binary classification +objectives. To this end, we propose Tree-based Deep Retrieval (TDR for short) +for efficient recommendation. In TDR, all the trees generated during the +training process are retained to form the forest. When learning the node +representations of each tree, the max-heap assumption should be satisfied as much +as possible, and the training stage should mimic beam search behavior over the tree. +TDR achieves this by regarding the training task as multi-classification +over the tree nodes at the same level. However, the number of tree nodes grows +exponentially with the level, so we train the preference model with the +guidance of the sampled-softmax technique. The experiments are conducted on +real-world datasets, validating the effectiveness of the proposed preference +model learning method and tree learning method. +
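+ A minimal sketch of the level-wise training idea follows: the nodes of one
+tree level are treated as classes, and only a sampled subset of them is scored
+per step. The dimensions, the uniform negative sampling, and the omission of
+the usual sampled-softmax log-correction are simplifying assumptions, not the
+paper's exact procedure.
+
+import torch
+import torch.nn.functional as F
+
+num_nodes, dim, batch, num_neg = 1024, 32, 8, 64     # nodes at one tree level
+node_emb = torch.nn.Embedding(num_nodes, dim)        # node representations
+user_vec = torch.randn(batch, dim)                   # user preference vectors
+pos_node = torch.randint(0, num_nodes, (batch,))     # ground-truth node per user
+
+# Score the positive node plus a sampled pool of negatives instead of all nodes.
+neg_node = torch.randint(0, num_nodes, (batch, num_neg))
+cand = torch.cat([pos_node.unsqueeze(1), neg_node], dim=1)       # (batch, 1 + num_neg)
+logits = (user_vec.unsqueeze(1) * node_emb(cand)).sum(dim=-1)    # inner-product scores
+
+# Multi-class objective over the sampled candidates; index 0 is the positive.
+loss = F.cross_entropy(logits, torch.zeros(batch, dtype=torch.long))
+loss.backward()
+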
+
+
+
+
+ + ♻ ☆ A Prompting-Based Representation Learning Method for Recommendation with + Large Language Models + + +
+ In recent years, Recommender Systems (RS) have witnessed a transformative +shift with the advent of Large Language Models (LLMs) in the field of Natural +Language Processing (NLP). Models such as GPT-3.5/4 and Llama have demonstrated +unprecedented capabilities in understanding and generating human-like text. The +extensive information encoded in these pre-trained LLMs offers the potential to +capture a more profound semantic representation from different contextual +information of users and items. + While great potential lies behind the thriving of LLMs, the challenge of +leveraging user-item preferences from contextual information, and of aligning +them with improvements to Recommender Systems, needs to be addressed. Believing +that a better understanding of the user or item itself can be the key factor in +improving recommendation performance, we conduct research on generating +informative profiles using state-of-the-art LLMs. + To boost the linguistic abilities of LLMs in Recommender Systems, we +introduce the Prompting-Based Representation Learning Method for Recommendation +(P4R). In our P4R framework, we utilize the LLM prompting strategy to create +personalized item profiles. These profiles are then transformed into semantic +representation spaces using a pre-trained BERT model for text embedding. +Furthermore, we incorporate a Graph Convolution Network (GCN) for collaborative +filtering representation. The P4R framework aligns these two embedding spaces +in order to address general recommendation tasks. In our evaluation, we +compare P4R with state-of-the-art Recommender models and assess the quality of +prompt-based profile generation. +
+
+ comment: Risks: The 1st International Workshop on Risks, Opportunities, and + Evaluation of Generative Models in Recommendation +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ Graph-based Scalable Sampling of 3D Point Cloud Attributes + + +
+ 3D Point clouds (PCs) are commonly used to represent 3D scenes. They can have +millions of points, making subsequent downstream tasks such as compression and +streaming computationally expensive. PC sampling (selecting a subset of points) +can be used to reduce complexity. Existing PC sampling algorithms focus on +preserving geometry features and often do not scale to handle large PCs. In +this work, we develop scalable graph-based sampling algorithms for PC color +attributes, assuming the full geometry is available. Our sampling algorithms +are optimized for a signal reconstruction method that minimizes the graph +Laplacian quadratic form. We first develop a global sampling algorithm that can +be applied to PCs with millions of points by exploiting sparsity and sampling +rate adaptive parameter selection. Further, we propose a block-based sampling +strategy where each block is sampled independently. We show that sampling the +corresponding sub-graphs with optimally chosen self-loop weights (node weights) +will produce a sampling set that approximates the results of global sampling +while reducing complexity by an order of magnitude. Our empirical results on +two large PC datasets show that our algorithms outperform the existing fast PC +subsampling techniques (uniform and geometry feature preserving random +sampling) by 2dB. Our algorithm is up to 50 times faster than existing graph +signal sampling algorithms while providing better reconstruction accuracy. +Finally, we illustrate the efficacy of PC attribute sampling within a +compression scenario, showing that pre-compression sampling of PC attributes +can lower the bitrate by 11% while having minimal effect on reconstruction. + +
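+ The reconstruction objective mentioned above can be illustrated on a tiny
+graph: fixing the sampled attribute values and minimizing the Laplacian
+quadratic form x^T L x reduces to one small linear solve for the unsampled
+nodes. The four-node graph and the attribute values are illustrative, and the
+paper's sampling-set selection is not shown.
+
+import numpy as np
+
+W = np.array([[0, 1, 1, 0],
+              [1, 0, 1, 1],
+              [1, 1, 0, 1],
+              [0, 1, 1, 0]], dtype=float)   # adjacency of a small graph
+L = np.diag(W.sum(axis=1)) - W              # combinatorial graph Laplacian
+
+sampled, unsampled = np.array([0, 3]), np.array([1, 2])
+x_s = np.array([1.0, 0.0])                  # known attribute values on sampled nodes
+
+# Setting the gradient of x^T L x to zero on the unsampled block gives
+# L_uu x_u = -L_us x_s, i.e. a smooth interpolation of the known values.
+L_uu = L[np.ix_(unsampled, unsampled)]
+L_us = L[np.ix_(unsampled, sampled)]
+x_u = np.linalg.solve(L_uu, -L_us @ x_s)
+print(x_u)                                  # -> [0.5 0.5]
+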
+
+ comment: 13 pages, 13 Figures +
+
+
+
+
+ + ☆ Energy-Quality-aware Variable Framerate Pareto-Front for Adaptive Video + Streaming + + +
+ Optimizing framerate for a given bitrate-spatial resolution pair in adaptive +video streaming is essential to maintain perceptual quality while considering +decoding complexity. Low framerates at low bitrates reduce compression +artifacts and decrease decoding energy. We propose a novel method, +Decoding-complexity aware Framerate Prediction (DECODRA), which employs a +Variable Framerate Pareto-front approach to predict an optimized framerate that +minimizes decoding energy under quality degradation constraints. DECODRA +dynamically adjusts the framerate based on current bitrate and spatial +resolution, balancing trade-offs between framerate, perceptual quality, and +decoding complexity. Extensive experimentation with the Inter-4K dataset +demonstrates DECODRA's effectiveness, yielding an average decoding energy +reduction of up to 13.45%, with minimal VMAF reduction of 0.33 points at a +low-quality degradation threshold, compared to the default 60 fps encoding. +Even at an aggressive threshold, DECODRA achieves significant energy savings of +13.45% while only reducing VMAF by 2.11 points. In this way, DECODRA extends +mobile device battery life and reduces the energy footprint of streaming +services by providing a more energy-efficient video streaming pipeline. + +
+
+ comment: Accepted at IEEE International Conference on Visual Communications + and Image Processing (VCIP) 2024 +
+
+
+
+
+ + ☆ Maximum entropy and quantized metric models for absolute category + ratings + + +
+ The datasets of most image quality assessment studies contain ratings on a +categorical scale with five levels, from bad (1) to excellent (5). For each +stimulus, the number of ratings from 1 to 5 is summarized and given in the form +of the mean opinion score. In this study, we investigate families of +multinomial probability distributions parameterized by mean and variance that +are used to fit the empirical rating distributions. To this end, we consider +quantized metric models based on continuous distributions that model perceived +stimulus quality on a latent scale. The probabilities for the rating categories +are determined by quantizing the corresponding random variables using threshold +values. Furthermore, we introduce a novel discrete maximum entropy distribution +for a given mean and variance. We compare the performance of these models and +the state of the art given by the generalized score distribution for two large +data sets, KonIQ-10k and VQEG HDTV. Given an input distribution of ratings, our +fitted two-parameter models predict unseen ratings better than the empirical +distribution. In contrast to empirical ACR distributions and their discrete +models, our continuous models can provide fine-grained estimates of quantiles +of quality of experience that are relevant to service providers to satisfy a +target fraction of the user population. + +
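+ As a concrete illustration of a quantized metric model, the snippet below
+thresholds a latent Gaussian quality score at 1.5, 2.5, 3.5 and 4.5 to obtain
+the five ACR category probabilities; the parameter values are made up and the
+maximum-entropy variant is not shown.
+
+import numpy as np
+from scipy.stats import norm
+
+def acr_probabilities(mu, sigma):
+    # Latent continuous quality ~ N(mu, sigma^2), quantized into categories 1..5.
+    thresholds = np.array([-np.inf, 1.5, 2.5, 3.5, 4.5, np.inf])
+    cdf = norm.cdf(thresholds, loc=mu, scale=sigma)
+    return np.diff(cdf)                       # P(rating = 1), ..., P(rating = 5)
+
+p = acr_probabilities(mu=3.8, sigma=0.9)
+print(p.round(3), "model mean:", round(float((np.arange(1, 6) * p).sum()), 2))
+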
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ VideoCLIP-XL: Advancing Long Description Understanding for Video CLIP + Models EMNLP 2024 + + +
+ Contrastive Language-Image Pre-training (CLIP) has been widely studied and +applied in numerous applications. However, the emphasis on brief summary texts +during pre-training prevents CLIP from understanding long descriptions. This +issue is particularly acute regarding videos given that videos often contain +abundant detailed contents. In this paper, we propose the VideoCLIP-XL (eXtra +Length) model, which aims to unleash the long-description understanding +capability of video CLIP models. Firstly, we establish an automatic data +collection system and gather a large-scale VILD pre-training dataset with VIdeo +and Long-Description pairs. Then, we propose Text-similarity-guided Primary +Component Matching (TPCM) to better learn the distribution of feature space +while expanding the long description capability. We also introduce two new +tasks namely Detail-aware Description Ranking (DDR) and Hallucination-aware +Description Ranking (HDR) for further understanding improvement. Finally, we +construct a Long Video Description Ranking (LVDR) benchmark for evaluating the +long-description capability more comprehensively. Extensive experimental +results on widely-used text-video retrieval benchmarks with both short and long +descriptions and our LVDR benchmark can fully demonstrate the effectiveness of +our method. + +
+
+ comment: EMNLP 2024 Main conference +
+
+
+
+
+ + ☆ STanH : Parametric Quantization for Variable Rate Learned Image + Compression + + +
+ In end-to-end learned image compression, encoder and decoder are jointly +trained to minimize an $R + {\lambda}D$ cost function, where ${\lambda}$ +controls the trade-off between rate of the quantized latent representation and +image quality. Unfortunately, a distinct encoder-decoder pair with millions of +parameters must be trained for each ${\lambda}$, hence the need to switch +encoders and to store multiple encoders and decoders on the user device for +every target rate. This paper proposes to exploit a differentiable quantizer +designed around a parametric sum of hyperbolic tangents, called STanH, which +relaxes the step-wise quantization function. STanH is implemented as a +differentiable activation layer with learnable quantization parameters that can +be plugged into a pre-trained fixed-rate model and refined to achieve different +target bitrates. Experimental results show that our method enables variable +rate coding with comparable efficiency to the state-of-the-art, yet with +significant savings in terms of ease of deployment, training time, and storage +costs. +
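+ The core idea, a smooth staircase obtained by summing shifted hyperbolic
+tangents, can be sketched in a few lines; the parameterization below (step
+centres plus a shared temperature) only illustrates the general construction
+and is not the authors' exact layer.
+
+import torch
+
+class SoftStaircase(torch.nn.Module):
+    # Each tanh contributes one step centred at a learnable position; the
+    # temperature controls how closely the curve approaches hard rounding.
+    def __init__(self, centers, temperature=10.0):
+        super().__init__()
+        self.centers = torch.nn.Parameter(torch.tensor(centers, dtype=torch.float32))
+        self.temperature = torch.nn.Parameter(torch.tensor(float(temperature)))
+
+    def forward(self, x):
+        steps = torch.tanh(self.temperature * (x.unsqueeze(-1) - self.centers))
+        return 0.5 * steps.sum(dim=-1)        # differentiable everywhere
+
+quant = SoftStaircase(centers=[-1.5, -0.5, 0.5, 1.5])
+print(quant(torch.linspace(-3, 3, 7)).detach())
+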
+
+ comment: Submitted to IEEE Transactions on Image Processing +
+
+
+
+
+ + ♻ ☆ Spatial Visibility and Temporal Dynamics: Revolutionizing Field of View + Prediction in Adaptive Point Cloud Video Streaming + + +
+ Field-of-View (FoV) adaptive streaming significantly reduces bandwidth +requirement of immersive point cloud video (PCV) by only transmitting visible +points in a viewer's FoV. The traditional approaches often focus on +trajectory-based 6 degree-of-freedom (6DoF) FoV predictions. The predicted FoV +is then used to calculate point visibility. Such approaches do not explicitly +consider video content's impact on viewer attention, and the conversion from +FoV to point visibility is often error-prone and time-consuming. We reformulate +the PCV FoV prediction problem from the cell visibility perspective, allowing +for precise decision-making regarding the transmission of 3D data at the cell +level based on the predicted visibility distribution. We develop a novel +spatial visibility and object-aware graph model that leverages the historical +3D visibility data and incorporates spatial perception, neighboring cell +correlation, and occlusion information to predict the cell visibility in the +future. Our model significantly improves the long-term cell visibility +prediction, reducing the prediction MSE loss by up to 50% compared to the +state-of-the-art models while maintaining real-time performance (more than +30fps) for point cloud videos with over 1 million points. + +
+
+
+
+
+ + ♻ ☆ SVFAP: Self-supervised Video Facial Affect Perceiver + + +
+ Video-based facial affect analysis has recently attracted increasing +attention owing to its critical role in human-computer interaction. Previous +studies mainly focus on developing various deep learning architectures and +training them in a fully supervised manner. Although significant progress has +been achieved by these supervised methods, the longstanding lack of large-scale +high-quality labeled data severely hinders their further improvements. +Motivated by the recent success of self-supervised learning in computer vision, +this paper introduces a self-supervised approach, termed Self-supervised Video +Facial Affect Perceiver (SVFAP), to address the dilemma faced by supervised +methods. Specifically, SVFAP leverages masked facial video autoencoding to +perform self-supervised pre-training on massive unlabeled facial videos. +Considering that large spatiotemporal redundancy exists in facial videos, we +propose a novel temporal pyramid and spatial bottleneck Transformer as the +encoder of SVFAP, which not only largely reduces computational costs but also +achieves excellent performance. To verify the effectiveness of our method, we +conduct experiments on nine datasets spanning three downstream tasks, including +dynamic facial expression recognition, dimensional emotion recognition, and +personality recognition. Comprehensive results demonstrate that SVFAP can learn +powerful affect-related representations via large-scale self-supervised +pre-training and it significantly outperforms previous state-of-the-art methods +on all datasets. Code is available at https://github.com/sunlicai/SVFAP. + +
+
+ comment: Published in: IEEE Transactions on Affective Computing (Early + Access). The code and models are available at + https://github.com/sunlicai/SVFAP +
+
+
+
+
+ + ♻ ☆ BOLA360: Near-optimal View and Bitrate Adaptation for 360-degree Video + Streaming + + +
+ Recent advances in omnidirectional cameras and AR/VR headsets have spurred +the adoption of 360-degree videos that are widely believed to be the future of +online video streaming. 360-degree videos allow users to wear a head-mounted +display (HMD) and experience the video as if they are physically present in the +scene. Streaming high-quality 360-degree videos at scale is an unsolved problem +that is more challenging than traditional (2D) video delivery. The data rate +required to stream 360-degree videos is an order of magnitude more than +traditional videos. Further, the penalty for rebuffering events where the video +freezes or displays a blank screen is more severe as it may cause +cybersickness. We propose an online adaptive bitrate (ABR) algorithm for +360-degree videos called BOLA360 that runs inside the client's video player and +orchestrates the download of video segments from the server so as to maximize +the quality-of-experience (QoE) of the user. BOLA360 conserves bandwidth by +downloading only those video segments that are likely to fall within the +field-of-view (FOV) of the user. In addition, BOLA360 continually adapts the +bitrate of the downloaded video segments so as to enable a smooth playback +without rebuffering. We prove that BOLA360 is near-optimal with respect to an +optimal offline algorithm that maximizes QoE. Further, we evaluate BOLA360 on a +wide range of network and user head movement profiles and show that it provides +$13.6\%$ to $372.5\%$ more QoE than state-of-the-art algorithms. While ABR +algorithms for traditional (2D) videos have been well-studied over the last +decade, our work is the first ABR algorithm for 360-degree videos with both +theoretical and empirical guarantees on its performance. + +
+
+ comment: 27 pages +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ Text Clustering as Classification with LLMs + + +
+ Text clustering remains valuable in real-world applications where manual +labeling is cost-prohibitive. It facilitates efficient organization and +analysis of information by grouping similar texts based on their +representations. However, implementing this approach necessitates fine-tuned +embedders for downstream data and sophisticated similarity metrics. To address +this issue, this study presents a novel framework for text clustering that +effectively leverages the in-context learning capacity of Large Language Models +(LLMs). Instead of fine-tuning embedders, we propose to transform text +clustering into a classification task via an LLM. First, we prompt the LLM to generate +potential labels for a given dataset. Second, after integrating similar labels +generated by the LLM, we prompt the LLM to assign the most appropriate label to +each sample in the dataset. Our framework is experimentally shown to +achieve comparable or superior performance to state-of-the-art clustering +methods that employ embeddings, without requiring complex fine-tuning or +clustering algorithms. We make our code available to the public for utilization +at https://anonymous.4open.science/r/Text-Clustering-via-LLM-E500. +
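+ The two prompting steps described above can be sketched as follows; call_llm
+is a placeholder for whatever chat-completion client is used, the prompt
+templates are invented for illustration, and the label-merging step from the
+paper is reduced to simple deduplication.
+
+def call_llm(prompt: str) -> str:
+    raise NotImplementedError("plug in your own LLM client here")
+
+def cluster_with_llm(texts):
+    # Step 1: ask the model to propose candidate labels from a sample of the corpus.
+    sample = "\n".join(texts[:50])
+    raw = call_llm("Propose a short list of topic labels, one per line, "
+                   "for the following texts:\n" + sample)
+    labels = sorted({line.strip() for line in raw.splitlines() if line.strip()})
+
+    # Step 2: assign every text the single most appropriate label.
+    assignments = {}
+    for text in texts:
+        answer = call_llm("Labels: " + ", ".join(labels) + "\n"
+                          "Text: " + text + "\nAnswer with exactly one label.")
+        assignments[text] = answer.strip()
+    return labels, assignments
+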
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ☆ RecSys Challenge 2024: Balancing Accuracy and Editorial Values in News + Recommendations RecSys' 24 + + +
+ The RecSys Challenge 2024 aims to advance news recommendation by addressing +both the technical and normative challenges inherent in designing effective and +responsible recommender systems for news publishing. This paper describes the +challenge, including its objectives, problem setting, and the dataset provided +by the Danish news publishers Ekstra Bladet and JP/Politikens Media Group +("Ekstra Bladet"). The challenge explores the unique aspects of news +recommendation, such as modeling user preferences based on behavior, accounting +for the influence of the news agenda on user interests, and managing the rapid +decay of news items. Additionally, the challenge embraces normative +complexities, investigating the effects of recommender systems on news flow and +their alignment with editorial values. We summarize the challenge setup, +dataset characteristics, and evaluation metrics. Finally, we announce the +winners and highlight their contributions. The dataset is available at: +https://recsys.eb.dk. + +
+
+ comment: 5 pages, 3 tables, RecSys' 24 +
+
+
+
+
+ + ☆ Mixed-Precision Embeddings for Large-Scale Recommendation Models + + +
+ Embedding techniques have become essential components of large databases in +the deep learning era. By encoding discrete entities, such as words, items, or +graph nodes, into continuous vector spaces, embeddings facilitate more +efficient storage, retrieval, and processing in large databases. Especially in +the domain of recommender systems, millions of categorical features are encoded +as unique embedding vectors, which facilitates the modeling of similarities and +interactions among features. However, numerous embedding vectors can result in +significant storage overhead. In this paper, we aim to compress the embedding +table through quantization techniques. Given that features vary in importance +levels, we seek to identify an appropriate precision for each feature to +balance model accuracy and memory usage. To this end, we propose a novel +embedding compression method, termed Mixed-Precision Embeddings (MPE). +Specifically, to reduce the size of the search space, we first group features +by frequency and then search the precision for each feature group. MPE further +learns the probability distribution over precision levels for each feature +group, which can be used to identify the most suitable precision with a +specially designed sampling strategy. Extensive experiments on three public +datasets demonstrate that MPE significantly outperforms existing embedding +compression methods. Remarkably, MPE achieves about 200x compression on the +Criteo dataset without compromising the prediction accuracy. +
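+ The sketch below illustrates the frequency-grouped idea: frequent features
+keep more bits, rare ones fewer, and each group is quantized uniformly. The
+thresholds, bit widths, synthetic data, and the plain min-max quantizer are
+illustrative; the learned probability distribution over precision levels from
+the paper is not reproduced.
+
+import numpy as np
+
+def quantize(emb, bits):
+    lo, hi = emb.min(), emb.max()
+    levels = 2 ** bits - 1
+    codes = np.round((emb - lo) / (hi - lo + 1e-12) * levels).astype(np.int32)
+    return codes, (lo, hi, levels)
+
+def dequantize(codes, meta):
+    lo, hi, levels = meta
+    return lo + codes / levels * (hi - lo)
+
+rng = np.random.default_rng(0)
+table = rng.normal(size=(1000, 16)).astype(np.float32)   # full-precision embedding table
+freq = rng.zipf(2.0, size=1000)                          # synthetic feature frequencies
+bits_per_row = np.where(freq > 10, 8, np.where(freq > 2, 4, 2))
+
+for bits in (8, 4, 2):
+    rows = np.where(bits_per_row == bits)[0]
+    if len(rows) == 0:
+        continue
+    codes, meta = quantize(table[rows], bits)
+    err = np.abs(dequantize(codes, meta) - table[rows]).mean()
+    print(f"{bits}-bit group: {len(rows)} features, mean abs error {err:.4f}")
+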
+
+ comment: under submision +
+
+
+
+
+ + ☆ OM4OV: Leveraging Ontology Matching for Ontology Versioning + + +
+ Due to the dynamic nature of the semantic web, ontology version control is +required to capture time-varying information, most importantly for widely-used +ontologies. Despite the long-standing recognition of ontology versioning (OV) +as a crucial component for efficient ontology management, the growing size of +ontologies and accumulating errors caused by manual labour overwhelm current OV +approaches. In this paper, we propose yet another approach to performing OV +using existing ontology matching (OM) techniques and systems. We introduce a +unified OM4OV pipeline. From an OM perspective, we reconstruct a new task +formulation, performance measurement, and dataset construction for OV tasks. +Reusing the prior alignment(s) from OM, we also propose a cross-reference +mechanism to effectively reduce the matching candidate set and improve overall OV +performance. We experimentally validate the OM4OV pipeline and its +cross-reference mechanism using three datasets from the Ontology Alignment Evaluation +Initiative (OAEI) and report insights on using OM for OV tasks. +
+
+ comment: 7 pages, 7 figures, 1 table +
+
+
+
+
+ + ☆ ASTRA: Accurate and Scalable ANNS-based Training of Extreme Classifiers + + +
+ "Extreme Classification" (or XC) is the task of annotating data points +(queries) with relevant labels (documents), from an extremely large set of $L$ +possible labels, arising in search and recommendations. The most successful +deep learning paradigm that has emerged over the last decade or so for XC is to +embed the queries (and labels) using a deep encoder (e.g. DistilBERT), and use +linear classifiers on top of the query embeddings. This architecture is +appealing because it enables millisecond-time inference using approximate nearest +neighbor search (ANNS). The key question is how to design training +algorithms that are accurate and scale to $O(100M)$ labels on a limited +number of GPUs. + State-of-the-art XC techniques that demonstrate high accuracies (e.g., DEXML, +Renée, DEXA) on standard datasets have per-epoch training time that scales as +$O(L)$ or employ expensive negative sampling strategies, which are prohibitive +in XC scenarios. In this work, we develop an accurate and scalable XC algorithm +ASTRA with two key observations: (a) building an ANNS index on the classifier +vectors and retrieving hard negatives using the classifiers aligns the negative +sampling strategy to the loss function optimized; (b) keeping the ANNS indices +current as the classifiers change through the epochs is prohibitively expensive +while using stale negatives (refreshed periodically) results in poor accuracy; +to remedy this, we propose a negative sampling strategy that uses a mixture of +importance sampling and uniform sampling. By extensive evaluation on standard +XC as well as proprietary datasets with 120M labels, we demonstrate that ASTRA +achieves SOTA precision, while reducing training time by 4x-15x relative to the +second best. +
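+ The negative-sampling recipe sketched above can be illustrated as follows; a
+brute-force search stands in for a real (periodically refreshed) ANNS index
+over the classifier vectors, and the 16/16 hard-to-uniform split is an
+arbitrary illustrative choice.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+num_labels, dim = 10_000, 64
+classifiers = rng.normal(size=(num_labels, dim)).astype(np.float32)   # one vector per label
+
+def sample_negatives(query_emb, positives, k_hard=16, k_uniform=16):
+    # "Hard" negatives: labels whose classifiers score highest for the query
+    # (here via exact search; an ANNS index would return an approximation).
+    scores = classifiers @ query_emb
+    ranked = np.argsort(-scores)
+    hard = [int(l) for l in ranked[:k_hard + len(positives)] if int(l) not in positives][:k_hard]
+    # Mix in uniformly sampled labels to keep the negatives from collapsing
+    # onto the head of the ranking.
+    uniform = rng.choice(num_labels, size=k_uniform, replace=False).tolist()
+    return sorted(set(hard + uniform) - set(positives))
+
+negs = sample_negatives(rng.normal(size=dim).astype(np.float32), positives={42})
+print(len(negs), "negatives sampled")
+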
+
+
+
+
+ + ☆ Neural Click Models for Recommender Systems + + +
+ We develop and evaluate neural architectures to model the user behavior in +recommender systems (RS) inspired by click models for Web search but going +beyond standard click models. Proposed architectures include recurrent +networks, Transformer-based models that alleviate the quadratic complexity of +self-attention, adversarial and hierarchical architectures. Our models +outperform baselines on the ContentWise and RL4RS datasets and can be used in +RS simulators to model user response for RS evaluation and pretraining. + +
+
+
+
+
+ + ☆ Mitigating Propensity Bias of Large Language Models for Recommender + Systems + + +
+ The rapid development of Large Language Models (LLMs) creates new +opportunities for recommender systems, especially by exploiting the side +information (e.g., descriptions and analyses of items) generated by these +models. However, aligning this side information with collaborative information +from historical interactions poses significant challenges. The inherent biases +within LLMs can skew recommendations, resulting in distorted and potentially +unfair user experiences. On the other hand, propensity bias causes side +information to be aligned in such a way that it often tends to represent all +inputs in a low-dimensional subspace, leading to a phenomenon known as +dimensional collapse, which severely restricts the recommender system's ability +to capture user preferences and behaviours. To address these issues, we +introduce a novel framework named Counterfactual LLM Recommendation (CLLMR). +Specifically, we propose a spectrum-based side information encoder that +implicitly embeds structural information from historical interactions into the +side information representation, thereby circumventing the risk of dimension +collapse. Furthermore, our CLLMR approach explores the causal relationships +inherent in LLM-based recommender systems. By leveraging counterfactual +inference, we counteract the biases introduced by LLMs. Extensive experiments +demonstrate that our CLLMR approach consistently enhances the performance of +various recommender models. + +
+
+
+
+
+ + ☆ Large Language Model Empowered Embedding Generator for Sequential + Recommendation + + +
+ Sequential Recommender Systems (SRS) are extensively applied across various +domains to predict users' next interaction by modeling their interaction +sequences. However, these systems typically grapple with the long-tail problem, +where they struggle to recommend items that are less popular. This challenge +results in a decline in user discovery and reduced earnings for vendors, +negatively impacting the system as a whole. Large Language Models (LLMs) have the +potential to understand the semantic connections between items, regardless of +their popularity, positioning them as a viable solution to this dilemma. In our +paper, we present LLMEmb, an innovative technique that harnesses an LLM to create +item embeddings that bolster the performance of SRS. To align the capabilities +of a general-purpose LLM with the needs of the recommendation domain, we +introduce a method called Supervised Contrastive Fine-Tuning (SCFT). This +method involves attribute-level data augmentation and a custom contrastive loss +designed to tailor the LLM for enhanced recommendation performance. Moreover, we +highlight the necessity of incorporating collaborative filtering signals into +LLM-generated embeddings and propose Recommendation Adaptation Training (RAT) +for this purpose. RAT refines the embeddings to be optimally suited for SRS. +The embeddings derived from LLMEmb can be easily integrated with any SRS model, +showcasing its practical utility. Extensive experimentation on three real-world +datasets has shown that LLMEmb significantly improves upon current methods when +applied across different SRS models. +
+
+
+
+
+ + ♻ ☆ Impedance vs. Power Side-channel Vulnerabilities: A Comparative Study + + +
+ Physical side channels emerge from the relation between internal computation +or data and observable physical parameters of a chip. Previous works mostly +focus on properties related to current consumption, such as power consumption. +The fundamental property behind current consumption stems from the impedance of +the chip. Contemporary works have started using chip impedance as a physical +side channel for extracting sensitive information from computing systems. This +approach leverages variations in the intrinsic impedance of a chip across different logic +states. However, there has been a lack of comparative studies. In this study, +we conduct a comparative analysis of the impedance side channel, which has been +limitedly explored, and the well-established power side channel. Through +experimental evaluation, we investigate the efficacy of these side channels in +extracting a stored Advanced Encryption Standard (AES) cryptographic key from +memory and analyze their performance. Our findings indicate that impedance +analysis demonstrates a higher potential for cryptographic key extraction +compared to power side-channel analysis (SCA). Moreover, we identify scenarios +where power SCA does not yield satisfactory results, whereas impedance analysis +proves to be more robust and effective. This work not only underscores the +significance of impedance SCA in enhancing cryptographic security but also +emphasizes the necessity for a deeper understanding of its mechanisms and +implications. +
+
+
+
+
+ + ♻ ☆ TTQA-RS- A break-down prompting approach for Multi-hop Table-Text + Question Answering with Reasoning and Summarization + + +
+ Question answering (QA) over tables and text has gained much popularity over +the years. Multi-hop table-text QA requires multiple hops between the table and +text, making it a challenging QA task. Although several works have attempted to +solve the table-text QA task, most involve training the models and require +labeled data. In this paper, we propose a Retrieval Augmented Generation +(RAG) based model - TTQA-RS: A break-down prompting approach for Multi-hop +Table-Text Question Answering with Reasoning and Summarization. Our model uses +an enhanced retriever for table-text information retrieval and uses augmented +knowledge, including a table-text summary with decomposed sub-questions and +their answers, for reasoning-based table-text QA. Using open-source language models, +our model outperformed all existing prompting methods for table-text QA tasks +on existing table-text QA datasets, such as HybridQA and OTT-QA's development +set. Our experiments demonstrate the potential of prompt-based approaches using +open-source LLMs. Additionally, by using LLaMA3-70B, our model achieved +state-of-the-art performance for prompting-based methods on multi-hop +table-text QA. +
+
+
+
+
+ + ♻ ☆ Probability Distribution Learning: A theoretical framework for Deep + Learning + + +
+ This paper introduces Probability Distribution Learning (PD learning), a +novel theoretical learning framework designed to address a comprehensive range +of machine learning and statistical tasks, including classification, +regression, and parameter estimation. Departing from the traditional +statistical learning framework, PD learning focuses on learning the underlying +probability distribution of a dataset, which is modeled as a random variable +within the probability simplex. In this framework, the learning error is +decomposed into uncertainty and the model's fitting error to the optimal +estimate. Uncertainty, which is non-optimizable and independent of both the +model and optimization algorithm, depends solely on prior knowledge and +sampling data, constituting the optimal bound of the learning error. Minimizing +the fitting error represents a typical non-convex optimization problem. To +address this, we initially demonstrate that under the conditions of unique +optimum and sampling stability, the loss function exhibits a unified +mathematical form, which we refer to as the standard loss function. Moreover, +we prove that by employing the standard loss function, the optima of fitting +error minimization can be approached by reducing the gradient norm and +structural error. Subsequently, we demonstrate that with random parameter +initialization, increasing network depth and the parameter count can reduce +structural error. Consequently, from the perspective of structural error, +techniques such as over-parameterization, non-convex optimization, and the flat +minima in deep learning are beneficial in reducing structural error, thereby +ensuring that gradient-based iterative algorithms can attain an approximate +global optimum for fitting error minimization. Ultimately, the experimental +results on various models have validated the effectiveness of the framework +proposed in this paper. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2105.04026 by other + authors. arXiv admin note: text overlap with arXiv:2105.04026 by other + authors +
+
+
+
+
+ + ♻ ☆ Analyzing the Effectiveness of Listwise Reranking with Positional + Invariance on Temporal Generalizability + + +
+ This working note outlines our participation in the retrieval task at CLEF +2024. We highlight the considerable gap between studying retrieval performance +on static knowledge documents and understanding performance in real-world +environments. Addressing these discrepancies and measuring the +temporal persistence of IR systems is therefore crucial. By investigating the LongEval +benchmark, specifically designed for such dynamic environments, our findings +demonstrate the effectiveness of a listwise reranking approach, which +proficiently handles inaccuracies induced by temporal distribution shifts. +Among listwise rerankers, our findings show that ListT5, which effectively +mitigates the positional bias problem by adopting the Fusion-in-Decoder +architecture, is especially effective on the test-long subset, and increasingly +so as temporal drift grows. +
+
+ comment: Accepted at CLEF 2024 LongEval track. Abstract revised: its first two + (background) sentences were too similar to an earlier paper arXiv:2305.18952 +
+
+
+
+
+ + ♻ ☆ Cross-Modal Retrieval: A Systematic Review of Methods and Future + Directions + + +
+ With the exponential surge in diverse multi-modal data, traditional uni-modal +retrieval methods struggle to meet the needs of users seeking access to data +across various modalities. To address this, cross-modal retrieval has emerged, +enabling interaction across modalities, facilitating semantic matching, and +leveraging complementarity and consistency between heterogeneous data. Although +prior literature has reviewed the field of cross-modal retrieval, it suffers +from numerous deficiencies in terms of timeliness, taxonomy, and +comprehensiveness. This paper conducts a comprehensive review of cross-modal +retrieval's evolution, spanning from shallow statistical analysis techniques to +vision-language pre-training models. Commencing with a comprehensive taxonomy +grounded in machine learning paradigms, mechanisms, and models, the paper +delves deeply into the principles and architectures underpinning existing +cross-modal retrieval methods. Furthermore, it offers an overview of +widely-used benchmarks, metrics, and performances. Lastly, the paper probes the +prospects and challenges that confront contemporary cross-modal retrieval, +while engaging in a discourse on potential directions for further progress in +the field. To facilitate the ongoing research on cross-modal retrieval, we +develop a user-friendly toolbox and an open-source repository at +https://cross-modal-retrieval.github.io. + +
+
+
+
+
+ + ♻ ☆ Watermarking Recommender Systems + + +
+ Recommender systems embody significant commercial value and represent crucial +intellectual property. However, the integrity of these systems is constantly +challenged by malicious actors seeking to steal their underlying models. +Safeguarding against such threats is paramount to upholding the rights and +interests of the model owner. While model watermarking has emerged as a potent +defense mechanism in various domains, its direct application to recommender +systems remains unexplored and non-trivial. In this paper, we address this gap +by introducing Autoregressive Out-of-distribution Watermarking (AOW), a novel +technique tailored specifically for recommender systems. Our approach entails +selecting an initial item and querying it through the oracle model, followed by +the selection of subsequent items with small prediction scores. This iterative +process generates a watermark sequence autoregressively, which is then +ingrained into the model's memory through training. To assess the efficacy of +the watermark, the model is tasked with predicting the subsequent item given a +truncated watermark sequence. Through extensive experimentation and analysis, +we demonstrate the superior performance and robust properties of AOW. Notably, +our watermarking technique exhibits high-confidence extraction capabilities and +maintains effectiveness even in the face of distillation and fine-tuning +processes. + +
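+ The autoregressive construction described above can be sketched as below;
+score_next_items is a dummy stand-in for querying the oracle recommender, and
+the sequence length is an arbitrary choice.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+NUM_ITEMS = 500
+
+def score_next_items(sequence):
+    # Placeholder: a real implementation would ask the oracle model for its
+    # next-item scores given the sequence generated so far.
+    return rng.random(NUM_ITEMS)
+
+def build_watermark(seed_item, length=8):
+    sequence = [seed_item]
+    for _ in range(length - 1):
+        scores = score_next_items(sequence)
+        scores[np.array(sequence)] = np.inf        # never repeat an already chosen item
+        sequence.append(int(np.argmin(scores)))    # append a low-scoring item
+    return sequence
+
+print(build_watermark(seed_item=7))
+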
+
+
+
+
+ + ♻ ☆ A systematic evaluation of large language models for biomedical natural + language processing: benchmarks, baselines, and recommendations + + +
+ The biomedical literature is rapidly expanding, posing a significant +challenge for manual curation and knowledge discovery. Biomedical Natural +Language Processing (BioNLP) has emerged as a powerful solution, enabling the +automated extraction of information and knowledge from this extensive +literature. Recent attention has been directed towards Large Language Models +(LLMs) due to their impressive performance. However, there remains a critical +gap in understanding the effectiveness of LLMs in BioNLP tasks and their +broader implications for method development and downstream users. Currently, +there is a lack of baseline performance data, benchmarks, and practical +recommendations for using LLMs in the biomedical domain. To address this gap, +we present a systematic evaluation of four representative LLMs: GPT-3.5 and +GPT-4 (closed-source), LLaMA 2 (open-sourced), and PMC LLaMA (domain-specific) +across 12 BioNLP datasets covering six applications (named entity recognition, +relation extraction, multi-label document classification, question answering, +text summarization, and text simplification). The evaluation is conducted under +four settings: zero-shot, static few-shot, dynamic K-nearest few-shot, and +fine-tuning. We compare these models against state-of-the-art (SOTA) approaches +that fine-tune (domain-specific) BERT or BART models, which are +well-established methods in BioNLP tasks. The evaluation covers both +quantitative and qualitative assessments, where the latter involves manually +reviewing, in total, hundreds of thousands of LLM outputs for +inconsistencies, missing information, and hallucinations in extractive and +classification tasks. The qualitative review also examines accuracy, +completeness, and readability in text summarization tasks. Additionally, a cost +analysis of closed-source GPT models is conducted. +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ Delving Deep into Engagement Prediction of Short Videos ECCV 2024 + + +
+ Understanding and modeling the popularity of User Generated Content (UGC) +short videos on social media platforms presents a critical challenge with broad +implications for content creators and recommendation systems. This study delves +deep into the intricacies of predicting engagement for newly published videos +with limited user interactions. Surprisingly, our findings reveal that Mean +Opinion Scores from previous video quality assessment datasets do not strongly +correlate with video engagement levels. To address this, we introduce a +substantial dataset comprising 90,000 real-world UGC short videos from +Snapchat. Rather than relying on view count, average watch time, or rate of +likes, we propose two metrics: normalized average watch percentage (NAWP) and +engagement continuation rate (ECR) to describe the engagement levels of short +videos. Comprehensive multi-modal features, including visual content, +background music, and text data, are investigated to enhance engagement +prediction. With the proposed dataset and two key metrics, our method +demonstrates its ability to predict engagements of short videos purely from +video content. + +
+
+ comment: Accepted to ECCV 2024. Project page: + https://github.com/dasongli1/SnapUGC_Engagement +
+
+
+
+
+ + ☆ FreeMask: Rethinking the Importance of Attention Masks for Zero-Shot + Video Editing + + +
+ Text-to-video diffusion models have made remarkable advancements. Driven by +their ability to generate temporally coherent videos, research on zero-shot +video editing using these fundamental models has expanded rapidly. To enhance +editing quality, structural controls are frequently employed in video editing. +Among these techniques, cross-attention mask control stands out for its +effectiveness and efficiency. However, when cross-attention masks are naively +applied to video editing, they can introduce artifacts such as blurring and +flickering. Our experiments uncover a critical factor overlooked in previous +video editing research: cross-attention masks are not consistently clear but +vary with model structure and denoising timestep. To address this issue, we +introduce the Mask Matching Cost (MMC), a metric that quantifies this variability, +and propose FreeMask, a method for selecting optimal masks tailored to specific +video editing tasks. Using MMC-selected masks, we further improve the masked +fusion mechanism within comprehensive attention features, e.g., temporal-, cross-, +and self-attention modules. Our approach can be seamlessly integrated into +existing zero-shot video editing frameworks with better performance, requiring +no control assistance or parameter fine-tuning but enabling adaptive decoupling +of unedited semantic layouts with mask precision control. Extensive experiments +demonstrate that FreeMask achieves superior semantic fidelity, temporal +consistency, and editing quality compared to state-of-the-art methods. +
+
+ comment: Video Editing +
+
+
+
+
+ + ☆ Computer-mediated therapies for stroke rehabilitation: a systematic + review and meta-Analysis + + +
+ OBJECTIVE: To evaluate the efficacy of different forms of virtual reality +(VR) treatments as either immersive virtual reality (IVR) or non-immersive +virtual reality (NIVR) in comparison to conventional therapy (CT) in improving +physical and psychological status among stroke patients. METHODS: The +literature search was conducted on seven databases: ACM Digital Library, +Medline (via PubMed), Cochrane, IEEE Xplore, Web of Science, and Scopus. The +effect sizes of the main outcomes were calculated using Cohen's d. Pooled +results were used to present an overall estimate of the treatment effect using +a random-effects model. RESULTS: A total of 22 randomized controlled trials +were evaluated. Three trials demonstrated that immersive virtual reality improved +upper limb activity, function, and activities of daily life in a way comparable to +CT. Eighteen trials showed that NIVR had similar benefits to CT for upper limb +activity and function, balance and mobility, activities of daily living and +participation. A comparison between the different forms of VR showed that IVR +may be more beneficial than NIVR for upper limb training and activities of +daily life. CONCLUSIONS: This study found that IVR therapies may be more +effective than NIVR, but not than CT, at improving upper limb activity, function, and +daily life activities. However, there is no evidence of the durability of IVR +treatment. More research involving studies with larger samples is needed to +assess the long-term effects and promising benefits of immersive virtual +reality technology. +
+
+ comment: 32 pages +
+
+
+
+
+ + ☆ Signal Processing for Haptic Surface Modeling: a Review + + +
+ Haptic feedback has been integrated into Virtual and Augmented Reality, +complementing acoustic and visual information and contributing to an all-round +immersive experience in multiple fields, spanning from the medical domain to +entertainment and gaming. Haptic technologies involve complex +cross-disciplinary research that encompasses sensing, data representation, +interactive rendering, perception, and quality of experience. The standard +processing pipeline consists of (I) sensing physical features in the real +world using a transducer, (II) modeling and storing the collected information +in some digital format, (III) communicating the information, and finally, (IV) +rendering the haptic information through appropriate devices, thus producing a +user experience (V) perceptually close to the original physical world. Among +these areas, sensing, rendering and perception have been deeply investigated +and are the subject of different comprehensive surveys available in the +literature. In contrast, research dealing with haptic surface modeling and data +representation still lacks a comprehensive dissection. In this work, we aim at +providing an overview on modeling and representation of haptic surfaces from a +signal processing perspective, covering the aspects that lie in between haptic +information acquisition on one side and rendering and perception on the other +side. We analyze, categorize, and compare research papers that address +haptic surface modeling and data representation, pointing out existing gaps and +possible research directions. +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ☆ ProFD: Prompt-Guided Feature Disentangling for Occluded Person + Re-Identification ACM MM 2024 + + +
+ To address the occlusion issues in person Re-Identification (ReID) tasks, +many methods have been proposed to extract part features by introducing +external spatial information. However, due to missing part appearance +information caused by occlusion and noisy spatial information from external +models, these purely vision-based approaches fail to correctly learn the +features of human body parts from limited training data and struggle in +accurately locating body parts, ultimately leading to misaligned part features. +To tackle these challenges, we propose a Prompt-guided Feature Disentangling +method (ProFD), which leverages the rich pre-trained knowledge in the textual +modality to facilitate the model in generating well-aligned part features. ProFD first +designs part-specific prompts and utilizes noisy segmentation masks to +preliminarily align visual and textual embeddings, enabling the textual prompts +to have spatial awareness. Furthermore, to alleviate the noise from external +masks, ProFD adopts a hybrid-attention decoder, ensuring spatial and semantic +consistency during the decoding process to minimize noise impact. Additionally, +to avoid catastrophic forgetting, we employ a self-distillation strategy, +retaining pre-trained knowledge of CLIP to mitigate over-fitting. Evaluation +results on the Market1501, DukeMTMC-ReID, Occluded-Duke, Occluded-ReID, and +P-DukeMTMC datasets demonstrate that ProFD achieves state-of-the-art results. +Our project is available at: https://github.com/Cuixxx/ProFD. +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ☆ Towards Robust Multimodal Sentiment Analysis with Incomplete Data NeurIPS 2024 + + +
+ The field of Multimodal Sentiment Analysis (MSA) has recently witnessed an +emerging direction seeking to tackle the issue of data incompleteness. +Recognizing that the language modality typically contains dense sentiment +information, we consider it as the dominant modality and present an innovative +Language-dominated Noise-resistant Learning Network (LNLN) to achieve robust +MSA. The proposed LNLN features a dominant modality correction (DMC) module and +dominant modality based multimodal learning (DMML) module, which enhances the +model's robustness across various noise scenarios by ensuring the quality of +dominant modality representations. Aside from the methodical design, we perform +comprehensive experiments under random data missing scenarios, utilizing +diverse and meaningful settings on several popular datasets (\textit{e.g.,} +MOSI, MOSEI, and SIMS), providing additional uniformity, transparency, and +fairness compared to existing evaluations in the literature. Empirically, LNLN +consistently outperforms existing baselines, demonstrating superior performance +across these challenging and extensive evaluation metrics. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ WildFusion: Multimodal Implicit 3D Reconstructions in the Wild + + +
+ We propose WildFusion, a novel approach for 3D scene reconstruction in +unstructured, in-the-wild environments using multimodal implicit neural +representations. WildFusion integrates signals from LiDAR, RGB camera, contact +microphones, tactile sensors, and IMU. This multimodal fusion generates +comprehensive, continuous environmental representations, including pixel-level +geometry, color, semantics, and traversability. Through real-world experiments +on legged robot navigation in challenging forest environments, WildFusion +demonstrates improved route selection by accurately predicting traversability. +Our results highlight its potential to advance robotic navigation and 3D +mapping in complex outdoor terrains. + +
+
+ comment: Our project website is at: http://generalroboticslab.com/WildFusion +
+
+
+
+
+ + ♻ ☆ Cross-Modal Retrieval: A Systematic Review of Methods and Future + Directions + + +
+ With the exponential surge in diverse multi-modal data, traditional uni-modal +retrieval methods struggle to meet the needs of users seeking access to data +across various modalities. To address this, cross-modal retrieval has emerged, +enabling interaction across modalities, facilitating semantic matching, and +leveraging complementarity and consistency between heterogeneous data. Although +prior literature has reviewed the field of cross-modal retrieval, it suffers +from numerous deficiencies in terms of timeliness, taxonomy, and +comprehensiveness. This paper conducts a comprehensive review of cross-modal +retrieval's evolution, spanning from shallow statistical analysis techniques to +vision-language pre-training models. Commencing with a comprehensive taxonomy +grounded in machine learning paradigms, mechanisms, and models, the paper +delves deeply into the principles and architectures underpinning existing +cross-modal retrieval methods. Furthermore, it offers an overview of +widely-used benchmarks, metrics, and performances. Lastly, the paper probes the +prospects and challenges that confront contemporary cross-modal retrieval, +while engaging in a discourse on potential directions for further progress in +the field. To facilitate the ongoing research on cross-modal retrieval, we +develop a user-friendly toolbox and an open-source repository at +https://cross-modal-retrieval.github.io. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 6 + +
+
+
+ + ☆ Counterfactual Evaluation of Ads Ranking Models through Domain + Adaptation RecSys'24 + + +
+ We propose a domain-adapted reward model that works alongside an Offline A/B +testing system for evaluating ranking models. This approach effectively +measures reward for ranking model changes in large-scale Ads recommender +systems, where model-free methods like IPS are not feasible. Our experiments +demonstrate that the proposed technique outperforms both the vanilla IPS method +and approaches using non-generalized reward models. + +
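Editor's note: the vanilla IPS baseline mentioned above reweights logged rewards by the ratio of new-to-logging propensities; it breaks down when logging propensities are unavailable or degenerate, which is where a learned (domain-adapted) reward model takes over. A minimal sketch of the IPS estimate with illustrative variable names (the weight clipping is a common practical choice, not from the paper):

```python
import numpy as np

def ips_value(rewards, logged_propensity, new_propensity, clip=10.0):
    """Inverse Propensity Scoring estimate of a candidate policy's value from
    logs collected under the production policy; `clip` caps importance weights
    to control variance."""
    w = np.minimum(new_propensity / logged_propensity, clip)
    return float(np.mean(w * rewards))

# toy example: logged clicks under the production ranker
rewards = np.array([1.0, 0.0, 0.0, 1.0])
p_old   = np.array([0.5, 0.2, 0.4, 0.1])   # propensity of the shown ad under the old model
p_new   = np.array([0.6, 0.1, 0.3, 0.3])   # propensity under the candidate model
print(ips_value(rewards, p_old, p_new))
```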
+
+ comment: Accepted at the CONSEQUENCES'24 workshop, co-located with ACM + RecSys'24 +
+
+
+
+
+ + ☆ The Devil is in the Sources! Knowledge Enhanced Cross-Domain + Recommendation in an Information Bottleneck Perspective CIKM 2024 + + +
+ Cross-domain Recommendation (CDR) aims to alleviate the data sparsity and
+cold-start problems in traditional recommender systems by leveraging knowledge
+from an informative source domain. However, previously proposed CDR models
+rest on the imprudent assumption that all information from the source domain
+contributes equally to the target domain, neglecting the harmful part that is
+completely irrelevant to users' intrinsic interests. To address this concern,
+in this paper we propose a novel knowledge-enhanced cross-domain
+recommendation framework named CoTrans, which remolds the core procedures of
+CDR models into two steps: Compression of the knowledge from the source domain
+and Transfer of the purified knowledge to the target domain. Specifically,
+following the theory of the Graph Information Bottleneck, CoTrans first
+compresses the source behaviors with the perception of information from the
+target domain. Then, to preserve all the information important for the CDR
+task, the feedback signals from both domains are utilized to promote the
+effectiveness of the transfer procedure. Additionally, a knowledge-enhanced
+encoder is employed to narrow the gaps caused by non-overlapping items across
+the separate domains. Comprehensive experiments on three widely used
+cross-domain datasets demonstrate that CoTrans significantly outperforms both
+single-domain and state-of-the-art cross-domain recommendation approaches.
+
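Editor's note: for readers unfamiliar with it, the Graph Information Bottleneck principle invoked here compresses the source-domain behaviour graph into a representation Z that discards information irrelevant to the target task. In its generic form (standard IB notation, not the paper's):

```latex
\max_{Z} \; I\!\left(Z;\, Y_{\text{target}}\right) \;-\; \beta \, I\!\left(Z;\, G_{\text{source}}\right)
```

where I(·;·) denotes mutual information and β trades compression of the source graph against predictiveness for the target domain.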
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+ + ☆ Meta Learning to Rank for Sparsely Supervised Queries + + +
+ Supervisory signals are a critical resource for training learning to rank
+models. In many real-world search and retrieval scenarios, these signals may
+not be readily available or could be costly to obtain for some queries.
+Examples include domains where labeling requires professional expertise,
+applications with strong privacy constraints, and settings where user
+engagement information is too scarce. We refer to these scenarios as sparsely
+supervised queries, which pose significant challenges to traditional learning
+to rank models. In this work, we address sparsely supervised queries by
+proposing a novel meta learning to rank framework which leverages the fast
+learning and adaptation capabilities of meta-learning. The proposed approach
+accounts for the fact that different queries have different optimal parameters
+for their rankers, in contrast to traditional learning to rank models which
+only learn a global ranking model applied to all queries. Consequently, the
+proposed method yields significant advantages, especially when new queries
+differ in characteristics from the training queries. Moreover, the proposed
+meta learning to rank framework is generic and flexible. We conduct a set of
+comprehensive experiments on both public datasets and a real-world e-commerce
+dataset. The results demonstrate that the proposed meta-learning approach can
+significantly enhance the performance of learning to rank models with sparsely
+labeled queries.
+
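Editor's note: the per-query adaptation idea can be illustrated with a first-order MAML-style loop over a linear ranker. Everything below is an illustrative sketch under assumed choices (pointwise squared loss, one inner gradient step, first-order approximation); it is not the paper's framework.

```python
import numpy as np

def query_grad(w, X, y):
    """Gradient of the mean squared ranking loss for one query's documents."""
    return 2.0 * X.T @ (X @ w - y) / len(y)

def meta_train(tasks, dim, inner_lr=0.05, outer_lr=0.01, steps=200):
    """tasks: list of (X_support, y_support, X_query, y_query) per query.
    Meta-parameters w are adapted with one inner step per query, then updated
    so the adapted ranker does well on that query's held-out documents."""
    w = np.zeros(dim)
    for _ in range(steps):
        meta_grad = np.zeros(dim)
        for Xs, ys, Xq, yq in tasks:
            w_adapted = w - inner_lr * query_grad(w, Xs, ys)  # per-query adaptation
            meta_grad += query_grad(w_adapted, Xq, yq)        # first-order MAML approximation
        w -= outer_lr * meta_grad / len(tasks)
    return w

# toy usage with two synthetic queries
rng = np.random.default_rng(0)
tasks = [(rng.normal(size=(8, 5)), rng.normal(size=8),
          rng.normal(size=(8, 5)), rng.normal(size=8)) for _ in range(2)]
print(meta_train(tasks, dim=5)[:3])
```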
+
+ comment: Accepted at TOIS +
+
+
+
+
+ + ♻ ☆ Robust portfolio optimization for recommender systems considering + uncertainty of estimated statistics + + +
+ This paper is concerned with portfolio optimization models for creating +high-quality lists of recommended items to balance the accuracy and diversity +of recommendations. However, the statistics (i.e., expectation and covariance +of ratings) required for mean--variance portfolio optimization are subject to +inevitable estimation errors. To remedy this situation, we focus on robust +optimization techniques that derive reliable solutions to uncertain +optimization problems. Specifically, we propose a robust portfolio optimization +model that copes with the uncertainty of estimated statistics based on the +cardinality-based uncertainty sets. This robust portfolio optimization model +can be reduced to a mixed-integer linear optimization problem, which can be +solved exactly using mathematical optimization solvers. Experimental results +using two publicly available rating datasets demonstrate that our method can +improve not only the recommendation accuracy but also the diversity of +recommendations compared with conventional mean--variance portfolio +optimization models. Notably, our method has the potential to improve the +recommendation quality of various rating prediction algorithms. + +
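Editor's note: as a generic reference point, a cardinality-style (budget-of-uncertainty) set in the spirit of Bertsimas and Sim lets the estimated expected ratings deviate for at most Γ items, and the robust portfolio maximizes the worst case over that set. The formulation below is a sketch in standard notation, not the paper's exact model:

```latex
\mathcal{U}(\Gamma) = \Bigl\{ \mu \;\Big|\; \mu_i = \hat{\mu}_i + \delta_i z_i,\;
   |z_i| \le 1,\; \textstyle\sum_i |z_i| \le \Gamma \Bigr\},
\qquad
\max_{x \in \mathcal{X}} \;\min_{\mu \in \mathcal{U}(\Gamma)}\; \mu^{\top} x \;-\; \lambda\, x^{\top} \hat{\Sigma}\, x .
```

Here \(\hat{\mu}, \hat{\Sigma}\) are the estimated rating statistics, \(\delta_i\) the deviation bounds, and \(\lambda\) the risk-aversion weight; dualizing the inner minimization yields the mixed-integer linear reformulation referred to above.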
+
+
+
+
+ + ♻ ☆ High-Order Fusion Graph Contrastive Learning for Recommendation + + +
+ Self-supervised learning (SSL) has recently attracted significant attention
+in the field of recommender systems. Contrastive learning (CL) stands out as a
+major SSL paradigm due to its robust ability to generate self-supervised
+signals. Mainstream graph contrastive learning (GCL)-based methods typically
+implement CL by creating contrastive views through various data augmentation
+techniques. Although these methods are effective, we argue that several
+challenges remain. i) Data augmentation (e.g., discarding edges or adding
+noise) necessitates additional graph convolution (GCN) or modeling operations,
+which are highly time-consuming and potentially harm the embedding quality.
+ii) Existing CL-based methods use traditional CL objectives to capture
+self-supervised signals. However, few studies have explored obtaining CL
+objectives from additional perspectives or attempted to fuse the varying
+signals from these objectives to enhance recommendation performance.
+ To overcome these challenges, we propose a High-order Fusion Graph
+Contrastive Learning (HFGCL) framework for recommendation. Specifically,
+instead of relying on data augmentation, we use high-order information from
+the GCN process to create contrastive views. Additionally, to integrate
+self-supervised signals from various CL objectives, we propose an advanced CL
+objective. By ensuring that positive pairs are distanced from negative samples
+derived from both contrastive views, we effectively fuse self-supervised
+signals from distinct CL objectives, thereby enhancing the mutual information
+between positive pairs. Experimental results on three public datasets
+demonstrate the superior recommendation performance and efficiency of HFGCL
+compared to the state-of-the-art baselines.
+
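Editor's note: the "contrastive views without augmentation" idea can be illustrated with a standard InfoNCE objective over two sets of node embeddings, e.g. embeddings taken from different propagation depths of the same GCN pass. The layer pairing and the InfoNCE form below are illustrative assumptions, not HFGCL's advanced objective.

```python
import numpy as np

def info_nce(z1, z2, temperature=0.2):
    """InfoNCE between two views of the same nodes: row i of z1 and z2 form a
    positive pair; all other rows serve as negatives."""
    z1 = z1 / np.linalg.norm(z1, axis=1, keepdims=True)
    z2 = z2 / np.linalg.norm(z2, axis=1, keepdims=True)
    logits = z1 @ z2.T / temperature              # (N, N) similarity matrix
    logits -= logits.max(axis=1, keepdims=True)   # numerical stability
    log_prob = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    return float(-np.mean(np.diag(log_prob)))     # positives sit on the diagonal

# toy usage: views taken from, say, layer-2 and layer-3 outputs of one GCN pass
rng = np.random.default_rng(0)
z_low, z_high = rng.normal(size=(32, 16)), rng.normal(size=(32, 16))
print(info_nce(z_low, z_high))
```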
+
+
+
+
+
+ ♻ ☆ A Quick, trustworthy spectral knowledge Q&A system
+ leveraging retrieval-augmented generation on LLM
+
+
+ Large Language Models (LLMs) have demonstrated significant success in a
+range of natural language processing (NLP) tasks in the general domain. The
+emergence of LLMs has introduced innovative methodologies across diverse
+fields, including the natural sciences. Researchers aim to implement
+automated, concurrent processes driven by LLMs to supplant conventional
+manual, repetitive, and labor-intensive work. In the domain of spectral
+analysis and detection, it is imperative for researchers to autonomously
+acquire pertinent knowledge across various research objects, encompassing the
+spectroscopic techniques and the chemometric methods employed in experiments
+and analysis. Paradoxically, despite the recognition of spectroscopic
+detection as an effective analytical method, the fundamental process of
+knowledge retrieval remains both time-intensive and repetitive. In response to
+this challenge, we first introduce the Spectral Detection and Analysis Based
+Paper (SDAAP) dataset, the first open-source textual knowledge dataset for
+spectral analysis and detection, containing annotated literature data as well
+as the corresponding knowledge instruction data. Subsequently, we design an
+automated Q&A framework based on the SDAAP dataset, which retrieves relevant
+knowledge and generates high-quality responses by extracting entities from the
+input as retrieval parameters. Notably, within this framework the LLM is used
+only as a tool to provide generalizability, while the RAG technique is used to
+accurately capture the source of the knowledge. This approach not only
+improves the quality of the generated responses but also ensures the
+traceability of the knowledge. Experimental results show that our framework
+generates responses with more reliable expertise than the baseline.
+
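Editor's note: the retrieve-then-generate loop can be sketched as below, assuming a toy in-memory corpus, entity-overlap retrieval, and a placeholder `generate` call standing in for the LLM; none of these names come from the SDAAP framework itself.

```python
def retrieve(corpus, query_entities, top_k=2):
    """Score each document by how many extracted entities it mentions."""
    scored = [(sum(e.lower() in doc.lower() for e in query_entities), doc) for doc in corpus]
    scored.sort(key=lambda t: t[0], reverse=True)
    return [doc for score, doc in scored[:top_k] if score > 0]

def generate(question, evidence):
    """Placeholder for the LLM call; a real system would prompt the model with the
    retrieved passages so answers stay traceable to their sources."""
    return f"Answer to {question!r} grounded in {len(evidence)} retrieved passage(s)."

corpus = [
    "Raman spectroscopy combined with PLS regression for pesticide detection.",
    "NIR spectra preprocessed with SNV before PCA-based classification.",
]
entities = ["Raman", "PLS"]                     # entities extracted from the user question
evidence = retrieve(corpus, entities)
print(generate("Which chemometric method suits Raman data?", evidence))
```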
+
+ comment: 16 pages,10 figures,3 tables +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ Modeling Layout Reading Order as Ordering Relations for Visually-rich + Document Understanding EMNLP 2024 + + +
+ Modeling and leveraging layout reading order in visually-rich documents
+(VrDs) is critical in document intelligence, as it captures the rich
+structural semantics within documents. Previous works typically formulated
+layout reading order as a permutation of layout elements, i.e., a sequence
+containing all the layout elements. However, we argue that this formulation
+does not adequately convey the complete reading order information in the
+layout, which may potentially lead to performance decline in downstream VrD
+tasks. To address this issue, we propose to model the layout reading order as
+ordering relations over the set of layout elements, which have sufficient
+expressive capability for the complete reading order information. To enable
+empirical evaluation of methods towards the improved form of reading order
+prediction (ROP), we establish a comprehensive benchmark dataset including
+reading order annotations as relations over layout elements, together with a
+relation-extraction-based method that outperforms previous methods. Moreover,
+to highlight the practical benefits of introducing the improved form of layout
+reading order, we propose a reading-order-relation-enhancing pipeline to
+improve model performance on arbitrary VrD tasks by introducing additional
+reading order relation inputs. Comprehensive results demonstrate that the
+pipeline generally benefits downstream VrD tasks: (1) with the reading order
+relation information, the enhanced downstream models achieve SOTA results on
+both task settings of the targeted dataset; (2) with the pseudo reading order
+information generated by the proposed ROP model, performance improves across
+all three models and eight cross-domain VrD-IE/QA task settings without
+targeted optimization.
+
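Editor's note: the difference between the two formulations can be made concrete. A permutation only gives one global sequence, whereas ordering relations are a set of pairwise "reads-before" edges over layout elements, which can also express partial orders (e.g. parallel columns). A toy sketch of the relations implied by a total order, for illustration only:

```python
from itertools import combinations

def order_relations(reading_sequence):
    """Turn a total reading order (a permutation of element ids) into the set of
    pairwise 'a is read before b' relations it implies."""
    return {(a, b) for a, b in combinations(reading_sequence, 2)}

print(sorted(order_relations(["title", "para1", "para2"])))
# [('para1', 'para2'), ('title', 'para1'), ('title', 'para2')]
```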
+
+ comment: Accepted as a long paper in the main conference of EMNLP 2024 +
+
+
+
+
+ + ☆ IDEAW: Robust Neural Audio Watermarking with Invertible Dual-Embedding EMNLP 2024 + + +
+ Audio watermarking embeds messages into audio and accurately extracts them
+from the watermarked audio. Traditional methods develop algorithms based on
+expert experience to embed watermarks into the time domain or transform domain
+of signals. With the development of deep neural networks, deep learning-based
+neural audio watermarking has emerged. Compared to traditional algorithms,
+neural audio watermarking achieves better robustness by considering various
+attacks during training. However, current neural watermarking methods suffer
+from low capacity and unsatisfactory imperceptibility. Additionally, the issue
+of watermark locating, which is extremely important and even more pronounced
+in neural audio watermarking, has not been adequately studied. In this paper,
+we design a dual-embedding watermarking model for efficient locating. We also
+consider the impact of the attack layer on the invertible neural network in
+robustness training and improve the model accordingly, enhancing both its
+reasonableness and stability. Experiments show that the proposed model, IDEAW,
+can withstand various attacks with higher capacity and more efficient locating
+ability than existing methods.
+
+
+ comment: Accepted by the 2024 Conference on Empirical Methods in Natural + Language Processing (EMNLP 2024) +
+
+
+
+
+ + ☆ Quantitative Analysis of Audio-Visual Tasks: An Information-Theoretic + Perspective SC + + +
+ In the field of spoken language processing, audio-visual speech processing is +receiving increasing research attention. Key components of this research +include tasks such as lip reading, audio-visual speech recognition, and +visual-to-speech synthesis. Although significant success has been achieved, +theoretical analysis is still insufficient for audio-visual tasks. This paper +presents a quantitative analysis based on information theory, focusing on +information intersection between different modalities. Our results show that +this analysis is valuable for understanding the difficulties of audio-visual +processing tasks as well as the benefits that could be obtained by modality +integration. + +
+
+ comment: Accepted by ISCSLP2024 +
+
+
+
+
+ + ☆ Video DataFlywheel: Resolving the Impossible Data Trinity in + Video-Language Understanding + + +
+ Recently, video-language understanding has achieved great success through +large-scale pre-training. However, data scarcity remains a prevailing +challenge. This study quantitatively reveals an "impossible trinity" among data +quantity, diversity, and quality in pre-training datasets. Recent efforts seek +to refine large-scale, diverse ASR datasets compromised by low quality through +synthetic annotations. These methods successfully leverage useful information +in multimodal video content (frames, tags, ASR transcripts, etc.) to refine the +original annotations. Nevertheless, they struggle to mitigate noise within +synthetic annotations and lack scalability as the dataset size expands. To +address these issues, we introduce the Video DataFlywheel framework, which +iteratively refines video annotations with improved noise control methods. For +iterative refinement, we first leverage a video-language model to generate +synthetic annotations, resulting in a refined dataset. Then, we pre-train on it +and fine-tune on human refinement examples for a stronger model. These +processes are repeated for continuous improvement. For noise control, we +present AdaTaiLr, a novel noise control method that requires weaker assumptions +on noise distribution, thereby proving more effective in large datasets with +theoretical guarantees. The combination of iterative refinement and AdaTaiLr +can achieve better scalability in video-language understanding. Extensive +experiments show that our framework outperforms existing data refinement +baselines, delivering a 3% performance boost and improving dataset quality with +minimal diversity loss. Furthermore, our refined dataset facilitates +significant improvements in various video-language understanding tasks, +including video question answering and text-video retrieval. + +
+
+ comment: Under peer review +
+
+
+
+
+ + ☆ IWN: Image Watermarking Based on Idempotency + + +
+ In the expanding field of digital media, maintaining the strength and
+integrity of watermarking technology is becoming increasingly challenging.
+This paper, inspired by the Idempotent Generative Network (IGN), explores the
+prospects of introducing idempotency into image watermark processing and
+proposes an innovative neural network model - the Idempotent Watermarking
+Network (IWN). The proposed model, which focuses on enhancing the recovery
+quality of color image watermarks, leverages idempotency to ensure superior
+image reversibility. This property ensures that, even if color image
+watermarks are attacked or damaged, they can be effectively projected and
+mapped back to their original state, so the quality of the extracted
+watermarks is markedly improved. The IWN model achieves a balance between
+embedding capacity and robustness, alleviating to some extent the inherent
+contradiction between these two factors in traditional watermarking techniques
+and steganography methods.
+
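Editor's note: the core idempotency idea inherited from IGN is that applying the network twice should change nothing, f(f(x)) ≈ f(x), so a damaged watermarked image can be projected back onto the model's output manifold. A minimal sketch of two such training terms, with the reconstruction term added as an assumption and a toy mapping standing in for the network:

```python
import numpy as np

def idempotency_terms(f, x):
    """Toy losses for an idempotent mapping f: reconstruction keeps f close to the
    identity on clean inputs, idempotency makes f behave as a projection (f∘f = f)."""
    fx, ffx = f(x), f(f(x))
    recon = np.mean((fx - x) ** 2)       # stay close to the original image
    idem = np.mean((ffx - fx) ** 2)      # applying f again should be a no-op
    return recon, idem

# toy mapping: clipping to [0, 1] is an exactly idempotent projection
f = lambda x: np.clip(x, 0.0, 1.0)
x = np.random.default_rng(0).normal(0.5, 0.5, size=(8, 8))
print(idempotency_terms(f, x))           # second term is 0 for this toy f
```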
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 5 + +
+
+
+ + ☆ HTML-LSTM: Information Extraction from HTML Tables in Web Pages using + Tree-Structured LSTM + + +
+ In this paper, we propose a novel method for extracting information from
+HTML tables with similar contents but different structures. We aim to
+integrate multiple HTML tables into a single table for retrieval of
+information contained in various Web pages. The method extends the
+tree-structured LSTM, a neural network for tree-structured data, in order to
+extract both the linguistic and the structural information of HTML data. We
+evaluate the proposed method through experiments using real data published on
+the WWW.
+
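Editor's note: the tree-structured LSTM being extended here is, in its standard child-sum form (Tai et al., 2015), a cell that aggregates an arbitrary number of children. A compact sketch of one node's update, with weights as plain matrices; this shows the base cell, not the HTML-LSTM extension itself.

```python
import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def child_sum_treelstm_node(x, child_h, child_c, W, U, b):
    """One child-sum Tree-LSTM cell (Tai et al., 2015).
    x: (d_in,) node input; child_h/child_c: (K, d) children states;
    W: dict of (d, d_in), U: dict of (d, d), b: dict of (d,) for gates i, f, o, u."""
    h_sum = child_h.sum(axis=0)
    i = sigmoid(W["i"] @ x + U["i"] @ h_sum + b["i"])
    o = sigmoid(W["o"] @ x + U["o"] @ h_sum + b["o"])
    u = np.tanh(W["u"] @ x + U["u"] @ h_sum + b["u"])
    # one forget gate per child, computed from that child's own hidden state
    f = sigmoid(W["f"] @ x + child_h @ U["f"].T + b["f"])      # (K, d)
    c = i * u + (f * child_c).sum(axis=0)
    h = o * np.tanh(c)
    return h, c

rng = np.random.default_rng(0)
d_in, d, K = 6, 4, 3
W = {g: rng.normal(size=(d, d_in)) for g in "ifou"}
U = {g: rng.normal(size=(d, d)) for g in "ifou"}
b = {g: np.zeros(d) for g in "ifou"}
h, c = child_sum_treelstm_node(rng.normal(size=d_in), rng.normal(size=(K, d)),
                               rng.normal(size=(K, d)), W, U, b)
print(h.shape, c.shape)  # (4,) (4,)
```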
+
+
+
+
+ + ☆ Crafting Personalized Agents through Retrieval-Augmented Generation on + Editable Memory Graphs EMNLP 2024 + + +
+ In the age of mobile internet, user data, often referred to as memories, is +continuously generated on personal devices. Effectively managing and utilizing +this data to deliver services to users is a compelling research topic. In this +paper, we introduce a novel task of crafting personalized agents powered by +large language models (LLMs), which utilize a user's smartphone memories to +enhance downstream applications with advanced LLM capabilities. To achieve this +goal, we introduce EMG-RAG, a solution that combines Retrieval-Augmented +Generation (RAG) techniques with an Editable Memory Graph (EMG). This approach +is further optimized using Reinforcement Learning to address three distinct +challenges: data collection, editability, and selectability. Extensive +experiments on a real-world dataset validate the effectiveness of EMG-RAG, +achieving an improvement of approximately 10% over the best existing approach. +Additionally, the personalized agents have been transferred into a real +smartphone AI assistant, which leads to enhanced usability. + +
+
+ comment: This paper has been accepted by EMNLP 2024 +
+
+
+
+
+ + ☆ Utilizing Collaborative Filtering in a Personalized Research-Paper + Recommendation System + + +
+ A recommendation system is a platform that helps people quickly find the
+items they need. It is built on the preferences of similar users or items. In
+this digital era, the internet offers a wealth of open resources, but with so
+many of them, finding the precise one is difficult; recommendation systems
+make this easier. A research-paper recommendation system serves people with
+common research interests and is built here with a collaborative filtering
+recommender. In this paper, coauthor, keyword, reference, and common-citation
+similarities are computed using Jaccard similarity and combined into a final
+similarity score used to find the top-n most similar users. Recommendations
+for the target user are then drawn from the papers of these top-n similar
+users. Finally, the accuracy of the recommendation system is measured, and the
+proposed system achieves promising results.
+
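Editor's note: the similarity computation described above is straightforward to sketch: Jaccard similarity over each facet (coauthors, keywords, references, common citations), averaged into a final score used to rank the top-n most similar users. The equal facet weighting below is an assumption for illustration.

```python
def jaccard(a, b):
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if a | b else 0.0

def user_similarity(u, v, facets=("coauthors", "keywords", "references", "citations")):
    """Average Jaccard similarity across the facets (equal weights assumed here)."""
    return sum(jaccard(u[f], v[f]) for f in facets) / len(facets)

def top_n_similar(target, users, n=2):
    return sorted(users, key=lambda v: user_similarity(target, v), reverse=True)[:n]

# toy profiles
target = {"coauthors": {"a", "b"}, "keywords": {"reid"}, "references": {"p1"}, "citations": {"c1"}}
others = [
    {"coauthors": {"a"}, "keywords": {"reid", "cv"}, "references": {"p1"}, "citations": set()},
    {"coauthors": {"x"}, "keywords": {"nlp"}, "references": {"p9"}, "citations": {"c9"}},
]
print(top_n_similar(target, others, n=1))
```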
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ An Efficient Multi-threaded Collaborative Filtering Approach in + Recommendation System + + +
+ Recommender systems are a subset of information filtering systems designed to +predict and suggest items that users may find interesting or relevant based on +their preferences, behaviors, or interactions. By analyzing user data such as +past activities, ratings, and preferences, these systems generate personalized +recommendations for products, services, or content, with common applications +including online retail, media streaming platforms, and social media. +Recommender systems are typically categorized into three types: content-based +filtering, which recommends items similar to those the user has shown interest +in; collaborative filtering, which analyzes the preferences of similar users; +and hybrid methods, which combine both approaches to improve accuracy. These +systems enhance user experience by reducing information overload and providing +personalized suggestions, thus increasing engagement and satisfaction. However, +building a scalable recommendation system capable of handling numerous users +efficiently is a significant challenge, particularly when considering both +performance consistency and user data security, which are emerging research +topics. The primary objective of this research is to address these challenges +by reducing the processing time in recommendation systems. A multithreaded +similarity approach is employed to achieve this, where users are divided into +independent threads that run in parallel. This parallelization significantly +reduces computation time compared to traditional methods, resulting in a +faster, more efficient, and scalable recommendation system that ensures +improved performance without compromising user data security. + +
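Editor's note: the parallelization described above can be sketched with a thread pool that partitions the users and computes each partition's similarities independently; this is a generic sketch of the approach, not the paper's implementation. (In CPython, NumPy's BLAS calls release the GIL, so threads help here; a process pool would be the alternative for a pure-Python similarity kernel.)

```python
import numpy as np
from concurrent.futures import ThreadPoolExecutor

def cosine_block(ratings, user_block):
    """Cosine similarities of one block of users against all users (rows = users)."""
    unit = ratings / (np.linalg.norm(ratings, axis=1, keepdims=True) + 1e-8)
    return unit[user_block] @ unit.T          # (len(block), n_users)

def parallel_similarity(ratings, n_threads=4):
    blocks = np.array_split(np.arange(ratings.shape[0]), n_threads)
    with ThreadPoolExecutor(max_workers=n_threads) as pool:
        parts = pool.map(lambda blk: cosine_block(ratings, blk), blocks)
    return np.vstack(list(parts))

ratings = np.random.default_rng(0).random((1000, 50))   # user x item rating matrix
sim = parallel_similarity(ratings)
print(sim.shape)  # (1000, 1000)
```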
+
+ comment: 6 Pages 6 Figure, Paper got accepted at the 2nd International + Conference on Artificial Intelligence, Blockchain, and Internet of Things, + (AIBThings) +
+
+
+
+
+ + ♻ ☆ Decoding Matters: Addressing Amplification Bias and Homogeneity Issue + for LLM-based Recommendation EMNLP 2024 + + +
+ Adapting Large Language Models (LLMs) for recommendation requires careful +consideration of the decoding process, given the inherent differences between +generating items and natural language. Existing approaches often directly apply +LLMs' original decoding methods. However, we find these methods encounter +significant challenges: 1) amplification bias -- where standard length +normalization inflates scores for items containing tokens with generation +probabilities close to 1 (termed ghost tokens), and 2) homogeneity issue -- +generating multiple similar or repetitive items for a user. To tackle these +challenges, we introduce a new decoding approach named Debiasing-Diversifying +Decoding (D3). D3 disables length normalization for ghost tokens to alleviate +amplification bias, and it incorporates a text-free assistant model to +encourage tokens less frequently generated by LLMs for counteracting +recommendation homogeneity. Extensive experiments on real-world datasets +demonstrate the method's effectiveness in enhancing accuracy and diversity. + +
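Editor's note: the amplification bias can be demonstrated numerically: ghost tokens (probability ≈ 1) add almost nothing to the log-score yet still inflate the denominator under standard length normalization, so items containing them get inflated normalized scores; excluding them from the normalizing length removes that advantage. The sketch below is a toy illustration of this effect, with the ghost-token threshold as an assumed hyperparameter; the paper's exact formulation may differ.

```python
import math

def sequence_score(token_logprobs, ghost_threshold=-1e-3, debias=True):
    """Length-normalized sequence score. With `debias`, tokens whose log-probability
    is ~0 (probability ~1, i.e. ghost tokens) are excluded from the length used for
    normalization, so they no longer inflate the normalized score."""
    total = sum(token_logprobs)
    if debias:
        effective_len = sum(1 for lp in token_logprobs if lp < ghost_threshold)
    else:
        effective_len = len(token_logprobs)
    return total / max(effective_len, 1)

item_a = [math.log(0.4), math.log(0.5)]                    # two informative tokens
item_b = [math.log(0.4), math.log(0.5), 0.0, 0.0, 0.0]     # same, plus three ghost tokens
print(sequence_score(item_a, debias=False), sequence_score(item_b, debias=False))  # b looks better
print(sequence_score(item_a), sequence_score(item_b))      # equal once debiasing is on
```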
+
+ comment: Accepted at EMNLP 2024 Main Conference +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Extending Depth of Field for Varifocal Multiview Images + + +
+ Optical imaging systems are generally limited by the depth of field because
+of the nature of the optics. Therefore, extending depth of field (EDoF) is a
+fundamental task for meeting the requirements of emerging visual applications.
+The common practice for this task is to use multi-focus images captured from a
+single viewpoint. This approach can achieve acceptable EDoF quality for a
+fixed field of view, but it applies only to static scenes and the field of
+view remains limited. An emerging data type, varifocal multiview images, has
+the potential to become a new paradigm for EDoF, because such data contain
+more field-of-view information than multi-focus images. To realize EDoF from
+varifocal multiview images, we propose an end-to-end method comprising image
+alignment, image optimization, and image fusion. Experimental results
+demonstrate the efficiency of the proposed method.
+
+
+
+
+
+ + ♻ ☆ WorldGPT: Empowering LLM as Multimodal World Model + + +
+ World models are progressively being employed across diverse fields,
+extending from basic environment simulation to complex scenario construction.
+However, existing models are mainly trained on domain-specific states and
+actions, and confined to single-modality state representations. In this paper,
+we introduce WorldGPT, a generalist world model built upon a Multimodal Large
+Language Model (MLLM). WorldGPT acquires an understanding of world dynamics by
+analyzing millions of videos across various domains. To further enhance
+WorldGPT's capability in specialized scenarios and long-term tasks, we
+integrate it with a novel cognitive architecture that combines memory
+offloading, knowledge retrieval, and context reflection. For evaluation, we
+build WorldNet, a multimodal state transition prediction benchmark
+encompassing varied real-life scenarios. Evaluations on WorldNet directly
+demonstrate WorldGPT's capability to accurately model state transition
+patterns, affirming its effectiveness in understanding and predicting the
+dynamics of complex scenarios. We further explore WorldGPT's emerging
+potential in serving as a world simulator, helping multimodal agents
+generalize to unfamiliar domains by efficiently synthesizing multimodal
+instruction instances, which prove to be as reliable as authentic data for
+fine-tuning purposes. The project is available at
+https://github.com/DCDmllm/WorldGPT.
+
+
+ comment: update v2 +
+
+
+
+
+ + ♻ ☆ Less is More: A Simple yet Effective Token Reduction Method for + Efficient Multi-modal LLMs + + +
+ The rapid advancement of Multimodal Large Language Models (MLLMs) has led to
+remarkable performance across various domains. However, this progress is
+accompanied by a substantial surge in the resource consumption of these
+models. We address this pressing issue by introducing a new approach, Token
+Reduction using CLIP Metric (TRIM), aimed at improving the efficiency of MLLMs
+without sacrificing their performance. Inspired by human attention patterns in
+Visual Question Answering (VQA) tasks, TRIM presents a fresh perspective on
+the selection and reduction of image tokens. The TRIM method has been
+extensively tested across 12 datasets, and the results demonstrate a
+significant reduction in computational overhead while maintaining a consistent
+level of performance. This research marks a critical stride in efficient MLLM
+development, promoting greater accessibility and sustainability of
+high-performing models.
+
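Editor's note: the token-selection idea can be sketched as ranking image tokens by a CLIP-style similarity to the text query and keeping only the most relevant ones. The simple top-k rule below is an assumption for illustration; TRIM's actual selection criterion may differ.

```python
import numpy as np

def trim_image_tokens(image_tokens, text_emb, keep=16):
    """Keep the `keep` image tokens most similar to the text embedding (cosine
    similarity), discarding the rest to shorten the LLM's visual input."""
    t = text_emb / (np.linalg.norm(text_emb) + 1e-8)
    v = image_tokens / (np.linalg.norm(image_tokens, axis=1, keepdims=True) + 1e-8)
    sims = v @ t
    keep_idx = np.argsort(sims)[::-1][:keep]
    return image_tokens[np.sort(keep_idx)]     # preserve the original token order

rng = np.random.default_rng(0)
tokens = rng.normal(size=(576, 512))           # e.g. 24x24 patch tokens
text = rng.normal(size=512)
print(trim_image_tokens(tokens, text).shape)   # (16, 512)
```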
+
+ comment: 9 pages, 3 figures, 6 tables Code and Model: + https://github.com/FreedomIntelligence/TRIM +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ 



diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse with TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); //add this
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); //add this
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`