diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..1060562a --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-10-23T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2410.18077v1","updated":"2024-10-23T17:58:49Z","published":"2024-10-23T17:58:49Z","title":"ALTA: Compiler-Based Analysis of Transformers","summary":" We propose a new programming language called ALTA and a compiler that can map\nALTA programs to Transformer weights. ALTA is inspired by RASP, a language\nproposed by Weiss et al. (2021), and Tracr (Lindner et al., 2023), a compiler\nfrom RASP programs to Transformer weights. ALTA complements and extends this\nprior work, offering the ability to express loops and to compile programs to\nUniversal Transformers, among other advantages. ALTA allows us to\nconstructively show how Transformers can represent length-invariant algorithms\nfor computing parity and addition, as well as a solution to the SCAN benchmark\nof compositional generalization tasks, without requiring intermediate\nscratchpad decoding steps. We also propose tools to analyze cases where the\nexpressibility of an algorithm is established, but end-to-end training on a\ngiven training set fails to induce behavior consistent with the desired\nalgorithm. To this end, we explore training from ALTA execution traces as a\nmore fine-grained supervision signal. This enables additional experiments and\ntheoretical analyses relating the learnability of various algorithms to data\navailability and modeling decisions, such as positional encodings. We make the\nALTA framework -- language specification, symbolic interpreter, and weight\ncompiler -- available to the community to enable further applications and\ninsights.\n","authors":["Peter Shaw","James Cohan","Jacob Eisenstein","Kenton Lee","Jonathan Berant","Kristina Toutanova"],"pdf_url":"https://arxiv.org/pdf/2410.18077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18071v1","updated":"2024-10-23T17:54:43Z","published":"2024-10-23T17:54:43Z","title":"TP-Eval: Tap Multimodal LLMs' Potential in Evaluation by Customizing\n Prompts","summary":" Recently, multimodal large language models (MLLMs) have received much\nattention for their impressive capabilities. The evaluation of MLLMs is\nbecoming critical to analyzing attributes of MLLMs and providing valuable\ninsights. However, current benchmarks overlook the problem of prompt\nsensitivity - minor prompt variations may lead to significant performance\nfluctuations. Thus, inappropriate prompts may obscure the models' capabilities,\nunderestimating the models' performance. Moreover, different models have\ndifferent preferences for different prompts, and thus, using the same prompt\nfor all models will cause evaluation bias. This paper analyzes this deficiency\nin existing benchmarks and further introduces a new evaluation framework named\nTP-Eval, which introduces a prompt customization method to reduce evaluation\nbiases and tap models' potential. TP-Eval will rewrite the original prompts to\ndifferent customized prompts for different models. In particular, we propose\nsome well-designed modules for prompt customization tailored to the scenario of\nMLLM evaluation. 
Extensive experiments demonstrate the effectiveness of our\napproach to uncovering models' capabilities, and TP-Eval should benefit the\ncommunity in developing more comprehensive and convincing MLLM evaluation\nbenchmarks.\n","authors":["Yuxuan Xie","Tianhua Li","Wenqi Shao","Kaipeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.18071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.15240v2","updated":"2024-10-23T17:47:58Z","published":"2024-09-23T17:38:41Z","title":"MADial-Bench: Towards Real-world Evaluation of Memory-Augmented Dialogue\n Generation","summary":" Long-term memory is important for chatbots and dialogue systems (DS) to\ncreate consistent and human-like conversations, evidenced by numerous developed\nmemory-augmented DS (MADS). To evaluate the effectiveness of such MADS,\nexisting commonly used evaluation metrics, like retrieval accuracy and\nperplexity (PPL), mainly focus on query-oriented factualness and language\nquality assessment. However, these metrics often lack practical value.\nMoreover, the evaluation dimensions are insufficient for human-like assessment\nin DS. Regarding memory-recalling paradigms, current evaluation schemes only\nconsider passive memory retrieval while ignoring diverse memory recall with\nrich triggering factors, e.g., emotions and surroundings, which can be\nessential in emotional support scenarios. To bridge the gap, we construct a\nnovel Memory-Augmented Dialogue Benchmark (MADail-Bench) covering various\nmemory-recalling paradigms based on cognitive science and psychology theories.\nThe benchmark assesses two tasks separately: memory retrieval and memory\nrecognition with the incorporation of both passive and proactive memory recall\ndata. We introduce new scoring criteria to the evaluation, including memory\ninjection, emotion support (ES) proficiency, and intimacy, to comprehensively\nassess generated responses. Results from cutting-edge embedding models and\nlarge language models on this benchmark indicate the potential for further\nadvancement. Extensive testing further reveals correlations between memory\ninjection, ES proficiency, and intimacy.\n","authors":["Junqing He","Liang Zhu","Rui Wang","Xi Wang","Reza Haffari","Jiaxing Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.15240v2.pdf","comment":"Submitted to NAACL 2025"},{"id":"http://arxiv.org/abs/2404.19442v2","updated":"2024-10-23T17:46:13Z","published":"2024-04-30T10:45:40Z","title":"Does Generative AI speak Nigerian-Pidgin?: Issues about\n Representativeness and Bias for Multilingualism in LLMs","summary":" Nigeria is a multilingual country with 500+ languages. Naija is a\nNigerian-Pidgin spoken by approx. 120M speakers in Nigeria and it is a mixed\nlanguage (e.g., English, Portuguese, Yoruba, Hausa and Igbo). Although it has\nmainly been a spoken language until recently, there are now various platforms\npublishing exclusively in Naija such as Naija Wikipedia. However, it is hard to\ndistinguish by non-native from a larger pidgin languages spoken across West\nAfrica known as West African Pidgin English (WAPE) -- which is more simplied\nand understandable by wider audience in Ghana, Nigeria, and Cameroon. BBC news\nplatform publishes exclusively in WAPE to cater for several countries in West\nAfrica. In our paper, we show through statistical analyses and Machine\nTranslation experiments that these two creole varieties do not represent each\nother (i.e., there are linguistic differences in word order and vocabulary) and\nGenerative AI operates only based on WAPE. 
In other words, Naija is\nunder-represented in Generative AI, and it is hard to teach LLMs with few\nexamples.\n","authors":["David Ifeoluwa Adelani","A. Seza Doğruöz","Iyanuoluwa Shode","Anuoluwapo Aremu"],"pdf_url":"https://arxiv.org/pdf/2404.19442v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2407.15762v2","updated":"2024-10-23T17:42:39Z","published":"2024-07-22T16:13:38Z","title":"Conditional Language Policy: A General Framework for Steerable\n Multi-Objective Finetuning","summary":" Reward-based finetuning is crucial for aligning language policies with\nintended behaviors (e.g., creativity and safety). A key challenge is to develop\nsteerable language models that trade-off multiple (conflicting) objectives in a\nflexible and efficient manner. This paper presents Conditional Language Policy\n(CLP), a general framework for finetuning language models on multiple\nobjectives. Building on techniques from multi-task training and\nparameter-efficient finetuning, CLP learn steerable models that effectively\ntrade-off conflicting objectives at inference time. Notably, this does not\nrequire training or maintaining multiple models to achieve different trade-offs\nbetween the objectives. Through extensive experiments and ablations on two\nsummarization datasets, we show that CLP learns steerable language models that\noutperform and Pareto-dominate the existing approaches for multi-objective\nfinetuning.\n","authors":["Kaiwen Wang","Rahul Kidambi","Ryan Sullivan","Alekh Agarwal","Christoph Dann","Andrea Michi","Marco Gelmi","Yunxuan Li","Raghav Gupta","Avinava Dubey","Alexandre Ramé","Johan Ferret","Geoffrey Cideron","Le Hou","Hongkun Yu","Amr Ahmed","Aranyak Mehta","Léonard Hussenot","Olivier Bachem","Edouard Leurent"],"pdf_url":"https://arxiv.org/pdf/2407.15762v2.pdf","comment":"40 pages. Findings of EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.18057v1","updated":"2024-10-23T17:30:50Z","published":"2024-10-23T17:30:50Z","title":"CLEAR: Character Unlearning in Textual and Visual Modalities","summary":" Machine Unlearning (MU) is critical for enhancing privacy and security in\ndeep learning models, particularly in large multimodal language models (MLLMs),\nby removing specific private or hazardous information. While MU has made\nsignificant progress in textual and visual modalities, multimodal unlearning\n(MMU) remains significantly underexplored, partially due to the absence of a\nsuitable open-source benchmark. To address this, we introduce CLEAR, a new\nbenchmark designed to evaluate MMU methods. CLEAR contains 200 fictitious\nindividuals and 3,700 images linked with corresponding question-answer pairs,\nenabling a thorough evaluation across modalities. We assess 10 MU methods,\nadapting them for MMU, and highlight new challenges specific to multimodal\nforgetting. We also demonstrate that simple $\\ell_1$ regularization on LoRA\nweights significantly mitigates catastrophic forgetting, preserving model\nperformance on retained data. The dataset is available at\nhttps://huggingface.co/datasets/therem/CLEAR\n","authors":["Alexey Dontsov","Dmitrii Korzh","Alexey Zhavoronkin","Boris Mikheev","Denis Bobkov","Aibek Alanov","Oleg Y. 
Rogov","Ivan Oseledets","Elena Tutubalina"],"pdf_url":"https://arxiv.org/pdf/2410.18057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18050v1","updated":"2024-10-23T17:24:58Z","published":"2024-10-23T17:24:58Z","title":"LongRAG: A Dual-Perspective Retrieval-Augmented Generation Paradigm for\n Long-Context Question Answering","summary":" Long-Context Question Answering (LCQA), a challenging task, aims to reason\nover long-context documents to yield accurate answers to questions. Existing\nlong-context Large Language Models (LLMs) for LCQA often struggle with the\n\"lost in the middle\" issue. Retrieval-Augmented Generation (RAG) mitigates this\nissue by providing external factual evidence. However, its chunking strategy\ndisrupts the global long-context information, and its low-quality retrieval in\nlong contexts hinders LLMs from identifying effective factual details due to\nsubstantial noise. To this end, we propose LongRAG, a general,\ndual-perspective, and robust LLM-based RAG system paradigm for LCQA to enhance\nRAG's understanding of complex long-context knowledge (i.e., global information\nand factual details). We design LongRAG as a plug-and-play paradigm,\nfacilitating adaptation to various domains and LLMs. Extensive experiments on\nthree multi-hop datasets demonstrate that LongRAG significantly outperforms\nlong-context LLMs (up by 6.94%), advanced RAG (up by 6.16%), and Vanilla RAG\n(up by 17.25%). Furthermore, we conduct quantitative ablation studies and\nmulti-dimensional analyses, highlighting the effectiveness of the system's\ncomponents and fine-tuning strategies. Data and code are available at\nhttps://github.com/QingFei1/LongRAG.\n","authors":["Qingfei Zhao","Ruobing Wang","Yukuo Cen","Daren Zha","Shicheng Tan","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2410.18050v1.pdf","comment":"EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2410.18040v1","updated":"2024-10-23T17:07:32Z","published":"2024-10-23T17:07:32Z","title":"Key Algorithms for Keyphrase Generation: Instruction-Based LLMs for\n Russian Scientific Keyphrases","summary":" Keyphrase selection is a challenging task in natural language processing that\nhas a wide range of applications. Adapting existing supervised and unsupervised\nsolutions for the Russian language faces several limitations due to the rich\nmorphology of Russian and the limited number of training datasets available.\nRecent studies conducted on English texts show that large language models\n(LLMs) successfully address the task of generating keyphrases. LLMs allow\nachieving impressive results without task-specific fine-tuning, using text\nprompts instead. In this work, we access the performance of prompt-based\nmethods for generating keyphrases for Russian scientific abstracts. First, we\ncompare the performance of zero-shot and few-shot prompt-based methods,\nfine-tuned models, and unsupervised methods. Then we assess strategies for\nselecting keyphrase examples in a few-shot setting. We present the outcomes of\nhuman evaluation of the generated keyphrases and analyze the strengths and\nweaknesses of the models through expert assessment. 
Our results suggest that\nprompt-based methods can outperform common baselines even using simple text\nprompts.\n","authors":["Anna Glazkova","Dmitry Morozov","Timur Garipov"],"pdf_url":"https://arxiv.org/pdf/2410.18040v1.pdf","comment":"The 12th International Conference on Analysis of Images, Social\n Networks and Texts (AIST'2024)"},{"id":"http://arxiv.org/abs/2410.18035v1","updated":"2024-10-23T17:04:40Z","published":"2024-10-23T17:04:40Z","title":"MiLoRA: Efficient Mixture of Low-Rank Adaptation for Large Language\n Models Fine-tuning","summary":" Low-rank adaptation (LoRA) and its mixture-of-experts (MOE) variants are\nhighly effective parameter-efficient fine-tuning (PEFT) methods. However, they\nintroduce significant latency in multi-tenant settings due to the LoRA modules\nand MOE routers added to multiple linear modules in the Transformer layer. To\naddress this issue, we propose Mixture of Low-Rank Adaptation (MiLoRA), a novel\nand efficient LoRA variant. MiLoRA differs from previous MOE-style LoRA methods\nby considering each LoRA module as an expert and employing a prompt-aware\nrouting mechanism. This mechanism calculates expert routing results once before\ngenerating the first new token and reuses these results for subsequent tokens,\nreducing latency. Extensive experiments and analysis on commonsense reasoning\ntasks, math reasoning tasks, and widely used LLM evaluation benchmarks\ndemonstrate that MiLoRA consistently outperforms strong PEFT baselines with\ncomparable tunable parameter budgets. Additionally, MiLoRA significantly\nreduces latency in multi-tenant settings compared to previous LoRA-based\nmethods.\n","authors":["Jingfan Zhang","Yi Zhao","Dan Chen","Xing Tian","Huanran Zheng","Wei Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.18035v1.pdf","comment":"Accepted by EMNLP 2024 Findings. arXiv admin note: substantial text\n overlap with arXiv:2405.18203"},{"id":"http://arxiv.org/abs/2410.18032v1","updated":"2024-10-23T17:02:59Z","published":"2024-10-23T17:02:59Z","title":"GraphTeam: Facilitating Large Language Model-based Graph Analysis via\n Multi-Agent Collaboration","summary":" Graphs are widely used for modeling relational data in real-world scenarios,\nsuch as social networks and urban computing. Existing LLM-based graph analysis\napproaches either integrate graph neural networks (GNNs) for specific machine\nlearning tasks, limiting their transferability, or rely solely on LLMs'\ninternal reasoning ability, resulting in suboptimal performance. To address\nthese limitations, we take advantage of recent advances in LLM-based agents,\nwhich have shown capabilities of utilizing external knowledge or tools for\nproblem solving. By simulating human problem-solving strategies such as analogy\nand collaboration, we propose a multi-agent system based on LLMs named\nGraphTeam, for graph analysis. GraphTeam consists of five LLM-based agents from\nthree modules, and the agents with different specialities can collaborate with\neach other to address complex problems. Specifically, (1) input-output\nnormalization module: the question agent extracts and refines four key\narguments from the original question, facilitating the problem understanding,\nand the answer agent organizes the results to meet the output requirement; (2)\nexternal knowledge retrieval module: we first build a knowledge base consisting\nof relevant documentation and experience information, and then the search agent\nretrieves the most relevant entries for each question. 
(3) problem-solving\nmodule: given the retrieved information from search agent, the coding agent\nuses established algorithms via programming to generate solutions, and in case\nthe coding agent does not work, the reasoning agent will directly compute the\nresults without programming. Extensive experiments on six graph analysis\nbenchmarks demonstrate that GraphTeam achieves state-of-the-art performance\nwith an average 25.85% improvement over the best baseline in terms of accuracy.\nThe code and data are available at https://github.com/BUPT-GAMMA/GraphTeam.\n","authors":["Xin Li","Qizhi Chu","Yubin Chen","Yang Liu","Yaoqi Liu","Zekai Yu","Weize Chen","Chen Qian","Chuan Shi","Cheng Yang"],"pdf_url":"https://arxiv.org/pdf/2410.18032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18027v1","updated":"2024-10-23T17:00:13Z","published":"2024-10-23T17:00:13Z","title":"Cross-lingual Transfer of Reward Models in Multilingual Alignment","summary":" Reinforcement learning with human feedback (RLHF) is shown to largely benefit\nfrom precise reward models (RMs). However, recent studies in reward modeling\nschemes are skewed towards English, limiting the applicability of RLHF in\nmultilingual alignments. In this work, we investigate the cross-lingual\ntransfer of RMs trained in diverse languages, primarily from English. Our\nexperimental results demonstrate the strong cross-lingual transfer of English\nRMs, exceeding target language RMs by 3~4% average increase in Multilingual\nRewardBench. Furthermore, we analyze the cross-lingual transfer of RMs through\nthe representation shifts. Finally, we perform multilingual alignment to\nexemplify how cross-lingual transfer in RM propagates to enhanced multilingual\ninstruction-following capability, along with extensive analyses on\noff-the-shelf RMs. We release the code, model, and data.\n","authors":["Jiwoo Hong","Noah Lee","Rodrigo Martínez-Castaño","César Rodríguez","James Thorne"],"pdf_url":"https://arxiv.org/pdf/2410.18027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11757v4","updated":"2024-10-23T16:41:45Z","published":"2024-06-17T17:16:45Z","title":"STAR: SocioTechnical Approach to Red Teaming Language Models","summary":" This research introduces STAR, a sociotechnical framework that improves on\ncurrent best practices for red teaming safety of large language models. STAR\nmakes two key contributions: it enhances steerability by generating\nparameterised instructions for human red teamers, leading to improved coverage\nof the risk surface. Parameterised instructions also provide more detailed\ninsights into model failures at no increased cost. Second, STAR improves signal\nquality by matching demographics to assess harms for specific groups, resulting\nin more sensitive annotations. STAR further employs a novel step of arbitration\nto leverage diverse viewpoints and improve label reliability, treating\ndisagreement not as noise but as a valuable contribution to signal quality.\n","authors":["Laura Weidinger","John Mellor","Bernat Guillen Pegueroles","Nahema Marchal","Ravin Kumar","Kristian Lum","Canfer Akbulut","Mark Diaz","Stevie Bergman","Mikel Rodriguez","Verena Rieser","William Isaac"],"pdf_url":"https://arxiv.org/pdf/2406.11757v4.pdf","comment":"8 pages, 5 figures, 5 pages appendix. 
* denotes equal contribution"},{"id":"http://arxiv.org/abs/2409.17270v2","updated":"2024-10-23T16:27:20Z","published":"2024-09-25T18:35:45Z","title":"Proof of Thought : Neurosymbolic Program Synthesis allows Robust and\n Interpretable Reasoning","summary":" Large Language Models (LLMs) have revolutionized natural language processing,\nyet they struggle with inconsistent reasoning, particularly in novel domains\nand complex logical sequences. This research introduces Proof of Thought, a\nframework that enhances the reliability and transparency of LLM outputs. Our\napproach bridges LLM-generated ideas with formal logic verification, employing\na custom interpreter to convert LLM outputs into First Order Logic constructs\nfor theorem prover scrutiny. Central to our method is an intermediary\nJSON-based Domain-Specific Language, which by design balances precise logical\nstructures with intuitive human concepts. This hybrid representation enables\nboth rigorous validation and accessible human comprehension of LLM reasoning\nprocesses. Key contributions include a robust type system with sort management\nfor enhanced logical integrity, explicit representation of rules for clear\ndistinction between factual and inferential knowledge, and a flexible\narchitecture that allows for easy extension to various domain-specific\napplications. We demonstrate Proof of Thought's effectiveness through\nbenchmarking on StrategyQA and a novel multimodal reasoning task, showing\nimproved performance in open-ended scenarios. By providing verifiable and\ninterpretable results, our technique addresses critical needs for AI system\naccountability and sets a foundation for human-in-the-loop oversight in\nhigh-stakes domains.\n","authors":["Debargha Ganguly","Srinivasan Iyengar","Vipin Chaudhary","Shivkumar Kalyanaraman"],"pdf_url":"https://arxiv.org/pdf/2409.17270v2.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024) System 2 Reasoning At Scale Workshop"},{"id":"http://arxiv.org/abs/2407.10992v2","updated":"2024-10-23T16:19:06Z","published":"2024-06-24T09:29:14Z","title":"AlleNoise: large-scale text classification benchmark dataset with\n real-world label noise","summary":" Label noise remains a challenge for training robust classification models.\nMost methods for mitigating label noise have been benchmarked using primarily\ndatasets with synthetic noise. While the need for datasets with realistic noise\ndistribution has partially been addressed by web-scraped benchmarks such as\nWebVision and Clothing1M, those benchmarks are restricted to the computer\nvision domain. With the growing importance of Transformer-based models, it is\ncrucial to establish text classification benchmarks for learning with noisy\nlabels. In this paper, we present AlleNoise, a new curated text classification\nbenchmark dataset with real-world instance-dependent label noise, containing\nover 500,000 examples across approximately 5,600 classes, complemented with a\nmeaningful, hierarchical taxonomy of categories. The noise distribution comes\nfrom actual users of a major e-commerce marketplace, so it realistically\nreflects the semantics of human mistakes. In addition to the noisy labels, we\nprovide human-verified clean labels, which help to get a deeper insight into\nthe noise distribution, unlike web-scraped datasets typically used in the\nfield. 
We demonstrate that a representative selection of established methods\nfor learning with noisy labels is inadequate to handle such real-world noise.\nIn addition, we show evidence that these algorithms do not alleviate excessive\nmemorization. As such, with AlleNoise, we set the bar high for the development\nof label noise methods that can handle real-world label noise in text\nclassification tasks. The code and dataset are available for download at\nhttps://github.com/allegro/AlleNoise.\n","authors":["Alicja Rączkowska","Aleksandra Osowska-Kurczab","Jacek Szczerbiński","Kalina Jasinska-Kobus","Klaudia Nazarko"],"pdf_url":"https://arxiv.org/pdf/2407.10992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15720v4","updated":"2024-10-23T16:12:39Z","published":"2024-04-24T08:13:02Z","title":"Annotator-Centric Active Learning for Subjective NLP Tasks","summary":" Active Learning (AL) addresses the high costs of collecting human annotations\nby strategically annotating the most informative samples. However, for\nsubjective NLP tasks, incorporating a wide range of perspectives in the\nannotation process is crucial to capture the variability in human judgments. We\nintroduce Annotator-Centric Active Learning (ACAL), which incorporates an\nannotator selection strategy following data sampling. Our objective is\ntwo-fold: 1) to efficiently approximate the full diversity of human judgments,\nand 2) to assess model performance using annotator-centric metrics, which value\nminority and majority perspectives equally. We experiment with multiple\nannotator selection strategies across seven subjective NLP tasks, employing\nboth traditional and novel, human-centered evaluation metrics. Our findings\nindicate that ACAL improves data efficiency and excels in annotator-centric\nperformance evaluations. However, its success depends on the availability of a\nsufficiently large and diverse pool of annotators to sample from.\n","authors":["Michiel van der Meer","Neele Falk","Pradeep K. Murukannaiah","Enrico Liscio"],"pdf_url":"https://arxiv.org/pdf/2404.15720v4.pdf","comment":"Accepted at EMNLP2024"},{"id":"http://arxiv.org/abs/2410.14979v2","updated":"2024-10-23T15:43:28Z","published":"2024-10-19T05:01:56Z","title":"Do Large Language Models Truly Grasp Mathematics? An Empirical\n Exploration","summary":" Despite their proficiency in math tasks, the mechanisms underlying LLMs'\nmathematical reasoning abilities remain a subject of debate. Recent studies\nsuggest that chain-of-thought (CoT) prompts can bolster mathematical reasoning\nby encouraging LLMs to employ human-like logical reasoning (System 2), enabling\nthem to excel on the Cognitive Reflection Test (CRT). To assess whether LLMs\ngenuinely possess System 2-like logical reasoning, we introduced targeted\nmodifications to CRT problems. Our findings reveal that, despite the use of CoT\nprompts, mainstream LLMs, including the latest o1-preview model, continue to\nexhibit a significant error rate. Further analysis indicates that they\npredominantly rely on System 1-like intuitive reasoning and pattern matching\nderived from training data, rather than demonstrating mastery of mathematical\nthinking. 
This discovery challenges the prevailing notion that LLMs possess\ngenuine logical reasoning abilities and that CoT can enhance them.\nConsequently, this work may temper overly optimistic projections regarding\nLLMs' advancement toward artificial general intelligence.\n","authors":["Wei Xie","Shuoyoucheng Ma","Zhenhua Wang","Enze Wang","Baosheng Wang","Jinshu Su"],"pdf_url":"https://arxiv.org/pdf/2410.14979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17973v1","updated":"2024-10-23T15:37:08Z","published":"2024-10-23T15:37:08Z","title":"Together We Can: Multilingual Automatic Post-Editing for Low-Resource\n Languages","summary":" This exploratory study investigates the potential of multilingual Automatic\nPost-Editing (APE) systems to enhance the quality of machine translations for\nlow-resource Indo-Aryan languages. Focusing on two closely related language\npairs, English-Marathi and English-Hindi, we exploit the linguistic\nsimilarities to develop a robust multilingual APE model. To facilitate\ncross-linguistic transfer, we generate synthetic Hindi-Marathi and\nMarathi-Hindi APE triplets. Additionally, we incorporate a Quality Estimation\n(QE)-APE multi-task learning framework. While the experimental results\nunderline the complementary nature of APE and QE, we also observe that QE-APE\nmultitask learning facilitates effective domain adaptation. Our experiments\ndemonstrate that the multilingual APE models outperform their corresponding\nEnglish-Hindi and English-Marathi single-pair models by $2.5$ and $2.39$ TER\npoints, respectively, with further notable improvements over the multilingual\nAPE model observed through multi-task learning ($+1.29$ and $+1.44$ TER\npoints), data augmentation ($+0.53$ and $+0.45$ TER points) and domain\nadaptation ($+0.35$ and $+0.45$ TER points). We release the synthetic data,\ncode, and models accrued during this study publicly at\nhttps://github.com/cfiltnlp/Multilingual-APE.\n","authors":["Sourabh Deoghare","Diptesh Kanojia","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2410.17973v1.pdf","comment":"Accepted at Findings of EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.17972v1","updated":"2024-10-23T15:37:02Z","published":"2024-10-23T15:37:02Z","title":"Dependency Graph Parsing as Sequence Labeling","summary":" Various linearizations have been proposed to cast syntactic dependency\nparsing as sequence labeling. However, these approaches do not support more\ncomplex graph-based representations, such as semantic dependencies or enhanced\nuniversal dependencies, as they cannot handle reentrancy or cycles. By\nextending them, we define a range of unbounded and bounded linearizations that\ncan be used to cast graph parsing as a tagging task, enlarging the toolbox of\nproblems that can be solved under this paradigm. Experimental results on\nsemantic dependency and enhanced UD parsing show that with a good choice of\nencoding, sequence-labeling dependency graph parsers combine high efficiency\nwith accuracies close to the state of the art, in spite of their simplicity.\n","authors":["Ana Ezquerro","David Vilares","Carlos Gómez-Rodríguez"],"pdf_url":"https://arxiv.org/pdf/2410.17972v1.pdf","comment":"Accepted at EMNLP-2024"},{"id":"http://arxiv.org/abs/2410.17963v1","updated":"2024-10-23T15:30:37Z","published":"2024-10-23T15:30:37Z","title":"A Time-Aware Approach to Early Detection of Anorexia: UNSL at eRisk 2024","summary":" The eRisk laboratory aims to address issues related to early risk detection\non the Web. 
In this year's edition, three tasks were proposed, where Task 2 was\nabout early detection of signs of anorexia. Early risk detection is a problem\nwhere precision and speed are two crucial objectives. Our research group solved\nTask 2 by defining a CPI+DMC approach, addressing both objectives\nindependently, and a time-aware approach, where precision and speed are\nconsidered a combined single-objective. We implemented the last approach by\nexplicitly integrating time during the learning process, considering the\nERDE{\\theta} metric as the training objective. It also allowed us to\nincorporate temporal metrics to validate and select the optimal models. We\nachieved outstanding results for the ERDE50 metric and ranking-based metrics,\ndemonstrating consistency in solving ERD problems.\n","authors":["Horacio Thompson","Marcelo Errecalde"],"pdf_url":"https://arxiv.org/pdf/2410.17963v1.pdf","comment":"In Conference and Labs of the Evaluation Forum (CLEF 2024), Grenoble,\n France"},{"id":"http://arxiv.org/abs/2410.17960v1","updated":"2024-10-23T15:28:53Z","published":"2024-10-23T15:28:53Z","title":"Zeitenwenden: Detecting changes in the German political discourse","summary":" From a monarchy to a democracy, to a dictatorship and back to a democracy --\nthe German political landscape has been constantly changing ever since the\nfirst German national state was formed in 1871. After World War II, the Federal\nRepublic of Germany was formed in 1949. Since then every plenary session of the\nGerman Bundestag was logged and even has been digitized over the course of the\nlast few years. We analyze these texts using a time series variant of the topic\nmodel LDA to investigate which events had a lasting effect on the political\ndiscourse and how the political topics changed over time. This allows us to\ndetect changes in word frequency (and thus key discussion points) in political\ndiscourse.\n","authors":["Kai-Robin Lange","Jonas Rieger","Niklas Benner","Carsten Jentsch"],"pdf_url":"https://arxiv.org/pdf/2410.17960v1.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2201.12091v5","updated":"2024-10-23T15:28:38Z","published":"2022-01-28T13:00:17Z","title":"Linear Adversarial Concept Erasure","summary":" Modern neural models trained on textual data rely on pre-trained\nrepresentations that emerge without direct supervision. As these\nrepresentations are increasingly being used in real-world applications, the\ninability to \\emph{control} their content becomes an increasingly important\nproblem. We formulate the problem of identifying and erasing a linear subspace\nthat corresponds to a given concept, in order to prevent linear predictors from\nrecovering the concept. We model this problem as a constrained, linear maximin\ngame, and show that existing solutions are generally not optimal for this task.\nWe derive a closed-form solution for certain objectives, and propose a convex\nrelaxation, \\method, that works well for others. When evaluated in the context\nof binary gender removal, the method recovers a low-dimensional subspace whose\nremoval mitigates bias by intrinsic and extrinsic evaluation. 
We show that the\nmethod is highly expressive, effectively mitigating bias in deep nonlinear\nclassifiers while maintaining tractability and interpretability.\n","authors":["Shauli Ravfogel","Michael Twiton","Yoav Goldberg","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2201.12091v5.pdf","comment":"Accepted in ICML 2022; a revised version"},{"id":"http://arxiv.org/abs/2410.17954v1","updated":"2024-10-23T15:24:54Z","published":"2024-10-23T15:24:54Z","title":"ExpertFlow: Optimized Expert Activation and Token Allocation for\n Efficient Mixture-of-Experts Inference","summary":" Sparse Mixture of Experts (MoE) models, while outperforming dense Large\nLanguage Models (LLMs) in terms of performance, face significant deployment\nchallenges during inference due to their high memory demands. Existing\noffloading techniques, which involve swapping activated and idle experts\nbetween the GPU and CPU, often suffer from rigid expert caching mechanisms.\nThese mechanisms fail to adapt to dynamic routing, leading to inefficient cache\nutilization, or incur prohibitive costs for prediction training. To tackle\nthese inference-specific challenges, we introduce ExpertFlow, a comprehensive\nsystem specifically designed to enhance inference efficiency by accommodating\nflexible routing and enabling efficient expert scheduling between CPU and GPU.\nThis reduces overhead and boosts system performance. Central to our approach is\na predictive routing path-based offloading mechanism that utilizes a\nlightweight predictor to accurately forecast routing paths before computation\nbegins. This proactive strategy allows for real-time error correction in expert\ncaching, significantly increasing cache hit ratios and reducing the frequency\nof expert transfers, thereby minimizing I/O overhead. Additionally, we\nimplement a dynamic token scheduling strategy that optimizes MoE inference by\nrearranging input tokens across different batches. This method not only reduces\nthe number of activated experts per batch but also improves computational\nefficiency. Our extensive experiments demonstrate that ExpertFlow achieves up\nto 93.72\\% GPU memory savings and enhances inference speed by 2 to 10 times\ncompared to baseline methods, highlighting its effectiveness and utility as a\nrobust solution for resource-constrained inference scenarios.\n","authors":["Xin He","Shunkang Zhang","Yuxin Wang","Haiyan Yin","Zihao Zeng","Shaohuai Shi","Zhenheng Tang","Xiaowen Chu","Ivor Tsang","Ong Yew Soon"],"pdf_url":"https://arxiv.org/pdf/2410.17954v1.pdf","comment":"Mixture-of-Experts, Inference, Offloading"},{"id":"http://arxiv.org/abs/2410.17952v1","updated":"2024-10-23T15:24:16Z","published":"2024-10-23T15:24:16Z","title":"SimRAG: Self-Improving Retrieval-Augmented Generation for Adapting Large\n Language Models to Specialized Domains","summary":" Retrieval-augmented generation (RAG) enhances the question-answering (QA)\nabilities of large language models (LLMs) by integrating external knowledge.\nHowever, adapting general-purpose RAG systems to specialized fields such as\nscience and medicine poses unique challenges due to distribution shifts and\nlimited access to domain-specific data. To tackle this, we propose SimRAG, a\nself-training approach that equips the LLM with joint capabilities of question\nanswering and question generation for domain adaptation. Our method first\nfine-tunes the LLM on instruction-following, question-answering, and\nsearch-related data. 
Then, it prompts the same LLM to generate diverse\ndomain-relevant questions from unlabeled corpora, with an additional filtering\nstrategy to retain high-quality synthetic examples. By leveraging these\nsynthetic examples, the LLM can improve their performance on domain-specific\nRAG tasks. Experiments on 11 datasets, spanning two backbone sizes and three\ndomains, demonstrate that SimRAG outperforms baselines by 1.2\\%--8.6\\%.\n","authors":["Ran Xu","Hui Liu","Sreyashi Nag","Zhenwei Dai","Yaochen Xie","Xianfeng Tang","Chen Luo","Yang Li","Joyce C. Ho","Carl Yang","Qi He"],"pdf_url":"https://arxiv.org/pdf/2410.17952v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2406.12295v2","updated":"2024-10-23T15:23:00Z","published":"2024-06-18T05:59:28Z","title":"Fast and Slow Generating: An Empirical Study on Large and Small Language\n Models Collaborative Decoding","summary":" Large Language Models (LLMs) exhibit impressive capabilities across various\napplications but encounter substantial challenges such as high inference\nlatency, considerable training costs, and the generation of hallucinations.\nCollaborative decoding between large and small language models (SLMs) presents\na promising strategy to mitigate these issues through methods including\nspeculative decoding, contrastive decoding, and emulator or proxy fine-tuning.\nHowever, the specifics of such collaborations, particularly from a unified\nperspective, remain largely unexplored. Inspired by dual-process cognitive\ntheory, we propose a unified framework in this paper, termed Fast and Slow\nGenerating (FS-GEN). Within this framework, LLMs (sometimes along with SLMs)\nare categorized as System 2 (slow and deliberate), while independent SLMs are\ndesignated as System 1 (fast and intuitive). We provide a comprehensive\nanalysis of these collaborative methodologies, elucidating their common\nproperties and shedding light on the differential knowledge capabilities of\nSystem 2 versus System 1 through the FS-GEN framework. Our findings indicate\nthat only a small proportion of collaborative interactions (approximately less\nthan 20\\% in most instances) are necessary across various methods. These\ninteractions between System 1 and System 2 conform to a scaling law related to\nthe parameter ratios, enabling predictable collaboration. Furthermore, we\nexplore the specific conditions under which collaboration proves most\neffective, particularly from an uncertainty perspective, offering novel\ninsights that may guide future optimization efforts. Our research underscores\nthat the fundamental distinction between System 1 and System 2 lies in the\nuncertainty of next token predictions, where interventions by System 2 are\ncrucial to support System 1. Code for Reproduction:\nhttps://github.com/TsinghuaC3I/FS-GEN\n","authors":["Kaiyan Zhang","Jianyu Wang","Ning Ding","Biqing Qi","Ermo Hua","Xingtai Lv","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.12295v2.pdf","comment":"update figures and results on Pythia Series"},{"id":"http://arxiv.org/abs/2410.12018v2","updated":"2024-10-23T15:21:54Z","published":"2024-10-15T19:33:57Z","title":"LocoMotion: Learning Motion-Focused Video-Language Representations","summary":" This paper strives for motion-focused video-language representations.\nExisting methods to learn video-language representations use spatial-focused\ndata, where identifying the objects and scene is often enough to distinguish\nthe relevant caption. 
We instead propose LocoMotion to learn from\nmotion-focused captions that describe the movement and temporal progression of\nlocal object motions. We achieve this by adding synthetic motions to videos and\nusing the parameters of these motions to generate corresponding captions.\nFurthermore, we propose verb-variation paraphrasing to increase the caption\nvariety and learn the link between primitive motions and high-level verbs. With\nthis, we are able to learn a motion-focused video-language representation.\nExperiments demonstrate our approach is effective for a variety of downstream\ntasks, particularly when limited data is available for fine-tuning. Code is\navailable: https://hazeldoughty.github.io/Papers/LocoMotion/\n","authors":["Hazel Doughty","Fida Mohammad Thoker","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2410.12018v2.pdf","comment":"ACCV 2024 Oral"},{"id":"http://arxiv.org/abs/2402.04957v3","updated":"2024-10-23T15:08:57Z","published":"2024-02-07T15:40:22Z","title":"Reconfidencing LLMs from the Grouping Loss Perspective","summary":" Large Language Models (LLMs), including ChatGPT and LLaMA, are susceptible to\ngenerating hallucinated answers in a confident tone. While efforts to elicit\nand calibrate confidence scores have proven useful, recent findings show that\ncontrolling uncertainty must go beyond calibration: predicted scores may\ndeviate significantly from the actual posterior probabilities due to the impact\nof grouping loss. In this work, we construct a new evaluation dataset derived\nfrom a knowledge base to assess confidence scores given to answers of Mistral\nand LLaMA. Experiments show that they tend to be overconfident. Further, we\nshow that they are more overconfident on some answers than others, \\emph{eg}\ndepending on the nationality of the person in the query. In\nuncertainty-quantification theory, this is grouping loss. To address this, we\npropose a solution to reconfidence LLMs, canceling not only calibration but\nalso grouping loss. The LLMs, after the reconfidencing process, indicate\nimproved confidence alignment with the accuracy of their responses.\n","authors":["Lihu Chen","Alexandre Perez-Lebel","Fabian M. Suchanek","Gaël Varoquaux"],"pdf_url":"https://arxiv.org/pdf/2402.04957v3.pdf","comment":"EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2402.01622v4","updated":"2024-10-23T15:02:57Z","published":"2024-02-02T18:39:51Z","title":"TravelPlanner: A Benchmark for Real-World Planning with Language Agents","summary":" Planning has been part of the core pursuit for artificial intelligence since\nits conception, but earlier AI agents mostly focused on constrained settings\nbecause many of the cognitive substrates necessary for human-level planning\nhave been lacking. Recently, language agents powered by large language models\n(LLMs) have shown interesting capabilities such as tool use and reasoning. Are\nthese language agents capable of planning in more complex settings that are out\nof the reach of prior AI agents? To advance this investigation, we propose\nTravelPlanner, a new planning benchmark that focuses on travel planning, a\ncommon real-world planning scenario. It provides a rich sandbox environment,\nvarious tools for accessing nearly four million data records, and 1,225\nmeticulously curated planning intents and reference plans. Comprehensive\nevaluations show that the current language agents are not yet capable of\nhandling such complex planning tasks-even GPT-4 only achieves a success rate of\n0.6%. 
Language agents struggle to stay on task, use the right tools to collect\ninformation, or keep track of multiple constraints. However, we note that the\nmere possibility for language agents to tackle such a complex problem is in\nitself non-trivial progress. TravelPlanner provides a challenging yet\nmeaningful testbed for future language agents.\n","authors":["Jian Xie","Kai Zhang","Jiangjie Chen","Tinghui Zhu","Renze Lou","Yuandong Tian","Yanghua Xiao","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2402.01622v4.pdf","comment":"ICML 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2311.05876v3","updated":"2024-10-23T14:48:20Z","published":"2023-11-10T05:24:04Z","title":"Trends in Integration of Knowledge and Large Language Models: A Survey\n and Taxonomy of Methods, Benchmarks, and Applications","summary":" Large language models (LLMs) exhibit superior performance on various natural\nlanguage tasks, but they are susceptible to issues stemming from outdated data\nand domain-specific limitations. In order to address these challenges,\nresearchers have pursued two primary strategies, knowledge editing and\nretrieval augmentation, to enhance LLMs by incorporating external information\nfrom different aspects. Nevertheless, there is still a notable absence of a\ncomprehensive survey. In this paper, we propose a review to discuss the trends\nin integration of knowledge and large language models, including taxonomy of\nmethods, benchmarks, and applications. In addition, we conduct an in-depth\nanalysis of different methods and point out potential research directions in\nthe future. We hope this survey offers the community quick access and a\ncomprehensive overview of this research area, with the intention of inspiring\nfuture research endeavors.\n","authors":["Zhangyin Feng","Weitao Ma","Weijiang Yu","Lei Huang","Haotian Wang","Qianglong Chen","Weihua Peng","Xiaocheng Feng","Bing Qin","Ting liu"],"pdf_url":"https://arxiv.org/pdf/2311.05876v3.pdf","comment":"Work in progress; 22 pages. This work has been submitted to the IEEE\n for possible publication"},{"id":"http://arxiv.org/abs/2408.01119v2","updated":"2024-10-23T14:37:50Z","published":"2024-08-02T09:00:03Z","title":"Task Prompt Vectors: Effective Initialization through Multi-Task\n Soft-Prompt Transfer","summary":" Prompt tuning is an efficient solution for training large language models\n(LLMs). However, current soft-prompt-based methods often sacrifice multi-task\nmodularity, requiring the training process to be fully or partially repeated\nfor each newly added task. While recent work on task vectors applied arithmetic\noperations on full model weights to achieve the desired multi-task performance,\na similar approach for soft-prompts is still missing. To this end, we introduce\nTask Prompt Vectors, created by element-wise difference between weights of\ntuned soft-prompts and their random initialization. Experimental results on 12\nNLU datasets show that task prompt vectors can be used in low-resource settings\nto effectively initialize prompt tuning on similar tasks. In addition, we show\nthat task prompt vectors are independent of the random initialization of prompt\ntuning on 2 different language model architectures. This allows prompt\narithmetics with the pre-trained vectors from different tasks. 
In this way, we\nprovide a competitive alternative to state-of-the-art baselines by arithmetic\naddition of task prompt vectors from multiple tasks.\n","authors":["Robert Belanec","Simon Ostermann","Ivan Srba","Maria Bielikova"],"pdf_url":"https://arxiv.org/pdf/2408.01119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00279v3","updated":"2024-10-23T14:33:44Z","published":"2023-07-01T09:18:24Z","title":"Let Me Teach You: Pedagogical Foundations of Feedback for Language\n Models","summary":" Natural Language Feedback (NLF) is an increasingly popular mechanism for\naligning Large Language Models (LLMs) to human preferences. Despite the\ndiversity of the information it can convey, NLF methods are often hand-designed\nand arbitrary, with little systematic grounding. At the same time, research in\nlearning sciences has long established several effective feedback models. In\nthis opinion piece, we compile ideas from pedagogy to introduce FELT, a\nfeedback framework for LLMs that outlines various characteristics of the\nfeedback space, and a feedback content taxonomy based on these variables,\nproviding a general mapping of the feedback space. In addition to streamlining\nNLF designs, FELT also brings out new, unexplored directions for research in\nNLF. We make our taxonomy available to the community, providing guides and\nexamples for mapping our categorizations to future research.\n","authors":["Beatriz Borges","Niket Tandon","Tanja Käser","Antoine Bosselut"],"pdf_url":"https://arxiv.org/pdf/2307.00279v3.pdf","comment":"EMNLP 2024; 9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2410.17901v1","updated":"2024-10-23T14:18:25Z","published":"2024-10-23T14:18:25Z","title":"ELAICHI: Enhancing Low-resource TTS by Addressing Infrequent and\n Low-frequency Character Bigrams","summary":" Recent advancements in Text-to-Speech (TTS) technology have led to\nnatural-sounding speech for English, primarily due to the availability of\nlarge-scale, high-quality web data. However, many other languages lack access\nto such resources, relying instead on limited studio-quality data. This\nscarcity results in synthesized speech that often suffers from intelligibility\nissues, particularly with low-frequency character bigrams. In this paper, we\npropose three solutions to address this challenge. First, we leverage\nhigh-quality data from linguistically or geographically related languages to\nimprove TTS for the target language. Second, we utilize low-quality Automatic\nSpeech Recognition (ASR) data recorded in non-studio environments, which is\nrefined using denoising and speech enhancement models. Third, we apply\nknowledge distillation from large-scale models using synthetic data to generate\nmore robust outputs. Our experiments with Hindi demonstrate significant\nreductions in intelligibility issues, as validated by human evaluators. We\npropose this methodology as a viable alternative for languages with limited\naccess to high-quality data, enabling them to collectively benefit from shared\nresources.\n","authors":["Srija Anand","Praveen Srinivasa Varadhan","Mehak Singal","Mitesh M. 
Khapra"],"pdf_url":"https://arxiv.org/pdf/2410.17901v1.pdf","comment":"11 pages, 1 figure, 3 tables"},{"id":"http://arxiv.org/abs/2410.17897v1","updated":"2024-10-23T14:15:07Z","published":"2024-10-23T14:15:07Z","title":"Value Residual Learning For Alleviating Attention Concentration In\n Transformers","summary":" Transformers can capture long-range dependencies using self-attention,\nallowing tokens to attend to all others directly. However, stacking multiple\nattention layers leads to attention concentration. One natural way to address\nthis issue is to use cross-layer attention, allowing information from earlier\nlayers to be directly accessible to later layers. However, this approach is\ncomputationally expensive. To address this problem, we propose Transformer with\nresidual value (ResFormer) which approximates cross-layer attention through\nadding a residual connection from the values of the the first layer to all\nsubsequent layers. Based on this method, one variant is the Transformer with\nsingle layer value (SVFormer), where all layers share the same value embedding\nfrom first layer, reducing the KV cache by nearly 50%. Comprehensive empirical\nevidence demonstrates that ResFormer mitigates attention concentration problem\nin deeper layers and enhances representation across most layers, outperforming\nthe vanilla Transformer, DenseFormer, and NeuTRENO in training error as well as\ndownstream tasks. SVFormer trains significantly faster than the vanilla\nTransformer and performs better than other methods like GQA and CLA, with\nperformance influenced by sequence length and cumulative learning rate.\n","authors":["Zhanchao Zhou","Tianyi Wu","Zhiyun Jiang","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2410.17897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15592v2","updated":"2024-10-23T14:08:10Z","published":"2024-10-21T02:21:56Z","title":"CPE-Pro: A Structure-Sensitive Deep Learning Method for Protein\n Representation and Origin Evaluation","summary":" Protein structures are important for understanding their functions and\ninteractions. Currently, many protein structure prediction methods are\nenriching the structure database. Discriminating the origin of structures is\ncrucial for distinguishing between experimentally resolved and computationally\npredicted structures, evaluating the reliability of prediction methods, and\nguiding downstream biological studies. Building on works in structure\nprediction, We developed a structure-sensitive supervised deep learning model,\nCrystal vs Predicted Evaluator for Protein Structure (CPE-Pro), to represent\nand discriminate the origin of protein structures. CPE-Pro learns the\nstructural information of proteins and captures inter-structural differences to\nachieve accurate traceability on four data classes, and is expected to be\nextended to more. Simultaneously, we utilized Foldseek to encode protein\nstructures into \"structure-sequences\" and trained a protein Structural Sequence\nLanguage Model, SSLM. Preliminary experiments demonstrated that, compared to\nlarge-scale protein language models pre-trained on vast amounts of amino acid\nsequences, the \"structure-sequence\" enables the language model to learn more\ninformative protein features, enhancing and optimizing structural\nrepresentations. 
We have provided the code, model weights, and all related\nmaterials on https://github.com/GouWenrui/CPE-Pro-main.git.\n","authors":["Wenrui Gou","Wenhui Ge","Yang Tan","Mingchen Li","Guisheng Fan","Huiqun Yu"],"pdf_url":"https://arxiv.org/pdf/2410.15592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17891v1","updated":"2024-10-23T14:04:22Z","published":"2024-10-23T14:04:22Z","title":"Scaling Diffusion Language Models via Adaptation from Autoregressive\n Models","summary":" Diffusion Language Models (DLMs) have emerged as a promising new paradigm for\ntext generative modeling, potentially addressing limitations of autoregressive\n(AR) models. However, current DLMs have been studied at a smaller scale\ncompared to their AR counterparts and lack fair comparison on language modeling\nbenchmarks. Additionally, training diffusion models from scratch at scale\nremains challenging. Given the prevalence of open-source AR language models, we\npropose adapting these models to build text diffusion models. We demonstrate\nconnections between AR and diffusion modeling objectives and introduce a simple\ncontinual pre-training approach for training diffusion models. Through\nsystematic evaluation on language modeling, reasoning, and commonsense\nbenchmarks, we show that we can convert AR models ranging from 127M to 7B\nparameters (GPT2 and LLaMA) into diffusion models DiffuGPT and DiffuLLaMA,\nusing less than 200B tokens for training. Our experimental results reveal that\nthese models outperform earlier DLMs and are competitive with their AR\ncounterparts. We release a suite of DLMs (with 127M, 355M, and 7B parameters)\ncapable of generating fluent text, performing in-context learning, filling in\nthe middle without prompt re-ordering, and following instructions\n\\url{https://github.com/HKUNLP/DiffuLLaMA}.\n","authors":["Shansan Gong","Shivam Agarwal","Yizhe Zhang","Jiacheng Ye","Lin Zheng","Mukai Li","Chenxin An","Peilin Zhao","Wei Bi","Jiawei Han","Hao Peng","Lingpeng Kong"],"pdf_url":"https://arxiv.org/pdf/2410.17891v1.pdf","comment":"25 pages. Code: https://github.com/HKUNLP/DiffuLLaMA"},{"id":"http://arxiv.org/abs/2406.14703v2","updated":"2024-10-23T14:01:14Z","published":"2024-06-20T19:50:56Z","title":"Do LLMs Have Distinct and Consistent Personality? TRAIT: Personality\n Testset designed for LLMs with Psychometrics","summary":" Recent advancements in Large Language Models (LLMs) have led to their\nadaptation in various domains as conversational agents. We wonder: can\npersonality tests be applied to these agents to analyze their behavior, similar\nto humans? We introduce TRAIT, a new benchmark consisting of 8K multi-choice\nquestions designed to assess the personality of LLMs. TRAIT is built on two\npsychometrically validated small human questionnaires, Big Five Inventory (BFI)\nand Short Dark Triad (SD-3), enhanced with the ATOMIC-10X knowledge graph to a\nvariety of real-world scenarios. TRAIT also outperforms existing personality\ntests for LLMs in terms of reliability and validity, achieving the highest\nscores across four key metrics: Content Validity, Internal Validity, Refusal\nRate, and Reliability. 
Using TRAIT, we reveal two notable insights into\npersonalities of LLMs: 1) LLMs exhibit distinct and consistent personality,\nwhich is highly influenced by their training data (e.g., data used for\nalignment tuning), and 2) current prompting techniques have limited\neffectiveness in eliciting certain traits, such as high psychopathy or low\nconscientiousness, suggesting the need for further research in this direction.\n","authors":["Seungbeen Lee","Seungwon Lim","Seungju Han","Giyeong Oh","Hyungjoo Chae","Jiwan Chung","Minju Kim","Beong-woo Kwak","Yeonsoo Lee","Dongha Lee","Jinyoung Yeo","Youngjae Yu"],"pdf_url":"https://arxiv.org/pdf/2406.14703v2.pdf","comment":"Preprint; Under review"},{"id":"http://arxiv.org/abs/2410.17886v1","updated":"2024-10-23T14:00:48Z","published":"2024-10-23T14:00:48Z","title":"SpeakGer: A meta-data enriched speech corpus of German state and federal\n parliaments","summary":" The application of natural language processing on political texts as well as\nspeeches has become increasingly relevant in political sciences due to the\nability to analyze large text corpora which cannot be read by a single person.\nBut such text corpora often lack critical meta information, detailing for\ninstance the party, age or constituency of the speaker, that can be used to\nprovide an analysis tailored to more fine-grained research questions. To enable\nresearchers to answer such questions with quantitative approaches such as\nnatural language processing, we provide the SpeakGer data set, consisting of\nGerman parliament debates from all 16 federal states of Germany as well as the\nGerman Bundestag from 1947-2023, split into a total of 10,806,105 speeches.\nThis data set includes rich meta data in form of information on both reactions\nfrom the audience towards the speech as well as information about the speaker's\nparty, their age, their constituency and their party's political alignment,\nwhich enables a deeper analysis. We further provide three exploratory analyses,\ndetailing topic shares of different parties throughout time, a descriptive\nanalysis of the development of the age of an average speaker as well as a\nsentiment analysis of speeches of different parties with regards to the\nCOVID-19 pandemic.\n","authors":["Kai-Robin Lange","Carsten Jentsch"],"pdf_url":"https://arxiv.org/pdf/2410.17886v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.07799v2","updated":"2024-10-23T14:00:40Z","published":"2024-07-10T16:16:02Z","title":"Attribute or Abstain: Large Language Models as Long Document Assistants","summary":" LLMs can help humans working with long documents, but are known to\nhallucinate. Attribution can increase trust in LLM responses: The LLM provides\nevidence that supports its response, which enhances verifiability. Existing\napproaches to attribution have only been evaluated in RAG settings, where the\ninitial retrieval confounds LLM performance. This is crucially different from\nthe long document setting, where retrieval is not needed, but could help. Thus,\na long document specific evaluation of attribution is missing. To fill this\ngap, we present LAB, a benchmark of 6 diverse long document tasks with\nattribution, and experiments with different approaches to attribution on 5 LLMs\nof different sizes.\n We find that citation, i.e. response generation and evidence extraction in\none step, performs best for large and fine-tuned models, while additional\nretrieval can help for small, prompted models. 
We investigate whether the \"Lost\nin the Middle'' phenomenon exists for attribution, but do not find this. We\nalso find that evidence quality can predict response quality on datasets with\nsimple responses, but not so for complex responses, as models struggle with\nproviding evidence for complex claims.\n","authors":["Jan Buchmann","Xiao Liu","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2407.07799v2.pdf","comment":"Accepted at EMNLP 2024. Code and data:\n https://github.com/UKPLab/arxiv2024-attribute-or-abstain"},{"id":"http://arxiv.org/abs/2407.12819v2","updated":"2024-10-23T14:00:36Z","published":"2024-07-01T10:33:46Z","title":"I've Got 99 Problems But FLOPS Ain't One","summary":" Hyperscalers dominate the landscape of large network deployments, yet they\nrarely share data or insights about the challenges they face. In light of this\nsupremacy, what problems can we find to solve in this space? We take an\nunconventional approach to find relevant research directions, starting from\npublic plans to build a $100 billion datacenter for machine learning\napplications. Leveraging the language models scaling laws, we discover what\nworkloads such a datacenter might carry and explore the challenges one may\nencounter in doing so, with a focus on networking research. We conclude that\nbuilding the datacenter and training such models is technically possible, but\nthis requires novel wide-area transports for inter-DC communication, a\nmultipath transport and novel datacenter topologies for intra-datacenter\ncommunication, high speed scale-up networks and transports, outlining a rich\nresearch agenda for the networking community.\n","authors":["Alexandru M. Gherghescu","Vlad-Andrei Bădoiu","Alexandru Agache","Mihai-Valentin Dumitru","Iuliu Vasilescu","Radu Mantu","Costin Raiciu"],"pdf_url":"https://arxiv.org/pdf/2407.12819v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17875v1","updated":"2024-10-23T13:47:05Z","published":"2024-10-23T13:47:05Z","title":"Understanding Layer Significance in LLM Alignment","summary":" Aligning large language models (LLMs) through fine-tuning is essential for\ntailoring them to specific applications. Therefore, understanding what LLMs\nlearn during the alignment process is crucial. Recent studies suggest that\nalignment primarily adjusts a model's presentation style rather than its\nfoundational knowledge, indicating that only certain components of the model\nare significantly impacted. To delve deeper into LLM alignment, we propose to\nidentify which layers within LLMs are most critical to the alignment process,\nthereby uncovering how alignment influences model behavior at a granular level.\nWe propose a novel approach to identify the important layers for LLM alignment\n(ILA). It involves learning a binary mask for each incremental weight matrix in\nthe LoRA algorithm, indicating the significance of each layer. ILA consistently\nidentifies important layers across various alignment datasets, with nearly 90%\noverlap even with substantial dataset differences, highlighting fundamental\npatterns in LLM alignment. 
Experimental results indicate that freezing\nnon-essential layers improves overall model performance, while selectively\ntuning the most critical layers significantly enhances fine-tuning efficiency\nwith minimal performance loss.\n","authors":["Guangyuan Shi","Zexin Lu","Xiaoyu Dong","Wenlong Zhang","Xuanyu Zhang","Yujie Feng","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2410.17875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15055v2","updated":"2024-10-23T13:20:15Z","published":"2024-02-23T02:15:47Z","title":"Interpreting Context Look-ups in Transformers: Investigating\n Attention-MLP Interactions","summary":" Understanding the inner workings of large language models (LLMs) is crucial\nfor advancing their theoretical foundations and real-world applications. While\nthe attention mechanism and multi-layer perceptrons (MLPs) have been studied\nindependently, their interactions remain largely unexplored. This study\ninvestigates how attention heads and next-token neurons interact in LLMs to\npredict new words. We propose a methodology to identify next-token neurons,\nfind prompts that highly activate them, and determine the upstream attention\nheads responsible. We then generate and evaluate explanations for the activity\nof these attention heads in an automated manner. Our findings reveal that some\nattention heads recognize specific contexts relevant to predicting a token and\nactivate a downstream token-predicting neuron accordingly. This mechanism\nprovides a deeper understanding of how attention heads work with MLP neurons to\nperform next-token prediction. Our approach offers a foundation for further\nresearch into the intricate workings of LLMs and their impact on text\ngeneration and understanding.\n","authors":["Clement Neo","Shay B. Cohen","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2402.15055v2.pdf","comment":"Accepted to EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2403.14774v2","updated":"2024-10-23T13:01:14Z","published":"2024-03-21T18:28:43Z","title":"Few-Shot Adversarial Prompt Learning on Vision-Language Models","summary":" The vulnerability of deep neural networks to imperceptible adversarial\nperturbations has attracted widespread attention. Inspired by the success of\nvision-language foundation models, previous efforts achieved zero-shot\nadversarial robustness by aligning adversarial visual features with text\nsupervision. However, in practice, they are still unsatisfactory due to several\nissues, including heavy adaptation cost, suboptimal text supervision, and\nuncontrolled natural generalization capacity. In this paper, to address these\nissues, we propose a few-shot adversarial prompt framework where adapting input\nsequences with limited data makes significant adversarial robustness\nimprovement. Specifically, we achieve this by providing adversarially\ncorrelated text supervision that is end-to-end learned from adversarial\nexamples. We also propose a novel training objective that enhances the\nconsistency of multi-modal features while encourages differentiated uni-modal\nfeatures between natural and adversarial examples. The proposed framework gives\naccess to learn adversarial text supervision, which provides superior\ncross-modal adversarial alignment and matches state-of-the-art zero-shot\nadversarial robustness with only 1% training data. 
Code is available at:\nhttps://github.com/lionel-w2/FAP.\n","authors":["Yiwei Zhou","Xiaobo Xia","Zhiwei Lin","Bo Han","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.14774v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.15956v2","updated":"2024-10-23T13:00:27Z","published":"2024-10-21T12:34:17Z","title":"Do Large Language Models Have an English Accent? Evaluating and\n Improving the Naturalness of Multilingual LLMs","summary":" Current Large Language Models (LLMs) are predominantly designed with English\nas the primary language, and even the few that are multilingual tend to exhibit\nstrong English-centric biases. Much like speakers who might produce awkward\nexpressions when learning a second language, LLMs often generate unnatural\noutputs in non-English languages, reflecting English-centric patterns in both\nvocabulary and grammar. Despite the importance of this issue, the naturalness\nof multilingual LLM outputs has received limited attention. In this paper, we\naddress this gap by introducing novel automatic corpus-level metrics to assess\nthe lexical and syntactic naturalness of LLM outputs in a multilingual context.\nUsing our new metrics, we evaluate state-of-the-art LLMs on a curated benchmark\nin French and Chinese, revealing a tendency towards English-influenced\npatterns. To mitigate this issue, we also propose a simple and effective\nalignment method to improve the naturalness of an LLM in a target language and\ndomain, achieving consistent improvements in naturalness without compromising\nthe performance on general-purpose benchmarks. Our work highlights the\nimportance of developing multilingual metrics, resources and methods for the\nnew wave of multilingual LLMs.\n","authors":["Yanzhu Guo","Simone Conia","Zelin Zhou","Min Li","Saloni Potdar","Henry Xiao"],"pdf_url":"https://arxiv.org/pdf/2410.15956v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16845v2","updated":"2024-10-23T12:53:00Z","published":"2024-06-24T17:49:28Z","title":"RaTEScore: A Metric for Radiology Report Generation","summary":" This paper introduces a novel, entity-aware metric, termed as Radiological\nReport (Text) Evaluation (RaTEScore), to assess the quality of medical reports\ngenerated by AI models. RaTEScore emphasizes crucial medical entities such as\ndiagnostic outcomes and anatomical details, and is robust against complex\nmedical synonyms and sensitive to negation expressions. Technically, we\ndeveloped a comprehensive medical NER dataset, RaTE-NER, and trained an NER\nmodel specifically for this purpose. This model enables the decomposition of\ncomplex radiological reports into constituent medical entities. The metric\nitself is derived by comparing the similarity of entity embeddings, obtained\nfrom a language model, based on their types and relevance to clinical\nsignificance. 
Our evaluations demonstrate that RaTEScore aligns more closely\nwith human preference than existing metrics, validated both on established\npublic benchmarks and our newly proposed RaTE-Eval benchmark.\n","authors":["Weike Zhao","Chaoyi Wu","Xiaoman Zhang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2406.16845v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.06022v2","updated":"2024-10-23T12:49:32Z","published":"2024-10-08T13:23:58Z","title":"Can Language Models Induce Grammatical Knowledge from Indirect Evidence?","summary":" What kinds of and how much data is necessary for language models to induce\ngrammatical knowledge to judge sentence acceptability? Recent language models\nstill have much room for improvement in their data efficiency compared to\nhumans. This paper investigates whether language models efficiently use\nindirect data (indirect evidence), from which they infer sentence\nacceptability. In contrast, humans use indirect evidence efficiently, which is\nconsidered one of the inductive biases contributing to efficient language\nacquisition. To explore this question, we introduce the Wug InDirect Evidence\nTest (WIDET), a dataset consisting of training instances inserted into the\npre-training data and evaluation instances. We inject synthetic instances with\nnewly coined wug words into pretraining data and explore the model's behavior\non evaluation data that assesses grammatical acceptability regarding those\nwords. We prepare the injected instances by varying their levels of\nindirectness and quantity. Our experiments surprisingly show that language\nmodels do not induce grammatical knowledge even after repeated exposure to\ninstances with the same structure but differing only in lexical items from\nevaluation instances in certain language phenomena. Our findings suggest a\npotential direction for future research: developing models that use latent\nindirect evidence to induce grammatical knowledge.\n","authors":["Miyu Oba","Yohei Oseki","Akiyo Fukatsu","Akari Haga","Hiroki Ouchi","Taro Watanabe","Saku Sugawara"],"pdf_url":"https://arxiv.org/pdf/2410.06022v2.pdf","comment":"This paper is accepted at EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2410.17820v1","updated":"2024-10-23T12:26:10Z","published":"2024-10-23T12:26:10Z","title":"Understanding When Tree of Thoughts Succeeds: Larger Models Excel in\n Generation, Not Discrimination","summary":" Tree of Thoughts (ToT) is a reasoning strategy for Large Language Models\n(LLMs) that employs a generator to suggest reasoning steps and a discriminator\nto decide which steps to implement. ToT demonstrates strong performance on\nreasoning tasks, often surpassing simple methods such as Input-Output (IO)\nprompting and Chain-of-Thought (CoT) reasoning. However, ToT does not\nconsistently outperform such simpler methods across all models, leaving large\nknowledge gaps on the conditions under which ToT is most beneficial. In this\npaper, we analyze the roles of the generator and discriminator separately to\nbetter understand the conditions when ToT is beneficial. We find that the\ngenerator plays a more critical role than the discriminator in driving the\nsuccess of ToT. While using even a smaller model as the discriminator, scaling\nthe generator leads to notable improvements in ToT performance, whereas scaling\nthe discriminator with a fixed generator yields only marginal gains. 
Our\nresults show that models across different scales exhibit comparable\ndiscrimination capabilities, yet differ significantly in their generative\nperformance for ToT.\n","authors":["Qiqi Chen","Xinpeng Wang","Philipp Mondorf","Michael A. Hedderich","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2410.17820v1.pdf","comment":"Code: github.com/mainlp/tot-eval"},{"id":"http://arxiv.org/abs/2410.17799v1","updated":"2024-10-23T11:58:58Z","published":"2024-10-23T11:58:58Z","title":"OmniFlatten: An End-to-end GPT Model for Seamless Voice Conversation","summary":" Full-duplex spoken dialogue systems significantly advance over traditional\nturn-based dialogue systems, as they allow simultaneous bidirectional\ncommunication, closely mirroring human-human interactions. However, achieving\nlow latency and natural interactions in full-duplex dialogue systems remains a\nsignificant challenge, especially considering human conversation dynamics such\nas interruptions, backchannels, and overlapping speech. In this paper, we\nintroduce a novel End-to-End GPT-based model OmniFlatten for full-duplex\nconversation, capable of effectively modeling the complex behaviors inherent to\nnatural conversations with low latency. To achieve full-duplex communication\ncapabilities, we propose a multi-stage post-training scheme that progressively\nadapts a text-based large language model (LLM) backbone into a speech-text\ndialogue LLM, capable of generating text and speech in real time, without\nmodifying the architecture of the backbone LLM. The training process comprises\nthree stages: modality alignment, half-duplex dialogue learning, and\nfull-duplex dialogue learning. Throughout all training stages, we standardize\nthe data using a flattening operation, which allows us to unify the training\nmethods and the model architecture across different modalities and tasks. Our\napproach offers a straightforward modeling technique and a promising research\ndirection for developing efficient and natural end-to-end full-duplex spoken\ndialogue systems. Audio samples of dialogues generated by OmniFlatten can be\nfound at this web site (https://omniflatten.github.io/).\n","authors":["Qinglin Zhang","Luyao Cheng","Chong Deng","Qian Chen","Wen Wang","Siqi Zheng","Jiaqing Liu","Hai Yu","Chaohong Tan"],"pdf_url":"https://arxiv.org/pdf/2410.17799v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2406.05804v4","updated":"2024-10-23T11:36:57Z","published":"2024-06-09T14:42:55Z","title":"A Review of Prominent Paradigms for LLM-Based Agents: Tool Use\n (Including RAG), Planning, and Feedback Learning","summary":" Tool use, planning, and feedback learning are currently three prominent\nparadigms for developing Large Language Model (LLM)-based agents across various\ntasks. Although numerous frameworks have been devised for each paradigm, their\nintricate workflows and inconsistent taxonomy create challenges in\nunderstanding and reviewing the frameworks across different paradigms. This\nsurvey introduces a unified taxonomy to systematically review and discuss these\nframeworks. Specifically, 1) the taxonomy defines environments/tasks, common\nLLM-profiled roles or LMPRs (policy models, evaluators, and dynamic models),\nand universally applicable workflows found in prior work, and 2) it enables a\ncomparison of key perspectives on the implementations of LMPRs and workflow\ndesigns across different agent paradigms and frameworks. 
3) Finally, we\nidentify three limitations in existing workflow designs and systematically\ndiscuss the future work.\n","authors":["Xinzhe Li"],"pdf_url":"https://arxiv.org/pdf/2406.05804v4.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2410.17783v1","updated":"2024-10-23T11:32:46Z","published":"2024-10-23T11:32:46Z","title":"Leveraging the Domain Adaptation of Retrieval Augmented Generation\n Models for Question Answering and Reducing Hallucination","summary":" While ongoing advancements in Large Language Models have demonstrated\nremarkable success across various NLP tasks, Retrieval Augmented Generation\nModel stands out to be highly effective on downstream applications like\nQuestion Answering. Recently, RAG-end2end model further optimized the\narchitecture and achieved notable performance improvements on domain\nadaptation. However, the effectiveness of these RAG-based architectures remains\nrelatively unexplored when fine-tuned on specialized domains such as customer\nservice for building a reliable conversational AI system. Furthermore, a\ncritical challenge persists in reducing the occurrence of hallucinations while\nmaintaining high domain-specific accuracy. In this paper, we investigated the\nperformance of diverse RAG and RAG-like architectures through domain adaptation\nand evaluated their ability to generate accurate and relevant response grounded\nin the contextual knowledge base. To facilitate the evaluation of the models,\nwe constructed a novel dataset HotelConvQA, sourced from wide range of\nhotel-related conversations and fine-tuned all the models on our domain\nspecific dataset. We also addressed a critical research gap on determining the\nimpact of domain adaptation on reducing hallucinations across different RAG\narchitectures, an aspect that was not properly measured in prior work. Our\nevaluation shows positive results in all metrics by employing domain\nadaptation, demonstrating strong performance on QA tasks and providing insights\ninto their efficacy in reducing hallucinations. Our findings clearly indicate\nthat domain adaptation not only enhances the models' performance on QA tasks\nbut also significantly reduces hallucination across all evaluated RAG\narchitectures.\n","authors":["Salman Rakin","Md. A. R. Shibly","Zahin M. Hossain","Zeeshan Khan","Md. Mostofa Akbar"],"pdf_url":"https://arxiv.org/pdf/2410.17783v1.pdf","comment":"Initial Version fine-tuned on HotelConvQA"},{"id":"http://arxiv.org/abs/2404.19232v7","updated":"2024-10-23T11:19:02Z","published":"2024-04-30T03:29:30Z","title":"GRAMMAR: Grounded and Modular Methodology for Assessment of\n Closed-Domain Retrieval-Augmented Language Model","summary":" Retrieval-Augmented Generation (RAG) systems are widely used across various\nindustries for querying closed-domain and in-house knowledge bases. However,\nevaluating these systems presents significant challenges due to the private\nnature of closed-domain data and a scarcity of queries with verifiable ground\ntruths. Moreover, there is a lack of analytical methods to diagnose problematic\nmodules and identify types of failure, such as those caused by knowledge\ndeficits or issues with robustness. To address these challenges, we introduce\nGRAMMAR (GRounded And Modular Methodology for Assessment of RAG), an evaluation\nframework comprising a grounded data generation process and an evaluation\nprotocol that effectively pinpoints defective modules. 
Our validation\nexperiments reveal that GRAMMAR provides a reliable approach for identifying\nvulnerable modules and supports hypothesis testing for textual form\nvulnerabilities. An open-source tool accompanying this framework is available\nin our GitHub repository (see https://github.com/xinzhel/grammar), allowing for\neasy reproduction of our results and enabling reliable and modular evaluation\nin closed-domain settings.\n","authors":["Xinzhe Li","Ming Liu","Shang Gao"],"pdf_url":"https://arxiv.org/pdf/2404.19232v7.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2410.16144v2","updated":"2024-10-23T11:17:42Z","published":"2024-10-21T16:14:57Z","title":"1-bit AI Infra: Part 1.1, Fast and Lossless BitNet b1.58 Inference on\n CPUs","summary":" Recent advances in 1-bit Large Language Models (LLMs), such as BitNet and\nBitNet b1.58, present a promising approach to enhancing the efficiency of LLMs\nin terms of speed and energy consumption. These developments also enable local\nLLM deployment across a broad range of devices. In this work, we introduce\nbitnet.cpp, a tailored software stack designed to unlock the full potential of\n1-bit LLMs. Specifically, we develop a set of kernels to support fast and\nlossless inference of ternary BitNet b1.58 LLMs on CPUs. Extensive experiments\ndemonstrate that bitnet.cpp achieves significant speedups, ranging from 2.37x\nto 6.17x on x86 CPUs and from 1.37x to 5.07x on ARM CPUs, across various model\nsizes. The code is available at https://github.com/microsoft/BitNet.\n","authors":["Jinheng Wang","Hansong Zhou","Ting Song","Shaoguang Mao","Shuming Ma","Hongyu Wang","Yan Xia","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2410.16144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17759v1","updated":"2024-10-23T10:50:40Z","published":"2024-10-23T10:50:40Z","title":"Latent Structures of Intertextuality in French Fiction","summary":" Intertextuality is a key concept in literary theory that challenges\ntraditional notions of text, signification or authorship. It views texts as\npart of a vast intertextual network that is constantly evolving and being\nreconfigured. This paper argues that the field of computational literary\nstudies is the ideal place to conduct a study of intertextuality since we have\nnow the ability to systematically compare texts with each others. Specifically,\nwe present a work on a corpus of more than 12.000 French fictions from the\n18th, 19th and early 20th century. We focus on evaluating the underlying roles\nof two literary notions, sub-genres and the literary canon in the framing of\ntextuality. The article attempts to operationalize intertextuality using\nstate-of-the-art contextual language models to encode novels and capture\nfeatures that go beyond simple lexical or thematic approaches. Previous\nresearch (Hughes, 2012) supports the existence of a literary \"style of a time\",\nand our findings further reinforce this concept. Our findings also suggest that\nboth subgenres and canonicity play a significant role in shaping textual\nsimilarities within French fiction. These discoveries point to the importance\nof considering genre and canon as dynamic forces that influence the evolution\nand intertextual connections of literary works within specific historical\ncontexts.\n","authors":["Jean Barré"],"pdf_url":"https://arxiv.org/pdf/2410.17759v1.pdf","comment":"13 pages, 6 figures. 
Computational Humanities Research Conference\n 2024"},{"id":"http://arxiv.org/abs/2410.17739v1","updated":"2024-10-23T10:12:35Z","published":"2024-10-23T10:12:35Z","title":"Local Contrastive Editing of Gender Stereotypes","summary":" Stereotypical bias encoded in language models (LMs) poses a threat to safe\nlanguage technology, yet our understanding of how bias manifests in the\nparameters of LMs remains incomplete. We introduce local contrastive editing\nthat enables the localization and editing of a subset of weights in a target\nmodel in relation to a reference model. We deploy this approach to identify and\nmodify subsets of weights that are associated with gender stereotypes in LMs.\nThrough a series of experiments, we demonstrate that local contrastive editing\ncan precisely localize and control a small subset (< 0.5%) of weights that\nencode gender bias. Our work (i) advances our understanding of how\nstereotypical biases can manifest in the parameter space of LMs and (ii) opens\nup new avenues for developing parameter-efficient strategies for controlling\nmodel properties in a contrastive manner.\n","authors":["Marlene Lutz","Rochelle Choenni","Markus Strohmaier","Anne Lauscher"],"pdf_url":"https://arxiv.org/pdf/2410.17739v1.pdf","comment":"Accepted at EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.17736v1","updated":"2024-10-23T10:11:40Z","published":"2024-10-23T10:11:40Z","title":"MojoBench: Language Modeling and Benchmarks for Mojo","summary":" The recently introduced Mojo programming language (PL) by Modular, has\nreceived significant attention in the scientific community due to its claimed\nsignificant speed boost over Python. Despite advancements in code Large\nLanguage Models (LLMs) across various PLs, Mojo remains unexplored in this\ncontext. To address this gap, we introduce MojoBench, the first framework for\nMojo code generation. MojoBench includes HumanEval-Mojo, a benchmark dataset\ndesigned for evaluating code LLMs on Mojo, and Mojo-Coder, the first LLM\npretrained and finetuned for Mojo code generation, which supports instructions\nin 5 natural languages (NLs). Our results show that Mojo-Coder achieves a\n30-35% performance improvement over leading models like GPT-4o and\nClaude-3.5-Sonnet. Furthermore, we provide insights into LLM behavior with\nunderrepresented and unseen PLs, offering potential strategies for enhancing\nmodel adaptability. MojoBench contributes to our understanding of LLM\ncapabilities and limitations in emerging programming paradigms fostering more\nrobust code generation systems.\n","authors":["Nishat Raihan","Joanna C. S. Santos","Marcos Zampieri"],"pdf_url":"https://arxiv.org/pdf/2410.17736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14622v2","updated":"2024-10-23T10:06:10Z","published":"2024-02-22T15:10:45Z","title":"From Keywords to Structured Summaries: Streamlining Scholarly\n Information Access","summary":" This paper highlights the growing importance of information retrieval (IR)\nengines in the scientific community, addressing the inefficiency of traditional\nkeyword-based search engines due to the rising volume of publications. The\nproposed solution involves structured records, underpinning advanced\ninformation technology (IT) tools, including visualization dashboards, to\nrevolutionize how researchers access and filter articles, replacing the\ntraditional text-heavy approach. 
This vision is exemplified through a proof of\nconcept centered on the \"reproductive number estimate of infectious diseases\"\nresearch theme, using a fine-tuned large language model (LLM) to automate the\ncreation of structured records to populate a backend database that now goes\nbeyond keywords. The result is a next-generation information access system as\nan IR method accessible at https://orkg.org/usecases/r0-estimates.\n","authors":["Mahsa Shamsabadi","Jennifer D'Souza"],"pdf_url":"https://arxiv.org/pdf/2402.14622v2.pdf","comment":"8 pages, 3 figures | Accepted for publication as a poster paper at\n the International Semantic Web Conference (ISWC 2024)"},{"id":"http://arxiv.org/abs/2410.17728v1","updated":"2024-10-23T10:00:23Z","published":"2024-10-23T10:00:23Z","title":"Dialectal and Low Resource Machine Translation for Aromanian","summary":" We present a neural machine translation system that can translate between\nRomanian, English, and Aromanian (an endangered Eastern Romance language); the\nfirst of its kind. BLEU scores range from 17 to 32 depending on the direction\nand genre of the text. Alongside, we release the biggest known\nAromanian-Romanian bilingual corpus, consisting of 79k cleaned sentence pairs.\nAdditional tools such as an agnostic sentence embedder (used for both text\nmining and automatic evaluation) and a diacritics converter are also presented.\nWe publicly release our findings and models. Finally, we describe the\ndeployment of our quantized model at https://arotranslate.com.\n","authors":["Alexandru-Iulius Jerpelea","Alina-Ştefania Rădoi","Sergiu Nisioi"],"pdf_url":"https://arxiv.org/pdf/2410.17728v1.pdf","comment":"16 pages, 3 figures, 6 tables, submitted to COLING 2025"},{"id":"http://arxiv.org/abs/2406.14282v3","updated":"2024-10-23T09:42:59Z","published":"2024-06-20T13:07:38Z","title":"Learning to Plan for Retrieval-Augmented Large Language Models from\n Knowledge Graphs","summary":" Improving the performance of large language models (LLMs) in complex\nquestion-answering (QA) scenarios has always been a research focal point.\nRecent studies have attempted to enhance LLMs' performance by combining\nstep-wise planning with external retrieval. While effective for advanced models\nlike GPT-3.5, smaller LLMs face challenges in decomposing complex questions,\nnecessitating supervised fine-tuning. Previous work has relied on manual\nannotation and knowledge distillation from teacher LLMs, which are\ntime-consuming and not accurate enough. In this paper, we introduce a novel\nframework for enhancing LLMs' planning capabilities by using planning data\nderived from knowledge graphs (KGs). LLMs fine-tuned with this data have\nimproved planning capabilities, better equipping them to handle complex QA\ntasks that involve retrieval. Evaluations on multiple datasets, including our\nnewly proposed benchmark, highlight the effectiveness of our framework and the\nbenefits of KG-derived planning data.\n","authors":["Junjie Wang","Mingyang Chen","Binbin Hu","Dan Yang","Ziqi Liu","Yue Shen","Peng Wei","Zhiqiang Zhang","Jinjie Gu","Jun Zhou","Jeff Z. 
Pan","Wen Zhang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2406.14282v3.pdf","comment":"EMNLP2024 Findings"},{"id":"http://arxiv.org/abs/2410.17714v1","updated":"2024-10-23T09:40:15Z","published":"2024-10-23T09:40:15Z","title":"CogSteer: Cognition-Inspired Selective Layer Intervention for Efficient\n Semantic Steering in Large Language Models","summary":" Despite their impressive capabilities, large language models (LLMs) often\nlack interpretability and can generate toxic content. While using LLMs as\nfoundation models and applying semantic steering methods are widely practiced,\nwe believe that efficient methods should be based on a thorough understanding\nof LLM behavior. To this end, we propose using eye movement measures to\ninterpret LLM behavior across layers. We find that LLMs exhibit patterns\nsimilar to human gaze across layers and different layers function differently.\nInspired by these findings, we introduce a heuristic steering layer selection\nand apply it to layer intervention methods via fine-tuning and inference. Using\nlanguage toxification and detoxification as test beds, we demonstrate that our\nproposed CogSteer methods achieve better results in terms of toxicity scores\nwhile efficiently saving 97% of the computational resources and 60% of the\ntraining time. Our model-agnostic approach can be adopted into various LLMs,\ncontributing to their interpretability and promoting trustworthiness for safe\ndeployment.\n","authors":["Xintong Wang","Jingheng Pan","Longqin Jiang","Liang Ding","Xingshan Li","Chris Biemann"],"pdf_url":"https://arxiv.org/pdf/2410.17714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17711v1","updated":"2024-10-23T09:36:21Z","published":"2024-10-23T09:36:21Z","title":"Beware of Calibration Data for Pruning Large Language Models","summary":" As large language models (LLMs) are widely applied across various fields,\nmodel compression has become increasingly crucial for reducing costs and\nimproving inference efficiency. Post-training pruning is a promising method\nthat does not require resource-intensive iterative training and only needs a\nsmall amount of calibration data to assess the importance of parameters.\nPrevious research has primarily focused on designing advanced pruning methods,\nwhile different calibration data's impact on pruning performance still lacks\nsystematical exploration. We fill this blank and surprisingly observe that the\neffects of calibration data even value more than designing advanced pruning\nstrategies, especially for high sparsity. Our preliminary exploration also\ndiscloses that using calibration data similar to the training data can yield\nbetter performance. As pre-training data is usually inaccessible for advanced\nLLMs, we further provide a self-generating calibration data synthesis strategy\nto construct feasible calibration data. 
We conduct experiments on the recent\nstrong open-source LLMs (e.g., DCLM, and LLaMA-3), and the results show that\nthe proposed method outperforms commonly used calibration data and can\neffectively enhance strong pruning methods (e.g., Wanda, OWL).\n","authors":["Yixin Ji","Yang Xiang","Juntao Li","Qingrong Xia","Ping Li","Xinyu Duan","Zhefeng Wang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.17711v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2410.17694v1","updated":"2024-10-23T09:14:57Z","published":"2024-10-23T09:14:57Z","title":"An Adaptive Framework for Generating Systematic Explanatory Answer in\n Online Q&A Platforms","summary":" Question Answering (QA) systems face challenges in handling complex questions\nthat require multi-domain knowledge synthesis. The naive RAG models, although\neffective in information retrieval, struggle with complex questions that\nrequire comprehensive and in-depth answers. The pioneering task is defined as\nexplanatory answer generation, which entails handling identified challenges\nsuch as the requirement for comprehensive information and logical coherence\nwithin the generated context. To address these issues, we refer to systematic\nthinking theory and propose SynthRAG, an innovative framework designed to\nenhance QA performance. SynthRAG improves on conventional models by employing\nadaptive outlines for dynamic content structuring, generating systematic\ninformation to ensure detailed coverage, and producing customized answers\ntailored to specific user inquiries. This structured approach guarantees\nlogical coherence and thorough integration of information, yielding responses\nthat are both insightful and methodically organized. Empirical evaluations\nunderscore SynthRAG's effectiveness, demonstrating its superiority in handling\ncomplex questions, overcoming the limitations of naive RAG models, and\nsignificantly improving answer quality and depth. Furthermore, an online\ndeployment on the Zhihu platform revealed that SynthRAG's answers achieved\nnotable user engagement, with each response averaging 5.73 upvotes and\nsurpassing the performance of 79.8% of human contributors, highlighting the\npractical relevance and impact of the proposed framework. Our code is available\nat https://github.com/czy1999/SynthRAG .\n","authors":["Ziyang Chen","Xiaobin Wang","Yong Jiang","Jinzhi Liao","Pengjun Xie","Fei Huang","Xiang Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.17694v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2410.17676v1","updated":"2024-10-23T08:49:51Z","published":"2024-10-23T08:49:51Z","title":"Towards a Similarity-adjusted Surprisal Theory","summary":" Surprisal theory posits that the cognitive effort required to comprehend a\nword is determined by its contextual predictability, quantified as surprisal.\nTraditionally, surprisal theory treats words as distinct entities, overlooking\nany potential similarity between them. Giulianelli et al. (2023) address this\nlimitation by introducing information value, a measure of predictability\ndesigned to account for similarities between communicative units. Our work\nleverages Ricotta and Szeidl's (2006) diversity index to extend surprisal into\na metric that we term similarity-adjusted surprisal, exposing a mathematical\nrelationship between surprisal and information value. 
Similarity-adjusted\nsurprisal aligns with information value when considering graded similarities\nand reduces to standard surprisal when words are treated as distinct.\nExperimental results with reading time data indicate that similarity-adjusted\nsurprisal adds predictive power beyond standard surprisal for certain datasets,\nsuggesting it serves as a complementary measure of comprehension effort.\n","authors":["Clara Meister","Mario Giulianelli","Tiago Pimentel"],"pdf_url":"https://arxiv.org/pdf/2410.17676v1.pdf","comment":"EMNLP 2024 main conference proceedings"},{"id":"http://arxiv.org/abs/2410.17670v1","updated":"2024-10-23T08:42:36Z","published":"2024-10-23T08:42:36Z","title":"Quantifying the Risks of Tool-assisted Rephrasing to Linguistic\n Diversity","summary":" Writing assistants and large language models see widespread use in the\ncreation of text content. While their effectiveness for individual users has\nbeen evaluated in the literature, little is known about their proclivity to\nchange language or reduce its richness when adopted by a large user base. In\nthis paper, we take a first step towards quantifying this risk by measuring the\nsemantic and vocabulary change enacted by the use of rephrasing tools on a\nmulti-domain corpus of human-generated text.\n","authors":["Mengying Wang","Andreas Spitz"],"pdf_url":"https://arxiv.org/pdf/2410.17670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15993v4","updated":"2024-10-23T08:33:54Z","published":"2024-04-24T17:10:35Z","title":"Uncertainty Estimation and Quantification for LLMs: A Simple Supervised\n Approach","summary":" In this paper, we study the problem of uncertainty estimation and calibration\nfor LLMs. We begin by formulating the uncertainty estimation problem, a\nrelevant yet underexplored area in existing literature. We then propose a\nsupervised approach that leverages labeled datasets to estimate the uncertainty\nin LLMs' responses. Based on the formulation, we illustrate the difference\nbetween the uncertainty estimation for LLMs and that for standard ML models and\nexplain why the hidden neurons of the LLMs may contain uncertainty information.\nOur designed approach demonstrates the benefits of utilizing hidden activations\nto enhance uncertainty estimation across various tasks and shows robust\ntransferability in out-of-distribution settings. We distinguish the uncertainty\nestimation task from the uncertainty calibration task and show that better\nuncertainty estimation leads to better calibration performance. Furthermore,\nour method is easy to implement and adaptable to different levels of model\naccessibility including black box, grey box, and white box.\n","authors":["Linyu Liu","Yu Pan","Xiaocheng Li","Guanting Chen"],"pdf_url":"https://arxiv.org/pdf/2404.15993v4.pdf","comment":"29 pages, 14 figures"},{"id":"http://arxiv.org/abs/2406.10216v2","updated":"2024-10-23T08:22:44Z","published":"2024-06-14T17:49:59Z","title":"Regularizing Hidden States Enables Learning Generalizable Reward Model\n for LLMs","summary":" Reward models trained on human preference data have been proven to\neffectively align Large Language Models (LLMs) with human intent within the\nframework of reinforcement learning from human feedback (RLHF). However,\ncurrent reward models have limited generalization capabilities to unseen\nprompts and responses, which can lead to an unexpected phenomenon known as\nreward over-optimization, resulting in a decline in actual performance due to\nexcessive optimization of rewards. 
While previous research has advocated for\nconstraining policy optimization, our study introduces a novel approach to\nenhance the reward model's generalization ability against distribution shifts\nby regularizing the hidden states. Specifically, we retain the base model's\nlanguage model head and incorporate a suite of text-generation losses to\npreserve the hidden states' text-generation capabilities, while concurrently\nlearning a reward head behind the same hidden states. Our experimental results\ndemonstrate that the introduced regularization technique markedly improves the\naccuracy of learned reward models across a variety of out-of-distribution (OOD)\ntasks and effectively alleviates the over-optimization issue in RLHF, offering\na more reliable and robust preference learning paradigm.\n","authors":["Rui Yang","Ruomeng Ding","Yong Lin","Huan Zhang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.10216v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.17657v1","updated":"2024-10-23T08:19:18Z","published":"2024-10-23T08:19:18Z","title":"ReflecTool: Towards Reflection-Aware Tool-Augmented Clinical Agents","summary":" Large Language Models (LLMs) have shown promising potential in the medical\ndomain, assisting with tasks like clinical note generation and patient\ncommunication. However, current LLMs are limited to text-based communication,\nhindering their ability to interact with diverse forms of information in\nclinical environments. Although clinical agents succeed in diverse signal\ninteraction, they are oriented to a single clinical scenario and hence fail in\nbroader applications. To evaluate clinical agents holistically, we propose\nClinicalAgent Bench~(CAB), a comprehensive medical agent benchmark consisting\nof 18 tasks across five key realistic clinical dimensions. Building on this, we\nintroduce ReflecTool, a novel framework that excels at utilizing\ndomain-specific tools within two stages. The first optimization stage\nprogressively enlarges a long-term memory by saving successful solving\nprocesses and tool-wise experience of agents in a tiny pre-defined training\nset. In the following inference stage, ReflecTool can search for supportive\nsuccessful demonstrations from already built long-term memory to guide the tool\nselection strategy, and a verifier improves the tool usage according to the\ntool-wise experience with two verification methods--iterative refinement and\ncandidate selection. Extensive experiments on ClinicalAgent Benchmark\ndemonstrate that ReflecTool surpasses pure LLMs by more than 10 points\nand well-established agent-based methods by 3 points, highlighting its\nadaptability and effectiveness in solving complex clinical tasks.\n","authors":["Yusheng Liao","Shuyang Jiang","Yanfeng Wang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2410.17657v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2410.17635v1","updated":"2024-10-23T07:53:29Z","published":"2024-10-23T07:53:29Z","title":"Markov Chain of Thought for Efficient Mathematical Reasoning","summary":" Multi-step Chain of Thought (CoT) benefits from the logical structure of\nthe reasoning steps and task-specific actions, significantly enhancing the\nmathematical reasoning capabilities of large language models. As long CoT\nbecomes prevalent, the number of reasoning steps exceeds manageable token limits and\nleads to higher computational demands. 
Inspired by the fundamental logic of\nhuman cognition, ``derive, then reduce'', we conceptualize the standard\nmulti-step CoT as a novel Markov Chain of Thought (MCoT). In this study, we\nconsider the mathematical reasoning task, defining each reasoning step as text\naccompanied by a Python code snippet. To facilitate a longer reasoning path,\nself-correction is enabled through interactions with the code interpreter. Our\nMCoT aims to compress previous reasoning steps into a simplified question,\nenabling efficient next-step inference without relying on a lengthy KV cache.\nIn our experiments, we curate the \\texttt{MCoTInstruct} dataset, and the\nempirical results indicate that MCoT not only significantly enhances efficiency\nbut also maintains comparable accuracy. While much remains to be explored, this\nwork paves the way for exploring the long CoT reasoning abilities of LLMs.\n","authors":["Wen Yang","Kai Fan","Minpeng Liao"],"pdf_url":"https://arxiv.org/pdf/2410.17635v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2410.17632v1","updated":"2024-10-23T07:48:51Z","published":"2024-10-23T07:48:51Z","title":"LMLPA: Language Model Linguistic Personality Assessment","summary":" Large Language Models (LLMs) are increasingly used in everyday life and\nresearch. One of the most common use cases is conversational interactions,\nenabled by the language generation capabilities of LLMs. Just as between two\nhumans, a conversation between an LLM-powered entity and a human depends on the\npersonality of the conversants. However, measuring the personality of a given\nLLM is currently a challenge. This paper introduces the Language Model\nLinguistic Personality Assessment (LMLPA), a system designed to evaluate the\nlinguistic personalities of LLMs. Our system helps to understand LLMs' language\ngeneration capabilities by quantitatively assessing the distinct personality\ntraits reflected in their linguistic outputs. Unlike traditional human-centric\npsychometrics, the LMLPA adapts a personality assessment questionnaire,\nspecifically the Big Five Inventory, to align with the operational capabilities\nof LLMs, and also incorporates the findings from previous language-based\npersonality measurement literature. To mitigate sensitivity to the order of\noptions, our questionnaire is designed to be open-ended, resulting in textual\nanswers. Thus, the AI rater is needed to transform ambiguous personality\ninformation from text responses into clear numerical indicators of personality\ntraits. Utilising Principal Component Analysis and reliability validations, our\nfindings demonstrate that LLMs possess distinct personality traits that can be\neffectively quantified by the LMLPA. This research contributes to\nHuman-Computer Interaction and Human-Centered AI, providing a robust framework\nfor future studies to refine AI personality assessments and expand their\napplications in multiple areas, including education and manufacturing.\n","authors":["Jingyao Zheng","Xian Wang","Simo Hosio","Xiaoxian Xu","Lik-Hang Lee"],"pdf_url":"https://arxiv.org/pdf/2410.17632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03622v3","updated":"2024-10-23T07:20:26Z","published":"2024-04-04T17:45:08Z","title":"Mind's Eye of LLMs: Visualization-of-Thought Elicits Spatial Reasoning\n in Large Language Models","summary":" Large language models (LLMs) have exhibited impressive performance in\nlanguage comprehension and various reasoning tasks. 
However, their abilities in\nspatial reasoning, a crucial aspect of human cognition, remain relatively\nunexplored. Human possess a remarkable ability to create mental images of\nunseen objects and actions through a process known as the Mind's Eye, enabling\nthe imagination of the unseen world. Inspired by this cognitive capacity, we\npropose Visualization-of-Thought (VoT) prompting. VoT aims to elicit spatial\nreasoning of LLMs by visualizing their reasoning traces, thereby guiding\nsubsequent reasoning steps. We employed VoT for multi-hop spatial reasoning\ntasks, including natural language navigation, visual navigation, and visual\ntiling in 2D grid worlds. Experimental results demonstrated that VoT\nsignificantly enhances the spatial reasoning abilities of LLMs. Notably, VoT\noutperformed existing multimodal large language models (MLLMs) in these tasks.\nWhile VoT works surprisingly well on LLMs, the ability to generate mental\nimages to facilitate spatial reasoning resembles the mind's eye process,\nsuggesting its potential viability in MLLMs. Please find the dataset and codes\nat https://microsoft.github.io/visualization-of-thought\n","authors":["Wenshan Wu","Shaoguang Mao","Yadong Zhang","Yan Xia","Li Dong","Lei Cui","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2404.03622v3.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024)"},{"id":"http://arxiv.org/abs/2305.12987v3","updated":"2024-10-23T07:13:49Z","published":"2023-05-22T12:47:48Z","title":"GPT-SW3: An Autoregressive Language Model for the Nordic Languages","summary":" This paper details the process of developing the first native large\ngenerative language model for the Nordic languages, GPT-SW3. We cover all parts\nof the development process, from data collection and processing, training\nconfiguration and instruction finetuning, to evaluation and considerations for\nrelease strategies. We hope that this paper can serve as a guide and reference\nfor other researchers that undertake the development of large generative models\nfor smaller languages.\n","authors":["Ariel Ekgren","Amaru Cuba Gyllensten","Felix Stollenwerk","Joey Öhman","Tim Isbister","Evangelia Gogoulou","Fredrik Carlsson","Alice Heiman","Judit Casademont","Magnus Sahlgren"],"pdf_url":"https://arxiv.org/pdf/2305.12987v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17195v2","updated":"2024-10-23T07:02:09Z","published":"2024-10-22T17:13:38Z","title":"Non-myopic Generation of Language Model for Reasoning and Planning","summary":" Large Language Models have demonstrated remarkable abilities in reasoning and\nplanning by breaking down complex problems into sequential steps. Despite their\nsuccess in various domains like mathematical problem-solving and coding, LLMs\nface challenges in ensuring reliable and optimal planning due to their inherent\nmyopic nature of autoregressive decoding. This paper revisits LLM reasoning\nfrom an optimal-control perspective, proposing a novel method,\nPredictive-Decoding, that leverages Model Predictive Control to enhance\nplanning accuracy. By re-weighting LLM distributions based on foresight\ntrajectories, Predictive-Decoding aims to mitigate early errors and promote\nnon-myopic planning. Our experiments show significant improvements in a wide\nrange of tasks for math, coding, and agents. Furthermore, Predictive-Decoding\ndemonstrates computational efficiency, outperforming search baselines with\nreduced computational resources. 
This study provides insights into optimizing\nLLM planning capabilities.\n","authors":["Chang Ma","Haiteng Zhao","Junlei Zhang","Junxian He","Lingpeng Kong"],"pdf_url":"https://arxiv.org/pdf/2410.17195v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17600v1","updated":"2024-10-23T06:54:03Z","published":"2024-10-23T06:54:03Z","title":"Graphusion: A RAG Framework for Knowledge Graph Construction with a\n Global Perspective","summary":" Knowledge Graphs (KGs) are crucial in the field of artificial intelligence\nand are widely used in downstream tasks, such as question-answering (QA). The\nconstruction of KGs typically requires significant effort from domain experts.\nLarge Language Models (LLMs) have recently been used for Knowledge Graph\nConstruction (KGC). However, most existing approaches focus on a local\nperspective, extracting knowledge triplets from individual sentences or\ndocuments, missing a fusion process to combine the knowledge in a global KG.\nThis work introduces Graphusion, a zero-shot KGC framework from free text. It\ncontains three steps: in Step 1, we extract a list of seed entities using topic\nmodeling to guide the final KG includes the most relevant entities; in Step 2,\nwe conduct candidate triplet extraction using LLMs; in Step 3, we design the\nnovel fusion module that provides a global view of the extracted knowledge,\nincorporating entity merging, conflict resolution, and novel triplet discovery.\nResults show that Graphusion achieves scores of 2.92 and 2.37 out of 3 for\nentity extraction and relation recognition, respectively. Moreover, we showcase\nhow Graphusion could be applied to the Natural Language Processing (NLP) domain\nand validate it in an educational scenario. Specifically, we introduce TutorQA,\na new expert-verified benchmark for QA, comprising six tasks and a total of\n1,200 QA pairs. Using the Graphusion-constructed KG, we achieve a significant\nimprovement on the benchmark, for example, a 9.2% accuracy improvement on\nsub-graph completion.\n","authors":["Rui Yang","Boming Yang","Aosong Feng","Sixun Ouyang","Moritz Blum","Tianwei She","Yuang Jiang","Freddy Lecue","Jinghui Lu","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2410.17600v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2407.10794"},{"id":"http://arxiv.org/abs/2410.17599v1","updated":"2024-10-23T06:52:09Z","published":"2024-10-23T06:52:09Z","title":"Cross-model Control: Improving Multiple Large Language Models in\n One-time Training","summary":" The number of large language models (LLMs) with varying parameter scales and\nvocabularies is increasing. While they deliver powerful performance, they also\nface a set of common optimization needs to meet specific requirements or\nstandards, such as instruction following or avoiding the output of sensitive\ninformation from the real world. However, how to reuse the fine-tuning outcomes\nof one model to other models to reduce training costs remains a challenge. To\nbridge this gap, we introduce Cross-model Control (CMC), a method that improves\nmultiple LLMs in one-time training with a portable tiny language model.\nSpecifically, we have observed that the logit shift before and after\nfine-tuning is remarkably similar across different models. Based on this\ninsight, we incorporate a tiny language model with a minimal number of\nparameters. By training alongside a frozen template LLM, the tiny model gains\nthe capability to alter the logits output by the LLMs. 
To make this tiny\nlanguage model applicable to models with different vocabularies, we propose a\nnovel token mapping strategy named PM-MinED. We have conducted extensive\nexperiments on instruction tuning and unlearning tasks, demonstrating the\neffectiveness of CMC. Our code is available at https://github.com/wujwyi/CMC.\n","authors":["Jiayi Wu","Hao Sun","Hengyi Cai","Lixin Su","Shuaiqiang Wang","Dawei Yin","Xiang Li","Ming Gao"],"pdf_url":"https://arxiv.org/pdf/2410.17599v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2402.12617v2","updated":"2024-10-23T06:28:19Z","published":"2024-02-20T00:51:05Z","title":"Generative AI Security: Challenges and Countermeasures","summary":" Generative AI's expanding footprint across numerous industries has led to\nboth excitement and increased scrutiny. This paper delves into the unique\nsecurity challenges posed by Generative AI, and outlines potential research\ndirections for managing these risks.\n","authors":["Banghua Zhu","Norman Mu","Jiantao Jiao","David Wagner"],"pdf_url":"https://arxiv.org/pdf/2402.12617v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15573v2","updated":"2024-10-23T06:21:09Z","published":"2024-10-21T01:36:42Z","title":"OpenMU: Your Swiss Army Knife for Music Understanding","summary":" We present OpenMU-Bench, a large-scale benchmark suite for addressing the\ndata scarcity issue in training multimodal language models to understand music.\nTo construct OpenMU-Bench, we leveraged existing datasets and bootstrapped new\nannotations. OpenMU-Bench also broadens the scope of music understanding by\nincluding lyrics understanding and music tool usage. Using OpenMU-Bench, we\ntrained our music understanding model, OpenMU, with extensive ablations,\ndemonstrating that OpenMU outperforms baseline models such as MU-Llama. Both\nOpenMU and OpenMU-Bench are open-sourced to facilitate future research in music\nunderstanding and to enhance creative music production efficiency.\n","authors":["Mengjie Zhao","Zhi Zhong","Zhuoyuan Mao","Shiqi Yang","Wei-Hsiang Liao","Shusuke Takahashi","Hiromi Wakaki","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2410.15573v2.pdf","comment":"Resources: https://github.com/mzhaojp22/openmu"},{"id":"http://arxiv.org/abs/2410.17578v1","updated":"2024-10-23T06:04:55Z","published":"2024-10-23T06:04:55Z","title":"MM-Eval: A Multilingual Meta-Evaluation Benchmark for LLM-as-a-Judge and\n Reward Models","summary":" Large language models (LLMs) are commonly used as evaluators in tasks (e.g.,\nreward modeling, LLM-as-a-judge), where they act as proxies for human\npreferences or judgments. This leads to the need for meta-evaluation:\nevaluating the credibility of LLMs as evaluators. However, existing benchmarks\nprimarily focus on English, offering limited insight into LLMs' effectiveness\nas evaluators in non-English contexts. To address this, we introduce MM-Eval, a\nmultilingual meta-evaluation benchmark that covers 18 languages across six\ncategories. MM-Eval evaluates various dimensions, including language-specific\nchallenges like linguistics and language hallucinations. Evaluation results\nshow that both proprietary and open-source language models have considerable\nroom for improvement. Further analysis reveals a tendency for these models to\nassign middle-ground scores to low-resource languages. 
We publicly release our\nbenchmark and code.\n","authors":["Guijin Son","Dongkeun Yoon","Juyoung Suk","Javier Aula-Blasco","Mano Aslan","Vu Trong Kim","Shayekh Bin Islam","Jaume Prats-Cristià","Lucía Tormo-Bañuelos","Seungone Kim"],"pdf_url":"https://arxiv.org/pdf/2410.17578v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2410.17566v1","updated":"2024-10-23T05:19:51Z","published":"2024-10-23T05:19:51Z","title":"Differentially Private Learning Needs Better Model Initialization and\n Self-Distillation","summary":" Differentially private SGD (DPSGD) enables privacy-preserving training of\nlanguage models, but often reduces utility, diversity, and linguistic quality.\nWe introduce DPRefine, a three-phase method that initializes a model using data\nsynthesis from a small pre-trained LM with rigorous filtering, applies DP\nfinetuning on private data, and performs self-distillation to refine outputs.\nThis approach significantly outperforms vanilla DPSGD, with AlpacaEval\npreferring DPRefine's generations in 78.4% of cases across all datasets. Our\nanalysis reveals that DPRefine reduces linguistic errors in generated text by\n84.0%, mitigating grammar and spelling errors, commonly associated with DPSGD.\nIt also reduces inconsistencies of non-private models, such as hallucinated\ndetails and misattributed quotes. We find that small models like GPT-2 can be\neffective for initialization and distillation, highlighting their potential in\nenabling scalable and efficient deployment of privacy-preserving language.\n","authors":["Ivoline C. Ngong","Joseph P. Near","Niloofar Mireshghallah"],"pdf_url":"https://arxiv.org/pdf/2410.17566v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2402.14146v2","updated":"2024-10-23T04:39:25Z","published":"2024-02-21T22:02:37Z","title":"Reinforcement Learning with Dynamic Multi-Reward Weighting for\n Multi-Style Controllable Generation","summary":" Textual style expresses a diverse set of information, including interpersonal\ndynamics (e.g., formality) and the author's emotions or attitudes (e.g.,\ndisgust). An open question is how language models can be explicitly controlled\nso that they weave together target styles when generating text: for example, to\nproduce text that is both negative and non-toxic. One approach to such\ncontrolled generation is multi-objective reinforcement learning (RL), but how\nbest to combine multiple objectives in a reward function is an open question.\nIn this paper, we investigate various formulations of multi-style rewards,\nincluding calibrated outputs from discriminators and dynamic weighting by\ndiscriminator gradient magnitudes. We find that our proposed dynamic weighting\noutperforms static weighting approaches with respect to style control while\nmaintaining linguistic quality, and we explore its effectiveness in 2- and\n3-style control.\n","authors":["Karin de Langis","Ryan Koo","Dongyeop Kang"],"pdf_url":"https://arxiv.org/pdf/2402.14146v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17552v1","updated":"2024-10-23T04:34:49Z","published":"2024-10-23T04:34:49Z","title":"ESpeW: Robust Copyright Protection for LLM-based EaaS via\n Embedding-Specific Watermark","summary":" Embeddings as a Service (EaaS) is emerging as a crucial role in AI\napplications. 
Unfortunately, EaaS is vulnerable to model extraction attacks,\nhighlighting the urgent need for copyright protection. Although some preliminary\nworks propose applying embedding watermarks to protect EaaS, recent research\nreveals that these watermarks can be easily removed. Hence, it is crucial to\ninject robust watermarks resistant to watermark removal attacks. Existing\nwatermarking methods typically inject a target embedding into embeddings\nthrough linear interpolation when the text contains triggers. However, this\nmechanism results in each watermarked embedding having the same component,\nwhich makes the watermark easy to identify and eliminate. Motivated by this, in\nthis paper, we propose a novel embedding-specific watermarking (ESpeW)\nmechanism to offer robust copyright protection for EaaS. Our approach involves\ninjecting unique, yet readily identifiable watermarks into each embedding.\nWatermarks inserted by ESpeW are designed to maintain a significant distance\nfrom one another and to avoid sharing common components, thus making it\nsignificantly more challenging to remove the watermarks. Extensive experiments\non four popular datasets demonstrate that ESpeW can even watermark successfully\nagainst a highly aggressive removal strategy without sacrificing the quality of\nembeddings.\n","authors":["Zongqi Wang","Baoyuan Wu","Jingyuan Deng","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2410.17552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17546v1","updated":"2024-10-23T03:53:46Z","published":"2024-10-23T03:53:46Z","title":"ProtoLens: Advancing Prototype Learning for Fine-Grained\n Interpretability in Text Classification","summary":" Deep neural networks have achieved remarkable performance in various\ntext-based tasks but often lack interpretability, making them less suitable for\napplications where transparency is critical. To address this, we propose\nProtoLens, a novel prototype-based model that provides fine-grained,\nsub-sentence level interpretability for text classification. ProtoLens uses a\nPrototype-aware Span Extraction module to identify relevant text spans\nassociated with learned prototypes and a Prototype Alignment mechanism to\nensure prototypes are semantically meaningful throughout training. By aligning\nthe prototype embeddings with human-understandable examples, ProtoLens provides\ninterpretable predictions while maintaining competitive accuracy. Extensive\nexperiments demonstrate that ProtoLens outperforms both prototype-based and\nnon-interpretable baselines on multiple text classification benchmarks. Code\nand data are available at\n\\url{https://anonymous.4open.science/r/ProtoLens-CE0B/}.\n","authors":["Bowen Wei","Ziwei Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.17546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16638v2","updated":"2024-10-23T03:41:49Z","published":"2024-10-22T02:27:57Z","title":"LLMScan: Causal Scan for LLM Misbehavior Detection","summary":" Despite the success of Large Language Models (LLMs) across various fields,\ntheir potential to generate untruthful, biased and harmful responses poses\nsignificant risks, particularly in critical applications. This highlights the\nurgent need for systematic methods to detect and prevent such misbehavior.\nWhile existing approaches target specific issues such as harmful responses,\nthis work introduces LLMScan, an innovative LLM monitoring technique based on\ncausality analysis, offering a comprehensive solution. 
LLMScan systematically\nmonitors the inner workings of an LLM through the lens of causal inference,\noperating on the premise that the LLM's `brain' behaves differently when\nmisbehaving. By analyzing the causal contributions of the LLM's input tokens\nand transformer layers, LLMScan effectively detects misbehavior. Extensive\nexperiments across various tasks and models reveal clear distinctions in the\ncausal distributions between normal behavior and misbehavior, enabling the\ndevelopment of accurate, lightweight detectors for a variety of misbehavior\ndetection tasks.\n","authors":["Mengdi Zhang","Kai Kiat Goh","Peixin Zhang","Jun Sun"],"pdf_url":"https://arxiv.org/pdf/2410.16638v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17532v1","updated":"2024-10-23T03:19:15Z","published":"2024-10-23T03:19:15Z","title":"Responsible Multilingual Large Language Models: A Survey of Development,\n Applications, and Societal Impact","summary":" Multilingual Large Language Models (MLLMs) represent a pivotal advancement in\ndemocratizing artificial intelligence across linguistic boundaries. While\ntheoretical foundations are well-established, practical implementation\nguidelines remain scattered. This work bridges this gap by providing a\ncomprehensive end-to-end framework for developing and deploying MLLMs in\nproduction environments. We make three distinctive contributions: First, we\npresent an actionable pipeline from data pre-processing through deployment,\nintegrating insights from academic research and industrial applications.\nSecond, using Llama2 as a case study, we provide detailed optimization\nstrategies for enhancing multilingual capabilities, including curriculum\nlearning approaches for balancing high-resource and low-resource languages,\ntokenization strategies, and effective sampling methods. Third, we offer an\ninterdisciplinary analysis that considers technical, linguistic, and cultural\nperspectives in MLLM development. Our findings reveal critical challenges in\nsupporting linguistic diversity, with 88.38% of world languages categorized as\nlow-resource, affecting over a billion speakers. We examine practical solutions\nthrough real-world applications in customer service, search engines, and\nmachine translation. By synthesizing theoretical frameworks with\nproduction-ready implementation strategies, this survey provides essential\nguidance for practitioners and researchers working to develop more inclusive\nand effective multilingual AI systems.\n","authors":["Junhua Liu","Bin Fu"],"pdf_url":"https://arxiv.org/pdf/2410.17532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17529v1","updated":"2024-10-23T03:14:07Z","published":"2024-10-23T03:14:07Z","title":"Navigate Complex Physical Worlds via Geometrically Constrained LLM","summary":" This study investigates the potential of Large Language Models (LLMs) for\nreconstructing and constructing the physical world solely based on textual\nknowledge. It explores the impact of model performance on spatial understanding\nabilities. To enhance the comprehension of geometric and spatial relationships\nin the complex physical world, the study introduces a set of geometric\nconventions and develops a workflow based on multi-layer graphs and multi-agent\nsystem frameworks. It examines how LLMs achieve multi-step and multi-objective\ngeometric inference in a spatial environment using multi-layer graphs under\nunified geometric conventions. 
Additionally, the study employs a genetic\nalgorithm, inspired by large-scale model knowledge, to solve geometric\nconstraint problems. In summary, this work innovatively explores the\nfeasibility of using text-based LLMs as physical world builders and designs a\nworkflow to enhance their capabilities.\n","authors":["Yongqiang Huang","Wentao Ye","Liyao Li","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.17529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14687v2","updated":"2024-10-23T03:05:37Z","published":"2024-10-03T14:17:43Z","title":"BrainTransformers: SNN-LLM","summary":" This study introduces BrainTransformers, an innovative Large Language Model\n(LLM) implemented using Spiking Neural Networks (SNN). Our key contributions\ninclude: (1) designing SNN-compatible Transformer components such as SNNMatmul,\nSNNSoftmax, and SNNSiLU; (2) implementing an SNN approximation of the SiLU\nactivation function; and (3) developing a Synapsis module to simulate synaptic\nplasticity. Our 3-billion parameter model, BrainTransformers-3B-Chat,\ndemonstrates competitive performance across various benchmarks, including MMLU\n(63.2), BBH (54.1), ARC-C (54.3), and GSM8K (76.3), while potentially offering\nimproved energy efficiency and biological plausibility. The model employs a\nthree-stage training approach, including SNN-specific neuronal synaptic\nplasticity training. This research opens new avenues for brain-like AI systems\nin natural language processing and neuromorphic computing. Future work will\nfocus on hardware optimization, developing specialized SNN fine-tuning tools,\nand exploring practical applications in energy-efficient computing\nenvironments.\n","authors":["Zhengzheng Tang","Eva Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.14687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11303v2","updated":"2024-10-23T03:00:41Z","published":"2024-10-15T05:54:17Z","title":"TSDS: Data Selection for Task-Specific Model Finetuning","summary":" Finetuning foundation models for specific tasks is an emerging paradigm in\nmodern machine learning. The efficacy of task-specific finetuning largely\ndepends on the selection of appropriate training data. We present TSDS\n(Task-Specific Data Selection), a framework to select data for task-specific\nmodel finetuning, guided by a small but representative set of examples from the\ntarget task. To do so, we formulate data selection for task-specific finetuning\nas an optimization problem with a distribution alignment loss based on optimal\ntransport to capture the discrepancy between the selected data and the target\ndistribution. In addition, we add a regularizer to encourage the diversity of\nthe selected data and incorporate kernel density estimation into the\nregularizer to reduce the negative effects of near-duplicates among the\ncandidate data. We connect our optimization problem to nearest neighbor search\nand design efficient algorithms to compute the optimal solution based on\napproximate nearest neighbor search techniques. We evaluate our method on data\nselection for both continued pretraining and instruction tuning of language\nmodels. 
We show that instruction tuning using data selected by our method with\na 1% selection ratio often outperforms using the full dataset and beats the\nbaseline selection methods by 1.5 points in F1 score on average.\n","authors":["Zifan Liu","Amin Karbasi","Theodoros Rekatsinas"],"pdf_url":"https://arxiv.org/pdf/2410.11303v2.pdf","comment":"31 pages, 1 figure"},{"id":"http://arxiv.org/abs/2406.18925v3","updated":"2024-10-23T02:57:31Z","published":"2024-06-27T06:32:56Z","title":"Selective Vision is the Challenge for Visual Reasoning: A Benchmark for\n Visual Argument Understanding","summary":" Visual arguments, often used in advertising or social causes, rely on images\nto persuade viewers to do or believe something. Understanding these arguments\nrequires selective vision: only specific visual stimuli within an image are\nrelevant to the argument, and relevance can only be understood within the\ncontext of a broader argumentative structure. While visual arguments are\nreadily appreciated by human audiences, we ask: are today's AI capable of\nsimilar understanding? We present VisArgs, a dataset of 1,611 images annotated\nwith 5,112 visual premises (with regions), 5,574 commonsense premises, and\nreasoning trees connecting them into structured arguments. We propose three\ntasks for evaluating visual argument understanding: premise localization,\npremise identification, and conclusion deduction. Experiments show that 1)\nmachines struggle to capture visual cues: GPT-4-O achieved 78.5% accuracy,\nwhile humans reached 98.0%. Models also performed 19.5% worse when\ndistinguishing between irrelevant objects within the image compared to external\nobjects. 2) Providing relevant visual premises improved model performance\nsignificantly.\n","authors":["Jiwan Chung","Sungjae Lee","Minseo Kim","Seungju Han","Ashkan Yousefpour","Jack Hessel","Youngjae Yu"],"pdf_url":"https://arxiv.org/pdf/2406.18925v3.pdf","comment":"12 pages, 6 figures. Accepted as main paper in EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.17520v1","updated":"2024-10-23T02:51:43Z","published":"2024-10-23T02:51:43Z","title":"MobileSafetyBench: Evaluating Safety of Autonomous Agents in Mobile\n Device Control","summary":" Autonomous agents powered by large language models (LLMs) show promising\npotential in assistive tasks across various domains, including mobile device\ncontrol. As these agents interact directly with personal information and device\nsettings, ensuring their safe and reliable behavior is crucial to prevent\nundesirable outcomes. However, no benchmark exists for standardized evaluation\nof the safety of mobile device-control agents. In this work, we introduce\nMobileSafetyBench, a benchmark designed to evaluate the safety of\ndevice-control agents within a realistic mobile environment based on Android\nemulators. We develop a diverse set of tasks involving interactions with\nvarious mobile applications, including messaging and banking applications. To\nclearly evaluate safety apart from general capabilities, we design separate\ntasks measuring safety and tasks evaluating helpfulness. The safety tasks\nchallenge agents with managing potential risks prevalent in daily life and\ninclude tests to evaluate robustness against indirect prompt injections. Our\nexperiments demonstrate that while baseline agents, based on state-of-the-art\nLLMs, perform well in executing helpful tasks, they show poor performance in\nsafety tasks. 
To mitigate these safety concerns, we propose a prompting method\nthat encourages agents to prioritize safety considerations. While this method\nshows promise in promoting safer behaviors, there is still considerable room\nfor improvement to fully earn user trust. This highlights the urgent need for\ncontinued research to develop more robust safety mechanisms in mobile\nenvironments. We open-source our benchmark at:\nhttps://mobilesafetybench.github.io/.\n","authors":["Juyong Lee","Dongyoon Hahm","June Suk Choi","W. Bradley Knox","Kimin Lee"],"pdf_url":"https://arxiv.org/pdf/2410.17520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17519v1","updated":"2024-10-23T02:51:33Z","published":"2024-10-23T02:51:33Z","title":"Large Language Models Still Exhibit Bias in Long Text","summary":" Existing fairness benchmarks for large language models (LLMs) primarily focus\non simple tasks, such as multiple-choice questions, overlooking biases that may\narise in more complex scenarios like long-text generation. To address this gap,\nwe introduce the Long Text Fairness Test (LTF-TEST), a framework that evaluates\nbiases in LLMs through essay-style prompts. LTF-TEST covers 14 topics and 10\ndemographic axes, including gender and race, resulting in 11,948 samples. By\nassessing both model responses and the reasoning behind them, LTF-TEST uncovers\nsubtle biases that are difficult to detect in simple responses. In our\nevaluation of five recent LLMs, including GPT-4o and LLaMa3, we identify two\nkey patterns of bias. First, these models frequently favor certain demographic\ngroups in their responses. Second, they show excessive sensitivity toward\ntraditionally disadvantaged groups, often providing overly protective responses\nwhile neglecting others. To mitigate these biases, we propose FT-REGARD, a\nfinetuning approach that pairs biased prompts with neutral responses. FT-REGARD\nreduces gender bias by 34.6% and improves performance by 1.4 percentage points\non the BBQ benchmark, offering a promising approach to addressing biases in\nlong-text generation tasks.\n","authors":["Wonje Jeung","Dongjae Jeon","Ashkan Yousefpour","Jonghyun Choi"],"pdf_url":"https://arxiv.org/pdf/2410.17519v1.pdf","comment":"22 page, 38 figures, Neurips (SoLaR Workshop)"},{"id":"http://arxiv.org/abs/2410.13334v2","updated":"2024-10-23T02:15:52Z","published":"2024-10-17T08:46:09Z","title":"Do LLMs Have Political Correctness? Analyzing Ethical Biases and\n Jailbreak Vulnerabilities in AI Systems","summary":" Although large language models (LLMs) demonstrate impressive proficiency in\nvarious tasks, they present potential safety risks, such as `jailbreaks', where\nmalicious inputs can coerce LLMs into generating harmful content. To address\nthese issues, many LLM developers have implemented various safety measures to\nalign these models. This alignment involves several techniques, including data\nfiltering during pre-training, supervised fine-tuning, reinforcement learning\nfrom human feedback, and red-teaming exercises. These methods often introduce\ndeliberate and intentional biases similar to Political Correctness (PC) to\nensure the ethical behavior of LLMs. In this paper, we delve into the\nintentional biases injected into LLMs for safety purposes and examine methods\nto circumvent these safety alignment techniques. 
Notably, these intentional\nbiases result in a jailbreaking success rate in GPT-4o models that differs by\n20% between non-binary and cisgender keywords and by 16% between white and\nblack keywords, even when the other parts of the prompts are identical. We\nintroduce the concept of PCJailbreak, highlighting the inherent risks posed by\nthese safety-induced biases. Additionally, we propose an efficient defense\nmethod PCDefense, which prevents jailbreak attempts by injecting defense\nprompts prior to generation. PCDefense stands as an appealing alternative to\nGuard Models, such as Llama-Guard, that require additional inference cost after\ntext generation. Our findings emphasize the urgent need for LLM developers to\nadopt a more responsible approach when designing and implementing safety\nmeasures.\n","authors":["Isack Lee","Haebin Seong"],"pdf_url":"https://arxiv.org/pdf/2410.13334v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16451v2","updated":"2024-10-23T01:56:15Z","published":"2024-10-21T19:25:31Z","title":"Susu Box or Piggy Bank: Assessing Cultural Commonsense Knowledge between\n Ghana and the U.S","summary":" Recent work has highlighted the culturally-contingent nature of commonsense\nknowledge. We introduce AMAMMER${\\epsilon}$, a test set of 525 multiple-choice\nquestions designed to evaluate the commonsense knowledge of English LLMs,\nrelative to the cultural contexts of Ghana and the United States. To create\nAMAMMER${\\epsilon}$, we select a set of multiple-choice questions (MCQs) from\nexisting commonsense datasets and rewrite them in a multi-stage process\ninvolving surveys of Ghanaian and U.S. participants. In three rounds of\nsurveys, participants from both pools are solicited to (1) write correct and\nincorrect answer choices, (2) rate individual answer choices on a 5-point\nLikert scale, and (3) select the best answer choice from the newly-constructed\nMCQ items, in a final validation step. By engaging participants at multiple\nstages, our procedure ensures that participant perspectives are incorporated\nboth in the creation and validation of test items, resulting in high levels of\nagreement within each pool. We evaluate several off-the-shelf English LLMs on\nAMAMMER${\\epsilon}$. Uniformly, models prefer answers choices that align with\nthe preferences of U.S. annotators over Ghanaian annotators. Additionally, when\ntest items specify a cultural context (Ghana or the U.S.), models exhibit some\nability to adapt, but performance is consistently better in U.S. contexts than\nGhanaian. As large resources are devoted to the advancement of English LLMs,\nour findings underscore the need for culturally adaptable models and\nevaluations to meet the needs of diverse English-speaking populations around\nthe world.\n","authors":["Christabel Acquaye","Haozhe An","Rachel Rudinger"],"pdf_url":"https://arxiv.org/pdf/2410.16451v2.pdf","comment":"Accepted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2404.03881v4","updated":"2024-10-23T01:39:31Z","published":"2024-04-05T04:04:23Z","title":"A Bi-consolidating Model for Joint Relational Triple Extraction","summary":" Current methods to extract relational triples directly make a prediction\nbased on a possible entity pair in a raw sentence without depending on entity\nrecognition. The task suffers from a serious semantic overlapping problem, in\nwhich several relation triples may share one or two entities in a sentence. 
In\nthis paper, based on a two-dimensional sentence representation, a\nbi-consolidating model is proposed to address this problem by simultaneously\nreinforcing the local and global semantic features relevant to a relation\ntriple. This model consists of a local consolidation component and a global\nconsolidation component. The first component uses a pixel difference\nconvolution to enhance semantic information of a possible triple representation\nfrom adjacent regions and mitigate noise from neighbouring regions. The second\ncomponent strengthens the triple representation based on a channel attention and a\nspatial attention, which has the advantage of learning remote semantic\ndependencies in a sentence. They are helpful to improve the performance of both\nentity identification and relation type classification in relation triple\nextraction. After being evaluated on several public datasets, the bi-consolidating\nmodel achieves competitive performance. Analytical experiments demonstrate the\neffectiveness of our model for relational triple extraction and give motivation\nfor other natural language processing tasks.\n","authors":["Xiaocheng Luo","Yanping Chen","Ruixue Tang","Caiwei Yang","Ruizhang Huang","Yongbin Qin"],"pdf_url":"https://arxiv.org/pdf/2404.03881v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17498v1","updated":"2024-10-23T01:38:10Z","published":"2024-10-23T01:38:10Z","title":"Mechanisms of Symbol Processing for In-Context Learning in Transformer\n Networks","summary":" Large Language Models (LLMs) have demonstrated impressive abilities in symbol\nprocessing through in-context learning (ICL). This success flies in the face of\ndecades of predictions that artificial neural networks cannot master abstract\nsymbol manipulation. We seek to understand the mechanisms that can enable\nrobust symbol processing in transformer networks, illuminating both the\nunanticipated success, and the significant limitations, of transformers in\nsymbol processing. Borrowing insights from symbolic AI on the power of\nProduction System architectures, we develop a high-level language, PSL, that\nallows us to write symbolic programs to do complex, abstract symbol processing,\nand create compilers that precisely implement PSL programs in transformer\nnetworks which are, by construction, 100% mechanistically interpretable. We\ndemonstrate that PSL is Turing Universal, so the work can inform the\nunderstanding of transformer ICL in general. The type of transformer\narchitecture that we compile from PSL programs suggests a number of paths for\nenhancing transformers' capabilities at symbol processing. (Note: The first\nsection of the paper gives an extended synopsis of the entire paper.)\n","authors":["Paul Smolensky","Roland Fernandez","Zhenghao Herbert Zhou","Mattia Opper","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2410.17498v1.pdf","comment":"101 pages (including 30 pages of Appendices), 18 figures"},{"id":"http://arxiv.org/abs/2410.17492v1","updated":"2024-10-23T01:14:54Z","published":"2024-10-23T01:14:54Z","title":"BadFair: Backdoored Fairness Attacks with Group-conditioned Triggers","summary":" Attacking fairness is crucial because compromised models can introduce biased\noutcomes, undermining trust and amplifying inequalities in sensitive\napplications like hiring, healthcare, and law enforcement. This highlights the\nurgent need to understand how fairness mechanisms can be exploited and to\ndevelop defenses that ensure both fairness and robustness. 
We introduce\nBadFair, a novel backdoored fairness attack methodology. BadFair stealthily\ncrafts a model that operates with accuracy and fairness under regular\nconditions but, when activated by certain triggers, discriminates and produces\nincorrect results for specific groups. This type of attack is particularly\nstealthy and dangerous, as it circumvents existing fairness detection methods,\nmaintaining an appearance of fairness in normal use. Our findings reveal that\nBadFair achieves a more than 85% attack success rate in attacks aimed at target\ngroups on average while only incurring a minimal accuracy loss. Moreover, it\nconsistently exhibits a significant discrimination score, distinguishing\nbetween pre-defined target and non-target attacked groups across various\ndatasets and models.\n","authors":["Jiaqi Xue","Qian Lou","Mengxin Zheng"],"pdf_url":"https://arxiv.org/pdf/2410.17492v1.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2402.10601v2","updated":"2024-10-23T00:38:14Z","published":"2024-02-16T11:37:05Z","title":"When \"Competency\" in Reasoning Opens the Door to Vulnerability:\n Jailbreaking LLMs via Novel Complex Ciphers","summary":" Recent advancements in the safety of Large Language Models (LLMs) have\nprimarily focused on mitigating attacks crafted in natural language or in\ncommon encryption techniques like Base64. However, new models which often\npossess better reasoning capabilities, open the door to new attack vectors that\nwere previously non-existent in older models. This seems counter-intuitive at\nfirst glance, but these advanced models can decipher more complex cryptic\nqueries that previous models could not, making them susceptible to attacks\nusing such prompts. To exploit this vulnerability, we propose Attacks using\nCustom Encryptions (ACE), a novel method to jailbreak LLMs by leveraging custom\nencryption schemes. We evaluate the effectiveness of ACE on four\nstate-of-the-art LLMs, achieving Attack Success Rates (ASR) of up to 66% on\nclose-source models and 88% on open-source models. Building upon this, we\nintroduce Layered Attacks using Custom Encryptions (LACE), which employs\nmultiple layers of encryption through our custom ciphers to further enhance the\nASR. Our findings demonstrate that LACE significantly enhances the ability to\njailbreak LLMs, increasing the ASR of GPT-4o from 40% to 78%, a 38%\nimprovement. Our results highlight that the advanced capabilities of LLMs\nintroduce unforeseen vulnerabilities to complex attacks. Specifically complex\nand layered ciphers increase the chance of jailbreaking.\n","authors":["Divij Handa","Zehua Zhang","Amir Saeidi","Chitta Baral"],"pdf_url":"https://arxiv.org/pdf/2402.10601v2.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2410.17485v1","updated":"2024-10-23T00:36:06Z","published":"2024-10-23T00:36:06Z","title":"VoiceTextBlender: Augmenting Large Language Models with Speech\n Capabilities via Single-Stage Joint Speech-Text Supervised Fine-Tuning","summary":" Recent studies have augmented large language models (LLMs) with speech\ncapabilities, leading to the development of speech language models (SpeechLMs).\nEarlier SpeechLMs focused on single-turn speech-based question answering (QA),\nwhere user input comprised a speech context and a text question. 
More recent\nstudies have extended this to multi-turn conversations, though they often\nrequire complex, multi-stage supervised fine-tuning (SFT) with diverse data.\nAnother critical challenge with SpeechLMs is catastrophic forgetting-where\nmodels optimized for speech tasks suffer significant degradation in text-only\nperformance. To mitigate these issues, we propose a novel single-stage joint\nspeech-text SFT approach on the low-rank adaptation (LoRA) of the LLM backbone.\nOur joint SFT combines text-only SFT data with three types of speech-related\ndata: speech recognition and translation, speech-based QA, and mixed-modal SFT.\nCompared to previous SpeechLMs with 7B or 13B parameters, our 3B model\ndemonstrates superior performance across various speech benchmarks while\npreserving the original capabilities on text-only tasks. Furthermore, our model\nshows emergent abilities of effectively handling previously unseen prompts and\ntasks, including multi-turn, mixed-modal inputs.\n","authors":["Yifan Peng","Krishna C. Puvvada","Zhehuai Chen","Piotr Zelasko","He Huang","Kunal Dhawan","Ke Hu","Shinji Watanabe","Jagadeesh Balam","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2410.17485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17484v1","updated":"2024-10-23T00:31:17Z","published":"2024-10-23T00:31:17Z","title":"Which Client is Reliable?: A Reliable and Personalized Prompt-based\n Federated Learning for Medical Image Question Answering","summary":" Conventional medical artificial intelligence (AI) models face barriers in\nclinical application and ethical issues owing to their inability to handle the\nprivacy-sensitive characteristics of medical data. We present a novel\npersonalized federated learning (pFL) method for medical visual question\nanswering (VQA) models, addressing privacy reliability challenges in the\nmedical domain. Our method introduces learnable prompts into a Transformer\narchitecture to efficiently train it on diverse medical datasets without\nmassive computational costs. Then we introduce a reliable client VQA model that\nincorporates Dempster-Shafer evidence theory to quantify uncertainty in\npredictions, enhancing the model's reliability. Furthermore, we propose a novel\ninter-client communication mechanism that uses maximum likelihood estimation to\nbalance accuracy and uncertainty, fostering efficient integration of insights\nacross clients.\n","authors":["He Zhu","Ren Togo","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2410.17484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17482v1","updated":"2024-10-23T00:05:57Z","published":"2024-10-23T00:05:57Z","title":"Is artificial intelligence still intelligence? LLMs generalize to novel\n adjective-noun pairs, but don't mimic the full human distribution","summary":" Inferences from adjective-noun combinations like \"Is artificial intelligence\nstill intelligence?\" provide a good test bed for LLMs' understanding of meaning\nand compositional generalization capability, since there are many combinations\nwhich are novel to both humans and LLMs but nevertheless elicit convergent\nhuman judgments. We study a range of LLMs and find that the largest models we\ntested are able to draw human-like inferences when the inference is determined\nby context and can generalize to unseen adjective-noun combinations. We also\npropose three methods to evaluate LLMs on these inferences out of context,\nwhere there is a distribution of human-like answers rather than a single\ncorrect answer. 
We find that LLMs show a human-like distribution on at most\n75\\% of our dataset, which is promising but still leaves room for improvement.\n","authors":["Hayley Ross","Kathryn Davidson","Najoung Kim"],"pdf_url":"https://arxiv.org/pdf/2410.17482v1.pdf","comment":"9 pages (23 pages with appendix). Accepted to GenBench 2024"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2410.18084v1","updated":"2024-10-23T17:59:58Z","published":"2024-10-23T17:59:58Z","title":"DynamicCity: Large-Scale LiDAR Generation from Dynamic Scenes","summary":" LiDAR scene generation has been developing rapidly recently. However,\nexisting methods primarily focus on generating static and single-frame scenes,\noverlooking the inherently dynamic nature of real-world driving environments.\nIn this work, we introduce DynamicCity, a novel 4D LiDAR generation framework\ncapable of generating large-scale, high-quality LiDAR scenes that capture the\ntemporal evolution of dynamic environments. DynamicCity mainly consists of two\nkey models. 1) A VAE model for learning HexPlane as the compact 4D\nrepresentation. Instead of using naive averaging operations, DynamicCity\nemploys a novel Projection Module to effectively compress 4D LiDAR features\ninto six 2D feature maps for HexPlane construction, which significantly\nenhances HexPlane fitting quality (up to 12.56 mIoU gain). Furthermore, we\nutilize an Expansion & Squeeze Strategy to reconstruct 3D feature volumes in\nparallel, which improves both network training efficiency and reconstruction\naccuracy than naively querying each 3D point (up to 7.05 mIoU gain, 2.06x\ntraining speedup, and 70.84% memory reduction). 2) A DiT-based diffusion model\nfor HexPlane generation. To make HexPlane feasible for DiT generation, a Padded\nRollout Operation is proposed to reorganize all six feature planes of the\nHexPlane as a squared 2D feature map. In particular, various conditions could\nbe introduced in the diffusion or sampling process, supporting versatile 4D\ngeneration applications, such as trajectory- and command-driven generation,\ninpainting, and layout-conditioned generation. Extensive experiments on the\nCarlaSC and Waymo datasets demonstrate that DynamicCity significantly\noutperforms existing state-of-the-art 4D LiDAR generation methods across\nmultiple metrics. The code will be released to facilitate future research.\n","authors":["Hengwei Bian","Lingdong Kong","Haozhe Xie","Liang Pan","Yu Qiao","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2410.18084v1.pdf","comment":"Preprint; 29 pages, 15 figures, 7 tables; Project Page at\n https://dynamic-city.github.io/"},{"id":"http://arxiv.org/abs/2410.18083v1","updated":"2024-10-23T17:59:57Z","published":"2024-10-23T17:59:57Z","title":"FIPER: Generalizable Factorized Fields for Joint Image Compression and\n Super-Resolution","summary":" In this work, we propose a unified representation for Super-Resolution (SR)\nand Image Compression, termed **Factorized Fields**, motivated by the shared\nprinciples between these two tasks. Both SISR and Image Compression require\nrecovering and preserving fine image details--whether by enhancing resolution\nor reconstructing compressed data. Unlike previous methods that mainly focus on\nnetwork architecture, our proposed approach utilizes a basis-coefficient\ndecomposition to explicitly capture multi-scale visual features and structural\ncomponents in images, addressing the core challenges of both tasks. 
We first\nderive our SR model, which includes a Coefficient Backbone and Basis Swin\nTransformer for generalizable Factorized Fields. Then, to further unify these\ntwo tasks, we leverage the strong information-recovery capabilities of the\ntrained SR modules as priors in the compression pipeline, improving both\ncompression efficiency and detail reconstruction. Additionally, we introduce a\nmerged-basis compression branch that consolidates shared structures, further\noptimizing the compression process. Extensive experiments show that our unified\nrepresentation delivers state-of-the-art performance, achieving an average\nrelative improvement of 204.4% in PSNR over the baseline in Super-Resolution\n(SR) and 9.35% BD-rate reduction in Image Compression compared to the previous\nSOTA.\n","authors":["Yang-Che Sun","Cheng Yu Yeo","Ernie Chu","Jun-Cheng Chen","Yu-Lun Liu"],"pdf_url":"https://arxiv.org/pdf/2410.18083v1.pdf","comment":"Project page: https://jayisaking.github.io/FIPER/"},{"id":"http://arxiv.org/abs/2410.18079v1","updated":"2024-10-23T17:59:11Z","published":"2024-10-23T17:59:11Z","title":"FreeVS: Generative View Synthesis on Free Driving Trajectory","summary":" Existing reconstruction-based novel view synthesis methods for driving scenes\nfocus on synthesizing camera views along the recorded trajectory of the ego\nvehicle. Their image rendering performance will severely degrade on viewpoints\nfalling out of the recorded trajectory, where camera rays are untrained. We\npropose FreeVS, a novel fully generative approach that can synthesize camera\nviews on free new trajectories in real driving scenes. To control the\ngeneration results to be 3D consistent with the real scenes and accurate in\nviewpoint pose, we propose the pseudo-image representation of view priors to\ncontrol the generation process. Viewpoint transformation simulation is applied\non pseudo-images to simulate camera movement in each direction. Once trained,\nFreeVS can be applied to any validation sequences without reconstruction\nprocess and synthesis views on novel trajectories. Moreover, we propose two new\nchallenging benchmarks tailored to driving scenes, which are novel camera\nsynthesis and novel trajectory synthesis, emphasizing the freedom of\nviewpoints. Given that no ground truth images are available on novel\ntrajectories, we also propose to evaluate the consistency of images synthesized\non novel trajectories with 3D perception models. Experiments on the Waymo Open\nDataset show that FreeVS has a strong image synthesis performance on both the\nrecorded trajectories and novel trajectories. Project Page:\nhttps://freevs24.github.io/\n","authors":["Qitai Wang","Lue Fan","Yuqi Wang","Yuntao Chen","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.18079v1.pdf","comment":"Project Page: https://freevs24.github.io/"},{"id":"http://arxiv.org/abs/2410.18074v1","updated":"2024-10-23T17:56:33Z","published":"2024-10-23T17:56:33Z","title":"UnCLe: Unsupervised Continual Learning of Depth Completion","summary":" We propose UnCLe, a standardized benchmark for Unsupervised Continual\nLearning of a multimodal depth estimation task: Depth completion aims to infer\na dense depth map from a pair of synchronized RGB image and sparse depth map.\nWe benchmark depth completion models under the practical scenario of\nunsupervised learning over continuous streams of data. Existing methods are\ntypically trained on a static, or stationary, dataset. 
However, when adapting\nto novel non-stationary distributions, they \"catastrophically forget\"\npreviously learned information. UnCLe simulates these non-stationary\ndistributions by adapting depth completion models to sequences of datasets\ncontaining diverse scenes captured from distinct domains using different visual\nand range sensors. We adopt representative methods from continual learning\nparadigms and translate them to enable unsupervised continual learning of depth\ncompletion. We benchmark these models for indoor and outdoor and investigate\nthe degree of catastrophic forgetting through standard quantitative metrics.\nFurthermore, we introduce model inversion quality as an additional measure of\nforgetting. We find that unsupervised continual learning of depth completion is\nan open problem, and we invite researchers to leverage UnCLe as a development\nplatform.\n","authors":["Suchisrit Gangopadhyay","Xien Chen","Michael Chu","Patrick Rim","Hyoungseob Park","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2410.18074v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.18072v1","updated":"2024-10-23T17:56:11Z","published":"2024-10-23T17:56:11Z","title":"WorldSimBench: Towards Video Generation Models as World Simulators","summary":" Recent advancements in predictive models have demonstrated exceptional\ncapabilities in predicting the future state of objects and scenes. However, the\nlack of categorization based on inherent characteristics continues to hinder\nthe progress of predictive model development. Additionally, existing benchmarks\nare unable to effectively evaluate higher-capability, highly embodied\npredictive models from an embodied perspective. In this work, we classify the\nfunctionalities of predictive models into a hierarchy and take the first step\nin evaluating World Simulators by proposing a dual evaluation framework called\nWorldSimBench. WorldSimBench includes Explicit Perceptual Evaluation and\nImplicit Manipulative Evaluation, encompassing human preference assessments\nfrom the visual perspective and action-level evaluations in embodied tasks,\ncovering three representative embodied scenarios: Open-Ended Embodied\nEnvironment, Autonomous, Driving, and Robot Manipulation. In the Explicit\nPerceptual Evaluation, we introduce the HF-Embodied Dataset, a video assessment\ndataset based on fine-grained human feedback, which we use to train a Human\nPreference Evaluator that aligns with human perception and explicitly assesses\nthe visual fidelity of World Simulators. In the Implicit Manipulative\nEvaluation, we assess the video-action consistency of World Simulators by\nevaluating whether the generated situation-aware video can be accurately\ntranslated into the correct control signals in dynamic environments. 
Our\ncomprehensive evaluation offers key insights that can drive further innovation\nin video generation models, positioning World Simulators as a pivotal\nadvancement toward embodied artificial intelligence.\n","authors":["Yiran Qin","Zhelun Shi","Jiwen Yu","Xijun Wang","Enshen Zhou","Lijun Li","Zhenfei Yin","Xihui Liu","Lu Sheng","Jing Shao","Lei Bai","Wanli Ouyang","Ruimao Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.18072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18071v1","updated":"2024-10-23T17:54:43Z","published":"2024-10-23T17:54:43Z","title":"TP-Eval: Tap Multimodal LLMs' Potential in Evaluation by Customizing\n Prompts","summary":" Recently, multimodal large language models (MLLMs) have received much\nattention for their impressive capabilities. The evaluation of MLLMs is\nbecoming critical to analyzing attributes of MLLMs and providing valuable\ninsights. However, current benchmarks overlook the problem of prompt\nsensitivity - minor prompt variations may lead to significant performance\nfluctuations. Thus, inappropriate prompts may obscure the models' capabilities,\nunderestimating the models' performance. Moreover, different models have\ndifferent preferences for different prompts, and thus, using the same prompt\nfor all models will cause evaluation bias. This paper analyzes this deficiency\nin existing benchmarks and further introduces a new evaluation framework named\nTP-Eval, which introduces a prompt customization method to reduce evaluation\nbiases and tap models' potential. TP-Eval will rewrite the original prompts to\ndifferent customized prompts for different models. In particular, we propose\nsome well-designed modules for prompt customization tailored to the scenario of\nMLLM evaluation. Extensive experiments demonstrate the effectiveness of our\napproach to uncovering models' capabilities, and TP-Eval should benefit the\ncommunity in developing more comprehensive and convincing MLLM evaluation\nbenchmarks.\n","authors":["Yuxuan Xie","Tianhua Li","Wenqi Shao","Kaipeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.18071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12568v2","updated":"2024-10-23T17:53:24Z","published":"2024-08-22T17:35:18Z","title":"Pruning By Explaining Revisited: Optimizing Attribution Methods to Prune\n CNNs and Transformers","summary":" To solve ever more complex problems, Deep Neural Networks are scaled to\nbillions of parameters, leading to huge computational costs. An effective\napproach to reduce computational requirements and increase efficiency is to\nprune unnecessary components of these often over-parameterized networks.\nPrevious work has shown that attribution methods from the field of eXplainable\nAI serve as effective means to extract and prune the least relevant network\ncomponents in a few-shot fashion. We extend the current state by proposing to\nexplicitly optimize hyperparameters of attribution methods for the task of\npruning, and further include transformer-based networks in our analysis. Our\napproach yields higher model compression rates of large transformer- and\nconvolutional architectures (VGG, ResNet, ViT) compared to previous works,\nwhile still attaining high performance on ImageNet classification tasks. Here,\nour experiments indicate that transformers have a higher degree of\nover-parameterization compared to convolutional neural networks. 
Code is\navailable at https://github.com/erfanhatefi/Pruning-by-eXplaining-in-PyTorch.\n","authors":["Sayed Mohammad Vakilzadeh Hatefi","Maximilian Dreyer","Reduan Achtibat","Thomas Wiegand","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2408.12568v2.pdf","comment":"Accepted as a workshop paper at ECCV 2024, 26 pages (11 pages\n manuscript, 3 pages references, 12 pages appendix)"},{"id":"http://arxiv.org/abs/2410.18065v1","updated":"2024-10-23T17:42:07Z","published":"2024-10-23T17:42:07Z","title":"SPIRE: Synergistic Planning, Imitation, and Reinforcement Learning for\n Long-Horizon Manipulation","summary":" Robot learning has proven to be a general and effective technique for\nprogramming manipulators. Imitation learning is able to teach robots solely\nfrom human demonstrations but is bottlenecked by the capabilities of the\ndemonstrations. Reinforcement learning uses exploration to discover better\nbehaviors; however, the space of possible improvements can be too large to\nstart from scratch. And for both techniques, the learning difficulty increases\nproportional to the length of the manipulation task. Accounting for this, we\npropose SPIRE, a system that first uses Task and Motion Planning (TAMP) to\ndecompose tasks into smaller learning subproblems and second combines imitation\nand reinforcement learning to maximize their strengths. We develop novel\nstrategies to train learning agents when deployed in the context of a planning\nsystem. We evaluate SPIRE on a suite of long-horizon and contact-rich robot\nmanipulation problems. We find that SPIRE outperforms prior approaches that\nintegrate imitation learning, reinforcement learning, and planning by 35% to\n50% in average task performance, is 6 times more data efficient in the number\nof human demonstrations needed to train proficient agents, and learns to\ncomplete tasks nearly twice as efficiently. View\nhttps://sites.google.com/view/spire-corl-2024 for more details.\n","authors":["Zihan Zhou","Animesh Garg","Dieter Fox","Caelan Garrett","Ajay Mandlekar"],"pdf_url":"https://arxiv.org/pdf/2410.18065v1.pdf","comment":"Conference on Robot Learning (CoRL) 2024"},{"id":"http://arxiv.org/abs/2410.18057v1","updated":"2024-10-23T17:30:50Z","published":"2024-10-23T17:30:50Z","title":"CLEAR: Character Unlearning in Textual and Visual Modalities","summary":" Machine Unlearning (MU) is critical for enhancing privacy and security in\ndeep learning models, particularly in large multimodal language models (MLLMs),\nby removing specific private or hazardous information. While MU has made\nsignificant progress in textual and visual modalities, multimodal unlearning\n(MMU) remains significantly underexplored, partially due to the absence of a\nsuitable open-source benchmark. To address this, we introduce CLEAR, a new\nbenchmark designed to evaluate MMU methods. CLEAR contains 200 fictitious\nindividuals and 3,700 images linked with corresponding question-answer pairs,\nenabling a thorough evaluation across modalities. We assess 10 MU methods,\nadapting them for MMU, and highlight new challenges specific to multimodal\nforgetting. We also demonstrate that simple $\\ell_1$ regularization on LoRA\nweights significantly mitigates catastrophic forgetting, preserving model\nperformance on retained data. The dataset is available at\nhttps://huggingface.co/datasets/therem/CLEAR\n","authors":["Alexey Dontsov","Dmitrii Korzh","Alexey Zhavoronkin","Boris Mikheev","Denis Bobkov","Aibek Alanov","Oleg Y. 
Rogov","Ivan Oseledets","Elena Tutubalina"],"pdf_url":"https://arxiv.org/pdf/2410.18057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18052v1","updated":"2024-10-23T17:26:22Z","published":"2024-10-23T17:26:22Z","title":"In-Pixel Foreground and Contrast Enhancement Circuits with Customizable\n Mapping","summary":" This paper presents an innovative in-pixel contrast enhancement circuit that\nperforms image processing directly within the pixel circuit. The circuit can be\ntuned for different modes of operation. In foreground enhancement mode, it\nsuppresses low-intensity background pixels to nearly zero, isolating the\nforeground for better object visibility. In contrast enhancement mode, it\nimproves overall image contrast. The contrast enhancement function is\ncustomizable both during the design phase and in real-time, allowing the\ncircuit to adapt to specific applications and varying lighting conditions. A\nmodel of the designed pixel circuit is developed and applied to a full pixel\narray, demonstrating significant improvements in image quality. Simulations\nperformed in HSPICE show a nearly 6x increase in Michelson Contrast Ratio (CR)\nin the foreground enhancement mode. The simulation results indicate its\npotential for real-time, adaptive contrast enhancement across various imaging\nenvironments.\n","authors":["Md Rahatul Islam Udoy","Md Mazharul Islam","Elijah Johnson","Ahmedullah Aziz"],"pdf_url":"https://arxiv.org/pdf/2410.18052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18051v1","updated":"2024-10-23T17:25:26Z","published":"2024-10-23T17:25:26Z","title":"Real time anomalies detection on video","summary":" Nowadays, many places use security cameras. Unfortunately, when an incident\noccurs, these technologies are used to show past events. So they can be\nconsidered as a deterrence tool rather than a detection tool. In this article, we\npropose a deep learning approach to address this problem. This\napproach uses convolutional models (CNN) to extract relevant characteristics\nfrom the video images; these characteristics form time series that are\nanalyzed by LSTM / GRU models.\n","authors":["Fabien Poirier"],"pdf_url":"https://arxiv.org/pdf/2410.18051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18013v1","updated":"2024-10-23T16:42:56Z","published":"2024-10-23T16:42:56Z","title":"Scalable Ranked Preference Optimization for Text-to-Image Generation","summary":" Direct Preference Optimization (DPO) has emerged as a powerful approach to\nalign text-to-image (T2I) models with human feedback. Unfortunately, successful\napplication of DPO to T2I models requires a huge amount of resources to collect\nand label large-scale datasets, e.g., millions of generated paired images\nannotated with human preferences. In addition, these human preference datasets\ncan get outdated quickly as the rapid improvements of T2I models lead to higher\nquality images. In this work, we investigate a scalable approach for collecting\nlarge-scale and fully synthetic datasets for DPO training. Specifically, the\npreferences for paired images are generated using a pre-trained reward\nfunction, eliminating the need for involving humans in the annotation process,\ngreatly improving the dataset collection efficiency. Moreover, we demonstrate\nthat such datasets allow averaging predictions across multiple models and\ncollecting ranked preferences as opposed to pairwise preferences. 
Furthermore,\nwe introduce RankDPO to enhance DPO-based methods using the ranking feedback.\nApplying RankDPO on SDXL and SD3-Medium models with our synthetically generated\npreference dataset ``Syn-Pic'' improves both prompt-following (on benchmarks\nlike T2I-Compbench, GenEval, and DPG-Bench) and visual quality (through user\nstudies). This pipeline presents a practical and scalable solution to develop\nbetter preference datasets to enhance the performance of text-to-image models.\n","authors":["Shyamgopal Karthik","Huseyin Coskun","Zeynep Akata","Sergey Tulyakov","Jian Ren","Anil Kag"],"pdf_url":"https://arxiv.org/pdf/2410.18013v1.pdf","comment":"Project Page: https://snap-research.github.io/RankDPO/"},{"id":"http://arxiv.org/abs/2409.04429v2","updated":"2024-10-23T16:42:06Z","published":"2024-09-06T17:49:56Z","title":"VILA-U: a Unified Foundation Model Integrating Visual Understanding and\n Generation","summary":" VILA-U is a Unified foundation model that integrates Video, Image, Language\nunderstanding and generation. Traditional visual language models (VLMs) use\nseparate modules for understanding and generating visual content, which can\nlead to misalignment and increased complexity. In contrast, VILA-U employs a\nsingle autoregressive next-token prediction framework for both tasks,\neliminating the need for additional components like diffusion models. This\napproach not only simplifies the model but also achieves near state-of-the-art\nperformance in visual language understanding and generation. The success of\nVILA-U is attributed to two main factors: the unified vision tower that aligns\ndiscrete visual tokens with textual inputs during pretraining, which enhances\nvisual perception, and autoregressive image generation can achieve similar\nquality as diffusion models with high-quality dataset. This allows VILA-U to\nperform comparably to more complex models using a fully token-based\nautoregressive framework.\n","authors":["Yecheng Wu","Zhuoyang Zhang","Junyu Chen","Haotian Tang","Dacheng Li","Yunhao Fang","Ligeng Zhu","Enze Xie","Hongxu Yin","Li Yi","Song Han","Yao Lu"],"pdf_url":"https://arxiv.org/pdf/2409.04429v2.pdf","comment":"Code: https://github.com/mit-han-lab/vila-u. The first two authors\n contributed equally to this work"},{"id":"http://arxiv.org/abs/2403.05489v2","updated":"2024-10-23T16:39:15Z","published":"2024-03-08T17:54:38Z","title":"JointMotion: Joint Self-Supervision for Joint Motion Prediction","summary":" We present JointMotion, a self-supervised pre-training method for joint\nmotion prediction in self-driving vehicles. Our method jointly optimizes a\nscene-level objective connecting motion and environments, and an instance-level\nobjective to refine learned representations. Scene-level representations are\nlearned via non-contrastive similarity learning of past motion sequences and\nenvironment context. At the instance level, we use masked autoencoding to\nrefine multimodal polyline representations. We complement this with an adaptive\npre-training decoder that enables JointMotion to generalize across different\nenvironment representations, fusion mechanisms, and dataset characteristics.\nNotably, our method reduces the joint final displacement error of Wayformer,\nHPTR, and Scene Transformer models by 3\\%, 8\\%, and 12\\%, respectively; and\nenables transfer learning between the Waymo Open Motion and the Argoverse 2\nMotion Forecasting datasets. 
Code: https://github.com/kit-mrt/future-motion\n","authors":["Royden Wagner","Omer Sahin Tas","Marvin Klemp","Carlos Fernandez"],"pdf_url":"https://arxiv.org/pdf/2403.05489v2.pdf","comment":"CoRL'24 camera-ready"},{"id":"http://arxiv.org/abs/2309.17327v2","updated":"2024-10-23T16:25:17Z","published":"2023-09-29T15:34:39Z","title":"Telling Stories for Common Sense Zero-Shot Action Recognition","summary":" Video understanding has long suffered from reliance on large labeled\ndatasets, motivating research into zero-shot learning. Recent progress in\nlanguage modeling presents opportunities to advance zero-shot video analysis,\nbut constructing an effective semantic space relating action classes remains\nchallenging. We address this by introducing a novel dataset, Stories, which\ncontains rich textual descriptions for diverse action classes extracted from\nWikiHow articles. For each class, we extract multi-sentence narratives\ndetailing the necessary steps, scenes, objects, and verbs that characterize the\naction. This contextual data enables modeling of nuanced relationships between\nactions, paving the way for zero-shot transfer. We also propose an approach\nthat harnesses Stories to improve feature generation for training zero-shot\nclassification. Without any target dataset fine-tuning, our method achieves new\nstate-of-the-art on multiple benchmarks, improving top-1 accuracy by up to\n6.1%. We believe Stories provides a valuable resource that can catalyze\nprogress in zero-shot action recognition. The textual narratives forge\nconnections between seen and unseen classes, overcoming the bottleneck of\nlabeled data that has long impeded advancements in this exciting domain. The\ndata can be found here: https://github.com/kini5gowda/Stories .\n","authors":["Shreyank N Gowda","Laura Sevilla-Lara"],"pdf_url":"https://arxiv.org/pdf/2309.17327v2.pdf","comment":"Accepted in ACCV 2024!"},{"id":"http://arxiv.org/abs/2410.03189v2","updated":"2024-10-23T16:22:59Z","published":"2024-10-04T07:02:13Z","title":"Generalizable Prompt Tuning for Vision-Language Models","summary":" Prompt tuning for vision-language models such as CLIP involves optimizing the\ntext prompts used to generate image-text pairs for specific downstream tasks.\nWhile hand-crafted or template-based prompts are generally applicable to a\nwider range of unseen classes, they tend to perform poorly in downstream tasks\n(i.e., seen classes). Learnable soft prompts, on the other hand, often perform\nwell in downstream tasks but lack generalizability. Additionally, prior\nresearch has predominantly concentrated on the textual modality, with very few\nstudies attempting to explore the prompt's generalization potential from the\nvisual modality. Keeping these limitations in mind, we investigate how to\nprompt tuning to obtain both a competitive downstream performance and\ngeneralization. The study shows that by treating soft and hand-crafted prompts\nas dual views of the textual modality, and maximizing their mutual information,\nwe can better ensemble task-specific and general semantic information.\nMoreover, to generate more expressive prompts, the study introduces a\nclass-wise augmentation from the visual modality, resulting in significant\nrobustness to a wider range of unseen classes. 
Extensive evaluations on several\nbenchmarks report that the proposed approach achieves competitive results in\nterms of both task-specific performance and general abilities.\n","authors":["Qian Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.03189v2.pdf","comment":"in progress"},{"id":"http://arxiv.org/abs/2410.17997v1","updated":"2024-10-23T16:12:03Z","published":"2024-10-23T16:12:03Z","title":"Characterization of the multiplicity of solutions for camera pose given\n two vertically-aligned landmarks and accelerometer","summary":" We consider the problem of recovering the position and orientation of a\ncamera equipped with an accelerometer from sensor images of two labeled\nlandmarks whose positions in a coordinate system aligned in a known way with\ngravity are known. This is a variant on the much studied P$n$P problem of\nrecovering camera position and orientation from $n$ points without any\ngravitational data. It is proved that in three types of singular cases there\nare infinitely many solutions, in another type of case there is one, and in a\nfinal type of case there are two. A precise characterization of each type of\ncase is given. In particular, there is always a unique solution in the practically\ninteresting case where the two landmarks are at the same altitude and the\ncamera is at a different altitude. This case is studied by numerical simulation\nand an implementation on a consumer cellphone. It is also proved that if the\ntwo landmarks are unlabeled, then apart from the same singular cases, there are\nstill always one or two solutions.\n","authors":["Alexander R. Pruss"],"pdf_url":"https://arxiv.org/pdf/2410.17997v1.pdf","comment":"32 pages, 8 figures"},{"id":"http://arxiv.org/abs/2407.19553v2","updated":"2024-10-23T16:06:32Z","published":"2024-07-28T18:20:08Z","title":"Exploring the Adversarial Robustness of CLIP for AI-generated Image\n Detection","summary":" In recent years, many forensic detectors have been proposed to detect\nAI-generated images and prevent their use for malicious purposes. Convolutional\nneural networks (CNNs) have long been the dominant architecture in this field\nand have been the subject of intense study. However, recently proposed\nTransformer-based detectors have been shown to match or even outperform\nCNN-based detectors, especially in terms of generalization. In this paper, we\nstudy the adversarial robustness of AI-generated image detectors, focusing on\nContrastive Language-Image Pretraining (CLIP)-based methods that rely on Visual\nTransformer (ViT) backbones and comparing their performance with CNN-based\nmethods. We study the robustness to different adversarial attacks under a\nvariety of conditions and analyze both numerical results and frequency-domain\npatterns. CLIP-based detectors are found to be vulnerable to white-box attacks\njust like CNN-based detectors. However, attacks do not easily transfer between\nCNN-based and CLIP-based methods. 
This is also confirmed by the different\ndistribution of the adversarial noise patterns in the frequency domain.\nOverall, this analysis provides new insights into the properties of forensic\ndetectors that can help to develop more effective strategies.\n","authors":["Vincenzo De Rosa","Fabrizio Guillaro","Giovanni Poggi","Davide Cozzolino","Luisa Verdoliva"],"pdf_url":"https://arxiv.org/pdf/2407.19553v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17988v1","updated":"2024-10-23T16:01:31Z","published":"2024-10-23T16:01:31Z","title":"A Pipeline for Segmenting and Structuring RGB-D Data for Robotics\n Applications","summary":" We introduce a novel pipeline for segmenting and structuring color and depth\n(RGB-D) data. Existing processing pipelines for RGB-D data have focused on\nextracting geometric information alone. This approach precludes the development\nof more advanced robotic navigation and manipulation algorithms, which benefit\nfrom a semantic understanding of their environment. Our pipeline can segment\nRGB-D data into accurate semantic masks. These masks are then used to fuse raw\ncaptured point clouds into semantically separated point clouds. We store this\ninformation using the Universal Scene Description (USD) file format, a format\nsuitable for easy querying by downstream robotics algorithms, human-friendly\nvisualization, and robotics simulation.\n","authors":["Zhiwu Zheng","Lauren Mentzer","Berk Iskender","Michael Price","Colm Prendergast","Audren Cloitre"],"pdf_url":"https://arxiv.org/pdf/2410.17988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17983v1","updated":"2024-10-23T15:51:33Z","published":"2024-10-23T15:51:33Z","title":"Robust Two-View Geometry Estimation with Implicit Differentiation","summary":" We present a novel two-view geometry estimation framework which is based on a\ndifferentiable robust loss function fitting. We propose to treat the robust\nfundamental matrix estimation as an implicit layer, which allows us to avoid\nbackpropagation through time and significantly improves the numerical\nstability. To take full advantage of the information from the feature matching\nstage we incorporate learnable weights that depend on the matching confidences.\nIn this way our solution brings together feature extraction, matching and\ntwo-view geometry estimation in a unified end-to-end trainable pipeline. We\nevaluate our approach on the camera pose estimation task in both outdoor and\nindoor scenarios. The experiments on several datasets show that the proposed\nmethod outperforms both classic and learning-based state-of-the-art methods by\na large margin. The project webpage is available at:\nhttps://github.com/VladPyatov/ihls\n","authors":["Vladislav Pyatov","Iaroslav Koshelev","Stamatis Lefkimmiatis"],"pdf_url":"https://arxiv.org/pdf/2410.17983v1.pdf","comment":"IROS 2024 Accepted"},{"id":"http://arxiv.org/abs/2410.17966v1","updated":"2024-10-23T15:34:06Z","published":"2024-10-23T15:34:06Z","title":"A Wavelet Diffusion GAN for Image Super-Resolution","summary":" In recent years, diffusion models have emerged as a superior alternative to\ngenerative adversarial networks (GANs) for high-fidelity image generation, with\nwide applications in text-to-image generation, image-to-image translation, and\nsuper-resolution. However, their real-time feasibility is hindered by slow\ntraining and inference speeds. This study addresses this challenge by proposing\na wavelet-based conditional Diffusion GAN scheme for Single-Image\nSuper-Resolution (SISR). 
Our approach utilizes the diffusion GAN paradigm to\nreduce the timesteps required by the reverse diffusion process and the Discrete\nWavelet Transform (DWT) to achieve dimensionality reduction, decreasing\ntraining and inference times significantly. The results of an experimental\nvalidation on the CelebA-HQ dataset confirm the effectiveness of our proposed\nscheme. Our approach outperforms other state-of-the-art methodologies,\nsuccessfully ensuring high-fidelity output while overcoming inherent drawbacks\nassociated with diffusion models in time-sensitive applications.\n","authors":["Lorenzo Aloisi","Luigi Sigillo","Aurelio Uncini","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2410.17966v1.pdf","comment":"The paper has been accepted at Italian Workshop on Neural Networks\n (WIRN) 2024"},{"id":"http://arxiv.org/abs/2408.15205v2","updated":"2024-10-23T15:33:47Z","published":"2024-08-27T17:06:22Z","title":"Leveraging Hallucinations to Reduce Manual Prompt Dependency in\n Promptable Segmentation","summary":" Promptable segmentation typically requires instance-specific manual prompts\nto guide the segmentation of each desired object. To minimize such a need,\ntask-generic promptable segmentation has been introduced, which employs a\nsingle task-generic prompt to segment various images of different objects in\nthe same task. Current methods use Multimodal Large Language Models (MLLMs) to\nreason detailed instance-specific prompts from a task-generic prompt for\nimproving segmentation accuracy. The effectiveness of this segmentation heavily\ndepends on the precision of these derived prompts. However, MLLMs often suffer\nhallucinations during reasoning, resulting in inaccurate prompting. While\nexisting methods focus on eliminating hallucinations to improve a model, we\nargue that MLLM hallucinations can reveal valuable contextual insights when\nleveraged correctly, as they represent pre-trained large-scale knowledge beyond\nindividual images. In this paper, we utilize hallucinations to mine\ntask-related information from images and verify its accuracy for enhancing\nprecision of the generated prompts. Specifically, we introduce an iterative\nPrompt-Mask Cycle generation framework (ProMaC) with a prompt generator and a\nmask generator. The prompt generator uses a multi-scale chain of thought\nprompting, initially exploring hallucinations for extracting extended\ncontextual knowledge on a test image. These hallucinations are then reduced to\nformulate precise instance-specific prompts, directing the mask generator to\nproduce masks that are consistent with task semantics by mask semantic\nalignment. The generated masks iteratively induce the prompt generator to focus\nmore on task-relevant image areas and reduce irrelevant hallucinations,\nresulting jointly in better prompts and masks. Experiments on 5 benchmarks\ndemonstrate the effectiveness of ProMaC. Code given in\nhttps://lwpyh.github.io/ProMaC/.\n","authors":["Jian Hu","Jiayi Lin","Junchi Yan","Shaogang Gong"],"pdf_url":"https://arxiv.org/pdf/2408.15205v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.17959v1","updated":"2024-10-23T15:28:25Z","published":"2024-10-23T15:28:25Z","title":"Medical Imaging Complexity and its Effects on GAN Performance","summary":" The proliferation of machine learning models in diverse clinical applications\nhas led to a growing need for high-fidelity, medical image training data. Such\ndata is often scarce due to cost constraints and privacy concerns. 
Alleviating\nthis burden, medical image synthesis via generative adversarial networks (GANs)\nemerged as a powerful method for synthetically generating photo-realistic\nimages based on existing sets of real medical images. However, the exact image\nset size required to efficiently train such a GAN is unclear. In this work, we\nexperimentally establish benchmarks that measure the relationship between a\nsample dataset size and the fidelity of the generated images, given the\ndataset's distribution of image complexities. We analyze statistical metrics\nbased on delentropy, an image complexity measure rooted in Shannon's entropy in\ninformation theory. For our pipeline, we conduct experiments with two\nstate-of-the-art GANs, StyleGAN 3 and SPADE-GAN, trained on multiple medical\nimaging datasets with variable sample sizes. Across both GANs, general\nperformance improved with increasing training set size but suffered with\nincreasing complexity.\n","authors":["William Cagas","Chan Ko","Blake Hsiao","Shryuk Grandhi","Rishi Bhattacharya","Kevin Zhu","Michael Lam"],"pdf_url":"https://arxiv.org/pdf/2410.17959v1.pdf","comment":"Accepted to ACCV, Workshop on Generative AI for Synthetic Medical\n Data"},{"id":"http://arxiv.org/abs/2410.12018v2","updated":"2024-10-23T15:21:54Z","published":"2024-10-15T19:33:57Z","title":"LocoMotion: Learning Motion-Focused Video-Language Representations","summary":" This paper strives for motion-focused video-language representations.\nExisting methods to learn video-language representations use spatial-focused\ndata, where identifying the objects and scene is often enough to distinguish\nthe relevant caption. We instead propose LocoMotion to learn from\nmotion-focused captions that describe the movement and temporal progression of\nlocal object motions. We achieve this by adding synthetic motions to videos and\nusing the parameters of these motions to generate corresponding captions.\nFurthermore, we propose verb-variation paraphrasing to increase the caption\nvariety and learn the link between primitive motions and high-level verbs. With\nthis, we are able to learn a motion-focused video-language representation.\nExperiments demonstrate our approach is effective for a variety of downstream\ntasks, particularly when limited data is available for fine-tuning. Code is\navailable: https://hazeldoughty.github.io/Papers/LocoMotion/\n","authors":["Hazel Doughty","Fida Mohammad Thoker","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2410.12018v2.pdf","comment":"ACCV 2024 Oral"},{"id":"http://arxiv.org/abs/2406.14856v2","updated":"2024-10-23T15:08:59Z","published":"2024-06-21T04:02:19Z","title":"Accessible, At-Home Detection of Parkinson's Disease via Multi-task\n Video Analysis","summary":" Limited accessibility to neurological care leads to underdiagnosed\nParkinson's Disease (PD), preventing early intervention. Existing AI-based PD\ndetection methods primarily focus on unimodal analysis of motor or speech\ntasks, overlooking the multifaceted nature of the disease. To address this, we\nintroduce a large-scale, multi-task video dataset consisting of 1102 sessions\n(each containing videos of finger tapping, facial expression, and speech tasks\ncaptured via webcam) from 845 participants (272 with PD). We propose a novel\nUncertainty-calibrated Fusion Network (UFNet) that leverages this multimodal\ndata to enhance diagnostic accuracy. 
UFNet employs independent task-specific\nnetworks, trained with Monte Carlo Dropout for uncertainty quantification,\nfollowed by self-attended fusion of features, with attention weights\ndynamically adjusted based on task-specific uncertainties. To ensure\npatient-centered evaluation, the participants were randomly split into three\nsets: 60% for training, 20% for model selection, and 20% for final performance\nevaluation. UFNet significantly outperformed single-task models in terms of\naccuracy, area under the ROC curve (AUROC), and sensitivity while maintaining\nnon-inferior specificity. Withholding uncertain predictions further boosted the\nperformance, achieving 88.0+-0.3% accuracy, 93.0+-0.2% AUROC, 79.3+-0.9%\nsensitivity, and 92.6+-0.3% specificity, at the expense of not being able to\npredict for 2.3+-0.3% data (+- denotes 95% confidence interval). Further\nanalysis suggests that the trained model does not exhibit any detectable bias\nacross sex and ethnic subgroups and is most effective for individuals aged\nbetween 50 and 80. Requiring only a webcam and microphone, our approach\nfacilitates accessible home-based PD screening, especially in regions with\nlimited healthcare resources.\n","authors":["Md Saiful Islam","Tariq Adnan","Jan Freyberg","Sangwu Lee","Abdelrahman Abdelkader","Meghan Pawlik","Cathe Schwartz","Karen Jaffe","Ruth B. Schneider","E Ray Dorsey","Ehsan Hoque"],"pdf_url":"https://arxiv.org/pdf/2406.14856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17932v1","updated":"2024-10-23T14:54:48Z","published":"2024-10-23T14:54:48Z","title":"VR-Splatting: Foveated Radiance Field Rendering via 3D Gaussian\n Splatting and Neural Points","summary":" Recent advances in novel view synthesis (NVS), particularly neural radiance\nfields (NeRF) and Gaussian splatting (3DGS), have demonstrated impressive\nresults in photorealistic scene rendering. These techniques hold great\npotential for applications in virtual tourism and teleportation, where\nimmersive realism is crucial. However, the high-performance demands of virtual\nreality (VR) systems present challenges in directly utilizing even such\nfast-to-render scene representations like 3DGS due to latency and computational\nconstraints.\n In this paper, we propose foveated rendering as a promising solution to these\nobstacles. We analyze state-of-the-art NVS methods with respect to their\nrendering performance and compatibility with the human visual system. Our\napproach introduces a novel foveated rendering approach for Virtual Reality,\nthat leverages the sharp, detailed output of neural point rendering for the\nfoveal region, fused with a smooth rendering of 3DGS for the peripheral vision.\n Our evaluation confirms that perceived sharpness and detail-richness are\nincreased by our approach compared to a standard VR-ready 3DGS configuration.\nOur system meets the necessary performance requirements for real-time VR\ninteractions, ultimately enhancing the user's immersive experience.\n Project page: https://lfranke.github.io/vr_splatting\n","authors":["Linus Franke","Laura Fink","Marc Stamminger"],"pdf_url":"https://arxiv.org/pdf/2410.17932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02240v4","updated":"2024-10-23T14:53:38Z","published":"2024-10-03T06:25:53Z","title":"SCA: Highly Efficient Semantic-Consistent Unrestricted Adversarial\n Attack","summary":" Deep neural network based systems deployed in sensitive environments are\nvulnerable to adversarial attacks. 
Unrestricted adversarial attacks typically\nmanipulate the semantic content of an image (e.g., color or texture) to create\nadversarial examples that are both effective and photorealistic. Recent works\nhave utilized the diffusion inversion process to map images into a latent\nspace, where high-level semantics are manipulated by introducing perturbations.\nHowever, they often result in substantial semantic distortions in the denoised\noutput and suffer from low efficiency. In this study, we propose a novel\nframework called Semantic-Consistent Unrestricted Adversarial Attacks (SCA),\nwhich employs an inversion method to extract edit-friendly noise maps and\nutilizes a Multimodal Large Language Model (MLLM) to provide semantic guidance\nthroughout the process. Under the condition of rich semantic information\nprovided by MLLM, we perform the DDPM denoising process of each step using a\nseries of edit-friendly noise maps, and leverage DPM Solver++ to accelerate\nthis process, enabling efficient sampling with semantic consistency. Compared\nto existing methods, our framework enables the efficient generation of\nadversarial examples that exhibit minimal discernible semantic changes.\nConsequently, we for the first time introduce Semantic-Consistent Adversarial\nExamples (SCAE). Extensive experiments and visualizations have demonstrated the\nhigh efficiency of SCA, particularly in being on average 12 times faster than\nthe state-of-the-art attacks. Our research can further draw attention to the\nsecurity of multimedia information.\n","authors":["Zihao Pan","Weibin Wu","Yuhang Cao","Zibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2410.02240v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08401v3","updated":"2024-10-23T14:48:44Z","published":"2024-04-12T11:15:15Z","title":"PnLCalib: Sports Field Registration via Points and Lines Optimization","summary":" Camera calibration in broadcast sports videos presents numerous challenges\nfor accurate sports field registration due to multiple camera angles, varying\ncamera parameters, and frequent occlusions of the field. Traditional\nsearch-based methods depend on initial camera pose estimates, which can\nstruggle in non-standard positions and dynamic environments. In response, we\npropose an optimization-based calibration pipeline that leverages a 3D soccer\nfield model and a predefined set of keypoints to overcome these limitations.\nOur method also introduces a novel refinement module that improves initial\ncalibration by using detected field lines in a non-linear optimization process.\nThis approach outperforms existing techniques in both multi-view and\nsingle-view 3D camera calibration tasks, while maintaining competitive\nperformance in homography estimation. Extensive experimentation on real-world\nsoccer datasets, including SoccerNet-Calibration, WorldCup 2014, and\nTS-WorldCup, highlights the robustness and accuracy of our method across\ndiverse broadcast scenarios. 
Our approach offers significant improvements in\ncamera calibration precision and reliability.\n","authors":["Marc Gutiérrez-Pérez","Antonio Agudo"],"pdf_url":"https://arxiv.org/pdf/2404.08401v3.pdf","comment":"Extended version of \"No Bells, Just Whistles: Sports Field\n Registration Leveraging Geometric Properties\""},{"id":"http://arxiv.org/abs/2312.05349v3","updated":"2024-10-23T14:47:10Z","published":"2023-12-08T20:12:26Z","title":"PixLore: A Dataset-driven Approach to Rich Image Captioning","summary":" In the domain of vision-language integration, generating detailed image\ncaptions poses a significant challenge due to the lack of curated and rich\ndatasets. This study introduces PixLore, a novel method that leverages Querying\nTransformers through the fine-tuning of the BLIP-2 model using the LoRa method\non a standard commercial GPU. The followed approach, which involves training on\na carefully assembled dataset from state-of-the-art Computer Vision models\ncombined and augmented by ChatGPT, addresses the question of whether intricate\nimage understanding can be achieved with an ensemble of smaller-scale models,\nreferred to as Knowledge Stitching. Comparative evaluations against major\nmodels such as GPT-4 and Google Bard demonstrate that PixLore-2.7B, despite\nhaving considerably fewer parameters, is rated higher than the existing\nState-of-the-Art models in over half of the assessments. Precisely, PixLore\noutperform Bard and BLIP-2, which score approximately 35.18% and 27.98% lower\nthan PixLore in the task of image captioning. This research not only presents a\ngroundbreaking approach but also highlights the importance of well-curated\ndatasets in enhancing the performance of smaller models.\n","authors":["Diego Bonilla-Salvador","Marcelino Martínez-Sober","Joan Vila-Francés","Antonio José Serrano-López","Pablo Rodríguez-Belenguer","Fernando Mateo"],"pdf_url":"https://arxiv.org/pdf/2312.05349v3.pdf","comment":"Paper in preprint pending of publication"},{"id":"http://arxiv.org/abs/2402.17307v2","updated":"2024-10-23T14:42:07Z","published":"2024-02-27T08:31:39Z","title":"Denoising Diffusion Models for Inpainting of Healthy Brain Tissue","summary":" This paper is a contribution to the \"BraTS 2023 Local Synthesis of Healthy\nBrain Tissue via Inpainting Challenge\". The task of this challenge is to\ntransform tumor tissue into healthy tissue in brain magnetic resonance (MR)\nimages. This idea originates from the problem that MR images can be evaluated\nusing automatic processing tools, however, many of these tools are optimized\nfor the analysis of healthy tissue. By solving the given inpainting task, we\nenable the automatic analysis of images featuring lesions, and further\ndownstream tasks. Our approach builds on denoising diffusion probabilistic\nmodels. We use a 2D model that is trained using slices in which healthy tissue\nwas cropped out and is learned to be inpainted again. This allows us to use the\nground truth healthy tissue during training. In the sampling stage, we replace\nthe slices containing diseased tissue in the original 3D volume with the slices\ncontaining the healthy tissue inpainting. With our approach, we achieve\ncomparable results to the competing methods. On the validation set our model\nachieves a mean SSIM of 0.7804, a PSNR of 20.3525 and a MSE of 0.0113. 
In\nfuture we plan to extend our 2D model to a 3D model, allowing to inpaint the\nregion of interest as a whole without losing context information of neighboring\nslices.\n","authors":["Alicia Durrer","Philippe C. Cattin","Julia Wolleb"],"pdf_url":"https://arxiv.org/pdf/2402.17307v2.pdf","comment":"12 pages, 5 figures, MICCAI challenge submission"},{"id":"http://arxiv.org/abs/2410.17920v1","updated":"2024-10-23T14:38:57Z","published":"2024-10-23T14:38:57Z","title":"Gaze-Assisted Medical Image Segmentation","summary":" The annotation of patient organs is a crucial part of various diagnostic and\ntreatment procedures, such as radiotherapy planning. Manual annotation is\nextremely time-consuming, while its automation using modern image analysis\ntechniques has not yet reached levels sufficient for clinical adoption. This\npaper investigates the idea of semi-supervised medical image segmentation using\nhuman gaze as interactive input for segmentation correction. In particular, we\nfine-tuned the Segment Anything Model in Medical Images (MedSAM), a public\nsolution that uses various prompt types as additional input for semi-automated\nsegmentation correction. We used human gaze data from reading abdominal images\nas a prompt for fine-tuning MedSAM. The model was validated on a public WORD\ndatabase, which consists of 120 CT scans of 16 abdominal organs. The results of\nthe gaze-assisted MedSAM were shown to be superior to the results of the\nstate-of-the-art segmentation models. In particular, the average Dice\ncoefficient for 16 abdominal organs was 85.8%, 86.7%, 81.7%, and 90.5% for\nnnUNetV2, ResUNet, original MedSAM, and our gaze-assisted MedSAM model,\nrespectively.\n","authors":["Leila Khaertdinova","Ilya Pershin","Tatiana Shmykova","Bulat Ibragimov"],"pdf_url":"https://arxiv.org/pdf/2410.17920v1.pdf","comment":"16 pages, 4 figures, Accepted to AIM-FM Workshop @ NeurIPS'24"},{"id":"http://arxiv.org/abs/2410.17918v1","updated":"2024-10-23T14:34:39Z","published":"2024-10-23T14:34:39Z","title":"Addressing Asynchronicity in Clinical Multimodal Fusion via\n Individualized Chest X-ray Generation","summary":" Integrating multi-modal clinical data, such as electronic health records\n(EHR) and chest X-ray images (CXR), is particularly beneficial for clinical\nprediction tasks. However, in a temporal setting, multi-modal data are often\ninherently asynchronous. EHR can be continuously collected but CXR is generally\ntaken with a much longer interval due to its high cost and radiation dose. When\nclinical prediction is needed, the last available CXR image might have been\noutdated, leading to suboptimal predictions. To address this challenge, we\npropose DDL-CXR, a method that dynamically generates an up-to-date latent\nrepresentation of the individualized CXR images. Our approach leverages latent\ndiffusion models for patient-specific generation strategically conditioned on a\nprevious CXR image and EHR time series, providing information regarding\nanatomical structures and disease progressions, respectively. In this way, the\ninteraction across modalities could be better captured by the latent CXR\ngeneration process, ultimately improving the prediction performance.\nExperiments using MIMIC datasets show that the proposed model could effectively\naddress asynchronicity in multimodal fusion and consistently outperform\nexisting methods.\n","authors":["Wenfang Yao","Chen Liu","Kejing Yin","William K. 
Cheung","Jing Qin"],"pdf_url":"https://arxiv.org/pdf/2410.17918v1.pdf","comment":"Accepted by NeurIPS-24"},{"id":"http://arxiv.org/abs/2403.08511v3","updated":"2024-10-23T14:21:40Z","published":"2024-03-13T13:16:26Z","title":"A Multimodal Fusion Network For Student Emotion Recognition Based on\n Transformer and Tensor Product","summary":" This paper introduces a new multi-modal model based on the Transformer\narchitecture and tensor product fusion strategy, combining BERT's text vectors\nand ViT's image vectors to classify students' psychological conditions, with an\naccuracy of 93.65%. The purpose of the study is to accurately analyze the\nmental health status of students from various data sources. This paper\ndiscusses modal fusion methods, including early, late and intermediate fusion,\nto overcome the challenges of integrating multi-modal information. Ablation\nstudies compare the performance of different models and fusion techniques,\nshowing that the proposed model outperforms existing methods such as CLIP and\nViLBERT in terms of accuracy and inference speed. Conclusions indicate that\nwhile this model has significant advantages in emotion recognition, its\npotential to incorporate other data modalities provides areas for future\nresearch.\n","authors":["Ao Xiang","Zongqing Qi","Han Wang","Qin Yang","Danqing Ma"],"pdf_url":"https://arxiv.org/pdf/2403.08511v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19472v2","updated":"2024-10-23T14:02:12Z","published":"2024-09-28T22:41:49Z","title":"Towards Croppable Implicit Neural Representations","summary":" Implicit Neural Representations (INRs) have peaked interest in recent years\ndue to their ability to encode natural signals using neural networks. While\nINRs allow for useful applications such as interpolating new coordinates and\nsignal compression, their black-box nature makes it difficult to modify them\npost-training. In this paper we explore the idea of editable INRs, and\nspecifically focus on the widely used cropping operation. To this end, we\npresent Local-Global SIRENs -- a novel INR architecture that supports cropping\nby design. Local-Global SIRENs are based on combining local and global feature\nextraction for signal encoding. What makes their design unique is the ability\nto effortlessly remove specific portions of an encoded signal, with a\nproportional weight decrease. This is achieved by eliminating the corresponding\nweights from the network, without the need for retraining. We further show how\nthis architecture can be used to support the straightforward extension of\npreviously encoded signals. Beyond signal editing, we examine how the\nLocal-Global approach can accelerate training, enhance encoding of various\nsignals, improve downstream performance, and be applied to modern INRs such as\nINCODE, highlighting its potential and flexibility. Code is available at\nhttps://github.com/maorash/Local-Global-INRs.\n","authors":["Maor Ashkenazi","Eran Treister"],"pdf_url":"https://arxiv.org/pdf/2409.19472v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.06927v2","updated":"2024-10-23T14:01:27Z","published":"2024-08-13T14:29:00Z","title":"Breaking Class Barriers: Efficient Dataset Distillation via Inter-Class\n Feature Compensator","summary":" Dataset distillation has emerged as a technique aiming to condense\ninformative features from large, natural datasets into a compact and synthetic\nform. 
While recent advancements have refined this technique, its performance is\nbottlenecked by the prevailing class-specific synthesis paradigm. Under this\nparadigm, synthetic data is optimized exclusively for a pre-assigned one-hot\nlabel, creating an implicit class barrier in feature condensation. This leads\nto inefficient utilization of the distillation budget and oversight of\ninter-class feature distributions, which ultimately limits the effectiveness\nand efficiency, as demonstrated in our analysis. To overcome these constraints,\nthis paper presents the Inter-class Feature Compensator (INFER), an innovative\ndistillation approach that transcends the class-specific data-label framework\nwidely utilized in current dataset distillation methods. Specifically, INFER\nleverages a Universal Feature Compensator (UFC) to enhance feature integration\nacross classes, enabling the generation of multiple additional synthetic\ninstances from a single UFC input. This significantly improves the efficiency\nof the distillation budget. Moreover, INFER enriches inter-class interactions\nduring the distillation, thereby enhancing the effectiveness and\ngeneralizability of the distilled data. By allowing for the linear\ninterpolation of labels similar to those in the original dataset, INFER\nmeticulously optimizes the synthetic data and dramatically reduces the size of\nsoft labels in the synthetic dataset to almost zero, establishing a new\nbenchmark for efficiency and effectiveness in dataset distillation.\n","authors":["Xin Zhang","Jiawei Du","Ping Liu","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.06927v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17885v1","updated":"2024-10-23T13:58:39Z","published":"2024-10-23T13:58:39Z","title":"R-CoT: Reverse Chain-of-Thought Problem Generation for Geometric\n Reasoning in Large Multimodal Models","summary":" Existing Large Multimodal Models (LMMs) struggle with mathematical geometric\nreasoning due to a lack of high-quality image-text paired data. Current\ngeometric data generation approaches, which apply preset templates to generate\ngeometric data or use Large Language Models (LLMs) to rephrase questions and\nanswers (Q&A), unavoidably limit data accuracy and diversity. To synthesize\nhigher-quality data, we propose a two-stage Reverse Chain-of-Thought (R-CoT)\ngeometry problem generation pipeline. First, we introduce GeoChain to produce\nhigh-fidelity geometric images and corresponding descriptions highlighting\nrelations among geometric elements. We then design a Reverse A&Q method that\nreasons step-by-step based on the descriptions and generates questions in\nreverse from the reasoning results. Experiments demonstrate that the proposed\nmethod brings significant and consistent improvements on multiple LMM\nbaselines, achieving new performance records in the 2B, 7B, and 8B settings.\nNotably, R-CoT-8B significantly outperforms previous state-of-the-art\nopen-source mathematical models by 16.6% on MathVista and 9.2% on GeoQA, while\nalso surpassing the closed-source model GPT-4o by an average of 13% across both\ndatasets. 
The code is available at https://github.com/dle666/R-CoT.\n","authors":["Linger Deng","Yuliang Liu","Bohan Li","Dongliang Luo","Liang Wu","Chengquan Zhang","Pengyuan Lyu","Ziyang Zhang","Gang Zhang","Errui Ding","Yingying Zhu","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2410.17885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17880v1","updated":"2024-10-23T13:52:22Z","published":"2024-10-23T13:52:22Z","title":"A utility-based spatial analysis of residential street-level conditions;\n A case study of Rotterdam","summary":" Residential location choices are traditionally modelled using factors related\nto accessibility and socioeconomic environments, neglecting the importance of\nlocal street-level conditions. Arguably, this neglect is due to data practices.\nToday, however, street-level images -- which are highly effective at encoding\nstreet-level conditions -- are widely available. Additionally, recent advances\nin discrete choice models incorporating computer vision capabilities offer\nopportunities to integrate street-level conditions into residential location\nchoice analysis. This study leverages these developments to investigate the\nspatial distribution of utility derived from street-level conditions in\nresidential location choices on a city-wide scale. In our case study of\nRotterdam, the Netherlands, we find that the utility derived from street-level\nconditions varies significantly on a highly localised scale, with conditions\nrapidly changing even within neighbourhoods. Our results also reveal that the\nhigh real-estate prices in the city centre cannot be attributed to attractive\nstreet-level conditions. Furthermore, whereas the city centre is characterised\nby relatively unattractive residential street-level conditions, neighbourhoods\nin the southern part of the city -- often perceived as problematic -- exhibit\nsurprisingly appealing street-level environments. The methodological\ncontribution of this paper is that it advances the discrete choice models\nincorporating computer vision capabilities by introducing a semantic\nregularisation layer to the model. Thereby, it adds explainability and\neliminates the need for a separate pipeline to extract information from images,\nstreamlining the analysis. As such, this paper's findings and methodological\nadvancements pave the way for further studies to explore integrating\nstreet-level conditions in urban planning.\n","authors":["Sander van Cranenburgh","Francisco Garrido-Valenzuela"],"pdf_url":"https://arxiv.org/pdf/2410.17880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17863v1","updated":"2024-10-23T13:35:18Z","published":"2024-10-23T13:35:18Z","title":"CASCRNet: An Atrous Spatial Pyramid Pooling and Shared Channel Residual\n based Network for Capsule Endoscopy","summary":" This manuscript summarizes work on the Capsule Vision Challenge 2024 by\nMISAHUB. To address the multi-class disease classification task, which is\nchallenging due to the complexity and imbalance in the Capsule Vision challenge\ndataset, this paper proposes CASCRNet (Capsule endoscopy-Aspp-SCR-Network), a\nparameter-efficient and novel model that uses Shared Channel Residual (SCR)\nblocks and Atrous Spatial Pyramid Pooling (ASPP) blocks. Further, the\nperformance of the proposed model is compared with other well-known approaches.\nThe experimental results yield that proposed model provides better disease\nclassification results. 
The proposed model was successful in classifying\ndiseases with an F1 Score of 78.5% and a Mean AUC of 98.3%, which is promising\ngiven its compact architecture.\n","authors":["K V Srinanda","M Manvith Prabhu","Shyam Lal"],"pdf_url":"https://arxiv.org/pdf/2410.17863v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2410.17858v1","updated":"2024-10-23T13:29:02Z","published":"2024-10-23T13:29:02Z","title":"Blendify -- Python rendering framework for Blender","summary":" With the rapid growth of the volume of research fields like computer vision\nand computer graphics, researchers require effective and user-friendly\nrendering tools to visualize results. While advanced tools like Blender offer\npowerful capabilities, they also require a significant effort to master. This\ntechnical report introduces Blendify, a lightweight Python-based framework that\nseamlessly integrates with Blender, providing a high-level API for scene\ncreation and rendering. Blendify reduces the complexity of working with\nBlender's native API by automating object creation, handling the colors and\nmaterial linking, and implementing features such as shadow-catcher objects\nwhile maintaining support for high-quality ray-tracing rendering output. With a\nfocus on usability Blendify enables efficient and flexible rendering workflow\nfor rendering in common computer vision and computer graphics use cases. The\ncode is available at https://github.com/ptrvilya/blendify\n","authors":["Vladimir Guzov","Ilya A. Petrov","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2410.17858v1.pdf","comment":"Project page: https://virtualhumans.mpi-inf.mpg.de/blendify/"},{"id":"http://arxiv.org/abs/2410.17856v1","updated":"2024-10-23T13:26:59Z","published":"2024-10-23T13:26:59Z","title":"ROCKET-1: Master Open-World Interaction with Visual-Temporal Context\n Prompting","summary":" Vision-language models (VLMs) have excelled in multimodal tasks, but adapting\nthem to embodied decision-making in open-world environments presents\nchallenges. A key issue is the difficulty in smoothly connecting individual\nentities in low-level observations with abstract concepts required for\nplanning. A common approach to address this problem is through the use of\nhierarchical agents, where VLMs serve as high-level reasoners that break down\ntasks into executable sub-tasks, typically specified using language and\nimagined observations. However, language often fails to effectively convey\nspatial information, while generating future images with sufficient accuracy\nremains challenging. To address these limitations, we propose visual-temporal\ncontext prompting, a novel communication protocol between VLMs and policy\nmodels. This protocol leverages object segmentation from both past and present\nobservations to guide policy-environment interactions. Using this approach, we\ntrain ROCKET-1, a low-level policy that predicts actions based on concatenated\nvisual observations and segmentation masks, with real-time object tracking\nprovided by SAM-2. Our method unlocks the full potential of VLMs\nvisual-language reasoning abilities, enabling them to solve complex creative\ntasks, especially those heavily reliant on spatial understanding. Experiments\nin Minecraft demonstrate that our approach allows agents to accomplish\npreviously unattainable tasks, highlighting the effectiveness of\nvisual-temporal context prompting in embodied decision-making. 
Codes and demos\nwill be available on the project page: https://craftjarvis.github.io/ROCKET-1.\n","authors":["Shaofei Cai","Zihao Wang","Kewei Lian","Zhancun Mu","Xiaojian Ma","Anji Liu","Yitao Liang"],"pdf_url":"https://arxiv.org/pdf/2410.17856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17855v1","updated":"2024-10-23T13:26:19Z","published":"2024-10-23T13:26:19Z","title":"TAGE: Trustworthy Attribute Group Editing for Stable Few-shot Image\n Generation","summary":" Generative Adversarial Networks (GANs) have emerged as a prominent research\nfocus for image editing tasks, leveraging the powerful image generation\ncapabilities of the GAN framework to produce remarkable results. However,\nprevailing approaches are contingent upon extensive training datasets and\nexplicit supervision, presenting a significant challenge in manipulating the\ndiverse attributes of new image classes with limited sample availability. To\nsurmount this hurdle, we introduce TAGE, an innovative image generation network\ncomprising three integral modules: the Codebook Learning Module (CLM), the Code\nPrediction Module (CPM) and the Prompt-driven Semantic Module (PSM). The CPM\nmodule delves into the semantic dimensions of category-agnostic attributes,\nencapsulating them within a discrete codebook. This module is predicated on the\nconcept that images are assemblages of attributes, and thus, by editing these\ncategory-independent attributes, it is theoretically possible to generate\nimages from unseen categories. Subsequently, the CPM module facilitates\nnaturalistic image editing by predicting indices of category-independent\nattribute vectors within the codebook. Additionally, the PSM module generates\nsemantic cues that are seamlessly integrated into the Transformer architecture\nof the CPM, enhancing the model's comprehension of the targeted attributes for\nediting. With these semantic cues, the model can generate images that\naccentuate desired attributes more prominently while maintaining the integrity\nof the original category, even with a limited number of samples. We have\nconducted extensive experiments utilizing the Animal Faces, Flowers, and\nVGGFaces datasets. The results of these experiments demonstrate that our\nproposed method not only achieves superior performance but also exhibits a high\ndegree of stability when compared to other few-shot image generation\ntechniques.\n","authors":["Ruicheng Zhang","Guoheng Huang","Yejing Huo","Xiaochen Yuan","Zhizhen Zhou","Xuhang Chen","Guo Zhong"],"pdf_url":"https://arxiv.org/pdf/2410.17855v1.pdf","comment":"Accepted by International Conference on Signal Processing Systems\n Conference"},{"id":"http://arxiv.org/abs/2410.15613v2","updated":"2024-10-23T13:07:41Z","published":"2024-10-21T03:17:25Z","title":"Exploring Stronger Transformer Representation Learning for Occluded\n Person Re-Identification","summary":" Due to some complex factors (e.g., occlusion, pose variation and diverse\ncamera perspectives), extracting stronger feature representation in person\nre-identification remains a challenging task. In this paper, we proposed a\nnovel self-supervision and supervision combining transformer-based person\nre-identification framework, namely SSSC-TransReID. Different from the general\ntransformer-based person re-identification models, we designed a\nself-supervised contrastive learning branch, which can enhance the feature\nrepresentation for person re-identification without negative samples or\nadditional pre-training. 
In order to train the contrastive learning branch, we\nalso proposed a novel random rectangle mask strategy to simulate the occlusion\nin real scenes, so as to enhance the feature representation for occlusion.\nFinally, we utilized the joint-training loss function to integrate the\nadvantages of supervised learning with ID tags and self-supervised contrastive\nlearning without negative samples, which can reinforce the ability of our model\nto excavate stronger discriminative features, especially for occlusion.\nExtensive experimental results on several benchmark datasets show our proposed\nmodel obtains superior Re-ID performance consistently and outperforms the\nstate-of-the-art ReID methods by large margins on the mean average accuracy\n(mAP) and Rank-1 accuracy.\n","authors":["Zhangjian Ji","Donglin Cheng","Kai Feng"],"pdf_url":"https://arxiv.org/pdf/2410.15613v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17839v1","updated":"2024-10-23T13:05:26Z","published":"2024-10-23T13:05:26Z","title":"Few-shot NeRF by Adaptive Rendering Loss Regularization","summary":" Novel view synthesis with sparse inputs poses great challenges to Neural\nRadiance Field (NeRF). Recent works demonstrate that the frequency\nregularization of Positional Encoding (PE) can achieve promising results for\nfew-shot NeRF. In this work, we reveal that there exists an inconsistency\nbetween the frequency regularization of PE and rendering loss. This prevents\nfew-shot NeRF from synthesizing higher-quality novel views. To mitigate this\ninconsistency, we propose Adaptive Rendering loss regularization for few-shot\nNeRF, dubbed AR-NeRF. Specifically, we present a two-phase rendering\nsupervision and an adaptive rendering loss weight learning strategy to align\nthe frequency relationship between PE and 2D-pixel supervision. In this way,\nAR-NeRF can learn global structures better in the early training phase and\nadaptively learn local details throughout the training process. Extensive\nexperiments show that our AR-NeRF achieves state-of-the-art performance on\ndifferent datasets, including object-level and complex scenes.\n","authors":["Qingshan Xu","Xuanyu Yi","Jianyao Xu","Wenbing Tao","Yew-Soon Ong","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.17839v1.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2410.13896v2","updated":"2024-10-23T13:01:22Z","published":"2024-10-15T02:41:52Z","title":"From Real Artifacts to Virtual Reference: A Robust Framework for\n Translating Endoscopic Images","summary":" Domain adaptation, which bridges the distributions across different\nmodalities, plays a crucial role in multimodal medical image analysis. In\nendoscopic imaging, combining pre-operative data with intra-operative imaging\nis important for surgical planning and navigation. However, existing domain\nadaptation methods are hampered by distribution shift caused by in vivo\nartifacts, necessitating robust techniques for aligning noisy and artifact\nabundant patient endoscopic videos with clean virtual images reconstructed from\npre-operative tomographic data for pose estimation during intraoperative\nguidance. This paper presents an artifact-resilient image translation method\nand an associated benchmark for this purpose. The method incorporates a novel\n``local-global'' translation framework and a noise-resilient feature extraction\nstrategy. 
For the former, it decouples the image translation process into a\nlocal step for feature denoising, and a global step for global style transfer.\nFor feature extraction, a new contrastive learning strategy is proposed, which\ncan extract noise-resilient features for establishing robust correspondence\nacross domains. Detailed validation on both public and in-house clinical\ndatasets has been conducted, demonstrating significantly improved performance\ncompared to the current state-of-the-art.\n","authors":["Junyang Wu","Fangfang Xie","Jiayuan Sun","Yun Gu","Guang-Zhong Yang"],"pdf_url":"https://arxiv.org/pdf/2410.13896v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14774v2","updated":"2024-10-23T13:01:14Z","published":"2024-03-21T18:28:43Z","title":"Few-Shot Adversarial Prompt Learning on Vision-Language Models","summary":" The vulnerability of deep neural networks to imperceptible adversarial\nperturbations has attracted widespread attention. Inspired by the success of\nvision-language foundation models, previous efforts achieved zero-shot\nadversarial robustness by aligning adversarial visual features with text\nsupervision. However, in practice, they are still unsatisfactory due to several\nissues, including heavy adaptation cost, suboptimal text supervision, and\nuncontrolled natural generalization capacity. In this paper, to address these\nissues, we propose a few-shot adversarial prompt framework where adapting input\nsequences with limited data makes significant adversarial robustness\nimprovement. Specifically, we achieve this by providing adversarially\ncorrelated text supervision that is end-to-end learned from adversarial\nexamples. We also propose a novel training objective that enhances the\nconsistency of multi-modal features while encourages differentiated uni-modal\nfeatures between natural and adversarial examples. The proposed framework gives\naccess to learn adversarial text supervision, which provides superior\ncross-modal adversarial alignment and matches state-of-the-art zero-shot\nadversarial robustness with only 1% training data. Code is available at:\nhttps://github.com/lionel-w2/FAP.\n","authors":["Yiwei Zhou","Xiaobo Xia","Zhiwei Lin","Bo Han","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.14774v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.13152v3","updated":"2024-10-23T12:56:05Z","published":"2024-05-21T18:45:18Z","title":"Enhancing Interaction Modeling with Agent Selection and Physical\n Coefficient for Trajectory Prediction","summary":" A thorough understanding of the interaction between the target agent and\nsurrounding agents is a prerequisite for accurate trajectory prediction.\nAlthough many methods have been explored, they all assign correlation\ncoefficients to surrounding agents in a purely learning-based manner. In this\nstudy, we present ASPILin, which manually selects interacting agents and\ncalculates their correlations instead of attention scores. Surprisingly, these\nsimple modifications can significantly improve prediction performance and\nsubstantially reduce computational costs. Additionally, ASPILin models the\ninteracting agents at each past time step separately, rather than only modeling\nthe interacting agents at the current time step. This clarifies the causal\nchain of the target agent's historical trajectory and helps the model better\nunderstand dynamic interactions. We intentionally simplified our model in other\naspects, such as map encoding. 
Remarkably, experiments conducted on the\nINTERACTION, highD, and CitySim datasets demonstrate that our method is\nefficient and straightforward, outperforming other state-of-the-art methods.\n","authors":["Shiji Huang","Lei Ye","Min Chen","Wenhai Luo","Dihong Wang","Chenqi Xu","Deyuan Liang"],"pdf_url":"https://arxiv.org/pdf/2405.13152v3.pdf","comment":"code:https://github.com/kkk00714/ASPILin"},{"id":"http://arxiv.org/abs/2410.17832v1","updated":"2024-10-23T12:51:07Z","published":"2024-10-23T12:51:07Z","title":"Exploiting Text-Image Latent Spaces for the Description of Visual\n Concepts","summary":" Concept Activation Vectors (CAVs) offer insights into neural network\ndecision-making by linking human friendly concepts to the model's internal\nfeature extraction process. However, when a new set of CAVs is discovered, they\nmust still be translated into a human understandable description. For\nimage-based neural networks, this is typically done by visualizing the most\nrelevant images of a CAV, while the determination of the concept is left to\nhumans. In this work, we introduce an approach to aid the interpretation of\nnewly discovered concept sets by suggesting textual descriptions for each CAV.\nThis is done by mapping the most relevant images representing a CAV into a\ntext-image embedding where a joint description of these relevant images can be\ncomputed. We propose utilizing the most relevant receptive fields instead of\nfull images encoded. We demonstrate the capabilities of this approach in\nmultiple experiments with and without given CAV labels, showing that the\nproposed approach provides accurate descriptions for the CAVs and reduces the\nchallenge of concept interpretation.\n","authors":["Laines Schmalwasser","Jakob Gawlikowski","Joachim Denzler","Julia Niebling"],"pdf_url":"https://arxiv.org/pdf/2410.17832v1.pdf","comment":"19 pages, 7 figures, to be published in ICPR"},{"id":"http://arxiv.org/abs/2210.01708v4","updated":"2024-10-23T12:50:18Z","published":"2022-10-04T16:08:54Z","title":"Conquering the Communication Constraints to Enable Large Pre-Trained\n Models in Federated Learning","summary":" Federated learning (FL) has emerged as a promising paradigm for enabling the\ncollaborative training of models without centralized access to the raw data on\nlocal devices. In the typical FL paradigm (e.g., FedAvg), model weights are\nsent to and from the server each round to participating clients. Recently, the\nuse of small pre-trained models has been shown effective in federated learning\noptimization and improving convergence. However, recent state-of-the-art\npre-trained models are getting more capable but also have more parameters. In\nconventional FL, sharing the enormous model weights can quickly put a massive\ncommunication burden on the system, especially if more capable models are\nemployed. Can we find a solution to enable those strong and readily-available\npre-trained models in FL to achieve excellent performance while simultaneously\nreducing the communication burden? To this end, we investigate the use of\nparameter-efficient fine-tuning in federated learning and thus introduce a new\nframework: FedPEFT. Specifically, we systemically evaluate the performance of\nFedPEFT across a variety of client stability, data distribution, and\ndifferential privacy settings. 
By only locally tuning and globally sharing a\nsmall portion of the model weights, significant reductions in the total\ncommunication overhead can be achieved while maintaining competitive or even\nbetter performance in a wide range of federated learning scenarios, providing\ninsight into a new paradigm for practical and effective federated systems.\n","authors":["Guangyu Sun","Umar Khalid","Matias Mendieta","Taojiannan Yang","Pu Wang","Minwoo Lee","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2210.01708v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17823v1","updated":"2024-10-23T12:32:21Z","published":"2024-10-23T12:32:21Z","title":"Att2CPC: Attention-Guided Lossy Attribute Compression of Point Clouds","summary":" With the great progress of 3D sensing and acquisition technology, the volume\nof point cloud data has grown dramatically, which urges the development of\nefficient point cloud compression methods. In this paper, we focus on the task\nof learned lossy point cloud attribute compression (PCAC). We propose an\nefficient attention-based method for lossy compression of point cloud\nattributes leveraging an autoencoder architecture. Specifically, at the\nencoding side, we conduct multiple downsampling to best exploit the local\nattribute patterns, in which effective External Cross Attention (ECA) is\ndevised to hierarchically aggregate features by integrating attributes and\ngeometry contexts. At the decoding side, the attributes of the point cloud are\nprogressively reconstructed based on the multi-scale representation and the\nzero-padding upsampling tactic. To the best of our knowledge, this is the first\napproach to introduce attention mechanism to point-based lossy PCAC task. We\nverify the compression efficiency of our model on various sequences, including\nhuman body frames, sparse objects, and large-scale point cloud scenes.\nExperiments show that our method achieves an average improvement of 1.15 dB and\n2.13 dB in BD-PSNR of Y channel and YUV channel, respectively, when comparing\nwith the state-of-the-art point-based method Deep-PCAC. Codes of this paper are\navailable at https://github.com/I2-Multimedia-Lab/Att2CPC.\n","authors":["Kai Liu","Kang You","Pan Gao","Manoranjan Paul"],"pdf_url":"https://arxiv.org/pdf/2410.17823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17822v1","updated":"2024-10-23T12:32:20Z","published":"2024-10-23T12:32:20Z","title":"DREB-Net: Dual-stream Restoration Embedding Blur-feature Fusion Network\n for High-mobility UAV Object Detection","summary":" Object detection algorithms are pivotal components of unmanned aerial vehicle\n(UAV) imaging systems, extensively employed in complex fields. However, images\ncaptured by high-mobility UAVs often suffer from motion blur cases, which\nsignificantly impedes the performance of advanced object detection algorithms.\nTo address these challenges, we propose an innovative object detection\nalgorithm specifically designed for blurry images, named DREB-Net (Dual-stream\nRestoration Embedding Blur-feature Fusion Network). First, DREB-Net addresses\nthe particularities of blurry image object detection problem by incorporating a\nBlurry image Restoration Auxiliary Branch (BRAB) during the training phase.\nSecond, it fuses the extracted shallow features via Multi-level\nAttention-Guided Feature Fusion (MAGFF) module, to extract richer features.\nHere, the MAGFF module comprises local attention modules and global attention\nmodules, which assign different weights to the branches. 
Then, during the\ninference phase, the deep feature extraction of the BRAB can be removed to\nreduce computational complexity and improve detection speed. In loss function,\na combined loss of MSE and SSIM is added to the BRAB to restore blurry images.\nFinally, DREB-Net introduces Fast Fourier Transform in the early stages of\nfeature extraction, via a Learnable Frequency domain Amplitude Modulation\nModule (LFAMM), to adjust feature amplitude and enhance feature processing\ncapability. Experimental results indicate that DREB-Net can still effectively\nperform object detection tasks under motion blur in captured images, showcasing\nexcellent performance and broad application prospects. Our source code will be\navailable at https://github.com/EEIC-Lab/DREB-Net.git.\n","authors":["Qingpeng Li","Yuxin Zhang","Leyuan Fang","Yuhan Kang","Shutao Li","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.17822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17816v1","updated":"2024-10-23T12:19:12Z","published":"2024-10-23T12:19:12Z","title":"Deep Learning for Active Region Classification: A Systematic Study from\n Convolutional Neural Networks to Vision Transformers","summary":" A solar active region can significantly disrupt the Sun Earth space\nenvironment, often leading to severe space weather events such as solar flares\nand coronal mass ejections. As a consequence, the automatic classification of\nactive region groups is the crucial starting point for accurately and promptly\npredicting solar activity. This study presents our results concerned with the\napplication of deep learning techniques to the classification of active region\ncutouts based on the Mount Wilson classification scheme. Specifically, we have\nexplored the latest advancements in image classification architectures, from\nConvolutional Neural Networks to Vision Transformers, and reported on their\nperformances for the active region classification task, showing that the\ncrucial point for their effectiveness consists in a robust training process\nbased on the latest advances in the field.\n","authors":["Edoardo Legnaro","Sabrina Guastavino","Michele Piana","Anna Maria Massone"],"pdf_url":"https://arxiv.org/pdf/2410.17816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17814v1","updated":"2024-10-23T12:18:36Z","published":"2024-10-23T12:18:36Z","title":"Learning Lossless Compression for High Bit-Depth Volumetric Medical\n Image","summary":" Recent advances in learning-based methods have markedly enhanced the\ncapabilities of image compression. However, these methods struggle with high\nbit-depth volumetric medical images, facing issues such as degraded\nperformance, increased memory demand, and reduced processing speed. To address\nthese challenges, this paper presents the Bit-Division based Lossless\nVolumetric Image Compression (BD-LVIC) framework, which is tailored for high\nbit-depth medical volume compression. The BD-LVIC framework skillfully divides\nthe high bit-depth volume into two lower bit-depth segments: the Most\nSignificant Bit-Volume (MSBV) and the Least Significant Bit-Volume (LSBV). The\nMSBV concentrates on the most significant bits of the volumetric medical image,\ncapturing vital structural details in a compact manner. This reduction in\ncomplexity greatly improves compression efficiency using traditional codecs.\nConversely, the LSBV deals with the least significant bits, which encapsulate\nintricate texture details. 
To compress this detailed information effectively,\nwe introduce an effective learning-based compression model equipped with a\nTransformer-Based Feature Alignment Module, which exploits both intra-slice and\ninter-slice redundancies to accurately align features. Subsequently, a Parallel\nAutoregressive Coding Module merges these features to precisely estimate the\nprobability distribution of the least significant bit-planes. Our extensive\ntesting demonstrates that the BD-LVIC framework not only sets new performance\nbenchmarks across various datasets but also maintains a competitive coding\nspeed, highlighting its significant potential and practical utility in the\nrealm of volumetric medical image compression.\n","authors":["Kai Wang","Yuanchao Bai","Daxin Li","Deming Zhai","Junjun Jiang","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2410.17814v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2410.17812v1","updated":"2024-10-23T12:17:03Z","published":"2024-10-23T12:17:03Z","title":"PGDiffSeg: Prior-Guided Denoising Diffusion Model with Parameter-Shared\n Attention for Breast Cancer Segmentation","summary":" Early detection through imaging and accurate diagnosis is crucial in\nmitigating the high mortality rate associated with breast cancer. However,\nlocating tumors from low-resolution and high-noise medical images is extremely\nchallenging. Therefore, this paper proposes a novel PGDiffSeg (Prior-Guided\nDiffusion Denoising Model with Parameter-Shared Attention) that applies\ndiffusion denoising methods to breast cancer medical image segmentation,\naccurately recovering the affected areas from Gaussian noise. Firstly, we\ndesign a parallel pipeline for noise processing and semantic information\nprocessing and propose a parameter-shared attention module (PSA) in multi-layer\nthat seamlessly integrates these two pipelines. This integration empowers\nPGDiffSeg to incorporate semantic details at multiple levels during the\ndenoising process, producing highly accurate segmentation maps. Secondly, we\nintroduce a guided strategy that leverages prior knowledge to simulate the\ndecision-making process of medical professionals, thereby enhancing the model's\nability to locate tumor positions precisely. Finally, we provide the first-ever\ndiscussion on the interpretability of the generative diffusion model in the\ncontext of breast cancer segmentation. Extensive experiments have demonstrated\nthe superiority of our model over the current state-of-the-art approaches,\nconfirming its effectiveness as a flexible diffusion denoising method suitable\nfor medical image research. Our code will be publicly available later.\n","authors":["Feiyan Feng","Tianyu Liu","Hong Wang","Jun Zhao","Wei Li","Yanshen Sun"],"pdf_url":"https://arxiv.org/pdf/2410.17812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17810v1","updated":"2024-10-23T12:12:56Z","published":"2024-10-23T12:12:56Z","title":"EntityCLIP: Entity-Centric Image-Text Matching via Multimodal Attentive\n Contrastive Learning","summary":" Recent advancements in image-text matching have been notable, yet prevailing\nmodels predominantly cater to broad queries and struggle with accommodating\nfine-grained query intention. In this paper, we work towards the\n\\textbf{E}ntity-centric \\textbf{I}mage-\\textbf{T}ext \\textbf{M}atching (EITM),\na task that the text and image involve specific entity-related information. 
The\nchallenge of this task mainly lies in the larger semantic gap in entity\nassociation modeling, compared with the general image-text matching problem. To\nnarrow the huge semantic gap between the entity-centric text and the images, we\ntake the fundamental CLIP as the backbone and devise a multimodal attentive\ncontrastive learning framework to tame CLIP to adapt to the EITM problem, developing a\nmodel named EntityCLIP. The key to our multimodal attentive contrastive\nlearning is to generate interpretive explanation text using Large Language\nModels (LLMs) as bridge clues. Specifically, we proceed by extracting\nexplanatory text from off-the-shelf LLMs. This explanation text, coupled with\nthe image and text, is then input into our specially crafted Multimodal\nAttentive Experts (MMAE) module, which effectively integrates explanation texts\nto narrow the gap between the entity-related text and image in a shared semantic\nspace. Building on the enriched features derived from MMAE, we further design\nan effective Gated Integrative Image-text Matching (GI-ITM) strategy. The\nGI-ITM employs an adaptive gating mechanism to aggregate MMAE's features,\nsubsequently applying image-text matching constraints to steer the alignment\nbetween the text and the image. Extensive experiments are conducted on three\nsocial media news benchmarks including N24News, VisualNews, and GoodNews, and the\nresults show that our method surpasses competing methods by a clear\nmargin.\n","authors":["Yaxiong Wang","Yaxiong Wang","Lianwei Wu","Lechao Cheng","Zhun Zhong","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2410.17810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17809v1","updated":"2024-10-23T12:11:26Z","published":"2024-10-23T12:11:26Z","title":"An Intelligent Agentic System for Complex Image Restoration Problems","summary":" Real-world image restoration (IR) is inherently complex and often requires\ncombining multiple specialized models to address diverse degradations. Inspired\nby human problem-solving, we propose AgenticIR, an agentic system that mimics\nthe human approach to image processing by following five key stages:\nPerception, Scheduling, Execution, Reflection, and Rescheduling. AgenticIR\nleverages large language models (LLMs) and vision-language models (VLMs) that\ninteract via text generation to dynamically operate a toolbox of IR models. We\nfine-tune VLMs for image quality analysis and employ LLMs for reasoning,\nguiding the system step by step. To compensate for LLMs' lack of specific IR\nknowledge and experience, we introduce a self-exploration method, allowing the\nLLM to observe and summarize restoration results into referenceable documents.\nExperiments demonstrate AgenticIR's potential in handling complex IR tasks,\nrepresenting a promising path toward achieving general intelligence in visual\nprocessing.\n","authors":["Kaiwen Zhu","Jinjin Gu","Zhiyuan You","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2410.17809v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17802v1","updated":"2024-10-23T11:59:49Z","published":"2024-10-23T11:59:49Z","title":"GenUDC: High Quality 3D Mesh Generation with Unsigned Dual Contouring\n Representation","summary":" Generating high-quality meshes with complex structures and realistic surfaces\nis the primary goal of 3D generative models. Existing methods typically employ\nsequence data or deformable tetrahedral grids for mesh generation.
However,\nsequence-based methods have difficulty producing complex structures with many\nfaces due to memory limits. The deformable tetrahedral grid-based method\nMeshDiffusion fails to recover realistic surfaces due to the inherent ambiguity\nin deformable grids. We propose the GenUDC framework to address these\nchallenges by leveraging the Unsigned Dual Contouring (UDC) as the mesh\nrepresentation. UDC discretizes a mesh in a regular grid and divides it into\nthe face and vertex parts, recovering both complex structures and fine details.\nAs a result, the one-to-one mapping between UDC and mesh resolves the ambiguity\nproblem. In addition, GenUDC adopts a two-stage, coarse-to-fine generative\nprocess for 3D mesh generation. It first generates the face part as a rough\nshape and then the vertex part to craft a detailed shape. Extensive evaluations\ndemonstrate the superiority of UDC as a mesh representation and the favorable\nperformance of GenUDC in mesh generation. The code and trained models are\navailable at https://github.com/TrepangCat/GenUDC.\n","authors":["Ruowei Wang","Jiaqi Li","Dan Zeng","Xueqi Ma","Zixiang Xu","Jianwei Zhang","Qijun Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.17802v1.pdf","comment":"ACMMM 2024, code:https://github.com/TrepangCat/GenUDC"},{"id":"http://arxiv.org/abs/2309.16515v3","updated":"2024-10-23T11:56:02Z","published":"2023-09-28T15:22:02Z","title":"Latent Noise Segmentation: How Neural Noise Leads to the Emergence of\n Segmentation and Grouping","summary":" Humans are able to segment images effortlessly without supervision using\nperceptual grouping. Here, we propose a counter-intuitive computational\napproach to solving unsupervised perceptual grouping and segmentation: that\nthey arise because of neural noise, rather than in spite of it. We (1)\nmathematically demonstrate that under realistic assumptions, neural noise can\nbe used to separate objects from each other; (2) that adding noise in a DNN\nenables the network to segment images even though it was never trained on any\nsegmentation labels; and (3) that segmenting objects using noise results in\nsegmentation performance that aligns with the perceptual grouping phenomena\nobserved in humans, and is sample-efficient. We introduce the Good Gestalt (GG)\ndatasets -- six datasets designed to specifically test perceptual grouping, and\nshow that our DNN models reproduce many important phenomena in human\nperception, such as illusory contours, closure, continuity, proximity, and\nocclusion. Finally, we (4) show that our model improves performance on our GG\ndatasets compared to other tested unsupervised models by $24.9\\%$. Together,\nour results suggest a novel unsupervised segmentation method requiring few\nassumptions, a new explanation for the formation of perceptual grouping, and a\nnovel potential benefit of neural noise.\n","authors":["Ben Lonnqvist","Zhengqing Wu","Michael H. Herzog"],"pdf_url":"https://arxiv.org/pdf/2309.16515v3.pdf","comment":"ICML 2024 camera ready version"},{"id":"http://arxiv.org/abs/2410.17785v1","updated":"2024-10-23T11:35:44Z","published":"2024-10-23T11:35:44Z","title":"TranSPORTmer: A Holistic Approach to Trajectory Understanding in\n Multi-Agent Sports","summary":" Understanding trajectories in multi-agent scenarios requires addressing\nvarious tasks, including predicting future movements, imputing missing\nobservations, inferring the status of unseen agents, and classifying different\nglobal states. 
Traditional data-driven approaches often handle these tasks\nseparately with specialized models. We introduce TranSPORTmer, a unified\ntransformer-based framework capable of addressing all these tasks, showcasing\nits application to the intricate dynamics of multi-agent sports scenarios like\nsoccer and basketball. Using Set Attention Blocks, TranSPORTmer effectively\ncaptures temporal dynamics and social interactions in an equivariant manner.\nThe model's tasks are guided by an input mask that conceals missing or\nyet-to-be-predicted observations. Additionally, we introduce a CLS extra agent\nto classify states along soccer trajectories, including passes, possessions,\nuncontrolled states, and out-of-play intervals, contributing to an enhancement\nin modeling trajectories. Evaluations on soccer and basketball datasets show\nthat TranSPORTmer outperforms state-of-the-art task-specific models in player\nforecasting, player forecasting-imputation, ball inference, and ball\nimputation. https://youtu.be/8VtSRm8oGoE\n","authors":["Guillem Capellera","Luis Ferraz","Antonio Rubio","Antonio Agudo","Francesc Moreno-Noguer"],"pdf_url":"https://arxiv.org/pdf/2410.17785v1.pdf","comment":"Accepted to ACCV 2024"},{"id":"http://arxiv.org/abs/2410.17779v1","updated":"2024-10-23T11:31:06Z","published":"2024-10-23T11:31:06Z","title":"ADEM-VL: Adaptive and Embedded Fusion for Efficient Vision-Language\n Tuning","summary":" Recent advancements in multimodal fusion have witnessed the remarkable\nsuccess of vision-language (VL) models, which excel in various multimodal\napplications such as image captioning and visual question answering. However,\nbuilding VL models requires substantial hardware resources, where efficiency is\nrestricted by two key factors: the extended input sequence of the language\nmodel with vision features demands more computational operations, and a large\nnumber of additional learnable parameters increase memory complexity. These\nchallenges significantly restrict the broader applicability of such models. To\nbridge this gap, we propose ADEM-VL, an efficient vision-language method that\ntunes VL models based on pretrained large language models (LLMs) by adopting a\nparameter-free cross-attention mechanism for similarity measurements in\nmultimodal fusion. This approach only requires embedding vision features into\nthe language space, significantly reducing the number of trainable parameters\nand accelerating both training and inference speeds. To enhance representation\nlearning in fusion module, we introduce an efficient multiscale feature\ngeneration scheme that requires only a single forward pass through the vision\nencoder. Moreover, we propose an adaptive fusion scheme that dynamically\ndiscards less relevant visual information for each text token based on its\nattention score. This ensures that the fusion process prioritizes the most\npertinent visual features. With experiments on various tasks including visual\nquestion answering, image captioning, and instruction-following, we demonstrate\nthat our framework outperforms existing approaches. Specifically, our method\nsurpasses existing methods by an average accuracy of 0.77% on ScienceQA\ndataset, with reduced training and inference latency, demonstrating the\nsuperiority of our framework. 
The code is available at\nhttps://github.com/Hao840/ADEM-VL.\n","authors":["Zhiwei Hao","Jianyuan Guo","Li Shen","Yong Luo","Han Hu","Yonggang Wen"],"pdf_url":"https://arxiv.org/pdf/2410.17779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17774v1","updated":"2024-10-23T11:23:05Z","published":"2024-10-23T11:23:05Z","title":"Quasi-Medial Distance Field (Q-MDF): A Robust Method for Approximating\n and Discretizing Neural Medial Axis","summary":" The medial axis, a lower-dimensional shape descriptor, plays an important\nrole in the field of digital geometry processing. Despite its importance,\nrobust computation of the medial axis transform from diverse inputs, especially\npoint clouds with defects, remains a significant challenge. In this paper, we\ntackle the challenge by proposing a new implicit method that diverges from\nmainstream explicit medial axis computation techniques. Our key technical\ninsight is the difference between the signed distance field (SDF) and the\nmedial field (MF) of a solid shape is the unsigned distance field (UDF) of the\nshape's medial axis. This allows for formulating medial axis computation as an\nimplicit reconstruction problem. Utilizing a modified double covering method,\nwe extract the medial axis as the zero level-set of the UDF. Extensive\nexperiments show that our method has enhanced accuracy and robustness in\nlearning compact medial axis transform from thorny meshes and point clouds\ncompared to existing methods.\n","authors":["Jiayi Kong","Chen Zong","Jun Luo","Shiqing Xin","Fei Hou","Hanqing Jiang","Chen Qian","Ying He"],"pdf_url":"https://arxiv.org/pdf/2410.17774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17772v1","updated":"2024-10-23T11:19:48Z","published":"2024-10-23T11:19:48Z","title":"Scaling Robot Policy Learning via Zero-Shot Labeling with Foundation\n Models","summary":" A central challenge towards developing robots that can relate human language\nto their perception and actions is the scarcity of natural language annotations\nin diverse robot datasets. Moreover, robot policies that follow natural\nlanguage instructions are typically trained on either templated language or\nexpensive human-labeled instructions, hindering their scalability. To this end,\nwe introduce NILS: Natural language Instruction Labeling for Scalability. NILS\nautomatically labels uncurated, long-horizon robot data at scale in a zero-shot\nmanner without any human intervention. NILS combines pretrained vision-language\nfoundation models in order to detect objects in a scene, detect object-centric\nchanges, segment tasks from large datasets of unlabelled interaction data and\nultimately label behavior datasets. Evaluations on BridgeV2, Fractal, and a\nkitchen play dataset show that NILS can autonomously annotate diverse robot\ndemonstrations of unlabeled and unstructured datasets while alleviating several\nshortcomings of crowdsourced human annotations, such as low data quality and\ndiversity. We use NILS to label over 115k trajectories obtained from over 430\nhours of robot data. 
We open-source our auto-labeling code and generated\nannotations on our website: http://robottasklabeling.github.io.\n","authors":["Nils Blank","Moritz Reuss","Marcel Rühle","Ömer Erdinç Yağmurlu","Fabian Wenzel","Oier Mees","Rudolf Lioutikov"],"pdf_url":"https://arxiv.org/pdf/2410.17772v1.pdf","comment":"Project Website at https://robottasklabeling.github.io/"},{"id":"http://arxiv.org/abs/2404.00362v2","updated":"2024-10-23T11:06:02Z","published":"2024-03-30T13:28:53Z","title":"STBA: Towards Evaluating the Robustness of DNNs for Query-Limited\n Black-box Scenario","summary":" Many attack techniques have been proposed to explore the vulnerability of\nDNNs and further help to improve their robustness. Despite the significant\nprogress made recently, existing black-box attack methods still suffer from\nunsatisfactory performance due to the vast number of queries needed to optimize\ndesired perturbations. Besides, the other critical challenge is that\nadversarial examples built in a noise-adding manner are abnormal and struggle\nto successfully attack robust models, whose robustness is enhanced by\nadversarial training against small perturbations. There is no doubt that these\ntwo issues mentioned above will significantly increase the risk of exposure and\nresult in a failure to dig deeply into the vulnerability of DNNs. Hence, it is\nnecessary to evaluate DNNs' fragility sufficiently under query-limited settings\nin a non-additional way. In this paper, we propose the Spatial Transform\nBlack-box Attack (STBA), a novel framework to craft formidable adversarial\nexamples in the query-limited scenario. Specifically, STBA introduces a flow\nfield to the high-frequency part of clean images to generate adversarial\nexamples and adopts the following two processes to enhance their naturalness\nand significantly improve the query efficiency: a) we apply an estimated flow\nfield to the high-frequency part of clean images to generate adversarial\nexamples instead of introducing external noise to the benign image, and b) we\nleverage an efficient gradient estimation method based on a batch of samples to\noptimize such an ideal flow field under query-limited settings. Compared to\nexisting score-based black-box baselines, extensive experiments indicated that\nSTBA could effectively improve the imperceptibility of the adversarial examples\nand remarkably boost the attack success rate under query-limited settings.\n","authors":["Renyang Liu","Kwok-Yan Lam","Wei Zhou","Sixing Wu","Jun Zhao","Dongting Hu","Mingming Gong"],"pdf_url":"https://arxiv.org/pdf/2404.00362v2.pdf","comment":"Accepted by T-MM"},{"id":"http://arxiv.org/abs/2403.14626v2","updated":"2024-10-23T11:05:01Z","published":"2024-03-21T17:59:55Z","title":"ODTFormer: Efficient Obstacle Detection and Tracking with Stereo Cameras\n Based on Transformer","summary":" Obstacle detection and tracking represent a critical component in robot\nautonomous navigation. In this paper, we propose ODTFormer, a Transformer-based\nmodel to address both obstacle detection and tracking problems. For the\ndetection task, our approach leverages deformable attention to construct a 3D\ncost volume, which is decoded progressively in the form of voxel occupancy\ngrids. We further track the obstacles by matching the voxels between\nconsecutive frames. The entire model can be optimized in an end-to-end manner.\nThrough extensive experiments on DrivingStereo and KITTI benchmarks, our model\nachieves state-of-the-art performance in the obstacle detection task. 
We also\nreport comparable accuracy to state-of-the-art obstacle tracking models while\nrequiring only a fraction of their computation cost, typically ten-fold to\ntwenty-fold less. The code and model weights will be publicly released.\n","authors":["Tianye Ding","Hongyu Li","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2403.14626v2.pdf","comment":"8 pages. Accepted by IROS 2024"},{"id":"http://arxiv.org/abs/2405.15574v4","updated":"2024-10-23T10:54:16Z","published":"2024-05-24T14:04:03Z","title":"Meteor: Mamba-based Traversal of Rationale for Large Language and Vision\n Models","summary":" The rapid development of large language and vision models (LLVMs) has been\ndriven by advances in visual instruction tuning. Recently, open-source LLVMs\nhave curated high-quality visual instruction tuning datasets and utilized\nadditional vision encoders or multiple computer vision models in order to\nnarrow the performance gap with powerful closed-source LLVMs. These\nadvancements are attributed to multifaceted information required for diverse\ncapabilities, including fundamental image understanding, real-world knowledge\nabout common-sense and non-object concepts (e.g., charts, diagrams, symbols,\nsigns, and math problems), and step-by-step procedures for solving complex\nquestions. Drawing from the multifaceted information, we present a new\nefficient LLVM, Mamba-based traversal of rationales (Meteor), which leverages\nmultifaceted rationale to enhance understanding and answering capabilities. To\nembed lengthy rationales containing abundant information, we employ the Mamba\narchitecture, capable of processing sequential data with linear time\ncomplexity. We introduce a new concept of traversal of rationale that\nfacilitates efficient embedding of rationale. Subsequently, the backbone\nmultimodal language model (MLM) is trained to generate answers with the aid of\nrationale. Through these steps, Meteor achieves significant improvements in\nvision language performances across multiple evaluation benchmarks requiring\ndiverse capabilities, without scaling up the model size or employing additional\nvision encoders and computer vision models.\n","authors":["Byung-Kwan Lee","Chae Won Kim","Beomchan Park","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2405.15574v4.pdf","comment":"Code is available in https://github.com/ByungKwanLee/Meteor"},{"id":"http://arxiv.org/abs/2410.17752v1","updated":"2024-10-23T10:29:18Z","published":"2024-10-23T10:29:18Z","title":"AdaDiffSR: Adaptive Region-aware Dynamic Acceleration Diffusion Model\n for Real-World Image Super-Resolution","summary":" Diffusion models (DMs) have shown promising results on single-image\nsuper-resolution and other image-to-image translation tasks. Benefiting from\nmore computational resources and longer inference times, they are able to yield\nmore realistic images. Existing DMs-based super-resolution methods try to\nachieve an overall average recovery over all regions via iterative refinement,\nignoring the consideration that different input image regions require different\ntimesteps to reconstruct. In this work, we notice that previous DMs-based\nsuper-resolution methods suffer from wasting computational resources to\nreconstruct invisible details. To further improve the utilization of\ncomputational resources, we propose AdaDiffSR, a DMs-based SR pipeline with\ndynamic timesteps sampling strategy (DTSS). 
Specifically, by introducing the\nmulti-metrics latent entropy module (MMLE), we can achieve dynamic perception\nof the latent spatial information gain during the denoising process, thereby\nguiding the dynamic selection of the timesteps. In addition, we adopt a\nprogressive feature injection module (PFJ), which dynamically injects the\noriginal image features into the denoising process based on the current\ninformation gain, so as to generate images with both fidelity and realism.\nExperiments show that our AdaDiffSR achieves comparable performance over\ncurrent state-of-the-art DMs-based SR methods while consuming less\ncomputational resources and inference time on both synthetic and real-world\ndatasets.\n","authors":["Yuanting Fan","Chengxu Liu","Nengzhong Yin","Changlong Gao","Xueming Qian"],"pdf_url":"https://arxiv.org/pdf/2410.17752v1.pdf","comment":"18 pages, 6 figures, ECCV2024 accepted"},{"id":"http://arxiv.org/abs/2410.17751v1","updated":"2024-10-23T10:28:17Z","published":"2024-10-23T10:28:17Z","title":"VISAGE: Video Synthesis using Action Graphs for Surgery","summary":" Surgical data science (SDS) is a field that analyzes patient data before,\nduring, and after surgery to improve surgical outcomes and skills. However,\nsurgical data is scarce, heterogeneous, and complex, which limits the\napplicability of existing machine learning methods. In this work, we introduce\nthe novel task of future video generation in laparoscopic surgery. This task\ncan augment and enrich the existing surgical data and enable various\napplications, such as simulation, analysis, and robot-aided surgery.\nUltimately, it involves not only understanding the current state of the\noperation but also accurately predicting the dynamic and often unpredictable\nnature of surgical procedures. Our proposed method, VISAGE (VIdeo Synthesis\nusing Action Graphs for Surgery), leverages the power of action scene graphs to\ncapture the sequential nature of laparoscopic procedures and utilizes diffusion\nmodels to synthesize temporally coherent video sequences. VISAGE predicts the\nfuture frames given only a single initial frame, and the action graph triplets.\nBy incorporating domain-specific knowledge through the action graph, VISAGE\nensures the generated videos adhere to the expected visual and motion patterns\nobserved in real laparoscopic procedures. The results of our experiments\ndemonstrate high-fidelity video generation for laparoscopy procedures, which\nenables various applications in SDS.\n","authors":["Yousef Yeganeh","Rachmadio Lazuardi","Amir Shamseddin","Emine Dari","Yash Thirani","Nassir Navab Azade Farshad"],"pdf_url":"https://arxiv.org/pdf/2410.17751v1.pdf","comment":"Accepted at MICCAI 2024 Embodied AI and Robotics for HealTHcare\n (EARTH) Workshop"},{"id":"http://arxiv.org/abs/2410.17741v1","updated":"2024-10-23T10:16:01Z","published":"2024-10-23T10:16:01Z","title":"Efficient Neural Implicit Representation for 3D Human Reconstruction","summary":" High-fidelity digital human representations are increasingly in demand in the\ndigital world, particularly for interactive telepresence, AR/VR, 3D graphics,\nand the rapidly evolving metaverse. Even though they work well in small spaces,\nconventional methods for reconstructing 3D human motion frequently require the\nuse of expensive hardware and have high processing costs. This study presents\nHumanAvatar, an innovative approach that efficiently reconstructs precise human\navatars from monocular video sources. 
At the core of our methodology, we\nintegrate the pre-trained HuMoR, a model celebrated for its proficiency in\nhuman motion estimation. This is adeptly fused with the cutting-edge neural\nradiance field technology, Instant-NGP, and the state-of-the-art articulated\nmodel, Fast-SNARF, to enhance the reconstruction fidelity and speed. By\ncombining these two technologies, a system is created that can render quickly\nand effectively while also providing estimation of human pose parameters that\nare unmatched in accuracy. We have enhanced our system with an advanced\nposture-sensitive space reduction technique, which optimally balances rendering\nquality with computational efficiency. In our detailed experimental analysis\nusing both artificial and real-world monocular videos, we establish the\nadvanced performance of our approach. HumanAvatar consistently equals or\nsurpasses contemporary leading-edge reconstruction techniques in quality.\nFurthermore, it achieves these complex reconstructions in minutes, a fraction\nof the time typically required by existing methods. Our models achieve a\ntraining speed that is 110X faster than that of State-of-The-Art (SoTA)\nNeRF-based models. Our technique performs noticeably better than SoTA dynamic\nhuman NeRF methods if given an identical runtime limit. HumanAvatar can provide\neffective visuals after only 30 seconds of training.\n","authors":["Zexu Huang","Sarah Monazam Erfani","Siying Lu","Mingming Gong"],"pdf_url":"https://arxiv.org/pdf/2410.17741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17740v1","updated":"2024-10-23T10:14:37Z","published":"2024-10-23T10:14:37Z","title":"Emotion Recognition with Facial Attention and Objective Activation\n Functions","summary":" In this paper, we study the effect of introducing channel and spatial\nattention mechanisms, namely SEN-Net, ECA-Net, and CBAM, to existing CNN\nvision-based models such as VGGNet, ResNet, and ResNetV2 to perform the Facial\nEmotion Recognition task. We show that not only attention can significantly\nimprove the performance of these models but also that combining them with a\ndifferent activation function can further help increase the performance of\nthese models.\n","authors":["Andrzej Miskow","Abdulrahman Altahhan"],"pdf_url":"https://arxiv.org/pdf/2410.17740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17735v1","updated":"2024-10-23T10:11:39Z","published":"2024-10-23T10:11:39Z","title":"New Insight in Cervical Cancer Diagnosis Using Convolution Neural\n Network Architecture","summary":" The Pap smear is a screening method for early cervical cancer diagnosis. The\nselection of the right optimizer in the convolutional neural network (CNN)\nmodel is key to the success of the CNN in image classification, including the\nclassification of cervical cancer Pap smear images. In this study, stochastic\ngradient descent (SGD), RMSprop, Adam, AdaGrad, AdaDelta, Adamax, and Nadam\noptimizers were used to classify cervical cancer Pap smear images from the\nSipakMed dataset. Resnet-18, Resnet-34, and VGG-16 are the CNN architectures\nused in this study, and each architecture uses a transfer-learning model. Based\non the test results, we conclude that the transfer learning model performs\nbetter on all CNNs and optimization techniques and that in the transfer\nlearning model, the optimization has little influence on the training of the\nmodel. Adamax, with accuracy values of 72.8% and 66.8%, had the best accuracy\nfor the VGG-16 and Resnet-18 architectures, respectively. 
Resnet-34 had 54.0%.\nThis is 0.034% lower than Nadam. Overall, Adamax is a suitable optimizer for\nCNN in cervical cancer classification on Resnet-18, Resnet-34, and VGG-16\narchitectures. This study provides new insights into the configuration of CNN\nmodels for Pap smear image analysis.\n","authors":["Ach. Khozaimi","Wayan Firdaus Mahmudy"],"pdf_url":"https://arxiv.org/pdf/2410.17735v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17734v1","updated":"2024-10-23T10:07:13Z","published":"2024-10-23T10:07:13Z","title":"YOLO-Vehicle-Pro: A Cloud-Edge Collaborative Framework for Object\n Detection in Autonomous Driving under Adverse Weather Conditions","summary":" With the rapid advancement of autonomous driving technology, efficient and\naccurate object detection capabilities have become crucial factors in ensuring\nthe safety and reliability of autonomous driving systems. However, in\nlow-visibility environments such as hazy conditions, the performance of\ntraditional object detection algorithms often degrades significantly, failing\nto meet the demands of autonomous driving. To address this challenge, this\npaper proposes two innovative deep learning models: YOLO-Vehicle and\nYOLO-Vehicle-Pro. YOLO-Vehicle is an object detection model tailored\nspecifically for autonomous driving scenarios, employing multimodal fusion\ntechniques to combine image and textual information for object detection.\nYOLO-Vehicle-Pro builds upon this foundation by introducing an improved image\ndehazing algorithm, enhancing detection performance in low-visibility\nenvironments. In addition to model innovation, this paper also designs and\nimplements a cloud-edge collaborative object detection system, deploying models\non edge devices and offloading partial computational tasks to the cloud in\ncomplex situations. Experimental results demonstrate that on the KITTI dataset,\nthe YOLO-Vehicle-v1s model achieved 92.1% accuracy while maintaining a\ndetection speed of 226 FPS and an inference time of 12ms, meeting the real-time\nrequirements of autonomous driving. When processing hazy images, the\nYOLO-Vehicle-Pro model achieved a high accuracy of 82.3% mAP@50 on the Foggy\nCityscapes dataset while maintaining a detection speed of 43 FPS.\n","authors":["Xiguang Li","Jiafu Chen","Yunhe Sun","Na Lin","Ammar Hawbani","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.17734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17725v1","updated":"2024-10-23T09:55:22Z","published":"2024-10-23T09:55:22Z","title":"YOLOv11: An Overview of the Key Architectural Enhancements","summary":" This study presents an architectural analysis of YOLOv11, the latest\niteration in the YOLO (You Only Look Once) series of object detection models.\nWe examine the models architectural innovations, including the introduction of\nthe C3k2 (Cross Stage Partial with kernel size 2) block, SPPF (Spatial Pyramid\nPooling - Fast), and C2PSA (Convolutional block with Parallel Spatial\nAttention) components, which contribute in improving the models performance in\nseveral ways such as enhanced feature extraction. The paper explores YOLOv11's\nexpanded capabilities across various computer vision tasks, including object\ndetection, instance segmentation, pose estimation, and oriented object\ndetection (OBB). We review the model's performance improvements in terms of\nmean Average Precision (mAP) and computational efficiency compared to its\npredecessors, with a focus on the trade-off between parameter count and\naccuracy. 
Additionally, the study discusses YOLOv11's versatility across\ndifferent model sizes, from nano to extra-large, catering to diverse\napplication needs from edge devices to high-performance computing environments.\nOur research provides insights into YOLOv11's position within the broader\nlandscape of object detection and its potential impact on real-time computer\nvision applications.\n","authors":["Rahima Khanam","Muhammad Hussain"],"pdf_url":"https://arxiv.org/pdf/2410.17725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15881v3","updated":"2024-10-23T09:52:23Z","published":"2024-08-28T15:52:23Z","title":"LLaVA-MoD: Making LLaVA Tiny via MoE Knowledge Distillation","summary":" We introduce LLaVA-MoD, a novel framework designed to enable the efficient\ntraining of small-scale Multimodal Language Models (s-MLLM) by distilling\nknowledge from large-scale MLLM (l-MLLM). Our approach tackles two fundamental\nchallenges in MLLM distillation. First, we optimize the network structure of\ns-MLLM by integrating a sparse Mixture of Experts (MoE) architecture into the\nlanguage model, striking a balance between computational efficiency and model\nexpressiveness. Second, we propose a progressive knowledge transfer strategy to\nensure comprehensive knowledge migration. This strategy begins with mimic\ndistillation, where we minimize the Kullback-Leibler (KL) divergence between\noutput distributions to enable the student model to emulate the teacher\nnetwork's understanding. Following this, we introduce preference distillation\nvia Direct Preference Optimization (DPO), where the key lies in treating l-MLLM\nas the reference model. During this phase, the s-MLLM's ability to discriminate\nbetween superior and inferior examples is significantly enhanced beyond l-MLLM,\nleading to a better student that surpasses its teacher, particularly in\nhallucination benchmarks. Extensive experiments demonstrate that LLaVA-MoD\noutperforms existing models across various multimodal benchmarks while\nmaintaining a minimal number of activated parameters and low computational\ncosts. Remarkably, LLaVA-MoD, with only 2B activated parameters, surpasses\nQwen-VL-Chat-7B by an average of 8.8% across benchmarks, using merely 0.3% of\nthe training data and 23% trainable parameters. These results underscore\nLLaVA-MoD's ability to effectively distill comprehensive knowledge from its\nteacher model, paving the way for the development of more efficient MLLMs. The\ncode will be available on: https://github.com/shufangxun/LLaVA-MoD.\n","authors":["Fangxun Shu","Yue Liao","Le Zhuo","Chenning Xu","Lei Zhang","Guanghao Zhang","Haonan Shi","Long Chen","Tao Zhong","Wanggui He","Siming Fu","Haoyuan Li","Bolin Li","Zhelun Yu","Si Liu","Hongsheng Li","Hao Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.15881v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14693v2","updated":"2024-10-23T09:42:29Z","published":"2024-04-23T02:50:38Z","title":"DIP-Watermark: A Double Identity Protection Method Based on Robust\n Adversarial Watermark","summary":" The wide deployment of Face Recognition (FR) systems poses privacy risks. One\ncountermeasure is adversarial attack, deceiving unauthorized malicious FR, but\nit also disrupts regular identity verification of trusted authorizers,\nexacerbating the potential threat of identity impersonation. To address this,\nwe propose the first double identity protection scheme based on traceable\nadversarial watermarking, termed DIP-Watermark. 
DIP-Watermark employs a\none-time watermark embedding to deceive unauthorized FR models and allows\nauthorizers to perform identity verification by extracting the watermark.\nSpecifically, we propose an information-guided adversarial attack against FR\nmodels. The encoder embeds an identity-specific watermark into the deep feature\nspace of the carrier, guiding recognizable features of the image to deviate\nfrom the source identity. We further adopt a collaborative meta-optimization\nstrategy compatible with sub-tasks, which regularizes the joint optimization\ndirection of the encoder and decoder. This strategy enhances the representation\nof universal carrier features, mitigating multi-objective optimization\nconflicts in watermarking. Experiments confirm that DIP-Watermark achieves\nsignificant attack success rates and traceability accuracy on state-of-the-art\nFR models, exhibiting remarkable robustness that outperforms the existing\nprivacy protection methods using adversarial attacks and deep watermarking, or\nsimple combinations of the two. Our work potentially opens up new insights into\nproactive protection for FR privacy.\n","authors":["Yunming Zhang","Dengpan Ye","Caiyun Xie","Sipeng Shen","Ziyi Liu","Jiacheng Deng","Long Tang"],"pdf_url":"https://arxiv.org/pdf/2404.14693v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17715v1","updated":"2024-10-23T09:42:17Z","published":"2024-10-23T09:42:17Z","title":"Continual Learning on a Data Diet","summary":" Continual Learning (CL) methods usually learn from all available data.\nHowever, this is not the case in human cognition which efficiently focuses on\nkey experiences while disregarding the redundant information. Similarly, not\nall data points in a dataset have equal potential; some can be more informative\nthan others. This disparity may significantly impact the performance, as both\nthe quality and quantity of samples directly influence the model's\ngeneralizability and efficiency. Drawing inspiration from this, we explore the\npotential of learning from important samples and present an empirical study for\nevaluating coreset selection techniques in the context of CL to stimulate\nresearch in this unexplored area. We train different continual learners on\nincreasing amounts of selected samples and investigate the learning-forgetting\ndynamics by shedding light on the underlying mechanisms driving their improved\nstability-plasticity balance. We present several significant observations:\nlearning from selectively chosen samples (i) enhances incremental accuracy,\n(ii) improves knowledge retention of previous tasks, and (iii) refines learned\nrepresentations. This analysis contributes to a deeper understanding of\nselective learning strategies in CL scenarios.\n","authors":["Elif Ceren Gok Yildirim","Murat Onur Yildirim","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2410.17715v1.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.17678v2","updated":"2024-10-23T09:15:32Z","published":"2024-03-26T13:05:49Z","title":"Hierarchical Light Transformer Ensembles for Multimodal Trajectory\n Forecasting","summary":" Accurate trajectory forecasting is crucial for the performance of various\nsystems, such as advanced driver-assistance systems and self-driving vehicles.\nThese forecasts allow to anticipate events leading to collisions and,\ntherefore, to mitigate them. Deep Neural Networks have excelled in motion\nforecasting, but issues like overconfidence and uncertainty quantification\npersist. 
Deep Ensembles address these concerns, yet applying them to multimodal\ndistributions remains challenging. In this paper, we propose a novel approach\nnamed Hierarchical Light Transformer Ensembles (HLT-Ens), aimed at efficiently\ntraining an ensemble of Transformer architectures using a novel hierarchical\nloss function. HLT-Ens leverages grouped fully connected layers, inspired by\ngrouped convolution techniques, to capture multimodal distributions,\neffectively. Through extensive experimentation, we demonstrate that HLT-Ens\nachieves state-of-the-art performance levels, offering a promising avenue for\nimproving trajectory forecasting techniques.\n","authors":["Adrien Lafage","Mathieu Barbier","Gianni Franchi","David Filliat"],"pdf_url":"https://arxiv.org/pdf/2403.17678v2.pdf","comment":"acknowledgement added"},{"id":"http://arxiv.org/abs/2410.17691v1","updated":"2024-10-23T09:13:11Z","published":"2024-10-23T09:13:11Z","title":"Longitudinal Causal Image Synthesis","summary":" Clinical decision-making relies heavily on causal reasoning and longitudinal\nanalysis. For example, for a patient with Alzheimer's disease (AD), how will\nthe brain grey matter atrophy in a year if intervened on the A-beta level in\ncerebrospinal fluid? The answer is fundamental to diagnosis and follow-up\ntreatment. However, this kind of inquiry involves counterfactual medical images\nwhich can not be acquired by instrumental or correlation-based image synthesis\nmodels. Yet, such queries require counterfactual medical images, not obtainable\nthrough standard image synthesis models. Hence, a causal longitudinal image\nsynthesis (CLIS) method, enabling the synthesis of such images, is highly\nvaluable. However, building a CLIS model confronts three primary yet unmet\nchallenges: mismatched dimensionality between high-dimensional images and\nlow-dimensional tabular variables, inconsistent collection intervals of\nfollow-up data, and inadequate causal modeling capability of existing causal\ngraph methods for image data. In this paper, we established a tabular-visual\ncausal graph (TVCG) for CLIS overcoming these challenges through a novel\nintegration of generative imaging, continuous-time modeling, and structural\ncausal models combined with a neural network. We train our CLIS based on the\nADNI dataset and evaluate it on two other AD datasets, which illustrate the\noutstanding yet controllable quality of the synthesized images and the\ncontributions of synthesized MRI to the characterization of AD progression,\nsubstantiating the reliability and utility in clinics.\n","authors":["Yujia Li","Han Li","ans S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.17691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13752v2","updated":"2024-10-23T08:53:05Z","published":"2023-05-23T07:09:09Z","title":"Pulling Target to Source: A New Perspective on Domain Adaptive Semantic\n Segmentation","summary":" Domain adaptive semantic segmentation aims to transfer knowledge from a\nlabeled source domain to an unlabeled target domain. However, existing methods\nprimarily focus on directly learning qualified target features, making it\nchallenging to guarantee their discrimination in the absence of target labels.\nThis work provides a new perspective. We observe that the features learned with\nsource data manage to keep categorically discriminative during training,\nthereby enabling us to implicitly learn adequate target representations by\nsimply \\textbf{pulling target features close to source features for each\ncategory}. 
To this end, we propose T2S-DA, which we interpret as a form of\npulling Target to Source for Domain Adaptation, encouraging the model in\nlearning similar cross-domain features. Also, considering the pixel categories\nare heavily imbalanced for segmentation datasets, we come up with a dynamic\nre-weighting strategy to help the model concentrate on those underperforming\nclasses. Extensive experiments confirm that T2S-DA learns a more discriminative\nand generalizable representation, significantly surpassing the\nstate-of-the-art. We further show that our method is quite qualified for the\ndomain generalization task, verifying its domain-invariant property.\n","authors":["Haochen Wang","Yujun Shen","Jingjing Fei","Wei Li","Liwei Wu","Yuxi Wang","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13752v2.pdf","comment":"Accepted by IJCV"},{"id":"http://arxiv.org/abs/2410.17664v1","updated":"2024-10-23T08:33:23Z","published":"2024-10-23T08:33:23Z","title":"Deep Generative Models for 3D Medical Image Synthesis","summary":" Deep generative modeling has emerged as a powerful tool for synthesizing\nrealistic medical images, driving advances in medical image analysis, disease\ndiagnosis, and treatment planning. This chapter explores various deep\ngenerative models for 3D medical image synthesis, with a focus on Variational\nAutoencoders (VAEs), Generative Adversarial Networks (GANs), and Denoising\nDiffusion Models (DDMs). We discuss the fundamental principles, recent\nadvances, as well as strengths and weaknesses of these models and examine their\napplications in clinically relevant problems, including unconditional and\nconditional generation tasks like image-to-image translation and image\nreconstruction. We additionally review commonly used evaluation metrics for\nassessing image fidelity, diversity, utility, and privacy and provide an\noverview of current challenges in the field.\n","authors":["Paul Friedrich","Yannik Frisch","Philippe C. Cattin"],"pdf_url":"https://arxiv.org/pdf/2410.17664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00505v2","updated":"2024-10-23T08:28:53Z","published":"2024-06-01T17:27:34Z","title":"Improving Text Generation on Images with Synthetic Captions","summary":" The recent emergence of latent diffusion models such as SDXL and SD 1.5 has\nshown significant capability in generating highly detailed and realistic\nimages. Despite their remarkable ability to produce images, generating accurate\ntext within images still remains a challenging task. In this paper, we examine\nthe validity of fine-tuning approaches in generating legible text within the\nimage. We propose a low-cost approach by leveraging SDXL without any\ntime-consuming training on large-scale datasets. The proposed strategy employs\na fine-tuning technique that examines the effects of data refinement levels and\nsynthetic captions. Moreover, our results demonstrate how our small scale\nfine-tuning approach can improve the accuracy of text generation in different\nscenarios without the need of additional multimodal encoders. 
Our experiments\nshow that with the addition of random letters to our raw dataset, our model's\nperformance improves in producing well-formed visual text.\n","authors":["Jun Young Koh","Sang Hyun Park","Joy Song"],"pdf_url":"https://arxiv.org/pdf/2406.00505v2.pdf","comment":"2024 16th IIAI International Congress on Advanced Applied Informatics\n (IIAI-AAI)"},{"id":"http://arxiv.org/abs/2407.16364v2","updated":"2024-10-23T08:27:23Z","published":"2024-07-23T10:11:56Z","title":"Harmonizing Visual Text Comprehension and Generation","summary":" In this work, we present TextHarmony, a unified and versatile multimodal\ngenerative model proficient in comprehending and generating visual text.\nSimultaneously generating images and texts typically results in performance\ndegradation due to the inherent inconsistency between vision and language\nmodalities. To overcome this challenge, existing approaches resort to\nmodality-specific data for supervised fine-tuning, necessitating distinct model\ninstances. We propose Slide-LoRA, which dynamically aggregates\nmodality-specific and modality-agnostic LoRA experts, partially decoupling the\nmultimodal generation space. Slide-LoRA harmonizes the generation of vision and\nlanguage within a singular model instance, thereby facilitating a more unified\ngenerative process. Additionally, we develop a high-quality image caption\ndataset, DetailedTextCaps-100K, synthesized with a sophisticated closed-source\nMLLM to enhance visual text generation capabilities further. Comprehensive\nexperiments across various benchmarks demonstrate the effectiveness of the\nproposed approach. Empowered by Slide-LoRA, TextHarmony achieves comparable\nperformance to modality-specific fine-tuning results with only a 2% increase in\nparameters and shows an average improvement of 2.5% in visual text\ncomprehension tasks and 4.0% in visual text generation tasks. Our work\ndelineates the viability of an integrated approach to multimodal generation\nwithin the visual text domain, setting a foundation for subsequent inquiries.\nCode is available at https://github.com/bytedance/TextHarmony.\n","authors":["Zhen Zhao","Jingqun Tang","Binghong Wu","Chunhui Lin","Shu Wei","Hao Liu","Xin Tan","Zhizhong Zhang","Can Huang","Yuan Xie"],"pdf_url":"https://arxiv.org/pdf/2407.16364v2.pdf","comment":"accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2312.12540v4","updated":"2024-10-23T08:20:12Z","published":"2023-12-19T19:19:19Z","title":"Lightning-Fast Image Inversion and Editing for Text-to-Image Diffusion\n Models","summary":" Diffusion inversion is the problem of taking an image and a text prompt that\ndescribes it and finding a noise latent that would generate the exact same\nimage. Most current deterministic inversion techniques operate by approximately\nsolving an implicit equation and may converge slowly or yield poor\nreconstructed images. We formulate the problem by finding the roots of an\nimplicit equation and develop a method to solve it efficiently. Our solution is\nbased on Newton-Raphson (NR), a well-known technique in numerical analysis. We\nshow that a vanilla application of NR is computationally infeasible while\nnaively transforming it to a computationally tractable alternative tends to\nconverge to out-of-distribution solutions, resulting in poor reconstruction and\nediting. We therefore derive an efficient guided formulation that quickly\nconverges and provides high-quality reconstructions and editing.
We showcase\nour method on real image editing with three popular open-sourced diffusion\nmodels: Stable Diffusion, SDXL-Turbo, and Flux with different deterministic\nschedulers. Our solution, Guided Newton-Raphson Inversion, inverts an image\nwithin 0.4 sec (on an A100 GPU) for few-step models (SDXL-Turbo and Flux.1),\nopening the door for interactive image editing. We further show improved\nresults in image interpolation and generation of rare objects.\n","authors":["Dvir Samuel","Barak Meiri","Haggai Maron","Yoad Tewel","Nir Darshan","Shai Avidan","Gal Chechik","Rami Ben-Ari"],"pdf_url":"https://arxiv.org/pdf/2312.12540v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17642v1","updated":"2024-10-23T07:58:47Z","published":"2024-10-23T07:58:47Z","title":"Surgical Scene Segmentation by Transformer With Asymmetric Feature\n Enhancement","summary":" Surgical scene segmentation is a fundamental task for robotic-assisted\nlaparoscopic surgery understanding. It often contains various anatomical\nstructures and surgical instruments, where similar local textures and\nfine-grained structures make the segmentation a difficult task. Vision-specific\ntransformer method is a promising way for surgical scene understanding.\nHowever, there are still two main challenges. Firstly, the absence of\ninner-patch information fusion leads to poor segmentation performance.\nSecondly, the specific characteristics of anatomy and instruments are not\nspecifically modeled. To tackle the above challenges, we propose a novel\nTransformer-based framework with an Asymmetric Feature Enhancement module\n(TAFE), which enhances local information and then actively fuses the improved\nfeature pyramid into the embeddings from transformer encoders by a multi-scale\ninteraction attention strategy. The proposed method outperforms the SOTA\nmethods in several different surgical segmentation tasks and additionally\nproves its ability of fine-grained structure recognition. Code is available at\nhttps://github.com/cyuan-sjtu/ViT-asym.\n","authors":["Cheng Yuan","Yutong Ban"],"pdf_url":"https://arxiv.org/pdf/2410.17642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17637v1","updated":"2024-10-23T07:56:48Z","published":"2024-10-23T07:56:48Z","title":"MIA-DPO: Multi-Image Augmented Direct Preference Optimization For Large\n Vision-Language Models","summary":" Visual preference alignment involves training Large Vision-Language Models\n(LVLMs) to predict human preferences between visual inputs. This is typically\nachieved by using labeled datasets of chosen/rejected pairs and employing\noptimization algorithms like direct preference optimization (DPO). Existing\nvisual alignment methods, primarily designed for single-image scenarios,\nstruggle to effectively handle the complexity of multi-image tasks due to the\nscarcity of diverse training data and the high cost of annotating\nchosen/rejected pairs. We present Multi-Image Augmented Direct Preference\nOptimization (MIA-DPO), a visual preference alignment approach that effectively\nhandles multi-image inputs. MIA-DPO mitigates the scarcity of diverse\nmulti-image training data by extending single-image data with unrelated images\narranged in grid collages or pic-in-pic formats, significantly reducing the\ncosts associated with multi-image data annotations. Our observation reveals\nthat attention values of LVLMs vary considerably across different images. We\nuse attention values to identify and filter out rejected responses the model\nmay have mistakenly focused on. 
Our attention-aware selection constructs\nthe chosen/rejected pairs without relying on (i) human annotation, (ii) extra\ndata, or (iii) external models or APIs. MIA-DPO is compatible with various\narchitectures and outperforms existing methods on five multi-image benchmarks,\nachieving an average performance boost of 3.0% on LLaVA-v1.5 and 4.3% on the\nrecent InternLM-XC2.5. Moreover, MIA-DPO has a minimal effect on the model's\nability to understand single images.\n","authors":["Ziyu Liu","Yuhang Zang","Xiaoyi Dong","Pan Zhang","Yuhang Cao","Haodong Duan","Conghui He","Yuanjun Xiong","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2410.17637v1.pdf","comment":"Project URL: https://github.com/Liuziyu77/MIA-DPO"},{"id":"http://arxiv.org/abs/2410.15767v2","updated":"2024-10-23T07:55:10Z","published":"2024-10-21T08:27:13Z","title":"Improving Instance Optimization in Deformable Image Registration with\n Gradient Projection","summary":" Deformable image registration is inherently a multi-objective optimization\n(MOO) problem, requiring a delicate balance between image similarity and\ndeformation regularity. These conflicting objectives often lead to poor\noptimization outcomes, such as being trapped in unsatisfactory local minima or\nexperiencing slow convergence. Deep learning methods have recently gained\npopularity in this domain due to their efficiency in processing large datasets\nand achieving high accuracy. However, they often underperform during test time\ncompared to traditional optimization techniques, which further explore\niterative, instance-specific gradient-based optimization. This performance gap\nis more pronounced when a distribution shift between training and test data\nexists. To address this issue, we focus on the instance optimization (IO)\nparadigm, which involves additional optimization for test-time instances based\non a pre-trained model. IO effectively combines the generalization capabilities\nof deep learning with the fine-tuning advantages of instance-specific\noptimization. Within this framework, we emphasize the use of gradient\nprojection to mitigate conflicting updates in MOO. This technique projects\nconflicting gradients into a common space, better aligning the dual objectives\nand enhancing optimization stability. We validate our method using a\nstate-of-the-art foundation model on the 3D Brain inter-subject registration\ntask (LUMIR) from the Learn2Reg 2024 Challenge. Our results show significant\nimprovements over standard gradient descent, leading to more accurate and\nreliable registration results.\n","authors":["Yi Zhang","Yidong Zhao","Qian Tao"],"pdf_url":"https://arxiv.org/pdf/2410.15767v2.pdf","comment":"Learn2Reg Challenge at MICCAI 2024"},{"id":"http://arxiv.org/abs/2410.17622v1","updated":"2024-10-23T07:26:19Z","published":"2024-10-23T07:26:19Z","title":"Bridging the Gaps: Utilizing Unlabeled Face Recognition Datasets to\n Boost Semi-Supervised Facial Expression Recognition","summary":" In recent years, Facial Expression Recognition (FER) has gained increasing\nattention. Most current work focuses on supervised learning, which requires a\nlarge number of labeled and diverse images, while FER suffers from the scarcity\nof large, diverse datasets and annotation difficulty. To address these\nproblems, we focus on utilizing large unlabeled Face Recognition (FR) datasets\nto boost semi-supervised FER. 
Specifically, we first perform face\nreconstruction pre-training on large-scale facial images without annotations to\nlearn features of facial geometry and expression regions, followed by two-stage\nfine-tuning on FER datasets with limited labels. In addition, to further\nalleviate the scarcity of labeled and diverse images, we propose a Mixup-based\ndata augmentation strategy tailored for facial images, and the loss weights of\nreal and virtual images are determined according to the intersection-over-union\n(IoU) of the faces in the two images. Experiments on RAF-DB, AffectNet, and\nFERPlus show that our method outperforms existing semi-supervised FER methods\nand achieves new state-of-the-art performance. Remarkably, with only 5% and 25% of the\ntraining sets, our method achieves 64.02% on AffectNet and 88.23% on RAF-DB,\nwhich is comparable to fully supervised state-of-the-art methods. Codes will be\nmade publicly available at https://github.com/zhelishisongjie/SSFER.\n","authors":["Jie Song","Mengqiao He","Jinhua Feng","Bairong Shen"],"pdf_url":"https://arxiv.org/pdf/2410.17622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14947v2","updated":"2024-10-23T07:18:51Z","published":"2024-08-27T10:44:34Z","title":"ERX: A Fast Real-Time Anomaly Detection Algorithm for Hyperspectral Line\n Scanning","summary":" Detecting unexpected objects (anomalies) in real time has great potential for\nmonitoring, managing, and protecting the environment. Hyperspectral line-scan\ncameras are a low-cost solution that enhances confidence in anomaly detection\nover RGB and multispectral imagery. However, existing line-scan algorithms are\ntoo slow when using small computers (e.g., those onboard a drone or small\nsatellite), do not adapt to changing scenery, or lack robustness against\ngeometric distortions. This paper introduces the Exponentially moving RX\nalgorithm (ERX) to address these issues, and compares it with existing RX-based\nanomaly detection methods for hyperspectral line scanning. Three large and more\ncomplex datasets are also introduced to better assess the practical challenges\nwhen using line-scan cameras (two hyperspectral and one multispectral). ERX is\nevaluated using a Jetson Xavier NX compute module, achieving the best\ncombination of speed and detection performance. This research paves the way for\nfuture studies in grouping and locating anomalous objects, adaptive and\nautomatic threshold selection, and real-time field tests. The datasets and the\nPython code are available at: https://github.com/WiseGamgee/HyperAD.\n","authors":["Samuel Garske","Bradley Evans","Christopher Artlett","KC Wong"],"pdf_url":"https://arxiv.org/pdf/2408.14947v2.pdf","comment":"17 pages, 13 figures, 4 tables, code and datasets accessible at\n https://github.com/WiseGamgee/HyperAD"},{"id":"http://arxiv.org/abs/2404.07554v2","updated":"2024-10-23T07:16:42Z","published":"2024-04-11T08:36:13Z","title":"CAT: Contrastive Adapter Training for Personalized Image Generation","summary":" The emergence of various adapters, including Low-Rank Adaptation (LoRA)\napplied from the field of natural language processing, has allowed diffusion\nmodels to personalize image generation at a low cost. 
However, due to the\nvarious challenges including limited datasets and shortage of regularization\nand computation resources, adapter training often results in unsatisfactory\noutcomes, leading to the corruption of the backbone model's prior knowledge.\nOne of the well known phenomena is the loss of diversity in object generation,\nespecially within the same class which leads to generating almost identical\nobjects with minor variations. This poses challenges in generation\ncapabilities. To solve this issue, we present Contrastive Adapter Training\n(CAT), a simple yet effective strategy to enhance adapter training through the\napplication of CAT loss. Our approach facilitates the preservation of the base\nmodel's original knowledge when the model initiates adapters. Furthermore, we\nintroduce the Knowledge Preservation Score (KPS) to evaluate CAT's ability to\nkeep the former information. We qualitatively and quantitatively compare CAT's\nimprovement. Finally, we mention the possibility of CAT in the aspects of\nmulti-concept adapter and optimization.\n","authors":["Jae Wan Park","Sang Hyun Park","Jun Young Koh","Junha Lee","Min Song"],"pdf_url":"https://arxiv.org/pdf/2404.07554v2.pdf","comment":"CVPRW 2024"},{"id":"http://arxiv.org/abs/2410.17610v1","updated":"2024-10-23T07:06:08Z","published":"2024-10-23T07:06:08Z","title":"ImDy: Human Inverse Dynamics from Imitated Observations","summary":" Inverse dynamics (ID), which aims at reproducing the driven torques from\nhuman kinematic observations, has been a critical tool for gait analysis.\nHowever, it is hindered from wider application to general motion due to its\nlimited scalability. Conventional optimization-based ID requires expensive\nlaboratory setups, restricting its availability. To alleviate this problem, we\npropose to exploit the recently progressive human motion imitation algorithms\nto learn human inverse dynamics in a data-driven manner. The key insight is\nthat the human ID knowledge is implicitly possessed by motion imitators, though\nnot directly applicable. In light of this, we devise an efficient data\ncollection pipeline with state-of-the-art motion imitation algorithms and\nphysics simulators, resulting in a large-scale human inverse dynamics benchmark\nas Imitated Dynamics (ImDy). ImDy contains over 150 hours of motion with joint\ntorque and full-body ground reaction force data. With ImDy, we train a\ndata-driven human inverse dynamics solver ImDyS(olver) in a fully supervised\nmanner, which conducts ID and ground reaction force estimation simultaneously.\nExperiments on ImDy and real-world data demonstrate the impressive competency\nof ImDyS in human inverse dynamics and ground reaction force estimation.\nMoreover, the potential of ImDy(-S) as a fundamental motion analysis tool is\nexhibited with downstream applications. The project page is\nhttps://foruck.github.io/ImDy/.\n","authors":["Xinpeng Liu","Junxuan Liang","Zili Lin","Haowen Hou","Yong-Lu Li","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2410.17610v1.pdf","comment":"Yong-Lu Li and Cewu Lu are the corresponding authors"},{"id":"http://arxiv.org/abs/2406.14927v2","updated":"2024-10-23T07:01:34Z","published":"2024-06-21T07:37:17Z","title":"Gaussian-Informed Continuum for Physical Property Identification and\n Simulation","summary":" This paper studies the problem of estimating physical properties (system\nidentification) through visual observations. 
To facilitate geometry-aware\nguidance in physical property estimation, we introduce a novel hybrid framework\nthat leverages 3D Gaussian representation to not only capture explicit shapes\nbut also enable the simulated continuum to render object masks as 2D shape\nsurrogates during training.\n We propose a new dynamic 3D Gaussian framework based on motion factorization\nto recover the object as 3D Gaussian point sets across different time states.\n Furthermore, we develop a coarse-to-fine filling strategy to generate the\ndensity fields of the object from the Gaussian reconstruction, allowing for the\nextraction of object continuums along with their surfaces and the integration\nof Gaussian attributes into these continuums.\n In addition to the extracted object surfaces, the Gaussian-informed continuum\nalso enables the rendering of object masks during simulations, serving as\n2D-shape guidance for physical property estimation.\n Extensive experimental evaluations demonstrate that our pipeline achieves\nstate-of-the-art performance across multiple benchmarks and metrics.\nAdditionally, we illustrate the effectiveness of the proposed method through\nreal-world demonstrations, showcasing its practical utility.\n Our project page is at https://jukgei.github.io/project/gic.\n","authors":["Junhao Cai","Yuji Yang","Weihao Yuan","Yisheng He","Zilong Dong","Liefeng Bo","Hui Cheng","Qifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2406.14927v2.pdf","comment":"21 pages, 8 figures, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.17606v1","updated":"2024-10-23T07:01:16Z","published":"2024-10-23T07:01:16Z","title":"Towards Effective Data-Free Knowledge Distillation via Diverse Diffusion\n Augmentation","summary":" Data-free knowledge distillation (DFKD) has emerged as a pivotal technique in\nthe domain of model compression, substantially reducing the dependency on the\noriginal training data. Nonetheless, conventional DFKD methods that employ\nsynthesized training data are prone to the limitations of inadequate diversity\nand discrepancies in distribution between the synthesized and original\ndatasets. To address these challenges, this paper introduces an innovative\napproach to DFKD through diverse diffusion augmentation (DDA). Specifically, we\nrevise the paradigm of common data synthesis in DFKD to a composite process\nthrough leveraging diffusion models subsequent to data synthesis for\nself-supervised augmentation, which generates a spectrum of data samples with\nsimilar distributions while retaining controlled variations. Furthermore, to\nmitigate excessive deviation in the embedding space, we introduce an image\nfiltering technique grounded in cosine similarity to maintain fidelity during\nthe knowledge distillation process. Comprehensive experiments conducted on\nCIFAR-10, CIFAR-100, and Tiny-ImageNet datasets showcase the superior\nperformance of our method across various teacher-student network\nconfigurations, outperforming the contemporary state-of-the-art DFKD methods.\nCode will be available at:https://github.com/SLGSP/DDA.\n","authors":["Muquan Li","Dongyang Zhang","Tao He","Xiurui Xie","Yuan-Fang Li","Ke Qin"],"pdf_url":"https://arxiv.org/pdf/2410.17606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17598v1","updated":"2024-10-23T06:51:59Z","published":"2024-10-23T06:51:59Z","title":"PlantCamo: Plant Camouflage Detection","summary":" Camouflaged Object Detection (COD) aims to detect objects with camouflaged\nproperties. 
Although previous studies have focused on natural (animals and\ninsects) and unnatural (artistic and synthetic) camouflage detection, plant\ncamouflage has been neglected. However, plant camouflage plays a vital role in\nnatural camouflage. Therefore, this paper introduces a new challenging problem\nof Plant Camouflage Detection (PCD). To address this problem, we introduce the\nPlantCamo dataset, which comprises 1,250 images with camouflaged plants\nrepresenting 58 object categories in various natural scenes. To investigate the\ncurrent status of plant camouflage detection, we conduct a large-scale\nbenchmark study using 20+ cutting-edge COD models on the proposed dataset. Due\nto the unique characteristics of plant camouflage, including holes and\nirregular borders, we developed a new framework, named PCNet, dedicated to PCD.\nOur PCNet surpasses performance thanks to its multi-scale global feature\nenhancement and refinement. Finally, we discuss the potential applications and\ninsights, hoping this work fills the gap in fine-grained COD research and\nfacilitates further intelligent ecology research. All resources will be\navailable on https://github.com/yjybuaa/PlantCamo.\n","authors":["Jinyu Yang","Qingwei Wang","Feng Zheng","Peng Chen","Aleš Leonardis","Deng-Ping Fan"],"pdf_url":"https://arxiv.org/pdf/2410.17598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17594v1","updated":"2024-10-23T06:47:29Z","published":"2024-10-23T06:47:29Z","title":"How to Continually Adapt Text-to-Image Diffusion Models for Flexible\n Customization?","summary":" Custom diffusion models (CDMs) have attracted widespread attention due to\ntheir astonishing generative ability for personalized concepts. However, most\nexisting CDMs unreasonably assume that personalized concepts are fixed and\ncannot change over time. Moreover, they heavily suffer from catastrophic\nforgetting and concept neglect on old personalized concepts when continually\nlearning a series of new concepts. To address these challenges, we propose a\nnovel Concept-Incremental text-to-image Diffusion Model (CIDM), which can\nresolve catastrophic forgetting and concept neglect to learn new customization\ntasks in a concept-incremental manner. Specifically, to surmount the\ncatastrophic forgetting of old concepts, we develop a concept consolidation\nloss and an elastic weight aggregation module. They can explore task-specific\nand task-shared knowledge during training, and aggregate all low-rank weights\nof old concepts based on their contributions during inference. 
Moreover, in\norder to address concept neglect, we devise a context-controllable synthesis\nstrategy that leverages expressive region features and noise estimation to\ncontrol the contexts of generated images according to user conditions.\nExperiments validate that our CIDM surpasses existing custom diffusion models.\nThe source codes are available at https://github.com/JiahuaDong/CIFC.\n","authors":["Jiahua Dong","Wenqi Liang","Hongliu Li","Duzhen Zhang","Meng Cao","Henghui Ding","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2410.17594v1.pdf","comment":"Accepted to NeurIPS2024"},{"id":"http://arxiv.org/abs/2406.16592v3","updated":"2024-10-23T06:45:20Z","published":"2024-06-24T12:33:21Z","title":"Toward Fairer Face Recognition Datasets","summary":" Face recognition and verification are two computer vision tasks whose\nperformance has progressed with the introduction of deep representations.\nHowever, ethical, legal, and technical challenges due to the sensitive\ncharacter of face data and biases in real training datasets hinder their\ndevelopment. Generative AI addresses privacy by creating fictitious identities,\nbut fairness problems persist. We promote fairness by introducing a demographic\nattributes balancing mechanism in generated training datasets. We experiment\nwith an existing real dataset, three generated training datasets, and the\nbalanced versions of a diffusion-based dataset. We propose a comprehensive\nevaluation that considers accuracy and fairness equally and includes a rigorous\nregression-based statistical analysis of attributes. The analysis shows that\nbalancing reduces demographic unfairness. Also, a performance gap persists\ndespite generation becoming more accurate with time. The proposed balancing\nmethod and comprehensive verification evaluation promote fairer and transparent\nface recognition and verification.\n","authors":["Alexandre Fournier-Montgieux","Michael Soumm","Adrian Popescu","Bertrand Luvison","Hervé Le Borgne"],"pdf_url":"https://arxiv.org/pdf/2406.16592v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08035v2","updated":"2024-10-23T06:37:01Z","published":"2024-06-12T09:36:52Z","title":"LVBench: An Extreme Long Video Understanding Benchmark","summary":" Recent progress in multimodal large language models has markedly enhanced the\nunderstanding of short videos (typically under one minute), and several\nevaluation datasets have emerged accordingly. However, these advancements fall\nshort of meeting the demands of real-world applications such as embodied\nintelligence for long-term decision-making, in-depth movie reviews and\ndiscussions, and live sports commentary, all of which require comprehension of\nlong videos spanning several hours. To address this gap, we introduce LVBench,\na benchmark specifically designed for long video understanding. Our dataset\ncomprises publicly sourced videos and encompasses a diverse set of tasks aimed\nat long video comprehension and information extraction. LVBench is designed to\nchallenge multimodal models to demonstrate long-term memory and extended\ncomprehension capabilities. Our extensive evaluations reveal that current\nmultimodal models still underperform on these demanding long video\nunderstanding tasks. Through LVBench, we aim to spur the development of more\nadvanced models capable of tackling the complexities of long video\ncomprehension. 
Our data and code are publicly available at:\nhttps://lvbench.github.io.\n","authors":["Weihan Wang","Zehai He","Wenyi Hong","Yean Cheng","Xiaohan Zhang","Ji Qi","Xiaotao Gu","Shiyu Huang","Bin Xu","Yuxiao Dong","Ming Ding","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2406.08035v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17555v2","updated":"2024-10-23T05:49:00Z","published":"2024-09-26T05:57:35Z","title":"Advancing Open-Set Domain Generalization Using Evidential Bi-Level\n Hardest Domain Scheduler","summary":" In Open-Set Domain Generalization (OSDG), the model is exposed to both new\nvariations of data appearance (domains) and open-set conditions, where both\nknown and novel categories are present at test time. The challenges of this\ntask arise from the dual need to generalize across diverse domains and\naccurately quantify category novelty, which is critical for applications in\ndynamic environments. Recently, meta-learning techniques have demonstrated\nsuperior results in OSDG, effectively orchestrating the meta-train and -test\ntasks by employing varied random categories and predefined domain partition\nstrategies. These approaches prioritize a well-designed training schedule over\ntraditional methods that focus primarily on data augmentation and the\nenhancement of discriminative feature learning. The prevailing meta-learning\nmodels in OSDG typically utilize a predefined sequential domain scheduler to\nstructure data partitions. However, a crucial aspect that remains inadequately\nexplored is the influence brought by strategies of domain schedulers during\ntraining. In this paper, we observe that an adaptive domain scheduler benefits\nmore in OSDG compared with prefixed sequential and random domain schedulers. We\npropose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve\nan adaptive domain scheduler. This method strategically sequences domains by\nassessing their reliabilities in utilizing a follower network, trained with\nconfidence scores learned in an evidential manner, regularized by max rebiasing\ndiscrepancy, and optimized in a bi-level manner. The results show that our\nmethod substantially improves OSDG performance and achieves more discriminative\nembeddings for both the seen and unseen categories. The source code is publicly\navailable at https://github.com/KPeng9510/EBiL-HaDS.\n","authors":["Kunyu Peng","Di Wen","Kailun Yang","Ao Luo","Yufan Chen","Jia Fu","M. Saquib Sarfraz","Alina Roitberg","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2409.17555v2.pdf","comment":"Accepted to NeurIPS 2024. The source code is publicly available at\n https://github.com/KPeng9510/EBiL-HaDS"},{"id":"http://arxiv.org/abs/2407.15815v2","updated":"2024-10-23T05:32:34Z","published":"2024-07-22T17:29:02Z","title":"Learning to Manipulate Anywhere: A Visual Generalizable Framework For\n Reinforcement Learning","summary":" Can we endow visuomotor robots with generalization capabilities to operate in\ndiverse open-world scenarios? In this paper, we propose \\textbf{Maniwhere}, a\ngeneralizable framework tailored for visual reinforcement learning, enabling\nthe trained robot policies to generalize across a combination of multiple\nvisual disturbance types. Specifically, we introduce a multi-view\nrepresentation learning approach fused with Spatial Transformer Network (STN)\nmodule to capture shared semantic information and correspondences among\ndifferent viewpoints. 
In addition, we employ a curriculum-based randomization\nand augmentation approach to stabilize the RL training process and strengthen\nthe visual generalization ability. To exhibit the effectiveness of Maniwhere,\nwe meticulously design 8 tasks encompassing articulate objects, bi-manual, and\ndexterous hand manipulation tasks, demonstrating Maniwhere's strong visual\ngeneralization and sim2real transfer abilities across 3 hardware platforms. Our\nexperiments show that Maniwhere significantly outperforms existing\nstate-of-the-art methods. Videos are provided at\nhttps://gemcollector.github.io/maniwhere/.\n","authors":["Zhecheng Yuan","Tianming Wei","Shuiqi Cheng","Gu Zhang","Yuanpei Chen","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2407.15815v2.pdf","comment":"Webpage: https://gemcollector.github.io/maniwhere/"},{"id":"http://arxiv.org/abs/2312.11309v2","updated":"2024-10-23T05:30:24Z","published":"2023-12-18T16:02:43Z","title":"The Ultimate Combo: Boosting Adversarial Example Transferability by\n Composing Data Augmentations","summary":" To help adversarial examples generalize from surrogate machine-learning (ML)\nmodels to targets, certain transferability-based black-box evasion attacks\nincorporate data augmentations (e.g., random resizing). Yet, prior work has\nexplored limited augmentations and their composition. To fill the gap, we\nsystematically studied how data augmentation affects transferability.\nSpecifically, we explored 46 augmentation techniques originally proposed to\nhelp ML models generalize to unseen benign samples, and assessed how they\nimpact transferability, when applied individually or composed. Performing\nexhaustive search on a small subset of augmentation techniques and genetic\nsearch on all techniques, we identified augmentation combinations that help\npromote transferability. Extensive experiments with the ImageNet and CIFAR-10\ndatasets and 18 models showed that simple color-space augmentations (e.g.,\ncolor to greyscale) attain high transferability when combined with standard\naugmentations. Furthermore, we discovered that composing augmentations impacts\ntransferability mostly monotonically (i.e., more augmentations $\\rightarrow$\n$\\ge$transferability). We also found that the best composition significantly\noutperformed the state of the art (e.g., 91.8% vs. $\\le$82.5% average\ntransferability to adversarially trained targets on ImageNet). Lastly, our\ntheoretical analysis, backed by empirical evidence, intuitively explains why\ncertain augmentations promote transferability.\n","authors":["Zebin Yun","Achi-Or Weingarten","Eyal Ronen","Mahmood Sharif"],"pdf_url":"https://arxiv.org/pdf/2312.11309v2.pdf","comment":"Accepted by AISec'24"},{"id":"http://arxiv.org/abs/2402.02316v3","updated":"2024-10-23T05:26:10Z","published":"2024-02-04T02:09:18Z","title":"Diffusion Models are Certifiably Robust Classifiers","summary":" Generative learning, recognized for its effective modeling of data\ndistributions, offers inherent advantages in handling out-of-distribution\ninstances, especially for enhancing robustness to adversarial attacks. Among\nthese, diffusion classifiers, utilizing powerful diffusion models, have\ndemonstrated superior empirical robustness. However, a comprehensive\ntheoretical understanding of their robustness is still lacking, raising\nconcerns about their vulnerability to stronger future attacks. 
In this study,\nwe prove that diffusion classifiers possess $O(1)$ Lipschitzness, and establish\ntheir certified robustness, demonstrating their inherent resilience. To achieve\nnon-constant Lipschitzness, thereby obtaining much tighter certified\nrobustness, we generalize diffusion classifiers to classify Gaussian-corrupted\ndata. This involves deriving the evidence lower bounds (ELBOs) for these\ndistributions, approximating the likelihood using the ELBO, and calculating\nclassification probabilities via Bayes' theorem. Experimental results show the\nsuperior certified robustness of these Noised Diffusion Classifiers (NDCs).\nNotably, we achieve over 80% and 70% certified robustness on CIFAR-10 under\nadversarial perturbations with \\(\\ell_2\\) norms less than 0.25 and 0.5,\nrespectively, using a single off-the-shelf diffusion model without any\nadditional data.\n","authors":["Huanran Chen","Yinpeng Dong","Shitong Shao","Zhongkai Hao","Xiao Yang","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.02316v3.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.17565v1","updated":"2024-10-23T05:19:20Z","published":"2024-10-23T05:19:20Z","title":"Double Banking on Knowledge: Customized Modulation and Prototypes for\n Multi-Modality Semi-supervised Medical Image Segmentation","summary":" Multi-modality (MM) semi-supervised learning (SSL) based medical image\nsegmentation has recently gained increasing attention for its ability to\nutilize MM data and reduce reliance on labeled images. However, current methods\nface several challenges: (1) Complex network designs hinder scalability to\nscenarios with more than two modalities. (2) Focusing solely on\nmodality-invariant representation while neglecting modality-specific features,\nleads to incomplete MM learning. (3) Leveraging unlabeled data with generative\nmethods can be unreliable for SSL. To address these problems, we propose Double\nBank Dual Consistency (DBDC), a novel MM-SSL approach for medical image\nsegmentation. To address challenge (1), we propose a modality all-in-one\nsegmentation network that accommodates data from any number of modalities,\nremoving the limitation on modality count. To address challenge (2), we design\ntwo learnable plug-in banks, Modality-Level Modulation bank (MLMB) and\nModality-Level Prototype (MLPB) bank, to capture both modality-invariant and\nmodality-specific knowledge. These banks are updated using our proposed\nModality Prototype Contrastive Learning (MPCL). Additionally, we design\nModality Adaptive Weighting (MAW) to dynamically adjust learning weights for\neach modality, ensuring balanced MM learning as different modalities learn at\ndifferent rates. Finally, to address challenge (3), we introduce a Dual\nConsistency (DC) strategy that enforces consistency at both the image and\nfeature levels without relying on generative methods. 
We evaluate our method on\na 2-to-4 modality segmentation task using three open-source datasets, and\nextensive experiments show that our method outperforms state-of-the-art\napproaches.\n","authors":["Yingyu Chen","Ziyuan Yang","Ming Yan","Zhongzhou Zhang","Hui Yu","Yan Liu","Yi Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.17565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06316v2","updated":"2024-10-23T05:14:19Z","published":"2023-12-11T12:03:30Z","title":"SemiSAM: Enhancing Semi-Supervised Medical Image Segmentation via\n SAM-Assisted Consistency Regularization","summary":" Semi-supervised learning has attracted much attention due to its less\ndependence on acquiring abundant annotations from experts compared to fully\nsupervised methods, which is especially important for medical image\nsegmentation which typically requires intensive pixel/voxel-wise labeling by\ndomain experts. Although semi-supervised methods can improve the performance by\nutilizing unlabeled data, there are still gaps between fully supervised methods\nunder extremely limited annotation scenarios. In this paper, we propose a\nsimple yet efficient strategy to explore the usage of the Segment Anything\nModel (SAM) for enhancing semi-supervised medical image segmentation.\nConcretely, the segmentation model trained with domain knowledge provides\ninformation for localization and generating input prompts to the SAM. Then the\ngenerated pseudo-labels of SAM are utilized as additional supervision to assist\nin the learning procedure of the semi-supervised framework. Extensive\nexperiments demonstrate that SemiSAM significantly improves the performance of\nexisting semi-supervised frameworks when only one or a few labeled images are\navailable and shows strong efficiency as a plug-and-play strategy for\nsemi-supervised medical image segmentation.\n","authors":["Yichi Zhang","Jin Yang","Yuchen Liu","Yuan Cheng","Yuan Qi"],"pdf_url":"https://arxiv.org/pdf/2312.06316v2.pdf","comment":"Accept for BIBM 2024"},{"id":"http://arxiv.org/abs/2410.17557v1","updated":"2024-10-23T04:46:36Z","published":"2024-10-23T04:46:36Z","title":"BlurryScope: a cost-effective and compact scanning microscope for\n automated HER2 scoring using deep learning on blurry image data","summary":" We developed a rapid scanning optical microscope, termed \"BlurryScope\", that\nleverages continuous image acquisition and deep learning to provide a\ncost-effective and compact solution for automated inspection and analysis of\ntissue sections. BlurryScope integrates specialized hardware with a neural\nnetwork-based model to quickly process motion-blurred histological images and\nperform automated pathology classification. This device offers comparable speed\nto commercial digital pathology scanners, but at a significantly lower price\npoint and smaller size/weight, making it ideal for fast triaging in small\nclinics, as well as for resource-limited settings. To demonstrate the\nproof-of-concept of BlurryScope, we implemented automated classification of\nhuman epidermal growth factor receptor 2 (HER2) scores on immunohistochemically\n(IHC) stained breast tissue sections, achieving concordant results with those\nobtained from a high-end digital scanning microscope. We evaluated this\napproach by scanning HER2-stained tissue microarrays (TMAs) at a continuous\nspeed of 5 mm/s, which introduces bidirectional motion blur artifacts. These\ncompromised images were then used to train our network models. 
Using a test set\nof 284 unique patient cores, we achieved blind testing accuracies of 79.3% and\n89.7% for 4-class (0, 1+, 2+, 3+) and 2-class (0/1+ , 2+/3+) HER2 score\nclassification, respectively. BlurryScope automates the entire workflow, from\nimage scanning to stitching and cropping of regions of interest, as well as\nHER2 score classification. We believe BlurryScope has the potential to enhance\nthe current pathology infrastructure in resource-scarce environments, save\ndiagnostician time and bolster cancer identification and classification across\nvarious clinical environments.\n","authors":["Michael John Fanous","Christopher Michael Seybold","Hanlong Chen","Nir Pillar","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2410.17557v1.pdf","comment":"18 Pages, 6 Figures"},{"id":"http://arxiv.org/abs/2409.05280v2","updated":"2024-10-23T04:41:51Z","published":"2024-09-09T02:18:50Z","title":"RotCAtt-TransUNet++: Novel Deep Neural Network for Sophisticated Cardiac\n Segmentation","summary":" Cardiovascular disease remains a predominant global health concern,\nresponsible for a significant portion of mortality worldwide. Accurate\nsegmentation of cardiac medical imaging data is pivotal in mitigating fatality\nrates associated with cardiovascular conditions. However, existing\nstate-of-the-art (SOTA) neural networks, including both CNN-based and\nTransformer-based approaches, exhibit limitations in practical applicability\ndue to their inability to effectively capture inter-slice connections alongside\nintra-slice information. This deficiency is particularly evident in datasets\nfeaturing intricate, long-range details along the z-axis, such as coronary\narteries in axial views. Additionally, SOTA methods fail to differentiate\nnon-cardiac components from myocardium in segmentation, leading to the\n\"spraying\" phenomenon. To address these challenges, we present\nRotCAtt-TransUNet++, a novel architecture tailored for robust segmentation of\ncomplex cardiac structures. Our approach emphasizes modeling global contexts by\naggregating multiscale features with nested skip connections in the encoder. It\nintegrates transformer layers to capture interactions between patches and\nemploys a rotatory attention mechanism to capture connectivity between multiple\nslices (inter-slice information). Additionally, a channel-wise cross-attention\ngate guides the fused multi-scale channel-wise information and features from\ndecoder stages to bridge semantic gaps. Experimental results demonstrate that\nour proposed model outperforms existing SOTA approaches across four cardiac\ndatasets and one abdominal dataset. Importantly, coronary arteries and\nmyocardium are annotated with near-perfect accuracy during inference. An\nablation study shows that the rotatory attention mechanism effectively\ntransforms embedded vectorized patches in the semantic dimensional space,\nenhancing segmentation accuracy.\n","authors":["Quoc-Bao Nguyen-Le","Tuan-Hy Le","Anh-Triet Do","Quoc-Huy Trinh"],"pdf_url":"https://arxiv.org/pdf/2409.05280v2.pdf","comment":"11 pages, 11 figures"},{"id":"http://arxiv.org/abs/2309.12029v2","updated":"2024-10-23T04:36:26Z","published":"2023-09-21T12:51:11Z","title":"Exploring Self-Supervised Skeleton-Based Human Action Recognition under\n Occlusions","summary":" To integrate self-supervised skeleton-based action recognition methods into\nautonomous robotic systems, it is crucial to consider adverse situations\ninvolving target occlusions. 
Such a scenario, despite its practical relevance,\nis rarely addressed in existing self-supervised skeleton-based action\nrecognition methods. To empower models with the capacity to address occlusion,\nwe propose a simple and effective method. We first pre-train using occluded\nskeleton sequences, then use k-means clustering (KMeans) on sequence embeddings\nto group semantically similar samples. Next, we propose KNN-Imputation to fill\nin missing skeleton data based on the closest sample neighbors. Imputing\nincomplete skeleton sequences to create relatively complete sequences as input\nprovides significant benefits to existing skeleton-based self-supervised\nmethods. Meanwhile, building on the state-of-the-art Partial Spatio-Temporal\nLearning (PSTL), we introduce an Occluded Partial Spatio-Temporal Learning\n(OPSTL) framework. This enhancement utilizes Adaptive Spatial Masking (ASM) for\nbetter use of high-quality, intact skeletons. The new proposed method is\nverified on the challenging occluded versions of the NTURGB+D 60 and NTURGB+D\n120. The source code is publicly available at https://github.com/cyfml/OPSTL.\n","authors":["Yifei Chen","Kunyu Peng","Alina Roitberg","David Schneider","Jiaming Zhang","Junwei Zheng","Ruiping Liu","Yufan Chen","Kailun Yang","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2309.12029v2.pdf","comment":"The source code is publicly available at\n https://github.com/cyfml/OPSTL"},{"id":"http://arxiv.org/abs/2305.19599v4","updated":"2024-10-23T03:59:05Z","published":"2023-05-31T06:59:21Z","title":"RealignDiff: Boosting Text-to-Image Diffusion Model with Coarse-to-fine\n Semantic Re-alignment","summary":" Recent advances in text-to-image diffusion models have achieved remarkable\nsuccess in generating high-quality, realistic images from textual descriptions.\nHowever, these approaches have faced challenges in precisely aligning the\ngenerated visual content with the textual concepts described in the prompts. In\nthis paper, we propose a two-stage coarse-to-fine semantic re-alignment method,\nnamed RealignDiff, aimed at improving the alignment between text and images in\ntext-to-image diffusion models. In the coarse semantic re-alignment phase, a\nnovel caption reward, leveraging the BLIP-2 model, is proposed to evaluate the\nsemantic discrepancy between the generated image caption and the given text\nprompt. Subsequently, the fine semantic re-alignment stage employs a local\ndense caption generation module and a re-weighting attention modulation module\nto refine the previously generated images from a local semantic view.\nExperimental results on the MS-COCO and ViLG-300 datasets demonstrate that the\nproposed two-stage coarse-to-fine semantic re-alignment method outperforms\nother baseline re-alignment techniques by a substantial margin in both visual\nquality and semantic similarity with the input prompt.\n","authors":["Guian Fang","Zutao Jiang","Jianhua Han","Guansong Lu","Hang Xu","Shengcai Liao","Xiaojun Chang","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2305.19599v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17543v1","updated":"2024-10-23T03:47:29Z","published":"2024-10-23T03:47:29Z","title":"Unsupervised Low-dose CT Reconstruction with One-way Conditional\n Normalizing Flows","summary":" Deep-learning methods have shown promising performance for low-dose computed\ntomography (LDCT) reconstruction. 
However, supervised methods face the problem\nof lacking labeled data in clinical scenarios, and the CNN-based unsupervised\ndenoising methods would cause excessive smoothing in the reconstructed image.\nRecently, the normalizing flows (NFs) based methods have shown advantages in\nproducing detail-rich images and avoiding over-smoothing, however, there are\nstill issues: (1) Although the alternating optimization in the data and latent\nspace can well utilize the regularization and generation capabilities of NFs,\nthe current two-way transformation strategy of noisy images and latent\nvariables would cause detail loss and secondary artifacts; and (2) Training NFs\non high-resolution CT images is hard due to huge computation. Though using\nconditional normalizing flows (CNFs) to learn conditional probability can\nreduce the computational burden, current methods require labeled data for\nconditionalization, and the unsupervised CNFs-based LDCT reconstruction remains\na problem. To tackle these problems, we propose a novel CNFs-based unsupervised\nLDCT iterative reconstruction algorithm. It employs strict one-way\ntransformation when performing alternating optimization in the dual spaces,\nthus effectively avoiding the problems of detail loss and secondary artifacts.\nBy proposing a novel unsupervised conditionalization strategy, we train CNFs on\nhigh-resolution CT images, thus achieving fast and high-quality unsupervised\nreconstruction. Experiments on different datasets suggest that the performance\nof the proposed algorithm could surpass some state-of-the-art unsupervised and\neven supervised methods.\n","authors":["Ran An","Ke Chen","Hongwei Li"],"pdf_url":"https://arxiv.org/pdf/2410.17543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05741v2","updated":"2024-10-23T03:39:00Z","published":"2024-02-08T15:19:50Z","title":"Real-World Robot Applications of Foundation Models: A Review","summary":" Recent developments in foundation models, like Large Language Models (LLMs)\nand Vision-Language Models (VLMs), trained on extensive data, facilitate\nflexible application across different tasks and modalities. Their impact spans\nvarious fields, including healthcare, education, and robotics. This paper\nprovides an overview of the practical application of foundation models in\nreal-world robotics, with a primary emphasis on the replacement of specific\ncomponents within existing robot systems. The summary encompasses the\nperspective of input-output relationships in foundation models, as well as\ntheir role in perception, motion planning, and control within the field of\nrobotics. This paper concludes with a discussion of future challenges and\nimplications for practical robot applications.\n","authors":["Kento Kawaharazuka","Tatsuya Matsushima","Andrew Gambardella","Jiaxian Guo","Chris Paxton","Andy Zeng"],"pdf_url":"https://arxiv.org/pdf/2402.05741v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17534v1","updated":"2024-10-23T03:28:46Z","published":"2024-10-23T03:28:46Z","title":"OVT-B: A New Large-Scale Benchmark for Open-Vocabulary Multi-Object\n Tracking","summary":" Open-vocabulary object perception has become an important topic in artificial\nintelligence, which aims to identify objects with novel classes that have not\nbeen seen during training. Under this setting, open-vocabulary object detection\n(OVD) in a single image has been studied in many literature. 
However,\nopen-vocabulary object tracking (OVT) from a video has been studied less, and\none reason is the shortage of benchmarks. In this work, we have built a new\nlarge-scale benchmark for open-vocabulary multi-object tracking namely OVT-B.\nOVT-B contains 1,048 categories of objects and 1,973 videos with 637,608\nbounding box annotations, which is much larger than the sole open-vocabulary\ntracking dataset, i.e., OVTAO-val dataset (200+ categories, 900+ videos). The\nproposed OVT-B can be used as a new benchmark to pave the way for OVT research.\nWe also develop a simple yet effective baseline method for OVT. It integrates\nthe motion features for object tracking, which is an important feature for MOT\nbut is ignored in previous OVT methods. Experimental results have verified the\nusefulness of the proposed benchmark and the effectiveness of our method. We\nhave released the benchmark to the public at\nhttps://github.com/Coo1Sea/OVT-B-Dataset.\n","authors":["Haiji Liang","Ruize Han"],"pdf_url":"https://arxiv.org/pdf/2410.17534v1.pdf","comment":"15 pages, 6 figures, accepted at NeurIPS 2024 Dataset and Benchmark\n Track"},{"id":"http://arxiv.org/abs/2406.14515v2","updated":"2024-10-23T03:09:54Z","published":"2024-06-20T17:26:01Z","title":"MMBench-Video: A Long-Form Multi-Shot Benchmark for Holistic Video\n Understanding","summary":" The advent of large vision-language models (LVLMs) has spurred research into\ntheir applications in multi-modal contexts, particularly in video\nunderstanding. Traditional VideoQA benchmarks, despite providing quantitative\nmetrics, often fail to encompass the full spectrum of video content and\ninadequately assess models' temporal comprehension. To address these\nlimitations, we introduce MMBench-Video, a quantitative benchmark designed to\nrigorously evaluate LVLMs' proficiency in video understanding. MMBench-Video\nincorporates lengthy videos from YouTube and employs free-form questions,\nmirroring practical use cases. The benchmark is meticulously crafted to probe\nthe models' temporal reasoning skills, with all questions human-annotated\naccording to a carefully constructed ability taxonomy. We employ GPT-4 for\nautomated assessment, demonstrating superior accuracy and robustness over\nearlier LLM-based evaluations. Utilizing MMBench-Video, we have conducted\ncomprehensive evaluations that include both proprietary and open-source LVLMs\nfor images and videos. MMBench-Video stands as a valuable resource for the\nresearch community, facilitating improved evaluation of LVLMs and catalyzing\nprogress in the field of video understanding. The evalutation code of\nMMBench-Video will be integrated into VLMEvalKit:\nhttps://github.com/open-compass/VLMEvalKit.\n","authors":["Xinyu Fang","Kangrui Mao","Haodong Duan","Xiangyu Zhao","Yining Li","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2406.14515v2.pdf","comment":"Accepted in NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2406.18925v3","updated":"2024-10-23T02:57:31Z","published":"2024-06-27T06:32:56Z","title":"Selective Vision is the Challenge for Visual Reasoning: A Benchmark for\n Visual Argument Understanding","summary":" Visual arguments, often used in advertising or social causes, rely on images\nto persuade viewers to do or believe something. Understanding these arguments\nrequires selective vision: only specific visual stimuli within an image are\nrelevant to the argument, and relevance can only be understood within the\ncontext of a broader argumentative structure. 
While visual arguments are\nreadily appreciated by human audiences, we ask: are today's AI capable of\nsimilar understanding? We present VisArgs, a dataset of 1,611 images annotated\nwith 5,112 visual premises (with regions), 5,574 commonsense premises, and\nreasoning trees connecting them into structured arguments. We propose three\ntasks for evaluating visual argument understanding: premise localization,\npremise identification, and conclusion deduction. Experiments show that 1)\nmachines struggle to capture visual cues: GPT-4-O achieved 78.5% accuracy,\nwhile humans reached 98.0%. Models also performed 19.5% worse when\ndistinguishing between irrelevant objects within the image compared to external\nobjects. 2) Providing relevant visual premises improved model performance\nsignificantly.\n","authors":["Jiwan Chung","Sungjae Lee","Minseo Kim","Seungju Han","Ashkan Yousefpour","Jack Hessel","Youngjae Yu"],"pdf_url":"https://arxiv.org/pdf/2406.18925v3.pdf","comment":"12 pages, 6 figures. Accepted as main paper in EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.01023v2","updated":"2024-10-23T02:55:20Z","published":"2024-10-01T19:32:57Z","title":"Can visual language models resolve textual ambiguity with visual cues?\n Let visual puns tell you!","summary":" Humans possess multimodal literacy, allowing them to actively integrate\ninformation from various modalities to form reasoning. Faced with challenges\nlike lexical ambiguity in text, we supplement this with other modalities, such\nas thumbnail images or textbook illustrations. Is it possible for machines to\nachieve a similar multimodal understanding capability? In response, we present\nUnderstanding Pun with Image Explanations (UNPIE), a novel benchmark designed\nto assess the impact of multimodal inputs in resolving lexical ambiguities.\nPuns serve as the ideal subject for this evaluation due to their intrinsic\nambiguity. Our dataset includes 1,000 puns, each accompanied by an image that\nexplains both meanings. We pose three multimodal challenges with the\nannotations to assess different aspects of multimodal literacy; Pun Grounding,\nDisambiguation, and Reconstruction. The results indicate that various Socratic\nModels and Visual-Language Models improve over the text-only models when given\nvisual context, particularly as the complexity of the tasks increases.\n","authors":["Jiwan Chung","Seungwon Lim","Jaehyun Jeon","Seungbeen Lee","Youngjae Yu"],"pdf_url":"https://arxiv.org/pdf/2410.01023v2.pdf","comment":"Accepted as main paper in EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.17521v1","updated":"2024-10-23T02:52:53Z","published":"2024-10-23T02:52:53Z","title":"Diffusion Priors for Variational Likelihood Estimation and Image\n Denoising","summary":" Real-world noise removal is crucial in low-level computer vision. Due to the\nremarkable generation capabilities of diffusion models, recent attention has\nshifted towards leveraging diffusion priors for image restoration tasks.\nHowever, existing diffusion priors-based methods either consider simple noise\ntypes or rely on approximate posterior estimation, limiting their effectiveness\nin addressing structured and signal-dependent noise commonly found in\nreal-world images. In this paper, we build upon diffusion priors and propose\nadaptive likelihood estimation and MAP inference during the reverse diffusion\nprocess to tackle real-world noise. 
We introduce an independent,\nnon-identically distributed likelihood combined with the noise precision\n(inverse variance) prior and dynamically infer the precision posterior using\nvariational Bayes during the generation process. Meanwhile, we rectify the\nestimated noise variance through local Gaussian convolution. The final denoised\nimage is obtained by propagating intermediate MAP solutions that balance the\nupdated likelihood and diffusion prior. Additionally, we explore the local\ndiffusion prior inherent in low-resolution diffusion models, enabling direct\nhandling of high-resolution noisy images. Extensive experiments and analyses on\ndiverse real-world datasets demonstrate the effectiveness of our method. Code\nis available at https://github.com/HUST-Tan/DiffusionVI.\n","authors":["Jun Cheng","Shan Tan"],"pdf_url":"https://arxiv.org/pdf/2410.17521v1.pdf","comment":"Accepted by NeurIPS2024 as Spotlight"},{"id":"http://arxiv.org/abs/2405.20279v2","updated":"2024-10-23T02:38:44Z","published":"2024-05-30T17:33:10Z","title":"CV-VAE: A Compatible Video VAE for Latent Generative Video Models","summary":" Spatio-temporal compression of videos, utilizing networks such as Variational\nAutoencoders (VAE), plays a crucial role in OpenAI's SORA and numerous other\nvideo generative models. For instance, many LLM-like video models learn the\ndistribution of discrete tokens derived from 3D VAEs within the VQVAE\nframework, while most diffusion-based video models capture the distribution of\ncontinuous latent extracted by 2D VAEs without quantization. The temporal\ncompression is simply realized by uniform frame sampling which results in\nunsmooth motion between consecutive frames. Currently, there lacks of a\ncommonly used continuous video (3D) VAE for latent diffusion-based video models\nin the research community. Moreover, since current diffusion-based approaches\nare often implemented using pre-trained text-to-image (T2I) models, directly\ntraining a video VAE without considering the compatibility with existing T2I\nmodels will result in a latent space gap between them, which will take huge\ncomputational resources for training to bridge the gap even with the T2I models\nas initialization. To address this issue, we propose a method for training a\nvideo VAE of latent video models, namely CV-VAE, whose latent space is\ncompatible with that of a given image VAE, e.g., image VAE of Stable Diffusion\n(SD). The compatibility is achieved by the proposed novel latent space\nregularization, which involves formulating a regularization loss using the\nimage VAE. Benefiting from the latent space compatibility, video models can be\ntrained seamlessly from pre-trained T2I or video models in a truly\nspatio-temporally compressed latent space, rather than simply sampling video\nframes at equal intervals. With our CV-VAE, existing video models can generate\nfour times more frames with minimal finetuning. 
Extensive experiments are\nconducted to demonstrate the effectiveness of the proposed video VAE.\n","authors":["Sijie Zhao","Yong Zhang","Xiaodong Cun","Shaoshu Yang","Muyao Niu","Xiaoyu Li","Wenbo Hu","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2405.20279v2.pdf","comment":"Project Page: https://ailab-cvc.github.io/cvvae/index.html"},{"id":"http://arxiv.org/abs/2410.17514v1","updated":"2024-10-23T02:38:12Z","published":"2024-10-23T02:38:12Z","title":"PathMoCo: A Novel Framework to Improve Feature Embedding in\n Self-supervised Contrastive Learning for Histopathological Images","summary":" Self-supervised contrastive learning has become a cornerstone in various\nareas, particularly histopathological image analysis. Image augmentation plays\na crucial role in self-supervised contrastive learning, as it generates\nvariations in image samples. However, traditional image augmentation techniques\noften overlook the unique characteristics of histopathological images. In this\npaper, we propose a new histopathology-specific image augmentation method\ncalled stain reconstruction augmentation (SRA). We integrate our SRA with MoCo\nv3, a leading model in self-supervised contrastive learning, along with our\nadditional contrastive loss terms, and call the new model PathMoCo. We\ndemonstrate that our PathMoCo always outperforms the standard MoCo v3 across\nvarious downstream tasks and achieves comparable or superior performance to\nother foundation models pre-trained on significantly larger histopathology\ndatasets.\n","authors":["Hamid Manoochehri","Bodong Zhang","Beatrice S. Knudsen","Tolga Tasdizen"],"pdf_url":"https://arxiv.org/pdf/2410.17514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17513v1","updated":"2024-10-23T02:36:47Z","published":"2024-10-23T02:36:47Z","title":"HCDN: A Change Detection Network for Construction Housekeeping Using\n Feature Fusion and Large Vision Models","summary":" Workplace safety has received increasing attention as millions of workers\nworldwide suffer from work-related accidents. Although poor housekeeping is a\nsignificant contributor to construction accidents, there remains a notable\nlack of technological research focused on improving housekeeping practices in\nconstruction sites. Recognizing and locating poor housekeeping in a dynamic\nconstruction site is an important task that can be improved through computer\nvision approaches. Despite advances in AI and computer vision, existing methods\nfor detecting poor housekeeping conditions face many challenges, including\nlimited explanations, an inability to localize poor housekeeping, and a lack of\nannotated datasets. On the other hand, change detection, which aims to detect\nchanged environmental conditions (e.g., changing from good to poor\nhousekeeping) and 'where' the change has occurred (e.g., location of objects\ncausing poor housekeeping), has not been explored for the problem of\nhousekeeping management. To address these challenges, we propose the\nHousekeeping Change Detection Network (HCDN), an advanced change detection\nneural network that integrates a feature fusion module and a large vision\nmodel, achieving state-of-the-art performance. Additionally, we introduce an\napproach to establish a novel change detection dataset (named Housekeeping-CCD)\nfocused on housekeeping in construction sites, along with a housekeeping\nsegmentation dataset. 
Our contributions include significant performance\nimprovements compared to existing methods, providing an effective tool for\nenhancing construction housekeeping and safety. To promote further development,\nwe share our source code and trained models with global researchers:\nhttps://github.com/NUS-DBE/Housekeeping-CD.\n","authors":["Kailai Sun","Zherui Shao","Yang Miang Goh","Jing Tian","Vincent J. L. Gan"],"pdf_url":"https://arxiv.org/pdf/2410.17513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15070v2","updated":"2024-10-23T02:35:58Z","published":"2024-07-21T06:03:11Z","title":"GPHM: Gaussian Parametric Head Model for Monocular Head Avatar\n Reconstruction","summary":" Creating high-fidelity 3D human head avatars is crucial for applications in\nVR/AR, digital humans, and film production. Recent advances have leveraged\nmorphable face models to generate animated head avatars from easily accessible\ndata, representing varying identities and expressions within a low-dimensional\nparametric space. However, existing methods often struggle with modeling\ncomplex appearance details, e.g., hairstyles, and suffer from low rendering\nquality and efficiency. In this paper, we introduce a novel approach, 3D\nGaussian Parametric Head Model, which employs 3D Gaussians to accurately\nrepresent the complexities of the human head, allowing precise control over\nboth identity and expression. The Gaussian model can handle intricate details,\nenabling realistic representations of varying appearances and complex\nexpressions. Furthermore, we present a well-designed training framework to\nensure smooth convergence, providing a robust guarantee for learning the rich\ncontent. Our method achieves high-quality, photo-realistic rendering with\nreal-time efficiency, making it a valuable contribution to the field of\nparametric head models. Finally, we apply the 3D Gaussian Parametric Head Model\nto monocular video or few-shot head avatar reconstruction tasks, which enables\ninstant reconstruction of high-quality 3D head avatars even when input data is\nextremely limited, surpassing previous methods in terms of reconstruction\nquality and training speed.\n","authors":["Yuelang Xu","Zhaoqi Su","Qingyao Wu","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2407.15070v2.pdf","comment":"Project page: https://yuelangx.github.io/gphmv2/"},{"id":"http://arxiv.org/abs/2403.08350v2","updated":"2024-10-23T02:16:37Z","published":"2024-03-13T08:54:31Z","title":"CoIN: A Benchmark of Continual Instruction tuNing for Multimodel Large\n Language Model","summary":" Instruction tuning represents a prevalent strategy employed by Multimodal\nLarge Language Models (MLLMs) to align with human instructions and adapt to new\ntasks. Nevertheless, MLLMs encounter the challenge of adapting to users'\nevolving knowledge and demands. Therefore, how to retain existing skills while\nacquiring new knowledge needs to be investigated. In this paper, we present a\ncomprehensive benchmark, namely Continual Instruction tuNing (CoIN), to assess\nexisting MLLMs in the sequential instruction tuning paradigm. CoIN comprises 10\ncommonly used datasets spanning 8 task categories, ensuring a diverse range of\ninstructions and tasks. Besides, the trained model is evaluated from two\naspects: Instruction Following and General Knowledge, which assess the\nalignment with human intention and knowledge preserved for reasoning,\nrespectively. 
Experiments on CoIN demonstrate that current powerful MLLMs still\nsuffer from catastrophic forgetting, and that the failure in intention alignment,\nrather than knowledge forgetting, bears the main responsibility. To this end, we\nintroduce MoELoRA to MLLMs, which is effective in retaining the previous\ninstruction alignment. Experimental results consistently show that this method\nreduces forgetting on CoIN.\n","authors":["Cheng Chen","Junchen Zhu","Xu Luo","Hengtao Shen","Lianli Gao","Jingkuan Song"],"pdf_url":"https://arxiv.org/pdf/2403.08350v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05166v4","updated":"2024-10-23T02:10:29Z","published":"2024-09-08T17:35:48Z","title":"CD-NGP: A Fast Scalable Continual Representation for Dynamic Scenes","summary":" Current methodologies for novel view synthesis (NVS) in dynamic scenes\nencounter significant challenges in harmonizing memory consumption, model\ncomplexity, training efficiency, and rendering fidelity. Existing offline\ntechniques, while delivering high-quality results, are often characterized by\nsubstantial memory demands and limited scalability. In contrast, online methods\ngrapple with the challenge of balancing rapid convergence with model\ncompactness. To address these issues, we propose continual dynamic neural\ngraphics primitives (CD-NGP). Our approach synergizes features from both\ntemporal and spatial hash encodings to achieve high rendering quality, employs\nparameter reuse to enhance scalability, and leverages a continual learning\nframework to mitigate memory overhead. Furthermore, we introduce a novel\ndataset comprising multi-view, exceptionally long video sequences with\nsubstantial rigid and non-rigid motion, thereby substantiating the scalability\nof our method.\n","authors":["Zhenhuan Liu","Shuai Liu","Zhiwei Ning","Jie Yang","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2409.05166v4.pdf","comment":"new template, editing"},{"id":"http://arxiv.org/abs/2410.17505v1","updated":"2024-10-23T02:05:05Z","published":"2024-10-23T02:05:05Z","title":"PLGS: Robust Panoptic Lifting with 3D Gaussian Splatting","summary":" Previous methods utilize the Neural Radiance Field (NeRF) for panoptic\nlifting, but their training and rendering speed are unsatisfactory. In\ncontrast, 3D Gaussian Splatting (3DGS) has emerged as a prominent technique due\nto its rapid training and rendering speed. However, unlike NeRF, the\nconventional 3DGS may not satisfy the basic smoothness assumption as it does\nnot rely on any parameterized structures to render (e.g., MLPs). Consequently,\nthe conventional 3DGS is, in nature, more susceptible to noisy 2D mask\nsupervision. In this paper, we propose a new method called PLGS that enables\n3DGS to generate consistent panoptic segmentation masks from noisy 2D\nsegmentation masks while maintaining superior efficiency compared to NeRF-based\nmethods. Specifically, we build a panoptic-aware structured 3D Gaussian model\nto introduce smoothness and design effective noise reduction strategies. For\nthe semantic field, instead of initialization with structure from motion, we\nconstruct reliable semantic anchor points to initialize the 3D Gaussians. We\nthen use these anchor points as smooth regularization during training.\nAdditionally, we present a self-training approach using pseudo labels generated\nby merging the rendered masks with the noisy masks to enhance the robustness of\nPLGS. 
For the instance field, we project the 2D instance masks into 3D space\nand match them with oriented bounding boxes to generate cross-view consistent\ninstance masks for supervision. Experiments on various benchmarks demonstrate\nthat our method outperforms previous state-of-the-art methods in terms of both\nsegmentation quality and speed.\n","authors":["Yu Wang","Xiaobao Wei","Ming Lu","Guoliang Kang"],"pdf_url":"https://arxiv.org/pdf/2410.17505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17502v1","updated":"2024-10-23T02:00:07Z","published":"2024-10-23T02:00:07Z","title":"Bilateral Hippocampi Segmentation in Low Field MRIs Using Mutual Feature\n Learning via Dual-Views","summary":" Accurate hippocampus segmentation in brain MRI is critical for studying\ncognitive and memory functions and diagnosing neurodevelopmental disorders.\nWhile high-field MRIs provide detailed imaging, low-field MRIs are more\naccessible and cost-effective, which eliminates the need for sedation in\nchildren, though they often suffer from lower image quality. In this paper, we\npresent a novel deep-learning approach for the automatic segmentation of\nbilateral hippocampi in low-field MRIs. Extending recent advancements in infant\nbrain segmentation to underserved communities through the use of low-field MRIs\nensures broader access to essential diagnostic tools, thereby supporting better\nhealthcare outcomes for all children. Inspired by our previous work, Co-BioNet,\nthe proposed model employs a dual-view structure to enable mutual feature\nlearning via high-frequency masking, enhancing segmentation accuracy by\nleveraging complementary information from different perspectives. Extensive\nexperiments demonstrate that our method provides reliable segmentation outcomes\nfor hippocampal analysis in low-resource settings. The code is publicly\navailable at: https://github.com/himashi92/LoFiHippSeg.\n","authors":["Himashi Peiris","Zhaolin Chen"],"pdf_url":"https://arxiv.org/pdf/2410.17502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17494v1","updated":"2024-10-23T01:25:25Z","published":"2024-10-23T01:25:25Z","title":"Enhancing Multimodal Medical Image Classification using Cross-Graph\n Modal Contrastive Learning","summary":" The classification of medical images is a pivotal aspect of disease\ndiagnosis, often enhanced by deep learning techniques. However, traditional\napproaches typically focus on unimodal medical image data, neglecting the\nintegration of diverse non-image patient data. This paper proposes a novel\nCross-Graph Modal Contrastive Learning (CGMCL) framework for multimodal medical\nimage classification. The model effectively integrates both image and non-image\ndata by constructing cross-modality graphs and leveraging contrastive learning\nto align multimodal features in a shared latent space. An inter-modality\nfeature scaling module further optimizes the representation learning process by\nreducing the gap between heterogeneous modalities. The proposed approach is\nevaluated on two datasets: a Parkinson's disease (PD) dataset and a public\nmelanoma dataset. Results demonstrate that CGMCL outperforms conventional\nunimodal methods in accuracy, interpretability, and early disease prediction.\nAdditionally, the method shows superior performance in multi-class melanoma\nclassification. 
The CGMCL framework provides valuable insights into medical\nimage classification while offering improved disease interpretability and\npredictive capabilities.\n","authors":["Jun-En Ding","Chien-Chin Hsu","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2410.17494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10937v2","updated":"2024-10-23T01:13:24Z","published":"2024-10-14T17:59:58Z","title":"Hybrid Spatial Representations for Species Distribution Modeling","summary":" We address an important problem in ecology called Species Distribution\nModeling (SDM), whose goal is to predict whether a species exists at a certain\nposition on Earth. In particular, we tackle a challenging version of this task,\nwhere we learn from presence-only data in a community-sourced dataset, model a\nlarge number of species simultaneously, and do not use any additional\nenvironmental information. Previous work has used neural implicit\nrepresentations to construct models that achieve promising results. However,\nimplicit representations often generate predictions of limited spatial\nprecision. We attribute this limitation to their inherently global formulation\nand inability to effectively capture local feature variations. This issue is\nespecially pronounced with presence-only data and a large number of species. To\naddress this, we propose a hybrid embedding scheme that combines both implicit\nand explicit embeddings. Specifically, the explicit embedding is implemented\nwith a multiresolution hashgrid, enabling our models to better capture local\ninformation. Experiments demonstrate that our results exceed other works by a\nlarge margin on various standard benchmarks, and that the hybrid representation\nis better than both purely implicit and explicit ones. Qualitative\nvisualizations and comprehensive ablation studies reveal that our hybrid\nrepresentation successfully addresses the two main challenges. Our code is\nopen-sourced at https://github.com/Shiran-Yuan/HSR-SDM.\n","authors":["Shiran Yuan","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.10937v2.pdf","comment":"Project codebase https://github.com/Shiran-Yuan/HSR-SDM"},{"id":"http://arxiv.org/abs/2410.17489v1","updated":"2024-10-23T00:59:27Z","published":"2024-10-23T00:59:27Z","title":"Unsupervised Domain Adaptation for Action Recognition via\n Self-Ensembling and Conditional Embedding Alignment","summary":" Recent advancements in deep learning-based wearable human action recognition\n(wHAR) have improved the capture and classification of complex motions, but\nadoption remains limited due to the lack of expert annotations and domain\ndiscrepancies from user variations. Limited annotations hinder the model's\nability to generalize to out-of-distribution samples. While data augmentation\ncan improve generalizability, unsupervised augmentation techniques must be\napplied carefully to avoid introducing noise. Unsupervised domain adaptation\n(UDA) addresses domain discrepancies by aligning conditional distributions with\nlabeled target samples, but vanilla pseudo-labeling can lead to error\npropagation. 
To address these challenges, we propose $\\mu$DAR, a novel joint\noptimization architecture comprised of three functions: (i) consistency\nregularizer between augmented samples to improve model classification\ngeneralizability, (ii) temporal ensemble for robust pseudo-label generation and\n(iii) conditional distribution alignment to improve domain generalizability.\nThe temporal ensemble works by aggregating predictions from past epochs to\nsmooth out noisy pseudo-label predictions, which are then used in the\nconditional distribution alignment module to minimize kernel-based class-wise\nconditional maximum mean discrepancy ($k$CMMD) between the source and target\nfeature space to learn a domain invariant embedding. The\nconsistency-regularized augmentations ensure that multiple augmentations of the\nsame sample share the same labels; this results in (a) strong generalization\nwith limited source domain samples and (b) consistent pseudo-label generation\nin target samples. The novel integration of these three modules in $\\mu$DAR\nresults in a range of $\\approx$ 4-12% average macro-F1 score improvement over\nsix state-of-the-art UDA methods in four benchmark wHAR datasets\n","authors":["Indrajeet Ghosh","Garvit Chugh","Abu Zaher Md Faridee","Nirmalya Roy"],"pdf_url":"https://arxiv.org/pdf/2410.17489v1.pdf","comment":"This work has been accepted to the Proceedings of the IEEE\n International Conference on Data Mining, 2024"},{"id":"http://arxiv.org/abs/2410.17488v1","updated":"2024-10-23T00:51:47Z","published":"2024-10-23T00:51:47Z","title":"GenDP: 3D Semantic Fields for Category-Level Generalizable Diffusion\n Policy","summary":" Diffusion-based policies have shown remarkable capability in executing\ncomplex robotic manipulation tasks but lack explicit characterization of\ngeometry and semantics, which often limits their ability to generalize to\nunseen objects and layouts. To enhance the generalization capabilities of\nDiffusion Policy, we introduce a novel framework that incorporates explicit\nspatial and semantic information via 3D semantic fields. We generate 3D\ndescriptor fields from multi-view RGBD observations with large foundational\nvision models, then compare these descriptor fields against reference\ndescriptors to obtain semantic fields. The proposed method explicitly considers\ngeometry and semantics, enabling strong generalization capabilities in tasks\nrequiring category-level generalization, resolving geometric ambiguities, and\nattention to subtle geometric details. We evaluate our method across eight\ntasks involving articulated objects and instances with varying shapes and\ntextures from multiple object categories. Our method demonstrates its\neffectiveness by increasing Diffusion Policy's average success rate on unseen\ninstances from 20% to 93%. Additionally, we provide a detailed analysis and\nvisualization to interpret the sources of performance gain and explain how our\nmethod can generalize to novel instances.\n","authors":["Yixuan Wang","Guang Yin","Binghao Huang","Tarik Kelestemur","Jiuguang Wang","Yunzhu Li"],"pdf_url":"https://arxiv.org/pdf/2410.17488v1.pdf","comment":"Accepted to Conference on Robot Learning (CoRL 2024). 
Project Page:\n https://robopil.github.io/GenDP/"},{"id":"http://arxiv.org/abs/2410.17484v1","updated":"2024-10-23T00:31:17Z","published":"2024-10-23T00:31:17Z","title":"Which Client is Reliable?: A Reliable and Personalized Prompt-based\n Federated Learning for Medical Image Question Answering","summary":" Conventional medical artificial intelligence (AI) models face barriers in\nclinical application and ethical issues owing to their inability to handle the\nprivacy-sensitive characteristics of medical data. We present a novel\npersonalized federated learning (pFL) method for medical visual question\nanswering (VQA) models, addressing privacy reliability challenges in the\nmedical domain. Our method introduces learnable prompts into a Transformer\narchitecture to efficiently train it on diverse medical datasets without\nmassive computational costs. Then we introduce a reliable client VQA model that\nincorporates Dempster-Shafer evidence theory to quantify uncertainty in\npredictions, enhancing the model's reliability. Furthermore, we propose a novel\ninter-client communication mechanism that uses maximum likelihood estimation to\nbalance accuracy and uncertainty, fostering efficient integration of insights\nacross clients.\n","authors":["He Zhu","Ren Togo","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2410.17484v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2402.16664v3","updated":"2024-10-23T16:27:48Z","published":"2024-02-26T15:35:24Z","title":"LLM-Assisted Multi-Teacher Continual Learning for Visual Question\n Answering in Robotic Surgery","summary":" Visual question answering (VQA) is crucial for promoting surgical education.\nIn practice, the needs of trainees are constantly evolving, such as learning\nmore surgical types, adapting to different robots, and learning new surgical\ninstruments and techniques for various surgeries. However, patient data privacy\noften restricts the availability of old data when updating the model,\nnecessitating an exemplar-free continual learning (CL) setup. Prior CL studies\noverlooked two vital problems in the surgical domain: 1) large domain shifts\nfrom diverse surgical operations collected from multiple sources, and 2) severe\ndata imbalance arising from the uneven presence of surgical instruments or\nactivities. This paper proposes addressing these problems with a multimodal\nlarge language model (LLM) and an adaptive weight assignment methodology. We\nfirst develop a new multi-teacher CL framework that leverages a multimodal LLM\nas the additional teacher. The strong generalization ability of the LLM can\nbridge the knowledge gap when domain shifts and data imbalances occur. We then\nput forth a novel data processing method that transforms complex LLM embeddings\ninto logits compatible with our CL framework. We further design an adaptive\nweight assignment approach that balances the generalization ability of the LLM\nand the domain expertise of the old CL model. Finally, to comprehensively test\nthe effectiveness of our proposed method, we have also constructed two new\nsurgical VQA datasets that are largely different from existing ones and could\nbe valuable resources for future research. 
Extensive experimental results on\nthe tested datasets demonstrate the superiority of our method over other advanced\nCL schemes.\n","authors":["Yuyang Du","Kexin Chen","Yue Zhan","Chang Han Low","Tao You","Mobarakol Islam","Ziyu Guo","Yueming Jin","Guangyong Chen","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2402.16664v3.pdf","comment":"This paper has been accepted by 2024 IEEE International Conference on\n Robotics and Automation (ICRA)"},{"id":"http://arxiv.org/abs/2410.17952v1","updated":"2024-10-23T15:24:16Z","published":"2024-10-23T15:24:16Z","title":"SimRAG: Self-Improving Retrieval-Augmented Generation for Adapting Large\n Language Models to Specialized Domains","summary":" Retrieval-augmented generation (RAG) enhances the question-answering (QA)\nabilities of large language models (LLMs) by integrating external knowledge.\nHowever, adapting general-purpose RAG systems to specialized fields such as\nscience and medicine poses unique challenges due to distribution shifts and\nlimited access to domain-specific data. To tackle this, we propose SimRAG, a\nself-training approach that equips the LLM with joint capabilities of question\nanswering and question generation for domain adaptation. Our method first\nfine-tunes the LLM on instruction-following, question-answering, and\nsearch-related data. Then, it prompts the same LLM to generate diverse\ndomain-relevant questions from unlabeled corpora, with an additional filtering\nstrategy to retain high-quality synthetic examples. By leveraging these\nsynthetic examples, the LLM can improve its performance on domain-specific\nRAG tasks. Experiments on 11 datasets, spanning two backbone sizes and three\ndomains, demonstrate that SimRAG outperforms baselines by 1.2\\%--8.6\\%.\n","authors":["Ran Xu","Hui Liu","Sreyashi Nag","Zhenwei Dai","Yaochen Xie","Xianfeng Tang","Chen Luo","Yang Li","Joyce C. Ho","Carl Yang","Qi He"],"pdf_url":"https://arxiv.org/pdf/2410.17952v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2410.17734v1","updated":"2024-10-23T10:07:13Z","published":"2024-10-23T10:07:13Z","title":"YOLO-Vehicle-Pro: A Cloud-Edge Collaborative Framework for Object\n Detection in Autonomous Driving under Adverse Weather Conditions","summary":" With the rapid advancement of autonomous driving technology, efficient and\naccurate object detection capabilities have become crucial factors in ensuring\nthe safety and reliability of autonomous driving systems. However, in\nlow-visibility environments such as hazy conditions, the performance of\ntraditional object detection algorithms often degrades significantly, failing\nto meet the demands of autonomous driving. To address this challenge, this\npaper proposes two innovative deep learning models: YOLO-Vehicle and\nYOLO-Vehicle-Pro. YOLO-Vehicle is an object detection model tailored\nspecifically for autonomous driving scenarios, employing multimodal fusion\ntechniques to combine image and textual information for object detection.\nYOLO-Vehicle-Pro builds upon this foundation by introducing an improved image\ndehazing algorithm, enhancing detection performance in low-visibility\nenvironments. In addition to model innovation, this paper also designs and\nimplements a cloud-edge collaborative object detection system, deploying models\non edge devices and offloading partial computational tasks to the cloud in\ncomplex situations. 
Experimental results demonstrate that on the KITTI dataset,\nthe YOLO-Vehicle-v1s model achieved 92.1% accuracy while maintaining a\ndetection speed of 226 FPS and an inference time of 12ms, meeting the real-time\nrequirements of autonomous driving. When processing hazy images, the\nYOLO-Vehicle-Pro model achieved a high accuracy of 82.3% mAP@50 on the Foggy\nCityscapes dataset while maintaining a detection speed of 43 FPS.\n","authors":["Xiguang Li","Jiafu Chen","Yunhe Sun","Na Lin","Ammar Hawbani","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.17734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14622v2","updated":"2024-10-23T10:06:10Z","published":"2024-02-22T15:10:45Z","title":"From Keywords to Structured Summaries: Streamlining Scholarly\n Information Access","summary":" This paper highlights the growing importance of information retrieval (IR)\nengines in the scientific community, addressing the inefficiency of traditional\nkeyword-based search engines due to the rising volume of publications. The\nproposed solution involves structured records, underpinning advanced\ninformation technology (IT) tools, including visualization dashboards, to\nrevolutionize how researchers access and filter articles, replacing the\ntraditional text-heavy approach. This vision is exemplified through a proof of\nconcept centered on the \"reproductive number estimate of infectious diseases\"\nresearch theme, using a fine-tuned large language model (LLM) to automate the\ncreation of structured records to populate a backend database that now goes\nbeyond keywords. The result is a next-generation information access system as\nan IR method accessible at https://orkg.org/usecases/r0-estimates.\n","authors":["Mahsa Shamsabadi","Jennifer D'Souza"],"pdf_url":"https://arxiv.org/pdf/2402.14622v2.pdf","comment":"8 pages, 3 figures | Accepted for publication as a poster paper at\n the International Semantic Web Conference (ISWC 2024)"},{"id":"http://arxiv.org/abs/2410.17651v1","updated":"2024-10-23T08:09:48Z","published":"2024-10-23T08:09:48Z","title":"Testing Deep Learning Recommender Systems Models on Synthetic\n GAN-Generated Datasets","summary":" The published method Generative Adversarial Networks for Recommender Systems\n(GANRS) allows generating data sets for collaborative filtering recommendation\nsystems. The GANRS source code is available along with a representative set of\ngenerated datasets. We have tested the GANRS method by creating multiple\nsynthetic datasets from three different real datasets taken as a source.\nExperiments include variations in the number of users in the synthetic\ndatasets, as well as a different number of samples. We have also selected six\nstate-of-the-art collaborative filtering deep learning models to test both\ntheir comparative performance and the GANRS method. The results show a\nconsistent behavior of the generated datasets compared to the source ones;\nparticularly, in the obtained values and trends of the precision and recall\nquality measures. The tested deep learning models have also performed as\nexpected on all synthetic datasets, making it possible to compare the results\nwith those obtained from the real source data. 
Future work is proposed,\nincluding different cold start scenarios, unbalanced data, and demographic\nfairness.\n","authors":["Jesús Bobadilla","Abraham Gutiérrez"],"pdf_url":"https://arxiv.org/pdf/2410.17651v1.pdf","comment":"10 pages, 7 figures, In press"},{"id":"http://arxiv.org/abs/2410.17644v1","updated":"2024-10-23T08:01:07Z","published":"2024-10-23T08:01:07Z","title":"Comprehensive Evaluation of Matrix Factorization Models for\n Collaborative Filtering Recommender Systems","summary":" Matrix factorization models are the core of current commercial collaborative\nfiltering Recommender Systems. This paper tested six representative matrix\nfactorization models, using four collaborative filtering datasets. Experiments\nhave tested a variety of accuracy and beyond accuracy quality measures,\nincluding prediction, recommendation of ordered and unordered lists, novelty,\nand diversity. Results show which matrix factorization model is most convenient\ndepending on its simplicity, the required prediction quality, the necessary\nrecommendation quality, the desired recommendation novelty and diversity, the\nneed to explain recommendations, the adequacy of assigning semantic\ninterpretations to hidden factors, the advisability of recommending to groups\nof users, and the need to obtain reliability values. To ensure the\nreproducibility of the experiments, an open framework has been used, and the\nimplementation code is provided.\n","authors":["Jesús Bobadilla","Jorge Dueñas-Lerín","Fernando Ortega","Abraham Gutierrez"],"pdf_url":"https://arxiv.org/pdf/2410.17644v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2410.17614v1","updated":"2024-10-23T07:11:48Z","published":"2024-10-23T07:11:48Z","title":"Extending and Applying Automated HERMES Software Publication Workflows","summary":" Research software is an important output of research and must be published\naccording to the FAIR Principles for Research Software. This can be achieved by\npublishing software with metadata under a persistent identifier. HERMES is a\ntool that leverages continuous integration to automate the publication of\nsoftware with rich metadata. In this work, we describe the HERMES workflow\nitself, and how to extend it to meet the needs of specific research software\nmetadata or infrastructure. We introduce the HERMES plugin architecture and\nprovide the example of creating a new HERMES plugin that harvests metadata from\na metadata source in source code repositories. We show how to use HERMES as an\nend user, both via the command line interface, and as a step in a continuous\nintegration pipeline. Finally, we report three informal case studies whose\nresults provide a preliminary evaluation of the feasibility and applicability\nof HERMES workflows, and the extensibility of the hermes software package.\n","authors":["Sophie Kernchen","Michael Meinel","Stephan Druskat","Michael Fritzsche","David Pape","Oliver Bertuch"],"pdf_url":"https://arxiv.org/pdf/2410.17614v1.pdf","comment":"17 pages, 2 figures, 2 tables, submitted to a special issue of\n Electronic Communications of the EASST collecting submissions of deRSE24,\n Conference for Research Software Engineers"},{"id":"http://arxiv.org/abs/2403.06737v2","updated":"2024-10-23T02:00:35Z","published":"2024-03-11T14:02:24Z","title":"Post-Training Attribute Unlearning in Recommender Systems","summary":" With the growing privacy concerns in recommender systems, recommendation\nunlearning is getting increasing attention. 
Existing studies predominantly use\ntraining data, i.e., model inputs, as unlearning target. However, attackers can\nextract private information from the model even if it has not been explicitly\nencountered during training. We name this unseen information as\n\\textit{attribute} and treat it as unlearning target. To protect the sensitive\nattribute of users, Attribute Unlearning (AU) aims to make target attributes\nindistinguishable. In this paper, we focus on a strict but practical setting of\nAU, namely Post-Training Attribute Unlearning (PoT-AU), where unlearning can\nonly be performed after the training of the recommendation model is completed.\nTo address the PoT-AU problem in recommender systems, we propose a\ntwo-component loss function. The first component is distinguishability loss,\nwhere we design a distribution-based measurement to make attribute labels\nindistinguishable from attackers. We further extend this measurement to handle\nmulti-class attribute cases with efficient computational overhead. The second\ncomponent is regularization loss, where we explore a function-space measurement\nthat effectively maintains recommendation performance compared to\nparameter-space regularization. We use stochastic gradient descent algorithm to\noptimize our proposed loss. Extensive experiments on four real-world datasets\ndemonstrate the effectiveness of our proposed methods.\n","authors":["Chaochao Chen","Yizhao Zhang","Yuyuan Li","Dan Meng","Jun Wang","Xiaoli Zheng","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2403.06737v2.pdf","comment":"Accepted by TOIS. arXiv admin note: text overlap with\n arXiv:2310.05847"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2410.18082v1","updated":"2024-10-23T17:59:52Z","published":"2024-10-23T17:59:52Z","title":"Prioritized Generative Replay","summary":" Sample-efficient online reinforcement learning often uses replay buffers to\nstore experience for reuse when updating the value function. However, uniform\nreplay is inefficient, since certain classes of transitions can be more\nrelevant to learning. While prioritization of more useful samples is helpful,\nthis strategy can also lead to overfitting, as useful samples are likely to be\nmore rare. In this work, we instead propose a prioritized, parametric version\nof an agent's memory, using generative models to capture online experience.\nThis paradigm enables (1) densification of past experience, with new\ngenerations that benefit from the generative model's generalization capacity\nand (2) guidance via a family of \"relevance functions\" that push these\ngenerations towards more useful parts of an agent's acquired history. We show\nthis recipe can be instantiated using conditional diffusion models and simple\nrelevance functions such as curiosity- or value-based metrics. Our approach\nconsistently improves performance and sample efficiency in both state- and\npixel-based domains. We expose the mechanisms underlying these gains, showing\nhow guidance promotes diversity in our generated transitions and reduces\noverfitting. We also showcase how our approach can train policies with even\nhigher update-to-data ratios than before, opening up avenues to better scale\nonline RL agents.\n","authors":["Renhao Wang","Kevin Frans","Pieter Abbeel","Sergey Levine","Alexei A. 
Efros"],"pdf_url":"https://arxiv.org/pdf/2410.18082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18077v1","updated":"2024-10-23T17:58:49Z","published":"2024-10-23T17:58:49Z","title":"ALTA: Compiler-Based Analysis of Transformers","summary":" We propose a new programming language called ALTA and a compiler that can map\nALTA programs to Transformer weights. ALTA is inspired by RASP, a language\nproposed by Weiss et al. (2021), and Tracr (Lindner et al., 2023), a compiler\nfrom RASP programs to Transformer weights. ALTA complements and extends this\nprior work, offering the ability to express loops and to compile programs to\nUniversal Transformers, among other advantages. ALTA allows us to\nconstructively show how Transformers can represent length-invariant algorithms\nfor computing parity and addition, as well as a solution to the SCAN benchmark\nof compositional generalization tasks, without requiring intermediate\nscratchpad decoding steps. We also propose tools to analyze cases where the\nexpressibility of an algorithm is established, but end-to-end training on a\ngiven training set fails to induce behavior consistent with the desired\nalgorithm. To this end, we explore training from ALTA execution traces as a\nmore fine-grained supervision signal. This enables additional experiments and\ntheoretical analyses relating the learnability of various algorithms to data\navailability and modeling decisions, such as positional encodings. We make the\nALTA framework -- language specification, symbolic interpreter, and weight\ncompiler -- available to the community to enable further applications and\ninsights.\n","authors":["Peter Shaw","James Cohan","Jacob Eisenstein","Kenton Lee","Jonathan Berant","Kristina Toutanova"],"pdf_url":"https://arxiv.org/pdf/2410.18077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18076v1","updated":"2024-10-23T17:58:45Z","published":"2024-10-23T17:58:45Z","title":"Leveraging Skills from Unlabeled Prior Data for Efficient Online\n Exploration","summary":" Unsupervised pretraining has been transformative in many supervised domains.\nHowever, applying such ideas to reinforcement learning (RL) presents a unique\nchallenge in that fine-tuning does not involve mimicking task-specific data,\nbut rather exploring and locating the solution through iterative\nself-improvement. In this work, we study how unlabeled prior trajectory data\ncan be leveraged to learn efficient exploration strategies. While prior data\ncan be used to pretrain a set of low-level skills, or as additional off-policy\ndata for online RL, it has been unclear how to combine these ideas effectively\nfor online exploration. Our method SUPE (Skills from Unlabeled Prior data for\nExploration) demonstrates that a careful combination of these ideas compounds\ntheir benefits. Our method first extracts low-level skills using a variational\nautoencoder (VAE), and then pseudo-relabels unlabeled trajectories using an\noptimistic reward model, transforming prior data into high-level, task-relevant\nexamples. Finally, SUPE uses these transformed examples as additional\noff-policy data for online RL to learn a high-level policy that composes\npretrained low-level skills to explore efficiently. We empirically show that\nSUPE reliably outperforms prior strategies, successfully solving a suite of\nlong-horizon, sparse-reward tasks. 
Code: https://github.com/rail-berkeley/supe.\n","authors":["Max Wilcoxson","Qiyang Li","Kevin Frans","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2410.18076v1.pdf","comment":"23 pages, 10 figures"},{"id":"http://arxiv.org/abs/2410.18075v1","updated":"2024-10-23T17:57:14Z","published":"2024-10-23T17:57:14Z","title":"ProFL: Performative Robust Optimal Federated Learning","summary":" Performative prediction (PP) is a framework that captures distribution shifts\nthat occur during the training of machine learning models due to their\ndeployment. As the trained model is used, its generated data could cause the\nmodel to evolve, leading to deviations from the original data distribution. The\nimpact of such model-induced distribution shifts in the federated learning (FL)\nsetup remains unexplored despite being increasingly likely to transpire in\nreal-life use cases. Although Jin et al. (2024) recently extended PP to FL in a\nstraightforward manner, the resulting model only converges to a performative\nstable point, which may be far from optimal. The methods in Izzo et al. (2021);\nMiller et al. (2021) can find a performative optimal point in centralized\nsettings, but they require the performative risk to be convex and the training\ndata to be noiseless, assumptions often violated in realistic FL systems. This\npaper overcomes all of these shortcomings and proposes Performative robust\noptimal Federated Learning (ProFL), an algorithm that finds performative\noptimal points in FL from noisy and contaminated data. We present the\nconvergence analysis under the Polyak-Lojasiewicz condition, which applies to\nnon-convex objectives. Extensive experiments on multiple datasets validate our\nproposed algorithms' efficiency.\n","authors":["Xue Zheng","Tian Xie","Xuwei Tan","Aylin Yener","Xueru Zhang","Ali Payani","Myungjin Lee"],"pdf_url":"https://arxiv.org/pdf/2410.18075v1.pdf","comment":"27 pages with Appendix, 18 figures. The paper has been submitted and\n is currently under review"},{"id":"http://arxiv.org/abs/2410.18074v1","updated":"2024-10-23T17:56:33Z","published":"2024-10-23T17:56:33Z","title":"UnCLe: Unsupervised Continual Learning of Depth Completion","summary":" We propose UnCLe, a standardized benchmark for Unsupervised Continual\nLearning of a multimodal depth estimation task: Depth completion aims to infer\na dense depth map from a pair of synchronized RGB image and sparse depth map.\nWe benchmark depth completion models under the practical scenario of\nunsupervised learning over continuous streams of data. Existing methods are\ntypically trained on a static, or stationary, dataset. However, when adapting\nto novel non-stationary distributions, they \"catastrophically forget\"\npreviously learned information. UnCLe simulates these non-stationary\ndistributions by adapting depth completion models to sequences of datasets\ncontaining diverse scenes captured from distinct domains using different visual\nand range sensors. We adopt representative methods from continual learning\nparadigms and translate them to enable unsupervised continual learning of depth\ncompletion. We benchmark these models for indoor and outdoor and investigate\nthe degree of catastrophic forgetting through standard quantitative metrics.\nFurthermore, we introduce model inversion quality as an additional measure of\nforgetting. 
We find that unsupervised continual learning of depth completion is\nan open problem, and we invite researchers to leverage UnCLe as a development\nplatform.\n","authors":["Suchisrit Gangopadhyay","Xien Chen","Michael Chu","Patrick Rim","Hyoungseob Park","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2410.18074v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.12568v2","updated":"2024-10-23T17:53:24Z","published":"2024-08-22T17:35:18Z","title":"Pruning By Explaining Revisited: Optimizing Attribution Methods to Prune\n CNNs and Transformers","summary":" To solve ever more complex problems, Deep Neural Networks are scaled to\nbillions of parameters, leading to huge computational costs. An effective\napproach to reduce computational requirements and increase efficiency is to\nprune unnecessary components of these often over-parameterized networks.\nPrevious work has shown that attribution methods from the field of eXplainable\nAI serve as effective means to extract and prune the least relevant network\ncomponents in a few-shot fashion. We extend the current state by proposing to\nexplicitly optimize hyperparameters of attribution methods for the task of\npruning, and further include transformer-based networks in our analysis. Our\napproach yields higher model compression rates of large transformer- and\nconvolutional architectures (VGG, ResNet, ViT) compared to previous works,\nwhile still attaining high performance on ImageNet classification tasks. Here,\nour experiments indicate that transformers have a higher degree of\nover-parameterization compared to convolutional neural networks. Code is\navailable at https://github.com/erfanhatefi/Pruning-by-eXplaining-in-PyTorch.\n","authors":["Sayed Mohammad Vakilzadeh Hatefi","Maximilian Dreyer","Reduan Achtibat","Thomas Wiegand","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2408.12568v2.pdf","comment":"Accepted as a workshop paper at ECCV 2024, 26 pages (11 pages\n manuscript, 3 pages references, 12 pages appendix)"},{"id":"http://arxiv.org/abs/2410.18070v1","updated":"2024-10-23T17:53:11Z","published":"2024-10-23T17:53:11Z","title":"Training Free Guided Flow Matching with Optimal Control","summary":" Controlled generation with pre-trained Diffusion and Flow Matching models has\nvast applications. One strategy for guiding ODE-based generative models is\nthrough optimizing a target loss $R(x_1)$ while staying close to the prior\ndistribution. Along this line, some recent work showed the effectiveness of\nguiding flow model by differentiating through its ODE sampling process. Despite\nthe superior performance, the theoretical understanding of this line of methods\nis still preliminary, leaving space for algorithm improvement. Moreover,\nexisting methods predominately focus on Euclidean data manifold, and there is a\ncompelling need for guided flow methods on complex geometries such as SO(3),\nwhich prevails in high-stake scientific applications like protein design. We\npresent OC-Flow, a general and theoretically grounded training-free framework\nfor guided flow matching using optimal control. Building upon advances in\noptimal control theory, we develop effective and practical algorithms for\nsolving optimal control in guided ODE-based generation and provide a systematic\ntheoretical analysis of the convergence guarantee in both Euclidean and SO(3).\nWe show that existing backprop-through-ODE methods can be interpreted as\nspecial cases of Euclidean OC-Flow. 
OC-Flow achieved superior performance in\nextensive experiments on text-guided image manipulation, conditional molecule\ngeneration, and all-atom peptide design.\n","authors":["Luran Wang","Chaoran Cheng","Yizhen Liao","Yanru Qu","Ge Liu"],"pdf_url":"https://arxiv.org/pdf/2410.18070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03185v2","updated":"2024-10-23T17:52:57Z","published":"2024-03-05T18:22:15Z","title":"Correlated Proxies: A New Definition and Improved Mitigation for Reward\n Hacking","summary":" Because it is difficult to precisely specify complex objectives,\nreinforcement learning policies are often optimized using flawed proxy rewards\nthat seem to capture the true objective. However, optimizing proxy rewards\nfrequently leads to reward hacking: the optimized reward function ceases to be\na good proxy, and the resulting policy performs poorly with respect to the\nunspecified true reward. Principled solutions to reward hacking have been\nimpeded by the lack of a good definition for the problem. To address this, we\nintroduce a definition of reward hacking based on the correlation between proxy\nand true rewards for states and actions seen by a \"base policy\" that breaks\ndown under optimization. We show that this definition captures reward hacking\nbehavior across several realistic settings, including in reinforcement learning\nfrom human feedback (RLHF). We then show theoretically that regularization to\nthe base policy can effectively prevent reward hacking. While current RLHF\napproaches apply a KL penalty between the action distributions of policies, our\ntheory suggests that it is more effective to regularize using the $\\chi^2$\ndivergence between the policies' occupancy measures. We intuitively show why\nthis type of regularization is superior and demonstrate that it better\nmitigates reward hacking in practice across four realistic domains, including\nRLHF for LLMs. Our code is available at https://github.com/cassidylaidlaw/orpo.\n","authors":["Cassidy Laidlaw","Shivam Singhal","Anca Dragan"],"pdf_url":"https://arxiv.org/pdf/2403.03185v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18067v1","updated":"2024-10-23T17:48:28Z","published":"2024-10-23T17:48:28Z","title":"Beyond position: how rotary embeddings shape representations and memory\n in autoregressive transfomers","summary":" Rotary Positional Embeddings (RoPE) enhance positional encoding in\nTransformer models, yet their full impact on model dynamics remains\nunderexplored. This paper studies how RoPE introduces position-dependent\nrotations, causing phase shifts in token embeddings that influence\nhigher-frequency components within the model's internal representations.\nThrough spectral analysis, we demonstrate that RoPE's rotation matrices induce\noscillatory behaviors in embeddings, affecting information retention across\nlayers and shaping temporal modeling capabilities. We show that activation\nfunctions in feed-forward networks interact with RoPE-modulated embeddings to\ngenerate harmonics, leading to constructive or destructive interference based\non phase alignment. Our findings reveal that phase alignment amplifies\nactivations and sharpens attention, while misalignment weakens activations and\ndisrupts focus on positional patterns. 
This study underscores the importance of\nfrequency components as intrinsic elements of model behavior, offering new\ninsights beyond traditional analyses.\n","authors":["Valeria Ruscio","Fabrizio Silvestri"],"pdf_url":"https://arxiv.org/pdf/2410.18067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18066v1","updated":"2024-10-23T17:42:54Z","published":"2024-10-23T17:42:54Z","title":"The Double-Edged Sword of Behavioral Responses in Strategic\n Classification: Theory and User Studies","summary":" When humans are subject to an algorithmic decision system, they can\nstrategically adjust their behavior accordingly (``game'' the system). While a\ngrowing line of literature on strategic classification has used game-theoretic\nmodeling to understand and mitigate such gaming, these existing works consider\nstandard models of fully rational agents. In this paper, we propose a strategic\nclassification model that considers behavioral biases in human responses to\nalgorithms. We show how misperceptions of a classifier (specifically, of its\nfeature weights) can lead to different types of discrepancies between biased\nand rational agents' responses, and identify when behavioral agents over- or\nunder-invest in different features. We also show that strategic agents with\nbehavioral biases can benefit or (perhaps, unexpectedly) harm the firm compared\nto fully rational strategic agents. We complement our analytical results with\nuser studies, which support our hypothesis of behavioral biases in human\nresponses to the algorithm. Together, our findings highlight the need to\naccount for human (cognitive) biases when designing AI systems and providing\nexplanations of them to strategic humans in the loop.\n","authors":["Raman Ebrahimi","Kristen Vaccaro","Parinaz Naghizadeh"],"pdf_url":"https://arxiv.org/pdf/2410.18066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15762v2","updated":"2024-10-23T17:42:39Z","published":"2024-07-22T16:13:38Z","title":"Conditional Language Policy: A General Framework for Steerable\n Multi-Objective Finetuning","summary":" Reward-based finetuning is crucial for aligning language policies with\nintended behaviors (e.g., creativity and safety). A key challenge is to develop\nsteerable language models that trade off multiple (conflicting) objectives in a\nflexible and efficient manner. This paper presents Conditional Language Policy\n(CLP), a general framework for finetuning language models on multiple\nobjectives. Building on techniques from multi-task training and\nparameter-efficient finetuning, CLP learns steerable models that effectively\ntrade off conflicting objectives at inference time. Notably, this does not\nrequire training or maintaining multiple models to achieve different trade-offs\nbetween the objectives. Through extensive experiments and ablations on two\nsummarization datasets, we show that CLP learns steerable language models that\noutperform and Pareto-dominate the existing approaches for multi-objective\nfinetuning.\n","authors":["Kaiwen Wang","Rahul Kidambi","Ryan Sullivan","Alekh Agarwal","Christoph Dann","Andrea Michi","Marco Gelmi","Yunxuan Li","Raghav Gupta","Avinava Dubey","Alexandre Ramé","Johan Ferret","Geoffrey Cideron","Le Hou","Hongkun Yu","Amr Ahmed","Aranyak Mehta","Léonard Hussenot","Olivier Bachem","Edouard Leurent"],"pdf_url":"https://arxiv.org/pdf/2407.15762v2.pdf","comment":"40 pages. 
Findings of EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.18065v1","updated":"2024-10-23T17:42:07Z","published":"2024-10-23T17:42:07Z","title":"SPIRE: Synergistic Planning, Imitation, and Reinforcement Learning for\n Long-Horizon Manipulation","summary":" Robot learning has proven to be a general and effective technique for\nprogramming manipulators. Imitation learning is able to teach robots solely\nfrom human demonstrations but is bottlenecked by the capabilities of the\ndemonstrations. Reinforcement learning uses exploration to discover better\nbehaviors; however, the space of possible improvements can be too large to\nstart from scratch. And for both techniques, the learning difficulty increases\nproportional to the length of the manipulation task. Accounting for this, we\npropose SPIRE, a system that first uses Task and Motion Planning (TAMP) to\ndecompose tasks into smaller learning subproblems and second combines imitation\nand reinforcement learning to maximize their strengths. We develop novel\nstrategies to train learning agents when deployed in the context of a planning\nsystem. We evaluate SPIRE on a suite of long-horizon and contact-rich robot\nmanipulation problems. We find that SPIRE outperforms prior approaches that\nintegrate imitation learning, reinforcement learning, and planning by 35% to\n50% in average task performance, is 6 times more data efficient in the number\nof human demonstrations needed to train proficient agents, and learns to\ncomplete tasks nearly twice as efficiently. View\nhttps://sites.google.com/view/spire-corl-2024 for more details.\n","authors":["Zihan Zhou","Animesh Garg","Dieter Fox","Caelan Garrett","Ajay Mandlekar"],"pdf_url":"https://arxiv.org/pdf/2410.18065v1.pdf","comment":"Conference on Robot Learning (CoRL) 2024"},{"id":"http://arxiv.org/abs/2410.18038v1","updated":"2024-10-23T17:06:56Z","published":"2024-10-23T17:06:56Z","title":"POD-Attention: Unlocking Full Prefill-Decode Overlap for Faster LLM\n Inference","summary":" Each request in LLM inference goes through two phases: compute-bound prefill\nand memory-bandwidth-bound decode. To improve GPU utilization, recent systems\nuse hybrid batching that combines the prefill and decode phases of different\nrequests into the same batch. Hybrid batching works well for linear operations\nas it amortizes the cost of loading model weights from HBM. However, attention\ncomputation in hybrid batches remains inefficient because existing attention\nkernels are optimized for either prefill or decode.\n In this paper, we present POD-Attention -- the first GPU kernel that\nefficiently computes attention for hybrid batches. POD-Attention aims to\nmaximize the utilization of both compute and memory bandwidth by carefully\nallocating the GPU's resources such that prefill and decode operations happen\nconcurrently on the same multiprocessor. We integrate POD-Attention in a\nstate-of-the-art LLM inference scheduler Sarathi-Serve. POD-Attention speeds up\nattention computation by up to 75% (mean 28%) and increases LLM serving\nthroughput by up to 22% in offline inference. 
In online inference,\nPOD-Attention enables lower time-to-first-token (TTFT), time-between-tokens\n(TBT), and request execution latency versus Sarathi-Serve.\n","authors":["Aditya K Kamath","Ramya Prabhu","Jayashree Mohan","Simon Peter","Ramachandran Ramjee","Ashish Panwar"],"pdf_url":"https://arxiv.org/pdf/2410.18038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04429v2","updated":"2024-10-23T16:42:06Z","published":"2024-09-06T17:49:56Z","title":"VILA-U: a Unified Foundation Model Integrating Visual Understanding and\n Generation","summary":" VILA-U is a Unified foundation model that integrates Video, Image, Language\nunderstanding and generation. Traditional visual language models (VLMs) use\nseparate modules for understanding and generating visual content, which can\nlead to misalignment and increased complexity. In contrast, VILA-U employs a\nsingle autoregressive next-token prediction framework for both tasks,\neliminating the need for additional components like diffusion models. This\napproach not only simplifies the model but also achieves near state-of-the-art\nperformance in visual language understanding and generation. The success of\nVILA-U is attributed to two main factors: the unified vision tower that aligns\ndiscrete visual tokens with textual inputs during pretraining, which enhances\nvisual perception, and autoregressive image generation can achieve similar\nquality as diffusion models with high-quality dataset. This allows VILA-U to\nperform comparably to more complex models using a fully token-based\nautoregressive framework.\n","authors":["Yecheng Wu","Zhuoyang Zhang","Junyu Chen","Haotian Tang","Dacheng Li","Yunhao Fang","Ligeng Zhu","Enze Xie","Hongxu Yin","Li Yi","Song Han","Yao Lu"],"pdf_url":"https://arxiv.org/pdf/2409.04429v2.pdf","comment":"Code: https://github.com/mit-han-lab/vila-u. The first two authors\n contributed equally to this work"},{"id":"http://arxiv.org/abs/2409.19841v2","updated":"2024-10-23T16:27:27Z","published":"2024-09-30T00:47:13Z","title":"Counter-Current Learning: A Biologically Plausible Dual Network Approach\n for Deep Learning","summary":" Despite its widespread use in neural networks, error backpropagation has\nfaced criticism for its lack of biological plausibility, suffering from issues\nsuch as the backward locking problem and the weight transport problem. These\nlimitations have motivated researchers to explore more biologically plausible\nlearning algorithms that could potentially shed light on how biological neural\nsystems adapt and learn. Inspired by the counter-current exchange mechanisms\nobserved in biological systems, we propose counter-current learning (CCL), a\nbiologically plausible framework for credit assignment in neural networks. This\nframework employs a feedforward network to process input data and a feedback\nnetwork to process targets, with each network enhancing the other through\nanti-parallel signal propagation. By leveraging the more informative signals\nfrom the bottom layer of the feedback network to guide the updates of the top\nlayer of the feedforward network and vice versa, CCL enables the simultaneous\ntransformation of source inputs to target outputs and the dynamic mutual\ninfluence of these transformations. 
Experimental results on MNIST,\nFashionMNIST, CIFAR10, and CIFAR100 datasets using multi-layer perceptrons and\nconvolutional neural networks demonstrate that CCL achieves comparable\nperformance to other biologically plausible algorithms while offering a more\nbiologically realistic learning mechanism. Furthermore, we showcase the\napplicability of our approach to an autoencoder task, underscoring its\npotential for unsupervised representation learning. Our work presents a\ndirection for biologically inspired and plausible learning algorithms, offering\nan alternative mechanism of learning and adaptation in neural networks.\n","authors":["Chia-Hsiang Kao","Bharath Hariharan"],"pdf_url":"https://arxiv.org/pdf/2409.19841v2.pdf","comment":"Accepted at NeurIPS 2024. Code available at\n https://github.com/IandRover/CCL-NeurIPS24"},{"id":"http://arxiv.org/abs/2409.17270v2","updated":"2024-10-23T16:27:20Z","published":"2024-09-25T18:35:45Z","title":"Proof of Thought : Neurosymbolic Program Synthesis allows Robust and\n Interpretable Reasoning","summary":" Large Language Models (LLMs) have revolutionized natural language processing,\nyet they struggle with inconsistent reasoning, particularly in novel domains\nand complex logical sequences. This research introduces Proof of Thought, a\nframework that enhances the reliability and transparency of LLM outputs. Our\napproach bridges LLM-generated ideas with formal logic verification, employing\na custom interpreter to convert LLM outputs into First Order Logic constructs\nfor theorem prover scrutiny. Central to our method is an intermediary\nJSON-based Domain-Specific Language, which by design balances precise logical\nstructures with intuitive human concepts. This hybrid representation enables\nboth rigorous validation and accessible human comprehension of LLM reasoning\nprocesses. Key contributions include a robust type system with sort management\nfor enhanced logical integrity, explicit representation of rules for clear\ndistinction between factual and inferential knowledge, and a flexible\narchitecture that allows for easy extension to various domain-specific\napplications. We demonstrate Proof of Thought's effectiveness through\nbenchmarking on StrategyQA and a novel multimodal reasoning task, showing\nimproved performance in open-ended scenarios. By providing verifiable and\ninterpretable results, our technique addresses critical needs for AI system\naccountability and sets a foundation for human-in-the-loop oversight in\nhigh-stakes domains.\n","authors":["Debargha Ganguly","Srinivasan Iyengar","Vipin Chaudhary","Shivkumar Kalyanaraman"],"pdf_url":"https://arxiv.org/pdf/2409.17270v2.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024) System 2 Reasoning At Scale Workshop"},{"id":"http://arxiv.org/abs/2410.18003v1","updated":"2024-10-23T16:25:36Z","published":"2024-10-23T16:25:36Z","title":"Inferring stability properties of chaotic systems on autoencoders'\n latent spaces","summary":" The data-driven learning of solutions of partial differential equations can\nbe based on a divide-and-conquer strategy. First, the high dimensional data is\ncompressed to a latent space with an autoencoder; and, second, the temporal\ndynamics are inferred on the latent space with a form of recurrent neural\nnetwork. In chaotic systems and turbulence, convolutional autoencoders and echo\nstate networks (CAE-ESN) successfully forecast the dynamics, but little is\nknown about whether the stability properties can also be inferred. 
We show that\nthe CAE-ESN model infers the invariant stability properties and the geometry of\nthe tangent space in the low-dimensional manifold (i.e. the latent space)\nthrough Lyapunov exponents and covariant Lyapunov vectors. This work opens up\nnew opportunities for inferring the stability of high-dimensional chaotic\nsystems in latent spaces.\n","authors":["Elise Özalp","Luca Magri"],"pdf_url":"https://arxiv.org/pdf/2410.18003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10992v2","updated":"2024-10-23T16:19:06Z","published":"2024-06-24T09:29:14Z","title":"AlleNoise: large-scale text classification benchmark dataset with\n real-world label noise","summary":" Label noise remains a challenge for training robust classification models.\nMost methods for mitigating label noise have been benchmarked using primarily\ndatasets with synthetic noise. While the need for datasets with realistic noise\ndistribution has partially been addressed by web-scraped benchmarks such as\nWebVision and Clothing1M, those benchmarks are restricted to the computer\nvision domain. With the growing importance of Transformer-based models, it is\ncrucial to establish text classification benchmarks for learning with noisy\nlabels. In this paper, we present AlleNoise, a new curated text classification\nbenchmark dataset with real-world instance-dependent label noise, containing\nover 500,000 examples across approximately 5,600 classes, complemented with a\nmeaningful, hierarchical taxonomy of categories. The noise distribution comes\nfrom actual users of a major e-commerce marketplace, so it realistically\nreflects the semantics of human mistakes. In addition to the noisy labels, we\nprovide human-verified clean labels, which help to get a deeper insight into\nthe noise distribution, unlike web-scraped datasets typically used in the\nfield. We demonstrate that a representative selection of established methods\nfor learning with noisy labels is inadequate to handle such real-world noise.\nIn addition, we show evidence that these algorithms do not alleviate excessive\nmemorization. As such, with AlleNoise, we set the bar high for the development\nof label noise methods that can handle real-world label noise in text\nclassification tasks. The code and dataset are available for download at\nhttps://github.com/allegro/AlleNoise.\n","authors":["Alicja Rączkowska","Aleksandra Osowska-Kurczab","Jacek Szczerbiński","Kalina Jasinska-Kobus","Klaudia Nazarko"],"pdf_url":"https://arxiv.org/pdf/2407.10992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17998v1","updated":"2024-10-23T16:12:59Z","published":"2024-10-23T16:12:59Z","title":"Estimating the Spectral Moments of the Kernel Integral Operator from\n Finite Sample Matrices","summary":" Analyzing the structure of sampled features from an input data distribution\nis challenging when constrained by limited measurements in both the number of\ninputs and features. Traditional approaches often rely on the eigenvalue\nspectrum of the sample covariance matrix derived from finite measurement\nmatrices; however, these spectra are sensitive to the size of the measurement\nmatrix, leading to biased insights. In this paper, we introduce a novel\nalgorithm that provides unbiased estimates of the spectral moments of the\nkernel integral operator in the limit of infinite inputs and features from\nfinitely sampled measurement matrices. Our method, based upon dynamic\nprogramming, is efficient and capable of estimating the moments of the operator\nspectrum. 
We demonstrate the accuracy of our estimator on radial basis function\n(RBF) kernels, highlighting its consistency with the theoretical spectra.\nFurthermore, we showcase the practical utility and robustness of our method in\nunderstanding the geometry of learned representations in neural networks.\n","authors":["Chanwoo Chun","SueYeon Chung","Daniel D. Lee"],"pdf_url":"https://arxiv.org/pdf/2410.17998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02131v4","updated":"2024-10-23T16:05:11Z","published":"2024-06-04T09:18:20Z","title":"CondTSF: One-line Plugin of Dataset Condensation for Time Series\n Forecasting","summary":" Dataset condensation is a newborn technique that generates a small dataset\nthat can be used in training deep neural networks to lower training costs. The\nobjective of dataset condensation is to ensure that the model trained with the\nsynthetic dataset can perform comparably to the model trained with full\ndatasets. However, existing methods predominantly concentrate on classification\ntasks, posing challenges in their adaptation to time series forecasting\n(TS-forecasting). This challenge arises from disparities in the evaluation of\nsynthetic data. In classification, the synthetic data is considered\nwell-distilled if the model trained with the full dataset and the model trained\nwith the synthetic dataset yield identical labels for the same input,\nregardless of variations in output logits distribution. Conversely, in\nTS-forecasting, the effectiveness of synthetic data distillation is determined\nby the distance between predictions of the two models. The synthetic data is\ndeemed well-distilled only when all data points within the predictions are\nsimilar. Consequently, TS-forecasting has a more rigorous evaluation\nmethodology compared to classification. To mitigate this gap, we theoretically\nanalyze the optimization objective of dataset condensation for TS-forecasting\nand propose a new one-line plugin of dataset condensation designated as Dataset\nCondensation for Time Series Forecasting (CondTSF) based on our analysis.\nPlugging CondTSF into previous dataset condensation methods facilitates a\nreduction in the distance between the predictions of the model trained with the\nfull dataset and the model trained with the synthetic dataset, thereby\nenhancing performance. We conduct extensive experiments on eight commonly used\ntime series datasets. CondTSF consistently improves the performance of all\nprevious dataset condensation methods across all datasets, particularly at low\ncondensing ratios.\n","authors":["Jianrong Ding","Zhanyu Liu","Guanjie Zheng","Haiming Jin","Linghe Kong"],"pdf_url":"https://arxiv.org/pdf/2406.02131v4.pdf","comment":"Accepted by NeurIPS 2024, the project can be found at\n https://github.com/RafaDD/CondTSF"},{"id":"http://arxiv.org/abs/2304.10650v2","updated":"2024-10-23T16:03:19Z","published":"2023-04-20T21:25:33Z","title":"Learning a quantum computer's capability","summary":" Accurately predicting a quantum computer's capability -- which circuits it\ncan run and how well it can run them -- is a foundational goal of quantum\ncharacterization and benchmarking. As modern quantum computers become\nincreasingly hard to simulate, we must develop accurate and scalable predictive\ncapability models to help researchers and stakeholders decide which quantum\ncomputers to build and use. 
In this work, we propose a hardware-agnostic method\nto efficiently construct scalable predictive models of a quantum computer's\ncapability for almost any class of circuits, and demonstrate our method using\nconvolutional neural networks (CNNs). Our CNN-based approach works by\nefficiently representing a circuit as a three-dimensional tensor and then using\na CNN to predict its success rate. Our CNN capability models obtain\napproximately a $1\%$ average absolute prediction error when modeling\nprocessors experiencing both Markovian and non-Markovian stochastic Pauli\nerrors. We also apply our CNNs to model the capabilities of cloud-access\nquantum computing systems, obtaining moderate prediction accuracy (average\nabsolute error around $2-5\%$), and we highlight the challenges to building\nbetter neural network capability models.\n","authors":["Daniel Hothem","Kevin Young","Tommie Catanach","Timothy Proctor"],"pdf_url":"https://arxiv.org/pdf/2304.10650v2.pdf","comment":"20 pages, 11 figures, plus appendices"},{"id":"http://arxiv.org/abs/2410.17986v1","updated":"2024-10-23T16:00:14Z","published":"2024-10-23T16:00:14Z","title":"Federated Transformer: Multi-Party Vertical Federated Learning on\n Practical Fuzzily Linked Data","summary":" Federated Learning (FL) is an evolving paradigm that enables multiple parties\nto collaboratively train models without sharing raw data. Among its variants,\nVertical Federated Learning (VFL) is particularly relevant in real-world,\ncross-organizational collaborations, where distinct features of a shared\ninstance group are contributed by different parties. In these scenarios,\nparties are often linked using fuzzy identifiers, leading to a common practice\ntermed multi-party fuzzy VFL. Existing models generally address either\nmulti-party VFL or fuzzy VFL between two parties. Extending these models to\npractical multi-party fuzzy VFL typically results in significant performance\ndegradation and increased costs for maintaining privacy. To overcome these\nlimitations, we introduce the Federated Transformer (FeT), a novel framework\nthat supports multi-party VFL with fuzzy identifiers. FeT innovatively encodes\nthese identifiers into data representations and employs a transformer\narchitecture distributed across different parties, incorporating three new\ntechniques to enhance performance. Furthermore, we have developed a multi-party\nprivacy framework for VFL that integrates differential privacy with secure\nmulti-party computation, effectively protecting local representations while\nminimizing associated utility costs. Our experiments demonstrate that the FeT\nsurpasses the baseline models by up to 46\% in terms of accuracy when scaled to\n50 parties. Additionally, in two-party fuzzy VFL settings, FeT also shows\nimproved performance and privacy over cutting-edge VFL models.\n","authors":["Zhaomin Wu","Junyi Hou","Yiqun Diao","Bingsheng He"],"pdf_url":"https://arxiv.org/pdf/2410.17986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17980v1","updated":"2024-10-23T15:51:13Z","published":"2024-10-23T15:51:13Z","title":"Stick-breaking Attention","summary":" The self-attention mechanism traditionally relies on the softmax operator,\nnecessitating positional embeddings like RoPE, or position biases to account\nfor token order. But current methods still face length generalisation\nchallenges. 
We propose an alternative attention mechanism based on the\nstick-breaking process: For each token before the current, we determine a break\npoint $\\beta_{i,j}$, which represents the proportion of the remaining stick to\nallocate to the current token. We repeat the process until the stick is fully\nallocated, resulting in a sequence of attention weights. This process naturally\nincorporates recency bias, which has linguistic motivations for grammar parsing\n(Shen et. al., 2017). We study the implications of replacing the conventional\nsoftmax-based attention mechanism with stick-breaking attention. We then\ndiscuss implementation of numerically stable stick-breaking attention and adapt\nFlash Attention to accommodate this mechanism. When used as a drop-in\nreplacement for current softmax+RoPE attention systems, we find that\nstick-breaking attention performs competitively with current methods on length\ngeneralisation and downstream tasks. Stick-breaking also performs well at\nlength generalisation, allowing a model trained with $2^{11}$ context window to\nperform well at $2^{14}$ with perplexity improvements.\n","authors":["Shawn Tan","Yikang Shen","Songlin Yang","Aaron Courville","Rameswar Panda"],"pdf_url":"https://arxiv.org/pdf/2410.17980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02447v3","updated":"2024-10-23T15:48:45Z","published":"2024-06-04T16:12:27Z","title":"Federated Class-Incremental Learning with Hierarchical Generative\n Prototypes","summary":" Federated Learning (FL) aims at unburdening the training of deep models by\ndistributing computation across multiple devices (clients) while safeguarding\ndata privacy. On top of that, Federated Continual Learning (FCL) also accounts\nfor data distribution evolving over time, mirroring the dynamic nature of\nreal-world environments. While previous studies have identified Catastrophic\nForgetting and Client Drift as primary causes of performance degradation in\nFCL, we shed light on the importance of Incremental Bias and Federated Bias,\nwhich cause models to prioritize classes that are recently introduced or\nlocally predominant, respectively. Our proposal constrains both biases in the\nlast layer by efficiently finetuning a pre-trained backbone using learnable\nprompts, resulting in clients that produce less biased representations and more\nbiased classifiers. Therefore, instead of solely relying on parameter\naggregation, we leverage generative prototypes to effectively balance the\npredictions of the global model. Our method significantly improves the current\nState Of The Art, providing an average increase of +7.8% in accuracy. Code to\nreproduce the results is provided in the suppl. material.\n","authors":["Riccardo Salami","Pietro Buzzega","Matteo Mosconi","Mattia Verasani","Simone Calderara"],"pdf_url":"https://arxiv.org/pdf/2406.02447v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14979v2","updated":"2024-10-23T15:43:28Z","published":"2024-10-19T05:01:56Z","title":"Do Large Language Models Truly Grasp Mathematics? An Empirical\n Exploration","summary":" Despite their proficiency in math tasks, the mechanisms underlying LLMs'\nmathematical reasoning abilities remain a subject of debate. Recent studies\nsuggest that chain-of-thought (CoT) prompts can bolster mathematical reasoning\nby encouraging LLMs to employ human-like logical reasoning (System 2), enabling\nthem to excel on the Cognitive Reflection Test (CRT). 
To assess whether LLMs\ngenuinely possess System 2-like logical reasoning, we introduced targeted\nmodifications to CRT problems. Our findings reveal that, despite the use of CoT\nprompts, mainstream LLMs, including the latest o1-preview model, continue to\nexhibit a significant error rate. Further analysis indicates that they\npredominantly rely on System 1-like intuitive reasoning and pattern matching\nderived from training data, rather than demonstrating mastery of mathematical\nthinking. This discovery challenges the prevailing notion that LLMs possess\ngenuine logical reasoning abilities and that CoT can enhance them.\nConsequently, this work may temper overly optimistic projections regarding\nLLMs' advancement toward artificial general intelligence.\n","authors":["Wei Xie","Shuoyoucheng Ma","Zhenhua Wang","Enze Wang","Baosheng Wang","Jinshu Su"],"pdf_url":"https://arxiv.org/pdf/2410.14979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17976v1","updated":"2024-10-23T15:39:08Z","published":"2024-10-23T15:39:08Z","title":"metasnf: Meta Clustering with Similarity Network Fusion in R","summary":" metasnf is an R package that enables users to apply meta clustering, a method\nfor efficiently searching a broad space of cluster solutions by clustering the\nsolutions themselves, to clustering workflows based on similarity network\nfusion (SNF). SNF is a multi-modal data integration algorithm commonly used for\nbiomedical subtype discovery. The package also contains functions to assist\nwith cluster visualization, characterization, and validation. This package can\nhelp researchers identify SNF-derived cluster solutions that are guided by\ncontext-specific utility over context-agnostic measures of quality.\n","authors":["Prashanth S Velayudhan","Xiaoqiao Xu","Prajkta Kallurkar","Ana Patricia Balbon","Maria T Secara","Adam Taback","Denise Sabac","Nicholas Chan","Shihao Ma","Bo Wang","Daniel Felsky","Stephanie H Ameis","Brian Cox","Colin Hawco","Lauren Erdman","Anne L Wheeler"],"pdf_url":"https://arxiv.org/pdf/2410.17976v1.pdf","comment":"72 pages, 22 figures, submitted to Journal of Statistical Software"},{"id":"http://arxiv.org/abs/2410.17970v1","updated":"2024-10-23T15:36:08Z","published":"2024-10-23T15:36:08Z","title":"Optical Generative Models","summary":" Generative models cover various application areas, including image, video and\nmusic synthesis, natural language processing, and molecular design, among many\nothers. As digital generative models become larger, scalable inference in a\nfast and energy-efficient manner becomes a challenge. Here, we present optical\ngenerative models inspired by diffusion models, where a shallow and fast\ndigital encoder first maps random noise into phase patterns that serve as\noptical generative seeds for a desired data distribution; a jointly-trained\nfree-space-based reconfigurable decoder all-optically processes these\ngenerative seeds to create novel images (never seen before) following the\ntarget data distribution. Except for the illumination power and the random seed\ngeneration through a shallow encoder, these optical generative models do not\nconsume computing power during the synthesis of novel images. 
We report the\noptical generation of monochrome and multi-color novel images of handwritten\ndigits, fashion products, butterflies, and human faces, following the data\ndistributions of MNIST, Fashion MNIST, Butterflies-100, and Celeb-A datasets,\nrespectively, achieving an overall performance comparable to digital neural\nnetwork-based generative models. To experimentally demonstrate optical\ngenerative models, we used visible light to generate, in a snapshot, novel\nimages of handwritten digits and fashion products. These optical generative\nmodels might pave the way for energy-efficient, scalable and rapid inference\ntasks, further exploiting the potentials of optics and photonics for artificial\nintelligence-generated content.\n","authors":["Shiqi Chen","Yuhang Li","Hanlong Chen","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2410.17970v1.pdf","comment":"24 Pages, 9 Figures"},{"id":"http://arxiv.org/abs/2410.11709v2","updated":"2024-10-23T15:35:57Z","published":"2024-10-15T15:46:03Z","title":"On the potential of Optimal Transport in Geospatial Data Science","summary":" Prediction problems in geographic information science and transportation are\noften motivated by the possibility to enhance operational efficiency and\nthereby reduce emissions. Examples range from predicting car sharing demand for\nrelocation planning to forecasting traffic congestion for navigation purposes.\nHowever, conventional accuracy metrics ignore the spatial distribution of the\nerrors, despite its relevance for operations. Here, we put forward a spatially\naware evaluation metric and loss function based on Optimal Transport (OT). Our\nframework leverages partial OT and can minimize relocation costs in any spatial\nprediction problem. We showcase the advantages of OT-based evaluation over\nconventional metrics and further demonstrate the application of an OT loss\nfunction for improving forecasts of bike sharing demand and charging station\noccupancy. Thus, our framework not only aligns with operational considerations,\nbut also signifies a step forward in refining predictions within geospatial\napplications. All code is available at https://github.com/mie-lab/geospatialOT.\n","authors":["Nina Wiedemann","Théo Uscidda","Martin Raubal"],"pdf_url":"https://arxiv.org/pdf/2410.11709v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17967v1","updated":"2024-10-23T15:34:11Z","published":"2024-10-23T15:34:11Z","title":"POMDP-Driven Cognitive Massive MIMO Radar: Joint Target\n Detection-Tracking In Unknown Disturbances","summary":" The joint detection and tracking of a moving target embedded in an unknown\ndisturbance represents a key feature that motivates the development of the\ncognitive radar paradigm. Building upon recent advancements in robust target\ndetection with multiple-input multiple-output (MIMO) radars, this work explores\nthe application of a Partially Observable Markov Decision Process (POMDP)\nframework to enhance the tracking and detection tasks in a statistically\nunknown environment. In the POMDP setup, the radar system is considered as an\nintelligent agent that continuously senses the surrounding environment,\noptimizing its actions to maximize the probability of detection $(P_D)$ and\nimprove the target position and velocity estimation, all this while keeping a\nconstant probability of false alarm $(P_{FA})$. 
The proposed approach employs\nan online algorithm that does not require any a priori knowledge of the noise\nstatistics, and it relies on a much more general observation model than the\ntraditional range-azimuth-elevation model employed by conventional tracking\nalgorithms. Simulation results clearly show substantial performance improvement\nof the POMDP-based algorithm compared to the State-Action-Reward-State-Action\n(SARSA)-based one that has been recently investigated in the context of massive\nMIMO (MMIMO) radar systems.\n","authors":["Imad Bouhou","Stefano Fortunati","Leila Gharsalli","Alexandre Renaux"],"pdf_url":"https://arxiv.org/pdf/2410.17967v1.pdf","comment":"The paper has been submitted to IEEE Transactions on Radar Systems"},{"id":"http://arxiv.org/abs/2401.11576v3","updated":"2024-10-23T15:30:50Z","published":"2024-01-21T19:53:17Z","title":"Quantum Architecture Search with Unsupervised Representation Learning","summary":" Unsupervised representation learning presents new opportunities for advancing\nQuantum Architecture Search (QAS) on Noisy Intermediate-Scale Quantum (NISQ)\ndevices. QAS is designed to optimize quantum circuits for Variational Quantum\nAlgorithms (VQAs). Most QAS algorithms tightly couple the search space and\nsearch algorithm, typically requiring the evaluation of numerous quantum\ncircuits, resulting in high computational costs and limiting scalability to\nlarger quantum circuits. Predictor-based QAS algorithms mitigate this issue by\nestimating circuit performance based on structure or embedding. However, these\nmethods often demand time-intensive labeling to optimize gate parameters across\nmany circuits, which is crucial for training accurate predictors. Inspired by\nthe classical neural architecture search algorithm Arch2vec, we investigate the\npotential of unsupervised representation learning for QAS without relying on\npredictors. Our framework decouples unsupervised architecture representation\nlearning from the search process, enabling the learned representations to be\napplied across various downstream tasks. Additionally, it integrates an\nimproved quantum circuit graph encoding scheme, addressing the limitations of\nexisting representations and enhancing search efficiency. This predictor-free\napproach removes the need for large labeled datasets. During the search, we\nemploy REINFORCE and Bayesian Optimization to explore the latent representation\nspace and compare their performance against baseline methods. Our results\ndemonstrate that the framework efficiently identifies high-performing quantum\ncircuits with fewer search iterations.\n","authors":["Yize Sun","Zixin Wu","Yunpu Ma","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2401.11576v3.pdf","comment":"9 Pages, quantum architecture search, unsupervised representation\n learning"},{"id":"http://arxiv.org/abs/2410.17963v1","updated":"2024-10-23T15:30:37Z","published":"2024-10-23T15:30:37Z","title":"A Time-Aware Approach to Early Detection of Anorexia: UNSL at eRisk 2024","summary":" The eRisk laboratory aims to address issues related to early risk detection\non the Web. In this year's edition, three tasks were proposed, where Task 2 was\nabout early detection of signs of anorexia. Early risk detection is a problem\nwhere precision and speed are two crucial objectives. Our research group solved\nTask 2 by defining a CPI+DMC approach, addressing both objectives\nindependently, and a time-aware approach, where precision and speed are\nconsidered a combined single objective. 
We implemented the last approach by\nexplicitly integrating time during the learning process, considering the\nERDE$_{\theta}$ metric as the training objective. It also allowed us to\nincorporate temporal metrics to validate and select the optimal models. We\nachieved outstanding results for the ERDE50 metric and ranking-based metrics,\ndemonstrating consistency in solving ERD problems.\n","authors":["Horacio Thompson","Marcelo Errecalde"],"pdf_url":"https://arxiv.org/pdf/2410.17963v1.pdf","comment":"In Conference and Labs of the Evaluation Forum (CLEF 2024), Grenoble,\n France"},{"id":"http://arxiv.org/abs/2410.17961v1","updated":"2024-10-23T15:30:13Z","published":"2024-10-23T15:30:13Z","title":"Closed-form merging of parameter-efficient modules for Federated\n Continual Learning","summary":" Model merging has emerged as a crucial technique in Deep Learning, enabling\nthe integration of multiple models into a unified system while preserving\nperformance and scalability. In this respect, the compositional properties of\nlow-rank adaptation techniques (e.g., LoRA) have proven beneficial, as simply\naveraging LoRA modules yields a single model that mostly integrates the\ncapabilities of all individual modules. Building on LoRA, we take a step\nfurther by imposing that the merged model matches the responses of all learned\nmodules. Solving this objective in closed form yields an indeterminate system\nwith A and B as unknown variables, indicating the existence of infinitely many\nclosed-form solutions. To address this challenge, we introduce LoRM, an\nalternating optimization strategy that trains one LoRA matrix at a time. This\nallows solving for each unknown variable individually, thus finding a unique\nsolution. We apply our proposed methodology to Federated Class-Incremental\nLearning (FCIL), ensuring alignment of model responses both between clients and\nacross tasks. Our method demonstrates state-of-the-art performance across a\nrange of FCIL scenarios.\n","authors":["Riccardo Salami","Pietro Buzzega","Matteo Mosconi","Jacopo Bonato","Luigi Sabetta","Simone Calderara"],"pdf_url":"https://arxiv.org/pdf/2410.17961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08649v2","updated":"2024-10-23T15:29:28Z","published":"2024-06-12T21:18:14Z","title":"MOTIVE: A Drug-Target Interaction Graph For Inductive Link Prediction","summary":" Drug-target interaction (DTI) prediction is crucial for identifying new\ntherapeutics and detecting mechanisms of action. While structure-based methods\naccurately model physical interactions between a drug and its protein target,\ncell-based assays such as Cell Painting can better capture complex DTI\ninteractions. This paper introduces MOTIVE, a Morphological cOmpound Target\nInteraction Graph dataset comprising Cell Painting features for 11,000 genes\nand 3,600 compounds, along with their relationships extracted from seven\npublicly available databases. We provide random, cold-source (new drugs), and\ncold-target (new genes) data splits to enable rigorous evaluation under\nrealistic use cases. Our benchmark results show that graph neural networks that\nuse Cell Painting features consistently outperform those that learn from graph\nstructure alone, feature-based models, and topological heuristics. MOTIVE\naccelerates both graph ML research and drug discovery by promoting the\ndevelopment of more reliable DTI prediction models. 
MOTIVE resources are\navailable at https://github.com/carpenter-singh-lab/motive.\n","authors":["John Arevalo","Ellen Su","Anne E Carpenter","Shantanu Singh"],"pdf_url":"https://arxiv.org/pdf/2406.08649v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09063v3","updated":"2024-10-23T15:28:59Z","published":"2023-05-15T23:12:15Z","title":"Bounded KRnet and its applications to density estimation and\n approximation","summary":" In this paper, we develop an invertible mapping, called B-KRnet, on a bounded\ndomain and apply it to density estimation/approximation for data or the\nsolutions of PDEs such as the Fokker-Planck equation and the Keller-Segel\nequation. Similar to KRnet, the structure of B-KRnet adapts the\npseudo-triangular structure into a normalizing flow model. The main difference\nbetween B-KRnet and KRnet is that B-KRnet is defined on a hypercube while KRnet\nis defined on the whole space; in other words, a new mechanism is introduced in\nB-KRnet to maintain the exact invertibility. Using B-KRnet as a transport map,\nwe obtain an explicit probability density function (PDF) model that corresponds\nto the pushforward of a prior (uniform) distribution on the hypercube. It can\nbe directly applied to density estimation when only data are available. By\ncoupling KRnet and B-KRnet, we define a deep generative model on a\nhigh-dimensional domain where some dimensions are bounded and other dimensions\nare unbounded. A typical case is the solution of the stationary kinetic\nFokker-Planck equation, which is a PDF of position and momentum. Based on\nB-KRnet, we develop an adaptive learning approach to approximate partial\ndifferential equations whose solutions are PDFs or can be treated as PDFs. A\nvariety of numerical experiments is presented to demonstrate the effectiveness\nof B-KRnet.\n","authors":["Li Zeng","Xiaoliang Wan","Tao Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.09063v3.pdf","comment":"26 pages, 13 figures"},{"id":"http://arxiv.org/abs/2201.12091v5","updated":"2024-10-23T15:28:38Z","published":"2022-01-28T13:00:17Z","title":"Linear Adversarial Concept Erasure","summary":" Modern neural models trained on textual data rely on pre-trained\nrepresentations that emerge without direct supervision. As these\nrepresentations are increasingly being used in real-world applications, the\ninability to control their content becomes an increasingly important\nproblem. We formulate the problem of identifying and erasing a linear subspace\nthat corresponds to a given concept, in order to prevent linear predictors from\nrecovering the concept. We model this problem as a constrained, linear maximin\ngame, and show that existing solutions are generally not optimal for this task.\nWe derive a closed-form solution for certain objectives, and propose a convex\nrelaxation that works well for others. When evaluated in the context\nof binary gender removal, the method recovers a low-dimensional subspace whose\nremoval mitigates bias, as measured by intrinsic and extrinsic evaluation. 
We show that the\nmethod is highly expressive, effectively mitigating bias in deep nonlinear\nclassifiers while maintaining tractability and interpretability.\n","authors":["Shauli Ravfogel","Michael Twiton","Yoav Goldberg","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2201.12091v5.pdf","comment":"Accepted in ICML 2022; a revised version"},{"id":"http://arxiv.org/abs/2410.17959v1","updated":"2024-10-23T15:28:25Z","published":"2024-10-23T15:28:25Z","title":"Medical Imaging Complexity and its Effects on GAN Performance","summary":" The proliferation of machine learning models in diverse clinical applications\nhas led to a growing need for high-fidelity, medical image training data. Such\ndata is often scarce due to cost constraints and privacy concerns. Alleviating\nthis burden, medical image synthesis via generative adversarial networks (GANs)\nemerged as a powerful method for synthetically generating photo-realistic\nimages based on existing sets of real medical images. However, the exact image\nset size required to efficiently train such a GAN is unclear. In this work, we\nexperimentally establish benchmarks that measure the relationship between a\nsample dataset size and the fidelity of the generated images, given the\ndataset's distribution of image complexities. We analyze statistical metrics\nbased on delentropy, an image complexity measure rooted in Shannon's entropy in\ninformation theory. For our pipeline, we conduct experiments with two\nstate-of-the-art GANs, StyleGAN 3 and SPADE-GAN, trained on multiple medical\nimaging datasets with variable sample sizes. Across both GANs, general\nperformance improved with increasing training set size but suffered with\nincreasing complexity.\n","authors":["William Cagas","Chan Ko","Blake Hsiao","Shryuk Grandhi","Rishi Bhattacharya","Kevin Zhu","Michael Lam"],"pdf_url":"https://arxiv.org/pdf/2410.17959v1.pdf","comment":"Accepted to ACCV, Workshop on Generative AI for Synthetic Medical\n Data"},{"id":"http://arxiv.org/abs/2410.17957v1","updated":"2024-10-23T15:27:37Z","published":"2024-10-23T15:27:37Z","title":"MCUBERT: Memory-Efficient BERT Inference on Commodity Microcontrollers","summary":" In this paper, we propose MCUBERT to enable language models like BERT on tiny\nmicrocontroller units (MCUs) through network and scheduling co-optimization. We\nobserve the embedding table contributes to the major storage bottleneck for\ntiny BERT models. Hence, at the network level, we propose an MCU-aware\ntwo-stage neural architecture search algorithm based on clustered low-rank\napproximation for embedding compression. To reduce the inference memory\nrequirements, we further propose a novel fine-grained MCU-friendly scheduling\nstrategy. Through careful computation tiling and re-ordering as well as kernel\ndesign, we drastically increase the input sequence lengths supported on MCUs\nwithout any latency or accuracy penalty. MCUBERT reduces the parameter size of\nBERT-tiny and BERT-mini by 5.7$\\times$ and 3.0$\\times$ and the execution memory\nby 3.5$\\times$ and 4.3$\\times$, respectively. MCUBERT also achieves 1.5$\\times$\nlatency reduction. 
For the first time, MCUBERT enables lightweight BERT models\non commodity MCUs, processing more than 512 tokens with less than 256KB of\nmemory.\n","authors":["Zebin Yang","Renze Chen","Taiqiang Wu","Ngai Wong","Yun Liang","Runsheng Wang","Ru Huang","Meng Li"],"pdf_url":"https://arxiv.org/pdf/2410.17957v1.pdf","comment":"ICCAD 2024"},{"id":"http://arxiv.org/abs/2410.17952v1","updated":"2024-10-23T15:24:16Z","published":"2024-10-23T15:24:16Z","title":"SimRAG: Self-Improving Retrieval-Augmented Generation for Adapting Large\n Language Models to Specialized Domains","summary":" Retrieval-augmented generation (RAG) enhances the question-answering (QA)\nabilities of large language models (LLMs) by integrating external knowledge.\nHowever, adapting general-purpose RAG systems to specialized fields such as\nscience and medicine poses unique challenges due to distribution shifts and\nlimited access to domain-specific data. To tackle this, we propose SimRAG, a\nself-training approach that equips the LLM with joint capabilities of question\nanswering and question generation for domain adaptation. Our method first\nfine-tunes the LLM on instruction-following, question-answering, and\nsearch-related data. Then, it prompts the same LLM to generate diverse\ndomain-relevant questions from unlabeled corpora, with an additional filtering\nstrategy to retain high-quality synthetic examples. By leveraging these\nsynthetic examples, the LLM can improve its performance on domain-specific\nRAG tasks. Experiments on 11 datasets, spanning two backbone sizes and three\ndomains, demonstrate that SimRAG outperforms baselines by 1.2\%--8.6\%.\n","authors":["Ran Xu","Hui Liu","Sreyashi Nag","Zhenwei Dai","Yaochen Xie","Xianfeng Tang","Chen Luo","Yang Li","Joyce C. Ho","Carl Yang","Qi He"],"pdf_url":"https://arxiv.org/pdf/2410.17952v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2410.17948v1","updated":"2024-10-23T15:22:21Z","published":"2024-10-23T15:22:21Z","title":"Generalized Resubstitution for Regression Error Estimation","summary":" We propose generalized resubstitution error estimators for regression, a\nbroad family of estimators, each corresponding to a choice of empirical\nprobability measures and loss function. The usual sum of squares criterion is a\nspecial case corresponding to the standard empirical probability measure and\nthe quadratic loss. Other choices of empirical probability measure lead to more\ngeneral estimators with superior bias and variance properties. We prove that\nthese error estimators are consistent under broad assumptions. In addition,\nprocedures for choosing the empirical measure based on the method of moments\nand maximum pseudo-likelihood are proposed and investigated. 
Detailed\nexperimental results using polynomial regression demonstrate empirically the\nsuperior finite-sample bias and variance properties of the proposed estimators.\nThe R code for the experiments is provided.\n","authors":["Diego Marcondes","Ulisses Braga-Neto"],"pdf_url":"https://arxiv.org/pdf/2410.17948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17945v1","updated":"2024-10-23T15:18:07Z","published":"2024-10-23T15:18:07Z","title":"Theoretically Grounded Pruning of Large Ground Sets for Constrained,\n Discrete Optimization","summary":" Modern instances of combinatorial optimization problems often exhibit\nbillion-scale ground sets, which have many uninformative or redundant elements.\nIn this work, we develop light-weight pruning algorithms to quickly discard\nelements that are unlikely to be part of an optimal solution. Under mild\nassumptions on the instance, we prove theoretical guarantees on the fraction of\nthe optimal value retained and the size of the resulting pruned ground set.\nThrough extensive experiments on real-world datasets for various applications,\nwe demonstrate that our algorithm, QuickPrune, efficiently prunes over 90% of\nthe ground set and outperforms state-of-the-art classical and machine learning\nheuristics for pruning.\n","authors":["Ankur Nath","Alan Kuhnle"],"pdf_url":"https://arxiv.org/pdf/2410.17945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17943v1","updated":"2024-10-23T15:15:56Z","published":"2024-10-23T15:15:56Z","title":"Optimizing Travel Itineraries with AI Algorithms in a Microservices\n Architecture: Balancing Cost, Time, Preferences, and Sustainability","summary":" This research examines how implementing AI algorithms in a microservices\narchitecture enhances travel itineraries with respect to cost, time, user\npreferences, and environmental sustainability. It uses machine learning models\nfor cost forecasting and personalization, a genetic algorithm for itinerary\noptimization, and heuristics for sustainability checking. The primary\nevaluation parameters are latency, ability to satisfy user preferences, cost,\nand environmental impact. Experimental results show an average response time of\n4.5 seconds with 1000 concurrent users and 92% accuracy in matching user\npreferences. Cost efficiency is confirmed, with 95% of generated trips falling\nwithin the budget declared by the user. The system also implements measures to\nalleviate the negative externalities of travel: 60% of the offered travel plans\nincorporated green options, resulting in carbon emissions on average 15% lower\nthan traditional travel plans. The genetic algorithm, with time complexity\nO(g.p.f), reaches the optimal solution within 100 generations, and each\niteration improves solution quality by 5%, enabling its effective use in\noptimization problems where time is measured in seconds. Finally, the system is\ndesigned to be fault-tolerant, with 99.9% availability, allowing services to be\nprovided even when requirements are exceeded. This microservices-based\narchitecture makes the travel optimization platform dynamic and efficient,\nproviding enhanced scaling, asynchronous communication, and real-time updates.\nBy incorporating AI, cost control, and eco-friendly approaches, the system\naddresses the diverse user needs of today's travel business.\n","authors":["Biman Barua","M. 
Shamim Kaiser"],"pdf_url":"https://arxiv.org/pdf/2410.17943v1.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2410.17941v1","updated":"2024-10-23T15:09:02Z","published":"2024-10-23T15:09:02Z","title":"Spiking Graph Neural Network on Riemannian Manifolds","summary":" Graph neural networks (GNNs) have become the dominant solution for learning\non graphs, the typical non-Euclidean structures. Conventional GNNs, constructed\nwith the Artificial Neuron Network (ANN), have achieved impressive performance\nat the cost of high computation and energy consumption. In parallel, spiking\nGNNs with brain-like spiking neurons are drawing increasing research attention\nowing to the energy efficiency. So far, existing spiking GNNs consider graphs\nin Euclidean space, ignoring the structural geometry, and suffer from the high\nlatency issue due to Back-Propagation-Through-Time (BPTT) with the surrogate\ngradient. In light of the aforementioned issues, we are devoted to exploring\nspiking GNN on Riemannian manifolds, and present a Manifold-valued Spiking GNN\n(MSG). In particular, we design a new spiking neuron on geodesically complete\nmanifolds with the diffeomorphism, so that BPTT regarding the spikes is\nreplaced by the proposed differentiation via manifold. Theoretically, we show\nthat MSG approximates a solver of the manifold ordinary differential equation.\nExtensive experiments on common graphs show the proposed MSG achieves superior\nperformance to previous spiking GNNs and energy efficiency to conventional\nGNNs.\n","authors":["Li Sun","Zhenhao Huang","Qiqi Wan","Hao Peng","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2410.17941v1.pdf","comment":"Accepted by NeurIPS 2024, 30 pages"},{"id":"http://arxiv.org/abs/2406.14856v2","updated":"2024-10-23T15:08:59Z","published":"2024-06-21T04:02:19Z","title":"Accessible, At-Home Detection of Parkinson's Disease via Multi-task\n Video Analysis","summary":" Limited accessibility to neurological care leads to underdiagnosed\nParkinson's Disease (PD), preventing early intervention. Existing AI-based PD\ndetection methods primarily focus on unimodal analysis of motor or speech\ntasks, overlooking the multifaceted nature of the disease. To address this, we\nintroduce a large-scale, multi-task video dataset consisting of 1102 sessions\n(each containing videos of finger tapping, facial expression, and speech tasks\ncaptured via webcam) from 845 participants (272 with PD). We propose a novel\nUncertainty-calibrated Fusion Network (UFNet) that leverages this multimodal\ndata to enhance diagnostic accuracy. UFNet employs independent task-specific\nnetworks, trained with Monte Carlo Dropout for uncertainty quantification,\nfollowed by self-attended fusion of features, with attention weights\ndynamically adjusted based on task-specific uncertainties. To ensure\npatient-centered evaluation, the participants were randomly split into three\nsets: 60% for training, 20% for model selection, and 20% for final performance\nevaluation. UFNet significantly outperformed single-task models in terms of\naccuracy, area under the ROC curve (AUROC), and sensitivity while maintaining\nnon-inferior specificity. Withholding uncertain predictions further boosted the\nperformance, achieving 88.0+-0.3%$ accuracy, 93.0+-0.2% AUROC, 79.3+-0.9%\nsensitivity, and 92.6+-0.3% specificity, at the expense of not being able to\npredict for 2.3+-0.3% data (+- denotes 95% confidence interval). 
Further\nanalysis suggests that the trained model does not exhibit any detectable bias\nacross sex and ethnic subgroups and is most effective for individuals aged\nbetween 50 and 80. Requiring only a webcam and microphone, our approach\nfacilitates accessible home-based PD screening, especially in regions with\nlimited healthcare resources.\n","authors":["Md Saiful Islam","Tariq Adnan","Jan Freyberg","Sangwu Lee","Abdelrahman Abdelkader","Meghan Pawlik","Cathe Schwartz","Karen Jaffe","Ruth B. Schneider","E Ray Dorsey","Ehsan Hoque"],"pdf_url":"https://arxiv.org/pdf/2406.14856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03093v2","updated":"2024-10-23T15:01:34Z","published":"2024-08-06T10:48:15Z","title":"Certifiably Robust Policies for Uncertain Parametric Environments","summary":" We present a data-driven approach for producing policies that are provably\nrobust across unknown stochastic environments. Existing approaches can learn\nmodels of a single environment as an interval Markov decision processes (IMDP)\nand produce a robust policy with a probably approximately correct (PAC)\nguarantee on its performance. However these are unable to reason about the\nimpact of environmental parameters underlying the uncertainty. We propose a\nframework based on parametric Markov decision processes (MDPs) with unknown\ndistributions over parameters. We learn and analyse IMDPs for a set of unknown\nsample environments induced by parameters. The key challenge is then to produce\nmeaningful performance guarantees that combine the two layers of uncertainty:\n(1) multiple environments induced by parameters with an unknown distribution;\n(2) unknown induced environments which are approximated by IMDPs. We present a\nnovel approach based on scenario optimisation that yields a single PAC\nguarantee quantifying the risk level for which a specified performance level\ncan be assured in unseen environments, plus a means to trade-off risk and\nperformance. We implement and evaluate our framework using multiple robust\npolicy generation methods on a range of benchmarks. We show that our approach\nproduces tight bounds on a policy's performance with high confidence.\n","authors":["Yannik Schnitzer","Alessandro Abate","David Parker"],"pdf_url":"https://arxiv.org/pdf/2408.03093v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17935v1","updated":"2024-10-23T15:00:30Z","published":"2024-10-23T15:00:30Z","title":"Semi-Implicit Functional Gradient Flow","summary":" Particle-based variational inference methods (ParVIs) use non-parametric\nvariational families represented by particles to approximate the target\ndistribution according to the kernelized Wasserstein gradient flow for the\nKullback-Leibler (KL) divergence. Recent works introduce functional gradient\nflows to substitute the kernel for better flexibility. However, the\ndeterministic updating mechanism may suffer from limited exploration and\nrequire expensive repetitive runs for new samples. In this paper, we propose\nSemi-Implicit Functional Gradient flow (SIFG), a functional gradient ParVI\nmethod that uses perturbed particles as the approximation family. The\ncorresponding functional gradient flow, which can be estimated via denoising\nscore matching, exhibits strong theoretical convergence guarantee. We also\npresent an adaptive version of our method to automatically choose the suitable\nnoise magnitude. 
Extensive experiments demonstrate the effectiveness and\nefficiency of the proposed framework on both simulated and real data problems.\n","authors":["Shiyue Zhang","Ziheng Cheng","Cheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.17935v1.pdf","comment":"31 pages, 12 figures"},{"id":"http://arxiv.org/abs/2410.17934v1","updated":"2024-10-23T14:59:06Z","published":"2024-10-23T14:59:06Z","title":"Retrieving snow depth distribution by downscaling ERA5 Reanalysis with\n ICESat-2 laser altimetry","summary":" Estimating the variability of seasonal snow cover, in particular snow depth\nin remote areas, poses significant challenges due to limited spatial and\ntemporal data availability. This study uses snow depth measurements from the\nICESat-2 satellite laser altimeter, which are sparse in both space and time,\nand incorporates them with climate reanalysis data into a\ndownscaling-calibration scheme to produce monthly gridded snow depth maps at\nmicroscale (10 m). Snow surface elevation measurements from ICESat-2 along\nprofiles are compared to a digital elevation model to determine snow depth at\neach point. To efficiently turn sparse measurements into snow depth maps, a\nregression model is fitted to establish a relationship between the retrieved\nsnow depth and the corresponding ERA5 Land snow depth. This relationship,\nreferred to as subgrid variability, is then applied to downscale the monthly\nERA5 Land snow depth data. The method can provide timeseries of monthly snow\ndepth maps for the entire ERA5 time range (since 1950). The validation of\ndownscaled snow depth data was performed at an intermediate scale (100 m x 500\nm) using datasets from airborne laser scanning (ALS) in the Hardangervidda\nregion of southern Norway. Results show that snow depth prediction achieved R2\nvalues ranging from 0.74 to 0.88 (post-calibration). The method relies on\nglobally available data and is applicable to other snow regions above the\ntreeline. Though requiring area-specific calibration, our approach has the\npotential to provide snow depth maps in areas where no such data exist and can\nbe used to extrapolate existing snow surveys in time and over larger areas.\nWith this, it can offer valuable input data for hydrological, ecological or\npermafrost modeling tasks.\n","authors":["Zhihao Liu","Simon Filhol","Désirée Treichler"],"pdf_url":"https://arxiv.org/pdf/2410.17934v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17933v1","updated":"2024-10-23T14:55:53Z","published":"2024-10-23T14:55:53Z","title":"Multi-Continental Healthcare Modelling Using Blockchain-Enabled\n Federated Learning","summary":" One of the biggest challenges of building artificial intelligence (AI) models\nin the healthcare area is data sharing. Since healthcare data is private,\nsensitive, and heterogeneous, collecting sufficient data for modelling is\nexhausting, costly, and sometimes impossible. In this paper, we propose a\nframework for global healthcare modelling using datasets from multiple\ncontinents (Europe, North America and Asia) without sharing the local datasets,\nand choose glucose management as a study model to verify its effectiveness.\nTechnically, blockchain-enabled federated learning is implemented and adapted\nto meet the privacy and safety requirements of healthcare data, while rewarding\nhonest participation and penalizing malicious activities through its on-chain\nincentive mechanism. Experimental results show that the proposed\nframework is effective, efficient, and privacy-preserving. 
Its prediction\naccuracy is much better than the models trained from limited personal data and\nis similar to, and even slightly better than, the results from a centralized\ndataset. This work paves the way for international collaborations on healthcare\nprojects, where additional data is crucial for reducing bias and providing\nbenefits to humanity.\n","authors":["Rui Sun","Zhipeng Wang","Hengrui Zhang","Ming Jiang","Yizhe Wen","Jiqun Zhang","Jiahao Sun","Shuoying Zhang","Erwu Liu","Kezhi Li"],"pdf_url":"https://arxiv.org/pdf/2410.17933v1.pdf","comment":"Accepted by IEEE Global Blockchain Conference"},{"id":"http://arxiv.org/abs/2402.04033v3","updated":"2024-10-23T14:50:51Z","published":"2024-02-06T14:26:22Z","title":"On provable privacy vulnerabilities of graph representations","summary":" Graph representation learning (GRL) is critical for extracting insights from\ncomplex network structures, but it also raises security concerns due to\npotential privacy vulnerabilities in these representations. This paper\ninvestigates the structural vulnerabilities in graph neural models where\nsensitive topological information can be inferred through edge reconstruction\nattacks. Our research primarily addresses the theoretical underpinnings of\nsimilarity-based edge reconstruction attacks (SERA), furnishing a\nnon-asymptotic analysis of their reconstruction capacities. Moreover, we\npresent empirical corroboration indicating that such attacks can perfectly\nreconstruct sparse graphs as graph size increases. Conversely, we establish\nthat sparsity is a critical factor for SERA's effectiveness, as demonstrated\nthrough analysis and experiments on (dense) stochastic block models. Finally,\nwe explore the resilience of private graph representations produced via noisy\naggregation (NAG) mechanism against SERA. Through theoretical analysis and\nempirical assessments, we affirm the mitigation of SERA using NAG . In\nparallel, we also empirically delineate instances wherein SERA demonstrates\nboth efficacy and deficiency in its capacity to function as an instrument for\nelucidating the trade-off between privacy and utility.\n","authors":["Ruofan Wu","Guanhua Fang","Qiying Pan","Mingyang Zhang","Tengfei Liu","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2402.04033v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01959v2","updated":"2024-10-23T14:49:39Z","published":"2024-06-04T04:39:51Z","title":"Adaptive Variance Reduction for Stochastic Optimization under Weaker\n Assumptions","summary":" This paper explores adaptive variance reduction methods for stochastic\noptimization based on the STORM technique. Existing adaptive extensions of\nSTORM rely on strong assumptions like bounded gradients and bounded function\nvalues, or suffer an additional $\\mathcal{O}(\\log T)$ term in the convergence\nrate. To address these limitations, we introduce a novel adaptive STORM method\nthat achieves an optimal convergence rate of $\\mathcal{O}(T^{-1/3})$ for\nnon-convex functions with our newly designed learning rate strategy. Compared\nwith existing approaches, our method requires weaker assumptions and attains\nthe optimal convergence rate without the additional $\\mathcal{O}(\\log T)$ term.\nWe also extend the proposed technique to stochastic compositional optimization,\nobtaining the same optimal rate of $\\mathcal{O}(T^{-1/3})$. 
Furthermore, we\ninvestigate the non-convex finite-sum problem and develop another innovative\nadaptive variance reduction method that achieves an optimal convergence rate of\n$\\mathcal{O}(n^{1/4} T^{-1/2} )$, where $n$ represents the number of component\nfunctions. Numerical experiments across various tasks validate the\neffectiveness of our method.\n","authors":["Wei Jiang","Sifan Yang","Yibo Wang","Lijun Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.01959v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2406.00489"},{"id":"http://arxiv.org/abs/2310.10107v4","updated":"2024-10-23T14:47:42Z","published":"2023-10-16T06:41:13Z","title":"Posterior Sampling-based Online Learning for Episodic POMDPs","summary":" Learning in POMDPs is known to be significantly harder than in MDPs. In this\npaper, we consider the online learning problem for episodic POMDPs with unknown\ntransition and observation models. We propose a Posterior Sampling-based\nreinforcement learning algorithm for POMDPs (PS4POMDPs), which is much simpler\nand more implementable compared to state-of-the-art optimism-based online\nlearning algorithms for POMDPs. We show that the Bayesian regret of the\nproposed algorithm scales as the square root of the number of episodes and is\npolynomial in the other parameters. In a general setting, the regret scales\nexponentially in the horizon length $H$, and we show that this is inevitable by\nproviding a lower bound. However, when the POMDP is undercomplete and weakly\nrevealing (a common assumption in the recent literature), we establish a\npolynomial Bayesian regret bound. We finally propose a posterior sampling\nalgorithm for multi-agent POMDPs, and show it too has sublinear regret.\n","authors":["Dengwang Tang","Dongze Ye","Rahul Jain","Ashutosh Nayyar","Pierluigi Nuzzo"],"pdf_url":"https://arxiv.org/pdf/2310.10107v4.pdf","comment":"41 pages, 9 figures"},{"id":"http://arxiv.org/abs/2406.00489v2","updated":"2024-10-23T14:42:35Z","published":"2024-06-01T16:38:43Z","title":"Efficient Sign-Based Optimization: Accelerating Convergence via Variance\n Reduction","summary":" Sign stochastic gradient descent (signSGD) is a communication-efficient\nmethod that transmits only the sign of stochastic gradients for parameter\nupdating. Existing literature has demonstrated that signSGD can achieve a\nconvergence rate of $\\mathcal{O}(d^{1/2}T^{-1/4})$, where $d$ represents the\ndimension and $T$ is the iteration number. In this paper, we improve this\nconvergence rate to $\\mathcal{O}(d^{1/2}T^{-1/3})$ by introducing the\nSign-based Stochastic Variance Reduction (SSVR) method, which employs variance\nreduction estimators to track gradients and leverages their signs to update.\nFor finite-sum problems, our method can be further enhanced to achieve a\nconvergence rate of $\\mathcal{O}(m^{1/4}d^{1/2}T^{-1/2})$, where $m$ denotes\nthe number of component functions. Furthermore, we investigate the\nheterogeneous majority vote in distributed settings and introduce two novel\nalgorithms that attain improved convergence rates of\n$\\mathcal{O}(d^{1/2}T^{-1/2} + dn^{-1/2})$ and $\\mathcal{O}(d^{1/4}T^{-1/4})$\nrespectively, outperforming the previous results of $\\mathcal{O}(dT^{-1/4} +\ndn^{-1/2})$ and $\\mathcal{O}(d^{3/8}T^{-1/8})$, where $n$ represents the number\nof nodes. 
Numerical experiments across different tasks validate the\neffectiveness of our proposed methods.\n","authors":["Wei Jiang","Sifan Yang","Wenhao Yang","Lijun Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.00489v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17918v1","updated":"2024-10-23T14:34:39Z","published":"2024-10-23T14:34:39Z","title":"Addressing Asynchronicity in Clinical Multimodal Fusion via\n Individualized Chest X-ray Generation","summary":" Integrating multi-modal clinical data, such as electronic health records\n(EHR) and chest X-ray images (CXR), is particularly beneficial for clinical\nprediction tasks. However, in a temporal setting, multi-modal data are often\ninherently asynchronous. EHR can be continuously collected but CXR is generally\ntaken with a much longer interval due to its high cost and radiation dose. When\nclinical prediction is needed, the last available CXR image might have been\noutdated, leading to suboptimal predictions. To address this challenge, we\npropose DDL-CXR, a method that dynamically generates an up-to-date latent\nrepresentation of the individualized CXR images. Our approach leverages latent\ndiffusion models for patient-specific generation strategically conditioned on a\nprevious CXR image and EHR time series, providing information regarding\nanatomical structures and disease progressions, respectively. In this way, the\ninteraction across modalities could be better captured by the latent CXR\ngeneration process, ultimately improving the prediction performance.\nExperiments using MIMIC datasets show that the proposed model could effectively\naddress asynchronicity in multimodal fusion and consistently outperform\nexisting methods.\n","authors":["Wenfang Yao","Chen Liu","Kejing Yin","William K. Cheung","Jing Qin"],"pdf_url":"https://arxiv.org/pdf/2410.17918v1.pdf","comment":"Accepted by NeurIPS-24"},{"id":"http://arxiv.org/abs/2410.17917v1","updated":"2024-10-23T14:34:36Z","published":"2024-10-23T14:34:36Z","title":"regAL: Python Package for Active Learning of Regression Problems","summary":" Increasingly more research areas rely on machine learning methods to\naccelerate discovery while saving resources. Machine learning models, however,\nusually require large datasets of experimental or computational results, which\nin certain fields, such as (bio)chemistry, materials science, or medicine, are\nrarely given and often prohibitively expensive to obtain. To bypass that\nobstacle, active learning methods are employed to develop machine learning\nmodels with a desired performance while requiring the least possible number of\ncomputational or experimental results from the domain of application. For this\npurpose, the model's knowledge about certain regions of the application domain\nis estimated to guide the choice of the model's training set. Although active\nlearning is widely studied for classification problems (discrete outcomes),\ncomparatively few works handle this method for regression problems (continuous\noutcomes). 
In this work, we present our Python package regAL, which allows\nusers to evaluate different active learning strategies for regression problems.\nWith a minimal input of just the dataset in question, but many additional\ncustomization and insight options, this package is intended for anyone who aims\nto perform and understand active learning in their problem-specific scope.\n","authors":["Elizaveta Surzhikova","Jonny Proppe"],"pdf_url":"https://arxiv.org/pdf/2410.17917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17913v1","updated":"2024-10-23T14:33:11Z","published":"2024-10-23T14:33:11Z","title":"Deep learning for model correction of dynamical systems with data\n scarcity","summary":" We present a deep learning framework for correcting existing dynamical system\nmodels utilizing only a scarce high-fidelity data set. In many practical\nsituations, one has a low-fidelity model that can capture the dynamics\nreasonably well but lacks high resolution, due to the inherent limitation of\nthe model and the complexity of the underlying physics. When high resolution\ndata become available, it is natural to seek model correction to improve the\nresolution of the model predictions. We focus on the case when the amount of\nhigh-fidelity data is so small that most of the existing data driven modeling\nmethods cannot be applied. In this paper, we address these challenges with a\nmodel-correction method which only requires a scarce high-fidelity data set.\nOur method first seeks a deep neural network (DNN) model to approximate the\nexisting low-fidelity model. By using the scarce high-fidelity data, the method\nthen corrects the DNN model via transfer learning (TL). After TL, an improved\nDNN model with high prediction accuracy to the underlying dynamics is obtained.\nOne distinct feature of the propose method is that it does not assume a\nspecific form of the model correction terms. Instead, it offers an inherent\ncorrection to the low-fidelity model via TL. A set of numerical examples are\npresented to demonstrate the effectiveness of the proposed method.\n","authors":["Caroline Tatsuoka","Dongbin Xiu"],"pdf_url":"https://arxiv.org/pdf/2410.17913v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04377v3","updated":"2024-10-23T14:29:56Z","published":"2024-08-08T11:22:52Z","title":"Anomaly Prediction: A Novel Approach with Explicit Delay and Horizon","summary":" Anomaly detection in time series data is a critical challenge across various\ndomains. Traditional methods typically focus on identifying anomalies in\nimmediate subsequent steps, often underestimating the significance of temporal\ndynamics such as delay time and horizons of anomalies, which generally require\nextensive post-analysis. This paper introduces a novel approach for time series\nanomaly prediction, incorporating temporal information directly into the\nprediction results. We propose a new dataset specifically designed to evaluate\nthis approach and conduct comprehensive experiments using several\nstate-of-the-art methods. 
Our results demonstrate the efficacy of our approach\nin providing timely and accurate anomaly predictions, setting a new benchmark\nfor future research in this field.\n","authors":["Jiang You","Arben Cela","René Natowicz","Jacob Ouanounou","Patrick Siarry"],"pdf_url":"https://arxiv.org/pdf/2408.04377v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16164v3","updated":"2024-10-23T14:24:50Z","published":"2024-05-25T10:15:51Z","title":"Acquiring Better Load Estimates by Combining Anomaly and Change Point\n Detection in Power Grid Time-series Measurements","summary":" In this paper we present novel methodology for automatic anomaly and switch\nevent filtering to improve load estimation in power grid systems. By leveraging\nunsupervised methods with supervised optimization, our approach prioritizes\ninterpretability while ensuring robust and generalizable performance on unseen\ndata. Through experimentation, a combination of binary segmentation for change\npoint detection and statistical process control for anomaly detection emerges\nas the most effective strategy, specifically when ensembled in a novel\nsequential manner. Results indicate the clear wasted potential when filtering\nis not applied. The automatic load estimation is also fairly accurate, with\napproximately 90% of estimates falling within a 10% error margin, with only a\nsingle significant failure in both the minimum and maximum load estimates\nacross 60 measurements in the test set. Our methodology's interpretability\nmakes it particularly suitable for critical infrastructure planning, thereby\nenhancing decision-making processes.\n","authors":["Roel Bouman","Linda Schmeitz","Luco Buise","Jacco Heres","Yuliya Shapovalova","Tom Heskes"],"pdf_url":"https://arxiv.org/pdf/2405.16164v3.pdf","comment":"All code can be found at: https://github.com/RoelBouman/StormPhase2"},{"id":"http://arxiv.org/abs/2410.17904v1","updated":"2024-10-23T14:22:49Z","published":"2024-10-23T14:22:49Z","title":"Reinforcement Learning under Latent Dynamics: Toward Statistical and\n Algorithmic Modularity","summary":" Real-world applications of reinforcement learning often involve environments\nwhere agents operate on complex, high-dimensional observations, but the\nunderlying (''latent'') dynamics are comparatively simple. However, outside of\nrestrictive settings such as small latent spaces, the fundamental statistical\nrequirements and algorithmic principles for reinforcement learning under latent\ndynamics are poorly understood.\n This paper addresses the question of reinforcement learning under\n$\\textit{general}$ latent dynamics from a statistical and algorithmic\nperspective. On the statistical side, our main negative result shows that most\nwell-studied settings for reinforcement learning with function approximation\nbecome intractable when composed with rich observations; we complement this\nwith a positive result, identifying latent pushforward coverability as a\ngeneral condition that enables statistical tractability. 
Algorithmically, we\ndevelop provably efficient observable-to-latent reductions -- that is,\nreductions that transform an arbitrary algorithm for the latent MDP into an\nalgorithm that can operate on rich observations -- in two settings: one where\nthe agent has access to hindsight observations of the latent dynamics [LADZ23],\nand one where the agent can estimate self-predictive latent models [SAGHCB20].\nTogether, our results serve as a first step toward a unified statistical and\nalgorithmic theory for reinforcement learning under latent dynamics.\n","authors":["Philip Amortila","Dylan J. Foster","Nan Jiang","Akshay Krishnamurthy","Zakaria Mhammedi"],"pdf_url":"https://arxiv.org/pdf/2410.17904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17898v1","updated":"2024-10-23T14:16:34Z","published":"2024-10-23T14:16:34Z","title":"Scalable Offline Reinforcement Learning for Mean Field Games","summary":" Reinforcement learning algorithms for mean-field games offer a scalable\nframework for optimizing policies in large populations of interacting agents.\nExisting methods often depend on online interactions or access to system\ndynamics, limiting their practicality in real-world scenarios where such\ninteractions are infeasible or difficult to model. In this paper, we present\nOffline Munchausen Mirror Descent (Off-MMD), a novel mean-field RL algorithm\nthat approximates equilibrium policies in mean-field games using purely offline\ndata. By leveraging iterative mirror descent and importance sampling\ntechniques, Off-MMD estimates the mean-field distribution from static datasets\nwithout relying on simulation or environment dynamics. Additionally, we\nincorporate techniques from offline reinforcement learning to address common\nissues like Q-value overestimation, ensuring robust policy learning even with\nlimited data coverage. Our algorithm scales to complex environments and\ndemonstrates strong performance on benchmark tasks like crowd exploration or\nnavigation, highlighting its applicability to real-world multi-agent systems\nwhere online experimentation is infeasible. We empirically demonstrate the\nrobustness of Off-MMD to low-quality datasets and conduct experiments to\ninvestigate its sensitivity to hyperparameter choices.\n","authors":["Axel Brunnbauer","Julian Lemmel","Zahra Babaiee","Sophie Neubauer","Radu Grosu"],"pdf_url":"https://arxiv.org/pdf/2410.17898v1.pdf","comment":"Submitted to AAMAS"},{"id":"http://arxiv.org/abs/2407.19353v2","updated":"2024-10-23T14:11:34Z","published":"2024-07-28T00:07:20Z","title":"A spring-block theory of feature learning in deep neural networks","summary":" Feature-learning deep nets progressively collapse data to a regular\nlow-dimensional geometry. How this phenomenon emerges from collective action of\nnonlinearity, noise, learning rate, and other choices that shape the dynamics,\nhas eluded first-principles theories built from microscopic neuronal dynamics.\nWe exhibit a noise-nonlinearity phase diagram that identifies regimes where\nshallow or deep layers learn more effectively. 
We then propose a macroscopic\nmechanical theory that reproduces the diagram, explaining why some DNNs are\nlazy and some active, and linking feature learning across layers to\ngeneralization.\n","authors":["Cheng Shi","Liming Pan","Ivan Dokmanić"],"pdf_url":"https://arxiv.org/pdf/2407.19353v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15592v2","updated":"2024-10-23T14:08:10Z","published":"2024-10-21T02:21:56Z","title":"CPE-Pro: A Structure-Sensitive Deep Learning Method for Protein\n Representation and Origin Evaluation","summary":" Protein structures are important for understanding their functions and\ninteractions. Currently, many protein structure prediction methods are\nenriching the structure database. Discriminating the origin of structures is\ncrucial for distinguishing between experimentally resolved and computationally\npredicted structures, evaluating the reliability of prediction methods, and\nguiding downstream biological studies. Building on works in structure\nprediction, we developed a structure-sensitive supervised deep learning model,\nCrystal vs Predicted Evaluator for Protein Structure (CPE-Pro), to represent\nand discriminate the origin of protein structures. CPE-Pro learns the\nstructural information of proteins and captures inter-structural differences to\nachieve accurate traceability on four data classes, and is expected to be\nextended to more. Simultaneously, we utilized Foldseek to encode protein\nstructures into \"structure-sequences\" and trained a protein Structural Sequence\nLanguage Model, SSLM. Preliminary experiments demonstrated that, compared to\nlarge-scale protein language models pre-trained on vast amounts of amino acid\nsequences, the \"structure-sequence\" enables the language model to learn more\ninformative protein features, enhancing and optimizing structural\nrepresentations. We have provided the code, model weights, and all related\nmaterials on https://github.com/GouWenrui/CPE-Pro-main.git.\n","authors":["Wenrui Gou","Wenhui Ge","Yang Tan","Mingchen Li","Guisheng Fan","Huiqun Yu"],"pdf_url":"https://arxiv.org/pdf/2410.15592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19472v2","updated":"2024-10-23T14:02:12Z","published":"2024-09-28T22:41:49Z","title":"Towards Croppable Implicit Neural Representations","summary":" Implicit Neural Representations (INRs) have piqued interest in recent years\ndue to their ability to encode natural signals using neural networks. While\nINRs allow for useful applications such as interpolating new coordinates and\nsignal compression, their black-box nature makes it difficult to modify them\npost-training. In this paper we explore the idea of editable INRs, and\nspecifically focus on the widely used cropping operation. To this end, we\npresent Local-Global SIRENs -- a novel INR architecture that supports cropping\nby design. Local-Global SIRENs are based on combining local and global feature\nextraction for signal encoding. What makes their design unique is the ability\nto effortlessly remove specific portions of an encoded signal, with a\nproportional weight decrease. This is achieved by eliminating the corresponding\nweights from the network, without the need for retraining. We further show how\nthis architecture can be used to support the straightforward extension of\npreviously encoded signals. 
Beyond signal editing, we examine how the\nLocal-Global approach can accelerate training, enhance encoding of various\nsignals, improve downstream performance, and be applied to modern INRs such as\nINCODE, highlighting its potential and flexibility. Code is available at\nhttps://github.com/maorash/Local-Global-INRs.\n","authors":["Maor Ashkenazi","Eran Treister"],"pdf_url":"https://arxiv.org/pdf/2409.19472v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.06927v2","updated":"2024-10-23T14:01:27Z","published":"2024-08-13T14:29:00Z","title":"Breaking Class Barriers: Efficient Dataset Distillation via Inter-Class\n Feature Compensator","summary":" Dataset distillation has emerged as a technique aiming to condense\ninformative features from large, natural datasets into a compact and synthetic\nform. While recent advancements have refined this technique, its performance is\nbottlenecked by the prevailing class-specific synthesis paradigm. Under this\nparadigm, synthetic data is optimized exclusively for a pre-assigned one-hot\nlabel, creating an implicit class barrier in feature condensation. This leads\nto inefficient utilization of the distillation budget and oversight of\ninter-class feature distributions, which ultimately limits the effectiveness\nand efficiency, as demonstrated in our analysis. To overcome these constraints,\nthis paper presents the Inter-class Feature Compensator (INFER), an innovative\ndistillation approach that transcends the class-specific data-label framework\nwidely utilized in current dataset distillation methods. Specifically, INFER\nleverages a Universal Feature Compensator (UFC) to enhance feature integration\nacross classes, enabling the generation of multiple additional synthetic\ninstances from a single UFC input. This significantly improves the efficiency\nof the distillation budget. Moreover, INFER enriches inter-class interactions\nduring the distillation, thereby enhancing the effectiveness and\ngeneralizability of the distilled data. By allowing for the linear\ninterpolation of labels similar to those in the original dataset, INFER\nmeticulously optimizes the synthetic data and dramatically reduces the size of\nsoft labels in the synthetic dataset to almost zero, establishing a new\nbenchmark for efficiency and effectiveness in dataset distillation.\n","authors":["Xin Zhang","Jiawei Du","Ping Liu","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.06927v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12819v2","updated":"2024-10-23T14:00:36Z","published":"2024-07-01T10:33:46Z","title":"I've Got 99 Problems But FLOPS Ain't One","summary":" Hyperscalers dominate the landscape of large network deployments, yet they\nrarely share data or insights about the challenges they face. In light of this\nsupremacy, what problems can we find to solve in this space? We take an\nunconventional approach to find relevant research directions, starting from\npublic plans to build a $100 billion datacenter for machine learning\napplications. Leveraging the language models scaling laws, we discover what\nworkloads such a datacenter might carry and explore the challenges one may\nencounter in doing so, with a focus on networking research. 
We conclude that\nbuilding the datacenter and training such models is technically possible, but\nthis requires novel wide-area transports for inter-DC communication, a\nmultipath transport and novel datacenter topologies for intra-datacenter\ncommunication, high speed scale-up networks and transports, outlining a rich\nresearch agenda for the networking community.\n","authors":["Alexandru M. Gherghescu","Vlad-Andrei Bădoiu","Alexandru Agache","Mihai-Valentin Dumitru","Iuliu Vasilescu","Radu Mantu","Costin Raiciu"],"pdf_url":"https://arxiv.org/pdf/2407.12819v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17882v1","updated":"2024-10-23T13:55:42Z","published":"2024-10-23T13:55:42Z","title":"Identifiable Representation and Model Learning for Latent Dynamic\n Systems","summary":" Learning identifiable representations and models from low-level observations\nis useful for an intelligent spacecraft to reliably finish downstream tasks.\nFor temporal observations, to ensure that the data generating process is\nprovably inverted, most existing works either assume the noise variables in the\ndynamic mechanisms are (conditionally) independent, or require interventions\nwhich can directly affect each latent variable. However, in practice, the\nrelationship between the exogenous inputs/interventions and the latent\nvariables may follow some complex deterministic mechanisms. In this work, we\nstudy the problem of identifiable representation and model learning for latent\ndynamic systems. The key idea is that we use an inductive bias inspired by\ncontrollable canonical forms, which is invariant, sparse, and input dependent\nby definition. We prove that, for linear or affine nonlinear latent dynamic\nsystems, it is possible to identify the representations up to scaling and\ndetermine the models up to some simple transformations. The results have\npotential to provide some theoretical guarantees for developing more\ntrustworthy decision-making and control methods for intelligent spacecraft.\n","authors":["Congxi Zhang","Yongchun Xie"],"pdf_url":"https://arxiv.org/pdf/2410.17882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17881v1","updated":"2024-10-23T13:53:26Z","published":"2024-10-23T13:53:26Z","title":"AdaRankGrad: Adaptive Gradient-Rank and Moments for Memory-Efficient\n LLMs Training and Fine-Tuning","summary":" Training and fine-tuning large language models (LLMs) come with challenges\nrelated to memory and computational requirements due to the increasing size of\nthe model weights and the optimizer states. Various techniques have been\ndeveloped to tackle these challenges, such as low-rank adaptation (LoRA), which\ninvolves introducing a parallel trainable low-rank matrix to the fixed\npre-trained weights at each layer. However, these methods often fall short\ncompared to the full-rank weight training approach, as they restrict the\nparameter search to a low-rank subspace. This limitation can disrupt training\ndynamics and require a full-rank warm start to mitigate the impact. In this\npaper, we introduce a new method inspired by a phenomenon we formally prove: as\ntraining progresses, the rank of the estimated layer gradients gradually\ndecreases, and asymptotically approaches rank one. Leveraging this, our\napproach involves adaptively reducing the rank of the gradients during Adam\noptimization steps, using an efficient online-updating low-rank projections\nrule. We further present a randomized SVD scheme for efficiently finding the\nprojection matrix. 
Our technique enables full-parameter fine-tuning with\nadaptive low-rank gradient updates, significantly reducing overall memory\nrequirements during training compared to state-of-the-art methods while\nimproving model performance in both pretraining and fine-tuning. Finally, we\nprovide a convergence analysis of our method and demonstrate its merits for\ntraining and fine-tuning language and biological foundation models.\n","authors":["Yehonathan Refael","Jonathan Svirsky","Boris Shustin","Wasim Huleihel","Ofir Lindenbaum"],"pdf_url":"https://arxiv.org/pdf/2410.17881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17878v1","updated":"2024-10-23T13:50:27Z","published":"2024-10-23T13:50:27Z","title":"Relaxed Equivariance via Multitask Learning","summary":" Incorporating equivariance as an inductive bias into deep learning\narchitectures to take advantage of the data symmetry has been successful in\nmultiple applications, such as chemistry and dynamical systems. In particular,\nroto-translations are crucial for effectively modeling geometric graphs and\nmolecules, where understanding the 3D structures enhances generalization.\nHowever, equivariant models often pose challenges due to their high\ncomputational complexity. In this paper, we introduce REMUL, a training\nprocedure for approximating equivariance with multitask learning. We show that\nunconstrained models (which do not build equivariance into the architecture)\ncan learn approximate symmetries by minimizing an additional simple\nequivariance loss. By formulating equivariance as a new learning objective, we\ncan control the level of approximate equivariance in the model. Our method\nachieves competitive performance compared to equivariant baselines while being\n$10 \\times$ faster at inference and $2.5 \\times$ at training.\n","authors":["Ahmed A. Elhag","T. Konstantin Rusch","Francesco Di Giovanni","Michael Bronstein"],"pdf_url":"https://arxiv.org/pdf/2410.17878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.04953v3","updated":"2024-10-23T13:36:37Z","published":"2022-12-09T16:03:34Z","title":"TargetCall: Eliminating the Wasted Computation in Basecalling via\n Pre-Basecalling Filtering","summary":" Basecalling is an essential step in nanopore sequencing analysis where the\nraw signals of nanopore sequencers are converted into nucleotide sequences,\ni.e., reads. State-of-the-art basecallers employ complex deep learning models\nto achieve high basecalling accuracy. This makes basecalling computationally\ninefficient and memory-hungry, bottlenecking the entire genome analysis\npipeline. However, for many applications, the majority of reads do not match the\nreference genome of interest (i.e., target reference) and thus are discarded in\nlater steps in the genomics pipeline, wasting the basecalling computation. To\novercome this issue, we propose TargetCall, the first pre-basecalling filter to\neliminate the wasted computation in basecalling. TargetCall's key idea is to\ndiscard reads that will not match the target reference (i.e., off-target reads)\nprior to basecalling. TargetCall consists of two main components: (1)\nLightCall, a lightweight neural network basecaller that produces noisy reads;\nand (2) Similarity Check, which labels each of these noisy reads as on-target\nor off-target by matching them to the target reference. 
Our thorough\nexperimental evaluations show that TargetCall 1) improves the end-to-end\nbasecalling runtime performance of the state-of-the-art basecaller by 3.31x\nwhile maintaining high (98.88%) recall in keeping on-target reads, 2) maintains\nhigh accuracy in downstream analysis, and 3) achieves better runtime\nperformance, throughput, recall, precision, and generality compared to prior\nworks. TargetCall is available at https://github.com/CMU-SAFARI/TargetCall.\n","authors":["Meryem Banu Cavlak","Gagandeep Singh","Mohammed Alser","Can Firtina","Joël Lindegger","Mohammad Sadrosadati","Nika Mansouri Ghiasi","Can Alkan","Onur Mutlu"],"pdf_url":"https://arxiv.org/pdf/2212.04953v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17865v1","updated":"2024-10-23T13:36:23Z","published":"2024-10-23T13:36:23Z","title":"Population stratification for prediction of mortality in post-AKI\n patients","summary":" Acute kidney injury (AKI) is a serious clinical condition that affects up to\n20% of hospitalised patients. AKI is associated with short term unplanned\nhospital readmission and post-discharge mortality risk. Patient risk and\nhealthcare expenditures can be minimised by follow-up planning grounded on\npredictive models and machine learning. Since AKI is multi-factorial,\npredictive models specialised in different categories of patients can increase\naccuracy of predictions. In the present article we present some results\nfollowing this approach.\n","authors":["Flavio S. Correa da Silva","Simon Sawhney"],"pdf_url":"https://arxiv.org/pdf/2410.17865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17863v1","updated":"2024-10-23T13:35:18Z","published":"2024-10-23T13:35:18Z","title":"CASCRNet: An Atrous Spatial Pyramid Pooling and Shared Channel Residual\n based Network for Capsule Endoscopy","summary":" This manuscript summarizes work on the Capsule Vision Challenge 2024 by\nMISAHUB. To address the multi-class disease classification task, which is\nchallenging due to the complexity and imbalance in the Capsule Vision challenge\ndataset, this paper proposes CASCRNet (Capsule endoscopy-Aspp-SCR-Network), a\nparameter-efficient and novel model that uses Shared Channel Residual (SCR)\nblocks and Atrous Spatial Pyramid Pooling (ASPP) blocks. Further, the\nperformance of the proposed model is compared with other well-known approaches.\nThe experimental results show that the proposed model provides better disease\nclassification results. The proposed model was successful in classifying\ndiseases with an F1 Score of 78.5% and a Mean AUC of 98.3%, which is promising\ngiven its compact architecture.\n","authors":["K V Srinanda","M Manvith Prabhu","Shyam Lal"],"pdf_url":"https://arxiv.org/pdf/2410.17863v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2410.17851v1","updated":"2024-10-23T13:20:42Z","published":"2024-10-23T13:20:42Z","title":"The Probabilistic Tsetlin Machine: A Novel Approach to Uncertainty\n Quantification","summary":" Tsetlin Machines (TMs) have emerged as a compelling alternative to\nconventional deep learning methods, offering notable advantages such as smaller\nmemory footprint, faster inference, fault-tolerant properties, and\ninterpretability. Although various adaptations of TMs have expanded their\napplicability across diverse domains, a fundamental gap remains in\nunderstanding how TMs quantify uncertainty in their predictions. 
In response,\nthis paper introduces the Probabilistic Tsetlin Machine (PTM) framework, aimed\nat providing a robust, reliable, and interpretable approach for uncertainty\nquantification. Unlike the original TM, the PTM learns the probability of\nstaying on each state of each Tsetlin Automaton (TA) across all clauses. These\nprobabilities are updated using the feedback tables that are part of the TM\nframework: Type I and Type II feedback. During inference, TAs decide their\nactions by sampling states based on learned probability distributions, akin to\nBayesian neural networks when generating weight values. In our experimental\nanalysis, we first illustrate the spread of the probabilities across TA states\nfor the noisy-XOR dataset. Then we evaluate the PTM alongside benchmark models\nusing both simulated and real-world datasets. The experiments on the simulated\ndataset reveal the PTM's effectiveness in uncertainty quantification,\nparticularly in delineating decision boundaries and identifying regions of high\nuncertainty. Moreover, when applied to multiclass classification tasks using\nthe Iris dataset, the PTM demonstrates competitive performance in terms of\npredictive entropy and expected calibration error, showcasing its potential as\na reliable tool for uncertainty estimation. Our findings underscore the\nimportance of selecting appropriate models for accurate uncertainty\nquantification in predictive tasks, with the PTM offering a particularly\ninterpretable and effective solution.\n","authors":["K. Darshana Abeyrathna","Sara El Mekkaoui","Andreas Hafver","Christian Agrell"],"pdf_url":"https://arxiv.org/pdf/2410.17851v1.pdf","comment":"12 pages, 5 figures, 6 tables, accepted and presented at ICAAI 2024,\n London"},{"id":"http://arxiv.org/abs/2402.15055v2","updated":"2024-10-23T13:20:15Z","published":"2024-02-23T02:15:47Z","title":"Interpreting Context Look-ups in Transformers: Investigating\n Attention-MLP Interactions","summary":" Understanding the inner workings of large language models (LLMs) is crucial\nfor advancing their theoretical foundations and real-world applications. While\nthe attention mechanism and multi-layer perceptrons (MLPs) have been studied\nindependently, their interactions remain largely unexplored. This study\ninvestigates how attention heads and next-token neurons interact in LLMs to\npredict new words. We propose a methodology to identify next-token neurons,\nfind prompts that highly activate them, and determine the upstream attention\nheads responsible. We then generate and evaluate explanations for the activity\nof these attention heads in an automated manner. Our findings reveal that some\nattention heads recognize specific contexts relevant to predicting a token and\nactivate a downstream token-predicting neuron accordingly. This mechanism\nprovides a deeper understanding of how attention heads work with MLP neurons to\nperform next-token prediction. Our approach offers a foundation for further\nresearch into the intricate workings of LLMs and their impact on text\ngeneration and understanding.\n","authors":["Clement Neo","Shay B. 
Cohen","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2402.15055v2.pdf","comment":"Accepted to EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2410.13563v2","updated":"2024-10-23T13:19:26Z","published":"2024-10-17T14:00:18Z","title":"Ornstein-Uhlenbeck Adaptation as a Mechanism for Learning in Brains and\n Machines","summary":" Learning is a fundamental property of intelligent systems, observed across\nbiological organisms and engineered systems. While modern intelligent systems\ntypically rely on gradient descent for learning, the need for exact gradients\nand complex information flow makes its implementation in biological and\nneuromorphic systems challenging. This has motivated the exploration of\nalternative learning mechanisms that can operate locally and do not rely on\nexact gradients. In this work, we introduce a novel approach that leverages\nnoise in the parameters of the system and global reinforcement signals. Using\nan Ornstein-Uhlenbeck process with adaptive dynamics, our method balances\nexploration and exploitation during learning, driven by deviations from error\npredictions, akin to reward prediction error. Operating in continuous time,\nOrstein-Uhlenbeck adaptation (OUA) is proposed as a general mechanism for\nlearning dynamic, time-evolving environments. We validate our approach across\ndiverse tasks, including supervised learning and reinforcement learning in\nfeedforward and recurrent systems. Additionally, we demonstrate that it can\nperform meta-learning, adjusting hyper-parameters autonomously. Our results\nindicate that OUA provides a viable alternative to traditional gradient-based\nmethods, with potential applications in neuromorphic computing. It also hints\nat a possible mechanism for noise-driven learning in the brain, where\nstochastic neurotransmitter release may guide synaptic adjustments.\n","authors":["Jesus Garcia Fernandez","Nasir Ahmad","Marcel van Gerven"],"pdf_url":"https://arxiv.org/pdf/2410.13563v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11397v2","updated":"2024-10-23T13:14:21Z","published":"2024-10-15T08:39:31Z","title":"FOOGD: Federated Collaboration for Both Out-of-distribution\n Generalization and Detection","summary":" Federated learning (FL) is a promising machine learning paradigm that\ncollaborates with client models to capture global knowledge. However, deploying\nFL models in real-world scenarios remains unreliable due to the coexistence of\nin-distribution data and unexpected out-of-distribution (OOD) data, such as\ncovariate-shift and semantic-shift data. Current FL researches typically\naddress either covariate-shift data through OOD generalization or\nsemantic-shift data via OOD detection, overlooking the simultaneous occurrence\nof various OOD shifts. In this work, we propose FOOGD, a method that estimates\nthe probability density of each client and obtains reliable global distribution\nas guidance for the subsequent FL process. Firstly, SM3D in FOOGD estimates\nscore model for arbitrary distributions without prior constraints, and detects\nsemantic-shift data powerfully. Then SAG in FOOGD provides invariant yet\ndiverse knowledge for both local covariate-shift generalization and client\nperformance generalization. In empirical validations, FOOGD significantly\nenjoys three main advantages: (1) reliably estimating non-normalized\ndecentralized distributions, (2) detecting semantic shift data via score\nvalues, and (3) generalizing to covariate-shift data by regularizing feature\nextractor. 
The project is open-sourced at https://github.com/XeniaLLL/FOOGD-main.git.\n","authors":["Xinting Liao","Weiming Liu","Pengyang Zhou","Fengyuan Yu","Jiahe Xu","Jun Wang","Wenjie Wang","Chaochao Chen","Xiaolin Zheng"],"pdf_url":"https://arxiv.org/pdf/2410.11397v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.17840v1","updated":"2024-10-23T13:05:46Z","published":"2024-10-23T13:05:46Z","title":"Is the GPU Half-Empty or Half-Full? Practical Scheduling Techniques for\n LLMs","summary":" Serving systems for Large Language Models (LLMs) improve throughput by\nprocessing several requests concurrently. However, multiplexing hardware\nresources between concurrent requests involves non-trivial scheduling\ndecisions. Practical serving systems typically implement these decisions at two\nlevels: First, a load balancer routes requests to different servers which each\nhold a replica of the LLM. Then, on each server, an engine-level scheduler\ndecides when to run a request, or when to queue or preempt it. Improved\nscheduling policies may benefit a wide range of LLM deployments and can often\nbe implemented as \"drop-in replacements\" to a system's current policy. In this\nwork, we survey scheduling techniques from the literature and from practical\nserving systems. We find that schedulers from the literature often achieve good\nperformance but introduce significant complexity. In contrast, schedulers in\npractical deployments often leave easy performance gains on the table but are\neasy to implement, deploy and configure. This finding motivates us to introduce\ntwo new scheduling techniques, which are both easy to implement, and outperform\ncurrent techniques on production workload traces.\n","authors":["Ferdi Kossmann","Bruce Fontaine","Daya Khudia","Michael Cafarella","Samuel Madden"],"pdf_url":"https://arxiv.org/pdf/2410.17840v1.pdf","comment":"12 pages, 11 figures"},{"id":"http://arxiv.org/abs/2403.14774v2","updated":"2024-10-23T13:01:14Z","published":"2024-03-21T18:28:43Z","title":"Few-Shot Adversarial Prompt Learning on Vision-Language Models","summary":" The vulnerability of deep neural networks to imperceptible adversarial\nperturbations has attracted widespread attention. Inspired by the success of\nvision-language foundation models, previous efforts achieved zero-shot\nadversarial robustness by aligning adversarial visual features with text\nsupervision. However, in practice, they are still unsatisfactory due to several\nissues, including heavy adaptation cost, suboptimal text supervision, and\nuncontrolled natural generalization capacity. In this paper, to address these\nissues, we propose a few-shot adversarial prompt framework where adapting input\nsequences with limited data makes significant adversarial robustness\nimprovement. Specifically, we achieve this by providing adversarially\ncorrelated text supervision that is end-to-end learned from adversarial\nexamples. We also propose a novel training objective that enhances the\nconsistency of multi-modal features while encouraging differentiated uni-modal\nfeatures between natural and adversarial examples. The proposed framework gives\naccess to learn adversarial text supervision, which provides superior\ncross-modal adversarial alignment and matches state-of-the-art zero-shot\nadversarial robustness with only 1% training data. 
Code is available at:\nhttps://github.com/lionel-w2/FAP.\n","authors":["Yiwei Zhou","Xiaobo Xia","Zhiwei Lin","Bo Han","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.14774v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.17055v2","updated":"2024-10-23T12:55:39Z","published":"2024-10-22T14:36:44Z","title":"Optimal Design for Reward Modeling in RLHF","summary":" Reinforcement Learning from Human Feedback (RLHF) has become a popular\napproach to align language models (LMs) with human preferences. This method\ninvolves collecting a large dataset of human pairwise preferences across\nvarious text generations and using it to infer (implicitly or explicitly) a\nreward model. Numerous methods have been proposed to learn the reward model and\nalign a LM with it. However, the costly process of collecting human preferences\nhas received little attention and could benefit from theoretical insights. This\npaper addresses this issue and aims to formalize the reward training model in\nRLHF. We frame the selection of an effective dataset as a simple regret\nminimization task, using a linear contextual dueling bandit method. Given the\npotentially large number of arms, this approach is more coherent than the\nbest-arm identification setting. We then propose an offline framework for\nsolving this problem. Under appropriate assumptions - linearity of the reward\nmodel in the embedding space, and boundedness of the reward parameter - we\nderive bounds on the simple regret. Finally, we provide a lower bound that\nmatches our upper bound up to constant and logarithmic terms. To our knowledge,\nthis is the first theoretical contribution in this area to provide an offline\napproach as well as worst-case guarantees.\n","authors":["Antoine Scheid","Etienne Boursier","Alain Durmus","Michael I. Jordan","Pierre Ménard","Eric Moulines","Michal Valko"],"pdf_url":"https://arxiv.org/pdf/2410.17055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17835v1","updated":"2024-10-23T12:54:04Z","published":"2024-10-23T12:54:04Z","title":"Optimal Streaming Algorithms for Multi-Armed Bandits","summary":" This paper studies two variants of the best arm identification (BAI) problem\nunder the streaming model, where we have a stream of $n$ arms with reward\ndistributions supported on $[0,1]$ with unknown means. The arms in the stream\nare arriving one by one, and the algorithm cannot access an arm unless it is\nstored in a limited size memory.\n We first study the streaming \\eps-$top$-$k$ arms identification problem,\nwhich asks for $k$ arms whose reward means are lower than that of the $k$-th\nbest arm by at most $\\eps$ with probability at least $1-\\delta$. For general\n$\\eps \\in (0,1)$, the existing solution for this problem assumes $k = 1$ and\nachieves the optimal sample complexity $O(\\frac{n}{\\eps^2} \\log\n\\frac{1}{\\delta})$ using $O(\\log^*(n))$ ($\\log^*(n)$ equals the number of times\nthat we need to apply the logarithm function on $n$ before the result is no\nmore than 1.) memory and a single pass of the stream. We propose an algorithm\nthat works for any $k$ and achieves the optimal sample complexity\n$O(\\frac{n}{\\eps^2} \\log\\frac{k}{\\delta})$ using a single-arm memory and a\nsingle pass of the stream.\n Second, we study the streaming BAI problem, where the objective is to\nidentify the arm with the maximum reward mean with at least $1-\\delta$\nprobability, using a single-arm memory and as few passes of the input stream as\npossible. 
We present a single-arm-memory algorithm that achieves a near\ninstance-dependent optimal sample complexity within $O(\\log \\Delta_2^{-1})$\npasses, where $\\Delta_2$ is the gap between the mean of the best arm and that\nof the second best arm.\n","authors":["Tianyuan Jin","Keke Huang","Jing Tang","Xiaokui Xiao"],"pdf_url":"https://arxiv.org/pdf/2410.17835v1.pdf","comment":"24pages"},{"id":"http://arxiv.org/abs/2410.17834v1","updated":"2024-10-23T12:53:58Z","published":"2024-10-23T12:53:58Z","title":"Non-intrusive Speech Quality Assessment with Diffusion Models Trained on\n Clean Speech","summary":" Diffusion models have found great success in generating high quality, natural\nsamples of speech, but their potential for density estimation for speech has so\nfar remained largely unexplored. In this work, we leverage an unconditional\ndiffusion model trained only on clean speech for the assessment of speech\nquality. We show that the quality of a speech utterance can be assessed by\nestimating the likelihood of a corresponding sample in the terminating Gaussian\ndistribution, obtained via a deterministic noising process. The resulting\nmethod is purely unsupervised, trained only on clean speech, and therefore does\nnot rely on annotations. Our diffusion-based approach leverages clean speech\npriors to assess quality based on how the input relates to the learned\ndistribution of clean data. Our proposed log-likelihoods show promising\nresults, correlating well with intrusive speech quality metrics such as POLQA\nand SI-SDR.\n","authors":["Danilo de Oliveira","Julius Richter","Jean-Marie Lemercier","Simon Welker","Timo Gerkmann"],"pdf_url":"https://arxiv.org/pdf/2410.17834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01476v2","updated":"2024-10-23T12:53:49Z","published":"2024-10-02T12:30:05Z","title":"Reducing Variance in Meta-Learning via Laplace Approximation for\n Regression Tasks","summary":" Given a finite set of sample points, meta-learning algorithms aim to learn an\noptimal adaptation strategy for new, unseen tasks. Often, this data can be\nambiguous as it might belong to different tasks concurrently. This is\nparticularly the case in meta-regression tasks. In such cases, the estimated\nadaptation strategy is subject to high variance due to the limited amount of\nsupport data for each task, which often leads to sub-optimal generalization\nperformance. In this work, we address the problem of variance reduction in\ngradient-based meta-learning and formalize the class of problems prone to this,\na condition we refer to as \\emph{task overlap}. Specifically, we propose a\nnovel approach that reduces the variance of the gradient estimate by weighing\neach support point individually by the variance of its posterior over the\nparameters. To estimate the posterior, we utilize the Laplace approximation,\nwhich allows us to express the variance in terms of the curvature of the loss\nlandscape of our meta-learner. 
Experimental results demonstrate the\neffectiveness of the proposed method and highlight the importance of variance\nreduction in meta-learning.\n","authors":["Alfredo Reichlin","Gustaf Tegnér","Miguel Vasco","Hang Yin","Mårten Björkman","Danica Kragic"],"pdf_url":"https://arxiv.org/pdf/2410.01476v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.01708v4","updated":"2024-10-23T12:50:18Z","published":"2022-10-04T16:08:54Z","title":"Conquering the Communication Constraints to Enable Large Pre-Trained\n Models in Federated Learning","summary":" Federated learning (FL) has emerged as a promising paradigm for enabling the\ncollaborative training of models without centralized access to the raw data on\nlocal devices. In the typical FL paradigm (e.g., FedAvg), model weights are\nsent to and from the server each round to participating clients. Recently, the\nuse of small pre-trained models has been shown effective in federated learning\noptimization and improving convergence. However, recent state-of-the-art\npre-trained models are getting more capable but also have more parameters. In\nconventional FL, sharing the enormous model weights can quickly put a massive\ncommunication burden on the system, especially if more capable models are\nemployed. Can we find a solution to enable those strong and readily-available\npre-trained models in FL to achieve excellent performance while simultaneously\nreducing the communication burden? To this end, we investigate the use of\nparameter-efficient fine-tuning in federated learning and thus introduce a new\nframework: FedPEFT. Specifically, we systemically evaluate the performance of\nFedPEFT across a variety of client stability, data distribution, and\ndifferential privacy settings. By only locally tuning and globally sharing a\nsmall portion of the model weights, significant reductions in the total\ncommunication overhead can be achieved while maintaining competitive or even\nbetter performance in a wide range of federated learning scenarios, providing\ninsight into a new paradigm for practical and effective federated systems.\n","authors":["Guangyu Sun","Umar Khalid","Matias Mendieta","Taojiannan Yang","Pu Wang","Minwoo Lee","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2210.01708v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01335v2","updated":"2024-10-23T12:38:40Z","published":"2024-03-30T13:25:11Z","title":"Generative AI Models for Different Steps in Architectural Design: A\n Literature Review","summary":" Recent advances in generative artificial intelligence (AI) technologies have\nbeen significantly driven by models such as generative adversarial networks\n(GANs), variational autoencoders (VAEs), and denoising diffusion probabilistic\nmodels (DDPMs). Although architects recognize the potential of generative AI in\ndesign, personal barriers often restrict their access to the latest\ntechnological developments, thereby causing the application of generative AI in\narchitectural design to lag behind. Therefore, it is essential to comprehend\nthe principles and advancements of generative AI models and analyze their\nrelevance in architecture applications. This paper first provides an overview\nof generative AI technologies, with a focus on probabilistic diffusion models\n(DDPMs), 3D generative models, and foundation models, highlighting their recent\ndevelopments and main application scenarios. Then, the paper explains how the\nabovementioned models could be utilized in architecture. 
We subdivide the\narchitectural design process into six steps and review related research\nprojects in each step from 2020 to the present. Lastly, this paper discusses\npotential future directions for applying generative AI in the architectural\ndesign steps. This research can help architects quickly understand the\ndevelopment and latest progress of generative AI and contribute to the further\ndevelopment of intelligent architecture.\n","authors":["Chengyuan Li","Tianyu Zhang","Xusheng Du","Ye Zhang","Haoran Xie"],"pdf_url":"https://arxiv.org/pdf/2404.01335v2.pdf","comment":"34 pages, 14 figures, accepted by Frontiers of Architectural Research"},{"id":"http://arxiv.org/abs/2410.17823v1","updated":"2024-10-23T12:32:21Z","published":"2024-10-23T12:32:21Z","title":"Att2CPC: Attention-Guided Lossy Attribute Compression of Point Clouds","summary":" With the great progress of 3D sensing and acquisition technology, the volume\nof point cloud data has grown dramatically, which urges the development of\nefficient point cloud compression methods. In this paper, we focus on the task\nof learned lossy point cloud attribute compression (PCAC). We propose an\nefficient attention-based method for lossy compression of point cloud\nattributes leveraging an autoencoder architecture. Specifically, at the\nencoding side, we conduct multiple downsampling to best exploit the local\nattribute patterns, in which effective External Cross Attention (ECA) is\ndevised to hierarchically aggregate features by integrating attributes and\ngeometry contexts. At the decoding side, the attributes of the point cloud are\nprogressively reconstructed based on the multi-scale representation and the\nzero-padding upsampling tactic. To the best of our knowledge, this is the first\napproach to introduce an attention mechanism to the point-based lossy PCAC task. We\nverify the compression efficiency of our model on various sequences, including\nhuman body frames, sparse objects, and large-scale point cloud scenes.\nExperiments show that our method achieves an average improvement of 1.15 dB and\n2.13 dB in BD-PSNR of Y channel and YUV channel, respectively, when comparing\nwith the state-of-the-art point-based method Deep-PCAC. Codes of this paper are\navailable at https://github.com/I2-Multimedia-Lab/Att2CPC.\n","authors":["Kai Liu","Kang You","Pan Gao","Manoranjan Paul"],"pdf_url":"https://arxiv.org/pdf/2410.17823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04811v2","updated":"2024-10-23T12:27:12Z","published":"2024-07-05T18:49:07Z","title":"Simplifying Deep Temporal Difference Learning","summary":" Q-learning played a foundational role in the field of reinforcement learning\n(RL). However, TD algorithms with off-policy data, such as Q-learning, or\nnonlinear function approximation like deep neural networks require several\nadditional tricks to stabilise training, primarily a replay buffer and target\nnetworks. Unfortunately, the delayed updating of frozen network parameters in\nthe target network harms the sample efficiency and, similarly, the replay\nbuffer introduces memory and implementation overheads. In this paper, we\ninvestigate whether it is possible to accelerate and simplify TD training while\nmaintaining its stability. Our key theoretical result demonstrates for the\nfirst time that regularisation techniques such as LayerNorm can yield provably\nconvergent TD algorithms without the need for a target network, even with\noff-policy data. 
Empirically, we find that online, parallelised sampling\nenabled by vectorised environments stabilises training without the need of a\nreplay buffer. Motivated by these findings, we propose PQN, our simplified deep\nonline Q-Learning algorithm. Surprisingly, this simple algorithm is competitive\nwith more complex methods like: Rainbow in Atari, R2D2 in Hanabi, QMix in Smax,\nPPO-RNN in Craftax, and can be up to 50x faster than traditional DQN without\nsacrificing sample efficiency. In an era where PPO has become the go-to RL\nalgorithm, PQN reestablishes Q-learning as a viable alternative.\n","authors":["Matteo Gallici","Mattie Fellows","Benjamin Ellis","Bartomeu Pou","Ivan Masmitja","Jakob Nicolaus Foerster","Mario Martin"],"pdf_url":"https://arxiv.org/pdf/2407.04811v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04372v3","updated":"2024-10-23T12:19:16Z","published":"2024-01-09T06:15:45Z","title":"Stable generative modeling using Schrödinger bridges","summary":" We consider the problem of sampling from an unknown distribution for which\nonly a sufficiently large number of training samples are available. Such\nsettings have recently drawn considerable interest in the context of generative\nmodelling and Bayesian inference. In this paper, we propose a generative model\ncombining Schr\\\"odinger bridges and Langevin dynamics. Schr\\\"odinger bridges\nover an appropriate reversible reference process are used to approximate the\nconditional transition probability from the available training samples, which\nis then implemented in a discrete-time reversible Langevin sampler to generate\nnew samples. By setting the kernel bandwidth in the reference process to match\nthe time step size used in the unadjusted Langevin algorithm, our method\neffectively circumvents any stability issues typically associated with the\ntime-stepping of stiff stochastic differential equations. Moreover, we\nintroduce a novel split-step scheme, ensuring that the generated samples remain\nwithin the convex hull of the training samples. Our framework can be naturally\nextended to generate conditional samples and to Bayesian inference problems. We\ndemonstrate the performance of our proposed scheme through experiments on\nsynthetic datasets with increasing dimensions and on a stochastic subgrid-scale\nparametrization conditional sampling problem as well as generating sample\ntrajectories of a dynamical system using conditional sampling.\n","authors":["Georg A. Gottwald","Fengyi Li","Youssef Marzouk","Sebastian Reich"],"pdf_url":"https://arxiv.org/pdf/2401.04372v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11960v4","updated":"2024-10-23T12:18:49Z","published":"2024-03-18T16:57:16Z","title":"Causality-Aware Spatiotemporal Graph Neural Networks for Spatiotemporal\n Time Series Imputation","summary":" Spatiotemporal time series are usually collected via monitoring sensors\nplaced at different locations, which usually contain missing values due to\nvarious failures, such as mechanical damages and Internet outages. Imputing the\nmissing values is crucial for analyzing time series. When recovering a specific\ndata point, most existing methods consider all the information relevant to that\npoint regardless of the cause-and-effect relationship. During data collection,\nit is inevitable that some unknown confounders are included, e.g., background\nnoise in time series and non-causal shortcut edges in the constructed sensor\nnetwork. 
These confounders could open backdoor paths and establish non-causal\ncorrelations between the input and output. Over-exploiting these non-causal\ncorrelations could cause overfitting. In this paper, we first revisit\nspatiotemporal time series imputation from a causal perspective and show how to\nblock the confounders via the frontdoor adjustment. Based on the results of\nfrontdoor adjustment, we introduce a novel Causality-Aware Spatiotemporal Graph\nNeural Network (Casper), which contains a novel Prompt Based Decoder (PBD) and\na Spatiotemporal Causal Attention (SCA). PBD could reduce the impact of\nconfounders and SCA could discover the sparse causal relationships among\nembeddings. Theoretical analysis reveals that SCA discovers causal\nrelationships based on the values of gradients. We evaluate Casper on three\nreal-world datasets, and the experimental results show that Casper could\noutperform the baselines and could effectively discover causal relationships.\n","authors":["Baoyu Jing","Dawei Zhou","Kan Ren","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2403.11960v4.pdf","comment":"Accepted by CIKM'2024. Fixed typos"},{"id":"http://arxiv.org/abs/2410.17814v1","updated":"2024-10-23T12:18:36Z","published":"2024-10-23T12:18:36Z","title":"Learning Lossless Compression for High Bit-Depth Volumetric Medical\n Image","summary":" Recent advances in learning-based methods have markedly enhanced the\ncapabilities of image compression. However, these methods struggle with high\nbit-depth volumetric medical images, facing issues such as degraded\nperformance, increased memory demand, and reduced processing speed. To address\nthese challenges, this paper presents the Bit-Division based Lossless\nVolumetric Image Compression (BD-LVIC) framework, which is tailored for high\nbit-depth medical volume compression. The BD-LVIC framework skillfully divides\nthe high bit-depth volume into two lower bit-depth segments: the Most\nSignificant Bit-Volume (MSBV) and the Least Significant Bit-Volume (LSBV). The\nMSBV concentrates on the most significant bits of the volumetric medical image,\ncapturing vital structural details in a compact manner. This reduction in\ncomplexity greatly improves compression efficiency using traditional codecs.\nConversely, the LSBV deals with the least significant bits, which encapsulate\nintricate texture details. To compress this detailed information effectively,\nwe introduce an effective learning-based compression model equipped with a\nTransformer-Based Feature Alignment Module, which exploits both intra-slice and\ninter-slice redundancies to accurately align features. Subsequently, a Parallel\nAutoregressive Coding Module merges these features to precisely estimate the\nprobability distribution of the least significant bit-planes. 
Our extensive\ntesting demonstrates that the BD-LVIC framework not only sets new performance\nbenchmarks across various datasets but also maintains a competitive coding\nspeed, highlighting its significant potential and practical utility in the\nrealm of volumetric medical image compression.\n","authors":["Kai Wang","Yuanchao Bai","Daxin Li","Deming Zhai","Junjun Jiang","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2410.17814v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2408.04307v2","updated":"2024-10-23T12:08:33Z","published":"2024-08-08T08:40:15Z","title":"MoC-System: Efficient Fault Tolerance for Sparse Mixture-of-Experts\n Model Training","summary":" As large language models continue to scale up, distributed training systems\nhave expanded beyond 10k nodes, intensifying the importance of fault tolerance.\nCheckpoint has emerged as the predominant fault tolerance strategy, with\nextensive studies dedicated to optimizing its efficiency. However, the advent\nof the sparse Mixture-of-Experts (MoE) model presents new challenges due to the\nsubstantial increase in model size, despite comparable computational demands to\ndense models.\n In this work, we propose the Mixture-of-Checkpoint System (MoC-System) to\norchestrate the vast array of checkpoint shards produced in distributed\ntraining systems. MoC-System features a novel Partial Experts Checkpointing\n(PEC) mechanism, an algorithm-system co-design that strategically saves a\nselected subset of experts, effectively reducing the MoE checkpoint size to\nlevels comparable with dense models. Incorporating hybrid parallel strategies,\nMoC-System involves fully sharded checkpointing strategies to evenly distribute\nthe workload across distributed ranks. Furthermore, MoC-System introduces a\ntwo-level checkpointing management method that asynchronously handles in-memory\nsnapshots and persistence processes.\n We build MoC-System upon the Megatron-DeepSpeed framework, achieving up to a\n98.9% reduction in overhead for each checkpointing process compared to the\noriginal method, during MoE model training with ZeRO-2 data parallelism and\nexpert parallelism. Additionally, extensive empirical analyses substantiate\nthat our methods enhance efficiency while maintaining comparable model\naccuracy, even achieving an average accuracy increase of 1.08% on downstream\ntasks.\n","authors":["Weilin Cai","Le Qin","Jiayi Huang"],"pdf_url":"https://arxiv.org/pdf/2408.04307v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12641v3","updated":"2024-10-23T11:54:42Z","published":"2024-03-19T11:24:14Z","title":"Automated Contrastive Learning Strategy Search for Time Series","summary":" In recent years, Contrastive Learning (CL) has become a predominant\nrepresentation learning paradigm for time series. Most existing methods\nmanually build specific CL Strategies (CLS) by human heuristics for certain\ndatasets and tasks. However, manually developing CLS usually requires excessive\nprior knowledge about the data, and massive experiments to determine the\ndetailed CL configurations. In this paper, we present an Automated Machine\nLearning (AutoML) practice at Microsoft, which automatically learns CLS for\ntime series datasets and tasks, namely Automated Contrastive Learning (AutoCL).\nWe first construct a principled search space of size over $3\\times10^{12}$,\ncovering data augmentation, embedding transformation, contrastive pair\nconstruction, and contrastive losses. 
Further, we introduce an efficient\nreinforcement learning algorithm, which optimizes CLS from the performance on\nthe validation tasks, to obtain effective CLS within the space. Experimental\nresults on various real-world datasets demonstrate that AutoCL could\nautomatically find the suitable CLS for the given dataset and task. From the\ncandidate CLS found by AutoCL on several public datasets/tasks, we compose a\ntransferable Generally Good Strategy (GGS), which has a strong performance for\nother datasets. We also provide empirical analysis as a guide for the future\ndesign of CLS.\n","authors":["Baoyu Jing","Yansen Wang","Guoxin Sui","Jing Hong","Jingrui He","Yuqing Yang","Dongsheng Li","Kan Ren"],"pdf_url":"https://arxiv.org/pdf/2403.12641v3.pdf","comment":"Accepted by CIKM'2024. Fixed typos"},{"id":"http://arxiv.org/abs/2410.17796v1","updated":"2024-10-23T11:52:52Z","published":"2024-10-23T11:52:52Z","title":"A Comprehensive Analysis on the Learning Curve in Kernel Ridge\n Regression","summary":" This paper conducts a comprehensive study of the learning curves of kernel\nridge regression (KRR) under minimal assumptions. Our contributions are\nthree-fold: 1) we analyze the role of key properties of the kernel, such as its\nspectral eigen-decay, the characteristics of the eigenfunctions, and the\nsmoothness of the kernel; 2) we demonstrate the validity of the Gaussian\nEquivalent Property (GEP), which states that the generalization performance of\nKRR remains the same when the whitened features are replaced by standard\nGaussian vectors, thereby shedding light on the success of previous analyses\nunder the Gaussian Design Assumption; 3) we derive novel bounds that improve\nover existing bounds across a broad range of settings such as (in)dependent\nfeature vectors and various combinations of eigen-decay rates in the\nover/underparameterized regimes.\n","authors":["Tin Sum Cheng","Aurelien Lucchi","Anastasis Kratsios","David Belius"],"pdf_url":"https://arxiv.org/pdf/2410.17796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17792v1","updated":"2024-10-23T11:47:04Z","published":"2024-10-23T11:47:04Z","title":"Enhancing Federated Learning Convergence with Dynamic Data Queue and\n Data Entropy-driven Participant Selection","summary":" Federated Learning (FL) is a decentralized approach for collaborative model\ntraining on edge devices. This distributed method of model training offers\nadvantages in privacy, security, regulatory compliance, and cost-efficiency.\nOur emphasis in this research lies in addressing statistical complexity in FL,\nespecially when the data stored locally across devices is not identically and\nindependently distributed (non-IID). We have observed an accuracy reduction of\nup to approximately 10\\% to 30\\%, particularly in skewed scenarios where each\nedge device trains with only 1 class of data. This reduction is attributed to\nweight divergence, quantified using the Euclidean distance between device-level\nclass distributions and the population distribution, resulting in a bias term\n(\\(\\delta_k\\)). 
As a solution, we present a method to improve convergence in FL\nby creating a global subset of data on the server and dynamically distributing\nit across devices using a Dynamic Data queue-driven Federated Learning (DDFL).\nNext, we leverage Data Entropy metrics to observe the process during each\ntraining round and enable reasonable device selection for aggregation.\nFurthermore, we provide a convergence analysis of our proposed DDFL to justify\ntheir viability in practical FL scenarios, aiming for better device selection,\na non-sub-optimal global model, and faster convergence. We observe that our\napproach results in a substantial accuracy boost of approximately 5\\% for the\nMNIST dataset, around 18\\% for CIFAR-10, and 20\\% for CIFAR-100 with a 10\\%\nglobal subset of data, outperforming the state-of-the-art (SOTA) aggregation\nalgorithms.\n","authors":["Charuka Herath","Xiaolan Liu","Sangarapillai Lambotharan","Yogachandran Rahulamathavan"],"pdf_url":"https://arxiv.org/pdf/2410.17792v1.pdf","comment":"The Journal is submitted to IEEE Transactions in the Internet of\n Things"},{"id":"http://arxiv.org/abs/2410.17787v1","updated":"2024-10-23T11:37:20Z","published":"2024-10-23T11:37:20Z","title":"Large Language Models Engineer Too Many Simple Features For Tabular Data","summary":" Tabular machine learning problems often require time-consuming and\nlabor-intensive feature engineering. Recent efforts have focused on using large\nlanguage models (LLMs) to capitalize on their potential domain knowledge. At\nthe same time, researchers have observed ethically concerning negative biases\nin other LLM-related use cases, such as text generation. These developments\nmotivated us to investigate whether LLMs exhibit a bias that negatively impacts\nthe performance of feature engineering. While not ethically concerning, such a\nbias could hinder practitioners from fully utilizing LLMs for automated data\nscience. Therefore, we propose a method to detect potential biases by detecting\nanomalies in the frequency of operators (e.g., adding two features) suggested\nby LLMs when engineering new features. Our experiments evaluate the bias of\nfour LLMs, two big frontier and two small open-source models, across 27 tabular\ndatasets. Our results indicate that LLMs are biased toward simple operators,\nsuch as addition, and can fail to utilize more complex operators, such as\ngrouping followed by aggregations. Furthermore, the bias can negatively impact\nthe predictive performance when using LLM-generated features. Our results call\nfor mitigating bias when using LLMs for feature engineering.\n","authors":["Jaris Küken","Lennart Purucker","Frank Hutter"],"pdf_url":"https://arxiv.org/pdf/2410.17787v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.17772v1","updated":"2024-10-23T11:19:48Z","published":"2024-10-23T11:19:48Z","title":"Scaling Robot Policy Learning via Zero-Shot Labeling with Foundation\n Models","summary":" A central challenge towards developing robots that can relate human language\nto their perception and actions is the scarcity of natural language annotations\nin diverse robot datasets. Moreover, robot policies that follow natural\nlanguage instructions are typically trained on either templated language or\nexpensive human-labeled instructions, hindering their scalability. To this end,\nwe introduce NILS: Natural language Instruction Labeling for Scalability. 
NILS\nautomatically labels uncurated, long-horizon robot data at scale in a zero-shot\nmanner without any human intervention. NILS combines pretrained vision-language\nfoundation models in order to detect objects in a scene, detect object-centric\nchanges, segment tasks from large datasets of unlabelled interaction data and\nultimately label behavior datasets. Evaluations on BridgeV2, Fractal, and a\nkitchen play dataset show that NILS can autonomously annotate diverse robot\ndemonstrations of unlabeled and unstructured datasets while alleviating several\nshortcomings of crowdsourced human annotations, such as low data quality and\ndiversity. We use NILS to label over 115k trajectories obtained from over 430\nhours of robot data. We open-source our auto-labeling code and generated\nannotations on our website: http://robottasklabeling.github.io.\n","authors":["Nils Blank","Moritz Reuss","Marcel Rühle","Ömer Erdinç Yağmurlu","Fabian Wenzel","Oier Mees","Rudolf Lioutikov"],"pdf_url":"https://arxiv.org/pdf/2410.17772v1.pdf","comment":"Project Website at https://robottasklabeling.github.io/"},{"id":"http://arxiv.org/abs/2410.17770v1","updated":"2024-10-23T11:19:08Z","published":"2024-10-23T11:19:08Z","title":"Locating Information in Large Language Models via Random Matrix Theory","summary":" As large language models (LLMs) become central to AI applications, gaining a\ndeeper understanding of their inner workings is increasingly important. In this\nwork, we analyze the weight matrices of pretrained transformer models --\nspecifically BERT and Llama -- using random matrix theory (RMT) as a\nzero-information hypothesis. While randomly initialized weights perfectly agree\nwith RMT predictions, deviations emerge after training, allowing us to locate\nlearned structures within the models. We identify layer-type specific behaviors\nthat are consistent across all blocks and architectures considered. By\npinpointing regions that deviate from RMT predictions, we highlight areas of\nfeature learning and confirm this through comparisons with the activation\ncovariance matrices of the corresponding layers. Our method provides a\ndiagnostic tool for identifying relevant regions in transformer weights using\nonly the trained matrices. Additionally, we address the ongoing debate\nregarding the significance of small singular values in the context of\nfine-tuning and alignment in LLMs. Our findings reveal that, after fine-tuning,\nsmall singular values play a crucial role in the models' capabilities,\nsuggesting that removing them in an already aligned transformer can be\ndetrimental, as it may compromise model alignment.\n","authors":["Max Staats","Matthias Thamm","Bernd Rosenow"],"pdf_url":"https://arxiv.org/pdf/2410.17770v1.pdf","comment":"17 pages, 14 figures"},{"id":"http://arxiv.org/abs/2410.17765v1","updated":"2024-10-23T11:06:36Z","published":"2024-10-23T11:06:36Z","title":"Faster Language Models with Better Multi-Token Prediction Using Tensor\n Decomposition","summary":" We propose a new model for multi-token prediction in transformers, aiming to\nenhance sampling efficiency without compromising accuracy. Motivated by recent\nwork that predicts the probabilities of subsequent tokens using multiple heads,\nwe connect this approach to rank-$1$ canonical tensor decomposition. By\ngeneralizing it to a rank-$r$ canonical probability decomposition, we develop\nan improved model that predicts multiple tokens simultaneously. 
This model can\nalso be interpreted as a mixture of experts, allowing us to leverage successful\ntechniques from that domain for efficient and robust training. Importantly, the\noverall overhead for training and sampling remains low. Our method demonstrates\nsignificant improvements in inference speed for both text and code generation\ntasks, proving particularly beneficial within the self-speculative decoding\nparadigm. It maintains its effectiveness across various model sizes and\ntraining epochs, highlighting its robustness and scalability.\n","authors":["Artem Basharin","Andrei Chertkov","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2410.17765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17764v1","updated":"2024-10-23T11:02:59Z","published":"2024-10-23T11:02:59Z","title":"Beyond Backpropagation: Optimization with Multi-Tangent Forward\n Gradients","summary":" The gradients used to train neural networks are typically computed using\nbackpropagation. While an efficient way to obtain exact gradients,\nbackpropagation is computationally expensive, hinders parallelization, and is\nbiologically implausible. Forward gradients are an approach to approximate the\ngradients from directional derivatives along random tangents computed by\nforward-mode automatic differentiation. So far, research has focused on using a\nsingle tangent per step. This paper provides an in-depth analysis of\nmulti-tangent forward gradients and introduces an improved approach to\ncombining the forward gradients from multiple tangents based on orthogonal\nprojections. We demonstrate that increasing the number of tangents improves\nboth approximation quality and optimization performance across various tasks.\n","authors":["Katharina Flügel","Daniel Coquelin","Marie Weiel","Achim Streit","Markus Götz"],"pdf_url":"https://arxiv.org/pdf/2410.17764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12676v2","updated":"2024-10-23T11:01:45Z","published":"2023-12-20T00:31:43Z","title":"Bayesian Analysis of Combinatorial Gaussian Process Bandits","summary":" We consider the combinatorial volatile Gaussian process (GP) semi-bandit\nproblem. Each round, an agent is provided a set of available base arms and must\nselect a subset of them to maximize the long-term cumulative reward. We study\nthe Bayesian setting and provide novel Bayesian cumulative regret bounds for\nthree GP-based algorithms: GP-UCB, GP-BayesUCB and GP-TS. Our bounds extend\nprevious results for GP-UCB and GP-TS to the infinite, volatile and\ncombinatorial setting, and to the best of our knowledge, we provide the first\nregret bound for GP-BayesUCB. Volatile arms encompass other widely considered\nbandit problems such as contextual bandits. 
Furthermore, we employ our\nframework to address the challenging real-world problem of online\nenergy-efficient navigation, where we demonstrate its effectiveness compared to\nthe alternatives.\n","authors":["Jack Sandberg","Niklas Åkerblom","Morteza Haghir Chehreghani"],"pdf_url":"https://arxiv.org/pdf/2312.12676v2.pdf","comment":"32 pages, 10 figures"},{"id":"http://arxiv.org/abs/2410.17762v1","updated":"2024-10-23T11:01:39Z","published":"2024-10-23T11:01:39Z","title":"Anomaly Resilient Temporal QoS Prediction using Hypergraph Convoluted\n Transformer Network","summary":" Quality-of-Service (QoS) prediction is a critical task in the service\nlifecycle, enabling precise and adaptive service recommendations by\nanticipating performance variations over time in response to evolving network\nuncertainties and user preferences. However, contemporary QoS prediction\nmethods frequently encounter data sparsity and cold-start issues, which hinder\naccurate QoS predictions and limit the ability to capture diverse user\npreferences. Additionally, these methods often assume QoS data reliability,\nneglecting potential credibility issues such as outliers and the presence of\ngreysheep users and services with atypical invocation patterns. Furthermore,\ntraditional approaches fail to leverage diverse features, including\ndomain-specific knowledge and complex higher-order patterns, essential for\naccurate QoS predictions. In this paper, we introduce a real-time, trust-aware\nframework for temporal QoS prediction to address the aforementioned challenges,\nfeaturing an end-to-end deep architecture called the Hypergraph Convoluted\nTransformer Network (HCTN). HCTN combines a hypergraph structure with graph\nconvolution over hyper-edges to effectively address high-sparsity issues by\ncapturing complex, high-order correlations. Complementing this, the transformer\nnetwork utilizes multi-head attention along with parallel 1D convolutional\nlayers and fully connected dense blocks to capture both fine-grained and\ncoarse-grained dynamic patterns. Additionally, our approach includes a\nsparsity-resilient solution for detecting greysheep users and services,\nincorporating their unique characteristics to improve prediction accuracy.\nTrained with a robust loss function resistant to outliers, HCTN demonstrated\nstate-of-the-art performance on the large-scale WSDREAM-2 datasets for response\ntime and throughput.\n","authors":["Suraj Kumar","Soumi Chattopadhyay","Chandranath Adak"],"pdf_url":"https://arxiv.org/pdf/2410.17762v1.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2410.17760v1","updated":"2024-10-23T10:56:05Z","published":"2024-10-23T10:56:05Z","title":"Topology meets Machine Learning: An Introduction using the Euler\n Characteristic Transform","summary":" This overview article makes the case for how topological concepts can enrich\nresearch in machine learning. Using the Euler Characteristic Transform (ECT), a\ngeometrical-topological invariant, as a running example, I present different\nuse cases that result in more efficient models for analyzing point clouds,\ngraphs, and meshes. Moreover, I outline a vision for how topological concepts\ncould be used in the future, comprising (1) the learning of functions on\ntopological spaces, (2) the building of hybrid models that imbue neural\nnetworks with knowledge about the topological information in data, and (3) the\nanalysis of qualitative properties of neural networks. 
With current research\nalready addressing some of these aspects, this article thus serves as an\nintroduction and invitation to this nascent area of research.\n","authors":["Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2410.17760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17758v1","updated":"2024-10-23T10:50:07Z","published":"2024-10-23T10:50:07Z","title":"Escaping the Forest: Sparse Interpretable Neural Networks for Tabular\n Data","summary":" Tabular datasets are widely used in scientific disciplines such as biology.\nWhile these disciplines have already adopted AI methods to enhance their\nfindings and analysis, they mainly use tree-based methods due to their\ninterpretability. At the same time, artificial neural networks have been shown\nto offer superior flexibility and depth for rich and complex non-tabular\nproblems, but they are falling behind tree-based models for tabular data in\nterms of performance and interpretability. Although sparsity has been shown to\nimprove the interpretability and performance of ANN models for complex\nnon-tabular datasets, enforcing sparsity structurally and formatively for\ntabular data before training the model, remains an open question. To address\nthis question, we establish a method that infuses sparsity in neural networks\nby utilising attention mechanisms to capture the features' importance in\ntabular datasets. We show that our models, Sparse TABular NET or sTAB-Net with\nattention mechanisms, are more effective than tree-based models, reaching the\nstate-of-the-art on biological datasets. They further permit the extraction of\ninsights from these datasets and achieve better performance than post-hoc\nmethods like SHAP.\n","authors":["Salvatore Raieli","Abdulrahman Altahhan","Nathalie Jeanray","Stéphane Gerart","Sebastien Vachenc"],"pdf_url":"https://arxiv.org/pdf/2410.17758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.05549v3","updated":"2024-10-23T10:31:03Z","published":"2023-01-12T18:46:28Z","title":"On the explainability of quantum neural networks based on variational\n quantum circuits","summary":" Ridge functions are used to describe and study the lower bound of the\napproximation done by the neural networks which can be written as a linear\ncombination of activation functions. If the activation functions are also ridge\nfunctions, these networks are called explainable neural networks.\n In this brief paper, we first show that quantum neural networks which are\nbased on variational quantum circuits can be written as a linear combination of\nridge functions by following matrix notations. Consequently, we show that the\ninterpretability and explainability of such quantum neural networks can be\ndirectly considered and studied as an approximation with the linear combination\nof ridge functions.\n","authors":["Ammar Daskin"],"pdf_url":"https://arxiv.org/pdf/2301.05549v3.pdf","comment":"a brief paper,a few missing references have been added"},{"id":"http://arxiv.org/abs/2410.17751v1","updated":"2024-10-23T10:28:17Z","published":"2024-10-23T10:28:17Z","title":"VISAGE: Video Synthesis using Action Graphs for Surgery","summary":" Surgical data science (SDS) is a field that analyzes patient data before,\nduring, and after surgery to improve surgical outcomes and skills. However,\nsurgical data is scarce, heterogeneous, and complex, which limits the\napplicability of existing machine learning methods. In this work, we introduce\nthe novel task of future video generation in laparoscopic surgery. 
This task\ncan augment and enrich the existing surgical data and enable various\napplications, such as simulation, analysis, and robot-aided surgery.\nUltimately, it involves not only understanding the current state of the\noperation but also accurately predicting the dynamic and often unpredictable\nnature of surgical procedures. Our proposed method, VISAGE (VIdeo Synthesis\nusing Action Graphs for Surgery), leverages the power of action scene graphs to\ncapture the sequential nature of laparoscopic procedures and utilizes diffusion\nmodels to synthesize temporally coherent video sequences. VISAGE predicts the\nfuture frames given only a single initial frame, and the action graph triplets.\nBy incorporating domain-specific knowledge through the action graph, VISAGE\nensures the generated videos adhere to the expected visual and motion patterns\nobserved in real laparoscopic procedures. The results of our experiments\ndemonstrate high-fidelity video generation for laparoscopy procedures, which\nenables various applications in SDS.\n","authors":["Yousef Yeganeh","Rachmadio Lazuardi","Amir Shamseddin","Emine Dari","Yash Thirani","Nassir Navab Azade Farshad"],"pdf_url":"https://arxiv.org/pdf/2410.17751v1.pdf","comment":"Accepted at MICCAI 2024 Embodied AI and Robotics for HealTHcare\n (EARTH) Workshop"},{"id":"http://arxiv.org/abs/2410.17748v1","updated":"2024-10-23T10:23:53Z","published":"2024-10-23T10:23:53Z","title":"Can Uncertainty Quantification Enable Better Learning-based Index\n Tuning?","summary":" Index tuning is crucial for optimizing database performance by selecting\noptimal indexes based on workload. The key to this process lies in an accurate\nand efficient benefit estimator. Traditional methods relying on what-if tools\noften suffer from inefficiency and inaccuracy. In contrast, learning-based\nmodels provide a promising alternative but face challenges such as instability,\nlack of interpretability, and complex management. To overcome these\nlimitations, we adopt a novel approach: quantifying the uncertainty in\nlearning-based models' results, thereby combining the strengths of both\ntraditional and learning-based methods for reliable index tuning. We propose\nBeauty, the first uncertainty-aware framework that enhances learning-based\nmodels with uncertainty quantification and uses what-if tools as a\ncomplementary mechanism to improve reliability and reduce management\ncomplexity. Specifically, we introduce a novel method that combines AutoEncoder\nand Monte Carlo Dropout to jointly quantify uncertainty, tailored to the\ncharacteristics of benefit estimation tasks. In experiments involving sixteen\nmodels, our approach outperformed existing uncertainty quantification methods\nin the majority of cases. 
We also conducted index tuning tests on six datasets.\nBy applying the Beauty framework, we eliminated worst-case scenarios and more\nthan tripled the occurrence of best-case scenarios.\n","authors":["Tao Yu","Zhaonian Zou","Hao Xiong"],"pdf_url":"https://arxiv.org/pdf/2410.17748v1.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2410.17744v1","updated":"2024-10-23T10:17:13Z","published":"2024-10-23T10:17:13Z","title":"Learning Versatile Skills with Curriculum Masking","summary":" Masked prediction has emerged as a promising pretraining paradigm in offline\nreinforcement learning (RL) due to its versatile masking schemes, enabling\nflexible inference across various downstream tasks with a unified model.\nDespite the versatility of masked prediction, it remains unclear how to balance\nthe learning of skills at different levels of complexity. To address this, we\npropose CurrMask, a curriculum masking pretraining paradigm for sequential\ndecision making. Motivated by how humans learn by organizing knowledge in a\ncurriculum, CurrMask adjusts its masking scheme during pretraining for learning\nversatile skills. Through extensive experiments, we show that CurrMask exhibits\nsuperior zero-shot performance on skill prompting tasks, goal-conditioned\nplanning tasks, and competitive finetuning performance on offline RL tasks.\nAdditionally, our analysis of training dynamics reveals that CurrMask gradually\nacquires skills of varying complexity by dynamically adjusting its masking\nscheme.\n","authors":["Yao Tang","Zhihui Xie","Zichuan Lin","Deheng Ye","Shuai Li"],"pdf_url":"https://arxiv.org/pdf/2410.17744v1.pdf","comment":"NeurIPS 2024 poster, 21 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.03094v2","updated":"2024-10-23T10:09:10Z","published":"2024-07-03T13:34:33Z","title":"Conformal Prediction for Causal Effects of Continuous Treatments","summary":" Uncertainty quantification of causal effects is crucial for safety-critical\napplications such as personalized medicine. A powerful approach for this is\nconformal prediction, which has several practical benefits due to\nmodel-agnostic finite-sample guarantees. Yet, existing methods for conformal\nprediction of causal effects are limited to binary/discrete treatments and make\nhighly restrictive assumptions such as known propensity scores. In this work,\nwe provide a novel conformal prediction method for potential outcomes of\ncontinuous treatments. We account for the additional uncertainty introduced\nthrough propensity estimation so that our conformal prediction intervals are\nvalid even if the propensity score is unknown. Our contributions are\nthree-fold: (1) We derive finite-sample prediction intervals for potential\noutcomes of continuous treatments. (2) We provide an algorithm for calculating\nthe derived intervals. (3) We demonstrate the effectiveness of the conformal\nprediction intervals in experiments on synthetic and real-world datasets. 
To\nthe best of our knowledge, we are the first to propose conformal prediction for\ncontinuous treatments when the propensity score is unknown and must be\nestimated from data.\n","authors":["Maresa Schröder","Dennis Frauen","Jonas Schweisthal","Konstantin Heß","Valentyn Melnychuk","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2407.03094v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05354v3","updated":"2024-10-23T09:51:11Z","published":"2024-10-07T13:44:49Z","title":"Over-the-Air Federated Learning in Cell-Free MIMO with Long-term Power\n Constraint","summary":" Wireless networks supporting artificial intelligence have gained significant\nattention, with Over-the-Air Federated Learning emerging as a key application\ndue to its unique transmission and distributed computing characteristics. This\npaper derives error bounds for Over-the-Air Federated Learning in a Cell-free\nMIMO system and formulates an optimization problem to minimize optimality gap\nvia joint optimization of power control and beamforming. We introduce the\nMOP-LOFPC algorithm, which employs Lyapunov optimization to decouple long-term\nconstraints across rounds while requiring only causal channel state\ninformation. Experimental results demonstrate that MOP-LOFPC achieves a better\nand more flexible trade-off between the model's training loss and adherence to\nlong-term power constraints compared to existing baselines.\n","authors":["Yifan Wang","Cheng Zhang","Yuanndon Zhuang","Mingzeng Dai","Haiming Wang","Yongming Huang"],"pdf_url":"https://arxiv.org/pdf/2410.05354v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11642v2","updated":"2024-10-23T09:43:03Z","published":"2024-10-15T14:31:54Z","title":"Improve Value Estimation of Q Function and Reshape Reward with Monte\n Carlo Tree Search","summary":" Reinforcement learning has achieved remarkable success in perfect information\ngames such as Go and Atari, enabling agents to compete at the highest levels\nagainst human players. However, research in reinforcement learning for\nimperfect information games has been relatively limited due to the more complex\ngame structures and randomness. Traditional methods face challenges in training\nand improving performance in imperfect information games due to issues like\ninaccurate Q value estimation and reward sparsity. In this paper, we focus on\nUno, an imperfect information game, and aim to address these problems by\nreducing Q value overestimation and reshaping reward function. We propose a\nnovel algorithm that utilizes Monte Carlo Tree Search to average the value\nestimations in Q function. Even though we choose Double Deep Q Learning as the\nfoundational framework in this paper, our method can be generalized and used in\nany algorithm which needs Q value estimation, such as the Actor-Critic.\nAdditionally, we employ Monte Carlo Tree Search to reshape the reward structure\nin the game environment. 
We compare our algorithm with several traditional\nmethods applied to games such as Double Deep Q Learning, Deep Monte Carlo and\nNeural Fictitious Self Play, and the experiments demonstrate that our algorithm\nconsistently outperforms these approaches, especially as the number of players\nin Uno increases, indicating a higher level of difficulty.\n","authors":["Jiamian Li"],"pdf_url":"https://arxiv.org/pdf/2410.11642v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17715v1","updated":"2024-10-23T09:42:17Z","published":"2024-10-23T09:42:17Z","title":"Continual Learning on a Data Diet","summary":" Continual Learning (CL) methods usually learn from all available data.\nHowever, this is not the case in human cognition which efficiently focuses on\nkey experiences while disregarding the redundant information. Similarly, not\nall data points in a dataset have equal potential; some can be more informative\nthan others. This disparity may significantly impact the performance, as both\nthe quality and quantity of samples directly influence the model's\ngeneralizability and efficiency. Drawing inspiration from this, we explore the\npotential of learning from important samples and present an empirical study for\nevaluating coreset selection techniques in the context of CL to stimulate\nresearch in this unexplored area. We train different continual learners on\nincreasing amounts of selected samples and investigate the learning-forgetting\ndynamics by shedding light on the underlying mechanisms driving their improved\nstability-plasticity balance. We present several significant observations:\nlearning from selectively chosen samples (i) enhances incremental accuracy,\n(ii) improves knowledge retention of previous tasks, and (iii) refines learned\nrepresentations. This analysis contributes to a deeper understanding of\nselective learning strategies in CL scenarios.\n","authors":["Elif Ceren Gok Yildirim","Murat Onur Yildirim","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2410.17715v1.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.08745v5","updated":"2024-10-23T09:40:44Z","published":"2023-11-15T07:27:40Z","title":"Using Stochastic Gradient Descent to Smooth Nonconvex Functions:\n Analysis of Implicit Graduated Optimization","summary":" The graduated optimization approach is a heuristic method for finding global\noptimal solutions for nonconvex functions by using a function smoothing\noperation with stochastic noise. We show that stochastic noise in stochastic\ngradient descent (SGD) has the effect of smoothing the objective function, the\ndegree of which is determined by the learning rate, batch size, and variance of\nthe stochastic gradient. Using this finding, we propose and analyze a new\ngraduated optimization algorithm that varies the degree of smoothing by varying\nthe learning rate and batch size, and provide experimental results on image\nclassification tasks with ResNets that support our theoretical findings. We\nfurther show that there is an interesting correlation between the degree of\nsmoothing by SGD's stochastic noise, the well-studied ``sharpness'' indicator,\nand the generalization performance of the model.\n","authors":["Naoki Sato","Hideaki Iiduka"],"pdf_url":"https://arxiv.org/pdf/2311.08745v5.pdf","comment":"The latest version was updated in October 2024. 
Under review"},{"id":"http://arxiv.org/abs/2410.17711v1","updated":"2024-10-23T09:36:21Z","published":"2024-10-23T09:36:21Z","title":"Beware of Calibration Data for Pruning Large Language Models","summary":" As large language models (LLMs) are widely applied across various fields,\nmodel compression has become increasingly crucial for reducing costs and\nimproving inference efficiency. Post-training pruning is a promising method\nthat does not require resource-intensive iterative training and only needs a\nsmall amount of calibration data to assess the importance of parameters.\nPrevious research has primarily focused on designing advanced pruning methods,\nwhile different calibration data's impact on pruning performance still lacks\nsystematical exploration. We fill this blank and surprisingly observe that the\neffects of calibration data even value more than designing advanced pruning\nstrategies, especially for high sparsity. Our preliminary exploration also\ndiscloses that using calibration data similar to the training data can yield\nbetter performance. As pre-training data is usually inaccessible for advanced\nLLMs, we further provide a self-generating calibration data synthesis strategy\nto construct feasible calibration data. We conduct experiments on the recent\nstrong open-source LLMs (e.g., DCLM, and LLaMA-3), and the results show that\nthe proposed method outperforms commonly used calibration data and can\neffectively enhance strong pruning methods (e.g., Wanda, OWL).\n","authors":["Yixin Ji","Yang Xiang","Juntao Li","Qingrong Xia","Ping Li","Xinyu Duan","Zhefeng Wang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.17711v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2402.09891v2","updated":"2024-10-23T09:29:39Z","published":"2024-02-15T11:34:38Z","title":"Do causal predictors generalize better to new domains?","summary":" We study how well machine learning models trained on causal features\ngeneralize across domains. We consider 16 prediction tasks on tabular datasets\ncovering applications in health, employment, education, social benefits, and\npolitics. Each dataset comes with multiple domains, allowing us to test how\nwell a model trained in one domain performs in another. For each prediction\ntask, we select features that have a causal influence on the target of\nprediction. Our goal is to test the hypothesis that models trained on causal\nfeatures generalize better across domains. Without exception, we find that\npredictors using all available features, regardless of causality, have better\nin-domain and out-of-domain accuracy than predictors using causal features.\nMoreover, even the absolute drop in accuracy from one domain to the other is no\nbetter for causal predictors than for models that use all features. In\naddition, we show that recent causal machine learning methods for domain\ngeneralization do not perform better in our evaluation than standard predictors\ntrained on the set of causal features. Likewise, causal discovery algorithms\neither fail to run or select causal variables that perform no better than our\nselection. Extensive robustness checks confirm that our findings are stable\nunder variable misclassification.\n","authors":["Vivian Y. 
Nastl","Moritz Hardt"],"pdf_url":"https://arxiv.org/pdf/2402.09891v2.pdf","comment":"118 pages, 55 figures, accepted at NeurIPS'24"},{"id":"http://arxiv.org/abs/2408.14762v4","updated":"2024-10-23T09:25:58Z","published":"2024-08-27T03:30:01Z","title":"Explainable Hierarchical Urban Representation Learning for Commuting\n Flow Prediction","summary":" Commuting flow prediction is an essential task for municipal operations in\nthe real world. Previous studies have revealed that it is feasible to estimate\nthe commuting origin-destination (OD) demand within a city using multiple\nauxiliary data. However, most existing methods are not suitable to deal with a\nsimilar task at a large scale, namely within a prefecture or the whole nation,\nowing to the increased number of geographical units that need to be maintained.\nIn addition, region representation learning is a universal approach for gaining\nurban knowledge for diverse metropolitan downstream tasks. Although many\nresearchers have developed comprehensive frameworks to describe urban units\nfrom multi-source data, they have not clarified the relationship between the\nselected geographical elements. Furthermore, metropolitan areas naturally\npreserve ranked structures, like cities and their inclusive districts, which\nmakes elucidating relations between cross-level urban units necessary.\nTherefore, we develop a heterogeneous graph-based model to generate meaningful\nregion embeddings at multiple spatial resolutions for predicting different\ntypes of inter-level OD flows. To demonstrate the effectiveness of the proposed\nmethod, extensive experiments were conducted using real-world aggregated mobile\nphone datasets collected from Shizuoka Prefecture, Japan. The results indicate\nthat our proposed model outperforms existing models in terms of a uniform urban\nstructure. We extend the understanding of predicted results using reasonable\nexplanations to enhance the credibility of the model.\n","authors":["Mingfei Cai","Yanbo Pang","Yoshihide Sekimoto"],"pdf_url":"https://arxiv.org/pdf/2408.14762v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17700v1","updated":"2024-10-23T09:22:43Z","published":"2024-10-23T09:22:43Z","title":"Scalable Random Feature Latent Variable Models","summary":" Random feature latent variable models (RFLVMs) represent the state-of-the-art\nin latent variable models, capable of handling non-Gaussian likelihoods and\neffectively uncovering patterns in high-dimensional data. However, their heavy\nreliance on Monte Carlo sampling results in scalability issues which makes it\ndifficult to use these models for datasets with a massive number of\nobservations. To scale up RFLVMs, we turn to the optimization-based variational\nBayesian inference (VBI) algorithm which is known for its scalability compared\nto sampling-based methods. However, implementing VBI for RFLVMs poses\nchallenges, such as the lack of explicit probability distribution functions\n(PDFs) for the Dirichlet process (DP) in the kernel learning component, and the\nincompatibility of existing VBI algorithms with RFLVMs. To address these\nissues, we introduce a stick-breaking construction for DP to obtain an explicit\nPDF and a novel VBI algorithm called ``block coordinate descent variational\ninference\" (BCD-VI). This enables the development of a scalable version of\nRFLVMs, or in short, SRFLVM. 
Our proposed method shows scalability,\ncomputational efficiency, superior performance in generating informative latent\nrepresentations and the ability of imputing missing data across various\nreal-world datasets, outperforming state-of-the-art competitors.\n","authors":["Ying Li","Zhidi Lin","Yuhao Liu","Michael Minyi Zhang","Pablo M. Olmos","Petar M. Djurić"],"pdf_url":"https://arxiv.org/pdf/2410.17700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17696v1","updated":"2024-10-23T09:16:22Z","published":"2024-10-23T09:16:22Z","title":"Optimizing Load Scheduling in Power Grids Using Reinforcement Learning\n and Markov Decision Processes","summary":" Power grid load scheduling is a critical task that ensures the balance\nbetween electricity generation and consumption while minimizing operational\ncosts and maintaining grid stability. Traditional optimization methods often\nstruggle with the dynamic and stochastic nature of power systems, especially\nwhen faced with renewable energy sources and fluctuating demand. This paper\nproposes a reinforcement learning (RL) approach using a Markov Decision Process\n(MDP) framework to address the challenges of dynamic load scheduling. The MDP\nis defined by a state space representing grid conditions, an action space\ncovering control operations like generator adjustments and storage management,\nand a reward function balancing economic efficiency and system reliability. We\ninvestigate the application of various RL algorithms, from basic Q-Learning to\nmore advanced Deep Q-Networks (DQN) and Actor-Critic methods, to determine\noptimal scheduling policies. The proposed approach is evaluated through a\nsimulated power grid environment, demonstrating its potential to improve\nscheduling efficiency and adapt to variable demand patterns. Our results show\nthat the RL-based method provides a robust and scalable solution for real-time\nload scheduling, contributing to the efficient management of modern power\ngrids.\n","authors":["Dongwen Luo"],"pdf_url":"https://arxiv.org/pdf/2410.17696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03648v2","updated":"2024-10-23T09:11:00Z","published":"2023-08-07T14:58:53Z","title":"Generative Forests","summary":" We focus on generative AI for a type of data that still represent one of the\nmost prevalent form of data: tabular data. Our paper introduces two key\ncontributions: a new powerful class of forest-based models fit for such tasks\nand a simple training algorithm with strong convergence guarantees in a\nboosting model that parallels that of the original weak / strong supervised\nlearning setting. This algorithm can be implemented by a few tweaks to the most\npopular induction scheme for decision tree induction (i.e. supervised learning)\nwith two classes. Experiments on the quality of generated data display\nsubstantial improvements compared to the state of the art. The losses our\nalgorithm minimize and the structure of our models make them practical for\nrelated tasks that require fast estimation of a density given a generative\nmodel and an observation (even partially specified): such tasks include missing\ndata imputation and density estimation. 
Additional experiments on these tasks\nreveal that our models can be notably good contenders to diverse state of the\nart methods, relying on models as diverse as (or mixing elements of) trees,\nneural nets, kernels or graphical models.\n","authors":["Richard Nock","Mathieu Guillame-Bert"],"pdf_url":"https://arxiv.org/pdf/2308.03648v2.pdf","comment":"NeurIPS'24"},{"id":"http://arxiv.org/abs/2402.04892v2","updated":"2024-10-23T09:04:57Z","published":"2024-02-07T14:24:04Z","title":"Probabilistic ML Verification via Weighted Model Integration","summary":" In machine learning (ML) verification, the majority of procedures are\nnon-quantitative and therefore cannot be used for verifying probabilistic\nmodels, or be applied in domains where hard guarantees are practically\nunachievable. The probabilistic formal verification (PFV) of ML models is in\nits infancy, with the existing approaches limited to specific ML models,\nproperties, or both. This contrasts with standard formal methods techniques,\nwhose successful adoption in real-world scenarios is also due to their support\nfor a wide range of properties and diverse systems. We propose a unifying\nframework for the PFV of ML systems based on Weighted Model Integration (WMI),\na relatively recent formalism for probabilistic inference with algebraic and\nlogical constraints. Crucially, reducing the PFV of ML models to WMI enables\nthe verification of many properties of interest over a wide range of systems,\naddressing multiple limitations of deterministic verification and ad-hoc\nalgorithms. We substantiate the generality of the approach on prototypical\ntasks involving the verification of group fairness, monotonicity, robustness to\nnoise, probabilistic local robustness and equivalence among predictors. We\ncharacterize the challenges related to the scalability of the approach and,\nthrough our WMI-based perspective, we show how successful scaling techniques in\nthe ML verification literature can be generalized beyond their original scope.\n","authors":["Paolo Morettin","Andrea Passerini","Roberto Sebastiani"],"pdf_url":"https://arxiv.org/pdf/2402.04892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10192v3","updated":"2024-10-23T08:39:00Z","published":"2024-02-15T18:48:32Z","title":"Multi-Excitation Projective Simulation with a Many-Body Physics Inspired\n Inductive Bias","summary":" With the impressive progress of deep learning, applications relying on\nmachine learning are increasingly being integrated into daily life. However,\nmost deep learning models have an opaque, oracle-like nature making it\ndifficult to interpret and understand their decisions. This problem led to the\ndevelopment of the field known as eXplainable Artificial Intelligence (XAI).\nOne method in this field known as Projective Simulation (PS) models a\nchain-of-thought as a random walk of a particle on a graph with vertices that\nhave concepts attached to them. While this description has various benefits,\nincluding the possibility of quantization, it cannot be naturally used to model\nthoughts that combine several concepts simultaneously. To overcome this\nlimitation, we introduce Multi-Excitation Projective Simulation (mePS), a\ngeneralization that considers a chain-of-thought to be a random walk of several\nparticles on a hypergraph. A definition for a dynamic hypergraph is put forward\nto describe the agent's training history along with applications to AI and\nhypergraph visualization. 
An inductive bias inspired by the remarkably\nsuccessful few-body interaction models used in quantum many-body physics is\nformalized for our classical mePS framework and employed to tackle the\nexponential complexity associated with naive implementations of hypergraphs. We\nprove that our inductive bias reduces the complexity from exponential to\npolynomial, with the exponent representing the cutoff on how many particles can\ninteract. We numerically apply our method to two toy environments and a more\ncomplex scenario modelling the diagnosis of a broken computer. These\nenvironments demonstrate the resource savings provided by an appropriate choice\nof inductive bias, as well as showcasing aspects of interpretability. A quantum\nmodel for mePS is also briefly outlined and some future directions for it are\ndiscussed.\n","authors":["Philip A. LeMaitre","Marius Krumm","Hans J. Briegel"],"pdf_url":"https://arxiv.org/pdf/2402.10192v3.pdf","comment":"26 pages, 8 figures; Code repository at\n https://github.com/MariusKrumm/ManyBodyMEPS. Reorganized main text for better\n readability"},{"id":"http://arxiv.org/abs/2404.15993v4","updated":"2024-10-23T08:33:54Z","published":"2024-04-24T17:10:35Z","title":"Uncertainty Estimation and Quantification for LLMs: A Simple Supervised\n Approach","summary":" In this paper, we study the problem of uncertainty estimation and calibration\nfor LLMs. We begin by formulating the uncertainty estimation problem, a\nrelevant yet underexplored area in existing literature. We then propose a\nsupervised approach that leverages labeled datasets to estimate the uncertainty\nin LLMs' responses. Based on the formulation, we illustrate the difference\nbetween the uncertainty estimation for LLMs and that for standard ML models and\nexplain why the hidden neurons of the LLMs may contain uncertainty information.\nOur designed approach demonstrates the benefits of utilizing hidden activations\nto enhance uncertainty estimation across various tasks and shows robust\ntransferability in out-of-distribution settings. We distinguish the uncertainty\nestimation task from the uncertainty calibration task and show that better\nuncertainty estimation leads to better calibration performance. Furthermore,\nour method is easy to implement and adaptable to different levels of model\naccessibility including black box, grey box, and white box.\n","authors":["Linyu Liu","Yu Pan","Xiaocheng Li","Guanting Chen"],"pdf_url":"https://arxiv.org/pdf/2404.15993v4.pdf","comment":"29 pages, 14 figures"},{"id":"http://arxiv.org/abs/2405.11752v2","updated":"2024-10-23T08:29:20Z","published":"2024-05-20T03:26:58Z","title":"Towards Foundation Model for Chemical Reactor Modeling: Meta-Learning\n with Physics-Informed Adaptation","summary":" In this work, we present a novel application of foundation models for\nchemical reactor modeling. Accurate modeling of real-world chemical reactors\nthrough first-principles is often challenging, and the process of rebuilding\nand retraining models for each new chemical process is inefficient. This raises\na critical question: can we develop a single, universal neural network (i.e., a\nfoundation model) that can rapidly adapt to any new chemical process in a\nreactor? To address this, we propose a foundation model for chemical reactor\nmodeling that employs a meta-learning approach, followed by physics-informed\nfine-tuning on new tasks with only a few data samples. 
Our model is designed to\ngeneralize across three classic reactor types: continuous stirred tank\nreactors, batch reactors, and plug flow reactors. Compared to conventional\nmethods such as data-driven learning, physics-informed learning, transfer\nlearning, and meta-learning, our approach demonstrates superior performance in\nfew-shot scenarios. Specifically, it shows rapid adaptation to unseen reactions\nwith varying integer orders across different reactor set-ups, requiring minimal\ndata for fine-tuning. Source code is available at\nhttps://github.com/killingbear999/chemical-reactor-foundation-model.\n","authors":["Zihao Wang","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2405.11752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17661v1","updated":"2024-10-23T08:24:47Z","published":"2024-10-23T08:24:47Z","title":"PETAH: Parameter Efficient Task Adaptation for Hybrid Transformers in a\n resource-limited Context","summary":" Following their success in natural language processing (NLP), there has been\na shift towards transformer models in computer vision. While transformers\nperform well and offer promising multi-tasking performance, due to their high\ncompute requirements, many resource-constrained applications still rely on\nconvolutional or hybrid models that combine the benefits of convolution and\nattention layers and achieve the best results in the sub 100M parameter range.\nSimultaneously, task adaptation techniques that allow for the use of one shared\ntransformer backbone for multiple downstream tasks, resulting in great storage\nsavings at negligible cost in performance, have not yet been adopted for hybrid\ntransformers. In this work, we investigate how to achieve the best\ntask-adaptation performance and introduce PETAH: Parameter Efficient Task\nAdaptation for Hybrid Transformers. We further combine PETAH adaptation with\npruning to achieve highly performant and storage friendly models for\nmulti-tasking. In our extensive evaluation on classification and other vision\ntasks, we demonstrate that our PETAH-adapted hybrid models outperform\nestablished task-adaptation techniques for ViTs while requiring fewer\nparameters and being more efficient on mobile hardware.\n","authors":["Maximilian Augustin","Syed Shakib Sarwar","Mostafa Elhoushi","Sai Qian Zhang","Yuecheng Li","Barbara De Salvo"],"pdf_url":"https://arxiv.org/pdf/2410.17661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17655v1","updated":"2024-10-23T08:18:26Z","published":"2024-10-23T08:18:26Z","title":"Mapping the Media Landscape: Predicting Factual Reporting and Political\n Bias Through Web Interactions","summary":" Bias assessment of news sources is paramount for professionals,\norganizations, and researchers who rely on truthful evidence for information\ngathering and reporting. While certain bias indicators are discernible from\ncontent analysis, descriptors like political bias and fake news pose greater\nchallenges. In this paper, we propose an extension to a recently presented news\nmedia reliability estimation method that focuses on modeling outlets and their\nlongitudinal web interactions. Concretely, we assess the classification\nperformance of four reinforcement learning strategies on a large news media\nhyperlink graph. Our experiments, targeting two challenging bias descriptors,\nfactual reporting and political bias, showed a significant performance\nimprovement at the source media level. Additionally, we validate our methods on\nthe CLEF 2023 CheckThat! 
Lab challenge, outperforming the reported results in\nboth, F1-score and the official MAE metric. Furthermore, we contribute by\nreleasing the largest annotated dataset of news source media, categorized with\nfactual reporting and political bias labels. Our findings suggest that\nprofiling news media sources based on their hyperlink interactions over time is\nfeasible, offering a bird's-eye view of evolving media landscapes.\n","authors":["Dairazalia Sánchez-Cortés","Sergio Burdisso","Esaú Villatoro-Tello","Petr Motlicek"],"pdf_url":"https://arxiv.org/pdf/2410.17655v1.pdf","comment":"Accepted to CLEF 2024"},{"id":"http://arxiv.org/abs/2410.16928v2","updated":"2024-10-23T08:13:11Z","published":"2024-10-22T11:59:36Z","title":"xLSTM-Mixer: Multivariate Time Series Forecasting by Mixing via Scalar\n Memories","summary":" Time series data is prevalent across numerous fields, necessitating the\ndevelopment of robust and accurate forecasting models. Capturing patterns both\nwithin and between temporal and multivariate components is crucial for reliable\npredictions. We introduce xLSTM-Mixer, a model designed to effectively\nintegrate temporal sequences, joint time-variate information, and multiple\nperspectives for robust forecasting. Our approach begins with a linear forecast\nshared across variates, which is then refined by xLSTM blocks. These blocks\nserve as key elements for modeling the complex dynamics of challenging time\nseries data. xLSTM-Mixer ultimately reconciles two distinct views to produce\nthe final forecast. Our extensive evaluations demonstrate xLSTM-Mixer's\nsuperior long-term forecasting performance compared to recent state-of-the-art\nmethods. A thorough model analysis provides further insights into its key\ncomponents and confirms its robustness and effectiveness. This work contributes\nto the resurgence of recurrent models in time series forecasting.\n","authors":["Maurice Kraus","Felix Divo","Devendra Singh Dhami","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2410.16928v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13432v3","updated":"2024-10-23T08:07:05Z","published":"2024-07-18T12:01:09Z","title":"The Art of Imitation: Learning Long-Horizon Manipulation Tasks from Few\n Demonstrations","summary":" Task Parametrized Gaussian Mixture Models (TP-GMM) are a sample-efficient\nmethod for learning object-centric robot manipulation tasks. However, there are\nseveral open challenges to applying TP-GMMs in the wild. In this work, we\ntackle three crucial challenges synergistically. First, end-effector velocities\nare non-Euclidean and thus hard to model using standard GMMs. We thus propose\nto factorize the robot's end-effector velocity into its direction and\nmagnitude, and model them using Riemannian GMMs. Second, we leverage the\nfactorized velocities to segment and sequence skills from complex demonstration\ntrajectories. Through the segmentation, we further align skill trajectories and\nhence leverage time as a powerful inductive bias. Third, we present a method to\nautomatically detect relevant task parameters per skill from visual\nobservations. Our approach enables learning complex manipulation tasks from\njust five demonstrations while using only RGB-D observations. Extensive\nexperimental evaluations on RLBench demonstrate that our approach achieves\nstate-of-the-art performance with 20-fold improved sample efficiency. 
Our\npolicies generalize across different environments, object instances, and object\npositions, while the learned skills are reusable.\n","authors":["Jan Ole von Hartz","Tim Welschehold","Abhinav Valada","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2407.13432v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17648v1","updated":"2024-10-23T08:07:00Z","published":"2024-10-23T08:07:00Z","title":"Towards Active Participant-Centric Vertical Federated Learning: Some\n Representations May Be All You Need","summary":" Vertical Federated Learning (VFL) enables collaborative model training across\ndifferent participants with distinct features and common samples, while\npreserving data privacy. Existing VFL methodologies often struggle with\nrealistic data partitions, typically incurring high communication costs and\nsignificant operational complexity. In this work, we introduce a novel\nsimplified approach to VFL, Active Participant-Centric VFL (APC-VFL), that, to\nthe best of our knowledge, is the first to require only a single communication\nround between participants, and allows the active participant to do inference\nin a non collaborative fashion. This method integrates unsupervised\nrepresentation learning with knowledge distillation to achieve comparable\naccuracy to traditional VFL methods based on vertical split learning in\nclassical settings, reducing required communication rounds by up to\n$4200\\times$, while being more flexible. Our approach also shows improvements\ncompared to non-federated local models, as well as a comparable VFL proposal,\nVFedTrans, offering an efficient and flexible solution for collaborative\nlearning.\n","authors":["Jon Irureta","Jon Imaz","Aizea Lojo","Marco González","Iñigo Perona"],"pdf_url":"https://arxiv.org/pdf/2410.17648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15294v3","updated":"2024-10-23T08:06:25Z","published":"2024-01-27T04:42:50Z","title":"Integral Operator Approaches for Scattered Data Fitting on Spheres","summary":" This paper focuses on scattered data fitting problems on spheres. We study\nthe approximation performance of a class of weighted spectral filter\nalgorithms, including Tikhonov regularization, Landaweber iteration, spectral\ncut-off, and iterated Tikhonov, in fitting noisy data with possibly unbounded\nrandom noise. For the analysis, we develop an integral operator approach that\ncan be regarded as an extension of the widely used sampling inequality approach\nand norming set method in the community of scattered data fitting. After\nproviding an equivalence between the operator differences and quadrature rules,\nwe succeed in deriving optimal Sobolev-type error estimates of weighted\nspectral filter algorithms. Our derived error estimates do not suffer from the\nsaturation phenomenon for Tikhonov regularization in the literature,\nnative-space-barrier for existing error analysis and adapts to different\nembedding spaces. 
We also propose a divide-and-conquer scheme to equip weighted\nspectral filter algorithms to reduce their computational burden and present the\noptimal approximation error bounds.\n","authors":["Shao-Bo Lin"],"pdf_url":"https://arxiv.org/pdf/2401.15294v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09079v2","updated":"2024-10-23T08:05:57Z","published":"2024-06-13T13:03:37Z","title":"Hadamard Representations: Augmenting Hyperbolic Tangents in RL","summary":" Activation functions are one of the key components of a deep neural network.\nThe most commonly used activation functions can be classed into the category of\ncontinuously differentiable (e.g. tanh) and linear-unit functions (e.g. ReLU),\nboth having their own strengths and drawbacks with respect to downstream\nperformance and representation capacity through learning (e.g. measured by the\nnumber of dead neurons and the effective rank). In reinforcement learning, the\nperformance of continuously differentiable activations often falls short as\ncompared to linear-unit functions. We provide insights into the vanishing\ngradients associated with the former, and show that the dying neuron problem is\nnot exclusive to ReLU's. To alleviate vanishing gradients and the resulting\ndying neuron problem occurring with continuously differentiable activations, we\npropose a Hadamard representation. Using deep Q-networks and proximal policy\noptimization in the Atari domain, we show faster learning, a reduction in dead\nneurons and increased effective rank.\n","authors":["Jacob E. Kooi","Mark Hoogendoorn","Vincent François-Lavet"],"pdf_url":"https://arxiv.org/pdf/2406.09079v2.pdf","comment":"24 pages, 19 figures, 3 tables"},{"id":"http://arxiv.org/abs/2410.17647v1","updated":"2024-10-23T08:04:12Z","published":"2024-10-23T08:04:12Z","title":"Entity-based Reinforcement Learning for Autonomous Cyber Defence","summary":" A significant challenge for autonomous cyber defence is ensuring a defensive\nagent's ability to generalise across diverse network topologies and\nconfigurations.\n This capability is necessary for agents to remain effective when deployed in\ndynamically changing environments, such as an enterprise network where devices\nmay frequently join and leave.\n Standard approaches to deep reinforcement learning, where policies are\nparameterised using a fixed-input multi-layer perceptron (MLP) expect\nfixed-size observation and action spaces. In autonomous cyber defence, this\nmakes it hard to develop agents that generalise to environments with network\ntopologies different from those trained on, as the number of nodes affects the\nnatural size of the observation and action spaces. To overcome this limitation,\nwe reframe the problem of autonomous network defence using entity-based\nreinforcement learning, where the observation and action space of an agent are\ndecomposed into a collection of discrete entities. This framework enables the\nuse of policy parameterisations specialised in compositional generalisation.\nNamely, we train a Transformer-based policy on the Yawning Titan cyber-security\nsimulation environment and test its generalisation capabilities across various\nnetwork topologies. 
We demonstrate that this approach significantly outperforms\nan MLP-based policy on fixed networks, and has the ability for zero-shot\ngeneralisation to networks of a different size to those seen in training.\n These findings highlight the potential for entity-based reinforcement\nlearning to advance the field of autonomous cyber defence by providing more\ngeneralisable policies capable of handling variations in real-world network\nenvironments.\n","authors":["Isaac Symes Thompson","Alberto Caron","Chris Hicks","Vasilios Mavroudis"],"pdf_url":"https://arxiv.org/pdf/2410.17647v1.pdf","comment":"Material to appear in the proceedings of the 1st International\n Workshop on Autonomous Cybersecurity at ACM CCS 2024"},{"id":"http://arxiv.org/abs/2405.04098v2","updated":"2024-10-23T07:57:41Z","published":"2024-05-07T08:05:20Z","title":"Binarized Simplicial Convolutional Neural Networks","summary":" Graph Neural Networks have a limitation of solely processing features on\ngraph nodes, neglecting data on high-dimensional structures such as edges and\ntriangles. Simplicial Convolutional Neural Networks (SCNN) represent\nhigher-order structures using simplicial complexes to break this limitation\nalbeit still lacking time efficiency. In this paper, we propose a novel neural\nnetwork architecture on simplicial complexes named Binarized Simplicial\nConvolutional Neural Networks (Bi-SCNN) based on the combination of simplicial\nconvolution with a binary-sign forward propagation strategy. The usage of the\nHodge Laplacian on a binary-sign forward propagation enables Bi-SCNN to\nefficiently and effectively represent simplicial features that have\nhigher-order structures than traditional graph node representations. Compared\nto the previous Simplicial Convolutional Neural Networks, the reduced model\ncomplexity of Bi-SCNN shortens the execution time without sacrificing the\nprediction performance and is less prone to the over-smoothing effect.\nExperimenting with real-world citation and ocean-drifter data confirmed that\nour proposed Bi-SCNN is efficient and accurate.\n","authors":["Yi Yan","Ercan E. Kuruoglu"],"pdf_url":"https://arxiv.org/pdf/2405.04098v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17631v1","updated":"2024-10-23T07:48:35Z","published":"2024-10-23T07:48:35Z","title":"Exploring structure diversity in atomic resolution microscopy with graph\n neural networks","summary":" The emergence of deep learning (DL) has provided great opportunities for the\nhigh-throughput analysis of atomic-resolution micrographs. However, the DL\nmodels trained by image patches in fixed size generally lack efficiency and\nflexibility when processing micrographs containing diversified atomic\nconfigurations. Herein, inspired by the similarity between the atomic\nstructures and graphs, we describe a few-shot learning framework based on an\nequivariant graph neural network (EGNN) to analyze a library of atomic\nstructures (e.g., vacancies, phases, grain boundaries, doping, etc.), showing\nsignificantly promoted robustness and three orders of magnitude reduced\ncomputing parameters compared to the image-driven DL models, which is\nespecially evident for those aggregated vacancy lines with flexible lattice\ndistortion. Besides, the intuitiveness of graphs enables quantitative and\nstraightforward extraction of the atomic-scale structural features in batches,\nthus statistically unveiling the self-assembly dynamics of vacancy lines under\nelectron beam irradiation. 
A versatile model toolkit is established by\nintegrating EGNN sub-models for single structure recognition to process images\ninvolving varied configurations in the form of a task chain, leading to the\ndiscovery of novel doping configurations with superior electrocatalytic\nproperties for hydrogen evolution reactions. This work provides a powerful tool\nto explore structure diversity in a fast, accurate, and intelligent manner.\n","authors":["Zheng Luo","Ming Feng","Zijian Gao","Jinyang Yu","Liang Hu","Tao Wang","Shenao Xue","Shen Zhou","Fangping Ouyang","Dawei Feng","Kele Xu","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.17631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17628v1","updated":"2024-10-23T07:44:14Z","published":"2024-10-23T07:44:14Z","title":"Feature Learning in Attention Mechanisms Is More Compact and Stable Than\n in Convolution","summary":" Attention and convolution are fundamental techniques in machine learning.\nWhile they use different approaches to learn features - attention mechanisms\ncapture both global and local data relationships, while convolutional layers\nfocus on local patterns - both methods are effective for various tasks.\nAlthough the feature learning of both models is well-studied individually,\nthere has not been a direct comparison of their feature learning dynamics. In\nthis paper, we compare their Lipschitz continuity with respect to the\nWasserstein distance and covering numbers under similar settings. We\ndemonstrate that attention processes data in a more compact and stable manner.\nCompactness refers to the lower variance and intrinsic dimensionality of the\nactivation outputs, while stability refers to the changes between inputs and\noutputs. We validate our findings through experiments using topological data\nanalysis, measuring the 1-, 2-, and infinity-Wasserstein distances between the\noutputs of each layer from both models. Furthermore, we extend our comparison\nto Vision Transformers (ViTs) and ResNets, showing that while ViTs have higher\noutput variance, their feature learning is more stable than that of ResNets.\n","authors":["Baiyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2410.17628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17624v1","updated":"2024-10-23T07:29:30Z","published":"2024-10-23T07:29:30Z","title":"Incremental Learning of Affordances using Markov Logic Networks","summary":" Affordances enable robots to have a semantic understanding of their\nsurroundings. This allows them to have more acting flexibility when completing\na given task. Capturing object affordances in a machine learning model is a\ndifficult task, because of their dependence on contextual information. Markov\nLogic Networks (MLN) combine probabilistic reasoning with logic that is able to\ncapture such context. Mobile robots operate in partially known environments\nwherein unseen object affordances can be observed. This new information must be\nincorporated into the existing knowledge, without having to retrain the MLN\nfrom scratch. We introduce the MLN Cumulative Learning Algorithm (MLN-CLA).\nMLN-CLA learns new relations in various knowledge domains by retaining\nknowledge and only updating the changed knowledge, for which the MLN is\nretrained. 
We show that MLN-CLA is effective for accumulative learning and\nzero-shot affordance inference, outperforming strong baselines.\n","authors":["George Potter","Gertjan Burghouts","Joris Sijs"],"pdf_url":"https://arxiv.org/pdf/2410.17624v1.pdf","comment":"accepted at IEEE IRC 2024"},{"id":"http://arxiv.org/abs/2410.05623v2","updated":"2024-10-23T07:28:19Z","published":"2024-10-08T02:11:35Z","title":"Understanding Gradient Boosting Classifier: Training, Prediction, and\n the Role of $γ_j$","summary":" The Gradient Boosting Classifier (GBC) is a widely used machine learning\nalgorithm for binary classification, which builds decision trees iteratively to\nminimize prediction errors. This document explains the GBC's training and\nprediction processes, focusing on the computation of terminal node values\n$\\gamma_j$, which are crucial to optimizing the logistic loss function. We\nderive $\\gamma_j$ through a Taylor series approximation and provide a\nstep-by-step pseudocode for the algorithm's implementation. The guide explains\nthe theory of GBC and its practical application, demonstrating its\neffectiveness in binary classification tasks. We provide a step-by-step example\nin the appendix to help readers understand.\n","authors":["Hung-Hsuan Chen"],"pdf_url":"https://arxiv.org/pdf/2410.05623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05811v3","updated":"2024-10-23T07:26:07Z","published":"2024-03-09T06:19:53Z","title":"Statistical Efficiency of Distributional Temporal Difference Learning","summary":" Distributional reinforcement learning (DRL) has achieved empirical success in\nvarious domains. One core task in the field of DRL is distributional policy\nevaluation, which involves estimating the return distribution $\\eta^\\pi$ for a\ngiven policy $\\pi$. The distributional temporal difference learning has been\naccordingly proposed, which is an extension of the temporal difference learning\n(TD) in the classic RL area. In the tabular case, \\citet{rowland2018analysis}\nand \\citet{rowland2023analysis} proved the asymptotic convergence of two\ninstances of distributional TD, namely categorical temporal difference learning\n(CTD) and quantile temporal difference learning (QTD), respectively. In this\npaper, we go a step further and analyze the finite-sample performance of\ndistributional TD. To facilitate theoretical analysis, we propose\nnon-parametric distributional TD learning (NTD). For a $\\gamma$-discounted\ninfinite-horizon tabular Markov decision process, we show that for NTD we need\n$\\tilde{O}\\left(\\frac{1}{\\varepsilon^{2p}(1-\\gamma)^{2p+1}}\\right)$ iterations\nto achieve an $\\varepsilon$-optimal estimator with high probability, when the\nestimation error is measured by the $p$-Wasserstein distance. This sample\ncomplexity bound is minimax optimal up to logarithmic factors in the case of\nthe $1$-Wasserstein distance. To achieve this, we establish a novel Freedman's\ninequality in Hilbert spaces, which would be of independent interest. 
In\naddition, we revisit CTD, showing that the same non-asymptotic convergence\nbounds hold for CTD in the case of the $p$-Wasserstein distance for $p\\geq 1$.\n","authors":["Yang Peng","Liangyu Zhang","Zhihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.05811v3.pdf","comment":"NeurIPS 2024 (oral)"},{"id":"http://arxiv.org/abs/2410.17617v1","updated":"2024-10-23T07:14:37Z","published":"2024-10-23T07:14:37Z","title":"Self-Supervised Graph Neural Networks for Enhanced Feature Extraction in\n Heterogeneous Information Networks","summary":" This paper explores the applications and challenges of graph neural networks\n(GNNs) in processing complex graph data brought about by the rapid development\nof the Internet. Given the heterogeneity and redundancy problems that graph\ndata often have, traditional GNN methods may be overly dependent on the initial\nstructure and attribute information of the graph, which limits their ability to\naccurately simulate more complex relationships and patterns in the graph.\nTherefore, this study proposes a graph neural network model under a\nself-supervised learning framework, which can flexibly combine different types\nof additional information of the attribute graph and its nodes, so as to better\nmine the deep features in the graph data. By introducing a self-supervisory\nmechanism, it is expected to improve the adaptability of existing models to the\ndiversity and complexity of graph data and improve the overall performance of\nthe model.\n","authors":["Jianjun Wei","Yue Liu","Xin Huang","Xin Zhang","Wenyi Liu","Xu Yan"],"pdf_url":"https://arxiv.org/pdf/2410.17617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15899v2","updated":"2024-10-23T07:05:26Z","published":"2024-10-21T11:23:23Z","title":"On the Design and Performance of Machine Learning Based Error Correcting\n Decoders","summary":" This paper analyzes the design and competitiveness of four neural network\n(NN) architectures recently proposed as decoders for forward error correction\n(FEC) codes. We first consider the so-called single-label neural network (SLNN)\nand the multi-label neural network (MLNN) decoders which have been reported to\nachieve near maximum likelihood (ML) performance. Here, we show analytically\nthat SLNN and MLNN decoders can always achieve ML performance, regardless of\nthe code dimensions -- although at the cost of computational complexity -- and\nno training is in fact required. We then turn our attention to two\ntransformer-based decoders: the error correction code transformer (ECCT) and\nthe cross-attention message passing transformer (CrossMPT). We compare their\nperformance against traditional decoders, and show that ordered statistics\ndecoding outperforms these transformer-based decoders. 
The results in this\npaper cast serious doubts on the application of NN-based FEC decoders in the\nshort and medium block length regime.\n","authors":["Yuncheng Yuan","Péter Scheepers","Lydia Tasiou","Yunus Can Gültekin","Federico Corradi","Alex Alvarado"],"pdf_url":"https://arxiv.org/pdf/2410.15899v2.pdf","comment":"6 pages, 4 figures, submitted for possible presentation in a\n conference (v2: Pre-FEC BER curves are corrected)"},{"id":"http://arxiv.org/abs/2311.00656v3","updated":"2024-10-23T06:53:57Z","published":"2023-11-01T17:02:41Z","title":"Adaptive Spatio-temporal Estimation on the Graph Edges via Line Graph\n Transformation","summary":" Spatio-temporal estimation of signals on graph edges is challenging because\nmost conventional Graph Signal Processing techniques are defined on the graph\nnodes. Leveraging the Line Graph transform, the Line Graph Least Mean Square\n(LGLMS) algorithm is proposed to conduct adaptive estimation of time-varying\nedge signals by projecting the edge signals from edge space to node space.\nLGLMS is an adaptive algorithm analogous to the classical LMS algorithm but\napplied to graph edges. Unlike edge-specific methods, LGLMS retains all GSP\nconcepts and techniques originally designed for graph nodes, without the need\nfor redefinition on the edges. Experimenting with transportation graphs and\nmeteorological graphs, with the signal observations having noisy and missing\nvalues, we confirmed that LGLMS is suitable for the online prediction of\ntime-varying edge signals.\n","authors":["Yi Yan","Ercan Engin Kuruoglu"],"pdf_url":"https://arxiv.org/pdf/2311.00656v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17128v2","updated":"2024-10-23T06:51:54Z","published":"2024-10-22T16:00:44Z","title":"Understanding Transfer Learning via Mean-field Analysis","summary":" We propose a novel framework for exploring generalization errors of transfer\nlearning through the lens of differential calculus on the space of probability\nmeasures. In particular, we consider two main transfer learning scenarios,\n$\\alpha$-ERM and fine-tuning with the KL-regularized empirical risk\nminimization and establish generic conditions under which the generalization\nerror and the population risk convergence rates for these scenarios are\nstudied. Based on our theoretical results, we show the benefits of transfer\nlearning with a one-hidden-layer neural network in the mean-field regime under\nsome suitable integrability and regularity assumptions on the loss and\nactivation functions.\n","authors":["Gholamali Aminian","Łukasz Szpruch","Samuel N. Cohen"],"pdf_url":"https://arxiv.org/pdf/2410.17128v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2410.14759v2","updated":"2024-10-23T06:51:05Z","published":"2024-10-18T09:53:20Z","title":"Universal approximation results for neural networks with non-polynomial\n activation function over non-compact domains","summary":" In this paper, we generalize the universal approximation property of\nsingle-hidden-layer feed-forward neural networks beyond the classical\nformulation over compact domains. More precisely, by assuming that the\nactivation function is non-polynomial, we derive universal approximation\nresults for neural networks within function spaces over non-compact subsets of\na Euclidean space, e.g., weighted spaces, $L^p$-spaces, and (weighted) Sobolev\nspaces over unbounded domains, where the latter includes the approximation of\nthe (weak) derivatives. 
Furthermore, we provide some dimension-independent\nrates for approximating a function with sufficiently regular and integrable\nFourier transform by neural networks with non-polynomial activation function.\n","authors":["Ariel Neufeld","Philipp Schmocker"],"pdf_url":"https://arxiv.org/pdf/2410.14759v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2312.08410"},{"id":"http://arxiv.org/abs/2410.17592v1","updated":"2024-10-23T06:40:13Z","published":"2024-10-23T06:40:13Z","title":"A Kernel Perspective on Distillation-based Collaborative Learning","summary":" Over the past decade, there has been growing interest in collaborative learning\nthat can enhance AI models of multiple parties. However, it is still\nchallenging to enhance their performance without sharing private data and models\nfrom individual parties. One recent promising approach is to develop\ndistillation-based algorithms that exploit unlabeled public data, but the\nresults are still unsatisfactory in both theory and practice. To tackle this\nproblem, we rigorously analyze a representative distillation-based algorithm from\nthe viewpoint of kernel regression. This work provides the first theoretical results\nto prove the (nearly) minimax optimality of the nonparametric collaborative\nlearning algorithm that does not directly share local data or models in\nmassively distributed statistically heterogeneous environments. Inspired by our\ntheoretical results, we also propose a practical distillation-based\ncollaborative learning algorithm based on neural network architecture. Our\nalgorithm successfully bridges the gap between our theoretical assumptions and\npractical settings with neural networks through feature kernel matching. We\nsimulate various regression tasks to verify our theory and demonstrate the\npractical feasibility of our proposed algorithm.\n","authors":["Sejun Park","Kihun Hong","Ganguk Hwang"],"pdf_url":"https://arxiv.org/pdf/2410.17592v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.17589v1","updated":"2024-10-23T06:35:41Z","published":"2024-10-23T06:35:41Z","title":"Challenge on Sound Scene Synthesis: Evaluating Text-to-Audio Generation","summary":" Despite significant advancements in neural text-to-audio generation,\nchallenges persist in controllability and evaluation. This paper addresses\nthese issues through the Sound Scene Synthesis challenge held as part of the\nDetection and Classification of Acoustic Scenes and Events 2024. We present an\nevaluation protocol combining objective metric, namely Fr\\'echet Audio\nDistance, with perceptual assessments, utilizing a structured prompt format to\nenable diverse captions and effective evaluation. Our analysis reveals varying\nperformance across sound categories and model architectures, with larger models\ngenerally excelling but innovative lightweight approaches also showing promise.\nThe strong correlation between objective metrics and human ratings validates\nour evaluation approach. We discuss outcomes in terms of audio quality,\ncontrollability, and architectural considerations for text-to-audio\nsynthesizers, providing direction for future research.\n","authors":["Junwon Lee","Modan Tailleur","Laurie M. 
Heller","Keunwoo Choi","Mathieu Lagrange","Brian McFee","Keisuke Imoto","Yuki Okamoto"],"pdf_url":"https://arxiv.org/pdf/2410.17589v1.pdf","comment":"accepted to NeurIPS 2024 Workshop: Audio Imagination"},{"id":"http://arxiv.org/abs/2410.17587v1","updated":"2024-10-23T06:30:20Z","published":"2024-10-23T06:30:20Z","title":"Predicting Company Growth by Econophysics informed Machine Learning","summary":" Predicting company growth is crucial for strategic adjustment, operational\ndecision-making, risk assessment, and loan eligibility reviews. Traditional\nmodels for company growth often focus too much on theory, overlooking practical\nforecasting, or they rely solely on time series forecasting techniques,\nignoring interpretability and the inherent mechanisms of company growth. In\nthis paper, we propose a machine learning-based prediction framework that\nincorporates an econophysics model for company growth. Our model captures both\nthe intrinsic growth mechanisms of companies led by scaling laws and the\nfluctuations influenced by random factors and individual decisions,\ndemonstrating superior predictive performance compared with methods that use\ntime series techniques alone. Its advantages are more pronounced in long-range\nprediction tasks. By explicitly modeling the baseline growth and volatility\ncomponents, our model is more interpretable.\n","authors":["Ruyi Tao","Kaiwei Liu","Xu Jing","Jiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.17587v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2402.12617v2","updated":"2024-10-23T06:28:19Z","published":"2024-02-20T00:51:05Z","title":"Generative AI Security: Challenges and Countermeasures","summary":" Generative AI's expanding footprint across numerous industries has led to\nboth excitement and increased scrutiny. This paper delves into the unique\nsecurity challenges posed by Generative AI, and outlines potential research\ndirections for managing these risks.\n","authors":["Banghua Zhu","Norman Mu","Jiantao Jiao","David Wagner"],"pdf_url":"https://arxiv.org/pdf/2402.12617v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15012v2","updated":"2024-10-23T06:27:26Z","published":"2024-03-22T07:56:31Z","title":"Empirical investigation of multi-source cross-validation in clinical ECG\n classification","summary":" Traditionally, machine learning-based clinical prediction models have been\ntrained and evaluated on patient data from a single source, such as a hospital.\nCross-validation methods can be used to estimate the accuracy of such models on\nnew patients originating from the same source, by repeated random splitting of\nthe data. However, such estimates tend to be highly overoptimistic when\ncompared to accuracy obtained from deploying models to sources not represented\nin the dataset, such as a new hospital. The increasing availability of\nmulti-source medical datasets provides new opportunities for obtaining more\ncomprehensive and realistic evaluations of expected accuracy through\nsource-level cross-validation designs.\n In this study, we present a systematic empirical evaluation of standard\nK-fold cross-validation and leave-source-out cross-validation methods in a\nmulti-source setting. 
We consider the task of electrocardiogram based\ncardiovascular disease classification, combining and harmonizing the openly\navailable PhysioNet CinC Challenge 2021 and the Shandong Provincial Hospital\ndatasets for our study.\n Our results show that K-fold cross-validation, both on single-source and\nmulti-source data, systematically overestimates prediction performance when the\nend goal is to generalize to new sources. Leave-source-out cross-validation\nprovides more reliable performance estimates, having close to zero bias though\nlarger variability. The evaluation highlights the dangers of obtaining\nmisleading cross-validation results on medical data and demonstrates how these\nissues can be mitigated when having access to multi-source data.\n","authors":["Tuija Leinonen","David Wong","Antti Vasankari","Ali Wahab","Ramesh Nadarajah","Matti Kaisti","Antti Airola"],"pdf_url":"https://arxiv.org/pdf/2403.15012v2.pdf","comment":"19 pages, 5 figures"},{"id":"http://arxiv.org/abs/2410.17579v1","updated":"2024-10-23T06:08:45Z","published":"2024-10-23T06:08:45Z","title":"Bonsai: Gradient-free Graph Distillation for Node Classification","summary":" Graph distillation has emerged as a promising avenue to enable scalable\ntraining of GNNs by compressing the training dataset while preserving essential\ngraph characteristics. Our study uncovers significant shortcomings in current\ngraph distillation techniques. First, the majority of the algorithms\nparadoxically require training on the full dataset to perform distillation.\nSecond, due to their gradient-emulating approach, these methods require fresh\ndistillation for any change in hyperparameters or GNN architecture, limiting\ntheir flexibility and reusability. Finally, they fail to achieve substantial\nsize reduction due to synthesizing fully-connected, edge-weighted graphs. To\naddress these challenges, we present Bonsai, a novel graph distillation method\nempowered by the observation that \\textit{computation trees} form the\nfundamental processing units of message-passing GNNs. Bonsai distills datasets\nby encoding a careful selection of \\textit{exemplar} trees that maximize the\nrepresentation of all computation trees in the training set. This unique\napproach establishes Bonsai as the first linear-time, model-agnostic graph\ndistillation algorithm for node classification that outperforms existing\nbaselines across $6$ real-world datasets on accuracy, while being $22$ times\nfaster on average. Bonsai is grounded in rigorous mathematical guarantees on\nthe adopted approximation strategies, making it robust to GNN architectures,\ndatasets, and parameters.\n","authors":["Mridul Gupta","Samyak Jain","Vansh Ramani","Hariprasad Kodamana","Sayan Ranu"],"pdf_url":"https://arxiv.org/pdf/2410.17579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03801v2","updated":"2024-10-23T06:05:11Z","published":"2024-10-04T08:14:24Z","title":"P1-KAN an effective Kolmogorov Arnold Network for function approximation","summary":" A new Kolmogorov-Arnold network (KAN) is proposed to approximate potentially\nirregular functions in high dimension. We show that it outperforms multilayer\nperceptrons in terms of accuracy and converges faster. 
We also compare it with\nseveral proposed KAN networks: the original spline-based KAN network appears to\nbe more effective for smooth functions, while the P1-KAN network is more\neffective for irregular functions.\n","authors":["Xavier Warin"],"pdf_url":"https://arxiv.org/pdf/2410.03801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16811v2","updated":"2024-10-23T05:57:12Z","published":"2024-10-22T08:38:46Z","title":"Masked Clinical Modelling: A Framework for Synthetic and Augmented\n Survival Data Generation","summary":" Access to real clinical data is often restricted due to privacy obligations,\ncreating significant barriers for healthcare research. Synthetic datasets\nprovide a promising solution, enabling secure data sharing and model\ndevelopment. However, most existing approaches focus on data realism rather\nthan utility -- ensuring that models trained on synthetic data yield clinically\nmeaningful insights comparable to those trained on real data. In this paper, we\npresent Masked Clinical Modelling (MCM), a framework inspired by masked\nlanguage modelling, designed for both data synthesis and conditional data\naugmentation. We evaluate this prototype on the WHAS500 dataset using Cox\nProportional Hazards models, focusing on the preservation of hazard ratios as\nkey clinical metrics. Our results show that data generated using the MCM\nframework improves both discrimination and calibration in survival analysis,\noutperforming existing methods. MCM demonstrates strong potential to support\nsurvival data analysis and broader healthcare applications.\n","authors":["Nicholas I-Hsien Kuo","Blanca Gallego","Louisa Jorm"],"pdf_url":"https://arxiv.org/pdf/2410.16811v2.pdf","comment":"Re-archived due to incorrect ORCiD. Last edited: 2024-10-23"},{"id":"http://arxiv.org/abs/2410.17574v1","updated":"2024-10-23T05:55:21Z","published":"2024-10-23T05:55:21Z","title":"Adversarial Domain Adaptation for Metal Cutting Sound Detection:\n Leveraging Abundant Lab Data for Scarce Industry Data","summary":" Cutting state monitoring in the milling process is crucial for improving\nmanufacturing efficiency and tool life. Cutting sound detection using machine\nlearning (ML) models, inspired by experienced machinists, can be employed as a\ncost-effective and non-intrusive monitoring method in a complex manufacturing\nenvironment. However, labeling industry data for training is costly and\ntime-consuming. Moreover, industry data is often scarce. In this study, we\npropose a novel adversarial domain adaptation (DA) approach to leverage\nabundant lab data to learn from scarce industry data, both labeled, for\ntraining a cutting-sound detection model. Rather than adapting the features\nfrom separate domains directly, we project them first into two separate latent\nspaces that jointly work as the feature space for learning domain-independent\nrepresentations. We also analyze two different mechanisms for adversarial\nlearning where the discriminator works as an adversary and a critic in separate\nsettings, enabling our model to learn expressive domain-invariant and\ndomain-ingrained features, respectively. We collected cutting sound data from\nmultiple sensors in different locations, prepared datasets from lab and\nindustry domain, and evaluated our learning models on them. 
Experiments showed\nthat our models outperformed the multi-layer perceptron based vanilla domain\nadaptation models in labeling tasks on the curated datasets, achieving near\n92%, 82% and 85% accuracy respectively for three different sensors installed in\nindustry settings.\n","authors":["Mir Imtiaz Mostafiz","Eunseob Kim","Adrian Shuai Li","Elisa Bertino","Martin Byung-Guk Jun","Ali Shakouri"],"pdf_url":"https://arxiv.org/pdf/2410.17574v1.pdf","comment":"8 pages, 3 figures, 3 tables, First two named Authors have equal\n contribution (Co-first author)"},{"id":"http://arxiv.org/abs/2410.17573v1","updated":"2024-10-23T05:54:41Z","published":"2024-10-23T05:54:41Z","title":"Securing Federated Learning Against Novel and Classic Backdoor Threats\n During Foundation Model Integration","summary":" Federated learning (FL) enables decentralized model training while preserving\nprivacy. Recently, integrating Foundation Models (FMs) into FL has boosted\nperformance but also introduced a novel backdoor attack mechanism. Attackers\ncan exploit the FM's capabilities to embed backdoors into synthetic data\ngenerated by FMs used for model fusion, subsequently infecting all client\nmodels through knowledge sharing without involvement in the long-lasting FL\nprocess. These novel attacks render existing FL backdoor defenses ineffective,\nas they primarily detect anomalies among client updates, which may appear\nuniformly malicious under this attack. Our work proposes a novel data-free\ndefense strategy by constraining abnormal activations in the hidden feature\nspace during model aggregation on the server. The activation constraints,\noptimized using synthetic data alongside FL training, mitigate the attack while\nbarely affecting model performance, as the parameters remain untouched.\nExtensive experiments demonstrate its effectiveness against both novel and\nclassic backdoor attacks, outperforming existing defenses while maintaining\nmodel performance.\n","authors":["Xiaohuan Bi","Xi Li"],"pdf_url":"https://arxiv.org/pdf/2410.17573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17555v2","updated":"2024-10-23T05:49:00Z","published":"2024-09-26T05:57:35Z","title":"Advancing Open-Set Domain Generalization Using Evidential Bi-Level\n Hardest Domain Scheduler","summary":" In Open-Set Domain Generalization (OSDG), the model is exposed to both new\nvariations of data appearance (domains) and open-set conditions, where both\nknown and novel categories are present at test time. The challenges of this\ntask arise from the dual need to generalize across diverse domains and\naccurately quantify category novelty, which is critical for applications in\ndynamic environments. Recently, meta-learning techniques have demonstrated\nsuperior results in OSDG, effectively orchestrating the meta-train and -test\ntasks by employing varied random categories and predefined domain partition\nstrategies. These approaches prioritize a well-designed training schedule over\ntraditional methods that focus primarily on data augmentation and the\nenhancement of discriminative feature learning. The prevailing meta-learning\nmodels in OSDG typically utilize a predefined sequential domain scheduler to\nstructure data partitions. However, a crucial aspect that remains inadequately\nexplored is the influence brought by strategies of domain schedulers during\ntraining. In this paper, we observe that an adaptive domain scheduler benefits\nmore in OSDG compared with prefixed sequential and random domain schedulers. 
We\npropose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve\nan adaptive domain scheduler. This method strategically sequences domains by\nassessing their reliabilities in utilizing a follower network, trained with\nconfidence scores learned in an evidential manner, regularized by max rebiasing\ndiscrepancy, and optimized in a bi-level manner. The results show that our\nmethod substantially improves OSDG performance and achieves more discriminative\nembeddings for both the seen and unseen categories. The source code is publicly\navailable at https://github.com/KPeng9510/EBiL-HaDS.\n","authors":["Kunyu Peng","Di Wen","Kailun Yang","Ao Luo","Yufan Chen","Jia Fu","M. Saquib Sarfraz","Alina Roitberg","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2409.17555v2.pdf","comment":"Accepted to NeurIPS 2024. The source code is publicly available at\n https://github.com/KPeng9510/EBiL-HaDS"},{"id":"http://arxiv.org/abs/2405.16194v3","updated":"2024-10-23T05:47:21Z","published":"2024-05-25T11:53:23Z","title":"Diffusion-Reward Adversarial Imitation Learning","summary":" Imitation learning aims to learn a policy from observing expert\ndemonstrations without access to reward signals from environments. Generative\nadversarial imitation learning (GAIL) formulates imitation learning as\nadversarial learning, employing a generator policy learning to imitate expert\nbehaviors and discriminator learning to distinguish the expert demonstrations\nfrom agent trajectories. Despite its encouraging results, GAIL training is\noften brittle and unstable. Inspired by the recent dominance of diffusion\nmodels in generative modeling, we propose Diffusion-Reward Adversarial\nImitation Learning (DRAIL), which integrates a diffusion model into GAIL,\naiming to yield more robust and smoother rewards for policy learning.\nSpecifically, we propose a diffusion discriminative classifier to construct an\nenhanced discriminator, and design diffusion rewards based on the classifier's\noutput for policy learning. Extensive experiments are conducted in navigation,\nmanipulation, and locomotion, verifying DRAIL's effectiveness compared to prior\nimitation learning methods. Moreover, additional experimental results\ndemonstrate the generalizability and data efficiency of DRAIL. Visualized\nlearned reward functions of GAIL and DRAIL suggest that DRAIL can produce more\nrobust and smoother rewards. Project page:\nhttps://nturobotlearninglab.github.io/DRAIL/\n","authors":["Chun-Mao Lai","Hsiang-Chun Wang","Ping-Chun Hsieh","Yu-Chiang Frank Wang","Min-Hung Chen","Shao-Hua Sun"],"pdf_url":"https://arxiv.org/pdf/2405.16194v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04869v3","updated":"2024-10-23T05:44:21Z","published":"2024-08-09T05:15:36Z","title":"UCB Exploration for Fixed-Budget Bayesian Best Arm Identification","summary":" We study best-arm identification (BAI) in the fixed-budget setting. Adaptive\nallocations based on upper confidence bounds (UCBs), such as UCBE, are known to\nwork well in BAI. However, it is well-known that its optimal regret is\ntheoretically dependent on instances, which we show to be an artifact in many\nfixed-budget BAI problems. In this paper we propose an UCB exploration\nalgorithm that is both theoretically and empirically efficient for the fixed\nbudget BAI problem under a Bayesian setting. The key idea is to learn prior\ninformation, which can enhance the performance of UCB-based BAI algorithm as it\nhas done in the cumulative regret minimization problem. 
We establish bounds on\nthe failure probability and the simple regret for the Bayesian BAI problem,\nproviding upper bounds of order $\\tilde{O}(\\sqrt{K/n})$, up to logarithmic\nfactors, where $n$ represents the budget and $K$ denotes the number of arms.\nFurthermore, we demonstrate through empirical results that our approach\nconsistently outperforms state-of-the-art baselines.\n","authors":["Rong J. B. Zhu","Yanqi Qiu"],"pdf_url":"https://arxiv.org/pdf/2408.04869v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.12018v2","updated":"2024-10-23T15:21:54Z","published":"2024-10-15T19:33:57Z","title":"LocoMotion: Learning Motion-Focused Video-Language Representations","summary":" This paper strives for motion-focused video-language representations.\nExisting methods to learn video-language representations use spatial-focused\ndata, where identifying the objects and scene is often enough to distinguish\nthe relevant caption. We instead propose LocoMotion to learn from\nmotion-focused captions that describe the movement and temporal progression of\nlocal object motions. We achieve this by adding synthetic motions to videos and\nusing the parameters of these motions to generate corresponding captions.\nFurthermore, we propose verb-variation paraphrasing to increase the caption\nvariety and learn the link between primitive motions and high-level verbs. With\nthis, we are able to learn a motion-focused video-language representation.\nExperiments demonstrate our approach is effective for a variety of downstream\ntasks, particularly when limited data is available for fine-tuning. Code is\navailable: https://hazeldoughty.github.io/Papers/LocoMotion/\n","authors":["Hazel Doughty","Fida Mohammad Thoker","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2410.12018v2.pdf","comment":"ACCV 2024 Oral"},{"id":"http://arxiv.org/abs/2409.02845v3","updated":"2024-10-23T12:12:44Z","published":"2024-09-04T16:17:41Z","title":"Multi-Track MusicLDM: Towards Versatile Music Generation with Latent\n Diffusion Model","summary":" Diffusion models have shown promising results in cross-modal generation tasks\ninvolving audio and music, such as text-to-sound and text-to-music generation.\nThese text-controlled music generation models typically focus on generating\nmusic by capturing global musical attributes like genre and mood. However,\nmusic composition is a complex, multilayered task that often involves musical\narrangement as an integral part of the process. This process involves composing\neach instrument to align with existing ones in terms of beat, dynamics,\nharmony, and melody, requiring greater precision and control over tracks than\ntext prompts usually provide. In this work, we address these challenges by\nextending the MusicLDM, a latent diffusion model for music, into a multi-track\ngenerative model. By learning the joint probability of tracks sharing a\ncontext, our model is capable of generating music across several tracks that\ncorrespond well to each other, either conditionally or unconditionally.\nAdditionally, our model is capable of arrangement generation, where the model\ncan generate any subset of tracks given the others (e.g., generating a piano\ntrack complementing given bass and drum tracks). 
We compared our model with an\nexisting multi-track generative model and demonstrated that our model achieves\nconsiderable improvements across objective metrics for both total and\narrangement generation tasks.\n","authors":["Tornike Karchkhadze","Mohammad Rasool Izadi","Ke Chen","Gerard Assayag","Shlomo Dubnov"],"pdf_url":"https://arxiv.org/pdf/2409.02845v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03394v3","updated":"2024-10-23T10:57:53Z","published":"2023-07-07T05:33:54Z","title":"Dual Inverse Degradation Network for Real-World SDRTV-to-HDRTV\n Conversion","summary":" In this study, we address the emerging necessity of converting Standard\nDynamic Range Television (SDRTV) content into High Dynamic Range Television\n(HDRTV) in light of the limited number of native HDRTV content. A principal\ntechnical challenge in this conversion is the exacerbation of coding artifacts\ninherent in SDRTV, which detrimentally impacts the quality of the resulting\nHDRTV. To address this issue, our method introduces a novel approach that\nconceptualizes the SDRTV-to-HDRTV conversion as a composite task involving dual\ndegradation restoration. This encompasses inverse tone mapping in conjunction\nwith video restoration. We propose Dual Inversion Downgraded SDRTV to HDRTV\nNetwork (DIDNet), which can accurately perform inverse tone mapping while\npreventing encoding artifacts from being amplified, thereby significantly\nimproving visual quality. DIDNet integrates an intermediate auxiliary loss\nfunction to effectively separate the dual degradation restoration tasks and\nefficient learning of both artifact reduction and inverse tone mapping during\nend-to-end training. Additionally, DIDNet introduces a spatio-temporal feature\nalignment module for video frame fusion, which augments texture quality and\nreduces artifacts. The architecture further includes a dual-modulation\nconvolution mechanism for optimized inverse tone mapping. Recognizing the\nricher texture and high-frequency information in HDRTV compared to SDRTV, we\nfurther introduce a wavelet attention module to enhance frequency features. Our\napproach demonstrates marked superiority over existing state-of-the-art\ntechniques in terms of quantitative performance and visual quality.\n","authors":["Kepeng Xu","Li Xu","Gang He","Xianyun Wu","Zhiqiang Zhang","Wenxin Yu","Yunsong Li"],"pdf_url":"https://arxiv.org/pdf/2307.03394v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17589v1","updated":"2024-10-23T06:35:41Z","published":"2024-10-23T06:35:41Z","title":"Challenge on Sound Scene Synthesis: Evaluating Text-to-Audio Generation","summary":" Despite significant advancements in neural text-to-audio generation,\nchallenges persist in controllability and evaluation. This paper addresses\nthese issues through the Sound Scene Synthesis challenge held as part of the\nDetection and Classification of Acoustic Scenes and Events 2024. We present an\nevaluation protocol combining objective metric, namely Fr\\'echet Audio\nDistance, with perceptual assessments, utilizing a structured prompt format to\nenable diverse captions and effective evaluation. Our analysis reveals varying\nperformance across sound categories and model architectures, with larger models\ngenerally excelling but innovative lightweight approaches also showing promise.\nThe strong correlation between objective metrics and human ratings validates\nour evaluation approach. 
We discuss outcomes in terms of audio quality,\ncontrollability, and architectural considerations for text-to-audio\nsynthesizers, providing direction for future research.\n","authors":["Junwon Lee","Modan Tailleur","Laurie M. Heller","Keunwoo Choi","Mathieu Lagrange","Brian McFee","Keisuke Imoto","Yuki Okamoto"],"pdf_url":"https://arxiv.org/pdf/2410.17589v1.pdf","comment":"accepted to NeurIPS 2024 Workshop: Audio Imagination"},{"id":"http://arxiv.org/abs/2410.15573v2","updated":"2024-10-23T06:21:09Z","published":"2024-10-21T01:36:42Z","title":"OpenMU: Your Swiss Army Knife for Music Understanding","summary":" We present OpenMU-Bench, a large-scale benchmark suite for addressing the\ndata scarcity issue in training multimodal language models to understand music.\nTo construct OpenMU-Bench, we leveraged existing datasets and bootstrapped new\nannotations. OpenMU-Bench also broadens the scope of music understanding by\nincluding lyrics understanding and music tool usage. Using OpenMU-Bench, we\ntrained our music understanding model, OpenMU, with extensive ablations,\ndemonstrating that OpenMU outperforms baseline models such as MU-Llama. Both\nOpenMU and OpenMU-Bench are open-sourced to facilitate future research in music\nunderstanding and to enhance creative music production efficiency.\n","authors":["Mengjie Zhao","Zhi Zhong","Zhuoyuan Mao","Shiqi Yang","Wei-Hsiang Liao","Shusuke Takahashi","Hiromi Wakaki","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2410.15573v2.pdf","comment":"Resources: https://github.com/mzhaojp22/openmu"},{"id":"http://arxiv.org/abs/2309.12029v2","updated":"2024-10-23T04:36:26Z","published":"2023-09-21T12:51:11Z","title":"Exploring Self-Supervised Skeleton-Based Human Action Recognition under\n Occlusions","summary":" To integrate self-supervised skeleton-based action recognition methods into\nautonomous robotic systems, it is crucial to consider adverse situations\ninvolving target occlusions. Such a scenario, despite its practical relevance,\nis rarely addressed in existing self-supervised skeleton-based action\nrecognition methods. To empower models with the capacity to address occlusion,\nwe propose a simple and effective method. We first pre-train using occluded\nskeleton sequences, then use k-means clustering (KMeans) on sequence embeddings\nto group semantically similar samples. Next, we propose KNN-Imputation to fill\nin missing skeleton data based on the closest sample neighbors. Imputing\nincomplete skeleton sequences to create relatively complete sequences as input\nprovides significant benefits to existing skeleton-based self-supervised\nmethods. Meanwhile, building on the state-of-the-art Partial Spatio-Temporal\nLearning (PSTL), we introduce an Occluded Partial Spatio-Temporal Learning\n(OPSTL) framework. This enhancement utilizes Adaptive Spatial Masking (ASM) for\nbetter use of high-quality, intact skeletons. The new proposed method is\nverified on the challenging occluded versions of the NTURGB+D 60 and NTURGB+D\n120. 
The source code is publicly available at https://github.com/cyfml/OPSTL.\n","authors":["Yifei Chen","Kunyu Peng","Alina Roitberg","David Schneider","Jiaming Zhang","Junwei Zheng","Ruiping Liu","Yufan Chen","Kailun Yang","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2309.12029v2.pdf","comment":"The source code is publicly available at\n https://github.com/cyfml/OPSTL"},{"id":"http://arxiv.org/abs/2406.14515v2","updated":"2024-10-23T03:09:54Z","published":"2024-06-20T17:26:01Z","title":"MMBench-Video: A Long-Form Multi-Shot Benchmark for Holistic Video\n Understanding","summary":" The advent of large vision-language models (LVLMs) has spurred research into\ntheir applications in multi-modal contexts, particularly in video\nunderstanding. Traditional VideoQA benchmarks, despite providing quantitative\nmetrics, often fail to encompass the full spectrum of video content and\ninadequately assess models' temporal comprehension. To address these\nlimitations, we introduce MMBench-Video, a quantitative benchmark designed to\nrigorously evaluate LVLMs' proficiency in video understanding. MMBench-Video\nincorporates lengthy videos from YouTube and employs free-form questions,\nmirroring practical use cases. The benchmark is meticulously crafted to probe\nthe models' temporal reasoning skills, with all questions human-annotated\naccording to a carefully constructed ability taxonomy. We employ GPT-4 for\nautomated assessment, demonstrating superior accuracy and robustness over\nearlier LLM-based evaluations. Utilizing MMBench-Video, we have conducted\ncomprehensive evaluations that include both proprietary and open-source LVLMs\nfor images and videos. MMBench-Video stands as a valuable resource for the\nresearch community, facilitating improved evaluation of LVLMs and catalyzing\nprogress in the field of video understanding. The evaluation code of\nMMBench-Video will be integrated into VLMEvalKit:\nhttps://github.com/open-compass/VLMEvalKit.\n","authors":["Xinyu Fang","Kangrui Mao","Haodong Duan","Xiangyu Zhao","Yining Li","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2406.14515v2.pdf","comment":"Accepted in NeurIPS 2024 Datasets and Benchmarks Track"}]},"2024-10-22T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2410.17477v1","updated":"2024-10-22T23:24:15Z","published":"2024-10-22T23:24:15Z","title":"Do Robot Snakes Dream like Electric Sheep? Investigating the Effects of\n Architectural Inductive Biases on Hallucination","summary":" The growth in prominence of large language models (LLMs) in everyday life can\nbe largely attributed to their generative abilities, yet some of this is also\nowed to the risks and costs associated with their use. On one front is their\ntendency to \\textit{hallucinate} false or misleading information, limiting\ntheir reliability. On another is the increasing focus on the computational\nlimitations associated with traditional self-attention based LLMs, which has\nbrought about new alternatives, in particular recurrent models, meant to\novercome them. Yet it remains uncommon to consider these two concerns\nsimultaneously. Do changes in architecture exacerbate/alleviate existing\nconcerns about hallucinations? Do they affect how and where they occur? Through\nan extensive evaluation, we study how these architecture-based inductive biases\naffect the propensity to hallucinate. 
While hallucination remains a general\nphenomenon not limited to specific architectures, the situations in which they\noccur and the ease with which specific types of hallucinations can be induced\ncan significantly differ based on the model architecture. These findings\nhighlight the need for better understanding both these problems in conjunction\nwith each other, as well as consider how to design more universal techniques\nfor handling hallucinations.\n","authors":["Jerry Huang","Prasanna Parthasarathi","Mehdi Rezagholizadeh","Boxing Chen","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2410.17477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11402v2","updated":"2024-10-22T23:13:34Z","published":"2024-09-17T17:59:06Z","title":"NVLM: Open Frontier-Class Multimodal LLMs","summary":" We introduce NVLM 1.0, a family of frontier-class multimodal large language\nmodels (LLMs) that achieve state-of-the-art results on vision-language tasks,\nrivaling the leading proprietary models (e.g., GPT-4o) and open-access models\n(e.g., Llama 3-V 405B and InternVL 2). Remarkably, NVLM 1.0 shows improved\ntext-only performance over its LLM backbone after multimodal training. In terms\nof model design, we perform a comprehensive comparison between decoder-only\nmultimodal LLMs (e.g., LLaVA) and cross-attention-based models (e.g.,\nFlamingo). Based on the strengths and weaknesses of both approaches, we propose\na novel architecture that enhances both training efficiency and multimodal\nreasoning capabilities. Furthermore, we introduce a 1-D tile-tagging design for\ntile-based dynamic high-resolution images, which significantly boosts\nperformance on multimodal reasoning and OCR-related tasks. Regarding training\ndata, we meticulously curate and provide detailed information on our multimodal\npretraining and supervised fine-tuning datasets. Our findings indicate that\ndataset quality and task diversity are more important than scale, even during\nthe pretraining phase, across all architectures. Notably, we develop\nproduction-grade multimodality for the NVLM-1.0 models, enabling them to excel\nin vision-language tasks while maintaining and even improving text-only\nperformance compared to their LLM backbones. To achieve this, we craft and\nintegrate a high-quality text-only dataset into multimodal training, alongside\na substantial amount of multimodal math and reasoning data, leading to enhanced\nmath and coding capabilities across modalities. To advance research in the\nfield, we release the model weights at https://huggingface.co/nvidia/NVLM-D-72B\nand will open-source the training code for the community soon.\n","authors":["Wenliang Dai","Nayeon Lee","Boxin Wang","Zhuolin Yang","Zihan Liu","Jon Barker","Tuomas Rintamaki","Mohammad Shoeybi","Bryan Catanzaro","Wei Ping"],"pdf_url":"https://arxiv.org/pdf/2409.11402v2.pdf","comment":"Fixed the typos. For more information, please visit our project page\n at: https://research.nvidia.com/labs/adlr/NVLM-1"},{"id":"http://arxiv.org/abs/2410.17462v1","updated":"2024-10-22T22:43:14Z","published":"2024-10-22T22:43:14Z","title":"Decoding Time Series with LLMs: A Multi-Agent Framework for Cross-Domain\n Annotation","summary":" Time series data is ubiquitous across various domains, including\nmanufacturing, finance, and healthcare. High-quality annotations are essential\nfor effectively understanding time series and facilitating downstream tasks;\nhowever, obtaining such annotations is challenging, particularly in\nmission-critical domains. 
In this paper, we propose TESSA, a multi-agent system\ndesigned to automatically generate both general and domain-specific annotations\nfor time series data. TESSA introduces two agents: a general annotation agent\nand a domain-specific annotation agent. The general agent captures common\npatterns and knowledge across multiple source domains, leveraging both\ntime-series-wise and text-wise features to generate general annotations.\nMeanwhile, the domain-specific agent utilizes limited annotations from the\ntarget domain to learn domain-specific terminology and generate targeted\nannotations. Extensive experiments on multiple synthetic and real-world\ndatasets demonstrate that TESSA effectively generates high-quality annotations,\noutperforming existing methods.\n","authors":["Minhua Lin","Zhengzhang Chen","Yanchi Liu","Xujiang Zhao","Zongyu Wu","Junxiang Wang","Xiang Zhang","Suhang Wang","Haifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2410.17462v1.pdf","comment":"23 pages, 9 figures, 24 tables"},{"id":"http://arxiv.org/abs/2410.14872v2","updated":"2024-10-22T22:18:14Z","published":"2024-10-18T21:38:21Z","title":"How to Evaluate Reward Models for RLHF","summary":" We introduce a new benchmark for reward models that quantifies their ability\nto produce strong language models through RLHF (Reinforcement Learning from\nHuman Feedback). The gold-standard approach is to run a full RLHF training\npipeline and directly probe downstream LLM performance. However, this process\nis prohibitively expensive. To address this, we build a predictive model of\ndownstream LLM performance by evaluating the reward model on proxy tasks. These\nproxy tasks consist of a large-scale human preference and a verifiable\ncorrectness preference dataset, in which we measure 12 metrics across 12\ndomains. To investigate which reward model metrics are most correlated to\ngold-standard RLHF outcomes, we launch an end-to-end RLHF experiment on a\nlarge-scale crowdsourced human preference platform to view real reward model\ndownstream performance as ground truth. Ultimately, we compile our data and\nfindings into Preference Proxy Evaluations (PPE), the first reward model\nbenchmark explicitly linked to post-RLHF real-world human preference\nperformance, which we open-source for public use and further development. Our\ncode and evaluations can be found at https://github.com/lmarena/PPE .\n","authors":["Evan Frick","Tianle Li","Connor Chen","Wei-Lin Chiang","Anastasios N. Angelopoulos","Jiantao Jiao","Banghua Zhu","Joseph E. Gonzalez","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2410.14872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13639v2","updated":"2024-10-22T22:05:16Z","published":"2024-10-17T15:09:03Z","title":"A Comparative Study on Reasoning Patterns of OpenAI's o1 Model","summary":" Enabling Large Language Models (LLMs) to handle a wider range of complex\ntasks (e.g., coding, math) has drawn great attention from many researchers. As\nLLMs continue to evolve, merely increasing the number of model parameters\nyields diminishing performance improvements and heavy computational costs.\nRecently, OpenAI's o1 model has shown that inference strategies (i.e.,\nTest-time Compute methods) can also significantly enhance the reasoning\ncapabilities of LLMs. However, the mechanisms behind these methods are still\nunexplored. 
In our work, to investigate the reasoning patterns of o1, we\ncompare o1 with existing Test-time Compute methods (BoN, Step-wise BoN, Agent\nWorkflow, and Self-Refine) by using OpenAI's GPT-4o as a backbone on general\nreasoning benchmarks in three domains (i.e., math, coding, commonsense\nreasoning). Specifically, first, our experiments show that the o1 model has\nachieved the best performance on most datasets. Second, as for the methods of\nsearching diverse responses (e.g., BoN), we find the reward models' capability\nand the search space both limit the upper boundary of these methods. Third, as\nfor the methods that break the problem into many sub-problems, the Agent\nWorkflow has achieved better performance than Step-wise BoN due to the\ndomain-specific system prompt for planning better reasoning processes. Fourth,\nit is worth mentioning that we have summarized six reasoning patterns of o1,\nand provided a detailed analysis on several reasoning benchmarks.\n","authors":["Siwei Wu","Zhongyuan Peng","Xinrun Du","Tuney Zheng","Minghao Liu","Jialong Wu","Jiachen Ma","Yizhi Li","Jian Yang","Wangchunshu Zhou","Qunshu Lin","Junbo Zhao","Zhaoxiang Zhang","Wenhao Huang","Ge Zhang","Chenghua Lin","J. H. Liu"],"pdf_url":"https://arxiv.org/pdf/2410.13639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17450v1","updated":"2024-10-22T21:55:54Z","published":"2024-10-22T21:55:54Z","title":"Interação entre robôs humanoides: desenvolvendo a\n colaboração e comunicação autônoma","summary":" This study investigates the interaction between humanoid robots NAO and\nPepper, emphasizing their potential applications in educational settings. NAO,\nwidely used in education, and Pepper, designed for social interactions, offer\nnew opportunities for autonomous communication and collaboration. Through a\nseries of programmed interactions, the robots demonstrated their ability to\ncommunicate and coordinate actions autonomously, highlighting their potential\nas tools for enhancing learning environments. The research also explores the\nintegration of emerging technologies, such as artificial intelligence, into\nthese systems, allowing robots to learn from each other and adapt their\nbehavior. The findings suggest that NAO and Pepper can significantly contribute\nto both technical learning and the development of social and emotional skills\nin students, offering innovative pedagogical approaches through the use of\nhumanoid robotics.\n","authors":["Moraes Pablo","Peters Christopher","Rodríguez Mónica","Sodre Hiago","Mazondo Ahilen","Sandin Vincent","Barcelona Sebastian","Moraes William","Fernández Santiago","Assunção Nathalie","de Vargas Bruna","Dörnbach Tobias","Kelbouscas André","Grando Ricardo"],"pdf_url":"https://arxiv.org/pdf/2410.17450v1.pdf","comment":"in Portuguese language"},{"id":"http://arxiv.org/abs/2410.17448v1","updated":"2024-10-22T21:50:52Z","published":"2024-10-22T21:50:52Z","title":"In Context Learning and Reasoning for Symbolic Regression with Large\n Language Models","summary":" Large Language Models (LLMs) are transformer-based machine learning models\nthat have shown remarkable performance in tasks for which they were not\nexplicitly trained. Here, we explore the potential of LLMs to perform symbolic\nregression -- a machine-learning method for finding simple and accurate\nequations from datasets. We prompt GPT-4 to suggest expressions from data,\nwhich are then optimized and evaluated using external Python tools. 
These\nresults are fed back to GPT-4, which proposes improved expressions while\noptimizing for complexity and loss. Using chain-of-thought prompting, we\ninstruct GPT-4 to analyze the data, prior expressions, and the scientific\ncontext (expressed in natural language) for each problem before generating new\nexpressions. We evaluated the workflow in rediscovery of five well-known\nscientific equations from experimental data, and on an additional dataset\nwithout a known equation. GPT-4 successfully rediscovered all five equations,\nand in general, performed better when prompted to use a scratchpad and consider\nscientific context. We also demonstrate how strategic prompting improves the\nmodel's performance and how the natural language interface simplifies\nintegrating theory with data. Although this approach does not outperform\nestablished SR programs where target equations are more complex, LLMs can\nnonetheless iterate toward improved solutions while following instructions and\nincorporating scientific context in natural language.\n","authors":["Samiha Sharlin","Tyler R. Josephson"],"pdf_url":"https://arxiv.org/pdf/2410.17448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17439v1","updated":"2024-10-22T21:30:58Z","published":"2024-10-22T21:30:58Z","title":"Evaluating AI-Generated Essays with GRE Analytical Writing Assessment","summary":" The recent revolutionary advance in generative AI enables the generation of\nrealistic and coherent texts by large language models (LLMs). Despite many\nexisting evaluation metrics on the quality of the generated texts, there is\nstill a lack of rigorous assessment of how well LLMs perform in complex and\ndemanding writing assessments. This study examines essays generated by ten\nleading LLMs for the analytical writing assessment of the Graduate Record Exam\n(GRE). We assessed these essays using both human raters and the e-rater\nautomated scoring engine as used in the GRE scoring pipeline. Notably, the\ntop-performing GPT-4o received an average score of 4.67, falling between\n\"generally thoughtful, well-developed analysis of the issue and conveys meaning\nclearly\" and \"presents a competent analysis of the issue and conveys meaning\nwith acceptable clarity\" according to the GRE scoring guideline. We also\nevaluated the detection accuracy of these essays, with detectors trained on\nessays generated by the same and different LLMs.\n","authors":["Yang Zhong","Jiangang Hao","Michael Fauss","Chen Li","Yuan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.17439v1.pdf","comment":"20 pages, 6 figures"},{"id":"http://arxiv.org/abs/2410.17423v1","updated":"2024-10-22T20:52:51Z","published":"2024-10-22T20:52:51Z","title":"Artificial Intelligence in Brazilian News: A Mixed-Methods Analysis","summary":" The current surge in Artificial Intelligence (AI) interest, reflected in\nheightened media coverage since 2009, has sparked significant debate on AI's\nimplications for privacy, social justice, workers' rights, and democracy. The\nmedia plays a crucial role in shaping public perception and acceptance of AI\ntechnologies. However, research into how AI appears in media has primarily\nfocused on anglophone contexts, leaving a gap in understanding how AI is\nrepresented globally. This study addresses this gap by analyzing 3,560 news\narticles from Brazilian media published between July 1, 2023, and February 29,\n2024, from 13 popular online news outlets. 
Using Computational Grounded Theory\n(CGT), the study applies Latent Dirichlet Allocation (LDA), BERTopic, and\nNamed-Entity Recognition to investigate the main topics in AI coverage and the\nentities represented. The findings reveal that Brazilian news coverage of AI is\ndominated by topics related to applications in the workplace and product\nlaunches, with limited space for societal concerns, which mostly focus on\ndeepfakes and electoral integrity. The analysis also highlights a significant\npresence of industry-related entities, indicating a strong influence of\ncorporate agendas in the country's news. This study underscores the need for a\nmore critical and nuanced discussion of AI's societal impacts in Brazilian\nmedia.\n","authors":["Raphael Hernandes","Giulio Corsi"],"pdf_url":"https://arxiv.org/pdf/2410.17423v1.pdf","comment":"18 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2407.12843v2","updated":"2024-10-22T20:50:56Z","published":"2024-07-04T15:10:51Z","title":"NutriBench: A Dataset for Evaluating Large Language Models in\n Carbohydrate Estimation from Meal Descriptions","summary":" Accurate nutrition estimation helps people make informed dietary choices and\nis essential in the prevention of serious health complications. We present\nNutriBench, the first publicly available natural language meal description\nnutrition benchmark. NutriBench consists of 11,857 meal descriptions generated\nfrom real-world global dietary intake data. The data is human-verified and\nannotated with macro-nutrient labels, including carbohydrates, proteins, fats,\nand calories. We conduct an extensive evaluation of NutriBench on the task of\ncarbohydrate estimation, testing twelve leading Large Language Models (LLMs),\nincluding GPT-4o, Llama3.1, Qwen2, Gemma2, and OpenBioLLM models, using\nstandard, Chain-of-Thought and Retrieval-Augmented Generation strategies.\nAdditionally, we present a study involving professional nutritionists, finding\nthat LLMs can provide more accurate and faster estimates. Finally, we perform a\nreal-world risk assessment by simulating the effect of carbohydrate predictions\non the blood glucose levels of individuals with diabetes. Our work highlights\nthe opportunities and challenges of using LLMs for nutrition estimation,\ndemonstrating their potential to aid professionals and laypersons and improve\nhealth outcomes. Our benchmark is publicly available at:\nhttps://mehak126.github.io/nutribench.html\n","authors":["Andong Hua","Mehak Preet Dhaliwal","Ryan Burke","Yao Qin"],"pdf_url":"https://arxiv.org/pdf/2407.12843v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17413v1","updated":"2024-10-22T20:39:21Z","published":"2024-10-22T20:39:21Z","title":"Scalable Influence and Fact Tracing for Large Language Model Pretraining","summary":" Training data attribution (TDA) methods aim to attribute model outputs back\nto specific training examples, and the application of these methods to large\nlanguage model (LLM) outputs could significantly advance model transparency and\ndata curation. However, it has been challenging to date to apply these methods\nto the full scale of LLM pretraining. In this paper, we refine existing\ngradient-based methods to work effectively at scale, allowing us to retrieve\ninfluential examples for an 8B-parameter language model from a pretraining\ncorpus of over 160B tokens with no need for subsampling or pre-filtering. 
Our\nmethod combines several techniques, including optimizer state correction, a\ntask-specific Hessian approximation, and normalized encodings, which we find to\nbe critical for performance at scale. In quantitative evaluations on a fact\ntracing task, our method performs best at identifying examples that influence\nmodel predictions, but classical, model-agnostic retrieval methods such as BM25\nstill perform better at finding passages which explicitly contain relevant\nfacts. These results demonstrate a misalignment between factual attribution and\ncausal influence. With increasing model size and training tokens, we find that\ninfluence more closely aligns with attribution. Finally, we examine different\ntypes of examples identified as influential by our method, finding that while\nmany directly entail a particular fact, others support the same output by\nreinforcing priors on relation types, common entities, and names.\n","authors":["Tyler A. Chang","Dheeraj Rajagopal","Tolga Bolukbasi","Lucas Dixon","Ian Tenney"],"pdf_url":"https://arxiv.org/pdf/2410.17413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17401v1","updated":"2024-10-22T20:18:26Z","published":"2024-10-22T20:18:26Z","title":"AdvWeb: Controllable Black-box Attacks on VLM-powered Web Agents","summary":" Vision Language Models (VLMs) have revolutionized the creation of generalist\nweb agents, empowering them to autonomously complete diverse tasks on\nreal-world websites, thereby boosting human efficiency and productivity.\nHowever, despite their remarkable capabilities, the safety and security of\nthese agents against malicious attacks remain critically underexplored, raising\nsignificant concerns about their safe deployment. To uncover and exploit such\nvulnerabilities in web agents, we provide AdvWeb, a novel black-box attack\nframework designed against web agents. AdvWeb trains an adversarial prompter\nmodel that generates and injects adversarial prompts into web pages, misleading\nweb agents into executing targeted adversarial actions such as inappropriate\nstock purchases or incorrect bank transactions, actions that could lead to\nsevere real-world consequences. With only black-box access to the web agent, we\ntrain and optimize the adversarial prompter model using DPO, leveraging both\nsuccessful and failed attack strings against the target agent. Unlike prior\napproaches, our adversarial string injection maintains stealth and control: (1)\nthe appearance of the website remains unchanged before and after the attack,\nmaking it nearly impossible for users to detect tampering, and (2) attackers\ncan modify specific substrings within the generated adversarial string to\nseamlessly change the attack objective (e.g., purchasing stocks from a\ndifferent company), enhancing attack flexibility and efficiency. We conduct\nextensive evaluations, demonstrating that AdvWeb achieves high success rates in\nattacking SOTA GPT-4V-based VLM agent across various web tasks. 
Our findings\nexpose critical vulnerabilities in current LLM/VLM-based agents, emphasizing\nthe urgent need for developing more reliable web agents and effective defenses.\nOur code and data are available at https://ai-secure.github.io/AdvWeb/ .\n","authors":["Chejian Xu","Mintong Kang","Jiawei Zhang","Zeyi Liao","Lingbo Mo","Mengqi Yuan","Huan Sun","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2410.17401v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2405.06626v2","updated":"2024-10-22T20:05:32Z","published":"2024-05-10T17:40:02Z","title":"Characterizing the Accuracy -- Efficiency Trade-off of Low-rank\n Decomposition in Language Models","summary":" Recent large language models (LLMs) employ billions of parameters to enable\nbroad problem-solving capabilities. Such language models also tend to be\nmemory-bound because of the dominance of matrix-vector and matrix-matrix\nmultiplications with low arithmetic intensity. Therefore, optimizing the memory\nfootprint and traffic is an important optimization direction for LLMs today.\nModel compression methods such as quantization and parameter pruning have been\nactively explored to achieve memory footprint and traffic optimization.\nHowever, the accuracy-efficiency trade-off of rank pruning (i.e., low-rank\ndecomposition) for LLMs is not well-understood yet. Therefore, in this work, we\ncharacterize the accuracy-efficiency trade-off of a low-rank decomposition\nmethod, specifically Tucker decomposition, on recent language models, including\nan open-source LLM, Llama 2. We formalize the low-rank decomposition design\nspace and show that the decomposition design space is enormous (e.g.,\nO($2^{39}$) for Llama2-7B). To navigate such a vast design space, we formulate\nit and perform thorough case studies of accuracy-efficiency trade-offs using\nsix widely used LLM benchmarks on BERT and Llama 2 models. Our results show\nthat we can achieve a 9\\% model size reduction with minimal accuracy drops,\nwhich range from 4\\%p (\\%p refers to \"percentage point,\" which refers to the\nabsolute difference between two percentage numbers; 74\\% -> 78\\% = 4\\%p\nincrease) to 10\\%p, depending on the difficulty of the benchmark, without any\nretraining to recover accuracy after decomposition. The results show that\nlow-rank decomposition can be a promising direction for LLM-based applications\nthat require real-time service at scale (e.g., AI agent and real-time coding\nassistant), where the latency is as important as the model accuracy.\n","authors":["Chakshu Moar","Faraz Tahmasebi","Michael Pellauer","Hyoukjun Kwon"],"pdf_url":"https://arxiv.org/pdf/2405.06626v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12856v4","updated":"2024-10-22T19:53:58Z","published":"2024-05-21T15:13:12Z","title":"LLM Processes: Numerical Predictive Distributions Conditioned on Natural\n Language","summary":" Machine learning practitioners often face significant challenges in formally\nintegrating their prior knowledge and beliefs into predictive models, limiting\nthe potential for nuanced and context-aware analyses. Moreover, the expertise\nneeded to integrate this prior knowledge into probabilistic modeling typically\nlimits the application of these models to specialists. Our goal is to build a\nregression model that can process numerical data and make probabilistic\npredictions at arbitrary locations, guided by natural language text which\ndescribes a user's prior knowledge. 
Large Language Models (LLMs) provide a\nuseful starting point for designing such a tool since they 1) provide an\ninterface where users can incorporate expert insights in natural language and\n2) provide an opportunity for leveraging latent problem-relevant knowledge\nencoded in LLMs that users may not have themselves. We start by exploring\nstrategies for eliciting explicit, coherent numerical predictive distributions\nfrom LLMs. We examine these joint predictive distributions, which we call LLM\nProcesses, over arbitrarily-many quantities in settings such as forecasting,\nmulti-dimensional regression, black-box optimization, and image modeling. We\ninvestigate the practical details of prompting to elicit coherent predictive\ndistributions, and demonstrate their effectiveness at regression. Finally, we\ndemonstrate the ability to usefully incorporate text into numerical\npredictions, improving predictive performance and giving quantitative structure\nthat reflects qualitative descriptions. This lets us begin to explore the rich,\ngrounded hypothesis space that LLMs implicitly encode.\n","authors":["James Requeima","John Bronskill","Dami Choi","Richard E. Turner","David Duvenaud"],"pdf_url":"https://arxiv.org/pdf/2405.12856v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17385v1","updated":"2024-10-22T19:39:15Z","published":"2024-10-22T19:39:15Z","title":"Do Vision-Language Models Represent Space and How? Evaluating Spatial\n Frame of Reference Under Ambiguities","summary":" Spatial expressions in situated communication can be ambiguous, as their\nmeanings vary depending on the frames of reference (FoR) adopted by speakers\nand listeners. While spatial language understanding and reasoning by\nvision-language models (VLMs) have gained increasing attention, potential\nambiguities in these models are still under-explored. To address this issue, we\npresent the COnsistent Multilingual Frame Of Reference Test (COMFORT), an\nevaluation protocol to systematically assess the spatial reasoning capabilities\nof VLMs. We evaluate nine state-of-the-art VLMs using COMFORT. Despite showing\nsome alignment with English conventions in resolving ambiguities, our\nexperiments reveal significant shortcomings of VLMs: notably, the models (1)\nexhibit poor robustness and consistency, (2) lack the flexibility to\naccommodate multiple FoRs, and (3) fail to adhere to language-specific or\nculture-specific conventions in cross-lingual tests, as English tends to\ndominate other languages. With a growing effort to align vision-language models\nwith human cognitive intuitions, we call for more attention to the ambiguous\nnature and cross-cultural diversity of spatial reasoning.\n","authors":["Zheyuan Zhang","Fengyuan Hu","Jayjun Lee","Freda Shi","Parisa Kordjamshidi","Joyce Chai","Ziqiao Ma"],"pdf_url":"https://arxiv.org/pdf/2410.17385v1.pdf","comment":"Accepted to Pluralistic Alignment @ NeurIPS 2024 | Project page:\n https://spatial-comfort.github.io/"},{"id":"http://arxiv.org/abs/2410.17375v1","updated":"2024-10-22T19:15:35Z","published":"2024-10-22T19:15:35Z","title":"AMUSD: Asynchronous Multi-Device Speculative Decoding for LLM\n Acceleration","summary":" Large language models typically generate tokens autoregressively, using each\ntoken as input for the next. Recent work on Speculative Decoding has sought to\naccelerate this process by employing a smaller, faster draft model to more\nquickly generate candidate tokens. 
These candidates are then verified in\nparallel by the larger (original) verify model, resulting in overall speedup\ncompared to using the larger model by itself in an autoregressive fashion. In\nthis work, we introduce AMUSD (Asynchronous Multi-device Speculative Decoding),\na system that further accelerates generation by decoupling the draft and verify\nphases into a continuous, asynchronous approach. Unlike conventional\nspeculative decoding, where only one model (draft or verify) performs token\ngeneration at a time, AMUSD enables both models to perform predictions\nindependently on separate devices (e.g., GPUs). We evaluate our approach over\nmultiple datasets and show that AMUSD achieves an average 29% improvement over\nspeculative decoding and up to 1.96$\\times$ speedup over conventional\nautoregressive decoding, while achieving identical output quality. Our system\nis open-source and available at https://github.com/BradMcDanel/AMUSD/.\n","authors":["Bradley McDanel"],"pdf_url":"https://arxiv.org/pdf/2410.17375v1.pdf","comment":"4 pages, 5 figures, 1 table, 1 algorithm"},{"id":"http://arxiv.org/abs/2405.11724v2","updated":"2024-10-22T19:07:08Z","published":"2024-05-20T01:57:34Z","title":"Token-wise Influential Training Data Retrieval for Large Language Models","summary":" Given a Large Language Model (LLM) generation, how can we identify which\ntraining data led to this generation? In this paper, we proposed RapidIn, a\nscalable framework adapting to LLMs for estimating the influence of each\ntraining data. The proposed framework consists of two stages: caching and\nretrieval. First, we compress the gradient vectors by over 200,000x, allowing\nthem to be cached on disk or in GPU/CPU memory. Then, given a generation,\nRapidIn efficiently traverses the cached gradients to estimate the influence\nwithin minutes, achieving over a 6,326x speedup. Moreover, RapidIn supports\nmulti-GPU parallelization to substantially accelerate caching and retrieval.\nOur empirical result confirms the efficiency and effectiveness of RapidIn.\n","authors":["Huawei Lin","Jikai Long","Zhaozhuo Xu","Weijie Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.11724v2.pdf","comment":"Accepted to ACL 2024. Keywords: Influence Function, Influence\n Estimation, Training Data Attribution"},{"id":"http://arxiv.org/abs/2402.13459v2","updated":"2024-10-22T19:01:35Z","published":"2024-02-21T01:30:03Z","title":"Learning to Poison Large Language Models During Instruction Tuning","summary":" The advent of Large Language Models (LLMs) has marked significant\nachievements in language processing and reasoning capabilities. Despite their\nadvancements, LLMs face vulnerabilities to data poisoning attacks, where\nadversaries insert backdoor triggers into training data to manipulate outputs\nfor malicious purposes. This work further identifies additional security risks\nin LLMs by designing a new data poisoning attack tailored to exploit the\ninstruction tuning process. We propose a novel gradient-guided backdoor trigger\nlearning (GBTL) algorithm to identify adversarial triggers efficiently,\nensuring an evasion of detection by conventional defenses while maintaining\ncontent integrity. Through experimental validation across various tasks,\nincluding sentiment analysis, domain generation, and question answering, our\npoisoning strategy demonstrates a high success rate in compromising various\nLLMs' outputs. 
We further propose two defense strategies against data poisoning\nattacks, including in-context learning (ICL) and continuous learning (CL),\nwhich effectively rectify the behavior of LLMs and significantly reduce the\ndecline in performance. Our work highlights the significant security risks\npresent during the instruction tuning of LLMs and emphasizes the necessity of\nsafeguarding LLMs against data poisoning attacks.\n","authors":["Yao Qiang","Xiangyu Zhou","Saleh Zare Zade","Mohammad Amin Roshani","Prashant Khanduri","Douglas Zytko","Dongxiao Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.13459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15226v2","updated":"2024-10-22T18:54:23Z","published":"2024-10-19T22:14:07Z","title":"On the Diversity of Synthetic Data and its Impact on Training Large\n Language Models","summary":" The rise of Large Language Models (LLMs) has accentuated the need for\ndiverse, high-quality pre-training data. Synthetic data emerges as a viable\nsolution to the challenges of data scarcity and inaccessibility. While previous\nliterature has focused predominantly on the quality and quantity of real data,\nour work enables the measurement of diversity in synthetic data and explores\nits impact on LLM performance. We study the downstream effects of synthetic\ndata diversity during both the pre-training and fine-tuning stages by\nintroducing a new diversity metric, \\textit{LLM cluster-agent}, designed to\nevaluate the diversity of synthetic datasets. Through a series of controlled\nexperiments with models of 350M and 1.4B parameters, we demonstrate that the\nproposed cluster-based LLM scoring of diversity correlates positively with both\npre-training and supervised fine-tuning performance. Our findings also reveal\nthat synthetic data diversity in pre-training affects supervised fine-tuning\nmore significantly than pre-training itself, even for smaller models. We hope\nthis study advances our understanding of the optimal use of synthetic data in\nLLM training and opens new avenues for efficient data generation processes.\n","authors":["Hao Chen","Abdul Waheed","Xiang Li","Yidong Wang","Jindong Wang","Bhiksha Raj","Marah I. Abdin"],"pdf_url":"https://arxiv.org/pdf/2410.15226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17542v3","updated":"2024-10-22T18:51:01Z","published":"2024-06-25T13:29:14Z","title":"CDQuant: Greedy Coordinate Descent for Accurate LLM Quantization","summary":" Large language models (LLMs) have recently demonstrated remarkable\nperformance across diverse language tasks. But their deployment is often\nconstrained by their substantial computational and storage requirements.\nQuantization has emerged as a key technique for addressing this challenge,\nenabling the compression of large models with minimal impact on performance.\nThe recent GPTQ algorithm, a post-training quantization (PTQ) method, has\nproven highly effective for compressing LLMs, sparking a wave of research that\nleverages GPTQ as a core component. Recognizing the pivotal role of GPTQ in the\nPTQ landscape, we introduce CDQuant, a simple and scalable alternative to GPTQ\nwith improved performance. CDQuant uses greedy coordinate descent to minimize\nthe layer-wise reconstruction loss to achieve high-quality quantized weights.\nOur algorithm is easy to implement and scales efficiently to models with\nhundreds of billions of parameters. 
We perform extensive evaluation on the Gemma\nand PaLM2 model families, and demonstrate that CDQuant consistently outperforms\nGPTQ in 2-4 bit weight quantization. Moreover, CDQuant improves the performance\nof state-of-the-art PTQ techniques such as QuIP and FrameQuant when used as a\nreplacement for their GPTQ component, resulting in further gains in quality.\n","authors":["Pranav Ajit Nair","Arun Sai Suggala"],"pdf_url":"https://arxiv.org/pdf/2406.17542v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17355v1","updated":"2024-10-22T18:47:46Z","published":"2024-10-22T18:47:46Z","title":"All Entities are Not Created Equal: Examining the Long Tail for\n Fine-Grained Entity Typing","summary":" Pre-trained language models (PLMs) are trained on large amounts of data,\nwhich helps capture world knowledge alongside linguistic competence. Due to\nthis, they are extensively used for ultra-fine entity typing tasks, where they\nprovide the entity knowledge held in their parameter space. Given that PLMs learn\nfrom co-occurrence patterns, they likely contain more or less knowledge\nabout entities depending on how frequent they are in the\npre-training data. In this work, we probe PLMs to elicit encoded entity\nprobabilities and demonstrate that they highly correlate with their frequency\nin large-scale internet data. Then, we demonstrate that entity-typing\napproaches that rely on PLMs struggle with entities at the long tail of the\ndistribution. Our findings suggest that we need to go beyond PLMs to produce\nsolutions that perform well for rare, new, or infrequent entities.\n","authors":["Advait Deshmukh","Ashwin Umadi","Dananjay Srinivas","Maria Leonor Pacheco"],"pdf_url":"https://arxiv.org/pdf/2410.17355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17678v5","updated":"2024-10-22T18:26:51Z","published":"2024-07-25T00:27:07Z","title":"S2-Attention: Hardware-Aware Context Sharding Among Attention Heads","summary":" Sparse attention, which selectively attends to a subset of tokens in the\ncontext, was supposed to be efficient. However, its theoretical reduction in\nFLOPs has rarely translated into wall-clock speed-up over its dense attention\ncounterparts due to the lack of hardware-aware optimizations like\nFlashAttention. Meanwhile, it remains unclear whether sparse attention can\nmaintain the model's quality at the scale of today's large language models (LLMs)\nand how. This paper presents Sparsely-Sharded (S2) Attention, a Triton library\nthat provides kernel optimization for sparse attention customizable at both\nper-head and per-context-range levels. S2-Attention enables the exploration of\nnovel and high-performance sparse attention techniques, which we demonstrate\nthrough extensive ablations across a wide range of sparse attention designs at\nvarious model scales. From these insights, we present several basic guidelines\nto design sparse attention that can achieve not only practical efficiency\nimprovements, but also strong downstream performance. To achieve high\nparallelization and optimized memory IO, sparse attention should shard the\ncontext heterogeneously across attention heads, where each head attends to a\ndifferent subset of tokens while collectively covering the full context.\nMeanwhile, we find hybrid architectures combining sparse and dense attention\nparticularly beneficial in practice. 
S2-Attention achieves wall-clock speedup\nof 8.79X, 15.87X, 25.3X compared to the strong FlashAttention-2 baseline with\nstrong downstream performance on-par with full attention and perfect retrieval\nperformance at a 128k context length. At inference, for 7B models, our model,\nwith the help of our S2-Attention kernel, achieves 4.5x speed-up compared to\ndense counterparts. S2-Attention is released with easy-to-customize APIs for\ndirect usage in Megatron and vLLM.\n","authors":["Xihui Lin","Yunan Zhang","Suyu Ge","Liliang Ren","Barun Patra","Vishrav Chaudhary","Hao Peng","Xia Song"],"pdf_url":"https://arxiv.org/pdf/2407.17678v5.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2410.02024v3","updated":"2024-10-22T18:22:11Z","published":"2024-10-02T20:45:51Z","title":"FLAG: Financial Long Document Classification via AMR-based GNN","summary":" The advent of large language models (LLMs) has initiated much research into\ntheir various financial applications. However, in applying LLMs on long\ndocuments, semantic relations are not explicitly incorporated, and a full or\narbitrarily sparse attention operation is employed. In recent years, progress\nhas been made in Abstract Meaning Representation (AMR), which is a graph-based\nrepresentation of text to preserve its semantic relations. Since AMR can\nrepresent semantic relationships at a deeper level, it can be beneficially\nutilized by graph neural networks (GNNs) for constructing effective\ndocument-level graph representations built upon LLM embeddings to predict\ntarget metrics in the financial domain. We propose FLAG: Financial Long\ndocument classification via AMR-based GNN, an AMR graph based framework to\ngenerate document-level embeddings for long financial document classification.\nWe construct document-level graphs from sentence-level AMR graphs, endow them\nwith specialized LLM word embeddings in the financial domain, apply a deep\nlearning mechanism that utilizes a GNN, and examine the efficacy of our\nAMR-based approach in predicting labeled target data from long financial\ndocuments. Extensive experiments are conducted on a dataset of quarterly\nearnings calls transcripts of companies in various sectors of the economy, as\nwell as on a corpus of more recent earnings calls of companies in the S&P 1500\nComposite Index. We find that our AMR-based approach outperforms fine-tuning\nLLMs directly on text in predicting stock price movement trends at different\ntime horizons in both datasets. Our work also outperforms previous work\nutilizing document graphs and GNNs for text classification.\n","authors":["Bolun \"Namir\" Xia","Aparna Gupta","Mohammed J. Zaki"],"pdf_url":"https://arxiv.org/pdf/2410.02024v3.pdf","comment":"8 pages, 3 figures, to be published in CIFEr Conference 2024 as\n \"Semantic Graph Learning for Trend Prediction from Long Financial Documents\""},{"id":"http://arxiv.org/abs/2410.17337v1","updated":"2024-10-22T18:11:43Z","published":"2024-10-22T18:11:43Z","title":"Captions Speak Louder than Images (CASLIE): Generalizing Foundation\n Models for E-commerce from High-quality Multimodal Instruction Data","summary":" Leveraging multimodal data to drive breakthroughs in e-commerce applications\nthrough Multimodal Foundation Models (MFMs) is gaining increasing attention\nfrom the research community. 
However, there are significant challenges that\nhinder the optimal use of multimodal e-commerce data by foundation models: (1)\nthe scarcity of large-scale, high-quality multimodal benchmark datasets; and\n(2) the lack of effective multimodal information integration methods. To\naddress these challenges, in this paper, we introduce MMECInstruct, the\nfirst-ever, large-scale, and high-quality multimodal instruction dataset for\ne-commerce. We also develop CASLIE, a simple, lightweight, yet effective\nframework for integrating multimodal information for e-commerce. Leveraging\nMMECInstruct, we fine-tune a series of e-commerce MFMs within CASLIE, denoted\nas CASLIE models. Our comprehensive evaluation demonstrates that CASLIE models\nsubstantially outperform 5 categories of advanced baseline models in the\nin-domain evaluation. Moreover, CASLIE models show strong generalizability to\nout-of-domain settings. MMECInstruct and CASLIE models are publicly accessible\nthrough https://ninglab.github.io/CASLIE/.\n","authors":["Xinyi Ling","Bo Peng","Hanwen Du","Zhihui Zhu","Xia Ning"],"pdf_url":"https://arxiv.org/pdf/2410.17337v1.pdf","comment":"Xinyi Ling and Bo Peng contributed equally to this paper"},{"id":"http://arxiv.org/abs/2410.17333v1","updated":"2024-10-22T18:08:25Z","published":"2024-10-22T18:08:25Z","title":"Are Large Language Models Ready for Travel Planning?","summary":" While large language models (LLMs) show promise in hospitality and tourism,\ntheir ability to provide unbiased service across demographic groups remains\nunclear. This paper explores gender and ethnic biases when LLMs are utilized as\ntravel planning assistants. To investigate this issue, we apply machine\nlearning techniques to analyze travel suggestions generated from three\nopen-source LLMs. Our findings reveal that the performance of race and gender\nclassifiers substantially exceeds random chance, indicating differences in how\nLLMs engage with varied subgroups. Specifically, outputs align with cultural\nexpectations tied to certain races and genders. To minimize the effect of these\nstereotypes, we used a stop-word classification strategy, which decreased\nidentifiable differences, with no disrespectful terms found. However,\nhallucinations related to African American and gender minority groups were\nnoted. In conclusion, while LLMs can generate travel plans seemingly free from\nbias, it remains essential to verify the accuracy and appropriateness of their\nrecommendations.\n","authors":["Ruiping Ren","Xing Yao","Shu Cole","Haining Wang"],"pdf_url":"https://arxiv.org/pdf/2410.17333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17309v1","updated":"2024-10-22T18:00:00Z","published":"2024-10-22T18:00:00Z","title":"Literature Meets Data: A Synergistic Approach to Hypothesis Generation","summary":" AI holds promise for transforming scientific processes, including hypothesis\ngeneration. Prior work on hypothesis generation can be broadly categorized into\ntheory-driven and data-driven approaches. While both have proven effective in\ngenerating novel and plausible hypotheses, it remains an open question whether\nthey can complement each other. To address this, we develop the first method\nthat combines literature-based insights with data to perform LLM-powered\nhypothesis generation. We apply our method on five different datasets and\ndemonstrate that integrating literature and data outperforms other baselines\n(8.97\\% over few-shot, 15.75\\% over literature-based alone, and 3.37\\% over\ndata-driven alone). 
Additionally, we conduct the first human evaluation to\nassess the utility of LLM-generated hypotheses in assisting human\ndecision-making on two challenging tasks: deception detection and AI generated\ncontent detection. Our results show that human accuracy improves significantly\nby 7.44\\% and 14.19\\% on these tasks, respectively. These findings suggest that\nintegrating literature-based and data-driven approaches provides a\ncomprehensive and nuanced framework for hypothesis generation and could open\nnew avenues for scientific inquiry.\n","authors":["Haokun Liu","Yangqiaoyu Zhou","Mingxuan Li","Chenfei Yuan","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2410.17309v1.pdf","comment":"30 pages, 7 figures, code link:\n https://github.com/ChicagoHAI/hypothesis-generation"},{"id":"http://arxiv.org/abs/2410.17251v1","updated":"2024-10-22T17:59:57Z","published":"2024-10-22T17:59:57Z","title":"Altogether: Image Captioning via Re-aligning Alt-text","summary":" This paper focuses on creating synthetic data to improve the quality of image\ncaptions. Existing works typically have two shortcomings. First, they caption\nimages from scratch, ignoring existing alt-text metadata, and second, lack\ntransparency if the captioners' training data (e.g. GPT) is unknown. In this\npaper, we study a principled approach Altogether based on the key idea to edit\nand re-align existing alt-texts associated with the images. To generate\ntraining data, we perform human annotation where annotators start with the\nexisting alt-text and re-align it to the image content in multiple rounds,\nconsequently constructing captions with rich visual concepts. This differs from\nprior work that carries out human annotation as a one-time description task\nsolely based on images and annotator knowledge. We train a captioner on this\ndata that generalizes the process of re-aligning alt-texts at scale. Our\nresults show our Altogether approach leads to richer image captions that also\nimprove text-to-image generation and zero-shot image classification tasks.\n","authors":["Hu Xu","Po-Yao Huang","Xiaoqing Ellen Tan","Ching-Feng Yeh","Jacob Kahn","Christine Jou","Gargi Ghosh","Omer Levy","Luke Zettlemoyer","Wen-tau Yih","Shang-Wen Li","Saining Xie","Christoph Feichtenhofer"],"pdf_url":"https://arxiv.org/pdf/2410.17251v1.pdf","comment":"accepted by EMNLP 2024; MetaCLIPv2"},{"id":"http://arxiv.org/abs/2410.17250v1","updated":"2024-10-22T17:59:56Z","published":"2024-10-22T17:59:56Z","title":"JMMMU: A Japanese Massive Multi-discipline Multimodal Understanding\n Benchmark for Culture-aware Evaluation","summary":" Accelerating research on Large Multimodal Models (LMMs) in non-English\nlanguages is crucial for enhancing user experiences across broader populations.\nIn this paper, we introduce JMMMU (Japanese MMMU), the first large-scale\nJapanese benchmark designed to evaluate LMMs on expert-level tasks based on the\nJapanese cultural context. To facilitate comprehensive culture-aware\nevaluation, JMMMU features two complementary subsets: (i) culture-agnostic (CA)\nsubset, where the culture-independent subjects (e.g., Math) are selected and\ntranslated into Japanese, enabling one-to-one comparison with its English\ncounterpart MMMU; and (ii) culture-specific (CS) subset, comprising newly\ncrafted subjects that reflect Japanese cultural context. Using the CA subset,\nwe observe performance drop in many LMMs when evaluated in Japanese, which is\npurely attributable to language variation. 
Using the CS subset, we reveal their\ninadequate Japanese cultural understanding. Further, by combining both subsets,\nwe identify that some LMMs perform well on the CA subset but not on the CS\nsubset, exposing a shallow understanding of the Japanese language that lacks\ndepth in cultural understanding. We hope this work will not only help advance\nLMM performance in Japanese but also serve as a guideline to create\nhigh-standard, culturally diverse benchmarks for multilingual LMM development.\nThe project page is https://mmmu-japanese-benchmark.github.io/JMMMU/.\n","authors":["Shota Onohara","Atsuyuki Miyai","Yuki Imajuku","Kazuki Egashira","Jeonghun Baek","Xiang Yue","Graham Neubig","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2410.17250v1.pdf","comment":"Project page: https://mmmu-japanese-benchmark.github.io/JMMMU/"},{"id":"http://arxiv.org/abs/2410.17247v1","updated":"2024-10-22T17:59:53Z","published":"2024-10-22T17:59:53Z","title":"PyramidDrop: Accelerating Your Large Vision-Language Models via Pyramid\n Visual Redundancy Reduction","summary":" In large vision-language models (LVLMs), images serve as inputs that carry a\nwealth of information. As the idiom \"A picture is worth a thousand words\"\nimplies, representing a single image in current LVLMs can require hundreds or\neven thousands of tokens. This results in significant computational costs,\nwhich grow quadratically as input image resolution increases, thereby severely\nimpacting the efficiency of both training and inference. Previous approaches\nhave attempted to reduce the number of image tokens either before or within the\nearly layers of LVLMs. However, these strategies inevitably result in the loss\nof crucial image information, ultimately diminishing model performance. To\naddress this challenge, we conduct an empirical study revealing that all visual\ntokens are necessary for LVLMs in the shallow layers, and token redundancy\nprogressively increases in the deeper layers of the model. To this end, we\npropose PyramidDrop, a visual redundancy reduction strategy for LVLMs to boost\ntheir efficiency in both training and inference with neglectable performance\nloss. Specifically, we partition the LVLM into several stages and drop part of\nthe image tokens at the end of each stage with a pre-defined ratio, creating\npyramid-like visual tokens across model layers. The dropping is based on a\nlightweight similarity calculation with a negligible time overhead. Extensive\nexperiments demonstrate that PyramidDrop can achieve a 40% training time and\n55% inference FLOPs acceleration of LLaVA-NeXT with comparable performance.\nBesides, the PyramidDrop could also serve as a plug-and-play strategy for\ninference acceleration without training, with better performance and lower\ninference cost than counterparts. We hope that the insights and approach\nintroduced by PyramidDrop will inspire future research to further investigate\nthe role of image tokens in LVLMs.\n","authors":["Long Xing","Qidong Huang","Xiaoyi Dong","Jiajie Lu","Pan Zhang","Yuhang Zang","Yuhang Cao","Conghui He","Jiaqi Wang","Feng Wu","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2410.17247v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2410.17245v1","updated":"2024-10-22T17:59:39Z","published":"2024-10-22T17:59:39Z","title":"Towards Reliable Evaluation of Behavior Steering Interventions in LLMs","summary":" Representation engineering methods have recently shown promise for enabling\nefficient steering of model behavior. 
However, evaluation pipelines for these\nmethods have primarily relied on subjective demonstrations, instead of\nquantitative, objective metrics. We aim to take a step towards addressing this\nissue by advocating for four properties missing from current evaluations: (i)\ncontexts sufficiently similar to downstream tasks should be used for assessing\nintervention quality; (ii) model likelihoods should be accounted for; (iii)\nevaluations should allow for standardized comparisons across different target\nbehaviors; and (iv) baseline comparisons should be offered. We introduce an\nevaluation pipeline grounded in these criteria, offering both a quantitative\nand visual analysis of how effectively a given method works. We use this\npipeline to evaluate two representation engineering methods on how effectively\nthey can steer behaviors such as truthfulness and corrigibility, finding that\nsome interventions are less effective than previously reported.\n","authors":["Itamar Pres","Laura Ruis","Ekdeep Singh Lubana","David Krueger"],"pdf_url":"https://arxiv.org/pdf/2410.17245v1.pdf","comment":"Accepted to the NeurIPS 2024 - Workshop on Foundation Model\n Interventions"},{"id":"http://arxiv.org/abs/2410.17238v1","updated":"2024-10-22T17:56:08Z","published":"2024-10-22T17:56:08Z","title":"SELA: Tree-Search Enhanced LLM Agents for Automated Machine Learning","summary":" Automated Machine Learning (AutoML) approaches encompass traditional methods\nthat optimize fixed pipelines for model selection and ensembling, as well as\nnewer LLM-based frameworks that autonomously build pipelines. While LLM-based\nagents have shown promise in automating machine learning tasks, they often\ngenerate low-diversity and suboptimal code, even after multiple iterations. To\novercome these limitations, we introduce Tree-Search Enhanced LLM Agents\n(SELA), an innovative agent-based system that leverages Monte Carlo Tree Search\n(MCTS) to optimize the AutoML process. By representing pipeline configurations\nas trees, our framework enables agents to conduct experiments intelligently and\niteratively refine their strategies, facilitating a more effective exploration\nof the machine learning solution space. This novel approach allows SELA to\ndiscover optimal pathways based on experimental feedback, improving the overall\nquality of the solutions. In an extensive evaluation across 20 machine learning\ndatasets, we compare the performance of traditional and agent-based AutoML\nmethods, demonstrating that SELA achieves a win rate of 65% to 80% against each\nbaseline across all datasets. These results underscore the significant\npotential of agent-based strategies in AutoML, offering a fresh perspective on\ntackling complex machine learning challenges.\n","authors":["Yizhou Chi","Yizhang Lin","Sirui Hong","Duyi Pan","Yaying Fei","Guanghao Mei","Bangbang Liu","Tianqi Pang","Jacky Kwok","Ceyao Zhang","Bang Liu","Chenglin Wu"],"pdf_url":"https://arxiv.org/pdf/2410.17238v1.pdf","comment":"The code is available at https://github.com/geekan/MetaGPT"},{"id":"http://arxiv.org/abs/2410.17236v1","updated":"2024-10-22T17:54:45Z","published":"2024-10-22T17:54:45Z","title":"Large Language Models Empowered Personalized Web Agents","summary":" Web agents have emerged as a promising direction to automate Web task\ncompletion based on user instructions, significantly enhancing user experience.\nRecently, Web agents have evolved from traditional agents to Large Language\nModels (LLMs)-based Web agents. 
Despite their success, existing LLM-based Web\nagents overlook the importance of personalized data (e.g., user profiles and\nhistorical Web behaviors) in assisting the understanding of users' personalized\ninstructions and executing customized actions. To overcome the limitation, we\nfirst formulate the task of LLM-empowered personalized Web agents, which\nintegrate personalized data and user instructions to personalize instruction\ncomprehension and action execution. To address the absence of a comprehensive\nevaluation benchmark, we construct a Personalized Web Agent Benchmark\n(PersonalWAB), featuring user instructions, personalized user data, Web\nfunctions, and two evaluation paradigms across three personalized Web tasks.\nMoreover, we propose a Personalized User Memory-enhanced Alignment (PUMA)\nframework to adapt LLMs to the personalized Web agent task. PUMA utilizes a\nmemory bank with a task-specific retrieval strategy to filter relevant\nhistorical Web behaviors. Based on the behaviors, PUMA then aligns LLMs for\npersonalized action execution through fine-tuning and direct preference\noptimization. Extensive experiments validate the superiority of PUMA over\nexisting Web agents on PersonalWAB.\n","authors":["Hongru Cai","Yongqi Li","Wenjie Wang","Fengbin Zhu","Xiaoyu Shen","Wenjie Li","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.17236v1.pdf","comment":"The code and data are available on the project website\n https://hongrucai.github.io/PersonalWAB/"},{"id":"http://arxiv.org/abs/2410.17235v1","updated":"2024-10-22T17:54:07Z","published":"2024-10-22T17:54:07Z","title":"Automated Spinal MRI Labelling from Reports Using a Large Language Model","summary":" We propose a general pipeline to automate the extraction of labels from\nradiology reports using large language models, which we validate on spinal MRI\nreports. The efficacy of our labelling method is measured on five distinct\nconditions: spinal cancer, stenosis, spondylolisthesis, cauda equina\ncompression and herniation. Using open-source models, our method equals or\nsurpasses GPT-4 on a held-out set of reports. Furthermore, we show that the\nextracted labels can be used to train imaging models to classify the identified\nconditions in the accompanying MR scans. All classifiers trained using\nautomated labels achieve comparable performance to models trained using scans\nmanually annotated by clinicians. Code can be found at\nhttps://github.com/robinyjpark/AutoLabelClassifier.\n","authors":["Robin Y. Park","Rhydian Windsor","Amir Jamaludin","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2410.17235v1.pdf","comment":"Accepted to Medical Image Computing and Computer Assisted\n Intervention (MICCAI 2024, Spotlight). 11 pages plus appendix"},{"id":"http://arxiv.org/abs/2410.17234v1","updated":"2024-10-22T17:54:03Z","published":"2024-10-22T17:54:03Z","title":"Fine-Tuning Large Language Models to Appropriately Abstain with Semantic\n Entropy","summary":" Large Language Models (LLMs) are known to hallucinate, whereby they generate\nplausible but inaccurate text. This phenomenon poses significant risks in\ncritical applications, such as medicine or law, necessitating robust\nhallucination mitigation strategies. While recent works have proposed\nfine-tuning methods to teach LLMs to abstain from answering questions beyond\ntheir knowledge or capabilities, these methods rely on the existence of\nground-truth labels or are limited to short-form responses. 
To address these\nlimitations, we propose fine-tuning using semantic entropy, an uncertainty\nmeasure derived from introspection into the model which does not require\nexternal labels. We demonstrate that our approach matches or outperforms models\nfine-tuned using prior work and achieves strong performance for both short and\nlong-form generations on a range of datasets.\n","authors":["Benedict Aaron Tjandra","Muhammed Razzak","Jannik Kossen","Kunal Handa","Yarin Gal"],"pdf_url":"https://arxiv.org/pdf/2410.17234v1.pdf","comment":"Accepted to NeurIPS Safe Generative AI Workshop 2024"},{"id":"http://arxiv.org/abs/2407.12883v2","updated":"2024-10-22T17:49:31Z","published":"2024-07-16T17:58:27Z","title":"BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive\n Retrieval","summary":" Existing retrieval benchmarks primarily consist of information-seeking\nqueries (e.g., aggregated questions from search engines) where keyword or\nsemantic-based retrieval is usually sufficient. However, many complex\nreal-world queries require in-depth reasoning to identify relevant documents\nthat go beyond surface form matching. For example, finding documentation for a\ncoding question requires understanding the logic and syntax of the functions\ninvolved. To better benchmark retrieval on such challenging queries, we\nintroduce BRIGHT, the first text retrieval benchmark that requires intensive\nreasoning to retrieve relevant documents. Our dataset consists of 1,384\nreal-world queries spanning diverse domains, such as economics, psychology,\nmathematics, and coding. These queries are drawn from naturally occurring and\ncarefully curated human data. Extensive evaluation reveals that even\nstate-of-the-art retrieval models perform poorly on BRIGHT. The leading model\non the MTEB leaderboard (Muennighoff et al., 2023), which achieves a score of\n59.0 nDCG@10, produces a score of nDCG@10 of 18.3 on BRIGHT. We show that\nincorporating explicit reasoning about the query improves retrieval performance\nby up to 12.2 points. Moreover, incorporating retrieved documents from the\ntop-performing retriever boosts question-answering performance by over 6.6\npoints. We believe that BRIGHT paves the way for future research on retrieval\nsystems in more realistic and challenging settings.\n","authors":["Hongjin Su","Howard Yen","Mengzhou Xia","Weijia Shi","Niklas Muennighoff","Han-yu Wang","Haisu Liu","Quan Shi","Zachary S. Siegel","Michael Tang","Ruoxi Sun","Jinsung Yoon","Sercan O. Arik","Danqi Chen","Tao Yu"],"pdf_url":"https://arxiv.org/pdf/2407.12883v2.pdf","comment":"48 pages"},{"id":"http://arxiv.org/abs/2410.17225v1","updated":"2024-10-22T17:47:05Z","published":"2024-10-22T17:47:05Z","title":"Dhoroni: Exploring Bengali Climate Change and Environmental Views with a\n Multi-Perspective News Dataset and Natural Language Processing","summary":" Climate change poses critical challenges globally, disproportionately\naffecting low-income countries that often lack resources and linguistic\nrepresentation on the international stage. Despite Bangladesh's status as one\nof the most vulnerable nations to climate impacts, research gaps persist in\nBengali-language studies related to climate change and NLP. 
To address this\ndisparity, we introduce Dhoroni, a novel Bengali (Bangla) climate change and\nenvironmental news dataset, comprising 2,300 annotated Bangla news articles,\noffering multiple perspectives such as political influence,\nscientific/statistical data, authenticity, stance detection, and stakeholder\ninvolvement. Furthermore, we present an in-depth exploratory analysis of\nDhoroni and introduce the BanglaBERT-Dhoroni family, a novel baseline model family\nfor climate and environmental opinion detection in Bangla, fine-tuned on our\ndataset. This research contributes significantly to enhancing accessibility and\nanalysis of climate discourse in Bengali (Bangla), addressing crucial\ncommunication and research gaps in climate-impacted regions like Bangladesh\nwith 180 million people.\n","authors":["Azmine Toushik Wasi","Wahid Faisal","Taj Ahmad","Abdur Rahman","Mst Rafia Islam"],"pdf_url":"https://arxiv.org/pdf/2410.17225v1.pdf","comment":"In Review"},{"id":"http://arxiv.org/abs/2410.17222v1","updated":"2024-10-22T17:45:47Z","published":"2024-10-22T17:45:47Z","title":"Context-aware Prompt Tuning: Advancing In-Context Learning with\n Adversarial Methods","summary":" Fine-tuning Large Language Models (LLMs) typically involves updating at least\na few billion parameters. A more parameter-efficient approach is Prompt\nTuning (PT), which updates only a few learnable tokens; differently,\nIn-Context Learning (ICL) adapts the model to a new task by simply including\nexamples in the input without any training. When applying optimization-based\nmethods, such as fine-tuning and PT for few-shot learning, the model is\nspecifically adapted to the small set of training examples, whereas ICL leaves\nthe model unchanged. This distinction makes traditional learning methods more\nprone to overfitting; in contrast, ICL is less sensitive to the few-shot\nscenario. While ICL is not prone to overfitting, it does not fully extract the\ninformation that exists in the training examples. This work introduces\nContext-aware Prompt Tuning (CPT), a method inspired by ICL, PT, and\nadversarial attacks. We build on the ICL strategy of concatenating examples\nbefore the input, but we extend this by PT-like learning, refining the context\nembedding through iterative optimization to extract deeper insights from the\ntraining examples. We carefully modify specific context tokens, considering the\nunique structure of input and output formats. Inspired by adversarial attacks,\nwe adjust the input based on the labels present in the context, focusing on\nminimizing, rather than maximizing, the loss. Moreover, we apply a projected\ngradient descent algorithm to keep token embeddings close to their original\nvalues, under the assumption that the user-provided data is inherently\nvaluable. Our method has been shown to achieve superior accuracy across\nmultiple classification tasks using various LLMs.\n","authors":["Tsachi Blau","Moshe Kimhi","Yonatan Belinkov","Alexander Bronstein","Chaim Baskin"],"pdf_url":"https://arxiv.org/pdf/2410.17222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17218v1","updated":"2024-10-22T17:43:39Z","published":"2024-10-22T17:43:39Z","title":"Creativity in AI: Progresses and Challenges","summary":" Creativity is the ability to produce novel, useful, and surprising ideas, and\nhas been widely studied as a crucial aspect of human cognition. Machine\ncreativity, on the other hand, has been a long-standing challenge. 
With the rise\nof advanced generative AI, there has been renewed interest and debate regarding\nAI's creative capabilities. Therefore, it is imperative to revisit the state of\ncreativity in AI and identify key progresses and remaining challenges. In this\nwork, we survey leading works studying the creative capabilities of AI systems,\nfocusing on creative problem-solving, linguistic, artistic, and scientific\ncreativity. Our review suggests that while the latest AI models are largely\ncapable of producing linguistically and artistically creative outputs such as\npoems, images, and musical pieces, they struggle with tasks that require\ncreative problem-solving, abstract thinking and compositionality and their\ngenerations suffer from a lack of diversity, originality, long-range\nincoherence and hallucinations. We also discuss key questions concerning\ncopyright and authorship issues with generative models. Furthermore, we\nhighlight the need for a comprehensive evaluation of creativity that is\nprocess-driven and considers several dimensions of creativity. Finally, we\npropose future research directions to improve the creativity of AI outputs,\ndrawing inspiration from cognitive science and psychology.\n","authors":["Mete Ismayilzada","Debjit Paul","Antoine Bosselut","Lonneke van der Plas"],"pdf_url":"https://arxiv.org/pdf/2410.17218v1.pdf","comment":"44 pages"},{"id":"http://arxiv.org/abs/2410.17215v1","updated":"2024-10-22T17:40:32Z","published":"2024-10-22T17:40:32Z","title":"MiniPLM: Knowledge Distillation for Pre-Training Language Models","summary":" Knowledge distillation (KD) is widely used to train small, high-performing\nstudent language models (LMs) using large teacher LMs. While effective in\nfine-tuning, KD during pre-training faces challenges in efficiency,\nflexibility, and effectiveness. Existing methods either incur high\ncomputational costs due to online teacher inference, require tokenization\nmatching between teacher and student LMs, or risk losing the difficulty and\ndiversity of the teacher-generated training data. To address these issues, we\npropose MiniPLM, a KD framework for pre-training LMs by refining the training\ndata distribution with the teacher's knowledge. For efficiency, MiniPLM\nperforms offline teacher LM inference, allowing KD for multiple student LMs\nwithout adding training-time costs. For flexibility, MiniPLM operates solely on\nthe training corpus, enabling KD across model families. For effectiveness,\nMiniPLM leverages the differences between large and small LMs to enhance the\ndifficulty and diversity of the training data, helping student LMs acquire\nversatile and sophisticated knowledge. Extensive experiments demonstrate that\nMiniPLM boosts the student LMs' performance on 9 widely used downstream tasks,\nimproves the language modeling capabilities, and reduces pre-training\ncomputation. The benefit of MiniPLM extends to large pre-training scales,\nevidenced by the extrapolation of the scaling curves. Further analysis reveals\nthat MiniPLM supports KD across model families and enhances the utilization of\npre-training data. 
Our model, code, and data are available at\nhttps://github.com/thu-coai/MiniPLM.\n","authors":["Yuxian Gu","Hao Zhou","Fandong Meng","Jie Zhou","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2410.17215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10796v2","updated":"2024-10-22T17:35:03Z","published":"2024-10-14T17:57:09Z","title":"Context-Parametric Inversion: Why Instruction Finetuning May Not\n Actually Improve Context Reliance","summary":" A standard practice when using large language models is for users to\nsupplement their instruction with an input context containing new information\nfor the model to process. However, models struggle to reliably follow the input\ncontext, especially when it conflicts with their parametric knowledge from\npretraining. In-principle, one would expect models to adapt to the user context\nbetter after instruction finetuning, particularly when handling knowledge\nconflicts. However, we observe a surprising failure mode: during instruction\ntuning, the context reliance under knowledge conflicts initially increases as\nexpected, but then gradually decreases as instruction finetuning progresses.\nThis happens while the performance on standard benchmarks keeps on increasing\nfar after this drop. We call this phenomenon context-parametric inversion and\nobserve it across multiple general purpose instruction tuning datasets such as\nTULU, Alpaca and Ultrachat, across different model families like Llama,\nMistral, and Pythia. We perform various controlled studies and theoretical\nanalysis to show that context-parametric inversion occurs due to examples in\nthe instruction finetuning data where the input context provides information\nthat aligns with model's parametric knowledge. Our analysis suggests some\nnatural mitigation strategies with limited but insightful gains, and serves as\na useful starting point in addressing this deficiency in instruction\nfinetuning.\n","authors":["Sachin Goyal","Christina Baek","J. Zico Kolter","Aditi Raghunathan"],"pdf_url":"https://arxiv.org/pdf/2410.10796v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2410.17210v1","updated":"2024-10-22T17:34:59Z","published":"2024-10-22T17:34:59Z","title":"Exploring Possibilities of AI-Powered Legal Assistance in Bangladesh\n through Large Language Modeling","summary":" Purpose: Bangladesh's legal system struggles with major challenges like\ndelays, complexity, high costs, and millions of unresolved cases, which deter\nmany from pursuing legal action due to lack of knowledge or financial\nconstraints. This research seeks to develop a specialized Large Language Model\n(LLM) to assist in the Bangladeshi legal system. Methods: We created\nUKIL-DB-EN, an English corpus of Bangladeshi legal documents, by collecting and\nscraping data on various legal acts. We fine-tuned the GPT-2 model on this\ndataset to develop GPT2-UKIL-EN, an LLM focused on providing legal assistance\nin English. Results: The model was rigorously evaluated using semantic\nassessments, including case studies supported by expert opinions. The\nevaluation provided promising results, demonstrating the potential for the\nmodel to assist in legal matters within Bangladesh. Conclusion: Our work\nrepresents the first structured effort toward building an AI-based legal\nassistant for Bangladesh. While the results are encouraging, further\nrefinements are necessary to improve the model's accuracy, credibility, and\nsafety. 
This is a significant step toward creating a legal AI capable of\nserving the needs of a population of 180 million.\n","authors":["Azmine Toushik Wasi","Wahid Faisal","Mst Rafia Islam","Mahathir Mohammad Bappy"],"pdf_url":"https://arxiv.org/pdf/2410.17210v1.pdf","comment":"In Review"},{"id":"http://arxiv.org/abs/2410.17209v1","updated":"2024-10-22T17:31:37Z","published":"2024-10-22T17:31:37Z","title":"Audio-to-Score Conversion Model Based on Whisper methodology","summary":" This thesis develops a Transformer model based on Whisper, which extracts\nmelodies and chords from music audio and records them into ABC notation. A\ncomprehensive data processing workflow is customized for ABC notation,\nincluding data cleansing, formatting, and conversion, and a mutation mechanism\nis implemented to increase the diversity and quality of training data. This\nthesis innovatively introduces the \"Orpheus' Score\", a custom notation system\nthat converts music information into tokens, designs a custom vocabulary\nlibrary, and trains a corresponding custom tokenizer. Experiments show that\ncompared to traditional algorithms, the model has significantly improved\naccuracy and performance. While providing a convenient audio-to-score tool for\nmusic enthusiasts, this work also provides new ideas and tools for research in\nmusic information processing.\n","authors":["Hongyao Zhang","Bohang Sun"],"pdf_url":"https://arxiv.org/pdf/2410.17209v1.pdf","comment":"5 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.18567v2","updated":"2024-10-22T17:21:21Z","published":"2023-11-30T13:58:13Z","title":"The Causal Influence of Grammatical Gender on Distributional Semantics","summary":" How much meaning influences gender assignment across languages is an active\narea of research in linguistics and cognitive science. We can view current\napproaches as aiming to determine where gender assignment falls on a spectrum,\nfrom being fully arbitrarily determined to being largely semantically\ndetermined. For the latter case, there is a formulation of the neo-Whorfian\nhypothesis, which claims that even inanimate noun gender influences how people\nconceive of and talk about objects (using the choice of adjective used to\nmodify inanimate nouns as a proxy for meaning). We offer a novel, causal\ngraphical model that jointly represents the interactions between a noun's\ngrammatical gender, its meaning, and adjective choice. In accordance with past\nresults, we find a significant relationship between the gender of nouns and the\nadjectives that modify them. However, when we control for the meaning of the\nnoun, the relationship between grammatical gender and adjective choice is near\nzero and insignificant.\n","authors":["Karolina Stańczak","Kevin Du","Adina Williams","Isabelle Augenstein","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2311.18567v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17196v1","updated":"2024-10-22T17:15:20Z","published":"2024-10-22T17:15:20Z","title":"VoiceBench: Benchmarking LLM-Based Voice Assistants","summary":" Building on the success of large language models (LLMs), recent advancements\nsuch as GPT-4o have enabled real-time speech interactions through LLM-based\nvoice assistants, offering a significantly improved user experience compared to\ntraditional text-based interactions. However, the absence of benchmarks\ndesigned to evaluate these speech interaction capabilities has hindered\nprogress of LLM-based voice assistants development. 
Current evaluations focus\nprimarily on automatic speech recognition (ASR) or general knowledge evaluation\nwith clean speech, neglecting the more intricate, real-world scenarios that\ninvolve diverse speaker characteristics, environmental conditions, and content factors. To\naddress this, we introduce VoiceBench, the first benchmark designed to provide\na multi-faceted evaluation of LLM-based voice assistants. VoiceBench also\nincludes both real and synthetic spoken instructions that incorporate the above\nthree key real-world variations. Extensive experiments reveal the limitations\nof current LLM-based voice assistant models and offer valuable insights for\nfuture research and development in this field.\n","authors":["Yiming Chen","Xianghu Yue","Chen Zhang","Xiaoxue Gao","Robby T. Tan","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2410.17196v1.pdf","comment":"Work in progress. Data is available at\n https://github.com/MatthewCYM/VoiceBench"},{"id":"http://arxiv.org/abs/2403.08213v2","updated":"2024-10-22T17:07:14Z","published":"2024-03-13T03:22:02Z","title":"Can Large Language Models Identify Authorship?","summary":" The ability to accurately identify authorship is crucial for verifying\ncontent authenticity and mitigating misinformation. Large Language Models\n(LLMs) have demonstrated an exceptional capacity for reasoning and\nproblem-solving. However, their potential in authorship analysis remains\nunder-explored. Traditional studies have depended on hand-crafted stylistic\nfeatures, whereas state-of-the-art approaches leverage text embeddings from\npre-trained language models. These methods, which typically require fine-tuning\non labeled data, often suffer from performance degradation in cross-domain\napplications and provide limited explainability. This work seeks to address\nthree research questions: (1) Can LLMs perform zero-shot, end-to-end authorship\nverification effectively? (2) Are LLMs capable of accurately attributing\nauthorship among multiple candidate authors (e.g., 10 and 20)? (3) Can LLMs\nprovide explainability in authorship analysis, particularly through the role of\nlinguistic features? Moreover, we investigate the integration of explicit\nlinguistic features to guide LLMs in their reasoning processes. Our assessment\ndemonstrates LLMs' proficiency in both tasks without the need for\ndomain-specific fine-tuning, providing explanations of their decision making\nvia a detailed analysis of linguistic features. This establishes a new\nbenchmark for future research on LLM-based authorship analysis.\n","authors":["Baixiang Huang","Canyu Chen","Kai Shu"],"pdf_url":"https://arxiv.org/pdf/2403.08213v2.pdf","comment":"Accepted to EMNLP 2024 Findings. The main paper is 9 pages long, with\n 16 pages total. The code, results, dataset, and additional resources are\n available on the project website: https://llm-authorship.github.io/"},{"id":"http://arxiv.org/abs/2409.13686v2","updated":"2024-10-22T17:06:17Z","published":"2024-09-20T17:54:16Z","title":"The Impact of Large Language Models in Academia: from Writing to\n Speaking","summary":" Large language models (LLMs) are increasingly impacting human society,\nparticularly in textual information. Based on more than 30,000 papers and 1,000\npresentations from machine learning conferences, we examined and compared the\nwords used in writing and speaking, representing the first large-scale study of\nhow LLMs influence the two main modes of verbal communication and expression\nwithin the same group of people. 
Our empirical results show that LLM-style\nwords such as \"significant\" have been used more frequently in abstracts and\noral presentations. The impact on speaking is beginning to emerge and is likely\nto grow in the future, calling attention to the implicit influence and ripple\neffect of LLMs on human society.\n","authors":["Mingmeng Geng","Caixi Chen","Yanru Wu","Dongping Chen","Yao Wan","Pan Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.13686v2.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2405.06643v2","updated":"2024-10-22T17:05:17Z","published":"2024-03-06T23:02:30Z","title":"Levels of AI Agents: from Rules to Large Language Models","summary":" AI agents are defined as artificial entities that perceive the environment,\nmake decisions, and take actions. Inspired by the six levels of autonomous driving\ndefined by the Society of Automotive Engineers, AI agents are likewise categorized by\nutility and capability into the following levels: L0, no AI, with tools handling\nperception and actions; L1, using rule-based AI; L2, replacing\nrule-based AI with IL/RL-based AI and adding reasoning & decision\nmaking; L3, applying LLM-based AI instead of IL/RL-based AI and additionally\nsetting up memory & reflection; L4, building on L3 by facilitating autonomous\nlearning & generalization; L5, building on L4 by adding personality (emotion\nand character) and collaborative behavior with multiple agents.\n","authors":["Yu Huang"],"pdf_url":"https://arxiv.org/pdf/2405.06643v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14344v2","updated":"2024-10-22T16:59:12Z","published":"2024-07-19T14:28:07Z","title":"LLMs left, right, and center: Assessing GPT's capabilities to label\n political bias from web domains","summary":" This research investigates whether OpenAI's GPT-4, a state-of-the-art large\nlanguage model, can accurately classify the political bias of news sources\nbased solely on their URLs. Given the subjective nature of political labels,\nthird-party bias ratings like those from Ad Fontes Media, AllSides, and Media\nBias/Fact Check (MBFC) are often used in research to analyze news source\ndiversity. This study aims to determine if GPT-4 can replicate these human\nratings on a seven-degree scale (\"far-left\" to \"far-right\"). The analysis\ncompares GPT-4's classifications against MBFC's, and controls for website\npopularity using Open PageRank scores. Findings reveal a high correlation\n($\\text{Spearman's } \\rho = .89$, $n = 5,877$, $p < 0.001$) between GPT-4's and\nMBFC's ratings, indicating the model's potential reliability. However, GPT-4\nabstained from classifying approximately $\\frac{2}{3}$ of the dataset. It is\nmore likely to abstain from rating unpopular websites, which also suffer from\nless accurate assessments. The LLM tends to avoid classifying sources that MBFC\nconsiders to be centrist, resulting in more polarized outputs. Finally, this\nanalysis shows a slight leftward skew in GPT's classifications compared to\nMBFC's. 
Therefore, while this paper suggests that GPT-4 can be a\nscalable, cost-effective tool for political bias classification of news\nwebsites, it should be used as a complement to human judgment to mitigate\nbiases.\n","authors":["Raphael Hernandes","Giulio Corsi"],"pdf_url":"https://arxiv.org/pdf/2407.14344v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2410.17174v1","updated":"2024-10-22T16:51:27Z","published":"2024-10-22T16:51:27Z","title":"From Attention to Activation: Unravelling the Enigmas of Large Language\n Models","summary":" We study two strange phenomena in auto-regressive Transformers: (1) the\ndominance of the first token in attention heads; (2) the occurrence of large\noutlier activations in the hidden states. We find that popular large language\nmodels, such as Llama, attend maximally to the first token in 98% of attention\nheads, a behaviour we attribute to the softmax function. To mitigate this\nissue, we propose a reformulation of softmax to softmax-1. Furthermore, we\nidentify adaptive optimisers, e.g. Adam, as the primary contributor to the\nlarge outlier activations and introduce OrthoAdam, a novel optimiser that\nutilises orthogonal matrices to transform gradients, to address this issue.\nFinally, not only do our methods prevent these phenomena from occurring, but\nadditionally, they enable Transformers to sustain their performance when\nquantised using basic algorithms, something that standard methods are unable to\ndo. In summary, our methods reduce the attention proportion on the first token\nfrom 65% to 3.3%, the activation kurtosis in the hidden states from 1657 to\n3.1, and the perplexity penalty under 4-bit weight quantisation from 3565 to 0.3.\n","authors":["Prannay Kaul","Chengcheng Ma","Ismail Elezi","Jiankang Deng"],"pdf_url":"https://arxiv.org/pdf/2410.17174v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2410.17170v1","updated":"2024-10-22T16:50:00Z","published":"2024-10-22T16:50:00Z","title":"Self-calibration for Language Model Quantization and Pruning","summary":" Quantization and pruning are fundamental approaches for model compression,\nenabling efficient inference for language models. In a post-training setting,\nstate-of-the-art quantization and pruning methods require calibration data, a\nsmall set of unlabeled examples. Conventionally, randomly sampled web text is\nused, aiming to reflect the model training data. However, this poses two key\nproblems: (1) unrepresentative calibration examples can harm model performance,\nand (2) organizations increasingly avoid releasing model training data. In this\npaper, we propose self-calibration as a solution. Our approach requires no\nexternal data, instead leveraging the model itself to generate synthetic\ncalibration data as a better approximation of the pre-training data\ndistribution. 
We extensively compare the performance of self-calibration with\nseveral baselines, across a variety of models, compression methods, and tasks.\nOur approach proves consistently competitive in maximizing downstream task\nperformance, frequently outperforming even using real data.\n","authors":["Miles Williams","George Chrysostomou","Nikolaos Aletras"],"pdf_url":"https://arxiv.org/pdf/2410.17170v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2410.17161v1","updated":"2024-10-22T16:34:36Z","published":"2024-10-22T16:34:36Z","title":"Interchangeable Token Embeddings for Extendable Vocabulary and\n Alpha-Equivalence","summary":" We propose a novel approach for learning interchangeable tokens in language\nmodels to obtain an extendable vocabulary that can generalize to new tokens.\nOur method is designed to address alpha-equivalence, the principle that\nrenaming bound variables in a syntactic expression preserves semantics. This\nproperty arises in many formal languages such as temporal logics, in which all\nproposition symbols represent the same concept but are distinguishable from\neach other. To handle such tokens, we develop a dual-part embedding approach.\nThe first part is shared across all interchangeable tokens, thereby enforcing\nthat they represent the same core concept. The second part is randomly\ngenerated for each token, which enables distinguishability. We evaluate our\nmethod in a Transformer encoder-decoder model on two tasks: solving linear\ntemporal logic formulae and copying with extendable vocabulary. Our method\ndemonstrates promising generalization capabilities in addition to introducing a\nfavorable inductive bias for alpha-equivalence.\n","authors":["İlker Işık","Ramazan Gokberk Cinbis","Ebru Aydin Gol"],"pdf_url":"https://arxiv.org/pdf/2410.17161v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2410.17152v1","updated":"2024-10-22T16:29:33Z","published":"2024-10-22T16:29:33Z","title":"Improving Pinterest Search Relevance Using Large Language Models","summary":" To improve relevance scoring on Pinterest Search, we integrate Large Language\nModels (LLMs) into our search relevance model, leveraging carefully designed\ntext representations to predict the relevance of Pins effectively. Our approach\nuses search queries alongside content representations that include captions\nextracted from a generative visual language model. These are further enriched\nwith link-based text data, historically high-quality engaged queries,\nuser-curated boards, Pin titles and Pin descriptions, creating robust models\nfor predicting search relevance. We use a semi-supervised learning approach to\nefficiently scale up the amount of training data, expanding beyond the\nexpensive human labeled data available. By utilizing multilingual LLMs, our\nsystem extends training data to include unseen languages and domains, despite\ninitial data and annotator expertise being confined to English. Furthermore, we\ndistill from the LLM-based model into real-time servable model architectures\nand features. 
We provide comprehensive offline experimental validation for our\nproposed techniques and demonstrate the gains achieved through the final\ndeployed system at scale.\n","authors":["Han Wang","Mukuntha Narayanan Sundararaman","Onur Gungor","Yu Xu","Krishna Kamath","Rakesh Chalasani","Kurchi Subhra Hazra","Jinfeng Rao"],"pdf_url":"https://arxiv.org/pdf/2410.17152v1.pdf","comment":"CIKM 2024 Workshop on Industrial Recommendation Systems"},{"id":"http://arxiv.org/abs/2410.14594v2","updated":"2024-10-22T16:27:12Z","published":"2024-10-18T16:44:22Z","title":"Toolshed: Scale Tool-Equipped Agents with Advanced RAG-Tool Fusion and\n Tool Knowledge Bases","summary":" Recent advancements in tool-equipped Agents (LLMs) have enabled complex tasks\nlike secure database interactions and multi-agent code development. However,\nscaling tool capacity beyond agent reasoning or model limits remains a\nchallenge. In this paper, we address these challenges by introducing Toolshed\nKnowledge Bases, a tool knowledge base (vector database) designed to store\nenhanced tool representations and optimize tool selection for large-scale\ntool-equipped Agents. Additionally, we propose Advanced RAG-Tool Fusion, a\nnovel ensemble of tool-applied advanced retrieval-augmented generation (RAG)\ntechniques across the pre-retrieval, intra-retrieval, and post-retrieval\nphases, without requiring model fine-tuning. During pre-retrieval, tool\ndocuments are enhanced with key information and stored in the Toolshed\nKnowledge Base. Intra-retrieval focuses on query planning and transformation to\nincrease retrieval accuracy. Post-retrieval refines the retrieved tool\ndocuments and enables self-reflection. Furthermore, by varying both the total\nnumber of tools (tool-M) an Agent has access to and the tool selection\nthreshold (top-k), we address trade-offs between retrieval accuracy, agent\nperformance, and token cost. Our approach achieves 46%, 56%, and 47% absolute\nimprovements on the ToolE single-tool, ToolE multi-tool and Seal-Tools\nbenchmark datasets, respectively (Recall@5).\n","authors":["Elias Lumer","Vamse Kumar Subbiah","James A. Burke","Pradeep Honaganahalli Basavaraju","Austin Huber"],"pdf_url":"https://arxiv.org/pdf/2410.14594v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17145v1","updated":"2024-10-22T16:26:03Z","published":"2024-10-22T16:26:03Z","title":"Can General-Purpose Large Language Models Generalize to English-Thai\n Machine Translation ?","summary":" Large language models (LLMs) perform well on common tasks but struggle with\ngeneralization in low-resource and low-computation settings. We examine this\nlimitation by testing various LLMs and specialized translation models on\nEnglish-Thai machine translation and code-switching datasets. Our findings\nreveal that under more strict computational constraints, such as 4-bit\nquantization, LLMs fail to translate effectively. In contrast, specialized\nmodels, with comparable or lower computational requirements, consistently\noutperform LLMs. 
This underscores the importance of specialized models for\nmaintaining performance under resource constraints.\n","authors":["Jirat Chiaranaipanich","Naiyarat Hanmatheekuna","Jitkapat Sawatphol","Krittamate Tiankanon","Jiramet Kinchagawat","Amrest Chinkamol","Parinthapat Pengpun","Piyalitt Ittichaiwong","Peerat Limkonchotiwat"],"pdf_url":"https://arxiv.org/pdf/2410.17145v1.pdf","comment":"Accepted in GenBench EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.14669v2","updated":"2024-10-22T16:07:22Z","published":"2024-10-18T17:58:21Z","title":"NaturalBench: Evaluating Vision-Language Models on Natural Adversarial\n Samples","summary":" Vision-language models (VLMs) have made significant progress in recent\nvisual-question-answering (VQA) benchmarks that evaluate complex\nvisio-linguistic reasoning. However, are these models truly effective? In this\nwork, we show that VLMs still struggle with natural images and questions that\nhumans can easily answer, which we term natural adversarial samples. We also\nfind it surprisingly easy to generate these VQA samples from natural image-text\ncorpora using off-the-shelf models like CLIP and ChatGPT. We propose a\nsemi-automated approach to collect a new benchmark, NaturalBench, for reliably\nevaluating VLMs with 10,000 human-verified VQA samples. Crucially, we adopt a\n$\\textbf{vision-centric}$ design by pairing each question with two images that\nyield different answers, preventing blind solutions from answering without\nusing the images. This makes NaturalBench more challenging than previous\nbenchmarks that can be solved with commonsense priors. We evaluate 53\nstate-of-the-art VLMs on NaturalBench, showing that models like\nLLaVA-OneVision, Cambrian-1, Llama3.2-Vision, Molmo, Qwen2-VL, and even GPT-4o\nlag 50%-70% behind human performance (over 90%). We analyze why NaturalBench is\nhard from two angles: (1) Compositionality: Solving NaturalBench requires\ndiverse visio-linguistic skills, including understanding attribute bindings,\nobject relationships, and advanced reasoning like logic and counting. To this\nend, unlike prior work that uses a single tag per sample, we tag each\nNaturalBench sample with 1 to 8 skill tags for fine-grained evaluation. (2)\nBiases: NaturalBench exposes severe biases in VLMs, as models often choose the\nsame answer regardless of the image. Lastly, we apply our benchmark curation\nmethod to diverse data sources, including long captions (over 100 words) and\nnon-English languages like Chinese and Hindi, highlighting its potential for\ndynamic evaluations of VLMs.\n","authors":["Baiqi Li","Zhiqiu Lin","Wenxuan Peng","Jean de Dieu Nyandwi","Daniel Jiang","Zixian Ma","Simran Khanuja","Ranjay Krishna","Graham Neubig","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2410.14669v2.pdf","comment":"Accepted to NeurIPS 24; We open-source our dataset at:\n https://huggingface.co/datasets/BaiqiL/NaturalBench ; Project page at:\n https://linzhiqiu.github.io/papers/naturalbench/"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.11402v2","updated":"2024-10-22T23:13:34Z","published":"2024-09-17T17:59:06Z","title":"NVLM: Open Frontier-Class Multimodal LLMs","summary":" We introduce NVLM 1.0, a family of frontier-class multimodal large language\nmodels (LLMs) that achieve state-of-the-art results on vision-language tasks,\nrivaling the leading proprietary models (e.g., GPT-4o) and open-access models\n(e.g., Llama 3-V 405B and InternVL 2). 
Remarkably, NVLM 1.0 shows improved\ntext-only performance over its LLM backbone after multimodal training. In terms\nof model design, we perform a comprehensive comparison between decoder-only\nmultimodal LLMs (e.g., LLaVA) and cross-attention-based models (e.g.,\nFlamingo). Based on the strengths and weaknesses of both approaches, we propose\na novel architecture that enhances both training efficiency and multimodal\nreasoning capabilities. Furthermore, we introduce a 1-D tile-tagging design for\ntile-based dynamic high-resolution images, which significantly boosts\nperformance on multimodal reasoning and OCR-related tasks. Regarding training\ndata, we meticulously curate and provide detailed information on our multimodal\npretraining and supervised fine-tuning datasets. Our findings indicate that\ndataset quality and task diversity are more important than scale, even during\nthe pretraining phase, across all architectures. Notably, we develop\nproduction-grade multimodality for the NVLM-1.0 models, enabling them to excel\nin vision-language tasks while maintaining and even improving text-only\nperformance compared to their LLM backbones. To achieve this, we craft and\nintegrate a high-quality text-only dataset into multimodal training, alongside\na substantial amount of multimodal math and reasoning data, leading to enhanced\nmath and coding capabilities across modalities. To advance research in the\nfield, we release the model weights at https://huggingface.co/nvidia/NVLM-D-72B\nand will open-source the training code for the community soon.\n","authors":["Wenliang Dai","Nayeon Lee","Boxin Wang","Zhuolin Yang","Zihan Liu","Jon Barker","Tuomas Rintamaki","Mohammad Shoeybi","Bryan Catanzaro","Wei Ping"],"pdf_url":"https://arxiv.org/pdf/2409.11402v2.pdf","comment":"Fixed the typos. For more information, please visit our project page\n at: https://research.nvidia.com/labs/adlr/NVLM-1"},{"id":"http://arxiv.org/abs/2410.16239v2","updated":"2024-10-22T22:22:14Z","published":"2024-10-21T17:42:41Z","title":"MoRE: Multi-Modal Contrastive Pre-training with Transformers on X-Rays,\n ECGs, and Diagnostic Report","summary":" In this paper, we introduce a novel Multi-Modal Contrastive Pre-training\nFramework that synergistically combines X-rays, electrocardiograms (ECGs), and\nradiology/cardiology reports. Our approach leverages transformers to encode\nthese diverse modalities into a unified representation space, aiming to enhance\ndiagnostic accuracy and facilitate comprehensive patient assessments. We\nutilize LoRA-Peft to significantly reduce trainable parameters in the LLM and\nincorporate recent linear attention dropping strategy in the Vision\nTransformer(ViT) for smoother attention. Furthermore, we provide novel\nmultimodal attention explanations and retrieval for our model. To the best of\nour knowledge, we are the first to propose an integrated model that combines\nX-ray, ECG, and Radiology/Cardiology Report with this approach. By utilizing\ncontrastive loss, MoRE effectively aligns modality-specific features into a\ncoherent embedding, which supports various downstream tasks such as zero-shot\nclassification and multimodal retrieval. Employing our proposed methodology, we\nachieve state-of-the-art (SOTA) on the Mimic-IV, CheXpert, Edema Severity, and\nPtbXl downstream datasets, surpassing existing multimodal approaches. 
Our\nproposed framework shows significant improvements in capturing intricate\ninter-modal relationships and in robustness for medical diagnosis, establishing\na foundation for future research in multimodal learning in the\nhealthcare sector.\n","authors":["Samrajya Thapa","Koushik Howlader","Subhankar Bhattacharjee","Wei le"],"pdf_url":"https://arxiv.org/pdf/2410.16239v2.pdf","comment":"10 pages, 5 figures, 9 tables. Supplementary detail in Appendix. Code\n made available on GitHub for reproducibility"},{"id":"http://arxiv.org/abs/2410.17434v1","updated":"2024-10-22T21:21:37Z","published":"2024-10-22T21:21:37Z","title":"LongVU: Spatiotemporal Adaptive Compression for Long Video-Language\n Understanding","summary":" Multimodal Large Language Models (MLLMs) have shown promising progress in\nunderstanding and analyzing video content. However, processing long videos\nremains a significant challenge constrained by the LLM's context size. To address\nthis limitation, we propose LongVU, a spatiotemporal adaptive compression\nmechanism that reduces the number of video tokens while preserving visual\ndetails of long videos. Our idea is based on leveraging cross-modal query and\ninter-frame dependencies to adaptively reduce temporal and spatial redundancy\nin videos. Specifically, we leverage DINOv2 features to remove redundant frames\nthat exhibit high similarity. Then we utilize text-guided cross-modal query for\nselective frame feature reduction. Further, we perform spatial token reduction\nacross frames based on their temporal dependencies. Our adaptive compression\nstrategy effectively processes a large number of frames with little visual\ninformation loss within a given context length. Our LongVU consistently surpasses\nexisting methods across a variety of video understanding benchmarks, especially\non hour-long video understanding tasks such as VideoMME and MLVU. Given a\nlight-weight LLM, our LongVU also scales effectively into a smaller size with\nstate-of-the-art video understanding performance.\n","authors":["Xiaoqian Shen","Yunyang Xiong","Changsheng Zhao","Lemeng Wu","Jun Chen","Chenchen Zhu","Zechun Liu","Fanyi Xiao","Balakrishnan Varadarajan","Florian Bordes","Zhuang Liu","Hu Xu","Hyunwoo J. Kim","Bilge Soran","Raghuraman Krishnamoorthi","Mohamed Elhoseiny","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2410.17434v1.pdf","comment":"Project page: https://vision-cair.github.io/LongVU"},{"id":"http://arxiv.org/abs/2410.17427v1","updated":"2024-10-22T20:56:04Z","published":"2024-10-22T20:56:04Z","title":"SigCLR: Sigmoid Contrastive Learning of Visual Representations","summary":" We propose SigCLR: Sigmoid Contrastive Learning of Visual Representations.\nSigCLR utilizes the logistic loss that only operates on pairs and does not\nrequire a global view as in the cross-entropy loss used in SimCLR. We show that\nthe logistic loss achieves competitive performance on CIFAR-10, CIFAR-100, and Tiny-IN\ncompared to other established SSL objectives. Our findings verify the\nimportance of a learnable bias, as in the case of SigLIP; however, it requires a\nfixed temperature, as in SimCLR, to excel. 
Overall, SigCLR is a promising\nreplacement for the SimCLR which is ubiquitous and has shown tremendous success\nin various domains.\n","authors":["Ömer Veysel Çağatan"],"pdf_url":"https://arxiv.org/pdf/2410.17427v1.pdf","comment":"Neurips 2024 SSL Workshop"},{"id":"http://arxiv.org/abs/2312.02521v3","updated":"2024-10-22T20:52:38Z","published":"2023-12-05T06:04:16Z","title":"RetriBooru: Leakage-Free Retrieval of Conditions from Reference Images\n for Subject-Driven Generation","summary":" Diffusion-based methods have demonstrated remarkable capabilities in\ngenerating a diverse array of high-quality images, sparking interests for\nstyled avatars, virtual try-on, and more. Previous methods use the same\nreference image as the target. An overlooked aspect is the leakage of the\ntarget's spatial information, style, etc. from the reference, harming the\ngenerated diversity and causing shortcuts. However, this approach continues as\nwidely available datasets usually consist of single images not grouped by\nidentities, and it is expensive to recollect large-scale same-identity data.\nMoreover, existing metrics adopt decoupled evaluation on text alignment and\nidentity preservation, which fail at distinguishing between balanced outputs\nand those that over-fit to one aspect. In this paper, we propose a multi-level,\nsame-identity dataset RetriBooru, which groups anime characters by both face\nand cloth identities. RetriBooru enables adopting reference images of the same\ncharacter and outfits as the target, while keeping flexible gestures and\nactions. We benchmark previous methods on our dataset, and demonstrate the\neffectiveness of training with a reference image different from target (but\nsame identity). We introduce a new concept composition task, where the\nconditioning encoder learns to retrieve different concepts from several\nreference images, and modify a baseline network RetriNet for the new task.\nFinally, we introduce a novel class of metrics named Similarity Weighted\nDiversity (SWD), to measure the overlooked diversity and better evaluate the\nalignment between similarity and diversity.\n","authors":["Haoran Tang","Jieren Deng","Zhihong Pan","Hao Tian","Pratik Chaudhari","Xin Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.02521v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17422v1","updated":"2024-10-22T20:51:45Z","published":"2024-10-22T20:51:45Z","title":"AG-SLAM: Active Gaussian Splatting SLAM","summary":" We present AG-SLAM, the first active SLAM system utilizing 3D Gaussian\nSplatting (3DGS) for online scene reconstruction. In recent years, radiance\nfield scene representations, including 3DGS have been widely used in SLAM and\nexploration, but actively planning trajectories for robotic exploration is\nstill unvisited. In particular, many exploration methods assume precise\nlocalization and thus do not mitigate the significant risk of constructing a\ntrajectory, which is difficult for a SLAM system to operate on. This can cause\ncamera tracking failure and lead to failures in real-world robotic\napplications. Our method leverages Fisher Information to balance the dual\nobjectives of maximizing the information gain for the environment while\nminimizing the cost of localization errors. 
Experiments conducted on the Gibson\nand Habitat-Matterport 3D datasets demonstrate state-of-the-art results of the\nproposed method.\n","authors":["Wen Jiang","Boshu Lei","Katrina Ashton","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2410.17422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01984v5","updated":"2024-10-22T20:39:52Z","published":"2024-01-03T21:24:44Z","title":"AUPIMO: Redefining Visual Anomaly Detection Benchmarks with High Speed\n and Low Tolerance","summary":" Recent advances in visual anomaly detection research have seen AUROC and\nAUPRO scores on public benchmark datasets such as MVTec and VisA converge\ntowards perfect recall, giving the impression that these benchmarks are\nnear-solved. However, high AUROC and AUPRO scores do not always reflect\nqualitative performance, which limits the validity of these metrics in\nreal-world applications. We argue that the artificial ceiling imposed by the\nlack of an adequate evaluation metric restrains progression of the field, and\nit is crucial that we revisit the evaluation metrics used to rate our\nalgorithms. In response, we introduce Per-IMage Overlap (PIMO), a novel metric\nthat addresses the shortcomings of AUROC and AUPRO. PIMO retains the\nrecall-based nature of the existing metrics but introduces two distinctions:\nthe assignment of curves (and respective area under the curve) is per-image,\nand its X-axis relies solely on normal images. Measuring recall per image\nsimplifies instance score indexing and is more robust to noisy annotations. As\nwe show, it also accelerates computation and enables the usage of statistical\ntests to compare models. By imposing low tolerance for false positives on\nnormal images, PIMO provides an enhanced model validation procedure and\nhighlights performance variations across datasets. Our experiments demonstrate\nthat PIMO offers practical advantages and nuanced performance insights that\nredefine anomaly detection benchmarks -- notably challenging the perception\nthat MVTec AD and VisA datasets have been solved by contemporary models.\nAvailable on GitHub: https://github.com/jpcbertoldo/aupimo.\n","authors":["Joao P. C. Bertoldo","Dick Ameln","Ashwin Vaidya","Samet Akçay"],"pdf_url":"https://arxiv.org/pdf/2401.01984v5.pdf","comment":"Accepted to BMVC 2024. Official implementation:\n https://github.com/jpcbertoldo/aupimo. Integrated in anomalib\n https://github.com/openvinotoolkit/anomalib. This research was conducted\n during Google Summer of Code 2023 (GSoC 2023) with the anomalib team from\n Intel's OpenVINO Toolkit"},{"id":"http://arxiv.org/abs/2410.17409v1","updated":"2024-10-22T20:33:10Z","published":"2024-10-22T20:33:10Z","title":"Geometric Graph Neural Network Modeling of Human Interactions in Crowded\n Environments","summary":" Modeling human trajectories in crowded environments is challenging due to the\ncomplex nature of pedestrian behavior and interactions. This paper proposes a\ngeometric graph neural network (GNN) architecture that integrates domain\nknowledge from psychological studies to model pedestrian interactions and\npredict future trajectories. Unlike prior studies using complete graphs, we\ndefine interaction neighborhoods using pedestrians' field of view, motion\ndirection, and distance-based kernel functions to construct graph\nrepresentations of crowds. Evaluations across multiple datasets demonstrate\nimproved prediction accuracy through reduced average and final displacement\nerror metrics. 
Our findings underscore the importance of integrating domain\nknowledge with data-driven approaches for effective modeling of human\ninteractions in crowds.\n","authors":["Sara Honarvar","Yancy Diaz-Mercado"],"pdf_url":"https://arxiv.org/pdf/2410.17409v1.pdf","comment":"\\c{opyright} 2024 the authors. This work has been accepted to IFAC\n for publication under a Creative Commons Licence CC-BY-NC-ND"},{"id":"http://arxiv.org/abs/2403.09240v2","updated":"2024-10-22T20:26:09Z","published":"2024-03-14T10:03:58Z","title":"XReal: Realistic Anatomy and Pathology-Aware X-ray Generation via\n Controllable Diffusion Model","summary":" Large-scale generative models have demonstrated impressive capabilities in\nproducing visually compelling images, with increasing applications in medical\nimaging. However, they continue to grapple with hallucination challenges and\nthe generation of anatomically inaccurate outputs. These limitations are mainly\ndue to the reliance on textual inputs and lack of spatial control over the\ngenerated images, hindering the potential usefulness of such models in\nreal-life settings. In this work, we present XReal, a novel controllable\ndiffusion model for generating realistic chest X-ray images through precise\nanatomy and pathology location control. Our lightweight method comprises an\nAnatomy Controller and a Pathology Controller to introduce spatial control over\nanatomy and pathology in a pre-trained Text-to-Image Diffusion Model,\nrespectively, without fine-tuning the model. XReal outperforms state-of-the-art\nX-ray diffusion models in quantitative metrics and radiologists' ratings,\nshowing significant gains in anatomy and pathology realism. Our model holds\npromise for advancing generative models in medical imaging, offering greater\nprecision and adaptability while inviting further exploration in this evolving\nfield. The code and pre-trained model weights are publicly available at\nhttps://github.com/BioMedIA-MBZUAI/XReal.\n","authors":["Anees Ur Rehman Hashmi","Ibrahim Almakky","Mohammad Areeb Qazi","Santosh Sanjeev","Vijay Ram Papineni","Jagalpathy Jagdish","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2403.09240v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17396v1","updated":"2024-10-22T20:02:38Z","published":"2024-10-22T20:02:38Z","title":"Efficient Feature Extraction Using Light-Weight CNN Attention-Based Deep\n Learning Architectures for Ultrasound Fetal Plane Classification","summary":" Ultrasound fetal imaging is beneficial to support prenatal development\nbecause it is affordable and non-intrusive. Nevertheless, fetal plane\nclassification (FPC) remains challenging and time-consuming for obstetricians\nsince it depends on nuanced clinical aspects, which increases the difficulty in\nidentifying relevant features of the fetal anatomy. Thus, to assist with\naccurate feature extraction, a lightweight artificial intelligence architecture\nleveraging convolutional neural networks and attention mechanisms is proposed\nto classify the largest benchmark ultrasound dataset. The approach fine-tunes\nlightweight EfficientNet feature extraction backbones pre-trained on\nImageNet1k to classify key fetal planes such as the brain, femur, thorax,\ncervix, and abdomen. Our methodology incorporates the attention mechanism to\nrefine features and 3-layer perceptrons for classification, achieving superior\nperformance with the highest Top-1 accuracy of 96.25%, Top-2 accuracy of 99.80%\nand F1-Score of 0.9576. 
Importantly, the model has 40x fewer trainable\nparameters than existing benchmark ensemble or transformer pipelines,\nfacilitating easy deployment on edge devices to help clinical practitioners\nwith real-time FPC. The findings are also interpreted using GradCAM to carry\nout clinical correlation to aid doctors with diagnostics and improve treatment\nplans for expectant mothers.\n","authors":["Arrun Sivasubramanian","Divya Sasidharan","Sowmya V","Vinayakumar Ravi"],"pdf_url":"https://arxiv.org/pdf/2410.17396v1.pdf","comment":"Submitted to Computers in Biology and Medicine journal"},{"id":"http://arxiv.org/abs/2403.00946v2","updated":"2024-10-22T20:01:45Z","published":"2024-03-01T19:50:22Z","title":"Fine-tuning with Very Large Dropout","summary":" It is impossible today to pretend that the practice of machine learning is\ncompatible with the idea that training and testing data follow the same\ndistribution. Several authors have recently used ensemble techniques to show\nhow scenarios involving multiple data distributions are best served by\nrepresentations that are both richer than those obtained by regularizing for\nthe best in-distribution performance, and richer than those obtained under the\ninfluence of the implicit sparsity bias of common stochastic gradient\nprocedures.\n This contribution investigates the use of very high dropout rates instead of\nensembles to obtain such rich representations. Although training a deep network\nfrom scratch using such dropout rates is virtually impossible, fine-tuning a\nlarge pre-trained model under such conditions is not only possible but also\nachieves out-of-distribution performances that exceed those of both ensembles\nand weight averaging methods such as model soups. This result has practical\nsignificance because the importance of the fine-tuning scenario has\nconsiderably grown in recent years. This result also provides interesting\ninsights on the nature of rich representations and on the intrinsically linear\nnature of fine-tuning a large network using a comparatively small dataset.\n","authors":["Jianyu Zhang","Léon Bottou"],"pdf_url":"https://arxiv.org/pdf/2403.00946v2.pdf","comment":"Fine-tuning with very large dropout outperforms weight-averaging and\n ensemble on ResNet and large vision transformer"},{"id":"http://arxiv.org/abs/2410.17393v1","updated":"2024-10-22T20:01:00Z","published":"2024-10-22T20:01:00Z","title":"Denoise-I2W: Mapping Images to Denoising Words for Accurate Zero-Shot\n Composed Image Retrieval","summary":" Zero-Shot Composed Image Retrieval (ZS-CIR) supports diverse tasks with a\nbroad range of visual content manipulation intentions that can be related to\ndomain, scene, object, and attribute. A key challenge for ZS-CIR is to\naccurately map image representation to a pseudo-word token that captures the\nmanipulation intention relevant image information for generalized CIR. However,\nexisting methods between the retrieval and pre-training stages lead to\nsignificant redundancy in the pseudo-word tokens. In this paper, we propose a\nnovel denoising image-to-word mapping approach, named Denoise-I2W, for mapping\nimages into denoising pseudo-word tokens that, without intention-irrelevant\nvisual information, enhance accurate ZS-CIR. Specifically, a pseudo triplet\nconstruction module first automatically constructs pseudo triples\n(\\textit{i.e.,} a pseudo-reference image, a pseudo-manipulation text, and a\ntarget image) for pre-training the denoising mapping network. 
Then, a\npseudo-composed mapping module maps the pseudo-reference image to a pseudo-word\ntoken and combines it with the pseudo-manipulation text carrying the manipulation\nintention. This combination aligns with the target image, facilitating the\ndenoising of intention-irrelevant visual information during mapping. Our proposed\nDenoise-I2W is a model-agnostic and annotation-free approach. It demonstrates\nstrong generalization capabilities across three state-of-the-art ZS-CIR models\non four benchmark datasets. By integrating Denoise-I2W with existing best\nmodels, we obtain consistent and significant performance boosts ranging from\n1.45\\% to 4.17\\% over the best methods without increasing inference costs, and\nachieve new state-of-the-art results on ZS-CIR. Our code is available at\n\\url{https://github.com/Pter61/denoise-i2w-tmm}.\n","authors":["Yuanmin Tang","Jing Yu","Keke Gai","Jiamin Zhuang","Gaopeng Gou","Gang Xiong","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2410.17393v1.pdf","comment":"This work was submitted to IJCAI 2024, with a score of weak accept\n and borderline accept"},{"id":"http://arxiv.org/abs/2410.17385v1","updated":"2024-10-22T19:39:15Z","published":"2024-10-22T19:39:15Z","title":"Do Vision-Language Models Represent Space and How? Evaluating Spatial\n Frame of Reference Under Ambiguities","summary":" Spatial expressions in situated communication can be ambiguous, as their\nmeanings vary depending on the frames of reference (FoR) adopted by speakers\nand listeners. While spatial language understanding and reasoning by\nvision-language models (VLMs) have gained increasing attention, potential\nambiguities in these models are still under-explored. To address this issue, we\npresent the COnsistent Multilingual Frame Of Reference Test (COMFORT), an\nevaluation protocol to systematically assess the spatial reasoning capabilities\nof VLMs. We evaluate nine state-of-the-art VLMs using COMFORT. Despite showing\nsome alignment with English conventions in resolving ambiguities, our\nexperiments reveal significant shortcomings of VLMs: notably, the models (1)\nexhibit poor robustness and consistency, (2) lack the flexibility to\naccommodate multiple FoRs, and (3) fail to adhere to language-specific or\nculture-specific conventions in cross-lingual tests, as English tends to\ndominate other languages. With a growing effort to align vision-language models\nwith human cognitive intuitions, we call for more attention to the ambiguous\nnature and cross-cultural diversity of spatial reasoning.\n","authors":["Zheyuan Zhang","Fengyuan Hu","Jayjun Lee","Freda Shi","Parisa Kordjamshidi","Joyce Chai","Ziqiao Ma"],"pdf_url":"https://arxiv.org/pdf/2410.17385v1.pdf","comment":"Accepted to Pluralistic Alignment @ NeurIPS 2024 | Project page:\n https://spatial-comfort.github.io/"},{"id":"http://arxiv.org/abs/2410.17377v1","updated":"2024-10-22T19:26:05Z","published":"2024-10-22T19:26:05Z","title":"PtychoFormer: A Transformer-based Model for Ptychographic Phase\n Retrieval","summary":" Ptychography is a computational method of microscopy that recovers\nhigh-resolution transmission images of samples from a series of diffraction\npatterns. While conventional phase retrieval algorithms can iteratively recover\nthe images, they require oversampled diffraction patterns, incur significant\ncomputational costs, and struggle to recover the absolute phase of the sample's\ntransmission function. Deep learning algorithms for ptychography are a\npromising approach to resolving the limitations of iterative algorithms. 
We\npresent PtychoFormer, a hierarchical transformer-based model for data-driven\nsingle-shot ptychographic phase retrieval. PtychoFormer processes subsets of\ndiffraction patterns, generating local inferences that are seamlessly stitched\ntogether to produce a high-quality reconstruction. Our model exhibits tolerance\nto sparsely scanned diffraction patterns and achieves up to 3600 times faster\nimaging speed than the extended ptychographic iterative engine (ePIE). We also\npropose the extended-PtychoFormer (ePF), a hybrid approach that combines the\nbenefits of PtychoFormer with the ePIE. ePF minimizes global phase shifts and\nsignificantly enhances reconstruction quality, achieving state-of-the-art phase\nretrieval in ptychography.\n","authors":["Ryuma Nakahata","Shehtab Zaman","Mingyuan Zhang","Fake Lu","Kenneth Chiu"],"pdf_url":"https://arxiv.org/pdf/2410.17377v1.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2307.15250v4","updated":"2024-10-22T19:09:54Z","published":"2023-07-28T01:20:12Z","title":"D2S: Representing sparse descriptors and 3D coordinates for camera\n relocalization","summary":" State-of-the-art visual localization methods mostly rely on complex\nprocedures to match local descriptors and 3D point clouds. However, these\nprocedures can incur significant costs in terms of inference, storage, and\nupdates over time. In this study, we propose a direct learning-based approach\nthat utilizes a simple network named D2S to represent complex local descriptors\nand their scene coordinates. Our method is characterized by its simplicity and\ncost-effectiveness. It solely leverages a single RGB image for localization\nduring the testing phase and only requires a lightweight model to encode a\ncomplex sparse scene. The proposed D2S employs a combination of a simple loss\nfunction and graph attention to selectively focus on robust descriptors while\ndisregarding areas such as clouds, trees, and several dynamic objects. This\nselective attention enables D2S to effectively perform a binary-semantic\nclassification for sparse descriptors. Additionally, we propose a simple\noutdoor dataset to evaluate the capabilities of visual localization methods in\nscene-specific generalization and self-updating from unlabeled observations.\nOur approach outperforms the previous regression-based methods in both indoor\nand outdoor environments. It demonstrates the ability to generalize beyond\ntraining data, including scenarios involving transitions from day to night and\nadapting to domain shifts. The source code, trained models, dataset, and demo\nvideos are available at the following link: https://thpjp.github.io/d2s.\n","authors":["Bach-Thuan Bui","Huy-Hoang Bui","Dinh-Tuan Tran","Joo-Ho Lee"],"pdf_url":"https://arxiv.org/pdf/2307.15250v4.pdf","comment":"Accepted to IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2407.00633v2","updated":"2024-10-22T18:52:19Z","published":"2024-06-30T09:15:21Z","title":"DEAR: Disentangled Environment and Agent Representations for\n Reinforcement Learning without Reconstruction","summary":" Reinforcement Learning (RL) algorithms can learn robotic control tasks from\nvisual observations, but they often require a large amount of data, especially\nwhen the visual scene is complex and unstructured. In this paper, we explore\nhow the agent's knowledge of its shape can improve the sample efficiency of\nvisual RL methods. 
We propose a novel method, Disentangled Environment and\nAgent Representations (DEAR), that uses the segmentation mask of the agent as\nsupervision to learn disentangled representations of the environment and the\nagent through feature separation constraints. Unlike previous approaches, DEAR\ndoes not require reconstruction of visual observations. These representations\nare then used as an auxiliary loss to the RL objective, encouraging the agent\nto focus on the relevant features of the environment. We evaluate DEAR on two\nchallenging benchmarks: Distracting DeepMind control suite and Franka Kitchen\nmanipulation tasks. Our findings demonstrate that DEAR surpasses\nstate-of-the-art methods in sample efficiency, achieving comparable or superior\nperformance with reduced parameters. Our results indicate that integrating\nagent knowledge into visual RL methods has the potential to enhance their\nlearning efficiency and robustness.\n","authors":["Ameya Pore","Riccardo Muradore","Diego Dall'Alba"],"pdf_url":"https://arxiv.org/pdf/2407.00633v2.pdf","comment":"6 pages, 7 figures, 2 tables. Accepted at 2024 IEEE/RSJ International\n Conference on Intelligent Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2305.17828v2","updated":"2024-10-22T18:51:50Z","published":"2023-05-28T23:42:35Z","title":"Counter-Hypothetical Particle Filters for Single Object Pose Tracking","summary":" Particle filtering is a common technique for six degrees of freedom (6D) pose\nestimation due to its ability to tractably represent belief over object pose.\nHowever, the particle filter is prone to particle deprivation due to the\nhigh-dimensional nature of 6D pose. When particle deprivation occurs, it can\ncause mode collapse of the underlying belief distribution during importance\nsampling. If the region surrounding the true state suffers from mode collapse,\nrecovering its belief is challenging since the area is no longer represented in\nthe probability mass formed by the particles. Previous methods mitigate this\nproblem by randomizing and resetting particles in the belief distribution, but\ndetermining the frequency of reinvigoration has relied on hand-tuning abstract\nheuristics. In this paper, we estimate the necessary reinvigoration rate at\neach time step by introducing a Counter-Hypothetical likelihood function, which\nis used alongside the standard likelihood. Inspired by the notions of\nplausibility and implausibility from Evidential Reasoning, the addition of our\nCounter-Hypothetical likelihood function assigns a level of doubt to each\nparticle. The competing cumulative values of confidence and doubt across the\nparticle set are used to estimate the level of failure within the filter, in\norder to determine the portion of particles to be reinvigorated. We demonstrate\nthe effectiveness of our method on the rigid body object 6D pose tracking task.\n","authors":["Elizabeth A. Olson","Jana Pavlasek","Jasmine A. Berry","Odest Chadwicke Jenkins"],"pdf_url":"https://arxiv.org/pdf/2305.17828v2.pdf","comment":"International Conference on Robotics and Automation (ICRA) 2023"},{"id":"http://arxiv.org/abs/2410.17357v1","updated":"2024-10-22T18:50:20Z","published":"2024-10-22T18:50:20Z","title":"Image-aware Evaluation of Generated Medical Reports","summary":" The paper proposes a novel evaluation metric for automatic medical report\ngeneration from X-ray images, VLScore. 
It aims to overcome the limitations of\nexisting evaluation methods, which either focus solely on textual similarities,\nignoring clinical aspects, or concentrate only on a single clinical aspect, the\npathology, neglecting all other factors. The key idea of our metric is to\nmeasure the similarity between radiology reports while considering the\ncorresponding image. We demonstrate the benefit of our metric through\nevaluation on a dataset where radiologists marked errors in pairs of reports,\nshowing notable alignment with radiologists' judgments. In addition, we provide\na new dataset for evaluating metrics. This dataset includes well-designed\nperturbations that distinguish between significant modifications (e.g., removal\nof a diagnosis) and insignificant ones. It highlights the weaknesses in current\nevaluation metrics and provides a clear framework for analysis.\n","authors":["Gefen Dawidowicz","Elad Hirsch","Ayellet Tal"],"pdf_url":"https://arxiv.org/pdf/2410.17357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15392v2","updated":"2024-10-22T18:22:20Z","published":"2024-10-20T13:44:24Z","title":"EF-3DGS: Event-Aided Free-Trajectory 3D Gaussian Splatting","summary":" Scene reconstruction from casually captured videos has wide applications in\nreal-world scenarios. With recent advancements in differentiable rendering\ntechniques, several methods have attempted to simultaneously optimize scene\nrepresentations (NeRF or 3DGS) and camera poses. Despite recent progress,\nexisting methods relying on traditional camera input tend to fail in high-speed\n(or equivalently low-frame-rate) scenarios. Event cameras, inspired by\nbiological vision, record pixel-wise intensity changes asynchronously with high\ntemporal resolution, providing valuable scene and motion information in blind\ninter-frame intervals. In this paper, we introduce the event camera to aid\nscene construction from a casually captured video for the first time, and\npropose Event-Aided Free-Trajectory 3DGS, called EF-3DGS, which seamlessly\nintegrates the advantages of event cameras into 3DGS through three key\ncomponents. First, we leverage the Event Generation Model (EGM) to fuse events\nand frames, supervising the rendered views observed by the event stream.\nSecond, we adopt the Contrast Maximization (CMax) framework in a piece-wise\nmanner to extract motion information by maximizing the contrast of the Image of\nWarped Events (IWE), thereby calibrating the estimated poses. Besides, based on\nthe Linear Event Generation Model (LEGM), the brightness information encoded in\nthe IWE is also utilized to constrain the 3DGS in the gradient domain. Third,\nto mitigate the absence of color information of events, we introduce\nphotometric bundle adjustment (PBA) to ensure view consistency across events\nand frames. We evaluate our method on the public Tanks and Temples benchmark\nand a newly collected real-world dataset, RealEv-DAVIS. 
Our project page is\nhttps://lbh666.github.io/ef-3dgs/.\n","authors":["Bohao Liao","Wei Zhai","Zengyu Wan","Tianzhu Zhang","Yang Cao","Zheng-Jun Zha"],"pdf_url":"https://arxiv.org/pdf/2410.15392v2.pdf","comment":"Project Page: https://lbh666.github.io/ef-3dgs/"},{"id":"http://arxiv.org/abs/2410.17331v1","updated":"2024-10-22T18:04:00Z","published":"2024-10-22T18:04:00Z","title":"Offline Evaluation of Set-Based Text-to-Image Generation","summary":" Text-to-Image (TTI) systems often support people during ideation, the early\nstages of a creative process when exposure to a broad set of relevant images\ncan help explore the design space. Since ideation is an important subclass of\nTTI tasks, understanding how to quantitatively evaluate TTI systems according\nto how well they support ideation is crucial to promoting research and\ndevelopment for these users. However, existing evaluation metrics for TTI\nremain focused on distributional similarity metrics like Fr\\'echet Inception\nDistance (FID). We take an alternative approach and, based on established\nmethods from ranking evaluation, develop TTI evaluation metrics with explicit\nmodels of how users browse and interact with sets of spatially arranged\ngenerated images. Our proposed offline evaluation metrics for TTI not only\ncapture how relevant generated images are with respect to the user's ideation\nneed but also take into consideration the diversity and arrangement of the set\nof generated images. We analyze our proposed family of TTI metrics using human\nstudies on image grids generated by three different TTI systems based on\nsubsets of the widely used benchmarks such as MS-COCO captions and Localized\nNarratives as well as prompts used in naturalistic settings. Our results\ndemonstrate that grounding metrics in how people use systems is an important\nand understudied area of benchmark design.\n","authors":["Negar Arabzadeh","Fernando Diaz","Junfeng He"],"pdf_url":"https://arxiv.org/pdf/2410.17331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17251v1","updated":"2024-10-22T17:59:57Z","published":"2024-10-22T17:59:57Z","title":"Altogether: Image Captioning via Re-aligning Alt-text","summary":" This paper focuses on creating synthetic data to improve the quality of image\ncaptions. Existing works typically have two shortcomings. First, they caption\nimages from scratch, ignoring existing alt-text metadata, and second, lack\ntransparency if the captioners' training data (e.g. GPT) is unknown. In this\npaper, we study a principled approach Altogether based on the key idea to edit\nand re-align existing alt-texts associated with the images. To generate\ntraining data, we perform human annotation where annotators start with the\nexisting alt-text and re-align it to the image content in multiple rounds,\nconsequently constructing captions with rich visual concepts. This differs from\nprior work that carries out human annotation as a one-time description task\nsolely based on images and annotator knowledge. We train a captioner on this\ndata that generalizes the process of re-aligning alt-texts at scale. 
Our\nresults show our Altogether approach leads to richer image captions that also\nimprove text-to-image generation and zero-shot image classification tasks.\n","authors":["Hu Xu","Po-Yao Huang","Xiaoqing Ellen Tan","Ching-Feng Yeh","Jacob Kahn","Christine Jou","Gargi Ghosh","Omer Levy","Luke Zettlemoyer","Wen-tau Yih","Shang-Wen Li","Saining Xie","Christoph Feichtenhofer"],"pdf_url":"https://arxiv.org/pdf/2410.17251v1.pdf","comment":"accepted by EMNLP 2024; MetaCLIPv2"},{"id":"http://arxiv.org/abs/2410.17249v1","updated":"2024-10-22T17:59:56Z","published":"2024-10-22T17:59:56Z","title":"SpectroMotion: Dynamic 3D Reconstruction of Specular Scenes","summary":" We present SpectroMotion, a novel approach that combines 3D Gaussian\nSplatting (3DGS) with physically-based rendering (PBR) and deformation fields\nto reconstruct dynamic specular scenes. Previous methods extending 3DGS to\nmodel dynamic scenes have struggled to accurately represent specular surfaces.\nOur method addresses this limitation by introducing a residual correction\ntechnique for accurate surface normal computation during deformation,\ncomplemented by a deformable environment map that adapts to time-varying\nlighting conditions. We implement a coarse-to-fine training strategy that\nsignificantly enhances both scene geometry and specular color prediction. We\ndemonstrate that our model outperforms prior methods for view synthesis of\nscenes containing dynamic specular objects and that it is the only existing\n3DGS method capable of synthesizing photorealistic real-world dynamic specular\nscenes, outperforming state-of-the-art methods in rendering complex, dynamic,\nand specular scenes.\n","authors":["Cheng-De Fan","Chen-Wei Chang","Yi-Ruei Liu","Jie-Ying Lee","Jiun-Long Huang","Yu-Chee Tseng","Yu-Lun Liu"],"pdf_url":"https://arxiv.org/pdf/2410.17249v1.pdf","comment":"Project page: https://cdfan0627.github.io/spectromotion/"},{"id":"http://arxiv.org/abs/2410.17250v1","updated":"2024-10-22T17:59:56Z","published":"2024-10-22T17:59:56Z","title":"JMMMU: A Japanese Massive Multi-discipline Multimodal Understanding\n Benchmark for Culture-aware Evaluation","summary":" Accelerating research on Large Multimodal Models (LMMs) in non-English\nlanguages is crucial for enhancing user experiences across broader populations.\nIn this paper, we introduce JMMMU (Japanese MMMU), the first large-scale\nJapanese benchmark designed to evaluate LMMs on expert-level tasks based on the\nJapanese cultural context. To facilitate comprehensive culture-aware\nevaluation, JMMMU features two complementary subsets: (i) culture-agnostic (CA)\nsubset, where the culture-independent subjects (e.g., Math) are selected and\ntranslated into Japanese, enabling one-to-one comparison with its English\ncounterpart MMMU; and (ii) culture-specific (CS) subset, comprising newly\ncrafted subjects that reflect Japanese cultural context. Using the CA subset,\nwe observe performance drop in many LMMs when evaluated in Japanese, which is\npurely attributable to language variation. Using the CS subset, we reveal their\ninadequate Japanese cultural understanding. Further, by combining both subsets,\nwe identify that some LMMs perform well on the CA subset but not on the CS\nsubset, exposing a shallow understanding of the Japanese language that lacks\ndepth in cultural understanding. 
We hope this work will not only help advance\nLMM performance in Japanese but also serve as a guideline to create\nhigh-standard, culturally diverse benchmarks for multilingual LMM development.\nThe project page is https://mmmu-japanese-benchmark.github.io/JMMMU/.\n","authors":["Shota Onohara","Atsuyuki Miyai","Yuki Imajuku","Kazuki Egashira","Jeonghun Baek","Xiang Yue","Graham Neubig","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2410.17250v1.pdf","comment":"Project page: https://mmmu-japanese-benchmark.github.io/JMMMU/"},{"id":"http://arxiv.org/abs/2410.17247v1","updated":"2024-10-22T17:59:53Z","published":"2024-10-22T17:59:53Z","title":"PyramidDrop: Accelerating Your Large Vision-Language Models via Pyramid\n Visual Redundancy Reduction","summary":" In large vision-language models (LVLMs), images serve as inputs that carry a\nwealth of information. As the idiom \"A picture is worth a thousand words\"\nimplies, representing a single image in current LVLMs can require hundreds or\neven thousands of tokens. This results in significant computational costs,\nwhich grow quadratically as input image resolution increases, thereby severely\nimpacting the efficiency of both training and inference. Previous approaches\nhave attempted to reduce the number of image tokens either before or within the\nearly layers of LVLMs. However, these strategies inevitably result in the loss\nof crucial image information, ultimately diminishing model performance. To\naddress this challenge, we conduct an empirical study revealing that all visual\ntokens are necessary for LVLMs in the shallow layers, and token redundancy\nprogressively increases in the deeper layers of the model. To this end, we\npropose PyramidDrop, a visual redundancy reduction strategy for LVLMs to boost\ntheir efficiency in both training and inference with neglectable performance\nloss. Specifically, we partition the LVLM into several stages and drop part of\nthe image tokens at the end of each stage with a pre-defined ratio, creating\npyramid-like visual tokens across model layers. The dropping is based on a\nlightweight similarity calculation with a negligible time overhead. Extensive\nexperiments demonstrate that PyramidDrop can achieve a 40% training time and\n55% inference FLOPs acceleration of LLaVA-NeXT with comparable performance.\nBesides, the PyramidDrop could also serve as a plug-and-play strategy for\ninference acceleration without training, with better performance and lower\ninference cost than counterparts. We hope that the insights and approach\nintroduced by PyramidDrop will inspire future research to further investigate\nthe role of image tokens in LVLMs.\n","authors":["Long Xing","Qidong Huang","Xiaoyi Dong","Jiajie Lu","Pan Zhang","Yuhang Zang","Yuhang Cao","Conghui He","Jiaqi Wang","Feng Wu","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2410.17247v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2410.17243v1","updated":"2024-10-22T17:59:30Z","published":"2024-10-22T17:59:30Z","title":"Breaking the Memory Barrier: Near Infinite Batch Size Scaling for\n Contrastive Loss","summary":" Contrastive loss is a powerful approach for representation learning, where\nlarger batch sizes enhance performance by providing more negative samples to\nbetter distinguish between similar and dissimilar data. However, scaling batch\nsizes is constrained by the quadratic growth in GPU memory consumption,\nprimarily due to the full instantiation of the similarity matrix. 
To address\nthis, we propose a tile-based computation strategy that partitions the\ncontrastive loss calculation into arbitrary small blocks, avoiding full\nmaterialization of the similarity matrix. Furthermore, we introduce a\nmulti-level tiling strategy to leverage the hierarchical structure of\ndistributed systems, employing ring-based communication at the GPU level to\noptimize synchronization and fused kernels at the CUDA core level to reduce I/O\noverhead. Experimental results show that the proposed method scales batch sizes\nto unprecedented levels. For instance, it enables contrastive training of a\nCLIP-ViT-L/14 model with a batch size of 4M or 12M using 8 or 32 A800 80GB\nwithout sacrificing any accuracy. Compared to SOTA memory-efficient solutions,\nit achieves a two-order-of-magnitude reduction in memory while maintaining\ncomparable speed. The code will be made publicly available.\n","authors":["Zesen Cheng","Hang Zhang","Kehan Li","Sicong Leng","Zhiqiang Hu","Fei Wu","Deli Zhao","Xin Li","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2410.17243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17242v1","updated":"2024-10-22T17:58:28Z","published":"2024-10-22T17:58:28Z","title":"LVSM: A Large View Synthesis Model with Minimal 3D Inductive Bias","summary":" We propose the Large View Synthesis Model (LVSM), a novel transformer-based\napproach for scalable and generalizable novel view synthesis from sparse-view\ninputs. We introduce two architectures: (1) an encoder-decoder LVSM, which\nencodes input image tokens into a fixed number of 1D latent tokens, functioning\nas a fully learned scene representation, and decodes novel-view images from\nthem; and (2) a decoder-only LVSM, which directly maps input images to\nnovel-view outputs, completely eliminating intermediate scene representations.\nBoth models bypass the 3D inductive biases used in previous methods -- from 3D\nrepresentations (e.g., NeRF, 3DGS) to network designs (e.g., epipolar\nprojections, plane sweeps) -- addressing novel view synthesis with a fully\ndata-driven approach. While the encoder-decoder model offers faster inference\ndue to its independent latent representation, the decoder-only LVSM achieves\nsuperior quality, scalability, and zero-shot generalization, outperforming\nprevious state-of-the-art methods by 1.5 to 3.5 dB PSNR. Comprehensive\nevaluations across multiple datasets demonstrate that both LVSM variants\nachieve state-of-the-art novel view synthesis quality. Notably, our models\nsurpass all previous methods even with reduced computational resources (1-2\nGPUs). Please see our website for more details:\nhttps://haian-jin.github.io/projects/LVSM/ .\n","authors":["Haian Jin","Hanwen Jiang","Hao Tan","Kai Zhang","Sai Bi","Tianyuan Zhang","Fujun Luan","Noah Snavely","Zexiang Xu"],"pdf_url":"https://arxiv.org/pdf/2410.17242v1.pdf","comment":"project page: https://haian-jin.github.io/projects/LVSM/"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2410.17406v1","updated":"2024-10-22T20:28:57Z","published":"2024-10-22T20:28:57Z","title":"ProveRAG: Provenance-Driven Vulnerability Analysis with Automated\n Retrieval-Augmented LLMs","summary":" In cybersecurity, security analysts face the challenge of mitigating newly\ndiscovered vulnerabilities in real-time, with over 300,000 Common\nVulnerabilities and Exposures (CVEs) identified since 1999. The sheer volume of\nknown vulnerabilities complicates the detection of patterns for unknown\nthreats. 
While LLMs can assist, they often hallucinate and lack alignment with\nrecent threats. Over 25,000 vulnerabilities have been identified so far in\n2024, which are introduced after popular LLMs' (e.g., GPT-4) training data\ncutoff. This raises a major challenge of leveraging LLMs in cybersecurity,\nwhere accuracy and up-to-date information are paramount. In this work, we aim\nto improve the adaptation of LLMs in vulnerability analysis by mimicking how\nanalysts perform such tasks. We propose ProveRAG, an LLM-powered system\ndesigned to assist in rapidly analyzing CVEs with automated retrieval\naugmentation of web data while self-evaluating its responses with verifiable\nevidence. ProveRAG incorporates a self-critique mechanism to help alleviate\nomission and hallucination common in the output of LLMs applied in\ncybersecurity applications. The system cross-references data from verifiable\nsources (NVD and CWE), giving analysts confidence in the actionable insights\nprovided. Our results indicate that ProveRAG excels in delivering verifiable\nevidence to the user with over 99% and 97% accuracy in exploitation and\nmitigation strategies, respectively. This system outperforms direct prompting\nand chunking retrieval in vulnerability analysis by overcoming temporal and\ncontext-window limitations. ProveRAG guides analysts to secure their systems\nmore effectively while documenting the process for future audits.\n","authors":["Reza Fayyazi","Stella Hoyos Trueba","Michael Zuzak","Shanchieh Jay Yang"],"pdf_url":"https://arxiv.org/pdf/2410.17406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11724v2","updated":"2024-10-22T19:07:08Z","published":"2024-05-20T01:57:34Z","title":"Token-wise Influential Training Data Retrieval for Large Language Models","summary":" Given a Large Language Model (LLM) generation, how can we identify which\ntraining data led to this generation? In this paper, we proposed RapidIn, a\nscalable framework adapting to LLMs for estimating the influence of each\ntraining data. The proposed framework consists of two stages: caching and\nretrieval. First, we compress the gradient vectors by over 200,000x, allowing\nthem to be cached on disk or in GPU/CPU memory. Then, given a generation,\nRapidIn efficiently traverses the cached gradients to estimate the influence\nwithin minutes, achieving over a 6,326x speedup. Moreover, RapidIn supports\nmulti-GPU parallelization to substantially accelerate caching and retrieval.\nOur empirical result confirms the efficiency and effectiveness of RapidIn.\n","authors":["Huawei Lin","Jikai Long","Zhaozhuo Xu","Weijie Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.11724v2.pdf","comment":"Accepted to ACL 2024. Keywords: Influence Function, Influence\n Estimation, Training Data Attribution"},{"id":"http://arxiv.org/abs/2410.17337v1","updated":"2024-10-22T18:11:43Z","published":"2024-10-22T18:11:43Z","title":"Captions Speak Louder than Images (CASLIE): Generalizing Foundation\n Models for E-commerce from High-quality Multimodal Instruction Data","summary":" Leveraging multimodal data to drive breakthroughs in e-commerce applications\nthrough Multimodal Foundation Models (MFMs) is gaining increasing attention\nfrom the research community. However, there are significant challenges that\nhinder the optimal use of multimodal e-commerce data by foundation models: (1)\nthe scarcity of large-scale, high-quality multimodal benchmark datasets; and\n(2) the lack of effective multimodal information integration methods. 
To\naddress these challenges, in this paper, we introduce MMECInstruct, the\nfirst-ever, large-scale, and high-quality multimodal instruction dataset for\ne-commerce. We also develop CASLIE, a simple, lightweight, yet effective\nframework for integrating multimodal information for e-commerce. Leveraging\nMMECInstruct, we fine-tune a series of e-commerce MFMs within CASLIE, denoted\nas CASLIE models. Our comprehensive evaluation demonstrates that CASLIE models\nsubstantially outperform 5 categories of advanced baseline models in the\nin-domain evaluation. Moreover, CASLIE models show strong generalizability to\nout-of-domain settings. MMECInstruct and CASLIE models are publicly accessible\nthrough https://ninglab.github.io/CASLIE/.\n","authors":["Xinyi Ling","Bo Peng","Hanwen Du","Zhihui Zhu","Xia Ning"],"pdf_url":"https://arxiv.org/pdf/2410.17337v1.pdf","comment":"Xinyi Ling and Bo Peng contributed equally to this paper"},{"id":"http://arxiv.org/abs/2410.17331v1","updated":"2024-10-22T18:04:00Z","published":"2024-10-22T18:04:00Z","title":"Offline Evaluation of Set-Based Text-to-Image Generation","summary":" Text-to-Image (TTI) systems often support people during ideation, the early\nstages of a creative process when exposure to a broad set of relevant images\ncan help explore the design space. Since ideation is an important subclass of\nTTI tasks, understanding how to quantitatively evaluate TTI systems according\nto how well they support ideation is crucial to promoting research and\ndevelopment for these users. However, existing evaluation metrics for TTI\nremain focused on distributional similarity metrics like Fr\\'echet Inception\nDistance (FID). We take an alternative approach and, based on established\nmethods from ranking evaluation, develop TTI evaluation metrics with explicit\nmodels of how users browse and interact with sets of spatially arranged\ngenerated images. Our proposed offline evaluation metrics for TTI not only\ncapture how relevant generated images are with respect to the user's ideation\nneed but also take into consideration the diversity and arrangement of the set\nof generated images. We analyze our proposed family of TTI metrics using human\nstudies on image grids generated by three different TTI systems based on\nsubsets of the widely used benchmarks such as MS-COCO captions and Localized\nNarratives as well as prompts used in naturalistic settings. Our results\ndemonstrate that grounding metrics in how people use systems is an important\nand understudied area of benchmark design.\n","authors":["Negar Arabzadeh","Fernando Diaz","Junfeng He"],"pdf_url":"https://arxiv.org/pdf/2410.17331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17236v1","updated":"2024-10-22T17:54:45Z","published":"2024-10-22T17:54:45Z","title":"Large Language Models Empowered Personalized Web Agents","summary":" Web agents have emerged as a promising direction to automate Web task\ncompletion based on user instructions, significantly enhancing user experience.\nRecently, Web agents have evolved from traditional agents to Large Language\nModels (LLMs)-based Web agents. Despite their success, existing LLM-based Web\nagents overlook the importance of personalized data (e.g., user profiles and\nhistorical Web behaviors) in assisting the understanding of users' personalized\ninstructions and executing customized actions. 
To overcome the limitation, we\nfirst formulate the task of LLM-empowered personalized Web agents, which\nintegrate personalized data and user instructions to personalize instruction\ncomprehension and action execution. To address the absence of a comprehensive\nevaluation benchmark, we construct a Personalized Web Agent Benchmark\n(PersonalWAB), featuring user instructions, personalized user data, Web\nfunctions, and two evaluation paradigms across three personalized Web tasks.\nMoreover, we propose a Personalized User Memory-enhanced Alignment (PUMA)\nframework to adapt LLMs to the personalized Web agent task. PUMA utilizes a\nmemory bank with a task-specific retrieval strategy to filter relevant\nhistorical Web behaviors. Based on the behaviors, PUMA then aligns LLMs for\npersonalized action execution through fine-tuning and direct preference\noptimization. Extensive experiments validate the superiority of PUMA over\nexisting Web agents on PersonalWAB.\n","authors":["Hongru Cai","Yongqi Li","Wenjie Wang","Fengbin Zhu","Xiaoyu Shen","Wenjie Li","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.17236v1.pdf","comment":"The code and data are available on the project website\n https://hongrucai.github.io/PersonalWAB/"},{"id":"http://arxiv.org/abs/2407.12883v2","updated":"2024-10-22T17:49:31Z","published":"2024-07-16T17:58:27Z","title":"BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive\n Retrieval","summary":" Existing retrieval benchmarks primarily consist of information-seeking\nqueries (e.g., aggregated questions from search engines) where keyword or\nsemantic-based retrieval is usually sufficient. However, many complex\nreal-world queries require in-depth reasoning to identify relevant documents\nthat go beyond surface form matching. For example, finding documentation for a\ncoding question requires understanding the logic and syntax of the functions\ninvolved. To better benchmark retrieval on such challenging queries, we\nintroduce BRIGHT, the first text retrieval benchmark that requires intensive\nreasoning to retrieve relevant documents. Our dataset consists of 1,384\nreal-world queries spanning diverse domains, such as economics, psychology,\nmathematics, and coding. These queries are drawn from naturally occurring and\ncarefully curated human data. Extensive evaluation reveals that even\nstate-of-the-art retrieval models perform poorly on BRIGHT. The leading model\non the MTEB leaderboard (Muennighoff et al., 2023), which achieves a score of\n59.0 nDCG@10, produces a score of nDCG@10 of 18.3 on BRIGHT. We show that\nincorporating explicit reasoning about the query improves retrieval performance\nby up to 12.2 points. Moreover, incorporating retrieved documents from the\ntop-performing retriever boosts question-answering performance by over 6.6\npoints. We believe that BRIGHT paves the way for future research on retrieval\nsystems in more realistic and challenging settings.\n","authors":["Hongjin Su","Howard Yen","Mengzhou Xia","Weijia Shi","Niklas Muennighoff","Han-yu Wang","Haisu Liu","Quan Shi","Zachary S. Siegel","Michael Tang","Ruoxi Sun","Jinsung Yoon","Sercan O. 
Arik","Danqi Chen","Tao Yu"],"pdf_url":"https://arxiv.org/pdf/2407.12883v2.pdf","comment":"48 pages"},{"id":"http://arxiv.org/abs/2410.17152v1","updated":"2024-10-22T16:29:33Z","published":"2024-10-22T16:29:33Z","title":"Improving Pinterest Search Relevance Using Large Language Models","summary":" To improve relevance scoring on Pinterest Search, we integrate Large Language\nModels (LLMs) into our search relevance model, leveraging carefully designed\ntext representations to predict the relevance of Pins effectively. Our approach\nuses search queries alongside content representations that include captions\nextracted from a generative visual language model. These are further enriched\nwith link-based text data, historically high-quality engaged queries,\nuser-curated boards, Pin titles and Pin descriptions, creating robust models\nfor predicting search relevance. We use a semi-supervised learning approach to\nefficiently scale up the amount of training data, expanding beyond the\nexpensive human labeled data available. By utilizing multilingual LLMs, our\nsystem extends training data to include unseen languages and domains, despite\ninitial data and annotator expertise being confined to English. Furthermore, we\ndistill from the LLM-based model into real-time servable model architectures\nand features. We provide comprehensive offline experimental validation for our\nproposed techniques and demonstrate the gains achieved through the final\ndeployed system at scale.\n","authors":["Han Wang","Mukuntha Narayanan Sundararaman","Onur Gungor","Yu Xu","Krishna Kamath","Rakesh Chalasani","Kurchi Subhra Hazra","Jinfeng Rao"],"pdf_url":"https://arxiv.org/pdf/2410.17152v1.pdf","comment":"CIKM 2024 Workshop on Industrial Recommendation Systems"},{"id":"http://arxiv.org/abs/2410.17134v1","updated":"2024-10-22T16:06:33Z","published":"2024-10-22T16:06:33Z","title":"TELII: Temporal Event Level Inverted Indexing for Cohort Discovery on a\n Large Covid-19 EHR Dataset","summary":" Cohort discovery is a crucial step in clinical research on Electronic Health\nRecord (EHR) data. Temporal queries, which are common in cohort discovery, can\nbe time-consuming and prone to errors when processed on large EHR datasets. In\nthis work, we introduce TELII, a temporal event level inverted indexing method\ndesigned for cohort discovery on large EHR datasets. TELII is engineered to\npre-compute and store the relations along with the time difference between\nevents, thereby providing fast and accurate temporal query capabilities. We\nimplemented TELII for the OPTUM de-identified COVID-19 EHR dataset, which\ncontains data from 8.87 million patients. We demonstrate four common temporal\nquery tasks and their implementation using TELII with a MongoDB backend. Our\nresults show that the temporal query speed for TELII is up to 2000 times faster\nthan that of existing non-temporal inverted indexes. TELII achieves\nmillisecond-level response times, enabling users to quickly explore event\nrelations and find preliminary evidence for their research questions. Not only\nis TELII practical and straightforward to implement, but it also offers easy\nadaptability to other EHR datasets. 
These advantages underscore TELII's\npotential to serve as the query engine for EHR-based applications, ensuring\nfast, accurate, and user-friendly query responses.\n","authors":["Yan Huang"],"pdf_url":"https://arxiv.org/pdf/2410.17134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17112v1","updated":"2024-10-22T15:37:46Z","published":"2024-10-22T15:37:46Z","title":"Enhancing Answer Attribution for Faithful Text Generation with Large\n Language Models","summary":" The increasing popularity of Large Language Models (LLMs) in recent years has\nchanged the way users interact with and pose questions to AI-based\nconversational systems. An essential aspect for increasing the trustworthiness\nof generated LLM answers is the ability to trace the individual claims from\nresponses back to relevant sources that support them, the process known as\nanswer attribution. While recent work has started exploring the task of answer\nattribution in LLMs, some challenges still remain. In this work, we first\nperform a case study analyzing the effectiveness of existing answer attribution\nmethods, with a focus on subtasks of answer segmentation and evidence\nretrieval. Based on the observed shortcomings, we propose new methods for\nproducing more independent and contextualized claims for better retrieval and\nattribution. The new methods are evaluated and shown to improve the performance\nof answer attribution components. We end with a discussion and outline of\nfuture directions for the task.\n","authors":["Juraj Vladika","Luca Mülln","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2410.17112v1.pdf","comment":"Accepted to KDIR 2024 (part of IC3K 2024)"},{"id":"http://arxiv.org/abs/2312.10463v4","updated":"2024-10-22T12:40:25Z","published":"2023-12-16T14:42:46Z","title":"RecPrompt: A Self-tuning Prompting Framework for News Recommendation\n Using Large Language Models","summary":" News recommendations heavily rely on Natural Language Processing (NLP)\nmethods to analyze, understand, and categorize content, enabling personalized\nsuggestions based on user interests and reading behaviors. Large Language\nModels (LLMs) like GPT-4 have shown promising performance in understanding\nnatural language. However, the extent of their applicability to news\nrecommendation systems remains to be validated. This paper introduces\nRecPrompt, the first self-tuning prompting framework for news recommendation,\nleveraging the capabilities of LLMs to perform complex news recommendation\ntasks. This framework incorporates a news recommender and a prompt optimizer\nthat applies an iterative bootstrapping process to enhance recommendations\nthrough automatic prompt engineering. Extensive experimental results with 400\nusers show that RecPrompt can achieve an improvement of 3.36% in AUC, 10.49% in\nMRR, 9.64% in nDCG@5, and 6.20% in nDCG@10 compared to deep neural models.\nAdditionally, we introduce TopicScore, a novel metric to assess explainability\nby evaluating LLM's ability to summarize topics of interest for users. 
The\nresults show LLM's effectiveness in accurately identifying topics of interest\nand delivering comprehensive topic-based explanations.\n","authors":["Dairui Liu","Boming Yang","Honghui Du","Derek Greene","Neil Hurley","Aonghus Lawlor","Ruihai Dong","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2312.10463v4.pdf","comment":"5 pages, 2 figures, and 2 tables"},{"id":"http://arxiv.org/abs/2410.13226v2","updated":"2024-10-22T12:28:28Z","published":"2024-10-17T05:17:01Z","title":"Research on Travel Route Planing Problems Based on Greedy Algorithm","summary":" The route planning problem based on the greedy algorithm represents a method\nof identifying the optimal or near-optimal route between a given start point\nand end point. In this paper, the PCA method is employed initially to downscale\nthe city evaluation indexes, extract the key principal components, and then\ndownscale the data using the KMO and TOPSIS algorithms, all of which are based\non the MindSpore framework. Secondly, for the dataset that does not pass the\nKMO test, the entropy weight method and TOPSIS method will be employed for\ncomprehensive evaluation. Finally, a route planning algorithm is proposed and\noptimised based on the greedy algorithm, which provides personalised route\ncustomisation according to the different needs of tourists. In addition, the\nlocal travelling efficiency, the time required to visit tourist attractions and\nthe necessary daily breaks are considered in order to reduce the cost and avoid\nfalling into the locally optimal solution.\n","authors":["Yiquan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.13226v2.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2410.16853v1","updated":"2024-10-22T09:37:29Z","published":"2024-10-22T09:37:29Z","title":"Bridging the Modality Gap: Dimension Information Alignment and Sparse\n Spatial Constraint for Image-Text Matching","summary":" Many contrastive learning based models have achieved advanced performance in\nimage-text matching tasks. The key of these models lies in analyzing the\ncorrelation between image-text pairs, which involves cross-modal interaction of\nembeddings in corresponding dimensions. However, the embeddings of different\nmodalities are from different models or modules, and there is a significant\nmodality gap. Directly interacting such embeddings lacks rationality and may\ncapture inaccurate correlation. Therefore, we propose a novel method called\nDIAS to bridge the modality gap from two aspects: (1) We align the information\nrepresentation of embeddings from different modalities in corresponding\ndimension to ensure the correlation calculation is based on interactions of\nsimilar information. (2) The spatial constraints of inter- and intra-modalities\nunmatched pairs are introduced to ensure the effectiveness of semantic\nalignment of the model. Besides, a sparse correlation algorithm is proposed to\nselect strong correlated spatial relationships, enabling the model to learn\nmore significant features and avoid being misled by weak correlation. 
Extensive\nexperiments demonstrate the superiority of DIAS, achieving 4.3\\%-10.2\\% rSum\nimprovements on Flickr30k and MSCOCO benchmarks.\n","authors":["Xiang Ma","Xuemei Li","Lexin Fang","Caiming Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.16853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16838v1","updated":"2024-10-22T09:17:05Z","published":"2024-10-22T09:17:05Z","title":"Neural Collaborative Filtering Classification Model to Obtain Prediction\n Reliabilities","summary":" Neural collaborative filtering is the state of art field in the recommender\nsystems area; it provides some models that obtain accurate predictions and\nrecommendations. These models are regression-based, and they just return rating\npredictions. This paper proposes the use of a classification-based approach,\nreturning both rating predictions and their reliabilities. The extra\ninformation (prediction reliabilities) can be used in a variety of relevant\ncollaborative filtering areas such as detection of shilling attacks,\nrecommendations explanation or navigational tools to show users and items\ndependences. Additionally, recommendation reliabilities can be gracefully\nprovided to users: \"probably you will like this film\", \"almost certainly you\nwill like this song\", etc. This paper provides the proposed neural\narchitecture; it also tests that the quality of its recommendation results is\nas good as the state of art baselines. Remarkably, individual rating\npredictions are improved by using the proposed architecture compared to\nbaselines. Experiments have been performed making use of four popular public\ndatasets, showing generalizable quality results. Overall, the proposed\narchitecture improves individual rating predictions quality, maintains\nrecommendation results and opens the doors to a set of relevant collaborative\nfiltering fields.\n","authors":["Jesús Bobadilla","Abraham Gutiérrez","Santiago Alonso","Ángel González-Prieto"],"pdf_url":"https://arxiv.org/pdf/2410.16838v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2410.16823v1","updated":"2024-10-22T08:49:43Z","published":"2024-10-22T08:49:43Z","title":"Bridging Search and Recommendation in Generative Retrieval: Does One\n Task Help the Other?","summary":" Generative retrieval for search and recommendation is a promising paradigm\nfor retrieving items, offering an alternative to traditional methods that\ndepend on external indexes and nearest-neighbor searches. Instead, generative\nmodels directly associate inputs with item IDs. Given the breakthroughs of\nLarge Language Models (LLMs), these generative systems can play a crucial role\nin centralizing a variety of Information Retrieval (IR) tasks in a single model\nthat performs tasks such as query understanding, retrieval, recommendation,\nexplanation, re-ranking, and response generation. Despite the growing interest\nin such a unified generative approach for IR systems, the advantages of using a\nsingle, multi-task model over multiple specialized models are not well\nestablished in the literature. This paper investigates whether and when such a\nunified approach can outperform task-specific models in the IR tasks of search\nand recommendation, broadly co-existing in multiple industrial online\nplatforms, such as Spotify, YouTube, and Netflix. 
Previous work shows that (1)\nthe latent representations of items learned by generative recommenders are\nbiased towards popularity, and (2) content-based and\ncollaborative-filtering-based information can improve an item's\nrepresentations. Motivated by this, our study is guided by two hypotheses: [H1]\nthe joint training regularizes the estimation of each item's popularity, and\n[H2] the joint training regularizes the item's latent representations, where\nsearch captures content-based aspects of an item and recommendation captures\ncollaborative-filtering aspects. Our extensive experiments with both simulated\nand real-world data support both [H1] and [H2] as key contributors to the\neffectiveness improvements observed in the unified search and recommendation\ngenerative models over the single-task approaches.\n","authors":["Gustavo Penha","Ali Vardasbi","Enrico Palumbo","Marco de Nadai","Hugues Bouchard"],"pdf_url":"https://arxiv.org/pdf/2410.16823v1.pdf","comment":"Accepted for publication in the 18th ACM Conference on Recommender\n Systems (RecSys'24)"},{"id":"http://arxiv.org/abs/2410.16780v1","updated":"2024-10-22T07:53:41Z","published":"2024-10-22T07:53:41Z","title":"Beyond Retrieval: Generating Narratives in Conversational Recommender\n Systems","summary":" The recent advances in Large Language Model's generation and reasoning\ncapabilities present an opportunity to develop truly conversational\nrecommendation systems. However, effectively integrating recommender system\nknowledge into LLMs for natural language generation which is tailored towards\nrecommendation tasks remains a challenge. This paper addresses this challenge\nby making two key contributions.\n First, we introduce a new dataset (REGEN) for natural language generation\ntasks in conversational recommendations. REGEN (Reviews Enhanced with\nGEnerative Narratives) extends the Amazon Product Reviews dataset with rich\nuser narratives, including personalized explanations of product preferences,\nproduct endorsements for recommended items, and summaries of user purchase\nhistory. REGEN is made publicly available to facilitate further research.\nFurthermore, we establish benchmarks using well-known generative metrics, and\nperform an automated evaluation of the new dataset using a rater LLM. Second,\nthe paper introduces a fusion architecture (CF model with an LLM) which serves\nas a baseline for REGEN. And to the best of our knowledge, represents the first\nattempt to analyze the capabilities of LLMs in understanding recommender\nsignals and generating rich narratives. We demonstrate that LLMs can\neffectively learn from simple fusion architectures utilizing interaction-based\nCF embeddings, and this can be further enhanced using the metadata and\npersonalization data associated with items. Our experiments show that combining\nCF and content embeddings leads to improvements of 4-12% in key language\nmetrics compared to using either type of embedding individually. 
We also\nprovide an analysis to interpret how CF and content embeddings contribute to\nthis new generative task.\n","authors":["Krishna Sayana","Raghavendra Vasudeva","Yuri Vasilevski","Kun Su","Liam Hebert","Hubert Pham","Ambarish Jash","Sukhdeep Sodhi"],"pdf_url":"https://arxiv.org/pdf/2410.16780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13428v3","updated":"2024-10-22T07:33:52Z","published":"2024-10-17T10:51:34Z","title":"Generate and Instantiate What You Prefer: Text-Guided Diffusion for\n Sequential Recommendation","summary":" Recent advancements in generative recommendation systems, particularly in the\nrealm of sequential recommendation tasks, have shown promise in enhancing\ngeneralization to new items. Among these approaches, diffusion-based generative\nrecommendation has emerged as an effective tool, leveraging its ability to\ncapture data distributions and generate high-quality samples. Despite\neffectiveness, two primary challenges have been identified: 1) the lack of\nconsistent modeling of data distribution for oracle items; and 2) the\ndifficulty in scaling to more informative control signals beyond historical\ninteractions. These issues stem from the uninformative nature of ID embeddings,\nwhich necessitate random initialization and limit the incorporation of\nadditional control signals. To address these limitations, we propose iDreamRec\nto involve more concrete prior knowledge to establish item embeddings,\nparticularly through detailed item text descriptions and advanced Text\nEmbedding Models (TEM). More importantly, by converting item descriptions into\nembeddings aligned with TEM, we enable the integration of intention\ninstructions as control signals to guide the generation of oracle items.\nExperimental results on four datasets demonstrate that iDreamRec not only\noutperforms existing diffusion-based generative recommenders but also\nfacilitates the incorporation of intention instructions for more precise and\neffective recommendation generation.\n","authors":["Guoqing Hu","Zhengyi Yang","Zhibo Cai","An Zhang","Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2410.13428v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16755v1","updated":"2024-10-22T07:20:37Z","published":"2024-10-22T07:20:37Z","title":"Coarse-to-fine Dynamic Uplift Modeling for Real-time Video\n Recommendation","summary":" With the rise of short video platforms, video recommendation technology faces\nmore complex challenges. Currently, there are multiple non-personalized modules\nin the video recommendation pipeline that urgently need personalized modeling\ntechniques for improvement. Inspired by the success of uplift modeling in\nonline marketing, we attempt to implement uplift modeling in the video\nrecommendation scenario. However, we face two main challenges: 1) Design and\nutilization of treatments, and 2) Capture of user real-time interest. To\naddress them, we design adjusting the distribution of videos with varying\ndurations as the treatment and propose Coarse-to-fine Dynamic Uplift Modeling\n(CDUM) for real-time video recommendation. CDUM consists of two modules, CPM\nand FIC. The former module fully utilizes the offline features of users to\nmodel their long-term preferences, while the latter module leverages online\nreal-time contextual features and request-level candidates to model users'\nreal-time interests. 
These two modules work together to dynamically identify\nand targeting specific user groups and applying treatments effectively.\nFurther, we conduct comprehensive experiments on the offline public and\nindustrial datasets and online A/B test, demonstrating the superiority and\neffectiveness of our proposed CDUM. Our proposed CDUM is eventually fully\ndeployed on the Kuaishou platform, serving hundreds of millions of users every\nday. The source code will be provided after the paper is accepted.\n","authors":["Chang Meng","Chenhao Zhai","Xueliang Wang","Shuchang Liu","Xiaoqiang Feng","Lantao Hu","Xiu Li","Han Li","Kun Gai"],"pdf_url":"https://arxiv.org/pdf/2410.16755v1.pdf","comment":"9 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2406.17378v2","updated":"2024-10-22T06:32:10Z","published":"2024-06-25T08:55:12Z","title":"A Text is Worth Several Tokens: Text Embedding from LLMs Secretly Aligns\n Well with The Key Tokens","summary":" Text embeddings from large language models (LLMs) have achieved excellent\nresults in tasks such as information retrieval, semantic textual similarity,\netc. In this work, we show an interesting finding: when feeding a text into the\nembedding LLMs, the obtained text embedding will be able to be aligned with the\nkey tokens in the input text. We first fully analyze this phenomenon on eight\nembedding LLMs and show that this phenomenon is universal and is not affected\nby model architecture, training strategy, and embedding method. With a deeper\nanalysis, we then find that the main change in embedding space between the\nembedding LLMs and their original generative LLMs is in the first principal\ncomponent. By adjusting the first principal component, we can align text\nembedding with the key tokens. Finally, we give several examples to demonstrate\nthe vast application potential of this finding: (1) we propose a simple and\npractical sparse retrieval method based on the aligned tokens, which can\nachieve 80\\% of the dense retrieval effect of the same model while reducing the\ncomputation significantly; (2) we show that our findings provide a fresh\nperspective to help understand fuzzy concepts (e.g., semantic relatedness vs.\nsemantic similarity) and emerging technologies (e.g., instruction-following\nembedding) in this field.\n","authors":["Zhijie Nie","Richong Zhang","Zhanyu Wu"],"pdf_url":"https://arxiv.org/pdf/2406.17378v2.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2410.15586v2","updated":"2024-10-22T03:07:11Z","published":"2024-10-21T02:11:48Z","title":"Automatic Search of Multiword Place Names on Historical Maps","summary":" Historical maps are invaluable sources of information about the past, and\nscanned historical maps are increasingly accessible in online libraries. To\nretrieve maps from these large libraries that contain specific places of\ninterest, previous work has applied computer vision techniques to recognize\nwords on historical maps, enabling searches for maps that contain specific\nplace names. However, searching for multiword place names is challenging due to\ncomplex layouts of text labels on historical maps. This paper proposes an\nefficient query method for searching a given multiword place name on historical\nmaps. Using existing methods to recognize words on historical maps, we link\nsingle-word text labels into potential multiword phrases by constructing\nminimum spanning trees. These trees aim to link pairs of text labels that are\nspatially close and have similar height, angle, and capitalization. 
We then\nquery these trees for the given multiword place name. We evaluate the proposed\nmethod in two experiments: 1) to evaluate the accuracy of the minimum spanning\ntree approach at linking multiword place names and 2) to evaluate the number\nand time range of maps retrieved by the query approach. The resulting maps\nreveal how places using multiword names have changed on a large number of maps\nfrom across history.\n","authors":["Rhett Olson","Jina Kim","Yao-Yi Chiang"],"pdf_url":"https://arxiv.org/pdf/2410.15586v2.pdf","comment":"4 pages, 4 figures, and 2 tables. To be published in proceedings ACM\n SIGSPATIAL 2024 GeoSearch Workshop"},{"id":"http://arxiv.org/abs/2410.16629v1","updated":"2024-10-22T02:18:44Z","published":"2024-10-22T02:18:44Z","title":"Cutting Through the Confusion and Hype: Understanding the True Potential\n of Generative AI","summary":" This paper explores the nuanced landscape of generative AI (genAI),\nparticularly focusing on neural network-based models like Large Language Models\n(LLMs). While genAI garners both optimistic enthusiasm and sceptical criticism,\nthis work seeks to provide a balanced examination of its capabilities,\nlimitations, and the profound impact it may have on societal functions and\npersonal interactions. The first section demystifies language-based genAI\nthrough detailed discussions on how LLMs learn, their computational needs,\ndistinguishing features from supporting technologies, and the inherent\nlimitations in their accuracy and reliability. Real-world examples illustrate\nthe practical applications and implications of these technologies. The latter\npart of the paper adopts a systems perspective, evaluating how the integration\nof LLMs with existing technologies can enhance productivity and address\nemerging concerns. It highlights the need for significant investment to\nunderstand the implications of recent advancements, advocating for a\nwell-informed dialogue to ethically and responsibly integrate genAI into\ndiverse sectors. The paper concludes with prospective developments and\nrecommendations, emphasizing a forward-looking approach to harnessing genAI`s\npotential while mitigating its risks.\n","authors":["Ante Prodan","Jo-An Occhipinti","Rehez Ahlip","Goran Ujdur","Harris A. Eyre","Kyle Goosen","Luke Penza","Mark Heffernan"],"pdf_url":"https://arxiv.org/pdf/2410.16629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16597v1","updated":"2024-10-22T00:47:54Z","published":"2024-10-22T00:47:54Z","title":"Distill-SynthKG: Distilling Knowledge Graph Synthesis Workflow for\n Improved Coverage and Efficiency","summary":" Knowledge graphs (KGs) generated by large language models (LLMs) are becoming\nincreasingly valuable for Retrieval-Augmented Generation (RAG) applications\nthat require knowledge-intensive reasoning. However, existing KG extraction\nmethods predominantly rely on prompt-based approaches, which are inefficient\nfor processing large-scale corpora. These approaches often suffer from\ninformation loss, particularly with long documents, due to the lack of\nspecialized design for KG construction. Additionally, there is a gap in\nevaluation datasets and methodologies for ontology-free KG construction. To\novercome these limitations, we propose SynthKG, a multi-step, document-level\nontology-free KG synthesis workflow based on LLMs. 
By fine-tuning a smaller LLM\non the synthesized document-KG pairs, we streamline the multi-step process into\na single-step KG generation approach called Distill-SynthKG, substantially\nreducing the number of LLM inference calls. Furthermore, we re-purpose existing\nquestion-answering datasets to establish KG evaluation datasets and introduce\nnew evaluation metrics. Using KGs produced by Distill-SynthKG, we also design a\nnovel graph-based retrieval framework for RAG. Experimental results demonstrate\nthat Distill-SynthKG not only surpasses all baseline models in KG quality --\nincluding models up to eight times larger -- but also consistently excels in\nretrieval and question-answering tasks. Our proposed graph retrieval framework\nalso outperforms all KG-retrieval methods across multiple benchmark datasets.\nWe release the SynthKG dataset and Distill-SynthKG model publicly to support\nfurther research and development.\n","authors":["Prafulla Kumar Choubey","Xin Su","Man Luo","Xiangyu Peng","Caiming Xiong","Tiep Le","Shachar Rosenman","Vasudev Lal","Phil Mui","Ricky Ho","Phillip Howard","Chien-Sheng Wu"],"pdf_url":"https://arxiv.org/pdf/2410.16597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16586v1","updated":"2024-10-22T00:11:41Z","published":"2024-10-22T00:11:41Z","title":"Optimizing LLMs with Direct Preferences: A Data Efficiency Perspective","summary":" Aligning the output of Large Language Models (LLMs) with human preferences\n(e.g., by means of reinforcement learning with human feedback, or RLHF) is\nessential for ensuring their effectiveness in real-world scenarios. Despite\nsignificant advancements in LLM alignment techniques, the impact of different\ntype of preference data on model performance has yet to be systematically\nexplored. In this study, we investigate the scalability, data efficiency, and\neffectiveness of Direct Preference Optimization (DPO) in fine-tuning\npre-trained LLMs, aiming to reduce their dependency on extensive amounts of\npreference data, which is expensive to collect. We (1) systematically compare\nthe performance of models fine-tuned with varying percentages of a combined\npreference judgement dataset to define the improvement curve of DPO and assess\nits effectiveness in data-constrained environments; and (2) provide insights\nfor the development of an optimal approach for selective preference data usage.\nOur study reveals that increasing the amount of data used for training\ngenerally enhances and stabilizes model performance. Moreover, the use of a\ncombination of diverse datasets significantly improves model effectiveness.\nFurthermore, when models are trained separately using different types of\nprompts, models trained with conversational prompts outperformed those trained\nwith question answering prompts.\n","authors":["Pietro Bernardelle","Gianluca Demartini"],"pdf_url":"https://arxiv.org/pdf/2410.16586v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.11402v2","updated":"2024-10-22T23:13:34Z","published":"2024-09-17T17:59:06Z","title":"NVLM: Open Frontier-Class Multimodal LLMs","summary":" We introduce NVLM 1.0, a family of frontier-class multimodal large language\nmodels (LLMs) that achieve state-of-the-art results on vision-language tasks,\nrivaling the leading proprietary models (e.g., GPT-4o) and open-access models\n(e.g., Llama 3-V 405B and InternVL 2). Remarkably, NVLM 1.0 shows improved\ntext-only performance over its LLM backbone after multimodal training. 
In terms\nof model design, we perform a comprehensive comparison between decoder-only\nmultimodal LLMs (e.g., LLaVA) and cross-attention-based models (e.g.,\nFlamingo). Based on the strengths and weaknesses of both approaches, we propose\na novel architecture that enhances both training efficiency and multimodal\nreasoning capabilities. Furthermore, we introduce a 1-D tile-tagging design for\ntile-based dynamic high-resolution images, which significantly boosts\nperformance on multimodal reasoning and OCR-related tasks. Regarding training\ndata, we meticulously curate and provide detailed information on our multimodal\npretraining and supervised fine-tuning datasets. Our findings indicate that\ndataset quality and task diversity are more important than scale, even during\nthe pretraining phase, across all architectures. Notably, we develop\nproduction-grade multimodality for the NVLM-1.0 models, enabling them to excel\nin vision-language tasks while maintaining and even improving text-only\nperformance compared to their LLM backbones. To achieve this, we craft and\nintegrate a high-quality text-only dataset into multimodal training, alongside\na substantial amount of multimodal math and reasoning data, leading to enhanced\nmath and coding capabilities across modalities. To advance research in the\nfield, we release the model weights at https://huggingface.co/nvidia/NVLM-D-72B\nand will open-source the training code for the community soon.\n","authors":["Wenliang Dai","Nayeon Lee","Boxin Wang","Zhuolin Yang","Zihan Liu","Jon Barker","Tuomas Rintamaki","Mohammad Shoeybi","Bryan Catanzaro","Wei Ping"],"pdf_url":"https://arxiv.org/pdf/2409.11402v2.pdf","comment":"Fixed the typos. For more information, please visit our project page\n at: https://research.nvidia.com/labs/adlr/NVLM-1"},{"id":"http://arxiv.org/abs/2408.11982v3","updated":"2024-10-22T16:58:09Z","published":"2024-08-21T20:32:45Z","title":"AIM 2024 Challenge on Compressed Video Quality Assessment: Methods and\n Results","summary":" Video quality assessment (VQA) is a crucial task in the development of video\ncompression standards, as it directly impacts the viewer experience. This paper\npresents the results of the Compressed Video Quality Assessment challenge, held\nin conjunction with the Advances in Image Manipulation (AIM) workshop at ECCV\n2024. The challenge aimed to evaluate the performance of VQA methods on a\ndiverse dataset of 459 videos, encoded with 14 codecs of various compression\nstandards (AVC/H.264, HEVC/H.265, AV1, and VVC/H.266) and containing a\ncomprehensive collection of compression artifacts. To measure the methods\nperformance, we employed traditional correlation coefficients between their\npredictions and subjective scores, which were collected via large-scale\ncrowdsourced pairwise human comparisons. For training purposes, participants\nwere provided with the Compressed Video Quality Assessment Dataset (CVQAD), a\npreviously developed dataset of 1022 videos. Up to 30 participating teams\nregistered for the challenge, while we report the results of 6 teams, which\nsubmitted valid final solutions and code for reproducing the results. 
Moreover,\nwe calculated and present the performance of state-of-the-art VQA methods on\nthe developed dataset, providing a comprehensive benchmark for future research.\nThe dataset, results, and online leaderboard are publicly available at\nhttps://challenges.videoprocessing.ai/challenges/compressedvideo-quality-assessment.html.\n","authors":["Maksim Smirnov","Aleksandr Gushchin","Anastasia Antsiferova","Dmitry Vatolin","Radu Timofte","Ziheng Jia","Zicheng Zhang","Wei Sun","Jiaying Qian","Yuqin Cao","Yinan Sun","Yuxin Zhu","Xiongkuo Min","Guangtao Zhai","Kanjar De","Qing Luo","Ao-Xiang Zhang","Peng Zhang","Haibo Lei","Linyan Jiang","Yaqing Li","Wenhui Meng","Zhenzhong Chen","Zhengxue Cheng","Jiahao Xiao","Jun Xu","Chenlong He","Qi Zheng","Ruoxi Zhu","Min Li","Yibo Fan","Zhengzhong Tu"],"pdf_url":"https://arxiv.org/pdf/2408.11982v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17073v1","updated":"2024-10-22T14:50:19Z","published":"2024-10-22T14:50:19Z","title":"Personalized Playback Technology: How Short Video Services Create\n Excellent User Experience","summary":" Short-form video content has become increasingly popular and influential in\nrecent years. Its concise yet engaging format aligns well with todays'\nfast-paced and on-the-go lifestyles, making it a dominating trend in the\ndigital world. As one of the front runners in the short video platform space,\nByteDance has been highly successful in delivering a one-of-a-kind short video\nexperience and attracting billions of users worldwide. One key contributing\nfactor is its advanced end-to-end personalized short video playback technology,\nwhere we pioneered and developed the new technical field over the past five\nyears to optimize user experience. This paper introduces the major concepts and\nmethodologies of this personalized video playback technology that distinguish\nit from traditional multimedia technologies. More details, including goal\nsetting, iterative process, modeling, experimental methods and required\nsupporting systems, are also provided to encourage deeper research in this\narea.\n","authors":["Weihui Deng","Zhiwei Fan","Deliang Fu","Yun Gong","Shenglan Huang","Xiaocheng Li","Zheng Li","Yiting Liao","He Liu","Chunyu Qiao","Bin Wang","Zhen Wang","Zhengyu Xiong"],"pdf_url":"https://arxiv.org/pdf/2410.17073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16532v2","updated":"2024-10-22T14:40:15Z","published":"2024-08-29T13:43:36Z","title":"WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio\n Language Modeling","summary":" Language models have been effectively applied to modeling natural signals,\nsuch as images, video, speech, and audio. A crucial component of these models\nis the codec tokenizer, which compresses high-dimensional natural signals into\nlower-dimensional discrete tokens. In this paper, we introduce WavTokenizer,\nwhich offers several advantages over previous SOTA acoustic codec models in the\naudio domain: 1)extreme compression. By compressing the layers of quantizers\nand the temporal dimension of the discrete codec, one-second audio of 24kHz\nsampling rate requires only a single quantizer with 40 or 75 tokens. 2)improved\nsubjective quality. Despite the reduced number of tokens, WavTokenizer achieves\nstate-of-the-art reconstruction quality with outstanding UTMOS scores and\ninherently contains richer semantic information. 
Specifically, we achieve these\nresults by designing a broader VQ space, extended contextual windows, and\nimproved attention networks, as well as introducing a powerful multi-scale\ndiscriminator and an inverse Fourier transform structure. We conducted\nextensive reconstruction experiments in the domains of speech, audio, and\nmusic. WavTokenizer exhibited strong performance across various objective and\nsubjective metrics compared to state-of-the-art models. We also tested semantic\ninformation, VQ utilization, and adaptability to generative models.\nComprehensive ablation studies confirm the necessity of each module in\nWavTokenizer. The related code, demos, and pre-trained models are available at\nhttps://github.com/jishengpeng/WavTokenizer.\n","authors":["Shengpeng Ji","Ziyue Jiang","Wen Wang","Yifu Chen","Minghui Fang","Jialong Zuo","Qian Yang","Xize Cheng","Zehan Wang","Ruiqi Li","Ziang Zhang","Xiaoda Yang","Rongjie Huang","Yidi Jiang","Qian Chen","Siqi Zheng","Wen Wang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.16532v2.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2410.15778v2","updated":"2024-10-22T05:01:28Z","published":"2024-10-21T08:42:30Z","title":"Reducing Hallucinations in Vision-Language Models via Latent Space\n Steering","summary":" Hallucination poses a challenge to the deployment of large vision-language\nmodels (LVLMs) in applications. Unlike in large language models (LLMs),\nhallucination in LVLMs often arises from misalignments between visual inputs\nand textual outputs. This paper investigates the underlying mechanisms of\nhallucination, focusing on the unique structure of LVLMs that distinguishes\nthem from large language models (LLMs). We identify that hallucinations often\narise from the sensitivity of text decoders to vision inputs, a natural\nphenomenon when image encoders and text decoders are pre-trained separately.\nInspired by this, we introduce Visual and Textual Intervention (VTI), a novel\ntechnique designed to reduce hallucinations by steering latent space\nrepresentations during inference to enhance the stability of vision features.\nAs a task-agnostic test-time intervention, VTI can be easily applied to any\nproblem without additional cost. Extensive experiments demonstrate that it can\neffectively reduce hallucinations and outperform baseline methods across\nmultiple metrics, highlighting the critical role of vision feature stability in\nLVLMs.\n","authors":["Sheng Liu","Haotian Ye","Lei Xing","James Zou"],"pdf_url":"https://arxiv.org/pdf/2410.15778v2.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2404.01617v2","updated":"2024-10-22T04:09:15Z","published":"2024-04-02T03:43:55Z","title":"Designing Network Algorithms via Large Language Models","summary":" We introduce NADA, the first framework to autonomously design network\nalgorithms by leveraging the generative capabilities of large language models\n(LLMs). Starting with an existing algorithm implementation, NADA enables LLMs\nto create a wide variety of alternative designs in the form of code blocks. It\nthen efficiently identifies the top-performing designs through a series of\nfiltering techniques, minimizing the need for full-scale evaluations and\nsignificantly reducing computational costs. 
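[Illustrative note, not part of the cached abstract.] In spirit, the generate-then-filter loop described above might look like the hedged sketch below. Both `llm_propose_variants` and `quick_score` are hypothetical stand-ins (the paper does not specify these interfaces); only the surviving candidates would go on to full-scale evaluation.

```python
# Sketch of an LLM-driven design loop in the spirit of the approach above:
# generate many candidate code variants, cheaply filter them, and keep only the
# most promising ones for expensive full-scale evaluation.
# llm_propose_variants() and quick_score() are hypothetical placeholders.
from typing import Callable, List

def search_designs(seed_code: str,
                   llm_propose_variants: Callable[[str, int], List[str]],
                   quick_score: Callable[[str], float],
                   n_variants: int = 50,
                   keep_top: int = 5) -> List[str]:
    """Return the top candidate designs according to a cheap filter."""
    candidates = llm_propose_variants(seed_code, n_variants)      # LLM-generated code blocks
    scored = [(quick_score(code), code) for code in candidates]   # cheap proxy evaluation
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [code for _, code in scored[:keep_top]]                # survivors go to full evaluation
```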
Using adaptive bitrate (ABR)\nstreaming as a case study, we demonstrate that NADA produces novel ABR\nalgorithms -- previously unknown to human developers -- that consistently\noutperform the original algorithm in diverse network environments, including\nbroadband, satellite, 4G, and 5G.\n","authors":["Zhiyuan He","Aashish Gottipati","Lili Qiu","Xufang Luo","Kenuo Xu","Yuqing Yang","Francis Y. Yan"],"pdf_url":"https://arxiv.org/pdf/2404.01617v2.pdf","comment":null}]},"2024-10-21T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2410.16458v1","updated":"2024-10-21T19:34:40Z","published":"2024-10-21T19:34:40Z","title":"STAR: A Simple Training-free Approach for Recommendations using Large\n Language Models","summary":" Recent progress in large language models (LLMs) offers promising new\napproaches for recommendation system (RecSys) tasks. While the current\nstate-of-the-art methods rely on fine-tuning LLMs to achieve optimal results,\nthis process is costly and introduces significant engineering complexities.\nConversely, methods that bypass fine-tuning and use LLMs directly are less\nresource-intensive but often fail to fully capture both semantic and\ncollaborative information, resulting in sub-optimal performance compared to\ntheir fine-tuned counterparts. In this paper, we propose a Simple Training-free\nApproach for Recommendation (STAR), a framework that utilizes LLMs and can be\napplied to various recommendation tasks without the need for fine-tuning. Our\napproach involves a retrieval stage that uses semantic embeddings from LLMs\ncombined with collaborative user information to retrieve candidate items. We\nthen apply an LLM for pairwise ranking to enhance next-item prediction.\nExperimental results on the Amazon Review dataset show competitive performance\nfor next item prediction, even with our retrieval stage alone. Our full method\nachieves Hits@10 performance of +23.8% on Beauty, +37.5% on Toys and Games, and\n-1.8% on Sports and Outdoors relative to the best supervised models. This\nframework offers an effective alternative to traditional supervised models,\nhighlighting the potential of LLMs in recommendation systems without extensive\ntraining or custom architectures.\n","authors":["Dong-Ho Lee","Adam Kraft","Long Jin","Nikhil Mehta","Taibai Xu","Lichan Hong","Ed H. Chi","Xinyang Yi"],"pdf_url":"https://arxiv.org/pdf/2410.16458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16156v1","updated":"2024-10-21T16:21:45Z","published":"2024-10-21T16:21:45Z","title":"Limpeh ga li gong: Challenges in Singlish Annotations","summary":" Singlish, or Colloquial Singapore English, is a language formed from oral and\nsocial communication within multicultural Singapore. In this work, we work on a\nfundamental Natural Language Processing (NLP) task: Parts-Of-Speech (POS)\ntagging of Singlish sentences. For our analysis, we build a parallel Singlish\ndataset containing direct English translations and POS tags, with translation\nand POS annotation done by native Singlish speakers. Our experiments show that\nautomatic transition- and transformer- based taggers perform with only $\\sim\n80\\%$ accuracy when evaluated against human-annotated POS labels, suggesting\nthat there is indeed room for improvement on computation analysis of the\nlanguage. 
We provide an exposition of challenges in Singlish annotation: its\ninconsistencies in form and semantics, the highly context-dependent particles\nof the language, its structural unique expressions, and the variation of the\nlanguage on different mediums. Our task definition, resultant labels and\nresults reflects the challenges in analysing colloquial languages formulated\nfrom a variety of dialects, and paves the way for future studies beyond POS\ntagging.\n","authors":["Lynnette Hui Xian Ng","Luo Qi Chan"],"pdf_url":"https://arxiv.org/pdf/2410.16156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16148v1","updated":"2024-10-21T16:17:22Z","published":"2024-10-21T16:17:22Z","title":"PODTILE: Facilitating Podcast Episode Browsing with Auto-generated\n Chapters","summary":" Listeners of long-form talk-audio content, such as podcast episodes, often\nfind it challenging to understand the overall structure and locate relevant\nsections. A practical solution is to divide episodes into\nchapters--semantically coherent segments labeled with titles and timestamps.\nSince most episodes on our platform at Spotify currently lack creator-provided\nchapters, automating the creation of chapters is essential. Scaling the\nchapterization of podcast episodes presents unique challenges. First, episodes\ntend to be less structured than written texts, featuring spontaneous\ndiscussions with nuanced transitions. Second, the transcripts are usually\nlengthy, averaging about 16,000 tokens, which necessitates efficient processing\nthat can preserve context. To address these challenges, we introduce PODTILE, a\nfine-tuned encoder-decoder transformer to segment conversational data. The\nmodel simultaneously generates chapter transitions and titles for the input\ntranscript. To preserve context, each input text is augmented with global\ncontext, including the episode's title, description, and previous chapter\ntitles. In our intrinsic evaluation, PODTILE achieved an 11% improvement in\nROUGE score over the strongest baseline. Additionally, we provide insights into\nthe practical benefits of auto-generated chapters for listeners navigating\nepisode content. Our findings indicate that auto-generated chapters serve as a\nuseful tool for engaging with less popular podcasts. Finally, we present\nempirical evidence that using chapter titles can enhance effectiveness of\nsparse retrieval in search tasks.\n","authors":["Azin Ghazimatin","Ekaterina Garmash","Gustavo Penha","Kristen Sheets","Martin Achenbach","Oguz Semerci","Remi Galvez","Marcus Tannenberg","Sahitya Mantravadi","Divya Narayanan","Ofeliya Kalaydzhyan","Douglas Cole","Ben Carterette","Ann Clifton","Paul N. Bennett","Claudia Hauff","Mounia Lalmas"],"pdf_url":"https://arxiv.org/pdf/2410.16148v1.pdf","comment":"9 pages, 4 figures, CIKM industry track 2024"},{"id":"http://arxiv.org/abs/2410.16080v1","updated":"2024-10-21T14:58:38Z","published":"2024-10-21T14:58:38Z","title":"Unleashing the Potential of Multi-Channel Fusion in Retrieval for\n Personalized Recommendations","summary":" Recommender systems (RS) are pivotal in managing information overload in\nmodern digital services. A key challenge in RS is efficiently processing vast\nitem pools to deliver highly personalized recommendations under strict latency\nconstraints. Multi-stage cascade ranking addresses this by employing\ncomputationally efficient retrieval methods to cover diverse user interests,\nfollowed by more precise ranking models to refine the results. 
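[Illustrative note, not part of the cached abstract.] The cascade idea just described can be pictured as a cheap vector-similarity stage that narrows a large item pool to a short candidate list, followed by a more expensive scorer applied only to those candidates. The embeddings and the reranker below are random placeholders, not the systems discussed in the paper.

```python
# Sketch of a two-stage cascade: cheap retrieval over the full item pool,
# followed by an expensive reranker applied only to the retrieved candidates.
# Embeddings and the reranker below are random placeholders for illustration.
import numpy as np

rng = np.random.default_rng(0)
item_embeddings = rng.normal(size=(10_000, 64))   # full catalogue (placeholder vectors)
query_embedding = rng.normal(size=64)

# Stage 1: inexpensive retrieval by cosine similarity, keep top 100 candidates.
norms = np.linalg.norm(item_embeddings, axis=1) * np.linalg.norm(query_embedding)
cosine = item_embeddings @ query_embedding / norms
candidates = np.argsort(-cosine)[:100]

# Stage 2: a costlier reranking model scores only the 100 candidates.
def expensive_rerank_score(item_id: int) -> float:
    return float(cosine[item_id] + 0.01 * rng.normal())  # placeholder reranker

final_ranking = sorted(candidates, key=expensive_rerank_score, reverse=True)[:10]
print(final_ranking)
```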
In the retrieval\nstage, multi-channel retrieval is often used to generate distinct item subsets\nfrom different candidate generators, leveraging the complementary strengths of\nthese methods to maximize coverage. However, forwarding all retrieved items\noverwhelms downstream rankers, necessitating truncation. Despite advancements\nin individual retrieval methods, multi-channel fusion, the process of\nefficiently merging multi-channel retrieval results, remains underexplored. We\nare the first to identify and systematically investigate multi-channel fusion\nin the retrieval stage. Current industry practices often rely on heuristic\napproaches and manual designs, which often lead to suboptimal performance.\nMoreover, traditional gradient-based methods like SGD are unsuitable for this\ntask due to the non-differentiable nature of the selection process. In this\npaper, we explore advanced channel fusion strategies by assigning\nsystematically optimized weights to each channel. We utilize black-box\noptimization techniques, including the Cross Entropy Method and Bayesian\nOptimization for global weight optimization, alongside policy gradient-based\napproaches for personalized merging. Our methods enhance both personalization\nand flexibility, achieving significant performance improvements across multiple\ndatasets and yielding substantial gains in real-world deployments, offering a\nscalable solution for optimizing multi-channel fusion in retrieval.\n","authors":["Junjie Huang","Jiarui Qin","Jianghao Lin","Ziming Feng","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.16080v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.15996v1","updated":"2024-10-21T13:27:29Z","published":"2024-10-21T13:27:29Z","title":"Surprising Patterns in Musical Influence Networks","summary":" Analyzing musical influence networks, such as those formed by artist\ninfluence or sampling, has provided valuable insights into contemporary Western\nmusic. Here, computational methods like centrality rankings help identify\ninfluential artists. However, little attention has been given to how influence\nchanges over time. In this paper, we apply Bayesian Surprise to track the\nevolution of musical influence networks. Using two networks -- one of artist\ninfluence and another of covers, remixes, and samples -- our results reveal\nsignificant periods of change in network structure. Additionally, we\ndemonstrate that Bayesian Surprise is a flexible framework for testing various\nhypotheses on network evolution with real-world data.\n","authors":["Flavio Figueiredo","Tales Panoutsos","Nazareno Andrade"],"pdf_url":"https://arxiv.org/pdf/2410.15996v1.pdf","comment":"To appear in the Latin American Musical Information Retrieval\n Workshop"},{"id":"http://arxiv.org/abs/2409.14038v3","updated":"2024-10-21T12:54:33Z","published":"2024-09-21T06:49:34Z","title":"OAEI-LLM: A Benchmark Dataset for Understanding Large Language Model\n Hallucinations in Ontology Matching","summary":" Hallucinations of large language models (LLMs) commonly occur in\ndomain-specific downstream tasks, with no exception in ontology matching (OM).\nThe prevalence of using LLMs for OM raises the need for benchmarks to better\nunderstand LLM hallucinations. The OAEI-LLM dataset is an extended version of\nthe Ontology Alignment Evaluation Initiative (OAEI) datasets that evaluate\nLLM-specific hallucinations in OM tasks. 
We outline the methodology used in\ndataset construction and schema extension, and provide examples of potential\nuse cases.\n","authors":["Zhangcheng Qiang","Kerry Taylor","Weiqing Wang","Jing Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.14038v3.pdf","comment":"5 pages, 1 figure, 1 table"},{"id":"http://arxiv.org/abs/2410.15944v1","updated":"2024-10-21T12:21:49Z","published":"2024-10-21T12:21:49Z","title":"Developing Retrieval Augmented Generation (RAG) based LLM Systems from\n PDFs: An Experience Report","summary":" This paper presents an experience report on the development of Retrieval\nAugmented Generation (RAG) systems using PDF documents as the primary data\nsource. The RAG architecture combines generative capabilities of Large Language\nModels (LLMs) with the precision of information retrieval. This approach has\nthe potential to redefine how we interact with and augment both structured and\nunstructured knowledge in generative models to enhance transparency, accuracy,\nand contextuality of responses. The paper details the end-to-end pipeline, from\ndata collection, preprocessing, to retrieval indexing and response generation,\nhighlighting technical challenges and practical solutions. We aim to offer\ninsights to researchers and practitioners developing similar systems using two\ndistinct approaches: OpenAI's Assistant API with GPT Series and Llama's\nopen-source models. The practical implications of this research lie in\nenhancing the reliability of generative AI systems in various sectors where\ndomain-specific knowledge and real-time information retrieval is important. The\nPython code used in this work is also available at:\nhttps://github.com/GPT-Laboratory/RAG-LLM-Development-Guidebook-from-PDFs.\n","authors":["Ayman Asad Khan","Md Toufique Hasan","Kai Kristian Kemell","Jussi Rasku","Pekka Abrahamsson"],"pdf_url":"https://arxiv.org/pdf/2410.15944v1.pdf","comment":"36 pages, 8 figures, 2 tables, and python code snippets"},{"id":"http://arxiv.org/abs/2410.15930v1","updated":"2024-10-21T11:59:14Z","published":"2024-10-21T11:59:14Z","title":"Centrality-aware Product Retrieval and Ranking","summary":" This paper addresses the challenge of improving user experience on e-commerce\nplatforms by enhancing product ranking relevant to users' search queries.\nAmbiguity and complexity of user queries often lead to a mismatch between the\nuser's intent and retrieved product titles or documents. Recent approaches have\nproposed the use of Transformer-based models, which need millions of annotated\nquery-title pairs during the pre-training stage, and this data often does not\ntake user intent into account. To tackle this, we curate samples from existing\ndatasets at eBay, manually annotated with buyer-centric relevance scores and\ncentrality scores, which reflect how well the product title matches the users'\nintent. We introduce a User-intent Centrality Optimization (UCO) approach for\nexisting models, which optimises for the user intent in semantic product\nsearch. To that end, we propose a dual-loss based optimisation to handle hard\nnegatives, i.e., product titles that are semantically relevant but do not\nreflect the user's intent. Our contributions include curating challenging\nevaluation sets and implementing UCO, resulting in significant product ranking\nefficiency improvements observed for different evaluation metrics. 
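[Illustrative note, not part of the cached abstract.] One plausible reading of a dual-loss objective for hard negatives is sketched below as a relevance term plus a margin term that pushes intent-mismatched titles below intent-matched ones. This is an illustrative formulation under that assumption, not the loss defined in the paper.

```python
# Illustrative (not the paper's) dual-loss sketch: one term pulls the query
# toward intent-matched titles, a second margin term pushes hard negatives
# (semantically related titles that miss the intent) further away.
import torch
import torch.nn.functional as F

def dual_loss(query, positive, hard_negative, margin: float = 0.2):
    """query / positive / hard_negative: [batch, dim] embeddings."""
    pos_sim = F.cosine_similarity(query, positive)            # intent-matched titles
    neg_sim = F.cosine_similarity(query, hard_negative)       # intent-mismatched titles
    relevance_loss = (1.0 - pos_sim).mean()                   # keep positives close
    margin_loss = F.relu(neg_sim - pos_sim + margin).mean()   # rank positives above hard negatives
    return relevance_loss + margin_loss

q, p, n = (torch.randn(8, 128) for _ in range(3))
print(dual_loss(q, p, n))
```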
Our work\naims to ensure that the most buyer-centric titles for a query are ranked\nhigher, thereby, enhancing the user experience on e-commerce platforms.\n","authors":["Hadeel Saadany","Swapnil Bhosale","Samarth Agrawal","Diptesh Kanojia","Constantin Orasan","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2410.15930v1.pdf","comment":"EMNLP 2024: Industry track"},{"id":"http://arxiv.org/abs/2410.15884v1","updated":"2024-10-21T11:02:18Z","published":"2024-10-21T11:02:18Z","title":"Using GPT Models for Qualitative and Quantitative News Analytics in the\n 2024 US Presidental Election Process","summary":" The paper considers an approach of using Google Search API and GPT-4o model\nfor qualitative and quantitative analyses of news through retrieval-augmented\ngeneration (RAG). This approach was applied to analyze news about the 2024 US\npresidential election process. Different news sources for different time\nperiods have been analyzed. Quantitative scores generated by GPT model have\nbeen analyzed using Bayesian regression to derive trend lines. The\ndistributions found for the regression parameters allow for the analysis of\nuncertainty in the election process. The obtained results demonstrate that\nusing the GPT models for news analysis, one can get informative analytics and\nprovide key insights that can be applied in further analyses of election\nprocesses.\n","authors":["Bohdan M. Pavlyshenko"],"pdf_url":"https://arxiv.org/pdf/2410.15884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15801v1","updated":"2024-10-21T09:18:30Z","published":"2024-10-21T09:18:30Z","title":"Improve Dense Passage Retrieval with Entailment Tuning","summary":" Retrieval module can be plugged into many downstream NLP tasks to improve\ntheir performance, such as open-domain question answering and\nretrieval-augmented generation. The key to a retrieval system is to calculate\nrelevance scores to query and passage pairs. However, the definition of\nrelevance is often ambiguous. We observed that a major class of relevance\naligns with the concept of entailment in NLI tasks. Based on this observation,\nwe designed a method called entailment tuning to improve the embedding of dense\nretrievers. Specifically, we unify the form of retrieval data and NLI data\nusing existence claim as a bridge. Then, we train retrievers to predict the\nclaims entailed in a passage with a variant task of masked prediction. Our\nmethod can be efficiently plugged into current dense retrieval methods, and\nexperiments show the effectiveness of our method.\n","authors":["Lu Dai","Hao Liu","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2410.15801v1.pdf","comment":"EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2410.06062v3","updated":"2024-10-21T09:13:48Z","published":"2024-10-08T14:09:12Z","title":"LLM-based SPARQL Query Generation from Natural Language over Federated\n Knowledge Graphs","summary":" We introduce a Retrieval-Augmented Generation (RAG) system for translating\nuser questions into accurate federated SPARQL queries over bioinformatics\nknowledge graphs (KGs) leveraging Large Language Models (LLMs). To enhance\naccuracy and reduce hallucinations in query generation, our system utilises\nmetadata from the KGs, including query examples and schema information, and\nincorporates a validation step to correct generated queries. 
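[Illustrative note, not part of the cached abstract.] A hedged sketch of the generate-and-validate loop described above: the prompt is assembled from schema snippets and example queries, a hypothetical `llm_complete` call (not an API from the paper) drafts the SPARQL, and rdflib's query parser serves as the validation step, with a retry on failure. Requires the rdflib package.

```python
# Sketch: RAG-style SPARQL generation with a validation/repair step.
# llm_complete() is a hypothetical LLM call; the parsing check uses the real
# rdflib API (rdflib.plugins.sparql.prepareQuery), which raises on invalid SPARQL.
from rdflib.plugins.sparql import prepareQuery

def generate_sparql(question, schema, examples, llm_complete, max_retries=2):
    prompt = (f"Schema:\n{schema}\n\nExample queries:\n" + "\n".join(examples)
              + f"\n\nWrite a SPARQL query answering: {question}\n")
    query_text = llm_complete(prompt)
    for _ in range(max_retries):
        try:
            prepareQuery(query_text)        # validation step
            return query_text
        except Exception as err:            # ask the LLM to correct the query
            query_text = llm_complete(
                prompt + f"\nPrevious attempt failed ({err}); fix it:\n{query_text}")
    return query_text                       # may still be invalid after retries
```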
The system is\navailable online at chat.expasy.org.\n","authors":["Vincent Emonet","Jerven Bolleman","Severine Duvaud","Tarcisio Mendes de Farias","Ana Claudia Sima"],"pdf_url":"https://arxiv.org/pdf/2410.06062v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12400v2","updated":"2024-10-21T09:05:47Z","published":"2024-10-16T09:28:58Z","title":"QUIDS: Query Intent Generation via Dual Space Modeling","summary":" Query understanding is a crucial component of Information Retrieval (IR),\naimed at identifying the underlying search intent of textual queries. However,\nmost existing approaches oversimplify this task into query classification or\nclustering, which fails to fully capture the nuanced intent behind the query.\nIn this paper, we address the task of query intent generation: to automatically\ngenerate detailed and precise intent descriptions for search queries using\nrelevant and irrelevant documents given a query. These intent descriptions can\nhelp users understand why the search engine considered the top-ranked documents\nrelevant, and provide more transparency to the retrieval process. We propose a\ndual-space model that uses semantic relevance and irrelevance information in\nthe returned documents to explain the understanding of the query intent.\nSpecifically, in the encoding process, we project, separate, and distinguish\nrelevant and irrelevant documents in the representation space. Then, we\nintroduce a semantic decoupling model in the novel disentangling space, where\nthe semantics of irrelevant information are removed from the relevant space,\nensuring that only the essential and relevant intent is captured. This process\nrefines the understanding of the query and provides more accurate explanations\nfor the search results. Experiments on benchmark data demonstrate that our\nmethods produce high-quality query intent descriptions, outperforming existing\nmethods for this task, as well as state-of-the-art query-based summarization\nmethods. A token-level visualization of attention scores reveals that our model\neffectively reduces the focus on irrelevant intent topics. Our findings open up\npromising research and application directions for query intent generation,\nparticularly in exploratory search.\n","authors":["Yumeng Wang","Xiuying Chen","Suzan Verberne"],"pdf_url":"https://arxiv.org/pdf/2410.12400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15737v1","updated":"2024-10-21T07:56:45Z","published":"2024-10-21T07:56:45Z","title":"Who's Who: Large Language Models Meet Knowledge Conflicts in Practice","summary":" Retrieval-augmented generation (RAG) methods are viable solutions for\naddressing the static memory limits of pre-trained language models.\nNevertheless, encountering conflicting sources of information within the\nretrieval context is an inevitable practical challenge. In such situations, the\nlanguage models are recommended to transparently inform users about the\nconflicts rather than autonomously deciding what to present based on their\ninherent biases. To analyze how current large language models (LLMs) align with\nour recommendation, we introduce WhoQA, a public benchmark dataset to examine\nmodel's behavior in knowledge conflict situations. We induce conflicts by\nasking about a common property among entities having the same name, resulting\nin questions with up to 8 distinctive answers. WhoQA evaluation set includes 5K\nquestions across 13 Wikidata property types and 150K Wikipedia entities. 
Our\nexperiments show that despite the simplicity of WhoQA questions, knowledge\nconflicts significantly degrades LLMs' performance in RAG settings.\n","authors":["Quang Hieu Pham","Hoang Ngo","Anh Tuan Luu","Dat Quoc Nguyen"],"pdf_url":"https://arxiv.org/pdf/2410.15737v1.pdf","comment":"Accepted to EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2410.14066v2","updated":"2024-10-21T07:50:28Z","published":"2024-10-17T22:28:07Z","title":"Lightweight Correlation-Aware Table Compression","summary":" The growing adoption of data lakes for managing relational data necessitates\nefficient, open storage formats that provide high scan performance and\ncompetitive compression ratios. While existing formats achieve fast scans\nthrough lightweight encoding techniques, they have reached a plateau in terms\nof minimizing storage footprint. Recently, correlation-aware compression\nschemes have been shown to reduce file sizes further. Yet, current approaches\neither incur significant scan overheads or require manual specification of\ncorrelations, limiting their practicability. We present $\\texttt{Virtual}$, a\nframework that integrates seamlessly with existing open formats to\nautomatically leverage data correlations, achieving substantial compression\ngains while having minimal scan performance overhead. Experiments on data-gov\ndatasets show that $\\texttt{Virtual}$ reduces file sizes by up to 40% compared\nto Apache Parquet.\n","authors":["Mihail Stoian","Alexander van Renen","Jan Kobiolka","Ping-Lin Kuo","Josif Grabocka","Andreas Kipf"],"pdf_url":"https://arxiv.org/pdf/2410.14066v2.pdf","comment":"Third Table Representation Learning Workshop (TRL @ NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2410.15576v1","updated":"2024-10-21T01:54:46Z","published":"2024-10-21T01:54:46Z","title":"A Survey of Conversational Search","summary":" As a cornerstone of modern information access, search engines have become\nindispensable in everyday life. With the rapid advancements in AI and natural\nlanguage processing (NLP) technologies, particularly large language models\n(LLMs), search engines have evolved to support more intuitive and intelligent\ninteractions between users and systems. Conversational search, an emerging\nparadigm for next-generation search engines, leverages natural language\ndialogue to facilitate complex and precise information retrieval, thus\nattracting significant attention. Unlike traditional keyword-based search\nengines, conversational search systems enhance user experience by supporting\nintricate queries, maintaining context over multi-turn interactions, and\nproviding robust information integration and processing capabilities. Key\ncomponents such as query reformulation, search clarification, conversational\nretrieval, and response generation work in unison to enable these sophisticated\ninteractions. In this survey, we explore the recent advancements and potential\nfuture directions in conversational search, examining the critical modules that\nconstitute a conversational search system. We highlight the integration of LLMs\nin enhancing these systems and discuss the challenges and opportunities that\nlie ahead in this dynamic field. 
Additionally, we provide insights into\nreal-world applications and robust evaluations of current conversational search\nsystems, aiming to guide future research and development in conversational\nsearch.\n","authors":["Fengran Mo","Kelong Mao","Ziliang Zhao","Hongjin Qian","Haonan Chen","Yiruo Cheng","Xiaoxi Li","Yutao Zhu","Zhicheng Dou","Jian-Yun Nie"],"pdf_url":"https://arxiv.org/pdf/2410.15576v1.pdf","comment":"35 pages, 8 figures, continue to update"}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.16438v1","updated":"2024-10-21T19:02:13Z","published":"2024-10-21T19:02:13Z","title":"AlignVSR: Audio-Visual Cross-Modal Alignment for Visual Speech\n Recognition","summary":" Visual Speech Recognition (VSR) aims to recognize corresponding text by\nanalyzing visual information from lip movements. Due to the high variability\nand weak information of lip movements, VSR tasks require effectively utilizing\nany information from any source and at any level. In this paper, we propose a\nVSR method based on audio-visual cross-modal alignment, named AlignVSR. The\nmethod leverages the audio modality as an auxiliary information source and\nutilizes the global and local correspondence between the audio and visual\nmodalities to improve visual-to-text inference. Specifically, the method first\ncaptures global alignment between video and audio through a cross-modal\nattention mechanism from video frames to a bank of audio units. Then, based on\nthe temporal correspondence between audio and video, a frame-level local\nalignment loss is introduced to refine the global alignment, improving the\nutility of the audio information. Experimental results on the LRS2 and\nCNVSRC.Single datasets consistently show that AlignVSR outperforms several\nmainstream VSR methods, demonstrating its superior and robust performance.\n","authors":["Zehua Liu","Xiaolou Li","Chen Chen","Li Guo","Lantian Li","Dong Wang"],"pdf_url":"https://arxiv.org/pdf/2410.16438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16407v1","updated":"2024-10-21T18:19:09Z","published":"2024-10-21T18:19:09Z","title":"Enhancing Multimodal Affective Analysis with Learned Live Comment\n Features","summary":" Live comments, also known as Danmaku, are user-generated messages that are\nsynchronized with video content. These comments overlay directly onto streaming\nvideos, capturing viewer emotions and reactions in real-time. While prior work\nhas leveraged live comments in affective analysis, its use has been limited due\nto the relative rarity of live comments across different video platforms. To\naddress this, we first construct the Live Comment for Affective Analysis\n(LCAffect) dataset which contains live comments for English and Chinese videos\nspanning diverse genres that elicit a wide spectrum of emotions. Then, using\nthis dataset, we use contrastive learning to train a video encoder to produce\nsynthetic live comment features for enhanced multimodal affective content\nanalysis. 
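[Illustrative note, not part of the cached abstract.] The contrastive-training idea just described can be pictured with a standard InfoNCE-style objective that pulls each clip's video embedding toward the embedding of its own live comments and away from other clips' comments. This is a generic sketch under that assumption, not the authors' training code; random tensors stand in for real encoder outputs.

```python
# Generic InfoNCE-style sketch of aligning a video encoder with live-comment
# text features, so the video encoder can later produce comment-like features
# on its own. Random tensors stand in for real encoder outputs.
import torch
import torch.nn.functional as F

def info_nce(video_emb, comment_emb, temperature: float = 0.07):
    """video_emb, comment_emb: [batch, dim]; matching rows are positive pairs."""
    v = F.normalize(video_emb, dim=1)
    c = F.normalize(comment_emb, dim=1)
    logits = v @ c.T / temperature        # similarity of every video to every comment
    targets = torch.arange(v.size(0))     # the i-th comment belongs to the i-th video
    return F.cross_entropy(logits, targets)

loss = info_nce(torch.randn(16, 256), torch.randn(16, 256))
print(float(loss))
```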
Through comprehensive experimentation on a wide range of affective\nanalysis tasks (sentiment, emotion recognition, and sarcasm detection) in both\nEnglish and Chinese, we demonstrate that these synthetic live comment features\nsignificantly improve performance over state-of-the-art methods.\n","authors":["Zhaoyuan Deng","Amith Ananthram","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2410.16407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14485v8","updated":"2024-10-21T15:24:04Z","published":"2024-06-20T16:48:14Z","title":"Proceedings of The second international workshop on eXplainable AI for\n the Arts (XAIxArts)","summary":" This second international workshop on explainable AI for the Arts (XAIxArts)\nbrought together a community of researchers in HCI, Interaction Design, AI,\nexplainable AI (XAI), and digital arts to explore the role of XAI for the Arts.\nWorkshop held at the 16th ACM Conference on Creativity and Cognition (C&C\n2024), Chicago, USA.\n","authors":["Nick Bryan-Kinns","Corey Ford","Shuoyang Zheng","Helen Kennedy","Alan Chamberlain","Makayla Lewis","Drew Hemment","Zijin Li","Qiong Wu","Lanxi Xiao","Gus Xia","Jeba Rezwana","Michael Clemens","Gabriel Vigliensoni"],"pdf_url":"https://arxiv.org/pdf/2406.14485v8.pdf","comment":"Proceedings of The second international workshop on eXplainable AI\n for the Arts (XAIxArts)"},{"id":"http://arxiv.org/abs/2410.16058v1","updated":"2024-10-21T14:37:26Z","published":"2024-10-21T14:37:26Z","title":"Shorter Is Different: Characterizing the Dynamics of Short-Form Video\n Platforms","summary":" The emerging short-form video platforms have been growing tremendously and\nbecome one of the leading social media recently. Although the expanded\npopularity of these platforms has attracted increasing research attention,\nthere has been a lack of understanding of whether and how they deviate from\ntraditional long-form video-sharing platforms such as YouTube and Bilibili. To\naddress this, we conduct a large-scale data-driven analysis of Kuaishou, one of\nthe largest short-form video platforms in China. Based on 248 million videos\nuploaded to the platform across all categories, we identify their notable\ndifferences from long-form video platforms through a comparison study with\nBilibili, a leading long-form video platform in China. We find that videos are\nshortened by multiples on Kuaishou, with distinctive categorical distributions\nover-represented by life-related rather than interest-based videos. Users\ninteract with videos less per view, but top videos can even more effectively\nacquire users' collective attention. More importantly, ordinary content\ncreators have higher probabilities of producing hit videos. Our results shed\nlight on the uniqueness of short-form video platforms and pave the way for\nfuture research and design for better short-form video ecology.\n","authors":["Zhilong Chen","Peijie Liu","Jinghua Piao","Fengli Xu","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2410.16058v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15846v1","updated":"2024-10-21T10:16:56Z","published":"2024-10-21T10:16:56Z","title":"Modelling Concurrent RTP Flows for End-to-end Predictions of QoS in Real\n Time Communications","summary":" The Real-time Transport Protocol (RTP)-based real-time communications (RTC)\napplications, exemplified by video conferencing, have experienced an\nunparalleled surge in popularity and development in recent years. 
In pursuit of\noptimizing their performance, the prediction of Quality of Service (QoS)\nmetrics emerges as a pivotal endeavor, bolstering network monitoring and\nproactive solutions. However, contemporary approaches are confined to\nindividual RTP flows and metrics, falling short in relationship capture and\ncomputational efficiency. To this end, we propose Packet-to-Prediction (P2P), a\nnovel deep learning (DL) framework that hinges on raw packets to simultaneously\nprocess concurrent RTP flows and perform end-to-end prediction of multiple QoS\nmetrics. Specifically, we implement a streamlined architecture, namely\nlength-free Transformer with cross and neighbourhood attention, capable of\nhandling an unlimited number of RTP flows, and employ a multi-task learning\nparadigm to forecast four key metrics in a single shot. Our work is based on\nextensive traffic collected during real video calls, and conclusively, P2P\nexcels comparative models in both prediction performance and temporal\nefficiency.\n","authors":["Tailai Song","Paolo Garza","Michela Meo","Maurizio Matteo Munafò"],"pdf_url":"https://arxiv.org/pdf/2410.15846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.08813v3","updated":"2024-10-21T03:08:08Z","published":"2024-05-14T17:59:02Z","title":"CinePile: A Long Video Question Answering Dataset and Benchmark","summary":" Current datasets for long-form video understanding often fall short of\nproviding genuine long-form comprehension challenges, as many tasks derived\nfrom these datasets can be successfully tackled by analyzing just one or a few\nrandom frames from a video. To address this issue, we present a novel dataset\nand benchmark, CinePile, specifically designed for authentic long-form video\nunderstanding. This paper details our innovative approach for creating a\nquestion-answer dataset, utilizing advanced LLMs with human-in-the-loop and\nbuilding upon human-generated raw data. Our comprehensive dataset comprises\n305,000 multiple-choice questions (MCQs), covering various visual and\nmultimodal aspects, including temporal comprehension, understanding\nhuman-object interactions, and reasoning about events or actions within a\nscene. Additionally, we fine-tuned open-source Video-LLMs on the training split\nand evaluated both open-source and proprietary video-centric LLMs on the test\nsplit of our dataset. The findings indicate that although current models\nunderperform compared to humans, fine-tuning these models can lead to\nsignificant improvements in their performance.\n","authors":["Ruchit Rawal","Khalid Saifullah","Miquel Farré","Ronen Basri","David Jacobs","Gowthami Somepalli","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2405.08813v3.pdf","comment":"Project page with all the artifacts -\n https://ruchitrawal.github.io/cinepile/. Updated version with adversarial\n refinement pipeline and more model evaluations"}]},"2024-10-20T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2410.15511v1","updated":"2024-10-20T21:17:05Z","published":"2024-10-20T21:17:05Z","title":"ConTReGen: Context-driven Tree-structured Retrieval for Open-domain\n Long-form Text Generation","summary":" Open-domain long-form text generation requires generating coherent,\ncomprehensive responses that address complex queries with both breadth and\ndepth. This task is challenging due to the need to accurately capture diverse\nfacets of input queries. 
Existing iterative retrieval-augmented generation\n(RAG) approaches often struggle to delve deeply into each facet of complex\nqueries and integrate knowledge from various sources effectively. This paper\nintroduces ConTReGen, a novel framework that employs a context-driven,\ntree-structured retrieval approach to enhance the depth and relevance of\nretrieved content. ConTReGen integrates a hierarchical, top-down in-depth\nexploration of query facets with a systematic bottom-up synthesis, ensuring\ncomprehensive coverage and coherent integration of multifaceted information.\nExtensive experiments on multiple datasets, including LFQA and ODSUM, alongside\na newly introduced dataset, ODSUM-WikiHow, demonstrate that ConTReGen\noutperforms existing state-of-the-art RAG models.\n","authors":["Kashob Kumar Roy","Pritom Saha Akash","Kevin Chen-Chuan Chang","Lucian Popa"],"pdf_url":"https://arxiv.org/pdf/2410.15511v1.pdf","comment":"Accepted at EMNLP'24 Findings"},{"id":"http://arxiv.org/abs/2404.13207v3","updated":"2024-10-20T18:59:02Z","published":"2024-04-19T22:54:54Z","title":"STaRK: Benchmarking LLM Retrieval on Textual and Relational Knowledge\n Bases","summary":" Answering real-world complex queries, such as complex product search, often\nrequires accurate retrieval from semi-structured knowledge bases that involve\nblend of unstructured (e.g., textual descriptions of products) and structured\n(e.g., entity relations of products) information. However, many previous works\nstudied textual and relational retrieval tasks as separate topics. To address\nthe gap, we develop STARK, a large-scale Semi-structure retrieval benchmark on\nTextual and Relational Knowledge Bases. Our benchmark covers three domains:\nproduct search, academic paper search, and queries in precision medicine. We\ndesign a novel pipeline to synthesize realistic user queries that integrate\ndiverse relational information and complex textual properties, together with\ntheir ground-truth answers (items). We conduct rigorous human evaluation to\nvalidate the quality of our synthesized queries. We further enhance the\nbenchmark with high-quality human-generated queries to provide an authentic\nreference. STARK serves as a comprehensive testbed for evaluating the\nperformance of retrieval systems driven by large language models (LLMs). Our\nexperiments suggest that STARK presents significant challenges to the current\nretrieval and LLM systems, highlighting the need for more capable\nsemi-structured retrieval systems. The benchmark data and code are available on\nhttps://github.com/snap-stanford/STaRK.\n","authors":["Shirley Wu","Shiyu Zhao","Michihiro Yasunaga","Kexin Huang","Kaidi Cao","Qian Huang","Vassilis N. Ioannidis","Karthik Subbian","James Zou","Jure Leskovec"],"pdf_url":"https://arxiv.org/pdf/2404.13207v3.pdf","comment":"NeurIPS 2024 Track on Datasets and Benchmarks. 26 Pages, 6 Figures.\n Website: https://stark.stanford.edu/"},{"id":"http://arxiv.org/abs/2410.15387v1","updated":"2024-10-20T13:21:52Z","published":"2024-10-20T13:21:52Z","title":"Deep Class-guided Hashing for Multi-label Cross-modal Retrieval","summary":" Deep hashing, due to its low cost and efficient retrieval advantages, is\nwidely valued in cross-modal retrieval. 
However, existing cross-modal hashing\nmethods either explore the relationships between data points, which inevitably\nleads to intra-class dispersion, or explore the relationships between data\npoints and categories while ignoring the preservation of inter-class structural\nrelationships, resulting in the generation of suboptimal hash codes. How to\nmaintain both intra-class aggregation and inter-class structural relationships,\nIn response to this issue, this paper proposes a DCGH method. Specifically, we\nuse proxy loss as the mainstay to maintain intra-class aggregation of data,\ncombined with pairwise loss to maintain inter-class structural relationships,\nand on this basis, further propose a variance constraint to address the\nsemantic bias issue caused by the combination. A large number of comparative\nexperiments on three benchmark datasets show that the DCGH method has\ncomparable or even better performance compared to existing cross-modal\nretrieval methods. The code for the implementation of our DCGH framework is\navailable at https://github.com/donnotnormal/DCGH.\n","authors":["Hao Chen","Lei Zhu","Xinghui Zhu"],"pdf_url":"https://arxiv.org/pdf/2410.15387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09369v4","updated":"2024-10-20T11:05:15Z","published":"2024-05-15T14:20:37Z","title":"Diffusion-based Contrastive Learning for Sequential Recommendation","summary":" Contrastive learning has been effectively utilized to enhance the training of\nsequential recommendation models by leveraging informative self-supervised\nsignals. Most existing approaches generate augmented views of the same user\nsequence through random augmentation and subsequently maximize their agreement\nin the representation space. However, these methods often neglect the\nrationality of the augmented samples. Due to significant uncertainty, random\naugmentation can disrupt the semantic information and interest evolution\npatterns inherent in the original user sequences. Moreover, pulling\nsemantically inconsistent sequences closer in the representation space can\nrender the user sequence embeddings insensitive to variations in user\npreferences, which contradicts the primary objective of sequential\nrecommendation. To address these limitations, we propose the Context-aware\nDiffusion-based Contrastive Learning for Sequential Recommendation, named\nCaDiRec. The core idea is to leverage context information to generate more\nreasonable augmented views. Specifically, CaDiRec employs a context-aware\ndiffusion model to generate alternative items for the given positions within a\nsequence. These generated items are aligned with their respective context\ninformation and can effectively replace the corresponding original items,\nthereby generating a positive view of the original sequence. By considering two\ndifferent augmentations of the same user sequence, we can construct a pair of\npositive samples for contrastive learning. To ensure representation cohesion,\nwe train the entire framework in an end-to-end manner, with shared item\nembeddings between the diffusion model and the recommendation model. 
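[Illustrative note, not part of the cached abstract.] A schematic of the view-construction step just described, with `generate_alternative` as a placeholder standing in for the context-aware generator (the sketch samples randomly instead of running a diffusion model): a few positions of the user sequence are replaced by generated items, yielding two augmented views of the same sequence to serve as a positive pair for contrastive training.

```python
# Sketch: build two augmented views of a user interaction sequence by replacing
# a few positions with generated alternative items. generate_alternative() is a
# placeholder for a context-aware generator; here it just samples a random item id.
import random

def generate_alternative(sequence, position, rng):
    # Placeholder: a real generator would condition on the surrounding items
    # (the context) at this position.
    return rng.randint(0, 10_000)

def augmented_view(sequence, rng, replace_ratio=0.2):
    view = list(sequence)
    n_replace = max(1, int(len(view) * replace_ratio))
    for pos in rng.sample(range(len(view)), n_replace):
        view[pos] = generate_alternative(view, pos, rng)
    return view

user_sequence = [17, 305, 42, 9, 871, 56]
rng = random.Random(0)
view_a = augmented_view(user_sequence, rng)
view_b = augmented_view(user_sequence, rng)
print(view_a, view_b)  # two context-aligned views forming a positive pair
```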
Extensive\nexperiments on five benchmark datasets demonstrate the advantages of our\nproposed method over existing baselines.\n","authors":["Ziqiang Cui","Haolun Wu","Bowei He","Ji Cheng","Chen Ma"],"pdf_url":"https://arxiv.org/pdf/2405.09369v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10360v2","updated":"2024-10-20T07:53:50Z","published":"2024-10-14T10:26:57Z","title":"Parenting: Optimizing Knowledge Selection of Retrieval-Augmented\n Language Models with Parameter Decoupling and Tailored Tuning","summary":" Retrieval-Augmented Generation (RAG) offers an effective solution to the\nissues faced by Large Language Models (LLMs) in hallucination generation and\nknowledge obsolescence by incorporating externally retrieved knowledge.\nHowever, existing methods lack effective control mechanisms for integrating\ninternal and external knowledge. Inspired by human cognitive processes, we\npropose Parenting, a novel framework that decouples, identifies, and\npurposefully optimizes parameter subspaces related to adherence and robustness.\nSpecifically, Parenting utilizes a key parameter mining method that combines\nforward and backward propagation signals to localize subspaces representing\ndifferent capabilities. Then, Parenting employs a type-tailored tuning\nstrategy, applying specific and appropriate optimizations to different\nsubspaces, aiming to achieve a balanced enhancement of both adherence and\nrobustness. Extensive experiments on various datasets and models validate the\neffectiveness and generalizability of our method.\n","authors":["Yongxin Xu","Ruizhe Zhang","Xinke Jiang","Yujie Feng","Yuzhen Xiao","Xinyu Ma","Runchuan Zhu","Xu Chu","Junfeng Zhao","Yasha Wang"],"pdf_url":"https://arxiv.org/pdf/2410.10360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10311v2","updated":"2024-10-20T05:49:18Z","published":"2024-05-16T17:58:45Z","title":"UniRAG: Universal Retrieval Augmentation for Multi-Modal Large Language\n Models","summary":" Recently, Multi-Modal (MM) Large Language Models (LLMs) have unlocked many\ncomplex use-cases that require MM understanding (e.g., image captioning or\nvisual question answering) and MM generation (e.g., text-guided image\ngeneration or editing) capabilities. To further improve the output fidelity of\nMM-LLMs we introduce UniRAG, a plug-and-play technique that adds relevant\nretrieved information to prompts as few-shot examples during inference. Unlike\nthe common belief that Retrieval Augmentation (RA) mainly improves generation\nor understanding of uncommon entities, our evaluation results on the MSCOCO\ndataset with common entities show that both proprietary models like GPT-4o and\nGemini-Pro and smaller open-source models like LLaVA, LaVIT, and Emu2\nsignificantly enhance their generation quality when their input prompts are\naugmented with relevant information retrieved by MM retrievers like UniIR\nmodels.\n","authors":["Sahel Sharifymoghaddam","Shivani Upadhyay","Wenhu Chen","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2405.10311v2.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2410.15272v1","updated":"2024-10-20T04:05:18Z","published":"2024-10-20T04:05:18Z","title":"Performance-Driven QUBO for Recommender Systems on Quantum Annealers","summary":" We propose Counterfactual Analysis Quadratic Unconstrained Binary\nOptimization (CAQUBO) to solve QUBO problems for feature selection in\nrecommender systems. 
CAQUBO leverages counterfactual analysis to measure the\nimpact of individual features and feature combinations on model performance and\nemploys the measurements to construct the coefficient matrix for a quantum\nannealer to select the optimal feature combinations for recommender systems,\nthereby improving their final recommendation performance. By establishing\nexplicit connections between features and the recommendation performance, the\nproposed approach demonstrates superior performance compared to the\nstate-of-the-art quantum annealing methods. Extensive experiments indicate that\nintegrating quantum computing with counterfactual analysis holds great promise\nfor addressing these challenges.\n","authors":["Jiayang Niu","Jie Li","Ke Deng","Mark Sanderson","Yongli Ren"],"pdf_url":"https://arxiv.org/pdf/2410.15272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15262v1","updated":"2024-10-20T03:15:01Z","published":"2024-10-20T03:15:01Z","title":"HyQE: Ranking Contexts with Hypothetical Query Embeddings","summary":" In retrieval-augmented systems, context ranking techniques are commonly\nemployed to reorder the retrieved contexts based on their relevance to a user\nquery. A standard approach is to measure this relevance through the similarity\nbetween contexts and queries in the embedding space. However, such similarity\noften fails to capture the relevance. Alternatively, large language models\n(LLMs) have been used for ranking contexts. However, they can encounter\nscalability issues when the number of candidate contexts grows and the context\nwindow sizes of the LLMs remain constrained. Additionally, these approaches\nrequire fine-tuning LLMs with domain-specific data. In this work, we introduce\na scalable ranking framework that combines embedding similarity and LLM\ncapabilities without requiring LLM fine-tuning. Our framework uses a\npre-trained LLM to hypothesize the user query based on the retrieved contexts\nand ranks the context based on the similarity between the hypothesized queries\nand the user query. Our framework is efficient at inference time and is\ncompatible with many other retrieval and ranking techniques. Experimental\nresults show that our method improves the ranking performance across multiple\nbenchmarks. The complete code and data are available at\nhttps://github.com/zwc662/hyqe\n","authors":["Weichao Zhou","Jiaxin Zhang","Hilaf Hasson","Anu Singh","Wenchao Li"],"pdf_url":"https://arxiv.org/pdf/2410.15262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02219v2","updated":"2024-10-20T02:40:57Z","published":"2024-10-03T05:23:39Z","title":"Multi-modal clothing recommendation model based on large model and VAE\n enhancement","summary":" Accurately recommending products has long been a subject requiring in-depth\nresearch. This study proposes a multimodal paradigm for clothing\nrecommendations. Specifically, it designs a multimodal analysis method that\nintegrates clothing description texts and images, utilizing a pre-trained large\nlanguage model to deeply explore the hidden meanings of users and products.\nAdditionally, a variational encoder is employed to learn the relationship\nbetween user information and products to address the cold start problem in\nrecommendation systems. 
This study also validates the significant performance\nadvantages of this method over various recommendation system methods through\nextensive ablation experiments, providing crucial practical guidance for the\ncomprehensive optimization of recommendation systems.\n","authors":["Bingjie Huang","Qingyi Lu","Shuaishuai Huang","Xue-she Wang","Haowei Yang"],"pdf_url":"https://arxiv.org/pdf/2410.02219v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.15461v1","updated":"2024-10-20T18:24:00Z","published":"2024-10-20T18:24:00Z","title":"EVA: An Embodied World Model for Future Video Anticipation","summary":" World models integrate raw data from various modalities, such as images and\nlanguage to simulate comprehensive interactions in the world, thereby\ndisplaying crucial roles in fields like mixed reality and robotics. Yet,\napplying the world model for accurate video prediction is quite challenging due\nto the complex and dynamic intentions of the various scenes in practice. In\nthis paper, inspired by the human rethinking process, we decompose the complex\nvideo prediction into four meta-tasks that enable the world model to handle\nthis issue in a more fine-grained manner. Alongside these tasks, we introduce a\nnew benchmark named Embodied Video Anticipation Benchmark (EVA-Bench) to\nprovide a well-rounded evaluation. EVA-Bench focused on evaluating the video\nprediction ability of human and robot actions, presenting significant\nchallenges for both the language model and the generation model. Targeting\nembodied video prediction, we propose the Embodied Video Anticipator (EVA), a\nunified framework aiming at video understanding and generation. EVA integrates\na video generation model with a visual language model, effectively combining\nreasoning capabilities with high-quality generation. Moreover, to enhance the\ngeneralization of our framework, we tailor-designed a multi-stage pretraining\nparadigm that adaptatively ensembles LoRA to produce high-fidelity results.\nExtensive experiments on EVA-Bench highlight the potential of EVA to\nsignificantly improve performance in embodied scenes, paving the way for\nlarge-scale pre-trained models in real-world prediction tasks.\n","authors":["Xiaowei Chi","Hengyuan Zhang","Chun-Kai Fan","Xingqun Qi","Rongyu Zhang","Anthony Chen","Chi-min Chan","Wei Xue","Wenhan Luo","Shanghang Zhang","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2410.15461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15364v1","updated":"2024-10-20T11:40:31Z","published":"2024-10-20T11:40:31Z","title":"Scene Graph Generation with Role-Playing Large Language Models","summary":" Current approaches for open-vocabulary scene graph generation (OVSGG) use\nvision-language models such as CLIP and follow a standard zero-shot pipeline --\ncomputing similarity between the query image and the text embeddings for each\ncategory (i.e., text classifiers). In this work, we argue that the text\nclassifiers adopted by existing OVSGG methods, i.e., category-/part-level\nprompts, are scene-agnostic as they remain unchanged across contexts. Using\nsuch fixed text classifiers not only struggles to model visual relations with\nhigh variance, but also falls short in adapting to distinct contexts. To plug\nthese intrinsic shortcomings, we devise SDSGG, a scene-specific description\nbased OVSGG framework where the weights of text classifiers are adaptively\nadjusted according to the visual content. 
In particular, to generate\ncomprehensive and diverse descriptions oriented to the scene, an LLM is asked\nto play different roles (e.g., biologist and engineer) to analyze and discuss\nthe descriptive features of a given scene from different views. Unlike previous\nefforts simply treating the generated descriptions as mutually equivalent text\nclassifiers, SDSGG is equipped with an advanced renormalization mechanism to\nadjust the influence of each text classifier based on its relevance to the\npresented scene (this is what the term \"specific\" means). Furthermore, to\ncapture the complicated interplay between subjects and objects, we propose a\nnew lightweight module called mutual visual adapter. It refines CLIP's ability\nto recognize relations by learning an interaction-aware semantic space.\nExtensive experiments on prevalent benchmarks show that SDSGG outperforms\ntop-leading methods by a clear margin.\n","authors":["Guikun Chen","Jin Li","Wenguan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.15364v1.pdf","comment":"NeurIPS 2024. Code: https://github.com/guikunchen/SDSGG"},{"id":"http://arxiv.org/abs/2410.15279v1","updated":"2024-10-20T04:28:19Z","published":"2024-10-20T04:28:19Z","title":"ContextDet: Temporal Action Detection with Adaptive Context Aggregation","summary":" Temporal action detection (TAD), which locates and recognizes action\nsegments, remains a challenging task in video understanding due to variable\nsegment lengths and ambiguous boundaries. Existing methods treat neighboring\ncontexts of an action segment indiscriminately, leading to imprecise boundary\npredictions. We introduce a single-stage ContextDet framework, which makes use\nof large-kernel convolutions in TAD for the first time. Our model features a\npyramid adaptive context aggragation (ACA) architecture, capturing long context\nand improving action discriminability. Each ACA level consists of two novel\nmodules. The context attention module (CAM) identifies salient contextual\ninformation, encourages context diversity, and preserves context integrity\nthrough a context gating block (CGB). The long context module (LCM) makes use\nof a mixture of large- and small-kernel convolutions to adaptively gather\nlong-range context and fine-grained local features. Additionally, by varying\nthe length of these large kernels across the ACA pyramid, our model provides\nlightweight yet effective context aggregation and action discrimination. We\nconducted extensive experiments and compared our model with a number of\nadvanced TAD methods on six challenging TAD benchmarks: MultiThumos, Charades,\nFineAction, EPIC-Kitchens 100, Thumos14, and HACS, demonstrating superior\naccuracy at reduced inference speed.\n","authors":["Ning Wang","Yun Xiao","Xiaopeng Peng","Xiaojun Chang","Xuanhong Wang","Dingyi Fang"],"pdf_url":"https://arxiv.org/pdf/2410.15279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15266v1","updated":"2024-10-20T03:45:50Z","published":"2024-10-20T03:45:50Z","title":"GSSF: Generalized Structural Sparse Function for Deep Cross-modal Metric\n Learning","summary":" Cross-modal metric learning is a prominent research topic that bridges the\nsemantic heterogeneity between vision and language. Existing methods frequently\nutilize simple cosine or complex distance metrics to transform the pairwise\nfeatures into a similarity score, which suffers from an inadequate or\ninefficient capability for distance measurements. 
Consequently, we propose a\nGeneralized Structural Sparse Function to dynamically capture thorough and\npowerful relationships across modalities for pair-wise similarity learning\nwhile remaining concise but efficient. Specifically, the distance metric\ndelicately encapsulates two formats of diagonal and block-diagonal terms,\nautomatically distinguishing and highlighting the cross-channel relevancy and\ndependency inside a structured and organized topology. Hence, it\nempowers itself to adapt to the optimal matching patterns between the paired\nfeatures and reaches a sweet spot between model complexity and capability.\nExtensive experiments on cross-modal and two extra uni-modal retrieval tasks\n(image-text retrieval, person re-identification, fine-grained image retrieval)\nhave validated its superiority and flexibility over various popular retrieval\nframeworks. More importantly, we further discover that it can be seamlessly\nincorporated into multiple application scenarios, and demonstrates promising\nprospects from Attention Mechanism to Knowledge Distillation in a plug-and-play\nmanner. Our code is publicly available at: https://github.com/Paranioar/GSSF.\n","authors":["Haiwen Diao","Ying Zhang","Shang Gao","Jiawen Zhu","Long Chen","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2410.15266v1.pdf","comment":"12 pages, 9 figures, Accepted by TIP2024"}]},"2024-10-19T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2410.03071v2","updated":"2024-10-19T20:40:34Z","published":"2024-10-04T01:28:56Z","title":"Enhancing Short-Text Topic Modeling with LLM-Driven Context Expansion\n and Prefix-Tuned VAEs","summary":" Topic modeling is a powerful technique for uncovering hidden themes within a\ncollection of documents. However, the effectiveness of traditional topic models\noften relies on sufficient word co-occurrence, which is lacking in short texts.\nTherefore, existing approaches, whether probabilistic or neural, frequently\nstruggle to extract meaningful patterns from such data, resulting in incoherent\ntopics. To address this challenge, we propose a novel approach that leverages\nlarge language models (LLMs) to extend short texts into more detailed sequences\nbefore applying topic modeling. To further improve the efficiency and solve the\nproblem of semantic inconsistency from LLM-generated texts, we propose to use\nprefix tuning to train a smaller language model coupled with a variational\nautoencoder for short-text topic modeling. Our method significantly improves\nshort-text topic modeling performance, as demonstrated by extensive experiments\non real-world datasets with extreme data sparsity, outperforming current\nstate-of-the-art topic models.\n","authors":["Pritom Saha Akash","Kevin Chen-Chuan Chang"],"pdf_url":"https://arxiv.org/pdf/2410.03071v2.pdf","comment":"EMNLP Findings 2024. arXiv admin note: substantial text overlap with\n arXiv:2310.15420"},{"id":"http://arxiv.org/abs/2410.15174v1","updated":"2024-10-19T18:28:06Z","published":"2024-10-19T18:28:06Z","title":"Crafting Tomorrow: The Influence of Design Choices on Fresh Content in\n Social Media Recommendation","summary":" The rise in popularity of social media platforms has resulted in millions of\nnew content pieces being created every day. This surge in content creation\nunderscores the need to pay attention to our design choices as they can greatly\nimpact how long content remains relevant. 
In today's landscape where regularly\nrecommending new content is crucial, particularly in the absence of detailed\ninformation, a variety of factors such as UI features, algorithms and system\nsettings contribute to shaping the journey of content across the platform.\nWhile previous research has focused on how new content affects users'\nexperiences, this study takes a different approach by analyzing these decisions\nconsidering the content itself.\n Through a series of carefully crafted experiments, we explore how seemingly\nsmall decisions can influence the longevity of content, measured by metrics\nlike Content Progression (CVP) and Content Survival (CSR). We also emphasize\nthe importance of recognizing the stages that content goes through, underscoring\nthe need to tailor strategies for each stage, as a one-size-fits-all approach\nmay not be effective. Additionally, we argue for a departure from traditional\nexperimental setups in the study of content lifecycles to avoid potential\nmisunderstandings, while proposing advanced techniques to achieve greater\nprecision and accuracy in the evaluation process.\n","authors":["Srijan Saket","Mohit Agarwal","Rishabh Mehrotra"],"pdf_url":"https://arxiv.org/pdf/2410.15174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16350v3","updated":"2024-10-19T18:04:52Z","published":"2024-06-24T06:46:32Z","title":"A Survey on Intent-aware Recommender Systems","summary":" Many modern online services feature personalized recommendations. A central\nchallenge when providing such recommendations is that the reason why an\nindividual user accesses the service may change from visit to visit or even\nduring an ongoing usage session. To be effective, a recommender system should\ntherefore aim to take the users' probable intent of using the service at a\ncertain point in time into account. In recent years, researchers have thus\nstarted to address this challenge by incorporating intent-awareness into\nrecommender systems. Correspondingly, a number of technical approaches were put\nforward, including diversification techniques, intent prediction models or\nlatent intent modeling approaches. In this paper, we survey and categorize\nexisting approaches to building the next generation of Intent-Aware Recommender\nSystems (IARS). Based on an analysis of current evaluation practices, we\noutline open gaps and possible future directions in this area, which in\nparticular include the consideration of additional interaction signals and\ncontextual information to further improve the effectiveness of such systems.\n","authors":["Dietmar Jannach","Markus Zanker"],"pdf_url":"https://arxiv.org/pdf/2406.16350v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15145v1","updated":"2024-10-19T16:12:22Z","published":"2024-10-19T16:12:22Z","title":"Mining Asymmetric Intertextuality","summary":" This paper introduces a new task in Natural Language Processing (NLP) and\nDigital Humanities (DH): Mining Asymmetric Intertextuality. Asymmetric\nintertextuality refers to one-sided relationships between texts, where one text\ncites, quotes, or borrows from another without reciprocation. These\nrelationships are common in literature and historical texts, where a later work\nreferences a classical or older text that remains static.\n We propose a scalable and adaptive approach for mining asymmetric\nintertextuality, leveraging a split-normalize-merge paradigm. 
In this approach,\ndocuments are split into smaller chunks, normalized into structured data using\nLLM-assisted metadata extraction, and merged during querying to detect both\nexplicit and implicit intertextual relationships. Our system handles\nintertextuality at various levels, from direct quotations to paraphrasing and\ncross-document influence, using a combination of metadata filtering, vector\nsimilarity search, and LLM-based verification.\n This method is particularly well-suited for dynamically growing corpora, such\nas expanding literary archives or historical databases. By enabling the\ncontinuous integration of new documents, the system can scale efficiently,\nmaking it highly valuable for digital humanities practitioners in literary\nstudies, historical research and related fields.\n","authors":["Pak Kin Lau","Stuart Michael McManus"],"pdf_url":"https://arxiv.org/pdf/2410.15145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03538v2","updated":"2024-10-19T13:51:20Z","published":"2024-09-15T06:40:38Z","title":"Dreaming User Multimodal Representation Guided by The Platonic\n Representation Hypothesis for Micro-Video Recommendation","summary":" The proliferation of online micro-video platforms has underscored the\nnecessity for advanced recommender systems to mitigate information overload and\ndeliver tailored content. Despite advancements, accurately and promptly\ncapturing dynamic user interests remains a formidable challenge. Inspired by\nthe Platonic Representation Hypothesis, which posits that different data\nmodalities converge towards a shared statistical model of reality, we introduce\nDreamUMM (Dreaming User Multi-Modal Representation), a novel approach\nleveraging user historical behaviors to create real-time user representation in\na multimodal space. DreamUMM employs a closed-form solution correlating user\nvideo preferences with multimodal similarity, hypothesizing that user interests\ncan be effectively represented in a unified multimodal space. Additionally, we\npropose Candidate-DreamUMM for scenarios lacking recent user behavior data,\ninferring interests from candidate videos alone. Extensive online A/B tests\ndemonstrate significant improvements in user engagement metrics, including\nactive days and play count. The successful deployment of DreamUMM in two\nmicro-video platforms with hundreds of millions of daily active users\nillustrates its practical efficacy and scalability in personalized micro-video\ncontent delivery. Our work contributes to the ongoing exploration of\nrepresentational convergence by providing empirical evidence supporting the\npotential for user interest representations to reside in a multimodal space.\n","authors":["Chengzhi Lin","Hezheng Lin","Shuchang Liu","Cangguang Ruan","LingJing Xu","Dezhao Yang","Chuyuan Wang","Yongqi Liu"],"pdf_url":"https://arxiv.org/pdf/2410.03538v2.pdf","comment":"4 Figure; 2 Table"},{"id":"http://arxiv.org/abs/2410.15098v1","updated":"2024-10-19T13:15:36Z","published":"2024-10-19T13:15:36Z","title":"Incorporating Group Prior into Variational Inference for Tail-User\n Behavior Modeling in CTR Prediction","summary":" User behavior modeling -- which aims to extract user interests from\nbehavioral data -- has shown great power in Click-through rate (CTR)\nprediction, a key component in recommendation systems. Recently,\nattention-based algorithms have become a promising direction, as attention\nmechanisms emphasize the relevant interactions from rich behaviors. 
However,\nthese methods struggle to capture the preferences of tail users with sparse\ninteraction histories. To address the problem, we propose a novel variational\ninference approach, namely Group Prior Sampler Variational Inference (GPSVI),\nwhich introduces group preferences as priors to refine latent user interests\nfor tail users. In GPSVI, the extent of adjustments depends on the estimated\nuncertainty of individual preference modeling. In addition, we further enhance\nthe expressive power of variational inference with a volume-preserving flow. An\nappealing property of the GPSVI method is its ability to revert to traditional\nattention for head users with rich behavioral data while consistently enhancing\nperformance for long-tail users with sparse behaviors. Rigorous analysis and\nextensive experiments demonstrate that GPSVI consistently improves the\nperformance of tail users. Moreover, online A/B testing on a large-scale\nreal-world recommender system further confirms the effectiveness of our\nproposed approach.\n","authors":["Han Xu","Taoxing Pan","Zhiqiang Liu","Xiaoxiao Xu","Lantao Hu"],"pdf_url":"https://arxiv.org/pdf/2410.15098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15026v1","updated":"2024-10-19T07:49:21Z","published":"2024-10-19T07:49:21Z","title":"A Recommendation Model Utilizing Separation Embedding and Self-Attention\n for Feature Mining","summary":" With the explosive growth of Internet data, users are facing the problem of\ninformation overload, which makes it a challenge to efficiently obtain the\nrequired resources. Recommendation systems have emerged in this context. By\nfiltering massive amounts of information, they provide users with content that\nmeets their needs, playing a key role in scenarios such as advertising\nrecommendation and product recommendation. However, traditional click-through\nrate prediction and TOP-K recommendation mechanisms are gradually unable to\nmeet the recommendation needs in modern life scenarios due to high\ncomputational complexity, large memory consumption, long feature selection\ntime, and insufficient feature interaction. This paper proposes a\nrecommendation system model based on a separation embedding cross-network. The\nmodel uses an embedding neural network layer to transform sparse feature\nvectors into dense embedding vectors, and can independently perform feature\ncross operations on different dimensions, thereby improving the accuracy and\ndepth of feature mining. Experimental results show that the model exhibits\nstronger adaptability and higher prediction accuracy in processing complex data\nsets, effectively solving the problems present in existing models.\n","authors":["Wenyi Liu","Rui Wang","Yuanshuai Luo","Jianjun Wei","Zihao Zhao","Junming Huang"],"pdf_url":"https://arxiv.org/pdf/2410.15026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15016v1","updated":"2024-10-19T07:08:40Z","published":"2024-10-19T07:08:40Z","title":"Transit Pulse: Utilizing Social Media as a Source for Customer Feedback\n and Information Extraction with Large Language Model","summary":" Users of the transit system flood social networks daily with messages that\ncontain valuable insights crucial for improving service quality. These posts\nhelp transit agencies quickly identify emerging issues. Parsing topics and\nsentiments is key to gaining comprehensive insights to foster service\nexcellence. 
However, the volume of messages makes manual analysis impractical,\nand standard NLP techniques like Term Frequency-Inverse Document Frequency\n(TF-IDF) fall short in nuanced interpretation. Traditional sentiment analysis\nseparates topics and sentiments before integrating them, often missing the\ninteraction between them. This incremental approach complicates classification\nand reduces analytical productivity. To address these challenges, we propose a\nnovel approach to extracting and analyzing transit-related information,\nincluding sentiment and sarcasm detection, identification of unusual system\nproblems, and location data from social media. Our method employs Large\nLanguage Models (LLM), specifically Llama 3, for a streamlined analysis free\nfrom pre-established topic labels. To enhance the model's domain-specific\nknowledge, we utilize Retrieval-Augmented Generation (RAG), integrating\nexternal knowledge sources into the information extraction pipeline. We\nvalidated our method through extensive experiments comparing its performance\nwith traditional NLP approaches on user tweet data from the real world transit\nsystem. Our results demonstrate the potential of LLMs to transform social media\ndata analysis in the public transit domain, providing actionable insights and\nenhancing transit agencies' responsiveness by extracting a broader range of\ninformation.\n","authors":["Jiahao Wang","Amer Shalaby"],"pdf_url":"https://arxiv.org/pdf/2410.15016v1.pdf","comment":"17 pages, 21 figures"},{"id":"http://arxiv.org/abs/2410.14969v1","updated":"2024-10-19T04:20:23Z","published":"2024-10-19T04:20:23Z","title":"Visual Navigation of Digital Libraries: Retrieval and Classification of\n Images in the National Library of Norway's Digitised Book Collection","summary":" Digital tools for text analysis have long been essential for the\nsearchability and accessibility of digitised library collections. Recent\ncomputer vision advances have introduced similar capabilities for visual\nmaterials, with deep learning-based embeddings showing promise for analysing\nvisual heritage. Given that many books feature visuals in addition to text,\ntaking advantage of these breakthroughs is critical to making library\ncollections open and accessible. In this work, we present a proof-of-concept\nimage search application for exploring images in the National Library of\nNorway's pre-1900 books, comparing Vision Transformer (ViT), Contrastive\nLanguage-Image Pre-training (CLIP), and Sigmoid loss for Language-Image\nPre-training (SigLIP) embeddings for image retrieval and classification. Our\nresults show that the application performs well for exact image retrieval, with\nSigLIP embeddings slightly outperforming CLIP and ViT in both retrieval and\nclassification tasks. 
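As an illustration of the embedding-based exact image retrieval described above, a hedged sketch using CLIP image features and cosine similarity; the ViT and SigLIP encoders compared in the study would slot into the same loop, and the checkpoint name here is an assumption rather than the one used in the paper.

import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

@torch.no_grad()
def embed_images(images):
    # Encode a batch of PIL images into unit-norm embedding vectors.
    inputs = processor(images=images, return_tensors="pt")
    feats = model.get_image_features(**inputs)
    return torch.nn.functional.normalize(feats, dim=-1)

def retrieve(query_image, corpus_embeddings, top_k=5):
    # Rank the pre-embedded corpus by cosine similarity to the query embedding.
    query = embed_images([query_image])
    sims = query @ corpus_embeddings.T
    return sims.topk(top_k, dim=-1).indices.squeeze(0)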
Additionally, SigLIP-based image classification can aid\nin cleaning image datasets from a digitisation pipeline.\n","authors":["Marie Roald","Magnus Breder Birkenes","Lars Gunnarsønn Bagøien Johnsen"],"pdf_url":"https://arxiv.org/pdf/2410.14969v1.pdf","comment":"13 pages, 2 figures, 4 tables, Accepted to the 2024 Computational\n Humanities Research Conference (CHR)"},{"id":"http://arxiv.org/abs/2404.18424v3","updated":"2024-10-19T01:07:37Z","published":"2024-04-29T04:51:30Z","title":"PromptReps: Prompting Large Language Models to Generate Dense and Sparse\n Representations for Zero-Shot Document Retrieval","summary":" Utilizing large language models (LLMs) for zero-shot document ranking is done\nin one of two ways: (1) prompt-based re-ranking methods, which require no\nfurther training but are only feasible for re-ranking a handful of candidate\ndocuments due to computational costs; and (2) unsupervised contrastive trained\ndense retrieval methods, which can retrieve relevant documents from the entire\ncorpus but require a large amount of paired text data for contrastive training.\nIn this paper, we propose PromptReps, which combines the advantages of both\ncategories: no need for training and the ability to retrieve from the whole\ncorpus. Our method only requires prompts to guide an LLM to generate query and\ndocument representations for effective document retrieval. Specifically, we\nprompt the LLMs to represent a given text using a single word, and then use the\nlast token's hidden states and the corresponding logits associated with the\nprediction of the next token to construct a hybrid document retrieval system.\nThe retrieval system harnesses both dense text embedding and sparse\nbag-of-words representations given by the LLM. Our experimental evaluation on\nthe MSMARCO, TREC deep learning and BEIR zero-shot document retrieval datasets\nillustrates that this simple prompt-based LLM retrieval method can achieve a\nsimilar or higher retrieval effectiveness than state-of-the-art LLM embedding\nmethods that are trained with large amounts of unsupervised data, especially\nwhen using a larger LLM.\n","authors":["Shengyao Zhuang","Xueguang Ma","Bevan Koopman","Jimmy Lin","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2404.18424v3.pdf","comment":"EMNLP2024 main"}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.03650v2","updated":"2024-10-19T14:23:57Z","published":"2024-08-07T09:25:17Z","title":"Towards Multimodal Emotional Support Conversation Systems","summary":" The integration of conversational artificial intelligence (AI) into mental\nhealth care promises a new horizon for therapist-client interactions, aiming to\nclosely emulate the depth and nuance of human conversations. Despite the\npotential, the current landscape of conversational AI is markedly limited by\nits reliance on single-modal data, constraining the systems' ability to\nempathize and provide effective emotional support. This limitation stems from a\npaucity of resources that encapsulate the multimodal nature of human\ncommunication essential for therapeutic counseling. To address this gap, we\nintroduce the Multimodal Emotional Support Conversation (MESC) dataset, a\nfirst-of-its-kind resource enriched with comprehensive annotations across text,\naudio, and video modalities. This dataset captures the intricate interplay of\nuser emotions, system strategies, system emotion, and system responses, setting\na new precedent in the field. 
Leveraging the MESC dataset, we propose a general\nSequential Multimodal Emotional Support framework (SMES) grounded in\nTherapeutic Skills Theory. Tailored for multimodal dialogue systems, the SMES\nframework incorporates an LLM-based reasoning model that sequentially generates\nuser emotion recognition, system strategy prediction, system emotion\nprediction, and response generation. Our rigorous evaluations demonstrate that\nthis framework significantly enhances the capability of AI systems to mimic\ntherapist behaviors with heightened empathy and strategic responsiveness. By\nintegrating multimodal data in this innovative manner, we bridge the critical\ngap between emotion recognition and emotional support, marking a significant\nadvancement in conversational AI for mental health support.\n","authors":["Yuqi Chu","Lizi Liao","Zhiyuan Zhou","Chong-Wah Ngo","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2408.03650v2.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2410.15007v1","updated":"2024-10-19T06:42:43Z","published":"2024-10-19T06:42:43Z","title":"DiffuseST: Unleashing the Capability of the Diffusion Model for Style\n Transfer","summary":" Style transfer aims to fuse the artistic representation of a style image with\nthe structural information of a content image. Existing methods train specific\nnetworks or utilize pre-trained models to learn content and style features.\nHowever, they rely solely on textual or spatial representations that are\ninadequate to achieve the balance between content and style. In this work, we\npropose a novel and training-free approach for style transfer, combining\ntextual embedding with spatial features and separating the injection of content\nor style. Specifically, we adopt the BLIP-2 encoder to extract the textual\nrepresentation of the style image. We utilize the DDIM inversion technique to\nextract intermediate embeddings in content and style branches as spatial\nfeatures. Finally, we harness the step-by-step property of diffusion models by\nseparating the injection of content and style in the target branch, which\nimproves the balance between content preservation and style fusion. Various\nexperiments have demonstrated the effectiveness and robustness of our proposed\nDiffuseST for achieving balanced and controllable style transfer results, as\nwell as the potential to extend to other tasks.\n","authors":["Ying Hu","Chenyi Zhuang","Pan Gao"],"pdf_url":"https://arxiv.org/pdf/2410.15007v1.pdf","comment":"Accepted to ACMMM Asia 2024. Code is available at\n https://github.com/I2-Multimedia-Lab/DiffuseST"},{"id":"http://arxiv.org/abs/2410.14922v1","updated":"2024-10-19T00:49:05Z","published":"2024-10-19T00:49:05Z","title":"Testing and validation of innovative eXtended Reality technologies for\n astronaut training in a partial-gravity parabolic flight campaign","summary":" The use of eXtended Reality (XR) technologies in the space domain has\nincreased significantly over the past few years as they can offer many advantages\nwhen simulating complex and challenging environments. Space agencies are\ncurrently using these disruptive tools to train astronauts for Extravehicular\nActivities (EVAs), to test equipment and procedures, and to assess spacecraft\nand hardware designs. With the Moon being the current focus of the next\ngeneration of space exploration missions, simulating its harsh environment is\none of the key areas where XR can be applied, particularly for astronaut\ntraining. 
Peculiar lunar lighting conditions in combination with reduced\ngravity levels will highly impact human locomotion especially for movements\nsuch as walking, jumping, and running. In order to execute operations on the\nlunar surface and to safely live on the Moon for an extended period of time,\ninnovative training methodologies and tools such as XR are becoming paramount\nto perform pre-mission validation and certification. This research work\npresents the findings of the experiments aimed at exploring the integration of\nXR technology and parabolic flight activities for astronaut training. In\naddition, the study aims to consolidate these findings into a set of guidelines\nthat can assist future researchers who wish to incorporate XR technology into\nlunar training and preparation activities, including the use of such XR tools\nduring long duration missions.\n","authors":["Florian Saling","Andrea Emanuele Maria Casini","Andreas Treuer","Martial Costantini","Leonie Bensch","Tommy Nilsson","Lionel Ferra"],"pdf_url":"https://arxiv.org/pdf/2410.14922v1.pdf","comment":"75th International Astronautical Congress (IAC), Milan, Italy, 14-18\n October 2024"}]},"2024-10-18T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2407.12982v2","updated":"2024-10-18T18:42:25Z","published":"2024-07-17T20:01:21Z","title":"Retrieval-Enhanced Machine Learning: Synthesis and Opportunities","summary":" In the field of language modeling, models augmented with retrieval components\nhave emerged as a promising solution to address several challenges faced in the\nnatural language processing (NLP) field, including knowledge grounding,\ninterpretability, and scalability. Despite the primary focus on NLP, we posit\nthat the paradigm of retrieval-enhancement can be extended to a broader\nspectrum of machine learning (ML) such as computer vision, time series\nprediction, and computational biology. Therefore, this work introduces a formal\nframework of this paradigm, Retrieval-Enhanced Machine Learning (REML), by\nsynthesizing the literature in various domains in ML with consistent notations\nwhich is missing from the current literature. Also, we found that while a\nnumber of studies employ retrieval components to augment their models, there is\na lack of integration with foundational Information Retrieval (IR) research. We\nbridge this gap between the seminal IR research and contemporary REML studies\nby investigating each component that comprises the REML framework. Ultimately,\nthe goal of this work is to equip researchers across various disciplines with a\ncomprehensive, formally structured framework of retrieval-enhanced models,\nthereby fostering interdisciplinary future research.\n","authors":["To Eun Kim","Alireza Salemi","Andrew Drozdov","Fernando Diaz","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2407.12982v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14808v1","updated":"2024-10-18T18:30:05Z","published":"2024-10-18T18:30:05Z","title":"The S2 Hierarchical Discrete Global Grid as a Nexus for Data\n Representation, Integration, and Querying Across Geospatial Knowledge Graphs","summary":" Geospatial Knowledge Graphs (GeoKGs) have become integral to the growing\nfield of Geospatial Artificial Intelligence. Initiatives like the U.S. National\nScience Foundation's Open Knowledge Network program aim to create an ecosystem\nof nation-scale, cross-disciplinary GeoKGs that provide AI-ready geospatial\ndata aligned with FAIR principles. 
However, building this infrastructure\npresents key challenges, including 1) managing large volumes of data, 2) the\ncomputational complexity of discovering topological relations via SPARQL, and\n3) conflating multi-scale raster and vector data. Discrete Global Grid Systems\n(DGGS) help tackle these issues by offering efficient data integration and\nrepresentation strategies. The KnowWhereGraph utilizes Google's S2 Geometry --\na DGGS framework -- to enable efficient multi-source data processing,\nqualitative spatial querying, and cross-graph integration. This paper outlines\nthe implementation of S2 within KnowWhereGraph, emphasizing its role in\ntopologically enriching and semantically compressing data. Ultimately, this\nwork demonstrates the potential of DGGS frameworks, particularly S2, for\nbuilding scalable GeoKGs.\n","authors":["Shirly Stephen","Mitchell Faulk","Krzysztof Janowicz","Colby Fisher","Thomas Thelen","Rui Zhu","Pascal Hitzler","Cogan Shimizu","Kitty Currier","Mark Schildhauer","Dean Rehberger","Zhangyu Wang","Antrea Christou"],"pdf_url":"https://arxiv.org/pdf/2410.14808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08821v3","updated":"2024-10-18T17:50:57Z","published":"2024-08-16T16:09:59Z","title":"EasyRec: Simple yet Effective Language Models for Recommendation","summary":" Deep neural networks have become a powerful technique for learning\nrepresentations from user-item interaction data in collaborative filtering (CF)\nfor recommender systems. However, many existing methods heavily rely on unique\nuser and item IDs, which limits their ability to perform well in practical\nzero-shot learning scenarios where sufficient training data may be unavailable.\nInspired by the success of language models (LMs) and their strong\ngeneralization capabilities, a crucial question arises: How can we harness the\npotential of language models to empower recommender systems and elevate its\ngeneralization capabilities to new heights? In this study, we propose EasyRec -\nan effective and easy-to-use approach that seamlessly integrates text-based\nsemantic understanding with collaborative signals. EasyRec employs a\ntext-behavior alignment framework, which combines contrastive learning with\ncollaborative language model tuning, to ensure a strong alignment between the\ntext-enhanced semantic space and the collaborative behavior information.\nExtensive empirical evaluations across diverse real-world datasets demonstrate\nthe superior performance of EasyRec compared to state-of-the-art alternative\nmodels, particularly in the challenging text-based zero-shot recommendation\nscenarios. Furthermore, the study highlights the potential of seamlessly\nintegrating EasyRec as a plug-and-play component into text-enhanced\ncollaborative filtering frameworks, thereby empowering existing recommender\nsystems to elevate their recommendation performance and adapt to the evolving\nuser preferences in dynamic environments. 
For better result reproducibility of\nour EasyRec framework, the model implementation details, source code, and\ndatasets are available at the link: https://github.com/HKUDS/EasyRec.\n","authors":["Xubin Ren","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2408.08821v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14629v1","updated":"2024-10-18T17:30:17Z","published":"2024-10-18T17:30:17Z","title":"SIMformer: Single-Layer Vanilla Transformer Can Learn Free-Space\n Trajectory Similarity","summary":" Free-space trajectory similarity calculation, e.g., DTW, Hausdorff, and\nFrechet, often incur quadratic time complexity, thus learning-based methods\nhave been proposed to accelerate the computation. The core idea is to train an\nencoder to transform trajectories into representation vectors and then compute\nvector similarity to approximate the ground truth. However, existing methods\nface dual challenges of effectiveness and efficiency: 1) they all utilize\nEuclidean distance to compute representation similarity, which leads to the\nsevere curse of dimensionality issue -- reducing the distinguishability among\nrepresentations and significantly affecting the accuracy of subsequent\nsimilarity search tasks; 2) most of them are trained in triplets manner and\noften necessitate additional information which downgrades the efficiency; 3)\nprevious studies, while emphasizing the scalability in terms of efficiency,\noverlooked the deterioration of effectiveness when the dataset size grows. To\ncope with these issues, we propose a simple, yet accurate, fast, scalable model\nthat only uses a single-layer vanilla transformer encoder as the feature\nextractor and employs tailored representation similarity functions to\napproximate various ground truth similarity measures. Extensive experiments\ndemonstrate our model significantly mitigates the curse of dimensionality issue\nand outperforms the state-of-the-arts in effectiveness, efficiency, and\nscalability.\n","authors":["Chuang Yang","Renhe Jiang","Xiaohang Xu","Chuan Xiao","Kaoru Sezaki"],"pdf_url":"https://arxiv.org/pdf/2410.14629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14625v1","updated":"2024-10-18T17:27:07Z","published":"2024-10-18T17:27:07Z","title":"Enhancing AI Accessibility in Veterinary Medicine: Linking Classifiers\n and Electronic Health Records","summary":" In the rapidly evolving landscape of veterinary healthcare, integrating\nmachine learning (ML) clinical decision-making tools with electronic health\nrecords (EHRs) promises to improve diagnostic accuracy and patient care.\nHowever, the seamless integration of ML classifiers into existing EHRs in\nveterinary medicine is frequently hindered by the rigidity of EHR systems or\nthe limited availability of IT resources. To address this shortcoming, we\npresent Anna, a freely-available software solution that provides ML classifier\nresults for EHR laboratory data in real-time.\n","authors":["Chun Yin Kong","Picasso Vasquez","Makan Farhoodimoghadam","Chris Brandt","Titus C. Brown","Krystle L. Reagan","Allison Zwingenberger","Stefan M. 
Keller"],"pdf_url":"https://arxiv.org/pdf/2410.14625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14609v1","updated":"2024-10-18T17:03:17Z","published":"2024-10-18T17:03:17Z","title":"DiSCo Meets LLMs: A Unified Approach for Sparse Retrieval and Contextual\n Distillation in Conversational Search","summary":" Conversational Search (CS) is the task of retrieving relevant documents from\na corpus within a conversational context, combining retrieval with\nconversational context modeling. With the explosion of Large Language Models\n(LLMs), the CS field has seen major improvements with LLMs rewriting user\nqueries, accounting for conversational context. However, engaging LLMs at\ninference time harms efficiency. Current methods address this by distilling\nembeddings from human-rewritten queries to learn the context modeling task.\nYet, these approaches predominantly focus on context modeling, and only treat\nthe contrastive component of the retrieval task within a\ndistillation-independent loss term. To address these limitations, we propose a\nnew distillation method, as a relaxation of the previous objective, unifying\nretrieval and context modeling. We relax the existing training objectives by\ndistilling similarity scores between conversations and documents, rather than\nrelying solely on representation learning. Our proposed distillation objective\nallows for more freedom in the representation space and leverages the\ncontrastive nature of document relevance. Through experiments on Learned Sparse\nRetrieval (LSR) across 5 CS datasets, our approach demonstrates substantial\nimprovements in both in-domain and out-of-domain retrieval performance,\noutperforming state-of-the-art with gains of up to 6 points in recall for\nout-of-domain datasets. Additionally, through the relaxation of the objective,\nwe propose a multi-teacher distillation, using multiple LLMs as teachers,\nyielding additional gains, and outperforming the teachers themselves in\nin-domain experiments. Finally, analysis of the sparsity of the models reveals\nthat our distillation allows for better control over the sparsity of the\ntrained models.\n","authors":["Simon Lupart","Mohammad Aliannejadi","Evangelos Kanoulas"],"pdf_url":"https://arxiv.org/pdf/2410.14609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14567v1","updated":"2024-10-18T16:11:29Z","published":"2024-10-18T16:11:29Z","title":"RAG-ConfusionQA: A Benchmark for Evaluating LLMs on Confusing Questions","summary":" Conversational AI agents use Retrieval Augmented Generation (RAG) to provide\nverifiable document-grounded responses to user inquiries. However, many natural\nquestions do not have good answers: about 25\\% contain false\nassumptions~\\cite{Yu2023:CREPE}, and over 50\\% are\nambiguous~\\cite{Min2020:AmbigQA}. RAG agents need high-quality data to improve\ntheir responses to confusing questions. This paper presents a novel synthetic\ndata generation method to efficiently create a diverse set of context-grounded\nconfusing questions from a given document corpus. We conduct an empirical\ncomparative evaluation of several large language models as RAG agents to\nmeasure the accuracy of confusion detection and appropriate response\ngeneration. 
We contribute a benchmark dataset to the public domain.\n","authors":["Zhiyuan Peng","Jinming Nian","Alexandre Evfimievski","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2410.14567v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2407.14346v2","updated":"2024-10-18T13:59:54Z","published":"2024-07-19T14:28:53Z","title":"Improving Retrieval in Sponsored Search by Leveraging Query Context\n Signals","summary":" Accurately retrieving relevant bid keywords for user queries is critical in\nSponsored Search but remains challenging, particularly for short, ambiguous\nqueries. Existing dense and generative retrieval models often fail to capture\nnuanced user intent in these cases. To address this, we propose an approach to\nenhance query understanding by augmenting queries with rich contextual signals\nderived from web search results and large language models, stored in an online\ncache. Specifically, we use web search titles and snippets to ground queries in\nreal-world information and utilize GPT-4 to generate query rewrites and\nexplanations that clarify user intent. These signals are efficiently integrated\nthrough a Fusion-in-Decoder based Unity architecture, enabling both dense and\ngenerative retrieval with serving costs on par with traditional context-free\nmodels. To address scenarios where context is unavailable in the cache, we\nintroduce context glancing, a curriculum learning strategy that improves model\nrobustness and performance even without contextual signals during inference.\nExtensive offline experiments demonstrate that our context-aware approach\nsubstantially outperforms context-free models. Furthermore, online A/B testing\non a prominent search engine across 160+ countries shows significant\nimprovements in user engagement and revenue.\n","authors":["Akash Kumar Mohankumar","Gururaj K","Gagan Madan","Amit Singh"],"pdf_url":"https://arxiv.org/pdf/2407.14346v2.pdf","comment":"Accepted to EMNLP 2024 Industry Track. 10 pages, 10 tables, 1 figure"},{"id":"http://arxiv.org/abs/2409.14217v2","updated":"2024-10-18T13:25:29Z","published":"2024-09-21T18:39:53Z","title":"Revisiting BPR: A Replicability Study of a Common Recommender System\n Baseline","summary":" Bayesian Personalized Ranking (BPR), a collaborative filtering approach based\non matrix factorization, frequently serves as a benchmark for recommender\nsystems research. However, numerous studies often overlook the nuances of BPR\nimplementation, claiming that it performs worse than newly proposed methods\nacross various tasks. In this paper, we thoroughly examine the features of the\nBPR model, indicating their impact on its performance, and investigate\nopen-source BPR implementations. Our analysis reveals inconsistencies between\nthese implementations and the original BPR paper, leading to a significant\ndecrease in performance of up to 50% for specific implementations. 
Furthermore,\nthrough extensive experiments on real-world datasets under modern evaluation\nsettings, we demonstrate that with proper tuning of its hyperparameters, the\nBPR model can achieve performance levels close to state-of-the-art methods on\nthe top-n recommendation tasks and even outperform them on specific datasets.\nSpecifically, on the Million Song Dataset, the BPR model with hyperparameter\ntuning statistically significantly outperforms Mult-VAE by 10% in NDCG@100 with a\nbinary relevance function.\n","authors":["Aleksandr Milogradskii","Oleg Lashinin","Alexander P","Marina Ananyeva","Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2409.14217v2.pdf","comment":"This paper is accepted at the Reproducibility track of the ACM RecSys\n '24 conference"},{"id":"http://arxiv.org/abs/2410.14452v1","updated":"2024-10-18T13:24:18Z","published":"2024-10-18T13:24:18Z","title":"SPFresh: Incremental In-Place Update for Billion-Scale Vector Search","summary":" Approximate Nearest Neighbor Search (ANNS) is now widely used in various\napplications, ranging from information retrieval, question answering, and\nrecommendation, to search for similar high-dimensional vectors. As the amount\nof vector data grows continuously, it becomes important to support updates to the\nvector index, the enabling technique that allows for efficient and accurate\nANNS on vectors. Because of the curse of high dimensionality, it is often\ncostly to identify the right neighbors of a single new vector, a necessary\nprocess for index update. To amortize update costs, existing systems maintain a\nsecondary index to accumulate updates, which are merged into the main index by\nperiodically rebuilding the entire index. However, this approach suffers from\nhigh fluctuations in search latency and accuracy, not to mention that rebuilds\nrequire substantial resources and are extremely time-consuming. We\nintroduce SPFresh, a system that supports in-place vector updates. At the heart\nof SPFresh is LIRE, a lightweight incremental rebalancing protocol to split\nvector partitions and reassign vectors in the nearby partitions to adapt to\ndata distribution shift. LIRE achieves low-overhead vector updates by only\nreassigning vectors at the boundary between partitions, where in a high-quality\nvector index the number of such vectors is small. With LIRE, SPFresh\nprovides superior query latency and accuracy to solutions based on global\nrebuild, with only 1% of the DRAM and less than 10% of the cores needed at the peak\ncompared to the state-of-the-art, in a billion-scale vector index with a 1%\ndaily vector update rate.\n","authors":["Yuming Xu","Hengyu Liang","Jin Li","Shuotao Xu","Qi Chen","Qianxi Zhang","Cheng Li","Ziyue Yang","Fan Yang","Yuqing Yang","Peng Cheng","Mao Yang"],"pdf_url":"https://arxiv.org/pdf/2410.14452v1.pdf","comment":"SOSP 23"},{"id":"http://arxiv.org/abs/2410.14331v1","updated":"2024-10-18T09:43:30Z","published":"2024-10-18T09:43:30Z","title":"ChartifyText: Automated Chart Generation from Data-Involved Texts via\n LLM","summary":" Text documents with numerical values involved are widely used in various\napplications such as scientific research, economy, public health and\njournalism. However, it is difficult for readers to quickly interpret such\ndata-involved texts and gain deep insights. To fill this research gap, this\nwork aims to automatically generate charts to accurately convey the underlying\ndata and ideas to readers, which is essentially a challenging task. 
The\nchallenges originate from text ambiguities, intrinsic sparsity and uncertainty\nof data in text documents, and subjective sentiment differences. Specifically,\nwe propose ChartifyText, a novel fully-automated approach that leverages Large\nLanguage Models (LLMs) to convert complex data-involved texts to expressive\ncharts. It consists of two major modules: tabular data inference and expressive\nchart generation. The tabular data inference module employs systematic prompt\nengineering to guide the LLM (e.g., GPT-4) to infer table data, where data\nranges, uncertainties, missing data values and corresponding subjective\nsentiments are explicitly considered. The expressive chart generation module\naugments standard charts with intuitive visual encodings and concise texts to\naccurately convey the underlying data and insights. We extensively evaluate the\neffectiveness of ChartifyText on real-world data-involved text documents\nthrough case studies, in-depth interviews with three visualization experts, and\na carefully-designed user study with 15 participants. The results demonstrate\nthe usefulness and effectiveness of ChartifyText in helping readers efficiently\nand effectively make sense of data-involved texts.\n","authors":["Songheng Zhang","Lei Wang","Toby Jia-Jun Li","Qiaomu Shen","Yixin Cao","Yong Wang"],"pdf_url":"https://arxiv.org/pdf/2410.14331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14626v2","updated":"2024-10-18T08:56:18Z","published":"2023-10-23T07:00:51Z","title":"Conversational Recommender System and Large Language Model Are Made for\n Each Other in E-commerce Pre-sales Dialogue","summary":" E-commerce pre-sales dialogue aims to understand and elicit user needs and\npreferences for the items they are seeking so as to provide appropriate\nrecommendations. Conversational recommender systems (CRSs) learn user\nrepresentation and provide accurate recommendations based on dialogue context,\nbut rely on external knowledge. Large language models (LLMs) generate responses\nthat mimic pre-sales dialogues after fine-tuning, but lack domain-specific\nknowledge for accurate recommendations. Intuitively, the strengths of LLM and\nCRS in E-commerce pre-sales dialogues are complementary, yet no previous work\nhas explored this. This paper investigates the effectiveness of combining LLM\nand CRS in E-commerce pre-sales dialogues, proposing two collaboration methods:\nCRS assisting LLM and LLM assisting CRS. We conduct extensive experiments on a\nreal-world dataset of Ecommerce pre-sales dialogues. We analyze the impact of\ntwo collaborative approaches with two CRSs and two LLMs on four tasks of\nEcommerce pre-sales dialogue. We find that collaborations between CRS and LLM\ncan be very effective in some cases.\n","authors":["Yuanxing Liu","Wei-Nan Zhang","Yifan Chen","Yuchi Zhang","Haopeng Bai","Fan Feng","Hengbin Cui","Yongbin Li","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2310.14626v2.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2410.13230v2","updated":"2024-10-18T08:36:37Z","published":"2024-10-17T05:33:50Z","title":"Starbucks: Improved Training for 2D Matryoshka Embeddings","summary":" Effective approaches that can scale embedding model depth (i.e. layers) and\nembedding size allow for the creation of models that are highly scalable across\ndifferent computational resources and task requirements. 
While the recently\nproposed 2D Matryoshka training approach can efficiently produce a single\nembedding model such that its sub-layers and sub-dimensions can measure text\nsimilarity, its effectiveness is significantly worse than if smaller models\nwere trained separately. To address this issue, we propose Starbucks, a new\ntraining strategy for Matryoshka-like embedding models, which encompasses both\nthe fine-tuning and pre-training phases. For the fine-tuning phase, we discover\nthat, rather than sampling a random sub-layer and sub-dimensions for each\ntraining steps, providing a fixed list of layer-dimension pairs, from small\nsize to large sizes, and computing the loss across all pairs significantly\nimproves the effectiveness of 2D Matryoshka embedding models, bringing them on\npar with their separately trained counterparts. To further enhance performance,\nwe introduce a new pre-training strategy, which applies masked autoencoder\nlanguage modelling to sub-layers and sub-dimensions during pre-training,\nresulting in a stronger backbone for subsequent fine-tuning of the embedding\nmodel. Experimental results on both semantic text similarity and retrieval\nbenchmarks demonstrate that the proposed pre-training and fine-tuning\nstrategies significantly improved the effectiveness over 2D Matryoshka models,\nenabling Starbucks models to perform more efficiently and effectively than\nseparately trained models.\n","authors":["Shengyao Zhuang","Shuai Wang","Bevan Koopman","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2410.13230v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06572v2","updated":"2024-10-18T08:20:38Z","published":"2024-06-03T17:07:46Z","title":"Graph Neural Network Enhanced Retrieval for Question Answering of LLMs","summary":" Retrieval augmented generation has revolutionized large language model (LLM)\noutputs by providing factual supports. Nevertheless, it struggles to capture\nall the necessary knowledge for complex reasoning questions. Existing retrieval\nmethods typically divide reference documents into passages, treating them in\nisolation. These passages, however, are often interrelated, such as passages\nthat are contiguous or share the same keywords. Therefore, it is crucial to\nrecognize such relatedness for enhancing the retrieval process. In this paper,\nwe propose a novel retrieval method, called GNN-Ret, which leverages graph\nneural networks (GNNs) to enhance retrieval by exploiting the relatedness\nbetween passages. Specifically, we first construct a graph of passages by\nconnecting passages that are structure-related or keyword-related. A graph\nneural network (GNN) is then leveraged to exploit the relationships between\npassages and improve the retrieval of supporting passages. Furthermore, we\nextend our method to handle multi-hop reasoning questions using a recurrent\ngraph neural network (RGNN), named RGNN-Ret. At each step, RGNN-Ret integrates\nthe graphs of passages from previous steps, thereby enhancing the retrieval of\nsupporting passages. 
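A rough sketch of the passage-graph idea described above for GNN-Ret: passages are connected when they are contiguous in the source document (structure-related) or share a keyword (keyword-related), and a single mean-over-neighbours aggregation stands in for the GNN; the function names and the simplified aggregation are assumptions for illustration, not the paper's implementation.

import numpy as np

def build_passage_graph(passages, keywords_per_passage):
    # keywords_per_passage: list of sets of keywords, one set per passage.
    n = len(passages)
    adj = np.eye(n)
    for i in range(n - 1):                       # structure-related: contiguous passages
        adj[i, i + 1] = adj[i + 1, i] = 1.0
    for i in range(n):                           # keyword-related: shared keywords
        for j in range(i + 1, n):
            if keywords_per_passage[i] & keywords_per_passage[j]:
                adj[i, j] = adj[j, i] = 1.0
    return adj / adj.sum(axis=1, keepdims=True)  # row-normalise for mean aggregation

def propagate(passage_embeddings, adj, hops=1):
    # Each passage embedding absorbs information from its graph neighbours,
    # and the smoothed embeddings are then used for similarity-based retrieval.
    h = passage_embeddings
    for _ in range(hops):
        h = adj @ h
    return h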
Extensive experiments on benchmark datasets demonstrate\nthat GNN-Ret achieves higher accuracy for question answering with a single\nquery of LLMs than strong baselines that require multiple queries, and RGNN-Ret\nfurther improves accuracy and achieves state-of-the-art performance, with up to\n10.4% accuracy improvement on the 2WikiMQA dataset.\n","authors":["Zijian Li","Qingyan Guo","Jiawei Shao","Lei Song","Jiang Bian","Jun Zhang","Rui Wang"],"pdf_url":"https://arxiv.org/pdf/2406.06572v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2410.14241v1","updated":"2024-10-18T07:44:12Z","published":"2024-10-18T07:44:12Z","title":"Graph Neural Patching for Cold-Start Recommendations","summary":" The cold start problem in recommender systems remains a critical challenge.\nCurrent solutions often train hybrid models on auxiliary data for both cold and\nwarm users/items, potentially degrading the experience for the latter. This\ndrawback limits their viability in practical scenarios where the satisfaction\nof existing warm users/items is paramount. Although graph neural networks\n(GNNs) excel at warm recommendations by effective collaborative signal\nmodeling, they haven't been effectively leveraged for the cold-start issue\nwithin a user-item graph, which is largely due to the lack of initial\nconnections for cold user/item entities. Addressing this requires a GNN adept\nat cold-start recommendations without sacrificing performance for existing\nones. To this end, we introduce Graph Neural Patching for Cold-Start\nRecommendations (GNP), a customized GNN framework with dual functionalities:\nGWarmer for modeling collaborative signal on existing warm users/items and\nPatching Networks for simulating and enhancing GWarmer's performance on\ncold-start recommendations. Extensive experiments on three benchmark datasets\nconfirm GNP's superiority in recommending both warm and cold users/items.\n","authors":["Hao Chen","Yu Yang","Yuanchen Bei","Zefan Wang","Yue Xu","Feiran Huang"],"pdf_url":"https://arxiv.org/pdf/2410.14241v1.pdf","comment":"13 pages, accepted by Australasian Database Conference 2024. arXiv\n admin note: substantial text overlap with arXiv:2209.12215"},{"id":"http://arxiv.org/abs/2406.00012v2","updated":"2024-10-18T06:07:06Z","published":"2024-05-20T09:24:45Z","title":"FINED: Feed Instance-Wise Information Need with Essential and\n Disentangled Parametric Knowledge from the Past","summary":" Recommender models play a vital role in various industrial scenarios, while\noften faced with the catastrophic forgetting problem caused by the fast\nshifting data distribution. To alleviate this problem, a common approach is to\nreuse knowledge from the historical data. However, preserving the vast and\nfast-accumulating data is hard, which causes dramatic storage overhead.\nMemorizing old data through a parametric knowledge base is then proposed, which\ncompresses the vast amount of raw data into model parameters. Despite the\nflexibility, how to improve the memorization and generalization capabilities of\nthe parametric knowledge base and suit the flexible information need of each\ninstance are challenging. In this paper, we propose FINED to Feed INstance-wise\ninformation need with Essential and Disentangled parametric knowledge from past\ndata for recommendation enhancement. 
Concretely, we train a knowledge extractor\nthat extracts knowledge patterns of arbitrary order from past data and a\nknowledge encoder that memorizes the arbitrary order patterns, which serves as\nthe retrieval key generator and memory network respectively in the following\nknowledge reusing phase. The whole process is regularized by the proposed two\nconstraints, which improve the capabilities of the parametric knowledge base\nwithout increasing the size of it. The essential principle helps to compress\nthe input into representative vectors that capture the task-relevant\ninformation and filter out the noisy information. The disentanglement principle\nreduces the redundancy of stored information and pushes the knowledge base to\nfocus on capturing the disentangled invariant patterns. These two rules\ntogether promote rational compression of information for robust and generalized\nknowledge representations. Extensive experiments on two datasets justify the\neffectiveness of the proposed method.\n","authors":["Kounianhua Du","Jizheng Chen","Jianghao Lin","Menghui Zhu","Bo Chen","Shuai Li","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.00012v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14170v1","updated":"2024-10-18T04:20:46Z","published":"2024-10-18T04:20:46Z","title":"Personalized Image Generation with Large Multimodal Models","summary":" Personalized content filtering, such as recommender systems, has become a\ncritical infrastructure to alleviate information overload. However, these\nsystems merely filter existing content and are constrained by its limited\ndiversity, making it difficult to meet users' varied content needs. To address\nthis limitation, personalized content generation has emerged as a promising\ndirection with broad applications. Nevertheless, most existing research focuses\non personalized text generation, with relatively little attention given to\npersonalized image generation. The limited work in personalized image\ngeneration faces challenges in accurately capturing users' visual preferences\nand needs from noisy user-interacted images and complex multimodal\ninstructions. Worse still, there is a lack of supervised data for training\npersonalized image generation models.\n To overcome the challenges, we propose a Personalized Image Generation\nFramework named Pigeon, which adopts exceptional large multimodal models with\nthree dedicated modules to capture users' visual preferences and needs from\nnoisy user history and multimodal instructions. To alleviate the data scarcity,\nwe introduce a two-stage preference alignment scheme, comprising masked\npreference reconstruction and pairwise preference alignment, to align Pigeon\nwith the personalized image generation task. 
We apply Pigeon to personalized\nsticker and movie poster generation, where extensive quantitative results and\nhuman evaluation highlight its superiority over various generative baselines.\n","authors":["Yiyan Xu","Wenjie Wang","Yang Zhang","Tang Biao","Peng Yan","Fuli Feng","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2410.14170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14167v1","updated":"2024-10-18T04:17:49Z","published":"2024-10-18T04:17:49Z","title":"Optimizing Retrieval-Augmented Generation with Elasticsearch for\n Enhanced Question-Answering Systems","summary":" This study aims to improve the accuracy and quality of large-scale language\nmodels (LLMs) in answering questions by integrating Elasticsearch into the\nRetrieval Augmented Generation (RAG) framework. The experiment uses the\nStanford Question Answering Dataset (SQuAD) version 2.0 as the test dataset and\ncompares the performance of different retrieval methods, including traditional\nmethods based on keyword matching or semantic similarity calculation, BM25-RAG\nand TF-IDF- RAG, and the newly proposed ES-RAG scheme. The results show that\nES-RAG not only has obvious advantages in retrieval efficiency but also\nperforms well in key indicators such as accuracy, which is 0.51 percentage\npoints higher than TF-IDF-RAG. In addition, Elasticsearch's powerful search\ncapabilities and rich configuration options enable the entire\nquestion-answering system to better handle complex queries and provide more\nflexible and efficient responses based on the diverse needs of users. Future\nresearch directions can further explore how to optimize the interaction\nmechanism between Elasticsearch and LLM, such as introducing higher-level\nsemantic understanding and context-awareness capabilities, to achieve a more\nintelligent and humanized question-answering experience.\n","authors":["Jiajing Chen","Runyuan Bao","Hongye Zheng","Zhen Qi","Jianjun Wei","Jiacheng Hu"],"pdf_url":"https://arxiv.org/pdf/2410.14167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14122v1","updated":"2024-10-18T02:31:36Z","published":"2024-10-18T02:31:36Z","title":"Towards Robust Transcription: Exploring Noise Injection Strategies for\n Training Data Augmentation","summary":" Recent advancements in Automatic Piano Transcription (APT) have significantly\nimproved system performance, but the impact of noisy environments on the system\nperformance remains largely unexplored. This study investigates the impact of\nwhite noise at various Signal-to-Noise Ratio (SNR) levels on state-of-the-art\nAPT models and evaluates the performance of the Onsets and Frames model when\ntrained on noise-augmented data. We hope this research provides valuable\ninsights as preliminary work toward developing transcription models that\nmaintain consistent performance across a range of acoustic conditions.\n","authors":["Yonghyun Kim","Alexander Lerch"],"pdf_url":"https://arxiv.org/pdf/2410.14122v1.pdf","comment":"Accepted to the Late-Breaking Demo Session of the 25th International\n Society for Music Information Retrieval (ISMIR) Conference, 2024"}],"Multimedia":[{"id":"http://arxiv.org/abs/2404.17821v2","updated":"2024-10-18T21:55:23Z","published":"2024-04-27T08:00:43Z","title":"An automatic mixing speech enhancement system for multi-track audio","summary":" We propose a speech enhancement system for multitrack audio. The system will\nminimize auditory masking while allowing one to hear multiple simultaneous\nspeakers. 
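A hedged sketch of the Elasticsearch-backed retrieval step that an ES-RAG pipeline of the kind described above builds on: BM25 ranking via Elasticsearch's default scoring, with the top hits packed into a prompt for the LLM; the index name, field name, and the final generation call are illustrative assumptions, not details from the paper.

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

def retrieve_context(question, index="squad_passages", k=5):
    # BM25 is Elasticsearch's default text scoring, so a plain match query suffices.
    resp = es.search(index=index, query={"match": {"text": question}}, size=k)
    return [hit["_source"]["text"] for hit in resp["hits"]["hits"]]

def build_prompt(question, passages):
    context = "\n\n".join(passages)
    return (
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    )

# answer = some_llm.generate(build_prompt(q, retrieve_context(q)))  # hypothetical LLM call

Swapping the match query for a TF-IDF or dense-vector retriever changes only retrieve_context, which is what makes retriever comparisons of this kind straightforward to run.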
The system can be used in multiple communication scenarios e.g.,\nteleconferencing, invoice gaming, and live streaming. The ITU-R BS.1387\nPerceptual Evaluation of Audio Quality (PEAQ) model is used to evaluate the\namount of masking in the audio signals. Different audio effects e.g., level\nbalance, equalization, dynamic range compression, and spatialization are\napplied via an iterative Harmony searching algorithm that aims to minimize the\nmasking. In the subjective listening test, the designed system can compete with\nmixes by professional sound engineers and outperforms mixes by existing\nauto-mixing systems.\n","authors":["Xiaojing Liu","Hongwei Ai","Joshua D. Reiss"],"pdf_url":"https://arxiv.org/pdf/2404.17821v2.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2410.14634v1","updated":"2024-10-18T17:35:33Z","published":"2024-10-18T17:35:33Z","title":"Parallel Backpropagation for Inverse of a Convolution with Application\n to Normalizing Flows","summary":" Inverse of an invertible convolution is an important operation that comes up\nin Normalizing Flows, Image Deblurring, etc. The naive algorithm for\nbackpropagation of this operation using Gaussian elimination has running time\n$O(n^3)$ where $n$ is the number of pixels in the image. We give a fast\nparallel backpropagation algorithm with running time $O(\\sqrt{n})$ for a square\nimage and provide a GPU implementation of the same. Inverse Convolutions are\nusually used in Normalizing Flows in the sampling pass, making them slow. We\npropose to use Inverse Convolutions in the forward (image to latent vector)\npass of the Normalizing flow. Since the sampling pass is the inverse of the\nforward pass, it will use convolutions only, resulting in efficient sampling\ntimes. We use our parallel backpropagation algorithm for optimizing the inverse\nconvolution layer resulting in fast training times also. We implement this\napproach in various Normalizing Flow backbones, resulting in our Inverse-Flow\nmodels. We benchmark Inverse-Flow on standard datasets and show significantly\nimproved sampling times with similar bits per dimension compared to previous\nmodels.\n","authors":["Sandeep Nagar","Girish Varma"],"pdf_url":"https://arxiv.org/pdf/2410.14634v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2404.13370v2","updated":"2024-10-18T16:44:05Z","published":"2024-04-20T13:15:27Z","title":"Movie101v2: Improved Movie Narration Benchmark","summary":" Automatic movie narration aims to generate video-aligned plot descriptions to\nassist visually impaired audiences. Unlike standard video captioning, it\ninvolves not only describing key visual details but also inferring plots that\nunfold across multiple movie shots, presenting distinct and complex challenges.\nTo advance this field, we introduce Movie101v2, a large-scale, bilingual\ndataset with enhanced data quality specifically designed for movie narration.\nRevisiting the task, we propose breaking down the ultimate goal of automatic\nmovie narration into three progressive stages, offering a clear roadmap with\ncorresponding evaluation metrics. Based on our new benchmark, we baseline a\nrange of large vision-language models, including GPT-4V, and conduct an\nin-depth analysis of the challenges in narration generation. 
Our findings\nhighlight that achieving applicable movie narration generation is a fascinating\ngoal that requires significant research.\n","authors":["Zihao Yue","Yepeng Zhang","Ziheng Wang","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2404.13370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06729v2","updated":"2024-10-18T13:45:09Z","published":"2024-10-09T09:55:51Z","title":"Perceptual Quality Assessment of Octree-RAHT Encoded 3D Point Clouds","summary":" No-reference bitstream-layer point cloud quality assessment (PCQA) can be\ndeployed without full decoding at any network node to achieve real-time quality\nmonitoring. In this work, we focus on the PCQA problem dedicated to Octree-RAHT\nencoding mode. First, to address the issue that existing PCQA databases have a\nsmall scale and limited distortion levels, we establish the WPC5.0 database\nwhich is the first one dedicated to Octree-RAHT encoding mode with a scale of\n400 distorted point clouds (PCs) including 4 geometric multiplied by 5 attitude\ndistortion levels. Then, we propose the first PCQA model dedicated to\nOctree-RAHT encoding mode by parsing PC bitstreams without full decoding. The\nmodel introduces texture bitrate (TBPP) to predict texture complexity (TC) and\nfurther derives the texture distortion factor. In addition, the Geometric\nQuantization Parameter (PQS) is used to estimate the geometric distortion\nfactor, which is then integrated into the model along with the texture\ndistortion factor to obtain the proposed PCQA model named streamPCQ-OR. The\nproposed model has been compared with other advanced PCQA methods on the\nWPC5.0, BASICS and M-PCCD databases, and experimental results show that our\nmodel has excellent performance while having very low computational complexity,\nproviding a reliable choice for time-critical applications. To facilitate\nsubsequent research, the database and source code will be publicly released at\nhttps://github.com/qdushl/Waterloo-Point-Cloud-Database-5.0.\n","authors":["Dongshuai Duan","Honglei Su","Qi Liu","Hui Yuan","Wei Gao","Jiarun Song","Zhou Wang"],"pdf_url":"https://arxiv.org/pdf/2410.06729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10291v2","updated":"2024-10-18T09:26:46Z","published":"2024-10-14T08:45:35Z","title":"Evaluating Semantic Variation in Text-to-Image Synthesis: A Causal\n Perspective","summary":" Accurate interpretation and visualization of human instructions are crucial\nfor text-to-image (T2I) synthesis. However, current models struggle to capture\nsemantic variations from word order changes, and existing evaluations, relying\non indirect metrics like text-image similarity, fail to reliably assess these\nchallenges. This often obscures poor performance on complex or uncommon\nlinguistic patterns by the focus on frequent word combinations. To address\nthese deficiencies, we propose a novel metric called SemVarEffect and a\nbenchmark named SemVarBench, designed to evaluate the causality between\nsemantic variations in inputs and outputs in T2I synthesis. Semantic variations\nare achieved through two types of linguistic permutations, while avoiding\neasily predictable literal variations. Experiments reveal that the\nCogView-3-Plus and Ideogram 2 performed the best, achieving a score of 0.2/1.\nSemantic variations in object relations are less understood than attributes,\nscoring 0.07/1 compared to 0.17-0.19/1. 
We found that cross-modal alignment in\nUNet or Transformers plays a crucial role in handling semantic variations, a\nfactor previously overlooked by a focus on textual encoders. Our work\nestablishes an effective evaluation framework that advances the T2I synthesis\ncommunity's exploration of human instruction understanding. Our benchmark and\ncode are available at https://github.com/zhuxiangru/SemVarBench .\n","authors":["Xiangru Zhu","Penglei Sun","Yaoxian Song","Yanghua Xiao","Zhixu Li","Chengyu Wang","Jun Huang","Bei Yang","Xiaoxiao Xu"],"pdf_url":"https://arxiv.org/pdf/2410.10291v2.pdf","comment":"The only change in the current version update is the replacement of\n the template with a more precise one"},{"id":"http://arxiv.org/abs/2410.13754v2","updated":"2024-10-18T08:56:52Z","published":"2024-10-17T16:52:28Z","title":"MixEval-X: Any-to-Any Evaluations from Real-World Data Mixtures","summary":" Perceiving and generating diverse modalities are crucial for AI models to\neffectively learn from and engage with real-world signals, necessitating\nreliable evaluations for their development. We identify two major issues in\ncurrent evaluations: (1) inconsistent standards, shaped by different\ncommunities with varying protocols and maturity levels; and (2) significant\nquery, grading, and generalization biases. To address these, we introduce\nMixEval-X, the first any-to-any, real-world benchmark designed to optimize and\nstandardize evaluations across diverse input and output modalities. We propose\nmulti-modal benchmark mixture and adaptation-rectification pipelines to\nreconstruct real-world task distributions, ensuring evaluations generalize\neffectively to real-world use cases. Extensive meta-evaluations show our\napproach effectively aligns benchmark samples with real-world task\ndistributions. Meanwhile, MixEval-X's model rankings correlate strongly with\nthat of crowd-sourced real-world evaluations (up to 0.98) while being much more\nefficient. We provide comprehensive leaderboards to rerank existing models and\norganizations and offer insights to enhance understanding of multi-modal\nevaluations and inform future research.\n","authors":["Jinjie Ni","Yifan Song","Deepanway Ghosal","Bo Li","David Junhao Zhang","Xiang Yue","Fuzhao Xue","Zian Zheng","Kaichen Zhang","Mahir Shah","Kabir Jain","Yang You","Michael Shieh"],"pdf_url":"https://arxiv.org/pdf/2410.13754v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14154v1","updated":"2024-10-18T03:45:19Z","published":"2024-10-18T03:45:19Z","title":"RA-BLIP: Multimodal Adaptive Retrieval-Augmented Bootstrapping\n Language-Image Pre-training","summary":" Multimodal Large Language Models (MLLMs) have recently received substantial\ninterest, which shows their emerging potential as general-purpose models for\nvarious vision-language tasks. MLLMs involve significant external knowledge\nwithin their parameters; however, it is challenging to continually update these\nmodels with the latest knowledge, which involves huge computational costs and\npoor interpretability. Retrieval augmentation techniques have proven to be\neffective plugins for both LLMs and MLLMs. In this study, we propose multimodal\nadaptive Retrieval-Augmented Bootstrapping Language-Image Pre-training\n(RA-BLIP), a novel retrieval-augmented framework for various MLLMs. 
Considering\nthe redundant information within vision modality, we first leverage the\nquestion to instruct the extraction of visual information through interactions\nwith one set of learnable queries, minimizing irrelevant interference during\nretrieval and generation. Besides, we introduce a pre-trained multimodal\nadaptive fusion module to achieve question text-to-multimodal retrieval and\nintegration of multimodal knowledge by projecting visual and language\nmodalities into a unified semantic space. Furthermore, we present an Adaptive\nSelection Knowledge Generation (ASKG) strategy to train the generator to\nautonomously discern the relevance of retrieved knowledge, which realizes\nexcellent denoising performance. Extensive experiments on open multimodal\nquestion-answering datasets demonstrate that RA-BLIP achieves significant\nperformance and surpasses the state-of-the-art retrieval-augmented models.\n","authors":["Muhe Ding","Yang Ma","Pengda Qin","Jianlong Wu","Yuhong Li","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2410.14154v1.pdf","comment":"10 pages, 6 figures, Journal"},{"id":"http://arxiv.org/abs/2402.07640v3","updated":"2024-10-18T02:50:53Z","published":"2024-02-12T13:27:22Z","title":"Synthesizing Sentiment-Controlled Feedback For Multimodal Text and Image\n Data","summary":" The ability to generate sentiment-controlled feedback in response to\nmultimodal inputs comprising text and images addresses a critical gap in\nhuman-computer interaction. This capability allows systems to provide\nempathetic, accurate, and engaging responses, with useful applications in\neducation, healthcare, marketing, and customer service. To this end, we have\nconstructed a large-scale Controllable Multimodal Feedback Synthesis (CMFeed)\ndataset and propose a controllable feedback synthesis system. The system\nfeatures an encoder, decoder, and controllability block for textual and visual\ninputs. It extracts features using a transformer and Faster R-CNN networks,\ncombining them to generate feedback. The CMFeed dataset includes images, texts,\nreactions to the posts, human comments with relevance scores, and reactions to\nthese comments. These reactions train the model to produce feedback with\nspecified sentiments, achieving a sentiment classification accuracy of 77.23\\%,\nwhich is 18.82\\% higher than the accuracy without controllability. The system\nalso incorporates a similarity module for assessing feedback relevance through\nrank-based metrics and an interpretability technique to analyze the\ncontributions of textual and visual features during feedback generation. Access\nto the CMFeed dataset and the system's code is available at\nhttps://github.com/MIntelligence-Group/CMFeed.\n","authors":["Puneet Kumar","Sarthak Malik","Balasubramanian Raman","Xiaobai Li"],"pdf_url":"https://arxiv.org/pdf/2402.07640v3.pdf","comment":null}]},"2024-10-17T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2210.16953v2","updated":"2024-10-17T22:47:50Z","published":"2022-10-30T21:26:07Z","title":"Improving Bilingual Lexicon Induction with Cross-Encoder Reranking","summary":" Bilingual lexicon induction (BLI) with limited bilingual supervision is a\ncrucial yet challenging task in multilingual NLP. 
Current state-of-the-art BLI\nmethods rely on the induction of cross-lingual word embeddings (CLWEs) to\ncapture cross-lingual word similarities; such CLWEs are obtained 1) via\ntraditional static models (e.g., VecMap), or 2) by extracting type-level CLWEs\nfrom multilingual pretrained language models (mPLMs), or 3) through combining\nthe former two options. In this work, we propose a novel semi-supervised\npost-hoc reranking method termed BLICEr (BLI with Cross-Encoder Reranking),\napplicable to any precalculated CLWE space, which improves their BLI\ncapability. The key idea is to 'extract' cross-lingual lexical knowledge from\nmPLMs, and then combine it with the original CLWEs. This crucial step is done\nvia 1) creating a word similarity dataset, comprising positive word pairs\n(i.e., true translations) and hard negative pairs induced from the original\nCLWE space, and then 2) fine-tuning an mPLM (e.g., mBERT or XLM-R) in a\ncross-encoder manner to predict the similarity scores. At inference, we 3)\ncombine the similarity score from the original CLWE space with the score from\nthe BLI-tuned cross-encoder. BLICEr establishes new state-of-the-art results on\ntwo standard BLI benchmarks spanning a wide spectrum of diverse languages: it\nsubstantially outperforms a series of strong baselines across the board. We\nalso validate the robustness of BLICEr with different CLWEs.\n","authors":["Yaoyiran Li","Fangyu Liu","Ivan Vulić","Anna Korhonen"],"pdf_url":"https://arxiv.org/pdf/2210.16953v2.pdf","comment":"Findings of EMNLP 2022"},{"id":"http://arxiv.org/abs/2410.12123v2","updated":"2024-10-17T22:06:32Z","published":"2024-10-15T23:51:04Z","title":"The Moral Case for Using Language Model Agents for Recommendation","summary":" Our information and communication environment has fallen short of the ideals\nthat networked global communication might have served. Identifying all the\ncauses of its pathologies is difficult, but existing recommender systems very\nlikely play a contributing role. In this paper, which draws on the normative\ntools of philosophy of computing, informed by empirical and technical insights\nfrom natural language processing and recommender systems, we make the moral\ncase for an alternative approach. We argue that existing recommenders\nincentivise mass surveillance, concentrate power, fall prey to narrow\nbehaviourism, and compromise user agency. Rather than just trying to avoid\nalgorithms entirely, or to make incremental improvements to the current\nparadigm, researchers and engineers should explore an alternative paradigm: the\nuse of language model (LM) agents to source and curate content that matches\nusers' preferences and values, expressed in natural language. The use of LM\nagents for recommendation poses its own challenges, including those related to\ncandidate generation, computational efficiency, preference modelling, and\nprompt injection. 
Nonetheless, if implemented successfully LM agents could:\nguide us through the digital public sphere without relying on mass\nsurveillance; shift power away from platforms towards users; optimise for what\nmatters instead of just for behavioural proxies; and scaffold our agency\ninstead of undermining it.\n","authors":["Seth Lazar","Luke Thorburn","Tian Jin","Luca Belli"],"pdf_url":"https://arxiv.org/pdf/2410.12123v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.08307v5","updated":"2024-10-17T21:50:37Z","published":"2022-03-15T22:51:22Z","title":"Improving Word Translation via Two-Stage Contrastive Learning","summary":" Word translation or bilingual lexicon induction (BLI) is a key cross-lingual\ntask, aiming to bridge the lexical gap between different languages. In this\nwork, we propose a robust and effective two-stage contrastive learning\nframework for the BLI task. At Stage C1, we propose to refine standard\ncross-lingual linear maps between static word embeddings (WEs) via a\ncontrastive learning objective; we also show how to integrate it into the\nself-learning procedure for even more refined cross-lingual maps. In Stage C2,\nwe conduct BLI-oriented contrastive fine-tuning of mBERT, unlocking its word\ntranslation capability. We also show that static WEs induced from the\n`C2-tuned' mBERT complement static WEs from Stage C1. Comprehensive experiments\non standard BLI datasets for diverse languages and different experimental\nsetups demonstrate substantial gains achieved by our framework. While the BLI\nmethod from Stage C1 already yields substantial gains over all state-of-the-art\nBLI methods in our comparison, even stronger improvements are met with the full\ntwo-stage framework: e.g., we report gains for 112/112 BLI setups, spanning 28\nlanguage pairs.\n","authors":["Yaoyiran Li","Fangyu Liu","Nigel Collier","Anna Korhonen","Ivan Vulić"],"pdf_url":"https://arxiv.org/pdf/2203.08307v5.pdf","comment":"ACL 2022 Main"},{"id":"http://arxiv.org/abs/2410.14044v1","updated":"2024-10-17T21:37:08Z","published":"2024-10-17T21:37:08Z","title":"Best in Tau@LLMJudge: Criteria-Based Relevance Evaluation with Llama3","summary":" Traditional evaluation of information retrieval (IR) systems relies on\nhuman-annotated relevance labels, which can be both biased and costly at scale.\nIn this context, large language models (LLMs) offer an alternative by allowing\nus to directly prompt them to assign relevance labels for passages associated\nwith each query. In this study, we explore alternative methods to directly\nprompt LLMs for assigned relevance labels, by exploring two hypotheses:\n Hypothesis 1 assumes that it is helpful to break down \"relevance\" into\nspecific criteria - exactness, coverage, topicality, and contextual fit. We\nexplore different approaches that prompt large language models (LLMs) to obtain\ncriteria-level grades for all passages, and we consider various ways to\naggregate criteria-level grades into a relevance label. Hypothesis 2 assumes\nthat differences in linguistic style between queries and passages may\nnegatively impact the automatic relevance label prediction. 
We explore whether\nimprovements can be achieved by first synthesizing a summary of the passage in\nthe linguistic style of a query, and then using this summary in place of the\npassage to assess its relevance.\n We include an empirical evaluation of our approaches based on data from the\nLLMJudge challenge run in Summer 2024, where our \"Four Prompts\" approach\nobtained the highest scores in Kendall's tau.\n","authors":["Naghmeh Farzi","Laura Dietz"],"pdf_url":"https://arxiv.org/pdf/2410.14044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14043v1","updated":"2024-10-17T21:35:55Z","published":"2024-10-17T21:35:55Z","title":"Efficient Retrieval of Temporal Event Sequences from Textual\n Descriptions","summary":" Retrieving temporal event sequences from textual descriptions is essential\nfor applications such as analyzing e-commerce behavior, monitoring social media\nactivities, and tracking criminal incidents. In this paper, we introduce\nTPP-LLM-Embedding, a unified model for efficiently embedding and retrieving\nevent sequences based on natural language descriptions. Built on the TPP-LLM\nframework, which integrates large language models with temporal point\nprocesses, our model encodes both event types and times, generating a\nsequence-level representation through pooling. Textual descriptions are\nembedded using the same architecture, ensuring a shared embedding space for\nboth sequences and descriptions. We optimize a contrastive loss based on\nsimilarity between these embeddings, bringing matching pairs closer and\nseparating non-matching ones. TPP-LLM-Embedding enables efficient retrieval and\ndemonstrates superior performance compared to baseline models across diverse\ndatasets.\n","authors":["Zefang Liu","Yinzhu Quan"],"pdf_url":"https://arxiv.org/pdf/2410.14043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15232v2","updated":"2024-10-17T20:43:22Z","published":"2024-08-27T17:50:03Z","title":"Into the Unknown Unknowns: Engaged Human Learning through Participation\n in Language Model Agent Conversations","summary":" While language model (LM)-powered chatbots and generative search engines\nexcel at answering concrete queries, discovering information in the terrain of\nunknown unknowns remains challenging for users. To emulate the common\neducational scenario where children/students learn by listening to and\nparticipating in conversations of their parents/teachers, we create\nCollaborative STORM (Co-STORM). Unlike QA systems that require users to ask all\nthe questions, Co-STORM lets users observe and occasionally steer the discourse\namong several LM agents. The agents ask questions on the user's behalf,\nallowing the user to discover unknown unknowns serendipitously. To facilitate\nuser interaction, Co-STORM assists users in tracking the discourse by\norganizing the uncovered information into a dynamic mind map, ultimately\ngenerating a comprehensive report as takeaways. For automatic evaluation, we\nconstruct the WildSeek dataset by collecting real information-seeking records\nwith user goals. Co-STORM outperforms baseline methods on both discourse trace\nand report quality. In a further human evaluation, 70% of participants prefer\nCo-STORM over a search engine, and 78% favor it over a RAG chatbot.\n","authors":["Yucheng Jiang","Yijia Shao","Dekun Ma","Sina J. Semnani","Monica S. 
Lam"],"pdf_url":"https://arxiv.org/pdf/2408.15232v2.pdf","comment":"EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2307.08803v3","updated":"2024-10-17T19:09:13Z","published":"2023-07-17T19:38:40Z","title":"Mixed-initiative Query Rewriting in Conversational Passage Retrieval","summary":" In this paper, we report our methods and experiments for the TREC\nConversational Assistance Track (CAsT) 2022. In this work, we aim to reproduce\nmulti-stage retrieval pipelines and explore one of the potential benefits of\ninvolving mixed-initiative interaction in conversational passage retrieval\nscenarios: reformulating raw queries. Before the first ranking stage of a\nmulti-stage retrieval pipeline, we propose a mixed-initiative query rewriting\nmodule, which achieves query rewriting based on the mixed-initiative\ninteraction between the users and the system, as the replacement for the neural\nrewriting method. Specifically, we design an algorithm to generate appropriate\nquestions related to the ambiguities in raw queries, and another algorithm to\nreformulate raw queries by parsing users' feedback and incorporating it into\nthe raw query. For the first ranking stage of our multi-stage pipelines, we\nadopt a sparse ranking function: BM25, and a dense retrieval method:\nTCT-ColBERT. For the second-ranking step, we adopt a pointwise reranker:\nMonoT5, and a pairwise reranker: DuoT5. Experiments on both TREC CAsT 2021 and\nTREC CAsT 2022 datasets show the effectiveness of our mixed-initiative-based\nquery rewriting (or query reformulation) method on improving retrieval\nperformance compared with two popular reformulators: a neural reformulator:\nCANARD-T5 and a rule-based reformulator: historical query reformulator(HQE).\n","authors":["Dayu Yang","Yue Zhang","Hui Fang"],"pdf_url":"https://arxiv.org/pdf/2307.08803v3.pdf","comment":"https://trec.nist.gov/pubs/trec31/papers/udel_fang.C.pdf"},{"id":"http://arxiv.org/abs/2404.11773v2","updated":"2024-10-17T18:59:18Z","published":"2024-04-17T21:56:27Z","title":"Behavior Alignment: A New Perspective of Evaluating LLM-based\n Conversational Recommender Systems","summary":" Large Language Models (LLMs) have demonstrated great potential in\nConversational Recommender Systems (CRS). However, the application of LLMs to\nCRS has exposed a notable discrepancy in behavior between LLM-based CRS and\nhuman recommenders: LLMs often appear inflexible and passive, frequently\nrushing to complete the recommendation task without sufficient inquiry.This\nbehavior discrepancy can lead to decreased accuracy in recommendations and\nlower user satisfaction. Despite its importance, existing studies in CRS lack a\nstudy about how to measure such behavior discrepancy. To fill this gap, we\npropose Behavior Alignment, a new evaluation metric to measure how well the\nrecommendation strategies made by a LLM-based CRS are consistent with human\nrecommenders'. Our experiment results show that the new metric is better\naligned with human preferences and can better differentiate how systems perform\nthan existing evaluation metrics. As Behavior Alignment requires explicit and\ncostly human annotations on the recommendation strategies, we also propose a\nclassification-based method to implicitly measure the Behavior Alignment based\non the responses. 
The evaluation results confirm the robustness of the method.\n","authors":["Dayu Yang","Fumian Chen","Hui Fang"],"pdf_url":"https://arxiv.org/pdf/2404.11773v2.pdf","comment":"Accepted by the 47th International ACM SIGIR Conference on Research\n and Development in Information Retrieval"},{"id":"http://arxiv.org/abs/2307.09384v3","updated":"2024-10-17T18:53:54Z","published":"2023-07-18T16:05:25Z","title":"ZeQR: Zero-shot Query Reformulation for Conversational Search","summary":" As the popularity of voice assistants continues to surge, conversational\nsearch has gained increased attention in Information Retrieval. However, data\nsparsity issues in conversational search significantly hinder the progress of\nsupervised conversational search methods. Consequently, researchers are\nfocusing more on zero-shot conversational search approaches. Nevertheless,\nexisting zero-shot methods face three primary limitations: they are not\nuniversally applicable to all retrievers, their effectiveness lacks sufficient\nexplainability, and they struggle to resolve common conversational ambiguities\ncaused by omission. To address these limitations, we introduce a novel\nZero-shot Query Reformulation (or Query Rewriting) (ZeQR) framework that\nreformulates queries based on previous dialogue contexts without requiring\nsupervision from conversational search data. Specifically, our framework\nutilizes language models designed for machine reading comprehension tasks to\nexplicitly resolve two common ambiguities: coreference and omission, in raw\nqueries. In comparison to existing zero-shot methods, our approach is\nuniversally applicable to any retriever without additional adaptation or\nindexing. It also provides greater explainability and effectively enhances\nquery intent understanding because ambiguities are explicitly and proactively\nresolved. Through extensive experiments on four TREC conversational datasets,\nwe demonstrate the effectiveness of our method, which consistently outperforms\nstate-of-the-art baselines.\n","authors":["Dayu Yang","Yue Zhang","Hui Fang"],"pdf_url":"https://arxiv.org/pdf/2307.09384v3.pdf","comment":"Accepted by the 9th ACM SIGIR International Conference on the Theory\n of Information Retrieval"},{"id":"http://arxiv.org/abs/2310.01038v2","updated":"2024-10-17T18:35:41Z","published":"2023-10-02T09:30:11Z","title":"Dataset Condensation for Recommendation","summary":" Training recommendation models on large datasets requires significant time\nand resources. It is desired to construct concise yet informative datasets for\nefficient training. Recent advances in dataset condensation show promise in\naddressing this problem by synthesizing small datasets. However, applying\nexisting methods of dataset condensation to recommendation has limitations: (1)\nthey fail to generate discrete user-item interactions, and (2) they could not\npreserve users' potential preferences. To address the limitations, we propose a\nlightweight condensation framework tailored for recommendation (DConRec),\nfocusing on condensing user-item historical interaction sets. Specifically, we\nmodel the discrete user-item interactions via a probabilistic approach and\ndesign a pre-augmentation module to incorporate the potential preferences of\nusers into the condensed datasets. While the substantial size of datasets leads\nto costly optimization, we propose a lightweight policy gradient estimation to\naccelerate the data synthesis. 
Experimental results on multiple real-world\ndatasets have demonstrated the effectiveness and efficiency of our framework.\nBesides, we provide a theoretical analysis of the provable convergence of\nDConRec. Our implementation is available at:\nhttps://github.com/JiahaoWuGit/DConRec.\n","authors":["Jiahao Wu","Wenqi Fan","Jingfan Chen","Shengcai Liu","Qijiong Liu","Rui He","Qing Li","Ke Tang"],"pdf_url":"https://arxiv.org/pdf/2310.01038v2.pdf","comment":"Accepted by IEEE TKDE. Also titled as \"Condensing Pre-augmented\n Recommendation Data via Lightweight Policy Gradient Estimation\""},{"id":"http://arxiv.org/abs/2410.13959v1","updated":"2024-10-17T18:34:43Z","published":"2024-10-17T18:34:43Z","title":"FinQAPT: Empowering Financial Decisions with End-to-End LLM-driven\n Question Answering Pipeline","summary":" Financial decision-making hinges on the analysis of relevant information\nembedded in the enormous volume of documents in the financial domain. To\naddress this challenge, we developed FinQAPT, an end-to-end pipeline that\nstreamlines the identification of relevant financial reports based on a query,\nextracts pertinent context, and leverages Large Language Models (LLMs) to\nperform downstream tasks. To evaluate the pipeline, we experimented with\nvarious techniques to optimize the performance of each module using the FinQA\ndataset. We introduced a novel clustering-based negative sampling technique to\nenhance context extraction and a novel prompting method called Dynamic N-shot\nPrompting to boost the numerical question-answering capabilities of LLMs. At\nthe module level, we achieved state-of-the-art accuracy on FinQA, attaining an\naccuracy of 80.6\\%. However, at the pipeline level, we observed decreased\nperformance due to challenges in extracting relevant context from financial\nreports. We conducted a detailed error analysis of each module and the\nend-to-end pipeline, pinpointing specific challenges that must be addressed to\ndevelop a robust solution for handling complex financial tasks.\n","authors":["Kuldeep Singh","Simerjot Kaur","Charese Smiley"],"pdf_url":"https://arxiv.org/pdf/2410.13959v1.pdf","comment":"Accepted in ICAIF 2024, 8 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2410.13951v1","updated":"2024-10-17T18:22:42Z","published":"2024-10-17T18:22:42Z","title":"Identifying High Consideration E-Commerce Search Queries","summary":" In e-commerce, high consideration search missions typically require careful\nand elaborate decision making, and involve a substantial research investment\nfrom customers. We consider the task of identifying High Consideration (HC)\nqueries. Identifying such queries enables e-commerce sites to better serve user\nneeds using targeted experiences such as curated QA widgets that help users\nreach purchase decisions. We explore the task by proposing an Engagement-based\nQuery Ranking (EQR) approach, focusing on query ranking to indicate potential\nengagement levels with query-related shopping knowledge content during product\nsearch. Unlike previous studies on predicting trends, EQR prioritizes\nquery-level features related to customer behavior, finance, and catalog\ninformation rather than popularity signals. We introduce an accurate and\nscalable method for EQR and present experimental results demonstrating its\neffectiveness. Offline experiments show strong ranking performance. Human\nevaluation shows a precision of 96% for HC queries identified by our model. 
The\nmodel was commercially deployed, and shown to outperform human-selected queries\nin terms of downstream customer impact, as measured through engagement.\n","authors":["Zhiyu Chen","Jason Choi","Besnik Fetahu","Shervin Malmasi"],"pdf_url":"https://arxiv.org/pdf/2410.13951v1.pdf","comment":"Accepted by EMNLP 2024 (Industry Track)"},{"id":"http://arxiv.org/abs/2410.13765v1","updated":"2024-10-17T17:03:23Z","published":"2024-10-17T17:03:23Z","title":"Knowledge-Aware Query Expansion with Large Language Models for Textual\n and Relational Retrieval","summary":" Large language models (LLMs) have been used to generate query expansions\naugmenting original queries for improving information search. Recent studies\nalso explore providing LLMs with initial retrieval results to generate query\nexpansions more grounded to document corpus. However, these methods mostly\nfocus on enhancing textual similarities between search queries and target\ndocuments, overlooking document relations. For queries like \"Find me a highly\nrated camera for wildlife photography compatible with my Nikon F-Mount lenses\",\nexisting methods may generate expansions that are semantically similar but\nstructurally unrelated to user intents. To handle such semi-structured queries\nwith both textual and relational requirements, in this paper we propose a\nknowledge-aware query expansion framework, augmenting LLMs with structured\ndocument relations from knowledge graph (KG). To further address the limitation\nof entity-based scoring in existing KG-based methods, we leverage document\ntexts as rich KG node representations and use document-based relation filtering\nfor our Knowledge-Aware Retrieval (KAR). Extensive experiments on three\ndatasets of diverse domains show the advantages of our method compared against\nstate-of-the-art baselines on textual and relational semi-structured retrieval.\n","authors":["Yu Xia","Junda Wu","Sungchul Kim","Tong Yu","Ryan A. Rossi","Haoliang Wang","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2410.13765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13707v1","updated":"2024-10-17T16:07:51Z","published":"2024-10-17T16:07:51Z","title":"Disjointness Violations in Wikidata","summary":" Disjointness checks are among the most important constraint checks in a\nknowledge base and can be used to help detect and correct incorrect statements\nand internal contradictions. Wikidata is a very large, community-managed\nknowledge base. Because of both its size and construction, Wikidata contains\nmany incorrect statements and internal contradictions. We analyze the current\nmodeling of disjointness on Wikidata, identify patterns that cause these\ndisjointness violations and categorize them. We use SPARQL queries to identify\neach ``culprit'' causing a disjointness violation and lay out formulas to\nidentify and fix conflicting information. We finally discuss how disjointness\ninformation could be better modeled and expanded in Wikidata in the future.\n","authors":["Ege Atacan Doğan","Peter F. Patel-Schneider"],"pdf_url":"https://arxiv.org/pdf/2410.13707v1.pdf","comment":"Sixth International Knowledge Graph and Semantic Web Conference"},{"id":"http://arxiv.org/abs/2410.13680v1","updated":"2024-10-17T15:40:09Z","published":"2024-10-17T15:40:09Z","title":"Pessimistic Evaluation","summary":" Traditional evaluation of information access systems has focused primarily on\naverage utility across a set of information needs (information retrieval) or\nusers (recommender systems). 
In this work, we argue that evaluating only with\naverage metric measurements assumes utilitarian values not aligned with\ntraditions of information access based on equal access. We advocate for\npessimistic evaluation of information access systems focusing on worst case\nutility. These methods are (a) grounded in ethical and pragmatic concepts, (b)\ntheoretically complementary to existing robustness and fairness methods, and\n(c) empirically validated across a set of retrieval and recommendation tasks.\nThese results suggest that pessimistic evaluation should be included in\nexisting experimentation processes to better understand the behavior of\nsystems, especially when concerned with principles of social good.\n","authors":["Fernando Diaz"],"pdf_url":"https://arxiv.org/pdf/2410.13680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13604v1","updated":"2024-10-17T14:39:24Z","published":"2024-10-17T14:39:24Z","title":"Large Language Models as Narrative-Driven Recommenders","summary":" Narrative-driven recommenders aim to provide personalized suggestions for\nuser requests expressed in free-form text such as \"I want to watch a thriller\nwith a mind-bending story, like Shutter Island.\" Although large language models\n(LLMs) have been shown to excel in processing general natural language queries,\ntheir effectiveness for handling such recommendation requests remains\nrelatively unexplored. To close this gap, we compare the performance of 38\nopen- and closed-source LLMs of various sizes, such as LLama 3.2 and GPT-4o, in\na movie recommendation setting. For this, we utilize a gold-standard,\ncrowdworker-annotated dataset of posts from reddit's movie suggestion community\nand employ various prompting strategies, including zero-shot, identity, and\nfew-shot prompting. Our findings demonstrate the ability of LLMs to generate\ncontextually relevant movie recommendations, significantly outperforming other\nstate-of-the-art approaches, such as doc2vec. While we find that closed-source\nand large-parameterized models generally perform best, medium-sized open-source\nmodels remain competitive, being only slightly outperformed by their more\ncomputationally expensive counterparts. Furthermore, we observe no significant\ndifferences across prompting strategies for most models, underscoring the\neffectiveness of simple approaches such as zero-shot prompting for\nnarrative-driven recommendations. Overall, this work offers valuable insights\nfor recommender system researchers as well as practitioners aiming to integrate\nLLMs into real-world recommendation tools.\n","authors":["Lukas Eberhard","Thorsten Ruprechter","Denis Helic"],"pdf_url":"https://arxiv.org/pdf/2410.13604v1.pdf","comment":"Under review; 19 pages"},{"id":"http://arxiv.org/abs/2410.13588v1","updated":"2024-10-17T14:22:57Z","published":"2024-10-17T14:22:57Z","title":"Cross-Domain Sequential Recommendation via Neural Process","summary":" Cross-Domain Sequential Recommendation (CDSR) is a hot topic in\nsequence-based user interest modeling, which aims at utilizing a single model\nto predict the next items for different domains. To tackle the CDSR, many\nmethods are focused on domain overlapped users' behaviors fitting, which\nheavily relies on the same user's different-domain item sequences collaborating\nsignals to capture the synergy of cross-domain item-item correlation. 
Indeed,\nthese overlapped users occupy a small fraction of the entire user set only,\nwhich introduces a strong assumption that the small group of domain overlapped\nusers is enough to represent all domain user behavior characteristics. However,\nintuitively, such a suggestion is biased, and the insufficient learning\nparadigm in non-overlapped users will inevitably limit model performance.\nFurther, it is not trivial to model non-overlapped user behaviors in CDSR\nbecause there are no other domain behaviors to collaborate with, which causes\nthe observed single-domain users' behavior sequences to be hard to contribute\nto cross-domain knowledge mining. Considering such a phenomenon, we raise a\nchallenging and unexplored question: How to unleash the potential of\nnon-overlapped users' behaviors to empower CDSR?\n","authors":["Haipeng Li","Jiangxia Cao","Yiwen Gao","Yunhuai Liu","Shuchao Pang"],"pdf_url":"https://arxiv.org/pdf/2410.13588v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2409.20305v2","updated":"2024-10-17T13:19:08Z","published":"2024-09-30T14:04:27Z","title":"Mixed-Precision Embeddings for Large-Scale Recommendation Models","summary":" Embedding techniques have become essential components of large databases in\nthe deep learning era. By encoding discrete entities, such as words, items, or\ngraph nodes, into continuous vector spaces, embeddings facilitate more\nefficient storage, retrieval, and processing in large databases. Especially in\nthe domain of recommender systems, millions of categorical features are encoded\nas unique embedding vectors, which facilitates the modeling of similarities and\ninteractions among features. However, numerous embedding vectors can result in\nsignificant storage overhead. In this paper, we aim to compress the embedding\ntable through quantization techniques. Given that features vary in importance\nlevels, we seek to identify an appropriate precision for each feature to\nbalance model accuracy and memory usage. To this end, we propose a novel\nembedding compression method, termed Mixed-Precision Embeddings (MPE).\nSpecifically, to reduce the size of the search space, we first group features\nby frequency and then search precision for each feature group. MPE further\nlearns the probability distribution over precision levels for each feature\ngroup, which can be used to identify the most suitable precision with a\nspecially designed sampling strategy. Extensive experiments on three public\ndatasets demonstrate that MPE significantly outperforms existing embedding\ncompression methods. Remarkably, MPE achieves about 200x compression on the\nCriteo dataset without comprising the prediction accuracy.\n","authors":["Shiwei Li","Zhuoqi Hu","Xing Tang","Haozhao Wang","Shijie Xu","Weihong Luo","Yuhua Li","Xiuqiang He","Ruixuan Li"],"pdf_url":"https://arxiv.org/pdf/2409.20305v2.pdf","comment":"under submision"},{"id":"http://arxiv.org/abs/2410.13374v1","updated":"2024-10-17T09:24:40Z","published":"2024-10-17T09:24:40Z","title":"Context-aware adaptive personalised recommendation: a meta-hybrid","summary":" Recommenders take place on a wide scale of e-commerce systems, reducing the\nproblem of information overload. The most common approach is to choose a\nrecommender used by the system to make predictions. However, users vary from\neach other; thus, a one-fits-all approach seems to be sub-optimal. In this\npaper, we propose a meta-hybrid recommender that uses machine learning to\npredict an optimal algorithm. 
In this way, the best-performing recommender is\nused for each specific session and user. This selection depends on contextual\nand preferential information collected about the user. We use standard\nMovieLens and The Movie DB datasets for offline evaluation. We show that based\non the proposed model, it is possible to predict which recommender will provide\nthe most precise recommendations to a user. The theoretical performance of our\nmeta-hybrid outperforms separate approaches by 20-50% in normalized Discounted\nGain and Root Mean Square Error metrics. However, it is hard to obtain the\noptimal performance based on widely-used standard information stored about\nusers.\n","authors":["Peter Tibensky","Michal Kompan"],"pdf_url":"https://arxiv.org/pdf/2410.13374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00800v2","updated":"2024-10-17T09:13:18Z","published":"2024-07-22T11:58:36Z","title":"Chatbot-Based Ontology Interaction Using Large Language Models and\n Domain-Specific Standards","summary":" The following contribution introduces a concept that employs Large Language\nModels (LLMs) and a chatbot interface to enhance SPARQL query generation for\nontologies, thereby facilitating intuitive access to formalized knowledge.\nUtilizing natural language inputs, the system converts user inquiries into\naccurate SPARQL queries that strictly query the factual content of the\nontology, effectively preventing misinformation or fabrication by the LLM. To\nenhance the quality and precision of outcomes, additional textual information\nfrom established domain-specific standards is integrated into the ontology for\nprecise descriptions of its concepts and relationships. An experimental study\nassesses the accuracy of generated SPARQL queries, revealing significant\nbenefits of using LLMs for querying ontologies and highlighting areas for\nfuture research.\n","authors":["Jonathan Reif","Tom Jeleniewski","Milapji Singh Gill","Felix Gehlhoff","Alexander Fay"],"pdf_url":"https://arxiv.org/pdf/2408.00800v2.pdf","comment":"\\c{opyright} 2024 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2410.13326v1","updated":"2024-10-17T08:37:25Z","published":"2024-10-17T08:37:25Z","title":"Comparing the Utility, Preference, and Performance of Course Material\n Search Functionality and Retrieval-Augmented Generation Large Language Model\n (RAG-LLM) AI Chatbots in Information-Seeking Tasks","summary":" Providing sufficient support for students requires substantial resources,\nespecially considering the growing enrollment numbers. Students need help in a\nvariety of tasks, ranging from information-seeking to requiring support with\ncourse assignments. To explore the utility of recent large language models\n(LLMs) as a support mechanism, we developed an LLM-powered AI chatbot that\naugments the answers that are produced with information from the course\nmaterials. To study the effect of the LLM-powered AI chatbot, we conducted a\nlab-based user study (N=14), in which the participants worked on tasks from a\nweb software development course. 
The participants were divided into two groups,\nwhere one of the groups first had access to the chatbot and then to a more\ntraditional search functionality, while another group started with the search\nfunctionality and was then given the chatbot. We assessed the participants'\nperformance and perceptions towards the chatbot and the search functionality\nand explored their preferences towards the support functionalities. Our\nfindings highlight that both support mechanisms are seen as useful and that\nsupport mechanisms work well for specific tasks, while less so for other tasks.\nWe also observe that students tended to prefer the second support mechanism\nmore, where students who were first given the chatbot tended to prefer the\nsearch functionality and vice versa.\n","authors":["Leonardo Pasquarelli","Charles Koutcheme","Arto Hellas"],"pdf_url":"https://arxiv.org/pdf/2410.13326v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.10025v2","updated":"2024-10-17T08:16:52Z","published":"2024-09-16T06:33:26Z","title":"DiffATR: Diffusion-based Generative Modeling for Audio-Text Retrieval","summary":" Existing audio-text retrieval (ATR) methods are essentially discriminative\nmodels that aim to maximize the conditional likelihood, represented as\np(candidates|query). Nevertheless, this methodology fails to consider the\nintrinsic data distribution p(query), leading to difficulties in discerning\nout-of-distribution data. In this work, we attempt to tackle this constraint\nthrough a generative perspective and model the relationship between audio and\ntext as their joint probability p(candidates,query). To this end, we present a\ndiffusion-based ATR framework (DiffATR), which models ATR as an iterative\nprocedure that progressively generates joint distribution from noise.\nThroughout its training phase, DiffATR is optimized from both generative and\ndiscriminative viewpoints: the generator is refined through a generation loss,\nwhile the feature extractor benefits from a contrastive loss, thus combining\nthe merits of both methodologies. Experiments on the AudioCaps and Clotho\ndatasets with superior performances, verify the effectiveness of our approach.\nNotably, without any alterations, our DiffATR consistently exhibits strong\nperformance in out-of-domain retrieval settings.\n","authors":["Yifei Xin","Xuxin Cheng","Zhihong Zhu","Xusheng Yang","Yuexian Zou"],"pdf_url":"https://arxiv.org/pdf/2409.10025v2.pdf","comment":"Accepted by Interspeech2024"},{"id":"http://arxiv.org/abs/2410.13293v1","updated":"2024-10-17T07:46:49Z","published":"2024-10-17T07:46:49Z","title":"SBI-RAG: Enhancing Math Word Problem Solving for Students through\n Schema-Based Instruction and Retrieval-Augmented Generation","summary":" Many students struggle with math word problems (MWPs), often finding it\ndifficult to identify key information and select the appropriate mathematical\noperations.Schema-based instruction (SBI) is an evidence-based strategy that\nhelps students categorize problems based on their structure, improving\nproblem-solving accuracy. Building on this, we propose a Schema-Based\nInstruction Retrieval-Augmented Generation (SBI-RAG) framework that\nincorporates a large language model (LLM).Our approach emphasizes step-by-step\nreasoning by leveraging schemas to guide solution generation. We evaluate its\nperformance on the GSM8K dataset, comparing it with GPT-4 and GPT-3.5 Turbo,\nand introduce a \"reasoning score\" metric to assess solution quality. 
Our\nfindings suggest that SBI-RAG enhances reasoning clarity and problem-solving\naccuracy, potentially providing educational benefits for students\n","authors":["Prakhar Dixit","Tim Oates"],"pdf_url":"https://arxiv.org/pdf/2410.13293v1.pdf","comment":"Accepted to the 4th MATH-AI Workshop at NeurIPS'24"},{"id":"http://arxiv.org/abs/2410.13248v1","updated":"2024-10-17T06:15:00Z","published":"2024-10-17T06:15:00Z","title":"Disentangling Likes and Dislikes in Personalized Generative Explainable\n Recommendation","summary":" Recent research on explainable recommendation generally frames the task as a\nstandard text generation problem, and evaluates models simply based on the\ntextual similarity between the predicted and ground-truth explanations.\nHowever, this approach fails to consider one crucial aspect of the systems:\nwhether their outputs accurately reflect the users' (post-purchase) sentiments,\ni.e., whether and why they would like and/or dislike the recommended items. To\nshed light on this issue, we introduce new datasets and evaluation methods that\nfocus on the users' sentiments. Specifically, we construct the datasets by\nexplicitly extracting users' positive and negative opinions from their\npost-purchase reviews using an LLM, and propose to evaluate systems based on\nwhether the generated explanations 1) align well with the users' sentiments,\nand 2) accurately identify both positive and negative opinions of users on the\ntarget items. We benchmark several recent models on our datasets and\ndemonstrate that achieving strong performance on existing metrics does not\nensure that the generated explanations align well with the users' sentiments.\nLastly, we find that existing models can provide more sentiment-aware\nexplanations when the users' (predicted) ratings for the target items are\ndirectly fed into the models as input. We will release our code and datasets\nupon acceptance.\n","authors":["Ryotaro Shimizu","Takashi Wada","Yu Wang","Johannes Kruse","Sean O'Brien","Sai HtaungKham","Linxin Song","Yuya Yoshikawa","Yuki Saito","Fugee Tsung","Masayuki Goto","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2410.13248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13217v1","updated":"2024-10-17T04:48:06Z","published":"2024-10-17T04:48:06Z","title":"MixEHR-Nest: Identifying Subphenotypes within Electronic Health Records\n through Hierarchical Guided-Topic Modeling","summary":" Automatic subphenotyping from electronic health records (EHRs)provides\nnumerous opportunities to understand diseases with unique subgroups and enhance\npersonalized medicine for patients. However, existing machine learning\nalgorithms either focus on specific diseases for better interpretability or\nproduce coarse-grained phenotype topics without considering nuanced disease\npatterns. In this study, we propose a guided topic model, MixEHR-Nest, to infer\nsub-phenotype topics from thousands of disease using multi-modal EHR data.\nSpecifically, MixEHR-Nest detects multiple subtopics from each phenotype topic,\nwhose prior is guided by the expert-curated phenotype concepts such as\nPhenotype Codes (PheCodes) or Clinical Classification Software (CCS) codes. We\nevaluated MixEHR-Nest on two EHR datasets: (1) the MIMIC-III dataset consisting\nof over 38 thousand patients from intensive care unit (ICU) from Beth Israel\nDeaconess Medical Center (BIDMC) in Boston, USA; (2) the healthcare\nadministrative database PopHR, comprising 1.3 million patients from Montreal,\nCanada. 
Experimental results demonstrate that MixEHR-Nest can identify\nsubphenotypes with distinct patterns within each phenotype, which are\npredictive for disease progression and severity. Consequently, MixEHR-Nest\ndistinguishes between type 1 and type 2 diabetes by inferring subphenotypes\nusing CCS codes, which do not differentiate these two subtype concepts.\nAdditionally, MixEHR-Nest not only improved the prediction accuracy of\nshort-term mortality of ICU patients and initial insulin treatment in diabetic\npatients but also revealed the contributions of subphenotypes. For longitudinal\nanalysis, MixEHR-Nest identified subphenotypes of distinct age prevalence under\nthe same phenotypes, such as asthma, leukemia, epilepsy, and depression. The\nMixEHR-Nest software is available at GitHub:\nhttps://github.com/li-lab-mcgill/MixEHR-Nest.\n","authors":["Ruohan Wang","Zilong Wang","Ziyang Song","David Buckeridge","Yue Li"],"pdf_url":"https://arxiv.org/pdf/2410.13217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05666v7","updated":"2024-10-17T03:23:49Z","published":"2024-06-09T06:49:22Z","title":"Probability Distribution Learning: A theoretical framework for Deep\n Learning","summary":" This paper introduces probability distribution learning (PD learning), a\nnovel theoretical learning framework. Departing from the traditional\nstatistical learning framework, PD learning focuses on learning the underlying\nprobability distribution, which is modeled as a random variable within the\nprobability simplex. Within this framework, the learning error is decomposed\ninto uncertainty, estimation error, and the model's fitting error.\nSubsequently, we present the methodology for calculating uncertainty, along\nwith optimization strategies for both estimation error and fitting error. Given\nthat minimizing the fitting error typically constitutes a non-convex\noptimization problem, we introduce a standard loss function and the gradient\nstructural control (GSC) algorithm, and demonstrate that by employing this\nfunction, the optima of fitting error minimization can be approached by\nreducing the gradient norm and structural error. Furthermore, we apply the PD\nlearning framework to deep learning, elucidating the mechanisms by which\ntechniques such as random parameter initialization, over-parameterization,\nbias-variance trade-off, and dropout influence deep model training. Finally,\nexperimental results on various models validate the effectiveness of the\nproposed framework.\n","authors":["Binchuan Qi","Wei Gong","Li Li"],"pdf_url":"https://arxiv.org/pdf/2406.05666v7.pdf","comment":"arXiv admin note: text overlap with arXiv:2105.04026 by other\n authors. arXiv admin note: text overlap with arXiv:2105.04026 by other\n authors"},{"id":"http://arxiv.org/abs/2402.16872v2","updated":"2024-10-17T02:53:23Z","published":"2024-01-29T03:30:15Z","title":"NFT1000: A Cross-Modal Dataset for Non-Fungible Token Retrieval","summary":" With the rise of \"Metaverse\" and \"Web 3.0\", Non-Fungible Token (NFT) has\nemerged as a kind of pivotal digital asset, garnering significant attention. By\nthe end of March 2024, more than 1.7 billion NFTs have been minted across\nvarious blockchain platforms. To effectively locate a desired NFT, conducting\nsearches within a vast array of NFTs is essential. The challenge in NFT\nretrieval is heightened due to the high degree of similarity among different\nNFTs, regarding regional and semantic aspects. 
In this paper, we introduce\na benchmark dataset named \"NFT Top1000 Visual-Text Dataset\" (NFT1000),\ncontaining 7.56 million image-text pairs collected from the 1000 most\nfamous PFP NFT collections by sales volume on the Ethereum blockchain. Based\non this dataset and leveraging the CLIP series of pre-trained models as our\nfoundation, we propose the dynamic masking fine-tuning scheme. This innovative\napproach results in a 7.4\\% improvement in the top-1 accuracy rate, while\nutilizing merely 13\\% of the total training data (0.79 million vs. 6.1\nmillion). We also propose a robust metric, the Comprehensive Variance Index (CVI), to\nassess the similarity and retrieval difficulty of visual-text pair data. The\ndataset will be released as an open-source resource. For more details, please\nrefer to: https://github.com/ShuxunoO/NFT-Net.git.\n","authors":["Shuxun Wang","Yunfei Lei","Ziqi Zhang","Wei Liu","Haowei Liu","Li Yang","Wenjuan Li","Bing Li","Weiming Hu"],"pdf_url":"https://arxiv.org/pdf/2402.16872v2.pdf","comment":"11 pages, 12 figures, to be published in ACM Multimedia 2024, see\n https://openreview.net/forum?id=xUtNrKH8iB&noteId=xUtNrKH8iB"},{"id":"http://arxiv.org/abs/2408.01262v4","updated":"2024-10-17T02:20:47Z","published":"2024-08-02T13:35:11Z","title":"RAGEval: Scenario Specific RAG Evaluation Dataset Generation Framework","summary":" Retrieval-Augmented Generation (RAG) is a powerful approach that enables\nlarge language models (LLMs) to incorporate external knowledge. However,\nevaluating the effectiveness of RAG systems in specialized scenarios remains\nchallenging due to the high costs of data construction and the lack of suitable\nevaluation metrics. This paper introduces RAGEval, a framework designed to\nassess RAG systems across diverse scenarios by generating high-quality\ndocuments, questions, answers, and references through a schema-based pipeline.\nWith a focus on factual accuracy, we propose three novel metrics: Completeness,\nHallucination, and Irrelevance to rigorously evaluate LLM-generated responses.\nExperimental results show that RAGEval outperforms zero-shot and one-shot\nmethods in terms of clarity, safety, conformity, and richness of generated\nsamples. Furthermore, the use of LLMs for scoring the proposed metrics\ndemonstrates a high level of consistency with human evaluations. RAGEval\nestablishes a new paradigm for evaluating RAG systems in real-world\napplications.\n","authors":["Kunlun Zhu","Yifan Luo","Dingling Xu","Ruobing Wang","Shi Yu","Shuo Wang","Yukun Yan","Zhenghao Liu","Xu Han","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2408.01262v4.pdf","comment":"https://github.com/OpenBMB/RAGEval"},{"id":"http://arxiv.org/abs/2406.00944v2","updated":"2024-10-17T02:15:11Z","published":"2024-06-03T02:56:14Z","title":"A Theory for Token-Level Harmonization in Retrieval-Augmented Generation","summary":" Retrieval-augmented generation (RAG) utilizes retrieved texts to enhance\nlarge language models (LLMs). Studies show that while RAG provides valuable\nexternal information (benefit), it may also mislead LLMs (detriment) with noisy\nor incorrect retrieved texts. Although many existing methods attempt to\npreserve benefit and avoid detriment, they lack a theoretical explanation for\nRAG. The benefit and detriment in the next token prediction of RAG remain a\nblack box that cannot be quantified or compared in an explainable manner, so\nexisting methods are data-driven, needing additional utility evaluators or\npost-hoc analysis. 
This paper takes the first step towards providing a theory to explain\nand trade off the benefit and detriment in RAG. First, we model RAG as the\nfusion between distribution of LLMs knowledge and distribution of retrieved\ntexts. Then, we formalize the trade-off between the value of external knowledge\n(benefit) and its potential risk of misleading LLMs (detriment) in next token\nprediction of RAG by distribution difference in this fusion. Finally, we prove\nthat the actual effect of RAG on the token, which is the comparison between\nbenefit and detriment, can be predicted without any training or accessing the\nutility of retrieval. Based on our theory, we propose a practical novel method,\nTok-RAG, which achieves collaborative generation between the pure LLM and RAG\nat token level to preserve benefit and avoid detriment. Experiments in\nreal-world tasks using LLMs such as OPT, LLaMA-2, and Mistral show the\neffectiveness of our method and support our theoretical findings.\n","authors":["Shicheng Xu","Liang Pang","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2406.00944v2.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2410.13125v1","updated":"2024-10-17T01:27:57Z","published":"2024-10-17T01:27:57Z","title":"Transformers4NewsRec: A Transformer-based News Recommendation Framework","summary":" Pre-trained transformer models have shown great promise in various natural\nlanguage processing tasks, including personalized news recommendations. To\nharness the power of these models, we introduce Transformers4NewsRec, a new\nPython framework built on the \\textbf{Transformers} library. This framework is\ndesigned to unify and compare the performance of various news recommendation\nmodels, including deep neural networks and graph-based models.\nTransformers4NewsRec offers flexibility in terms of model selection, data\npreprocessing, and evaluation, allowing both quantitative and qualitative\nanalysis.\n","authors":["Dairui Liu","Honghui Du","Boming Yang","Neil Hurley","Aonghus Lawlor","Irene Li","Derek Greene","Ruihai Dong"],"pdf_url":"https://arxiv.org/pdf/2410.13125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13118v1","updated":"2024-10-17T01:12:48Z","published":"2024-10-17T01:12:48Z","title":"Retrieval-Enhanced Named Entity Recognition","summary":" When combined with In-Context Learning, a technique that enables models to\nadapt to new tasks by incorporating task-specific examples or demonstrations\ndirectly within the input prompt, autoregressive language models have achieved\ngood performance in a wide range of tasks and applications. However, this\ncombination has not been properly explored in the context of named entity\nrecognition, where the structure of this task poses unique challenges. We\npropose RENER (Retrieval-Enhanced Named Entity Recognition), a technique for\nnamed entity recognition using autoregressive language models based on\nIn-Context Learning and information retrieval techniques. When presented with\nan input text, RENER fetches similar examples from a dataset of training\nexamples that are used to enhance a language model to recognize named entities\nfrom this input text. RENER is modular and independent of the underlying\nlanguage model and information retrieval algorithms. Experimental results show\nthat in the CrossNER collection we achieve state-of-the-art performance with\nthe proposed technique and that information retrieval can increase the F-score\nby up to 11 percentage points.\n","authors":["Enzo Shiraishi","Raphael Y. 
de Camargo","Henrique L. P. Silva","Ronaldo C. Prati"],"pdf_url":"https://arxiv.org/pdf/2410.13118v1.pdf","comment":"13 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2410.13117v1","updated":"2024-10-17T01:02:04Z","published":"2024-10-17T01:02:04Z","title":"Preference Diffusion for Recommendation","summary":" Recommender systems predict personalized item rankings based on user\npreference distributions derived from historical behavior data. Recently,\ndiffusion models (DMs) have gained attention in recommendation for their\nability to model complex distributions, yet current DM-based recommenders often\nrely on traditional objectives like mean squared error (MSE) or recommendation\nobjectives, which are not optimized for personalized ranking tasks or fail to\nfully leverage DM's generative potential. To address this, we propose\nPreferDiff, a tailored optimization objective for DM-based recommenders.\nPreferDiff transforms BPR into a log-likelihood ranking objective and\nintegrates multiple negative samples to better capture user preferences.\nSpecifically, we employ variational inference to handle the intractability\nthrough minimizing the variational upper bound and replaces MSE with cosine\nerror to improve alignment with recommendation tasks. Finally, we balance\nlearning generation and preference to enhance the training stability of DMs.\nPreferDiff offers three key benefits: it is the first personalized ranking loss\ndesigned specifically for DM-based recommenders and it improves ranking and\nfaster convergence by addressing hard negatives. We also prove that it is\ntheoretically connected to Direct Preference Optimization which indicates that\nit has the potential to align user preferences in DM-based recommenders via\ngenerative modeling. Extensive experiments across three benchmarks validate its\nsuperior recommendation performance and commendable general sequential\nrecommendation capabilities. Our codes are available at\n\\url{https://github.com/lswhim/PreferDiff}.\n","authors":["Shuo Liu","An Zhang","Guoqing Hu","Hong Qian","Tat-seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.13117v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.13733v1","updated":"2024-10-17T16:36:38Z","published":"2024-10-17T16:36:38Z","title":"Improving Multi-modal Large Language Model through Boosting Vision\n Capabilities","summary":" We focus on improving the visual understanding capability for boosting the\nvision-language models. We propose \\textbf{Arcana}, a multiModal language\nmodel, which introduces two crucial techniques. First, we present Multimodal\nLoRA (MM-LoRA), a module designed to enhance the decoder. Unlike traditional\nlanguage-driven decoders, MM-LoRA consists of two parallel LoRAs -- one for\nvision and one for language -- each with its own parameters. This disentangled\nparameters design allows for more specialized learning in each modality and\nbetter integration of multimodal information. Second, we introduce the Query\nLadder adapter (QLadder) to improve the visual encoder. QLadder employs a\nlearnable ``\\textit{ladder}'' structure to deeply aggregates the intermediate\nrepresentations from the frozen pretrained visual encoder (e.g., CLIP image\nencoder). This enables the model to learn new and informative visual features,\nas well as remaining the powerful capabilities of the pretrained visual\nencoder. 
These techniques collectively enhance Arcana's visual perception\npower, enabling it to leverage improved visual information for more accurate\nand contextually relevant outputs across various multimodal scenarios.\nExtensive experiments and ablation studies demonstrate the effectiveness and\ngeneralization capability of our Arcana. The code and re-annotated data are\navailable at \\url{https://arcana-project-page.github.io}.\n","authors":["Yanpeng Sun","Huaxin Zhang","Qiang Chen","Xinyu Zhang","Nong Sang","Gang Zhang","Jingdong Wang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2410.13733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12407v2","updated":"2024-10-17T15:59:34Z","published":"2024-10-16T09:42:29Z","title":"Beyond Coarse-Grained Matching in Video-Text Retrieval","summary":" Video-text retrieval has seen significant advancements, yet the ability of\nmodels to discern subtle differences in captions still requires verification.\nIn this paper, we introduce a new approach for fine-grained evaluation. Our\napproach can be applied to existing datasets by automatically generating hard\nnegative test captions with subtle single-word variations across nouns, verbs,\nadjectives, adverbs, and prepositions. We perform comprehensive experiments\nusing four state-of-the-art models across two standard benchmarks (MSR-VTT and\nVATEX) and two specially curated datasets enriched with detailed descriptions\n(VLN-UVO and VLN-OOPS), resulting in a number of novel insights: 1) our\nanalyses show that the current evaluation benchmarks fall short in detecting a\nmodel's ability to perceive subtle single-word differences, 2) our fine-grained\nevaluation highlights the difficulty models face in distinguishing such subtle\nvariations. To enhance fine-grained understanding, we propose a new baseline\nthat can be easily combined with current methods. Experiments on our\nfine-grained evaluations demonstrate that this approach enhances a model's\nability to understand fine-grained differences.\n","authors":["Aozhu Chen","Hazel Doughty","Xirong Li","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2410.12407v2.pdf","comment":"Accepted to ACCV 2024"},{"id":"http://arxiv.org/abs/2410.13647v1","updated":"2024-10-17T15:13:26Z","published":"2024-10-17T15:13:26Z","title":"Multimodal growth and development assessment model","summary":" With the development of social economy and the improvement of people's\nattention to health, the growth and development of children and adolescents has\nbecome an important indicator to measure the level of national health.\nTherefore, accurate and timely assessment of children's growth and development\nhas become increasingly important. At the same time, global health\ninequalities, especially child malnutrition and stunting in developing\ncountries, urgently require effective assessment tools to monitor and\nintervene. In recent years, the rapid development of technologies such as big\ndata, artificial intelligence, and cloud computing, and the cross-integration\nof multiple disciplines such as biomedicine, statistics, and computer science\nhave promoted the rapid development of large-scale models for growth and\ndevelopment assessment. However, there are still problems such as too single\nevaluation factors, inaccurate diagnostic results, and inability to give\naccurate and reasonable recommendations. 
The multi-modal growth and development\nassessment model uses the public dataset of the RSNA (Radiological Society of\nNorth America) as the training set, and the dataset of the Department of\nPediatrics of Huaibei People's Hospital as the open-source test set. The\nembedded ICL module enables the model to quickly adapt and identify the tasks\nthat need to be done to ensure that, under the premise of considering multiple\nevaluation factors, accurate diagnosis results and reasonable medical\nrecommendations are given, so as to provide solutions to the above problems and\npromote the development of the medical field.\n","authors":["Ying Li","Zichen Song","Zijie Gong","Sitan Huang","Jiewei Ge"],"pdf_url":"https://arxiv.org/pdf/2410.13647v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2410.13419v1","updated":"2024-10-17T10:41:52Z","published":"2024-10-17T10:41:52Z","title":"MeloTrans: A Text to Symbolic Music Generation Model Following Human\n Composition Habit","summary":" At present, neural network models show powerful sequence prediction ability\nand are used in many automatic composition models. In contrast, the way\nhumans compose music is very different. Composers usually start by\ncreating musical motifs and then develop them into music through a series of\nrules. This process ensures that the music has a specific structure and\nchanging pattern. However, it is difficult for neural network models to learn\nthese composition rules from training data, which results in a lack of\nmusicality and diversity in the generated music. This paper posits that\nintegrating the learning capabilities of neural networks with human-derived\nknowledge may lead to better results. To achieve this, we develop the\nPOP909$\\_$M dataset, the first to include labels for musical motifs and their\nvariants, providing a basis for mimicking human compositional habits. Building\non this, we propose MeloTrans, a text-to-music composition model that employs\nprinciples of motif development rules. Our experiments demonstrate that\nMeloTrans excels beyond existing music generation models and even surpasses\nLarge Language Models (LLMs) like ChatGPT-4. This highlights the importance of\nmerging human insights with neural network capabilities to achieve superior\nsymbolic music generation.\n","authors":["Yutian Wang","Wanyin Yang","Zhenrong Dai","Yilong Zhang","Kun Zhao","Hui Wang"],"pdf_url":"https://arxiv.org/pdf/2410.13419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07728v4","updated":"2024-10-17T09:54:06Z","published":"2024-07-10T15:00:08Z","title":"SaMoye: Zero-shot Singing Voice Conversion Model Based on Feature\n Disentanglement and Enhancement","summary":" Singing voice conversion (SVC) aims to convert a singer's voice to another\nsinger's from a reference audio while keeping the original semantics. However,\nexisting SVC methods can hardly perform zero-shot conversion due to incomplete feature\ndisentanglement or dependence on the speaker look-up table. We propose the\nfirst open-source high-quality zero-shot SVC model SaMoye that can convert\nsinging to human and non-human timbre. SaMoye disentangles the singing voice's\nfeatures into content, timbre, and pitch features, where we combine multiple\nASR models and compress the content features to reduce timbre leaks. Besides,\nwe enhance the timbre features by unfreezing the speaker encoder and mixing the\nspeaker embedding with top-3 similar speakers. 
We also establish an\nunparalleled large-scale dataset to guarantee zero-shot performance, which\ncomprises more than 1,815 hours of pure singing voice and 6,367 speakers. We\nconduct objective and subjective experiments to find that SaMoye outperforms\nother models in zero-shot SVC tasks even under extreme conditions like\nconverting singing to animals' timbre. The weights, code,\ndataset, and documents of SaMoye are publicly available on\n\\url{https://github.com/CarlWangChina/SaMoye-SVC}.\n","authors":["Zihao Wang","Le Ma","Yongsheng Feng","Xin Pan","Yuhang Jin","Kejun Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.07728v4.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2410.13360v1","updated":"2024-10-17T09:10:26Z","published":"2024-10-17T09:10:26Z","title":"Remember, Retrieve and Generate: Understanding Infinite Visual Concepts\n as Your Personalized Assistant","summary":" The development of large language models (LLMs) has significantly enhanced\nthe capabilities of multimodal LLMs (MLLMs) as general assistants. However,\nlack of user-specific knowledge still restricts their application in humans'\ndaily lives. In this paper, we introduce the Retrieval Augmented Personalization\n(RAP) framework for MLLMs' personalization. Starting from a general MLLM, we\nturn it into a personalized assistant in three steps. (a) Remember: We design a\nkey-value database to store user-related information, e.g., user's name, avatar\nand other attributes. (b) Retrieve: When the user initiates a conversation, RAP\nwill retrieve relevant information from the database using a multimodal\nretriever. (c) Generate: The input query and retrieved concepts' information\nare fed into MLLMs to generate personalized, knowledge-augmented responses.\nUnlike previous methods, RAP allows real-time concept editing via updating the\nexternal database. To further improve generation quality and alignment with\nuser-specific information, we design a pipeline for data collection and create\na specialized dataset for personalized training of MLLMs. Based on the dataset,\nwe train a series of MLLMs as personalized multimodal assistants. By\npretraining on a large-scale dataset, RAP-MLLMs can generalize to infinite visual\nconcepts without additional finetuning. Our models demonstrate outstanding\nflexibility and generation quality across a variety of tasks, such as\npersonalized image captioning, question answering and visual recognition. The\ncode, data and models are available at https://github.com/Hoar012/RAP-MLLM.\n","authors":["Haoran Hao","Jiaming Han","Changsheng Li","Yu-Feng Li","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2410.13360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11522v2","updated":"2024-10-17T08:18:14Z","published":"2024-10-15T11:48:31Z","title":"Leveraging LLM Embeddings for Cross Dataset Label Alignment and Zero\n Shot Music Emotion Prediction","summary":" In this work, we present a novel method for music emotion recognition that\nleverages Large Language Model (LLM) embeddings for label alignment across\nmultiple datasets and zero-shot prediction on novel categories. First, we\ncompute LLM embeddings for emotion labels and apply non-parametric clustering\nto group similar labels across multiple datasets containing disjoint labels.\nWe use these cluster centers to map music features (MERT) to the LLM embedding\nspace. 
To further enhance the model, we introduce an alignment regularization\nthat enables dissociation of MERT embeddings from different clusters. This\nfurther enhances the model's ability to better adapt to unseen datasets.\nWe demonstrate the effectiveness of our approach by performing zero-shot\ninference on a new dataset, showcasing its ability to generalize to unseen\nlabels without additional training.\n","authors":["Renhang Liu","Abhinaba Roy","Dorien Herremans"],"pdf_url":"https://arxiv.org/pdf/2410.11522v2.pdf","comment":null}]},"2024-10-16T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2410.13070v1","updated":"2024-10-16T21:53:48Z","published":"2024-10-16T21:53:48Z","title":"Is Semantic Chunking Worth the Computational Cost?","summary":" Recent advances in Retrieval-Augmented Generation (RAG) systems have\npopularized semantic chunking, which aims to improve retrieval performance by\ndividing documents into semantically coherent segments. Despite its growing\nadoption, the actual benefits over simpler fixed-size chunking, where documents\nare split into consecutive, fixed-size segments, remain unclear. This study\nsystematically evaluates the effectiveness of semantic chunking using three\ncommon retrieval-related tasks: document retrieval, evidence retrieval, and\nretrieval-based answer generation. The results show that the computational\ncosts associated with semantic chunking are not justified by consistent\nperformance gains. These findings challenge previous assumptions about\nsemantic chunking and highlight the need for more efficient chunking strategies\nin RAG systems.\n","authors":["Renyi Qu","Ruixuan Tu","Forrest Bao"],"pdf_url":"https://arxiv.org/pdf/2410.13070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13051v1","updated":"2024-10-16T21:24:13Z","published":"2024-10-16T21:24:13Z","title":"Supply Chain Network Extraction and Entity Classification Leveraging\n Large Language Models","summary":" Supply chain networks are critical to the operational efficiency of\nindustries, yet their increasing complexity presents significant challenges in\nmapping relationships and identifying the roles of various entities.\nTraditional methods for constructing supply chain networks rely heavily on\nstructured datasets and manual data collection, limiting their scope and\nefficiency. In contrast, recent advancements in Natural Language Processing\n(NLP) and large language models (LLMs) offer new opportunities for discovering\nand analyzing supply chain networks using unstructured text data. This paper\nproposes a novel approach that leverages LLMs to extract and process raw\ntextual information from publicly available sources to construct a\ncomprehensive supply chain graph. We focus on the civil engineering sector as a\ncase study, demonstrating how LLMs can uncover hidden relationships among\ncompanies, projects, and other entities. Additionally, we fine-tune an LLM to\nclassify entities within the supply chain graph, providing detailed insights\ninto their roles and relationships. The results show that domain-specific\nfine-tuning improves classification accuracy, highlighting the potential of\nLLMs for industry-specific supply chain analysis. 
Our contributions include the\ndevelopment of a supply chain graph for the civil engineering sector, as well\nas a fine-tuned LLM that enhances entity classification and understanding\nof supply chain networks.\n","authors":["Tong Liu","Hadi Meidani"],"pdf_url":"https://arxiv.org/pdf/2410.13051v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2410.13047v1","updated":"2024-10-16T21:17:18Z","published":"2024-10-16T21:17:18Z","title":"LLM Confidence Evaluation Measures in Zero-Shot CSS Classification","summary":" Assessing classification confidence is critical for leveraging large language\nmodels (LLMs) in automated labeling tasks, especially in the sensitive domains\npresented by Computational Social Science (CSS) tasks. In this paper, we make\nthree key contributions: (1) we propose an uncertainty quantification (UQ)\nperformance measure tailored for data annotation tasks, (2) we compare, for the\nfirst time, five different UQ strategies across three distinct LLMs and CSS\ndata annotation tasks, and (3) we introduce a novel UQ aggregation strategy that\neffectively identifies low-confidence LLM annotations and disproportionately\nuncovers data incorrectly labeled by the LLMs. Our results demonstrate that our\nproposed UQ aggregation strategy improves upon existing methods and can be used\nto significantly improve human-in-the-loop data annotation processes.\n","authors":["David Farr","Iain Cruickshank","Nico Manzonelli","Nicholas Clark","Kate Starbird","Jevin West"],"pdf_url":"https://arxiv.org/pdf/2410.13047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13037v1","updated":"2024-10-16T20:52:39Z","published":"2024-10-16T20:52:39Z","title":"LFOSum: Summarizing Long-form Opinions with Large Language Models","summary":" Online reviews play a pivotal role in influencing consumer decisions across\nvarious domains, from purchasing products to selecting hotels or restaurants.\nHowever, the sheer volume of reviews -- often containing repetitive or\nirrelevant content -- leads to information overload, making it challenging for\nusers to extract meaningful insights. Traditional opinion summarization models\nface challenges in handling long inputs and large volumes of reviews, while\nnewer Large Language Model (LLM) approaches often fail to generate accurate and\nfaithful summaries. To address these challenges, this paper introduces (1) a\nnew dataset of long-form user reviews, each entity comprising over a thousand\nreviews, (2) two training-free LLM-based summarization approaches that scale to\nlong inputs, and (3) automatic evaluation metrics. Our dataset of user reviews\nis paired with in-depth and unbiased critical summaries by domain experts,\nserving as a reference for evaluation. Additionally, our novel reference-free\nevaluation metrics provide a more granular, context-sensitive assessment of\nsummary faithfulness. We benchmark several open-source and closed-source LLMs\nusing our methods. 
Our evaluation reveals that LLMs still face challenges in\nbalancing sentiment and format adherence in long-form summaries, though\nopen-source models can narrow the gap when relevant information is retrieved in\na focused manner.\n","authors":["Mir Tafseer Nayeem","Davood Rafiei"],"pdf_url":"https://arxiv.org/pdf/2410.13037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12956v1","updated":"2024-10-16T18:44:28Z","published":"2024-10-16T18:44:28Z","title":"Towards Computational Analysis of Pansori Singing","summary":" Pansori is one of the most representative vocal genres of Korean traditional\nmusic, which has an elaborated vocal melody line with strong vibrato. Although\nthe music is transmitted orally without any music notation, transcribing\npansori music in Western staff notation has been introduced for several\npurposes, such as documentation of music, education, or research. In this\npaper, we introduce computational analysis of pansori based on both audio and\ncorresponding transcription, how modern Music Information Retrieval tasks can\nbe used in analyzing traditional music and how it revealed different audio\ncharacteristics of what pansori contains.\n","authors":["Sangheon Park","Danbinaerin Han","Dasaem Jeong"],"pdf_url":"https://arxiv.org/pdf/2410.12956v1.pdf","comment":"Late-Breaking Demo Session of the 25th International Society for\n Music Information Retrieval (ISMIR) Conference, 2024"},{"id":"http://arxiv.org/abs/2407.18940v2","updated":"2024-10-16T18:37:15Z","published":"2024-07-10T18:00:03Z","title":"LitSearch: A Retrieval Benchmark for Scientific Literature Search","summary":" Literature search questions, such as \"Where can I find research on the\nevaluation of consistency in generated summaries?\" pose significant challenges\nfor modern search engines and retrieval systems. These questions often require\na deep understanding of research concepts and the ability to reason across\nentire articles. In this work, we introduce LitSearch, a retrieval benchmark\ncomprising 597 realistic literature search queries about recent ML and NLP\npapers. LitSearch is constructed using a combination of (1) questions generated\nby GPT-4 based on paragraphs containing inline citations from research papers\nand (2) questions manually written by authors about their recently published\npapers. All LitSearch questions were manually examined or edited by experts to\nensure high quality. We extensively benchmark state-of-the-art retrieval models\nand also evaluate two LLM-based reranking pipelines. We find a significant\nperformance gap between BM25 and state-of-the-art dense retrievers, with a\n24.8% absolute difference in recall@5. The LLM-based reranking strategies\nfurther improve the best-performing dense retriever by 4.4%. Additionally,\ncommercial search engines and research tools like Google Search perform poorly\non LitSearch, lagging behind the best dense retriever by up to 32 recall\npoints. Taken together, these results show that LitSearch is an informative new\ntestbed for retrieval systems while catering to a real-world use case.\n","authors":["Anirudh Ajith","Mengzhou Xia","Alexis Chevalier","Tanya Goyal","Danqi Chen","Tianyu Gao"],"pdf_url":"https://arxiv.org/pdf/2407.18940v2.pdf","comment":"Accepted by EMNLP 2024. 
Dataset and code are available at\n https://github.com/princeton-nlp/LitSearch"},{"id":"http://arxiv.org/abs/2406.12593v2","updated":"2024-10-16T13:45:54Z","published":"2024-06-18T13:25:18Z","title":"PromptDSI: Prompt-based Rehearsal-free Instance-wise Incremental\n Learning for Document Retrieval","summary":" Differentiable Search Index (DSI) utilizes Pre-trained Language Models (PLMs)\nfor efficient document retrieval without relying on external indexes. However,\nDSI needs full re-training to handle updates in dynamic corpora, causing\nsignificant computational inefficiencies. We introduce PromptDSI, a\nprompt-based rehearsal-free approach for instance-wise incremental learning\nin document retrieval. PromptDSI attaches prompts to the frozen PLM's encoder of\nDSI, leveraging its powerful representation to efficiently index new corpora\nwhile maintaining a balance between stability and plasticity. We eliminate the\ninitial forward pass of prompt-based continual learning methods that doubles\ntraining and inference time. Moreover, we propose a topic-aware prompt pool\nthat employs neural topic embeddings as fixed keys. This strategy ensures\ndiverse and effective prompt usage, addressing the challenge of parameter\nunderutilization caused by the collapse of the query-key matching mechanism.\nOur empirical evaluations demonstrate that BERT-based PromptDSI matches IncDSI\nin managing forgetting while improving performance on new corpora by more than 4%\nHits@10 on NQ320k and up to 3% MRR@10 on MS MARCO 300k.\n","authors":["Tuan-Luc Huynh","Thuy-Trang Vu","Weiqing Wang","Yinwei Wei","Trung Le","Dragan Gasevic","Yuan-Fang Li","Thanh-Toan Do"],"pdf_url":"https://arxiv.org/pdf/2406.12593v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2406.14162v3","updated":"2024-10-16T13:16:25Z","published":"2024-06-20T10:04:09Z","title":"DIRAS: Efficient LLM Annotation of Document Relevance in Retrieval\n Augmented Generation","summary":" Retrieval Augmented Generation (RAG) is widely employed to ground responses\nto queries on domain-specific documents. But do RAG implementations leave out\nimportant information when answering queries that need an integrated analysis\nof information (e.g., Tell me good news in the stock market today.)? To address\nthese concerns, RAG developers need to annotate information retrieval (IR) data\nfor their domain of interest, which is challenging because (1) domain-specific\nqueries usually need nuanced definitions of relevance beyond shallow semantic\nrelevance; and (2) human or GPT-4 annotation is costly and cannot cover all\n(query, document) pairs (i.e., annotation selection bias), thus harming the\neffectiveness in evaluating IR recall. To address these challenges, we propose\nDIRAS (Domain-specific Information Retrieval Annotation with Scalability), a\nmanual-annotation-free schema that fine-tunes open-sourced LLMs to consider\nnuanced relevance definitions and annotate (partial) relevance labels with\ncalibrated relevance scores. 
Extensive evaluation shows that DIRAS enables\nsmaller (8B) LLMs to achieve GPT-4-level performance on annotating and ranking\nunseen (query, document) pairs, and is helpful for real-world RAG development.\nAll code, LLM generations, and human annotations can be found in\n\\url{https://github.com/EdisonNi-hku/DIRAS}.\n","authors":["Jingwei Ni","Tobias Schimanski","Meihong Lin","Mrinmaya Sachan","Elliott Ash","Markus Leippold"],"pdf_url":"https://arxiv.org/pdf/2406.14162v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19014v3","updated":"2024-10-16T12:55:55Z","published":"2024-09-24T01:40:50Z","title":"FLEX: Expert-level False-Less EXecution Metric for Reliable Text-to-SQL\n Benchmark","summary":" Text-to-SQL systems have become crucial for translating natural language into\nSQL queries in various industries, enabling non-technical users to perform\ncomplex data operations. The need for accurate evaluation methods has increased\nas these systems have grown more sophisticated. However, the Execution Accuracy\n(EX), the most prevalent evaluation metric, still shows many false positives\nand negatives. Thus, this paper introduces FLEX (False-Less EXecution), a novel\napproach to evaluating text-to-SQL systems using large language models (LLMs)\nto emulate human expert-level evaluation of SQL queries. Our metric improves\nagreement with human experts (from 62 to 87.04 in Cohen's kappa) with\ncomprehensive context and sophisticated criteria. Our extensive experiments\nyield several key insights: (1) Models' performance increases by over 2.6\npoints on average, substantially affecting rankings on Spider and BIRD\nbenchmarks; (2) The underestimation of models in EX primarily stems from\nannotation quality issues; and (3) Model performance on particularly\nchallenging questions tends to be overestimated. This work contributes to a\nmore accurate and nuanced evaluation of text-to-SQL systems, potentially\nreshaping our understanding of state-of-the-art performance in this field.\n","authors":["Heegyu Kim","Taeyang Jeon","Seunghwan Choi","Seungtaek Choi","Hyunsouk Cho"],"pdf_url":"https://arxiv.org/pdf/2409.19014v3.pdf","comment":"preprint, under review"},{"id":"http://arxiv.org/abs/2410.12519v1","updated":"2024-10-16T12:54:34Z","published":"2024-10-16T12:54:34Z","title":"RosePO: Aligning LLM-based Recommenders with Human Values","summary":" Recently, there has been a growing interest in leveraging Large Language\nModels (LLMs) for recommendation systems, which usually adapt a pre-trained LLM\nto the recommendation scenario through supervised fine-tuning (SFT). However,\nboth the pre-training and SFT stages fail to explicitly model the comparative\nrelationships of a user's preferences on different items. To construct a\n\"helpful and harmless\" LLM-based recommender, we propose a general framework --\nRecommendation with smoothing personalized Preference Optimization (RosePO),\nwhich better aligns with customized human values during the post-training\nstage. Specifically, in addition to the input and chosen response that\nnaturally align with SFT data, we design a rejected sampling strategy tailored\nfor enhancing helpfulness, along with two strategies aimed at mitigating biases\nto promote harmlessness. To ensure robustness against uncertain labels present\nin automatically constructed preference data, we introduce a personalized\nsmoothing factor predicted by a preference oracle into the optimization\nobjective. 
Evaluation on three real-world datasets demonstrates the\neffectiveness of our method, showcasing not only improved recommendation\nperformance but also mitigation of semantic hallucination and popularity bias.\n","authors":["Jiayi Liao","Xiangnan He","Ruobing Xie","Jiancan Wu","Yancheng Yuan","Xingwu Sun","Zhanhui Kang","Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2410.12519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13905v1","updated":"2024-10-16T12:29:22Z","published":"2024-10-16T12:29:22Z","title":"P4GCN: Vertical Federated Social Recommendation with Privacy-Preserving\n Two-Party Graph Convolution Networks","summary":" In recent years, graph neural networks (GNNs) have been commonly utilized for\nsocial recommendation systems. However, real-world scenarios often present\nchallenges related to user privacy and business constraints, inhibiting direct\naccess to valuable social information from other platforms. While many existing\nmethods have tackled matrix factorization-based social recommendations without\ndirect social data access, developing GNN-based federated social recommendation\nmodels under similar conditions remains largely unexplored. To address this\nissue, we propose a novel vertical federated social recommendation method\nleveraging privacy-preserving two-party graph convolution networks (P4GCN) to\nenhance recommendation accuracy without requiring direct access to sensitive\nsocial information. First, we introduce a Sandwich-Encryption module to ensure\ncomprehensive data privacy during the collaborative computing process. Second,\nwe provide a thorough theoretical analysis of the privacy guarantees,\nconsidering the participation of both curious and honest parties. Extensive\nexperiments on four real-world datasets demonstrate that P4GCN outperforms\nstate-of-the-art methods in terms of recommendation accuracy. The code is\navailable at https://github.com/WwZzz/P4GCN.\n","authors":["Zheng Wang","Wanwan Wang","Yimin Huang","Zhaopeng Peng","Ziqi Yang","Cheng Wang","Xiaoliang Fan"],"pdf_url":"https://arxiv.org/pdf/2410.13905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12473v1","updated":"2024-10-16T11:41:24Z","published":"2024-10-16T11:41:24Z","title":"Unifying Economic and Language Models for Enhanced Sentiment Analysis of\n the Oil Market","summary":" Crude oil, a critical component of the global economy, has its prices\ninfluenced by various factors such as economic trends, political events, and\nnatural disasters. Traditional prediction methods based on historical data have\ntheir limits in forecasting, but recent advancements in natural language\nprocessing bring new possibilities for event-based analysis. In particular,\nLanguage Models (LM) and their advancement, the Generative Pre-trained\nTransformer (GPT), have shown potential in classifying vast amounts of natural\nlanguage. However, these LMs often have difficulty with domain-specific\nterminology, limiting their effectiveness in the crude oil sector. Addressing\nthis gap, we introduce CrudeBERT, a fine-tuned LM specifically for the crude\noil market. 
The results indicate that CrudeBERT's sentiment scores align more\nclosely with the WTI Futures curve and significantly enhance price predictions,\nunderscoring the crucial role of integrating economic principles into LMs.\n","authors":["Himmet Kaplan","Ralf-Peter Mundani","Heiko Rölke","Albert Weichselbraun","Martin Tschudy"],"pdf_url":"https://arxiv.org/pdf/2410.12473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12451v1","updated":"2024-10-16T10:58:53Z","published":"2024-10-16T10:58:53Z","title":"Mitigating Dual Latent Confounding Biases in Recommender Systems","summary":" Recommender systems are extensively utilised across various areas to predict\nuser preferences for personalised experiences and enhanced user engagement and\nsatisfaction. Traditional recommender systems, however, are complicated by\nconfounding bias, particularly in the presence of latent confounders that\naffect both item exposure and user feedback. Existing debiasing methods often\nfail to capture the complex interactions caused by latent confounders in\ninteraction data, especially when dual latent confounders affect both the user\nand item sides. To address this, we propose a novel debiasing method that\njointly integrates the Instrumental Variables (IV) approach and identifiable\nVariational Auto-Encoder (iVAE) for Debiased representation learning in\nRecommendation systems, referred to as IViDR. Specifically, IViDR leverages the\nembeddings of user features as IVs to address confounding bias caused by latent\nconfounders between items and user feedback, and reconstructs the embedding of\nitems to obtain debiased interaction data. Moreover, IViDR employs an\nIdentifiable Variational Auto-Encoder (iVAE) to infer identifiable\nrepresentations of latent confounders between item exposure and user feedback\nfrom both the original and debiased interaction data. Additionally, we provide\ntheoretical analyses of the soundness of using IV and the identifiability of\nthe latent representations. Extensive experiments on both synthetic and\nreal-world datasets demonstrate that IViDR outperforms state-of-the-art models\nin reducing bias and providing reliable recommendations.\n","authors":["Jianfeng Deng","Qingfeng Chen","Debo Cheng","Jiuyong Li","Lin Liu","Xiaojing Du"],"pdf_url":"https://arxiv.org/pdf/2410.12451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12890v1","updated":"2024-10-16T08:43:39Z","published":"2024-10-16T08:43:39Z","title":"REFINE on Scarce Data: Retrieval Enhancement through Fine-Tuning via\n Model Fusion of Embedding Models","summary":" Retrieval augmented generation (RAG) pipelines are commonly used in tasks\nsuch as question-answering (QA), relying on retrieving relevant documents from\na vector store computed using a pretrained embedding model. However, if the\nretrieved context is inaccurate, the answers generated using the large language\nmodel (LLM) may contain errors or hallucinations. Although pretrained embedding\nmodels have advanced, adapting them to new domains remains challenging.\nFine-tuning is a potential solution, but industry settings often lack the\nnecessary fine-tuning data. To address these challenges, we propose REFINE, a\nnovel technique that generates synthetic data from available documents and then\nuses a model fusion approach to fine-tune embeddings for improved retrieval\nperformance in new domains, while preserving out-of-domain capability. We\nconducted experiments on the two public datasets: SQUAD and RAG-12000 and a\nproprietary TOURISM dataset. 
Results demonstrate that even the standard\nfine-tuning with the proposed data augmentation technique outperforms the\nvanilla pretrained model. Furthermore, when combined with model fusion, the\nproposed approach achieves superior performance, with a 5.76% improvement in\nrecall on the TOURISM dataset, and 6.58 % and 0.32% enhancement on SQUAD and\nRAG-12000 respectively.\n","authors":["Ambuje Gupta","Mrinal Rawat","Andreas Stolcke","Roberto Pieraccini"],"pdf_url":"https://arxiv.org/pdf/2410.12890v1.pdf","comment":"Accepted in AJCAI'24"},{"id":"http://arxiv.org/abs/2410.12366v1","updated":"2024-10-16T08:37:39Z","published":"2024-10-16T08:37:39Z","title":"Multi-Cause Deconfounding for Recommender Systems with Latent\n Confounders","summary":" In recommender systems, various latent confounding factors (e.g., user social\nenvironment and item public attractiveness) can affect user behavior, item\nexposure, and feedback in distinct ways. These factors may directly or\nindirectly impact user feedback and are often shared across items or users,\nmaking them multi-cause latent confounders. However, existing methods typically\nfail to account for latent confounders between users and their feedback, as\nwell as those between items and user feedback simultaneously. To address the\nproblem of multi-cause latent confounders, we propose a multi-cause\ndeconfounding method for recommender systems with latent confounders (MCDCF).\nMCDCF leverages multi-cause causal effect estimation to learn substitutes for\nlatent confounders associated with both users and items, using user behaviour\ndata. Specifically, MCDCF treats the multiple items that users interact with\nand the multiple users that interact with items as treatment variables,\nenabling it to learn substitutes for the latent confounders that influence the\nestimation of causality between users and their feedback, as well as between\nitems and user feedback. Additionally, we theoretically demonstrate the\nsoundness of our MCDCF method. Extensive experiments on three real-world\ndatasets demonstrate that our MCDCF method effectively recovers latent\nconfounders related to users and items, reducing bias and thereby improving\nrecommendation accuracy.\n","authors":["Zhirong Huang","Shichao Zhang","Debo Cheng","Jiuyong Li","Lin Liu","Guixian Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.12366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00333v2","updated":"2024-10-16T05:21:47Z","published":"2024-06-01T07:18:56Z","title":"A Practice-Friendly LLM-Enhanced Paradigm with Preference Parsing for\n Sequential Recommendation","summary":" The training paradigm integrating large language models (LLM) is gradually\nreshaping sequential recommender systems (SRS) and has shown promising results.\nHowever, most existing LLM-enhanced methods rely on rich textual information on\nthe item side and instance-level supervised fine-tuning (SFT) to inject\ncollaborative information into LLM, which is inefficient and limited in many\napplications. To alleviate these problems, this paper proposes a\npractice-friendly LLM-enhanced paradigm with preference parsing (P2Rec) for\nSRS. Specifically, in the information reconstruction stage, we design a new\nuser-level SFT task for collaborative information injection with the assistance\nof a pre-trained SRS model, which is more efficient and compatible with limited\ntext information. 
Our goal is to let LLM learn to reconstruct a corresponding\nprior preference distribution from each user's interaction sequence, where LLM\nneeds to effectively parse the latent category of each item and the\nrelationship between different items to accomplish this task. In the\ninformation augmentation stage, we feed each item into LLM to obtain a set of\nenhanced embeddings that combine collaborative information and LLM inference\ncapabilities. These embeddings can then be used to help train various future\nSRS models. Finally, we verify the effectiveness and efficiency of our TSLRec\non three SRS benchmark datasets.\n","authors":["Dugang Liu","Shenxian Xian","Xiaolin Lin","Xiaolian Zhang","Hong Zhu","Yuan Fang","Zhen Chen","Zhong Ming"],"pdf_url":"https://arxiv.org/pdf/2406.00333v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12229v1","updated":"2024-10-16T04:44:34Z","published":"2024-10-16T04:44:34Z","title":"Comprehending Knowledge Graphs with Large Language Models for\n Recommender Systems","summary":" Recently, the introduction of knowledge graphs (KGs) has significantly\nadvanced recommender systems by facilitating the discovery of potential\nassociations between items. However, existing methods still face several\nlimitations. First, most KGs suffer from missing facts or limited scopes. This\ncan lead to biased knowledge representations, thereby constraining the model's\nperformance. Second, existing methods typically convert textual information\ninto IDs, resulting in the loss of natural semantic connections between\ndifferent items. Third, existing methods struggle to capture high-order\nrelationships in global KGs due to their inefficient layer-by-layer information\npropagation mechanisms, which are prone to introducing significant noise. To\naddress these limitations, we propose a novel method called CoLaKG, which\nleverages large language models (LLMs) for knowledge-aware recommendation. The\nextensive world knowledge and remarkable reasoning capabilities of LLMs enable\nthem to supplement KGs. Additionally, the strong text comprehension abilities\nof LLMs allow for a better understanding of semantic information. Based on\nthis, we first extract subgraphs centered on each item from the KG and convert\nthem into textual inputs for the LLM. The LLM then outputs its comprehension of\nthese item-centered subgraphs, which are subsequently transformed into semantic\nembeddings. Furthermore, to utilize the global information of the KG, we\nconstruct an item-item graph using these semantic embeddings, which can\ndirectly capture higher-order associations between items. Both the semantic\nembeddings and the structural information from the item-item graph are\neffectively integrated into the recommendation model through our designed\nrepresentation alignment and neighbor augmentation modules. Extensive\nexperiments on four real-world datasets demonstrate the superiority of our\nmethod.\n","authors":["Ziqiang Cui","Yunpeng Weng","Xing Tang","Fuyuan Lyu","Dugang Liu","Xiuqiang He","Chen Ma"],"pdf_url":"https://arxiv.org/pdf/2410.12229v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12228v1","updated":"2024-10-16T04:44:15Z","published":"2024-10-16T04:44:15Z","title":"Triple Modality Fusion: Aligning Visual, Textual, and Graph Data with\n Large Language Models for Multi-Behavior Recommendations","summary":" Integrating diverse data modalities is crucial for enhancing the performance\nof personalized recommendation systems. 
Traditional models, which often rely on\nsingular data sources, lack the depth needed to accurately capture the\nmultifaceted nature of item features and user behaviors. This paper introduces\na novel framework for multi-behavior recommendations, leveraging the fusion of\ntriple-modality, which is visual, textual, and graph data through alignment\nwith large language models (LLMs). By incorporating visual information, we\ncapture contextual and aesthetic item characteristics; textual data provides\ninsights into user interests and item features in detail; and graph data\nelucidates relationships within the item-behavior heterogeneous graphs. Our\nproposed model called Triple Modality Fusion (TMF) utilizes the power of LLMs\nto align and integrate these three modalities, achieving a comprehensive\nrepresentation of user behaviors. The LLM models the user's interactions\nincluding behaviors and item features in natural languages. Initially, the LLM\nis warmed up using only natural language-based prompts. We then devise the\nmodality fusion module based on cross-attention and self-attention mechanisms\nto integrate different modalities from other models into the same embedding\nspace and incorporate them into an LLM. Extensive experiments demonstrate the\neffectiveness of our approach in improving recommendation accuracy. Further\nablation studies validate the effectiveness of our model design and benefits of\nthe TMF.\n","authors":["Luyi Ma","Xiaohan Li","Zezhong Fan","Jianpeng Xu","Jason Cho","Praveen Kanumala","Kaushiki Nag","Sushant Kumar","Kannan Achan"],"pdf_url":"https://arxiv.org/pdf/2410.12228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17969v2","updated":"2024-10-16T03:35:22Z","published":"2024-05-28T08:56:33Z","title":"Knowledge Circuits in Pretrained Transformers","summary":" The remarkable capabilities of modern large language models are rooted in\ntheir vast repositories of knowledge encoded within their parameters, enabling\nthem to perceive the world and engage in reasoning. The inner workings of how\nthese models store knowledge have long been a subject of intense interest and\ninvestigation among researchers. To date, most studies have concentrated on\nisolated components within these models, such as the Multilayer Perceptrons and\nattention head. In this paper, we delve into the computation graph of the\nlanguage model to uncover the knowledge circuits that are instrumental in\narticulating specific knowledge. The experiments, conducted with GPT2 and\nTinyLLAMA, have allowed us to observe how certain information heads, relation\nheads, and Multilayer Perceptrons collaboratively encode knowledge within the\nmodel. Moreover, we evaluate the impact of current knowledge editing techniques\non these knowledge circuits, providing deeper insights into the functioning and\nconstraints of these editing methodologies. Finally, we utilize knowledge\ncircuits to analyze and interpret language model behaviors such as\nhallucinations and in-context learning. We believe the knowledge circuits hold\npotential for advancing our understanding of Transformers and guiding the\nimproved design of knowledge editing. 
Code and data are available in\nhttps://github.com/zjunlp/KnowledgeCircuits.\n","authors":["Yunzhi Yao","Ningyu Zhang","Zekun Xi","Mengru Wang","Ziwen Xu","Shumin Deng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2405.17969v2.pdf","comment":"NeurIPS 2024, 32 pages"},{"id":"http://arxiv.org/abs/2410.09119v2","updated":"2024-10-16T03:15:12Z","published":"2024-10-10T21:13:56Z","title":"$\\textit{lucie}$: An Improved Python Package for Loading Datasets from\n the UCI Machine Learning Repository","summary":" The University of California--Irvine (UCI) Machine Learning (ML) Repository\n(UCIMLR) is consistently cited as one of the most popular dataset repositories,\nhosting hundreds of high-impact datasets. However, a significant portion,\nincluding 28.4% of the top 250, cannot be imported via the $\\textit{ucimlrepo}$\npackage that is provided and recommended by the UCIMLR website. Instead, they\nare hosted as .zip files, containing nonstandard formats that are difficult to\nimport without additional ad hoc processing. To address this issue, here we\npresent $\\textit{lucie}$ -- $\\underline{l}oad$ $\\underline{U}niversity$\n$\\underline{C}alifornia$ $\\underline{I}rvine$ $\\underline{e}xamples$ -- a\nutility that automatically determines the data format and imports many of these\npreviously non-importable datasets, while preserving as much of a tabular data\nstructure as possible. $\\textit{lucie}$ was designed using the top 100 most\npopular datasets and benchmarked on the next 130, where it resulted in a\nsuccess rate of 95.4% vs. 73.1% for $\\textit{ucimlrepo}$. $\\textit{lucie}$ is\navailable as a Python package on PyPI with 98% code coverage.\n","authors":["Kenneth Ge","Phuc Nguyen","Ramy Arnaout"],"pdf_url":"https://arxiv.org/pdf/2410.09119v2.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.07427v2","updated":"2024-10-16T02:37:50Z","published":"2024-08-14T10:03:40Z","title":"Beyond Inter-Item Relations: Dynamic Adaption for Enhancing LLM-Based\n Sequential Recommendation","summary":" Sequential recommender systems (SRS) predict the next items that users may\nprefer based on user historical interaction sequences. Inspired by the rise of\nlarge language models (LLMs) in various AI applications, there is a surge of\nwork on LLM-based SRS. Despite their attractive performance, existing LLM-based\nSRS still exhibit some limitations, including neglecting intra-item relations,\nignoring long-term collaborative knowledge and using inflexible architecture\ndesigns for adaption. To alleviate these issues, we propose an LLM-based\nsequential recommendation model named DARec. Built on top of coarse-grained\nadaption for capturing inter-item relations, DARec is further enhanced with (1)\ncontext masking that models intra-item relations to help LLM better understand\ntoken and item semantics in the context of SRS, (2) collaborative knowledge\ninjection that helps LLM incorporate long-term collaborative knowledge, and (3)\na dynamic adaption mechanism that uses Bayesian optimization to flexibly choose\nlayer-wise adapter architectures in order to better incorporate different\nsequential information. 
Extensive experiments demonstrate that DARec can\neffectively handle sequential recommendation in a dynamic and adaptive manner.\n","authors":["CanYi Liu","Wei Li"," Youchen"," Zhang","Hui Li","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2408.07427v2.pdf","comment":"11 pages, 14 figures"},{"id":"http://arxiv.org/abs/2410.12886v1","updated":"2024-10-16T01:57:56Z","published":"2024-10-16T01:57:56Z","title":"AT-RAG: An Adaptive RAG Model Enhancing Query Efficiency with Topic\n Filtering and Iterative Reasoning","summary":" Recent advancements in QA with LLM, like GPT-4, have shown limitations in\nhandling complex multi-hop queries. We propose AT-RAG, a novel multistep RAG\nincorporating topic modeling for efficient document retrieval and reasoning.\nUsing BERTopic, our model dynamically assigns topics to queries, improving\nretrieval accuracy and efficiency. We evaluated AT-RAG on multihop benchmark\ndatasets QA and a medical case study QA. Results show significant improvements\nin correctness, completeness, and relevance compared to existing methods.\nAT-RAG reduces retrieval time while maintaining high precision, making it\nsuitable for general tasks QA and complex domain-specific challenges such as\nmedical QA. The integration of topic filtering and iterative reasoning enables\nour model to handle intricate queries efficiently, which makes it suitable for\napplications that require nuanced information retrieval and decision-making.\n","authors":["Mohammad Reza Rezaei","Maziar Hafezi","Amit Satpathy","Lovell Hodge","Ebrahim Pourjafari"],"pdf_url":"https://arxiv.org/pdf/2410.12886v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.13088v1","updated":"2024-10-16T23:05:59Z","published":"2024-10-16T23:05:59Z","title":"Self-Comparison for Dataset-Level Membership Inference in Large\n (Vision-)Language Models","summary":" Large Language Models (LLMs) and Vision-Language Models (VLMs) have made\nsignificant advancements in a wide range of natural language processing and\nvision-language tasks. Access to large web-scale datasets has been a key factor\nin their success. However, concerns have been raised about the unauthorized use\nof copyrighted materials and potential copyright infringement. Existing\nmethods, such as sample-level Membership Inference Attacks (MIA) and\ndistribution-based dataset inference, distinguish member data (data used for\ntraining) and non-member data by leveraging the common observation that models\ntend to memorize and show greater confidence in member data. Nevertheless,\nthese methods face challenges when applied to LLMs and VLMs, such as the\nrequirement for ground-truth member data or non-member data that shares the\nsame distribution as the test data. In this paper, we propose a novel\ndataset-level membership inference method based on Self-Comparison. We find\nthat a member prefix followed by a non-member suffix (paraphrased from a member\nsuffix) can further trigger the model's memorization on training data. Instead\nof directly comparing member and non-member data, we introduce paraphrasing to\nthe second half of the sequence and evaluate how the likelihood changes before\nand after paraphrasing. Unlike prior approaches, our method does not require\naccess to ground-truth member data or non-member data in identical\ndistribution, making it more practical. 
Extensive experiments demonstrate that\nour proposed method outperforms traditional MIA and dataset inference\ntechniques across various datasets and models, including including public\nmodels, fine-tuned models, and API-based commercial models.\n","authors":["Jie Ren","Kangrui Chen","Chen Chen","Vikash Sehwag","Yue Xing","Jiliang Tang","Lingjuan Lyu"],"pdf_url":"https://arxiv.org/pdf/2410.13088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12957v1","updated":"2024-10-16T18:44:56Z","published":"2024-10-16T18:44:56Z","title":"MuVi: Video-to-Music Generation with Semantic Alignment and Rhythmic\n Synchronization","summary":" Generating music that aligns with the visual content of a video has been a\nchallenging task, as it requires a deep understanding of visual semantics and\ninvolves generating music whose melody, rhythm, and dynamics harmonize with the\nvisual narratives. This paper presents MuVi, a novel framework that effectively\naddresses these challenges to enhance the cohesion and immersive experience of\naudio-visual content. MuVi analyzes video content through a specially designed\nvisual adaptor to extract contextually and temporally relevant features. These\nfeatures are used to generate music that not only matches the video's mood and\ntheme but also its rhythm and pacing. We also introduce a contrastive\nmusic-visual pre-training scheme to ensure synchronization, based on the\nperiodicity nature of music phrases. In addition, we demonstrate that our\nflow-matching-based music generator has in-context learning ability, allowing\nus to control the style and genre of the generated music. Experimental results\nshow that MuVi demonstrates superior performance in both audio quality and\ntemporal synchronization. The generated music video samples are available at\nhttps://muvi-v2m.github.io.\n","authors":["Ruiqi Li","Siqi Zheng","Xize Cheng","Ziang Zhang","Shengpeng Ji","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.12957v1.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2410.12700v1","updated":"2024-10-16T16:03:42Z","published":"2024-10-16T16:03:42Z","title":"Embedding an Ethical Mind: Aligning Text-to-Image Synthesis via\n Lightweight Value Optimization","summary":" Recent advancements in diffusion models trained on large-scale data have\nenabled the generation of indistinguishable human-level images, yet they often\nproduce harmful content misaligned with human values, e.g., social bias, and\noffensive content. Despite extensive research on Large Language Models (LLMs),\nthe challenge of Text-to-Image (T2I) model alignment remains largely\nunexplored. Addressing this problem, we propose LiVO (Lightweight Value\nOptimization), a novel lightweight method for aligning T2I models with human\nvalues. LiVO only optimizes a plug-and-play value encoder to integrate a\nspecified value principle with the input prompt, allowing the control of\ngenerated images over both semantics and values. Specifically, we design a\ndiffusion model-tailored preference optimization loss, which theoretically\napproximates the Bradley-Terry model used in LLM alignment but provides a more\nflexible trade-off between image quality and value conformity. To optimize the\nvalue encoder, we also develop a framework to automatically construct a\ntext-image preference dataset of 86k (prompt, aligned image, violating image,\nvalue principle) samples. 
Without updating most model parameters and through\nadaptive value selection from the input prompt, LiVO significantly reduces\nharmful outputs and achieves faster convergence, surpassing several strong\nbaselines and taking an initial step towards ethically aligned T2I models.\n","authors":["Xingqi Wang","Xiaoyuan Yi","Xing Xie","Jia Jia"],"pdf_url":"https://arxiv.org/pdf/2410.12700v1.pdf","comment":"Accepted by ACM Multimedia 2024. The dataset and code can be found at\n https://github.com/achernarwang/LiVO"},{"id":"http://arxiv.org/abs/2410.12220v1","updated":"2024-10-16T04:31:25Z","published":"2024-10-16T04:31:25Z","title":"Rethinking Bjøntegaard Delta for Compression Efficiency Evaluation:\n Are We Calculating It Precisely and Reliably?","summary":" For decades, the Bj{\\o}ntegaard Delta (BD) has been the metric for evaluating\ncodec Rate-Distortion (R-D) performance. Yet, in most studies, BD is determined\nusing just 4-5 R-D data points, could this be sufficient? As codecs and quality\nmetrics advance, does the conventional BD estimation still hold up? Crucially,\nare the performance improvements of new codecs and tools genuine, or merely\nartifacts of estimation flaws? This paper addresses these concerns by\nreevaluating BD estimation. We present a novel approach employing a\nparameterized deep neural network to model R-D curves with high precision\nacross various metrics, accompanied by a comprehensive R-D dataset. This\napproach both assesses the reliability of BD calculations and serves as a\nprecise BD estimator. Our findings advocate for the adoption of rigorous R-D\nsampling and reliability metrics in future compression research to ensure the\nvalidity and reliability of results.\n","authors":["Xinyu Hang","Shenpeng Song","Zhimeng Huang","Chuanmin Jia","Siwei Ma","Wen Gao"],"pdf_url":"https://arxiv.org/pdf/2410.12220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.12219v1","updated":"2024-10-16T04:29:46Z","published":"2024-10-16T04:29:46Z","title":"OmnixR: Evaluating Omni-modality Language Models on Reasoning across\n Modalities","summary":" We introduce OmnixR, an evaluation suite designed to benchmark SoTA\nOmni-modality Language Models, such as GPT-4o and Gemini. Evaluating OLMs,\nwhich integrate multiple modalities such as text, vision, and audio, presents\nunique challenges. Particularly, the user message might often consist of\nmultiple modalities, such that OLMs have to establish holistic understanding\nand reasoning across modalities to accomplish the task. Existing benchmarks are\nlimited to single modality or dual-modality tasks, overlooking comprehensive\nmulti-modal assessments of model reasoning. To address this, OmnixR offers two\nevaluation variants: (1)synthetic subset: a synthetic dataset generated\nautomatically by translating text into multiple modalities--audio, images,\nvideo, and hybrids (Omnify). (2)realistic subset: a real-world dataset,\nmanually curated and annotated by experts, for evaluating cross-modal reasoning\nin natural settings. OmnixR presents a unique evaluation towards assessing OLMs\nover a diverse mix of modalities, such as a question that involves video,\naudio, and text, providing a rigorous cross-modal reasoning testbed unlike any\nexisting benchmarks. Our experiments find that all state-of-the-art OLMs\nstruggle with OmnixR questions that require integrating information from\nmultiple modalities to answer. 
Further analysis highlights differences in\nreasoning behavior, underscoring the challenges of omni-modal AI alignment.\n","authors":["Lichang Chen","Hexiang Hu","Mingda Zhang","Yiwen Chen","Zifeng Wang","Yandong Li","Pranav Shyam","Tianyi Zhou","Heng Huang","Ming-Hsuan Yang","Boqing Gong"],"pdf_url":"https://arxiv.org/pdf/2410.12219v1.pdf","comment":"19 pages, 6 figures, 12 tables"},{"id":"http://arxiv.org/abs/2407.07464v2","updated":"2024-10-16T03:44:41Z","published":"2024-07-10T08:40:39Z","title":"Video-to-Audio Generation with Hidden Alignment","summary":" Generating semantically and temporally aligned audio content in accordance\nwith video input has become a focal point for researchers, particularly\nfollowing the remarkable breakthrough in text-to-video generation. In this\nwork, we aim to offer insights into the video-to-audio generation paradigm,\nfocusing on three crucial aspects: vision encoders, auxiliary embeddings, and\ndata augmentation techniques. Beginning with a foundational model built on a\nsimple yet surprisingly effective intuition, we explore various vision encoders\nand auxiliary embeddings through ablation studies. Employing a comprehensive\nevaluation pipeline that emphasizes generation quality and video-audio\nsynchronization alignment, we demonstrate that our model exhibits\nstate-of-the-art video-to-audio generation capabilities. Furthermore, we\nprovide critical insights into the impact of different data augmentation\nmethods on enhancing the generation framework's overall capacity. We showcase\npossibilities to advance the challenge of generating synchronized audio from\nsemantic and temporal perspectives. We hope these insights will serve as a\nstepping stone toward developing more realistic and accurate audio-visual\ngeneration models.\n","authors":["Manjie Xu","Chenxing Li","Xinyi Tu","Yong Ren","Rilin Chen","Yu Gu","Wei Liang","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2407.07464v2.pdf","comment":"https://sites.google.com/view/vta-ldm"},{"id":"http://arxiv.org/abs/2410.12191v1","updated":"2024-10-16T03:25:16Z","published":"2024-10-16T03:25:16Z","title":"Test-time adaptation for image compression with distribution\n regularization","summary":" Current test- or compression-time adaptation image compression (TTA-IC)\napproaches, which leverage both latent and decoder refinements as a two-step\nadaptation scheme, have potentially enhanced the rate-distortion (R-D)\nperformance of learned image compression models on cross-domain compression\ntasks, \\textit{e.g.,} from natural to screen content images. However, compared\nwith the emergence of various decoder refinement variants, the latent\nrefinement, as an inseparable ingredient, is barely tailored to cross-domain\nscenarios. To this end, we aim to develop an advanced latent refinement method\nby extending the effective hybrid latent refinement (HLR) method, which is\ndesigned for \\textit{in-domain} inference improvement but shows noticeable\ndegradation of the rate cost in \\textit{cross-domain} tasks. Specifically, we\nfirst provide theoretical analyses, in a cue of marginalization approximation\nfrom in- to cross-domain scenarios, to uncover that the vanilla HLR suffers\nfrom an underlying mismatch between refined Gaussian conditional and hyperprior\ndistributions, leading to deteriorated joint probability approximation of\nmarginal distribution with increased rate consumption. 
To remedy this issue, we\nintroduce a simple Bayesian approximation-endowed \\textit{distribution\nregularization} to encourage learning a better joint probability approximation\nin a plug-and-play manner. Extensive experiments on six in- and cross-domain\ndatasets demonstrate that our proposed method not only improves the R-D\nperformance compared with other latent refinement counterparts, but also can be\nflexibly integrated into existing TTA-IC methods with incremental benefits.\n","authors":["Kecheng Chen","Pingping Zhang","Tiexin Qin","Shiqi Wang","Hong Yan","Haoliang Li"],"pdf_url":"https://arxiv.org/pdf/2410.12191v1.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: 
var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + 
color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..54a5d1b1 --- /dev/null +++ b/index.html @@ -0,0 +1,24845 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 95 + +
+
+
+ + ☆ ALTA: Compiler-Based Analysis of Transformers + + +
+ We propose a new programming language called ALTA and a compiler that can map +ALTA programs to Transformer weights. ALTA is inspired by RASP, a language +proposed by Weiss et al. (2021), and Tracr (Lindner et al., 2023), a compiler +from RASP programs to Transformer weights. ALTA complements and extends this +prior work, offering the ability to express loops and to compile programs to +Universal Transformers, among other advantages. ALTA allows us to +constructively show how Transformers can represent length-invariant algorithms +for computing parity and addition, as well as a solution to the SCAN benchmark +of compositional generalization tasks, without requiring intermediate +scratchpad decoding steps. We also propose tools to analyze cases where the +expressibility of an algorithm is established, but end-to-end training on a +given training set fails to induce behavior consistent with the desired +algorithm. To this end, we explore training from ALTA execution traces as a +more fine-grained supervision signal. This enables additional experiments and +theoretical analyses relating the learnability of various algorithms to data +availability and modeling decisions, such as positional encodings. We make the +ALTA framework -- language specification, symbolic interpreter, and weight +compiler -- available to the community to enable further applications and +insights. + +
+
+
+
+
+ + ☆ TP-Eval: Tap Multimodal LLMs' Potential in Evaluation by Customizing + Prompts + + +
+ Recently, multimodal large language models (MLLMs) have received much +attention for their impressive capabilities. The evaluation of MLLMs is +becoming critical to analyzing attributes of MLLMs and providing valuable +insights. However, current benchmarks overlook the problem of prompt +sensitivity - minor prompt variations may lead to significant performance +fluctuations. Thus, inappropriate prompts may obscure the models' capabilities, +underestimating the models' performance. Moreover, different models have +different preferences for different prompts, and thus, using the same prompt +for all models will cause evaluation bias. This paper analyzes this deficiency +in existing benchmarks and further introduces a new evaluation framework named +TP-Eval, which introduces a prompt customization method to reduce evaluation +biases and tap models' potential. TP-Eval will rewrite the original prompts to +different customized prompts for different models. In particular, we propose +some well-designed modules for prompt customization tailored to the scenario of +MLLM evaluation. Extensive experiments demonstrate the effectiveness of our +approach to uncovering models' capabilities, and TP-Eval should benefit the +community in developing more comprehensive and convincing MLLM evaluation +benchmarks. + +
+
+
+
+
+ + ☆ CLEAR: Character Unlearning in Textual and Visual Modalities + + +
+ Machine Unlearning (MU) is critical for enhancing privacy and security in +deep learning models, particularly in large multimodal language models (MLLMs), +by removing specific private or hazardous information. While MU has made +significant progress in textual and visual modalities, multimodal unlearning +(MMU) remains significantly underexplored, partially due to the absence of a +suitable open-source benchmark. To address this, we introduce CLEAR, a new +benchmark designed to evaluate MMU methods. CLEAR contains 200 fictitious +individuals and 3,700 images linked with corresponding question-answer pairs, +enabling a thorough evaluation across modalities. We assess 10 MU methods, +adapting them for MMU, and highlight new challenges specific to multimodal +forgetting. We also demonstrate that simple $\ell_1$ regularization on LoRA +weights significantly mitigates catastrophic forgetting, preserving model +performance on retained data. The dataset is available at +https://huggingface.co/datasets/therem/CLEAR + +
+
+
+
+
+ + ☆ LongRAG: A Dual-Perspective Retrieval-Augmented Generation Paradigm for + Long-Context Question Answering EMNLP 2024 + + +
+ Long-Context Question Answering (LCQA), a challenging task, aims to reason +over long-context documents to yield accurate answers to questions. Existing +long-context Large Language Models (LLMs) for LCQA often struggle with the +"lost in the middle" issue. Retrieval-Augmented Generation (RAG) mitigates this +issue by providing external factual evidence. However, its chunking strategy +disrupts the global long-context information, and its low-quality retrieval in +long contexts hinders LLMs from identifying effective factual details due to +substantial noise. To this end, we propose LongRAG, a general, +dual-perspective, and robust LLM-based RAG system paradigm for LCQA to enhance +RAG's understanding of complex long-context knowledge (i.e., global information +and factual details). We design LongRAG as a plug-and-play paradigm, +facilitating adaptation to various domains and LLMs. Extensive experiments on +three multi-hop datasets demonstrate that LongRAG significantly outperforms +long-context LLMs (up by 6.94%), advanced RAG (up by 6.16%), and Vanilla RAG +(up by 17.25%). Furthermore, we conduct quantitative ablation studies and +multi-dimensional analyses, highlighting the effectiveness of the system's +components and fine-tuning strategies. Data and code are available at +https://github.com/QingFei1/LongRAG. + +
+
+ comment: EMNLP 2024 Main +
+
+
+
+
+ + ☆ Key Algorithms for Keyphrase Generation: Instruction-Based LLMs for + Russian Scientific Keyphrases + + +
Keyphrase selection is a challenging task in natural language processing that +has a wide range of applications. Adapting existing supervised and unsupervised +solutions for the Russian language faces several limitations due to the rich +morphology of Russian and the limited number of training datasets available. +Recent studies conducted on English texts show that large language models +(LLMs) successfully address the task of generating keyphrases. LLMs allow +achieving impressive results without task-specific fine-tuning, using text +prompts instead. In this work, we assess the performance of prompt-based +methods for generating keyphrases for Russian scientific abstracts. First, we +compare the performance of zero-shot and few-shot prompt-based methods, +fine-tuned models, and unsupervised methods. Then we assess strategies for +selecting keyphrase examples in a few-shot setting. We present the outcomes of +human evaluation of the generated keyphrases and analyze the strengths and +weaknesses of the models through expert assessment. Our results suggest that +prompt-based methods can outperform common baselines even using simple text +prompts. +
+
+ comment: The 12th International Conference on Analysis of Images, Social + Networks and Texts (AIST'2024) +
+
+
+
+
+ + ☆ MiLoRA: Efficient Mixture of Low-Rank Adaptation for Large Language + Models Fine-tuning EMNLP 2024 + + +
+ Low-rank adaptation (LoRA) and its mixture-of-experts (MOE) variants are +highly effective parameter-efficient fine-tuning (PEFT) methods. However, they +introduce significant latency in multi-tenant settings due to the LoRA modules +and MOE routers added to multiple linear modules in the Transformer layer. To +address this issue, we propose Mixture of Low-Rank Adaptation (MiLoRA), a novel +and efficient LoRA variant. MiLoRA differs from previous MOE-style LoRA methods +by considering each LoRA module as an expert and employing a prompt-aware +routing mechanism. This mechanism calculates expert routing results once before +generating the first new token and reuses these results for subsequent tokens, +reducing latency. Extensive experiments and analysis on commonsense reasoning +tasks, math reasoning tasks, and widely used LLM evaluation benchmarks +demonstrate that MiLoRA consistently outperforms strong PEFT baselines with +comparable tunable parameter budgets. Additionally, MiLoRA significantly +reduces latency in multi-tenant settings compared to previous LoRA-based +methods. + +
+
+ comment: Accepted by EMNLP 2024 Findings. arXiv admin note: substantial text + overlap with arXiv:2405.18203 +
+
+
+
+
+ + ☆ GraphTeam: Facilitating Large Language Model-based Graph Analysis via + Multi-Agent Collaboration + + +
+ Graphs are widely used for modeling relational data in real-world scenarios, +such as social networks and urban computing. Existing LLM-based graph analysis +approaches either integrate graph neural networks (GNNs) for specific machine +learning tasks, limiting their transferability, or rely solely on LLMs' +internal reasoning ability, resulting in suboptimal performance. To address +these limitations, we take advantage of recent advances in LLM-based agents, +which have shown capabilities of utilizing external knowledge or tools for +problem solving. By simulating human problem-solving strategies such as analogy +and collaboration, we propose a multi-agent system based on LLMs named +GraphTeam, for graph analysis. GraphTeam consists of five LLM-based agents from +three modules, and the agents with different specialities can collaborate with +each other to address complex problems. Specifically, (1) input-output +normalization module: the question agent extracts and refines four key +arguments from the original question, facilitating the problem understanding, +and the answer agent organizes the results to meet the output requirement; (2) +external knowledge retrieval module: we first build a knowledge base consisting +of relevant documentation and experience information, and then the search agent +retrieves the most relevant entries for each question. (3) problem-solving +module: given the retrieved information from search agent, the coding agent +uses established algorithms via programming to generate solutions, and in case +the coding agent does not work, the reasoning agent will directly compute the +results without programming. Extensive experiments on six graph analysis +benchmarks demonstrate that GraphTeam achieves state-of-the-art performance +with an average 25.85% improvement over the best baseline in terms of accuracy. +The code and data are available at https://github.com/BUPT-GAMMA/GraphTeam. + +
+
+
+
+
+ + ☆ Cross-lingual Transfer of Reward Models in Multilingual Alignment + + +
+ Reinforcement learning with human feedback (RLHF) is shown to largely benefit +from precise reward models (RMs). However, recent studies in reward modeling +schemes are skewed towards English, limiting the applicability of RLHF in +multilingual alignments. In this work, we investigate the cross-lingual +transfer of RMs trained in diverse languages, primarily from English. Our +experimental results demonstrate the strong cross-lingual transfer of English +RMs, exceeding target language RMs by 3~4% average increase in Multilingual +RewardBench. Furthermore, we analyze the cross-lingual transfer of RMs through +the representation shifts. Finally, we perform multilingual alignment to +exemplify how cross-lingual transfer in RM propagates to enhanced multilingual +instruction-following capability, along with extensive analyses on +off-the-shelf RMs. We release the code, model, and data. + +
+
+
+
+
+ + ☆ Together We Can: Multilingual Automatic Post-Editing for Low-Resource + Languages EMNLP 2024 + + +
+ This exploratory study investigates the potential of multilingual Automatic +Post-Editing (APE) systems to enhance the quality of machine translations for +low-resource Indo-Aryan languages. Focusing on two closely related language +pairs, English-Marathi and English-Hindi, we exploit the linguistic +similarities to develop a robust multilingual APE model. To facilitate +cross-linguistic transfer, we generate synthetic Hindi-Marathi and +Marathi-Hindi APE triplets. Additionally, we incorporate a Quality Estimation +(QE)-APE multi-task learning framework. While the experimental results +underline the complementary nature of APE and QE, we also observe that QE-APE +multitask learning facilitates effective domain adaptation. Our experiments +demonstrate that the multilingual APE models outperform their corresponding +English-Hindi and English-Marathi single-pair models by $2.5$ and $2.39$ TER +points, respectively, with further notable improvements over the multilingual +APE model observed through multi-task learning ($+1.29$ and $+1.44$ TER +points), data augmentation ($+0.53$ and $+0.45$ TER points) and domain +adaptation ($+0.35$ and $+0.45$ TER points). We release the synthetic data, +code, and models accrued during this study publicly at +https://github.com/cfiltnlp/Multilingual-APE. + +
+
+ comment: Accepted at Findings of EMNLP 2024 +
+
+
+
+
+ + ☆ Dependency Graph Parsing as Sequence Labeling EMNLP-2024 + + +
+ Various linearizations have been proposed to cast syntactic dependency +parsing as sequence labeling. However, these approaches do not support more +complex graph-based representations, such as semantic dependencies or enhanced +universal dependencies, as they cannot handle reentrancy or cycles. By +extending them, we define a range of unbounded and bounded linearizations that +can be used to cast graph parsing as a tagging task, enlarging the toolbox of +problems that can be solved under this paradigm. Experimental results on +semantic dependency and enhanced UD parsing show that with a good choice of +encoding, sequence-labeling dependency graph parsers combine high efficiency +with accuracies close to the state of the art, in spite of their simplicity. + +
+
+ comment: Accepted at EMNLP-2024 +
+
+
+
+
+ + ☆ A Time-Aware Approach to Early Detection of Anorexia: UNSL at eRisk 2024 + + +
The eRisk laboratory aims to address issues related to early risk detection +on the Web. In this year's edition, three tasks were proposed, where Task 2 was +about early detection of signs of anorexia. Early risk detection is a problem +where precision and speed are two crucial objectives. Our research group solved +Task 2 by defining a CPI+DMC approach, addressing both objectives +independently, and a time-aware approach, where precision and speed are +considered as a single combined objective. We implemented the last approach by +explicitly integrating time during the learning process, considering the +ERDEθ metric as the training objective. It also allowed us to +incorporate temporal metrics to validate and select the optimal models. We +achieved outstanding results for the ERDE50 metric and ranking-based metrics, +demonstrating consistency in solving ERD problems. +
+
+ comment: In Conference and Labs of the Evaluation Forum (CLEF 2024), Grenoble, + France +
+
+
+
+
+ + ☆ Zeitenwenden: Detecting changes in the German political discourse + + +
+ From a monarchy to a democracy, to a dictatorship and back to a democracy -- +the German political landscape has been constantly changing ever since the +first German national state was formed in 1871. After World War II, the Federal +Republic of Germany was formed in 1949. Since then every plenary session of the +German Bundestag was logged and even has been digitized over the course of the +last few years. We analyze these texts using a time series variant of the topic +model LDA to investigate which events had a lasting effect on the political +discourse and how the political topics changed over time. This allows us to +detect changes in word frequency (and thus key discussion points) in political +discourse. + +
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ☆ ExpertFlow: Optimized Expert Activation and Token Allocation for + Efficient Mixture-of-Experts Inference + + +
+ Sparse Mixture of Experts (MoE) models, while outperforming dense Large +Language Models (LLMs) in terms of performance, face significant deployment +challenges during inference due to their high memory demands. Existing +offloading techniques, which involve swapping activated and idle experts +between the GPU and CPU, often suffer from rigid expert caching mechanisms. +These mechanisms fail to adapt to dynamic routing, leading to inefficient cache +utilization, or incur prohibitive costs for prediction training. To tackle +these inference-specific challenges, we introduce ExpertFlow, a comprehensive +system specifically designed to enhance inference efficiency by accommodating +flexible routing and enabling efficient expert scheduling between CPU and GPU. +This reduces overhead and boosts system performance. Central to our approach is +a predictive routing path-based offloading mechanism that utilizes a +lightweight predictor to accurately forecast routing paths before computation +begins. This proactive strategy allows for real-time error correction in expert +caching, significantly increasing cache hit ratios and reducing the frequency +of expert transfers, thereby minimizing I/O overhead. Additionally, we +implement a dynamic token scheduling strategy that optimizes MoE inference by +rearranging input tokens across different batches. This method not only reduces +the number of activated experts per batch but also improves computational +efficiency. Our extensive experiments demonstrate that ExpertFlow achieves up +to 93.72\% GPU memory savings and enhances inference speed by 2 to 10 times +compared to baseline methods, highlighting its effectiveness and utility as a +robust solution for resource-constrained inference scenarios. + +
+
+ comment: Mixture-of-Experts, Inference, Offloading +
+
+
+
+
+ + ☆ SimRAG: Self-Improving Retrieval-Augmented Generation for Adapting Large + Language Models to Specialized Domains + + +
Retrieval-augmented generation (RAG) enhances the question-answering (QA) +abilities of large language models (LLMs) by integrating external knowledge. +However, adapting general-purpose RAG systems to specialized fields such as +science and medicine poses unique challenges due to distribution shifts and +limited access to domain-specific data. To tackle this, we propose SimRAG, a +self-training approach that equips the LLM with joint capabilities of question +answering and question generation for domain adaptation. Our method first +fine-tunes the LLM on instruction-following, question-answering, and +search-related data. Then, it prompts the same LLM to generate diverse +domain-relevant questions from unlabeled corpora, with an additional filtering +strategy to retain high-quality synthetic examples. By leveraging these +synthetic examples, the LLM can improve its performance on domain-specific +RAG tasks. Experiments on 11 datasets, spanning two backbone sizes and three +domains, demonstrate that SimRAG outperforms baselines by 1.2%-8.6%. +
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ ELAICHI: Enhancing Low-resource TTS by Addressing Infrequent and + Low-frequency Character Bigrams + + +
+ Recent advancements in Text-to-Speech (TTS) technology have led to +natural-sounding speech for English, primarily due to the availability of +large-scale, high-quality web data. However, many other languages lack access +to such resources, relying instead on limited studio-quality data. This +scarcity results in synthesized speech that often suffers from intelligibility +issues, particularly with low-frequency character bigrams. In this paper, we +propose three solutions to address this challenge. First, we leverage +high-quality data from linguistically or geographically related languages to +improve TTS for the target language. Second, we utilize low-quality Automatic +Speech Recognition (ASR) data recorded in non-studio environments, which is +refined using denoising and speech enhancement models. Third, we apply +knowledge distillation from large-scale models using synthetic data to generate +more robust outputs. Our experiments with Hindi demonstrate significant +reductions in intelligibility issues, as validated by human evaluators. We +propose this methodology as a viable alternative for languages with limited +access to high-quality data, enabling them to collectively benefit from shared +resources. + +
+
+ comment: 11 pages, 1 figure, 3 tables +
+
+
+
+
+ + ☆ Value Residual Learning For Alleviating Attention Concentration In + Transformers + + +
Transformers can capture long-range dependencies using self-attention, +allowing tokens to attend to all others directly. However, stacking multiple +attention layers leads to attention concentration. One natural way to address +this issue is to use cross-layer attention, allowing information from earlier +layers to be directly accessible to later layers. However, this approach is +computationally expensive. To address this problem, we propose the Transformer with +residual value (ResFormer), which approximates cross-layer attention through +adding a residual connection from the values of the first layer to all +subsequent layers. Based on this method, one variant is the Transformer with +single layer value (SVFormer), where all layers share the same value embedding +from the first layer, reducing the KV cache by nearly 50%. Comprehensive empirical +evidence demonstrates that ResFormer mitigates the attention concentration problem +in deeper layers and enhances representation across most layers, outperforming +the vanilla Transformer, DenseFormer, and NeuTRENO in training error as well as +downstream tasks. SVFormer trains significantly faster than the vanilla +Transformer and performs better than other methods like GQA and CLA, with +performance influenced by sequence length and cumulative learning rate. +
+
+
+
+
+ + ☆ Scaling Diffusion Language Models via Adaptation from Autoregressive + Models + + +
+ Diffusion Language Models (DLMs) have emerged as a promising new paradigm for +text generative modeling, potentially addressing limitations of autoregressive +(AR) models. However, current DLMs have been studied at a smaller scale +compared to their AR counterparts and lack fair comparison on language modeling +benchmarks. Additionally, training diffusion models from scratch at scale +remains challenging. Given the prevalence of open-source AR language models, we +propose adapting these models to build text diffusion models. We demonstrate +connections between AR and diffusion modeling objectives and introduce a simple +continual pre-training approach for training diffusion models. Through +systematic evaluation on language modeling, reasoning, and commonsense +benchmarks, we show that we can convert AR models ranging from 127M to 7B +parameters (GPT2 and LLaMA) into diffusion models DiffuGPT and DiffuLLaMA, +using less than 200B tokens for training. Our experimental results reveal that +these models outperform earlier DLMs and are competitive with their AR +counterparts. We release a suite of DLMs (with 127M, 355M, and 7B parameters) +capable of generating fluent text, performing in-context learning, filling in +the middle without prompt re-ordering, and following instructions +\url{https://github.com/HKUNLP/DiffuLLaMA}. + +
+
+ comment: 25 pages. Code: https://github.com/HKUNLP/DiffuLLaMA +
+
+
+
+
+ + ☆ SpeakGer: A meta-data enriched speech corpus of German state and federal + parliaments + + +
+ The application of natural language processing on political texts as well as +speeches has become increasingly relevant in political sciences due to the +ability to analyze large text corpora which cannot be read by a single person. +But such text corpora often lack critical meta information, detailing for +instance the party, age or constituency of the speaker, that can be used to +provide an analysis tailored to more fine-grained research questions. To enable +researchers to answer such questions with quantitative approaches such as +natural language processing, we provide the SpeakGer data set, consisting of +German parliament debates from all 16 federal states of Germany as well as the +German Bundestag from 1947-2023, split into a total of 10,806,105 speeches. +This data set includes rich meta data in form of information on both reactions +from the audience towards the speech as well as information about the speaker's +party, their age, their constituency and their party's political alignment, +which enables a deeper analysis. We further provide three exploratory analyses, +detailing topic shares of different parties throughout time, a descriptive +analysis of the development of the age of an average speaker as well as a +sentiment analysis of speeches of different parties with regards to the +COVID-19 pandemic. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ Understanding Layer Significance in LLM Alignment + + +
+ Aligning large language models (LLMs) through fine-tuning is essential for +tailoring them to specific applications. Therefore, understanding what LLMs +learn during the alignment process is crucial. Recent studies suggest that +alignment primarily adjusts a model's presentation style rather than its +foundational knowledge, indicating that only certain components of the model +are significantly impacted. To delve deeper into LLM alignment, we propose to +identify which layers within LLMs are most critical to the alignment process, +thereby uncovering how alignment influences model behavior at a granular level. +We propose a novel approach to identify the important layers for LLM alignment +(ILA). It involves learning a binary mask for each incremental weight matrix in +the LoRA algorithm, indicating the significance of each layer. ILA consistently +identifies important layers across various alignment datasets, with nearly 90% +overlap even with substantial dataset differences, highlighting fundamental +patterns in LLM alignment. Experimental results indicate that freezing +non-essential layers improves overall model performance, while selectively +tuning the most critical layers significantly enhances fine-tuning efficiency +with minimal performance loss. + +
+
+
+
+
+ + ☆ Understanding When Tree of Thoughts Succeeds: Larger Models Excel in + Generation, Not Discrimination + + +
+ Tree of Thoughts (ToT) is a reasoning strategy for Large Language Models +(LLMs) that employs a generator to suggest reasoning steps and a discriminator +to decide which steps to implement. ToT demonstrates strong performance on +reasoning tasks, often surpassing simple methods such as Input-Output (IO) +prompting and Chain-of-Thought (CoT) reasoning. However, ToT does not +consistently outperform such simpler methods across all models, leaving large +knowledge gaps on the conditions under which ToT is most beneficial. In this +paper, we analyze the roles of the generator and discriminator separately to +better understand the conditions when ToT is beneficial. We find that the +generator plays a more critical role than the discriminator in driving the +success of ToT. While using even a smaller model as the discriminator, scaling +the generator leads to notable improvements in ToT performance, whereas scaling +the discriminator with a fixed generator yields only marginal gains. Our +results show that models across different scales exhibit comparable +discrimination capabilities, yet differ significantly in their generative +performance for ToT. + +
+
+ comment: Code: github.com/mainlp/tot-eval +
+
+
+
+
+ + ☆ OmniFlatten: An End-to-end GPT Model for Seamless Voice Conversation + + +
+ Full-duplex spoken dialogue systems significantly advance over traditional +turn-based dialogue systems, as they allow simultaneous bidirectional +communication, closely mirroring human-human interactions. However, achieving +low latency and natural interactions in full-duplex dialogue systems remains a +significant challenge, especially considering human conversation dynamics such +as interruptions, backchannels, and overlapping speech. In this paper, we +introduce a novel End-to-End GPT-based model OmniFlatten for full-duplex +conversation, capable of effectively modeling the complex behaviors inherent to +natural conversations with low latency. To achieve full-duplex communication +capabilities, we propose a multi-stage post-training scheme that progressively +adapts a text-based large language model (LLM) backbone into a speech-text +dialogue LLM, capable of generating text and speech in real time, without +modifying the architecture of the backbone LLM. The training process comprises +three stages: modality alignment, half-duplex dialogue learning, and +full-duplex dialogue learning. Throughout all training stages, we standardize +the data using a flattening operation, which allows us to unify the training +methods and the model architecture across different modalities and tasks. Our +approach offers a straightforward modeling technique and a promising research +direction for developing efficient and natural end-to-end full-duplex spoken +dialogue systems. Audio samples of dialogues generated by OmniFlatten can be +found at this web site (https://omniflatten.github.io/). + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Leveraging the Domain Adaptation of Retrieval Augmented Generation + Models for Question Answering and Reducing Hallucination + + +
+ While ongoing advancements in Large Language Models have demonstrated +remarkable success across various NLP tasks, Retrieval Augmented Generation +Model stands out to be highly effective on downstream applications like +Question Answering. Recently, RAG-end2end model further optimized the +architecture and achieved notable performance improvements on domain +adaptation. However, the effectiveness of these RAG-based architectures remains +relatively unexplored when fine-tuned on specialized domains such as customer +service for building a reliable conversational AI system. Furthermore, a +critical challenge persists in reducing the occurrence of hallucinations while +maintaining high domain-specific accuracy. In this paper, we investigated the +performance of diverse RAG and RAG-like architectures through domain adaptation +and evaluated their ability to generate accurate and relevant response grounded +in the contextual knowledge base. To facilitate the evaluation of the models, +we constructed a novel dataset HotelConvQA, sourced from wide range of +hotel-related conversations and fine-tuned all the models on our domain +specific dataset. We also addressed a critical research gap on determining the +impact of domain adaptation on reducing hallucinations across different RAG +architectures, an aspect that was not properly measured in prior work. Our +evaluation shows positive results in all metrics by employing domain +adaptation, demonstrating strong performance on QA tasks and providing insights +into their efficacy in reducing hallucinations. Our findings clearly indicate +that domain adaptation not only enhances the models' performance on QA tasks +but also significantly reduces hallucination across all evaluated RAG +architectures. + +
+
+ comment: Initial Version fine-tuned on HotelConvQA +
+
+
+
+
+ + ☆ Latent Structures of Intertextuality in French Fiction + + +
Intertextuality is a key concept in literary theory that challenges +traditional notions of text, signification or authorship. It views texts as +part of a vast intertextual network that is constantly evolving and being +reconfigured. This paper argues that the field of computational literary +studies is the ideal place to conduct a study of intertextuality since we now +have the ability to systematically compare texts with each other. Specifically, +we present a work on a corpus of more than 12,000 French fictions from the +18th, 19th and early 20th century. We focus on evaluating the underlying roles +of two literary notions, sub-genres and the literary canon in the framing of +textuality. The article attempts to operationalize intertextuality using +state-of-the-art contextual language models to encode novels and capture +features that go beyond simple lexical or thematic approaches. Previous +research (Hughes, 2012) supports the existence of a literary "style of a time", +and our findings further reinforce this concept. Our findings also suggest that +both subgenres and canonicity play a significant role in shaping textual +similarities within French fiction. These discoveries point to the importance +of considering genre and canon as dynamic forces that influence the evolution +and intertextual connections of literary works within specific historical +contexts. +
+
+ comment: 13 pages, 6 figures. Computational Humanities Research Conference + 2024 +
+
+
+
+
+ + ☆ Local Contrastive Editing of Gender Stereotypes EMNLP 2024 + + +
+ Stereotypical bias encoded in language models (LMs) poses a threat to safe +language technology, yet our understanding of how bias manifests in the +parameters of LMs remains incomplete. We introduce local contrastive editing +that enables the localization and editing of a subset of weights in a target +model in relation to a reference model. We deploy this approach to identify and +modify subsets of weights that are associated with gender stereotypes in LMs. +Through a series of experiments, we demonstrate that local contrastive editing +can precisely localize and control a small subset (< 0.5%) of weights that +encode gender bias. Our work (i) advances our understanding of how +stereotypical biases can manifest in the parameter space of LMs and (ii) opens +up new avenues for developing parameter-efficient strategies for controlling +model properties in a contrastive manner. + +
+
+ comment: Accepted at EMNLP 2024 +
+
+
+
+
+ + ☆ MojoBench: Language Modeling and Benchmarks for Mojo + + +
+ The recently introduced Mojo programming language (PL) by Modular, has +received significant attention in the scientific community due to its claimed +significant speed boost over Python. Despite advancements in code Large +Language Models (LLMs) across various PLs, Mojo remains unexplored in this +context. To address this gap, we introduce MojoBench, the first framework for +Mojo code generation. MojoBench includes HumanEval-Mojo, a benchmark dataset +designed for evaluating code LLMs on Mojo, and Mojo-Coder, the first LLM +pretrained and finetuned for Mojo code generation, which supports instructions +in 5 natural languages (NLs). Our results show that Mojo-Coder achieves a +30-35% performance improvement over leading models like GPT-4o and +Claude-3.5-Sonnet. Furthermore, we provide insights into LLM behavior with +underrepresented and unseen PLs, offering potential strategies for enhancing +model adaptability. MojoBench contributes to our understanding of LLM +capabilities and limitations in emerging programming paradigms fostering more +robust code generation systems. + +
+
+
+
+
+ + ☆ Dialectal and Low Resource Machine Translation for Aromanian COLING 2025 + + +
+ We present a neural machine translation system that can translate between +Romanian, English, and Aromanian (an endangered Eastern Romance language); the +first of its kind. BLEU scores range from 17 to 32 depending on the direction +and genre of the text. Alongside, we release the biggest known +Aromanian-Romanian bilingual corpus, consisting of 79k cleaned sentence pairs. +Additional tools such as an agnostic sentence embedder (used for both text +mining and automatic evaluation) and a diacritics converter are also presented. +We publicly release our findings and models. Finally, we describe the +deployment of our quantized model at https://arotranslate.com. + +
+
+ comment: 16 pages, 3 figures, 6 tables, submitted to COLING 2025 +
+
+
+
+
+ + ☆ CogSteer: Cognition-Inspired Selective Layer Intervention for Efficient + Semantic Steering in Large Language Models + + +
+ Despite their impressive capabilities, large language models (LLMs) often +lack interpretability and can generate toxic content. While using LLMs as +foundation models and applying semantic steering methods are widely practiced, +we believe that efficient methods should be based on a thorough understanding +of LLM behavior. To this end, we propose using eye movement measures to +interpret LLM behavior across layers. We find that LLMs exhibit patterns +similar to human gaze across layers and different layers function differently. +Inspired by these findings, we introduce a heuristic steering layer selection +and apply it to layer intervention methods via fine-tuning and inference. Using +language toxification and detoxification as test beds, we demonstrate that our +proposed CogSteer methods achieve better results in terms of toxicity scores +while efficiently saving 97% of the computational resources and 60% of the +training time. Our model-agnostic approach can be adopted into various LLMs, +contributing to their interpretability and promoting trustworthiness for safe +deployment. + +
+
+
+
+
+ + ☆ Beware of Calibration Data for Pruning Large Language Models + + +
As large language models (LLMs) are widely applied across various fields, +model compression has become increasingly crucial for reducing costs and +improving inference efficiency. Post-training pruning is a promising method +that does not require resource-intensive iterative training and only needs a +small amount of calibration data to assess the importance of parameters. +Previous research has primarily focused on designing advanced pruning methods, +while the impact of different calibration data on pruning performance still lacks +systematic exploration. We fill this gap and surprisingly observe that the +choice of calibration data matters even more than designing advanced pruning +strategies, especially for high sparsity. Our preliminary exploration also +discloses that using calibration data similar to the training data can yield +better performance. As pre-training data is usually inaccessible for advanced +LLMs, we further provide a self-generating calibration data synthesis strategy +to construct feasible calibration data. We conduct experiments on recent +strong open-source LLMs (e.g., DCLM and LLaMA-3), and the results show that +the proposed method outperforms commonly used calibration data and can +effectively enhance strong pruning methods (e.g., Wanda, OWL). +
+
+ comment: under review +
+
+
+
+
+ + ☆ An Adaptive Framework for Generating Systematic Explanatory Answer in + Online Q&A Platforms + + +
+ Question Answering (QA) systems face challenges in handling complex questions +that require multi-domain knowledge synthesis. Naive RAG models, although +effective at information retrieval, struggle with complex questions that +require comprehensive and in-depth answers. We define this pioneering task as +explanatory answer generation, which entails challenges +such as the need for comprehensive information and logical coherence +within the generated context. To address these issues, we draw on systematic +thinking theory and propose SynthRAG, an innovative framework designed to +enhance QA performance. SynthRAG improves on conventional models by employing +adaptive outlines for dynamic content structuring, generating systematic +information to ensure detailed coverage, and producing customized answers +tailored to specific user inquiries. This structured approach ensures +logical coherence and thorough integration of information, yielding responses +that are both insightful and methodically organized. Empirical evaluations +underscore SynthRAG's effectiveness, demonstrating its superiority in handling +complex questions, overcoming the limitations of naive RAG models, and +significantly improving answer quality and depth. Furthermore, an online +deployment on the Zhihu platform revealed that SynthRAG's answers achieved +notable user engagement, with each response averaging 5.73 upvotes and +surpassing the performance of 79.8% of human contributors, highlighting the +practical relevance and impact of the proposed framework. Our code is available +at https://github.com/czy1999/SynthRAG. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Towards a Similarity-adjusted Surprisal Theory EMNLP 2024 + + +
+ Surprisal theory posits that the cognitive effort required to comprehend a +word is determined by its contextual predictability, quantified as surprisal. +Traditionally, surprisal theory treats words as distinct entities, overlooking +any potential similarity between them. Giulianelli et al. (2023) address this +limitation by introducing information value, a measure of predictability +designed to account for similarities between communicative units. Our work +leverages Ricotta and Szeidl's (2006) diversity index to extend surprisal into +a metric that we term similarity-adjusted surprisal, exposing a mathematical +relationship between surprisal and information value. Similarity-adjusted +surprisal aligns with information value when considering graded similarities +and reduces to standard surprisal when words are treated as distinct. +Experimental results with reading time data indicate that similarity-adjusted +surprisal adds predictive power beyond standard surprisal for certain datasets, +suggesting it serves as a complementary measure of comprehension effort. + +
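For readers who want the relationship in symbols, the following is a minimal sketch: standard surprisal, and one plausible form of a similarity-adjusted variant consistent with the description above. The paper's exact definition, which is derived from Ricotta and Szeidl's diversity index, may differ in detail.

```latex
% Standard surprisal of a word w_t given its context, and an illustrative
% similarity-adjusted variant (not necessarily the paper's exact formula):
\begin{align}
  s(w_t) &= -\log p(w_t \mid \mathbf{w}_{<t}) \\
  s_{\mathrm{sim}}(w_t) &= -\log \sum_{w' \in \mathcal{V}} \operatorname{sim}(w_t, w')\, p(w' \mid \mathbf{w}_{<t})
\end{align}
% With sim(w, w') = 1 iff w = w' (words treated as distinct), the sum collapses to
% p(w_t | w_{<t}), so the adjusted measure reduces to standard surprisal, as stated above.
```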
+
+ comment: EMNLP 2024 main conference proceedings +
+
+
+
+
+ + ☆ Quantifying the Risks of Tool-assisted Rephrasing to Linguistic + Diversity + + +
+ Writing assistants and large language models see widespread use in the +creation of text content. While their effectiveness for individual users has +been evaluated in the literature, little is known about their proclivity to +change language or reduce its richness when adopted by a large user base. In +this paper, we take a first step towards quantifying this risk by measuring the +semantic and vocabulary change enacted by the use of rephrasing tools on a +multi-domain corpus of human-generated text. + +
+
+
+
+
+ + ☆ ReflecTool: Towards Reflection-Aware Tool-Augmented Clinical Agents + + +
+ Large Language Models (LLMs) have shown promising potential in the medical +domain, assisting with tasks like clinical note generation and patient +communication. However, current LLMs are limited to text-based communication, +hindering their ability to interact with diverse forms of information in +clinical environments. Although existing clinical agents succeed at diverse signal +interaction, they are oriented to a single clinical scenario and hence fail in +broader applications. To evaluate clinical agents holistically, we propose +ClinicalAgent Bench (CAB), a comprehensive medical agent benchmark consisting +of 18 tasks across five key realistic clinical dimensions. Building on this, we +introduce ReflecTool, a novel framework that excels at utilizing +domain-specific tools in two stages. The first optimization stage +progressively enlarges a long-term memory by saving successful solving +processes and tool-wise experience of agents on a tiny pre-defined training +set. In the subsequent inference stage, ReflecTool searches the built long-term memory for supportive +successful demonstrations to guide the tool +selection strategy, and a verifier improves the tool usage according to the +tool-wise experience with two verification methods: iterative refinement and +candidate selection. Extensive experiments on ClinicalAgent Benchmark +demonstrate that ReflecTool surpasses pure LLMs by more than 10 points +and well-established agent-based methods by 3 points, highlighting its +adaptability and effectiveness in solving complex clinical tasks. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Markov Chain of Thought for Efficient Mathematical Reasoning + + +
+ Multi-step Chain of Thought (CoT) benefits from the logical structure of +the reasoning steps and task-specific actions, significantly enhancing the +mathematical reasoning capabilities of large language models. As long CoT becomes +prevalent, the number of reasoning steps exceeds manageable token limits and +leads to higher computational demands. Inspired by the fundamental logic of +human cognition, "derive, then reduce", we conceptualize the standard +multi-step CoT as a novel Markov Chain of Thought (MCoT). In this study, we +consider the mathematical reasoning task, defining each reasoning step as text +accompanied by a Python code snippet. To facilitate a longer reasoning path, +self-correction is enabled through interactions with the code interpreter. Our +MCoT aims to compress previous reasoning steps into a simplified question, +enabling efficient next-step inference without relying on a lengthy KV cache. +In our experiments, we curate the MCoTInstruct dataset, and the +empirical results indicate that MCoT not only significantly enhances efficiency +but also maintains comparable accuracy. While much remains to be explored, this +work paves the way for exploring the long CoT reasoning abilities of LLMs. + +
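A minimal sketch of the "derive, then reduce" loop described above, written in Python with hypothetical callables standing in for the LLM and the code interpreter (the paper's actual prompting and stopping criteria are not reproduced here): each step conditions only on a compressed state rather than on the full reasoning history.

```python
from typing import Callable

def markov_chain_of_thought(
    question: str,
    generate: Callable[[str], str],   # hypothetical LLM call: prompt -> text (+ code snippet)
    run_code: Callable[[str], str],   # hypothetical interpreter call: step text -> execution result
    max_steps: int = 8,
) -> str:
    """Schematic MCoT loop: derive one step, execute its code, then reduce everything
    derived so far into a new simplified question so the next step does not need the
    full history (and hence no long KV cache)."""
    state = question
    for _ in range(max_steps):
        step = generate(f"Solve one step with reasoning and a Python snippet:\n{state}")
        result = run_code(step)       # execution feedback also enables self-correction
        if "FINAL ANSWER" in step:    # toy stopping criterion for this sketch
            return step
        # Reduce: fold the step and its result into a single, simpler question.
        state = generate(
            "Rewrite the following as one self-contained, simplified question:\n"
            f"{state}\n{step}\nExecution result: {result}"
        )
    return state
```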
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ LMLPA: Language Model Linguistic Personality Assessment + + +
+ Large Language Models (LLMs) are increasingly used in everyday life and +research. One of the most common use cases is conversational interactions, +enabled by the language generation capabilities of LLMs. Just as between two +humans, a conversation between an LLM-powered entity and a human depends on the +personality of the conversants. However, measuring the personality of a given +LLM is currently a challenge. This paper introduces the Language Model +Linguistic Personality Assessment (LMLPA), a system designed to evaluate the +linguistic personalities of LLMs. Our system helps to understand LLMs' language +generation capabilities by quantitatively assessing the distinct personality +traits reflected in their linguistic outputs. Unlike traditional human-centric +psychometrics, the LMLPA adapts a personality assessment questionnaire, +specifically the Big Five Inventory, to align with the operational capabilities +of LLMs, and also incorporates the findings from previous language-based +personality measurement literature. To mitigate sensitivity to the order of +options, our questionnaire is designed to be open-ended, resulting in textual +answers. Thus, the AI rater is needed to transform ambiguous personality +information from text responses into clear numerical indicators of personality +traits. Utilising Principal Component Analysis and reliability validations, our +findings demonstrate that LLMs possess distinct personality traits that can be +effectively quantified by the LMLPA. This research contributes to +Human-Computer Interaction and Human-Centered AI, providing a robust framework +for future studies to refine AI personality assessments and expand their +applications in multiple areas, including education and manufacturing. + +
+
+
+
+
+ + ☆ Graphusion: A RAG Framework for Knowledge Graph Construction with a + Global Perspective + + +
+ Knowledge Graphs (KGs) are crucial in the field of artificial intelligence +and are widely used in downstream tasks, such as question-answering (QA). The +construction of KGs typically requires significant effort from domain experts. +Large Language Models (LLMs) have recently been used for Knowledge Graph +Construction (KGC). However, most existing approaches focus on a local +perspective, extracting knowledge triplets from individual sentences or +documents, missing a fusion process to combine the knowledge in a global KG. +This work introduces Graphusion, a zero-shot KGC framework that builds KGs from free text. It +contains three steps: in Step 1, we extract a list of seed entities using topic +modeling to ensure that the final KG includes the most relevant entities; in Step 2, +we conduct candidate triplet extraction using LLMs; in Step 3, we design a +novel fusion module that provides a global view of the extracted knowledge, +incorporating entity merging, conflict resolution, and novel triplet discovery. +Results show that Graphusion achieves scores of 2.92 and 2.37 out of 3 for +entity extraction and relation recognition, respectively. Moreover, we showcase +how Graphusion could be applied to the Natural Language Processing (NLP) domain +and validate it in an educational scenario. Specifically, we introduce TutorQA, +a new expert-verified benchmark for QA, comprising six tasks and a total of +1,200 QA pairs. Using the Graphusion-constructed KG, we achieve a significant +improvement on the benchmark, for example, a 9.2% accuracy improvement on +sub-graph completion. + +
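As a rough illustration of the three-step pipeline described above, here is a schematic Python sketch; the callables are hypothetical stand-ins (the abstract does not specify the topic model, the prompts, or the fusion logic), so this shows the data flow only.

```python
from typing import Callable, List, Tuple

Triplet = Tuple[str, str, str]  # (head entity, relation, tail entity)

def graphusion_style_pipeline(
    documents: List[str],
    seed_entities: Callable[[List[str]], List[str]],              # Step 1: topic-model seeds (hypothetical)
    extract_triplets: Callable[[str, List[str]], List[Triplet]],  # Step 2: LLM extraction (hypothetical)
    fuse: Callable[[List[Triplet]], List[Triplet]],               # Step 3: global fusion (hypothetical)
) -> List[Triplet]:
    """Schematic zero-shot KGC flow: seed entities guide per-document candidate
    triplet extraction, and a final fusion pass merges entities, resolves conflicts,
    and discovers novel triplets for a single global KG."""
    seeds = seed_entities(documents)
    candidates: List[Triplet] = []
    for doc in documents:
        candidates.extend(extract_triplets(doc, seeds))
    return fuse(candidates)
```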
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2407.10794 +
+
+
+
+
+ + ☆ Cross-model Control: Improving Multiple Large Language Models in + One-time Training NeurIPS 2024 + + +
+ The number of large language models (LLMs) with varying parameter scales and +vocabularies is increasing. While they deliver powerful performance, they also +face a set of common optimization needs to meet specific requirements or +standards, such as instruction following or avoiding the output of sensitive +information from the real world. However, how to transfer the fine-tuning outcomes +of one model to other models to reduce training costs remains a challenge. To +bridge this gap, we introduce Cross-model Control (CMC), a method that improves +multiple LLMs in one-time training with a portable tiny language model. +Specifically, we have observed that the logit shift before and after +fine-tuning is remarkably similar across different models. Based on this +insight, we incorporate a tiny language model with a minimal number of +parameters. By training alongside a frozen template LLM, the tiny model gains +the capability to alter the logits output by the LLMs. To make this tiny +language model applicable to models with different vocabularies, we propose a +novel token mapping strategy named PM-MinED. We have conducted extensive +experiments on instruction tuning and unlearning tasks, demonstrating the +effectiveness of CMC. Our code is available at https://github.com/wujwyi/CMC. + +
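To make the core idea concrete, here is a minimal PyTorch sketch of applying a tiny model's predicted logit shift to a frozen LLM at decoding time. The `token_map` tensor is a placeholder for the paper's PM-MinED vocabulary mapping, whose details are not reproduced here, and the function names are ours.

```python
import torch

@torch.no_grad()
def shifted_logits(
    frozen_llm_logits: torch.Tensor,  # (batch, vocab_llm) from any frozen LLM
    tiny_lm_logits: torch.Tensor,     # (batch, vocab_tiny) from the portable tiny model
    token_map: torch.Tensor,          # (vocab_llm,) index of each LLM token in the tiny vocab
) -> torch.Tensor:
    """Add the fine-tuning-induced logit shift predicted by a tiny model to a frozen
    LLM's logits. token_map stands in for a cross-vocabulary mapping such as PM-MinED."""
    batch = tiny_lm_logits.size(0)
    shift = tiny_lm_logits.gather(1, token_map.unsqueeze(0).expand(batch, -1))
    return frozen_llm_logits + shift

# Toy usage with identical vocabularies (identity mapping).
vocab = 100
logits_llm, logits_tiny = torch.randn(2, vocab), torch.randn(2, vocab)
out = shifted_logits(logits_llm, logits_tiny, torch.arange(vocab))
```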
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ MM-Eval: A Multilingual Meta-Evaluation Benchmark for LLM-as-a-Judge and + Reward Models + + +
+ Large language models (LLMs) are commonly used as evaluators in tasks (e.g., +reward modeling, LLM-as-a-judge), where they act as proxies for human +preferences or judgments. This leads to the need for meta-evaluation: +evaluating the credibility of LLMs as evaluators. However, existing benchmarks +primarily focus on English, offering limited insight into LLMs' effectiveness +as evaluators in non-English contexts. To address this, we introduce MM-Eval, a +multilingual meta-evaluation benchmark that covers 18 languages across six +categories. MM-Eval evaluates various dimensions, including language-specific +challenges like linguistics and language hallucinations. Evaluation results +show that both proprietary and open-source language models have considerable +room for improvement. Further analysis reveals a tendency for these models to +assign middle-ground scores to low-resource languages. We publicly release our +benchmark and code. + +
+
+ comment: work in progress +
+
+
+
+
+ + ☆ Differentially Private Learning Needs Better Model Initialization and + Self-Distillation + + +
+ Differentially private SGD (DPSGD) enables privacy-preserving training of +language models, but often reduces utility, diversity, and linguistic quality. +We introduce DPRefine, a three-phase method that initializes a model using data +synthesis from a small pre-trained LM with rigorous filtering, applies DP +finetuning on private data, and performs self-distillation to refine outputs. +This approach significantly outperforms vanilla DPSGD, with AlpacaEval +preferring DPRefine's generations in 78.4% of cases across all datasets. Our +analysis reveals that DPRefine reduces linguistic errors in generated text by +84.0%, mitigating grammar and spelling errors commonly associated with DPSGD. +It also reduces inconsistencies of non-private models, such as hallucinated +details and misattributed quotes. We find that small models like GPT-2 can be +effective for initialization and distillation, highlighting their potential in +enabling scalable and efficient deployment of privacy-preserving language models. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ ESpeW: Robust Copyright Protection for LLM-based EaaS via + Embedding-Specific Watermark + + +
+ Embeddings as a Service (EaaS) is emerging as a crucial component of AI +applications. Unfortunately, EaaS is vulnerable to model extraction attacks, +highlighting the urgent need for copyright protection. Although some preliminary +works propose applying embedding watermarks to protect EaaS, recent research +reveals that these watermarks can be easily removed. Hence, it is crucial to +inject robust watermarks resistant to watermark removal attacks. Existing +watermarking methods typically inject a target embedding into embeddings +through linear interpolation when the text contains triggers. However, this +mechanism results in each watermarked embedding having the same component, +which makes the watermark easy to identify and eliminate. Motivated by this, in +this paper, we propose a novel embedding-specific watermarking (ESpeW) +mechanism to offer robust copyright protection for EaaS. Our approach involves +injecting unique, yet readily identifiable watermarks into each embedding. +Watermarks inserted by ESpeW are designed to maintain a significant distance +from one another and to avoid sharing common components, thus making it +significantly more challenging to remove the watermarks. Extensive experiments +on four popular datasets demonstrate that ESpeW can even watermark successfully +against a highly aggressive removal strategy without sacrificing the quality of +embeddings. + +
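The contrast described above can be sketched in a few lines of Python. The first function is the linear-interpolation scheme the abstract attributes to prior work; the second is only a schematic, keyed per-embedding variant we use for illustration, not the actual ESpeW construction.

```python
import hashlib
import numpy as np

def interpolation_watermark(embedding: np.ndarray, target: np.ndarray, alpha: float = 0.2) -> np.ndarray:
    """Prior scheme: interpolate toward one shared target embedding when a trigger is
    present. Every watermarked embedding then contains the same component, which is
    what makes the mark easy to identify and remove."""
    return (1.0 - alpha) * embedding + alpha * target

def per_embedding_watermark(embedding: np.ndarray, secret_key: bytes, alpha: float = 0.2) -> np.ndarray:
    """Schematic embedding-specific alternative (illustrative only): the injected
    direction is derived per embedding from a keyed hash, so watermarked embeddings
    do not share a common direction."""
    seed = int.from_bytes(hashlib.sha256(secret_key + embedding.tobytes()).digest()[:8], "big")
    direction = np.random.default_rng(seed).standard_normal(embedding.shape)
    direction /= np.linalg.norm(direction)
    return (1.0 - alpha) * embedding + alpha * np.linalg.norm(embedding) * direction
```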
+
+
+
+
+ + ☆ ProtoLens: Advancing Prototype Learning for Fine-Grained + Interpretability in Text Classification + + +
+ Deep neural networks have achieved remarkable performance in various +text-based tasks but often lack interpretability, making them less suitable for +applications where transparency is critical. To address this, we propose +ProtoLens, a novel prototype-based model that provides fine-grained, +sub-sentence level interpretability for text classification. ProtoLens uses a +Prototype-aware Span Extraction module to identify relevant text spans +associated with learned prototypes and a Prototype Alignment mechanism to +ensure prototypes are semantically meaningful throughout training. By aligning +the prototype embeddings with human-understandable examples, ProtoLens provides +interpretable predictions while maintaining competitive accuracy. Extensive +experiments demonstrate that ProtoLens outperforms both prototype-based and +non-interpretable baselines on multiple text classification benchmarks. Code +and data are available at +\url{https://anonymous.4open.science/r/ProtoLens-CE0B/}. + +
+
+
+
+
+ + ☆ Responsible Multilingual Large Language Models: A Survey of Development, + Applications, and Societal Impact + + +
+ Multilingual Large Language Models (MLLMs) represent a pivotal advancement in +democratizing artificial intelligence across linguistic boundaries. While +theoretical foundations are well-established, practical implementation +guidelines remain scattered. This work bridges this gap by providing a +comprehensive end-to-end framework for developing and deploying MLLMs in +production environments. We make three distinctive contributions: First, we +present an actionable pipeline from data pre-processing through deployment, +integrating insights from academic research and industrial applications. +Second, using Llama2 as a case study, we provide detailed optimization +strategies for enhancing multilingual capabilities, including curriculum +learning approaches for balancing high-resource and low-resource languages, +tokenization strategies, and effective sampling methods. Third, we offer an +interdisciplinary analysis that considers technical, linguistic, and cultural +perspectives in MLLM development. Our findings reveal critical challenges in +supporting linguistic diversity, with 88.38% of world languages categorized as +low-resource, affecting over a billion speakers. We examine practical solutions +through real-world applications in customer service, search engines, and +machine translation. By synthesizing theoretical frameworks with +production-ready implementation strategies, this survey provides essential +guidance for practitioners and researchers working to develop more inclusive +and effective multilingual AI systems. + +
+
+
+
+
+ + ☆ Navigate Complex Physical Worlds via Geometrically Constrained LLM + + +
+ This study investigates the potential of Large Language Models (LLMs) for +reconstructing and constructing the physical world solely based on textual +knowledge. It explores the impact of model performance on spatial understanding +abilities. To enhance the comprehension of geometric and spatial relationships +in the complex physical world, the study introduces a set of geometric +conventions and develops a workflow based on multi-layer graphs and multi-agent +system frameworks. It examines how LLMs achieve multi-step and multi-objective +geometric inference in a spatial environment using multi-layer graphs under +unified geometric conventions. Additionally, the study employs a genetic +algorithm, inspired by large-scale model knowledge, to solve geometric +constraint problems. In summary, this work innovatively explores the +feasibility of using text-based LLMs as physical world builders and designs a +workflow to enhance their capabilities. + +
+
+
+
+
+ + ☆ MobileSafetyBench: Evaluating Safety of Autonomous Agents in Mobile + Device Control + + +
+ Autonomous agents powered by large language models (LLMs) show promising +potential in assistive tasks across various domains, including mobile device +control. As these agents interact directly with personal information and device +settings, ensuring their safe and reliable behavior is crucial to prevent +undesirable outcomes. However, no benchmark exists for standardized evaluation +of the safety of mobile device-control agents. In this work, we introduce +MobileSafetyBench, a benchmark designed to evaluate the safety of +device-control agents within a realistic mobile environment based on Android +emulators. We develop a diverse set of tasks involving interactions with +various mobile applications, including messaging and banking applications. To +clearly evaluate safety apart from general capabilities, we design separate +tasks measuring safety and tasks evaluating helpfulness. The safety tasks +challenge agents with managing potential risks prevalent in daily life and +include tests to evaluate robustness against indirect prompt injections. Our +experiments demonstrate that while baseline agents, based on state-of-the-art +LLMs, perform well in executing helpful tasks, they show poor performance in +safety tasks. To mitigate these safety concerns, we propose a prompting method +that encourages agents to prioritize safety considerations. While this method +shows promise in promoting safer behaviors, there is still considerable room +for improvement to fully earn user trust. This highlights the urgent need for +continued research to develop more robust safety mechanisms in mobile +environments. We open-source our benchmark at: +https://mobilesafetybench.github.io/. + +
+
+
+
+
+ + ☆ Large Language Models Still Exhibit Bias in Long Text + + +
+ Existing fairness benchmarks for large language models (LLMs) primarily focus +on simple tasks, such as multiple-choice questions, overlooking biases that may +arise in more complex scenarios like long-text generation. To address this gap, +we introduce the Long Text Fairness Test (LTF-TEST), a framework that evaluates +biases in LLMs through essay-style prompts. LTF-TEST covers 14 topics and 10 +demographic axes, including gender and race, resulting in 11,948 samples. By +assessing both model responses and the reasoning behind them, LTF-TEST uncovers +subtle biases that are difficult to detect in simple responses. In our +evaluation of five recent LLMs, including GPT-4o and LLaMa3, we identify two +key patterns of bias. First, these models frequently favor certain demographic +groups in their responses. Second, they show excessive sensitivity toward +traditionally disadvantaged groups, often providing overly protective responses +while neglecting others. To mitigate these biases, we propose FT-REGARD, a +finetuning approach that pairs biased prompts with neutral responses. FT-REGARD +reduces gender bias by 34.6% and improves performance by 1.4 percentage points +on the BBQ benchmark, offering a promising approach to addressing biases in +long-text generation tasks. + +
+
+ comment: 22 pages, 38 figures, NeurIPS (SoLaR Workshop) +
+
+
+
+
+ + ☆ Mechanisms of Symbol Processing for In-Context Learning in Transformer + Networks + + +
+ Large Language Models (LLMs) have demonstrated impressive abilities in symbol +processing through in-context learning (ICL). This success flies in the face of +decades of predictions that artificial neural networks cannot master abstract +symbol manipulation. We seek to understand the mechanisms that can enable +robust symbol processing in transformer networks, illuminating both the +unanticipated success, and the significant limitations, of transformers in +symbol processing. Borrowing insights from symbolic AI on the power of +Production System architectures, we develop a high-level language, PSL, that +allows us to write symbolic programs to do complex, abstract symbol processing, +and create compilers that precisely implement PSL programs in transformer +networks which are, by construction, 100% mechanistically interpretable. We +demonstrate that PSL is Turing Universal, so the work can inform the +understanding of transformer ICL in general. The type of transformer +architecture that we compile from PSL programs suggests a number of paths for +enhancing transformers' capabilities at symbol processing. (Note: The first +section of the paper gives an extended synopsis of the entire paper.) + +
+
+ comment: 101 pages (including 30 pages of Appendices), 18 figures +
+
+
+
+
+ + ☆ BadFair: Backdoored Fairness Attacks with Group-conditioned Triggers EMNLP 2024 + + +
+ Attacking fairness is crucial because compromised models can introduce biased +outcomes, undermining trust and amplifying inequalities in sensitive +applications like hiring, healthcare, and law enforcement. This highlights the +urgent need to understand how fairness mechanisms can be exploited and to +develop defenses that ensure both fairness and robustness. We introduce +BadFair, a novel backdoored fairness attack methodology. BadFair stealthily +crafts a model that operates with accuracy and fairness under regular +conditions but, when activated by certain triggers, discriminates and produces +incorrect results for specific groups. This type of attack is particularly +stealthy and dangerous, as it circumvents existing fairness detection methods, +maintaining an appearance of fairness in normal use. Our findings reveal that +BadFair achieves a more than 85% attack success rate in attacks aimed at target +groups on average while only incurring a minimal accuracy loss. Moreover, it +consistently exhibits a significant discrimination score, distinguishing +between pre-defined target and non-target attacked groups across various +datasets and models. + +
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ☆ VoiceTextBlender: Augmenting Large Language Models with Speech + Capabilities via Single-Stage Joint Speech-Text Supervised Fine-Tuning + + +
+ Recent studies have augmented large language models (LLMs) with speech +capabilities, leading to the development of speech language models (SpeechLMs). +Earlier SpeechLMs focused on single-turn speech-based question answering (QA), +where user input comprised a speech context and a text question. More recent +studies have extended this to multi-turn conversations, though they often +require complex, multi-stage supervised fine-tuning (SFT) with diverse data. +Another critical challenge with SpeechLMs is catastrophic forgetting, where +models optimized for speech tasks suffer significant degradation in text-only +performance. To mitigate these issues, we propose a novel single-stage joint +speech-text SFT approach on the low-rank adaptation (LoRA) of the LLM backbone. +Our joint SFT combines text-only SFT data with three types of speech-related +data: speech recognition and translation, speech-based QA, and mixed-modal SFT. +Compared to previous SpeechLMs with 7B or 13B parameters, our 3B model +demonstrates superior performance across various speech benchmarks while +preserving the original capabilities on text-only tasks. Furthermore, our model +shows emergent abilities in effectively handling previously unseen prompts and +tasks, including multi-turn, mixed-modal inputs. + +
+
+
+
+
+ + ☆ Which Client is Reliable?: A Reliable and Personalized Prompt-based + Federated Learning for Medical Image Question Answering + + +
+ Conventional medical artificial intelligence (AI) models face barriers in +clinical application and ethical issues owing to their inability to handle the +privacy-sensitive characteristics of medical data. We present a novel +personalized federated learning (pFL) method for medical visual question +answering (VQA) models, addressing privacy reliability challenges in the +medical domain. Our method introduces learnable prompts into a Transformer +architecture to efficiently train it on diverse medical datasets without +massive computational costs. Then we introduce a reliable client VQA model that +incorporates Dempster-Shafer evidence theory to quantify uncertainty in +predictions, enhancing the model's reliability. Furthermore, we propose a novel +inter-client communication mechanism that uses maximum likelihood estimation to +balance accuracy and uncertainty, fostering efficient integration of insights +across clients. + +
+
+
+
+
+ + ☆ Is artificial intelligence still intelligence? LLMs generalize to novel + adjective-noun pairs, but don't mimic the full human distribution + + +
+ Inferences from adjective-noun combinations like "Is artificial intelligence +still intelligence?" provide a good test bed for LLMs' understanding of meaning +and compositional generalization capability, since there are many combinations +which are novel to both humans and LLMs but nevertheless elicit convergent +human judgments. We study a range of LLMs and find that the largest models we +tested are able to draw human-like inferences when the inference is determined +by context and can generalize to unseen adjective-noun combinations. We also +propose three methods to evaluate LLMs on these inferences out of context, +where there is a distribution of human-like answers rather than a single +correct answer. We find that LLMs show a human-like distribution on at most +75\% of our dataset, which is promising but still leaves room for improvement. + +
+
+ comment: 9 pages (23 pages with appendix). Accepted to GenBench 2024 +
+
+
+
+
+ + ♻ ☆ MADial-Bench: Towards Real-world Evaluation of Memory-Augmented Dialogue + Generation NAACL 2025 + + +
+ Long-term memory is important for chatbots and dialogue systems (DS) to +create consistent and human-like conversations, evidenced by numerous developed +memory-augmented DS (MADS). To evaluate the effectiveness of such MADS, +existing commonly used evaluation metrics, like retrieval accuracy and +perplexity (PPL), mainly focus on query-oriented factualness and language +quality assessment. However, these metrics often lack practical value. +Moreover, the evaluation dimensions are insufficient for human-like assessment +in DS. Regarding memory-recalling paradigms, current evaluation schemes only +consider passive memory retrieval while ignoring diverse memory recall with +rich triggering factors, e.g., emotions and surroundings, which can be +essential in emotional support scenarios. To bridge the gap, we construct a +novel Memory-Augmented Dialogue Benchmark (MADail-Bench) covering various +memory-recalling paradigms based on cognitive science and psychology theories. +The benchmark assesses two tasks separately: memory retrieval and memory +recognition with the incorporation of both passive and proactive memory recall +data. We introduce new scoring criteria to the evaluation, including memory +injection, emotion support (ES) proficiency, and intimacy, to comprehensively +assess generated responses. Results from cutting-edge embedding models and +large language models on this benchmark indicate the potential for further +advancement. Extensive testing further reveals correlations between memory +injection, ES proficiency, and intimacy. + +
+
+ comment: Submitted to NAACL 2025 +
+
+
+
+
+ + ♻ ☆ Does Generative AI speak Nigerian-Pidgin?: Issues about + Representativeness and Bias for Multilingualism in LLMs + + +
+ Nigeria is a multilingual country with 500+ languages. Naija is a +Nigerian-Pidgin spoken by approx. 120M speakers in Nigeria and it is a mixed +language (e.g., English, Portuguese, Yoruba, Hausa and Igbo). Although it has +mainly been a spoken language until recently, there are now various platforms +publishing exclusively in Naija such as Naija Wikipedia. However, it is hard for +non-native speakers to distinguish Naija from a larger pidgin language spoken across West +Africa known as West African Pidgin English (WAPE), which is more simplified +and understandable to a wider audience in Ghana, Nigeria, and Cameroon. The BBC news +platform publishes exclusively in WAPE to cater for several countries in West +Africa. In our paper, we show through statistical analyses and Machine +Translation experiments that these two creole varieties do not represent each +other (i.e., there are linguistic differences in word order and vocabulary) and +that Generative AI operates only based on WAPE. In other words, Naija is +under-represented in Generative AI, and it is hard to teach it to LLMs with only a few +examples. + +
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ Conditional Language Policy: A General Framework for Steerable + Multi-Objective Finetuning EMNLP 2024 + + +
+ Reward-based finetuning is crucial for aligning language policies with +intended behaviors (e.g., creativity and safety). A key challenge is to develop +steerable language models that trade off multiple (conflicting) objectives in a +flexible and efficient manner. This paper presents Conditional Language Policy +(CLP), a general framework for finetuning language models on multiple +objectives. Building on techniques from multi-task training and +parameter-efficient finetuning, CLP learns steerable models that effectively +trade off conflicting objectives at inference time. Notably, this does not +require training or maintaining multiple models to achieve different trade-offs +between the objectives. Through extensive experiments and ablations on two +summarization datasets, we show that CLP learns steerable language models that +outperform and Pareto-dominate the existing approaches for multi-objective +finetuning. + +
+
+ comment: 40 pages. Findings of EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ STAR: SocioTechnical Approach to Red Teaming Language Models + + +
+ This research introduces STAR, a sociotechnical framework that improves on +current best practices for red teaming the safety of large language models. STAR +makes two key contributions: First, it enhances steerability by generating +parameterised instructions for human red teamers, leading to improved coverage +of the risk surface. Parameterised instructions also provide more detailed +insights into model failures at no increased cost. Second, STAR improves signal +quality by matching demographics to assess harms for specific groups, resulting +in more sensitive annotations. STAR further employs a novel step of arbitration +to leverage diverse viewpoints and improve label reliability, treating +disagreement not as noise but as a valuable contribution to signal quality. + +
+
+ comment: 8 pages, 5 figures, 5 pages appendix. * denotes equal contribution +
+
+
+
+
+ + ♻ ☆ Proof of Thought : Neurosymbolic Program Synthesis allows Robust and + Interpretable Reasoning NeurIPS + 2024 + + +
+ Large Language Models (LLMs) have revolutionized natural language processing, +yet they struggle with inconsistent reasoning, particularly in novel domains +and complex logical sequences. This research introduces Proof of Thought, a +framework that enhances the reliability and transparency of LLM outputs. Our +approach bridges LLM-generated ideas with formal logic verification, employing +a custom interpreter to convert LLM outputs into First Order Logic constructs +for theorem prover scrutiny. Central to our method is an intermediary +JSON-based Domain-Specific Language, which by design balances precise logical +structures with intuitive human concepts. This hybrid representation enables +both rigorous validation and accessible human comprehension of LLM reasoning +processes. Key contributions include a robust type system with sort management +for enhanced logical integrity, explicit representation of rules for clear +distinction between factual and inferential knowledge, and a flexible +architecture that allows for easy extension to various domain-specific +applications. We demonstrate Proof of Thought's effectiveness through +benchmarking on StrategyQA and a novel multimodal reasoning task, showing +improved performance in open-ended scenarios. By providing verifiable and +interpretable results, our technique addresses critical needs for AI system +accountability and sets a foundation for human-in-the-loop oversight in +high-stakes domains. + +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) System 2 Reasoning At Scale Workshop +
+
+
+
+
+ + ♻ ☆ AlleNoise: large-scale text classification benchmark dataset with + real-world label noise + + +
+ Label noise remains a challenge for training robust classification models. +Most methods for mitigating label noise have been benchmarked using primarily +datasets with synthetic noise. While the need for datasets with realistic noise +distribution has partially been addressed by web-scraped benchmarks such as +WebVision and Clothing1M, those benchmarks are restricted to the computer +vision domain. With the growing importance of Transformer-based models, it is +crucial to establish text classification benchmarks for learning with noisy +labels. In this paper, we present AlleNoise, a new curated text classification +benchmark dataset with real-world instance-dependent label noise, containing +over 500,000 examples across approximately 5,600 classes, complemented with a +meaningful, hierarchical taxonomy of categories. The noise distribution comes +from actual users of a major e-commerce marketplace, so it realistically +reflects the semantics of human mistakes. In addition to the noisy labels, we +provide human-verified clean labels, which help to get a deeper insight into +the noise distribution, unlike web-scraped datasets typically used in the +field. We demonstrate that a representative selection of established methods +for learning with noisy labels is inadequate to handle such real-world noise. +In addition, we show evidence that these algorithms do not alleviate excessive +memorization. As such, with AlleNoise, we set the bar high for the development +of label noise methods that can handle real-world label noise in text +classification tasks. The code and dataset are available for download at +https://github.com/allegro/AlleNoise. + +
+
+
+
+
+ + ♻ ☆ Annotator-Centric Active Learning for Subjective NLP Tasks EMNLP2024 + + +
+ Active Learning (AL) addresses the high costs of collecting human annotations +by strategically annotating the most informative samples. However, for +subjective NLP tasks, incorporating a wide range of perspectives in the +annotation process is crucial to capture the variability in human judgments. We +introduce Annotator-Centric Active Learning (ACAL), which incorporates an +annotator selection strategy following data sampling. Our objective is +two-fold: 1) to efficiently approximate the full diversity of human judgments, +and 2) to assess model performance using annotator-centric metrics, which value +minority and majority perspectives equally. We experiment with multiple +annotator selection strategies across seven subjective NLP tasks, employing +both traditional and novel, human-centered evaluation metrics. Our findings +indicate that ACAL improves data efficiency and excels in annotator-centric +performance evaluations. However, its success depends on the availability of a +sufficiently large and diverse pool of annotators to sample from. + +
+
+ comment: Accepted at EMNLP2024 +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Truly Grasp Mathematics? An Empirical + Exploration + + +
+ Despite their proficiency in math tasks, the mechanisms underlying LLMs' +mathematical reasoning abilities remain a subject of debate. Recent studies +suggest that chain-of-thought (CoT) prompts can bolster mathematical reasoning +by encouraging LLMs to employ human-like logical reasoning (System 2), enabling +them to excel on the Cognitive Reflection Test (CRT). To assess whether LLMs +genuinely possess System 2-like logical reasoning, we introduced targeted +modifications to CRT problems. Our findings reveal that, despite the use of CoT +prompts, mainstream LLMs, including the latest o1-preview model, continue to +exhibit a significant error rate. Further analysis indicates that they +predominantly rely on System 1-like intuitive reasoning and pattern matching +derived from training data, rather than demonstrating mastery of mathematical +thinking. This discovery challenges the prevailing notion that LLMs possess +genuine logical reasoning abilities and that CoT can enhance them. +Consequently, this work may temper overly optimistic projections regarding +LLMs' advancement toward artificial general intelligence. + +
+
+
+
+
+ + ♻ ☆ Linear Adversarial Concept Erasure ICML 2022 + + +
+ Modern neural models trained on textual data rely on pre-trained +representations that emerge without direct supervision. As these +representations are increasingly being used in real-world applications, the +inability to control their content becomes an increasingly important +problem. We formulate the problem of identifying and erasing a linear subspace +that corresponds to a given concept, in order to prevent linear predictors from +recovering the concept. We model this problem as a constrained, linear maximin +game, and show that existing solutions are generally not optimal for this task. +We derive a closed-form solution for certain objectives, and propose a convex +relaxation that works well for others. When evaluated in the context +of binary gender removal, the method recovers a low-dimensional subspace whose +removal mitigates bias by intrinsic and extrinsic evaluation. We show that the +method is highly expressive, effectively mitigating bias in deep nonlinear +classifiers while maintaining tractability and interpretability. + +
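Once such a subspace has been identified, erasing it amounts to projecting representations onto its orthogonal complement. The following NumPy sketch illustrates only that projection step; how the paper actually finds the subspace, via the constrained maximin game, is not shown, and the function name is ours.

```python
import numpy as np

def erase_linear_subspace(X: np.ndarray, basis: np.ndarray) -> np.ndarray:
    """Project representations onto the orthogonal complement of a concept subspace.
    X: (n_samples, dim) representations; basis: (dim, k) orthonormal basis of the
    subspace found to encode the concept. After projection, no linear predictor can
    recover the concept from the erased directions."""
    P = basis @ basis.T          # (dim, dim) projection onto the concept subspace
    return X - X @ P             # equivalent to X @ (I - P)

# Toy usage: remove a single known direction v from random representations.
rng = np.random.default_rng(0)
v = rng.standard_normal((8, 1))
v /= np.linalg.norm(v)
X = rng.standard_normal((100, 8))
X_erased = erase_linear_subspace(X, v)
assert np.allclose(X_erased @ v, 0.0)    # the concept direction is no longer linearly recoverable
```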
+
+ comment: Accepted in ICML 2022; a revised version +
+
+
+
+
+ + ♻ ☆ Fast and Slow Generating: An Empirical Study on Large and Small Language + Models Collaborative Decoding + + +
+ Large Language Models (LLMs) exhibit impressive capabilities across various +applications but encounter substantial challenges such as high inference +latency, considerable training costs, and the generation of hallucinations. +Collaborative decoding between large and small language models (SLMs) presents +a promising strategy to mitigate these issues through methods including +speculative decoding, contrastive decoding, and emulator or proxy fine-tuning. +However, the specifics of such collaborations, particularly from a unified +perspective, remain largely unexplored. Inspired by dual-process cognitive +theory, we propose a unified framework in this paper, termed Fast and Slow +Generating (FS-GEN). Within this framework, LLMs (sometimes along with SLMs) +are categorized as System 2 (slow and deliberate), while independent SLMs are +designated as System 1 (fast and intuitive). We provide a comprehensive +analysis of these collaborative methodologies, elucidating their common +properties and shedding light on the differential knowledge capabilities of +System 2 versus System 1 through the FS-GEN framework. Our findings indicate +that only a small proportion of collaborative interactions (fewer than about +20% in most instances) are necessary across various methods. These +interactions between System 1 and System 2 conform to a scaling law related to +the parameter ratios, enabling predictable collaboration. Furthermore, we +explore the specific conditions under which collaboration proves most +effective, particularly from an uncertainty perspective, offering novel +insights that may guide future optimization efforts. Our research underscores +that the fundamental distinction between System 1 and System 2 lies in the +uncertainty of next token predictions, where interventions by System 2 are +crucial to support System 1. Code for Reproduction: +https://github.com/TsinghuaC3I/FS-GEN + +
+
+ comment: update figures and results on Pythia Series +
+
+
+
+
+ + ♻ ☆ LocoMotion: Learning Motion-Focused Video-Language Representations ACCV 2024 + + +
+ This paper strives for motion-focused video-language representations. +Existing methods to learn video-language representations use spatial-focused +data, where identifying the objects and scene is often enough to distinguish +the relevant caption. We instead propose LocoMotion to learn from +motion-focused captions that describe the movement and temporal progression of +local object motions. We achieve this by adding synthetic motions to videos and +using the parameters of these motions to generate corresponding captions. +Furthermore, we propose verb-variation paraphrasing to increase the caption +variety and learn the link between primitive motions and high-level verbs. With +this, we are able to learn a motion-focused video-language representation. +Experiments demonstrate our approach is effective for a variety of downstream +tasks, particularly when limited data is available for fine-tuning. Code is +available: https://hazeldoughty.github.io/Papers/LocoMotion/ + +
+
+ comment: ACCV 2024 Oral +
+
+
+
+
+ + ♻ ☆ Reconfidencing LLMs from the Grouping Loss Perspective EMNLP 2024 + + +
+ Large Language Models (LLMs), including ChatGPT and LLaMA, are susceptible to +generating hallucinated answers in a confident tone. While efforts to elicit +and calibrate confidence scores have proven useful, recent findings show that +controlling uncertainty must go beyond calibration: predicted scores may +deviate significantly from the actual posterior probabilities due to the impact +of grouping loss. In this work, we construct a new evaluation dataset derived +from a knowledge base to assess confidence scores given to answers of Mistral +and LLaMA. Experiments show that they tend to be overconfident. Further, we +show that they are more overconfident on some answers than others, e.g., +depending on the nationality of the person in the query. In +uncertainty-quantification theory, this is known as grouping loss. To address this, we +propose a solution to reconfidence LLMs, correcting not only calibration error but +also grouping loss. After the reconfidencing process, the LLMs show +improved alignment between their confidence and the accuracy of their responses. + +
+
+ comment: EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ TravelPlanner: A Benchmark for Real-World Planning with Language Agents ICML 2024 + + +
+ Planning has been part of the core pursuit for artificial intelligence since +its conception, but earlier AI agents mostly focused on constrained settings +because many of the cognitive substrates necessary for human-level planning +have been lacking. Recently, language agents powered by large language models +(LLMs) have shown interesting capabilities such as tool use and reasoning. Are +these language agents capable of planning in more complex settings that are out +of the reach of prior AI agents? To advance this investigation, we propose +TravelPlanner, a new planning benchmark that focuses on travel planning, a +common real-world planning scenario. It provides a rich sandbox environment, +various tools for accessing nearly four million data records, and 1,225 +meticulously curated planning intents and reference plans. Comprehensive +evaluations show that the current language agents are not yet capable of +handling such complex planning tasks-even GPT-4 only achieves a success rate of +0.6%. Language agents struggle to stay on task, use the right tools to collect +information, or keep track of multiple constraints. However, we note that the +mere possibility for language agents to tackle such a complex problem is in +itself non-trivial progress. TravelPlanner provides a challenging yet +meaningful testbed for future language agents. + +
+
+ comment: ICML 2024 (Spotlight) +
+
+
+
+
+ + ♻ ☆ Trends in Integration of Knowledge and Large Language Models: A Survey + and Taxonomy of Methods, Benchmarks, and Applications + + +
+ Large language models (LLMs) exhibit superior performance on various natural +language tasks, but they are susceptible to issues stemming from outdated data +and domain-specific limitations. In order to address these challenges, +researchers have pursued two primary strategies, knowledge editing and +retrieval augmentation, to enhance LLMs by incorporating external information +from different aspects. Nevertheless, there is still a notable absence of a +comprehensive survey. In this paper, we propose a review to discuss the trends +in integration of knowledge and large language models, including taxonomy of +methods, benchmarks, and applications. In addition, we conduct an in-depth +analysis of different methods and point out potential research directions in +the future. We hope this survey offers the community quick access and a +comprehensive overview of this research area, with the intention of inspiring +future research endeavors. + +
+
+ comment: Work in progress; 22 pages. This work has been submitted to the IEEE + for possible publication +
+
+
+
+
+ + ♻ ☆ Task Prompt Vectors: Effective Initialization through Multi-Task + Soft-Prompt Transfer + + +
+ Prompt tuning is an efficient solution for training large language models +(LLMs). However, current soft-prompt-based methods often sacrifice multi-task +modularity, requiring the training process to be fully or partially repeated +for each newly added task. While recent work on task vectors applied arithmetic +operations on full model weights to achieve the desired multi-task performance, +a similar approach for soft-prompts is still missing. To this end, we introduce +Task Prompt Vectors, created as the element-wise difference between the weights of +tuned soft-prompts and their random initialization. Experimental results on 12 +NLU datasets show that task prompt vectors can be used in low-resource settings +to effectively initialize prompt tuning on similar tasks. In addition, we show +that task prompt vectors are independent of the random initialization of prompt +tuning on 2 different language model architectures. This allows prompt +arithmetic with the pre-trained vectors from different tasks. In this way, we +provide a competitive alternative to state-of-the-art baselines by arithmetic +addition of task prompt vectors from multiple tasks. + +
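The construction described above is simple enough to state in a few lines. The following PyTorch sketch, with shapes and function names chosen by us for illustration, shows a task prompt vector as the element-wise difference from the initialization and the arithmetic addition of vectors from several tasks to warm-start a new one.

```python
from typing import List
import torch

def task_prompt_vector(tuned_prompt: torch.Tensor, init_prompt: torch.Tensor) -> torch.Tensor:
    """Element-wise difference between a tuned soft-prompt and its random initialization."""
    return tuned_prompt - init_prompt

def warm_start_prompt(init_prompt: torch.Tensor, vectors: List[torch.Tensor]) -> torch.Tensor:
    """Prompt arithmetic: add task prompt vectors from source tasks to an initialization
    to warm-start prompt tuning on a related target task."""
    return init_prompt + torch.stack(vectors).sum(dim=0)

# Toy shapes: 100 virtual tokens with 768-dim embeddings (assumed, not from the paper).
init = torch.randn(100, 768)
tuned_a = init + 0.1 * torch.randn(100, 768)   # stand-ins for prompts tuned on tasks A and B
tuned_b = init + 0.1 * torch.randn(100, 768)
tau_a, tau_b = task_prompt_vector(tuned_a, init), task_prompt_vector(tuned_b, init)
new_init = warm_start_prompt(init, [tau_a, tau_b])
```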
+
+
+
+
+ + ♻ ☆ Let Me Teach You: Pedagogical Foundations of Feedback for Language + Models EMNLP 2024 + + +
+ Natural Language Feedback (NLF) is an increasingly popular mechanism for +aligning Large Language Models (LLMs) to human preferences. Despite the +diversity of the information it can convey, NLF methods are often hand-designed +and arbitrary, with little systematic grounding. At the same time, research in +learning sciences has long established several effective feedback models. In +this opinion piece, we compile ideas from pedagogy to introduce FELT, a +feedback framework for LLMs that outlines various characteristics of the +feedback space, and a feedback content taxonomy based on these variables, +providing a general mapping of the feedback space. In addition to streamlining +NLF designs, FELT also brings out new, unexplored directions for research in +NLF. We make our taxonomy available to the community, providing guides and +examples for mapping our categorizations to future research. + +
+
+ comment: EMNLP 2024; 9 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ CPE-Pro: A Structure-Sensitive Deep Learning Method for Protein + Representation and Origin Evaluation + + +
+ Protein structures are important for understanding their functions and +interactions. Currently, many protein structure prediction methods are +enriching the structure database. Discriminating the origin of structures is +crucial for distinguishing between experimentally resolved and computationally +predicted structures, evaluating the reliability of prediction methods, and +guiding downstream biological studies. Building on work in structure +prediction, we developed a structure-sensitive supervised deep learning model, +Crystal vs Predicted Evaluator for Protein Structure (CPE-Pro), to represent +and discriminate the origin of protein structures. CPE-Pro learns the +structural information of proteins and captures inter-structural differences to +achieve accurate traceability on four data classes, and is expected to be +extended to more. Simultaneously, we utilized Foldseek to encode protein +structures into "structure-sequences" and trained a protein Structural Sequence +Language Model, SSLM. Preliminary experiments demonstrated that, compared to +large-scale protein language models pre-trained on vast amounts of amino acid +sequences, the "structure-sequence" enables the language model to learn more +informative protein features, enhancing and optimizing structural +representations. We have provided the code, model weights, and all related +materials on https://github.com/GouWenrui/CPE-Pro-main.git. + +
+
+
+
+
+ + ♻ ☆ Do LLMs Have Distinct and Consistent Personality? TRAIT: Personality + Testset designed for LLMs with Psychometrics + + +
+ Recent advancements in Large Language Models (LLMs) have led to their +adaptation in various domains as conversational agents. We wonder: can +personality tests be applied to these agents to analyze their behavior, similar +to humans? We introduce TRAIT, a new benchmark consisting of 8K multi-choice +questions designed to assess the personality of LLMs. TRAIT is built on two +psychometrically validated small human questionnaires, Big Five Inventory (BFI) +and Short Dark Triad (SD-3), expanded with the ATOMIC-10X knowledge graph to cover a +variety of real-world scenarios. TRAIT also outperforms existing personality +tests for LLMs in terms of reliability and validity, achieving the highest +scores across four key metrics: Content Validity, Internal Validity, Refusal +Rate, and Reliability. Using TRAIT, we reveal two notable insights into +personalities of LLMs: 1) LLMs exhibit distinct and consistent personality, +which is highly influenced by their training data (e.g., data used for +alignment tuning), and 2) current prompting techniques have limited +effectiveness in eliciting certain traits, such as high psychopathy or low +conscientiousness, suggesting the need for further research in this direction. + +
+
+ comment: Preprint; Under review +
+
+
+
+
+ + ♻ ☆ Attribute or Abstain: Large Language Models as Long Document Assistants EMNLP 2024 + + +
+ LLMs can help humans working with long documents, but are known to +hallucinate. Attribution can increase trust in LLM responses: The LLM provides +evidence that supports its response, which enhances verifiability. Existing +approaches to attribution have only been evaluated in RAG settings, where the +initial retrieval confounds LLM performance. This is crucially different from +the long document setting, where retrieval is not needed, but could help. Thus, +a long document specific evaluation of attribution is missing. To fill this +gap, we present LAB, a benchmark of 6 diverse long document tasks with +attribution, and experiments with different approaches to attribution on 5 LLMs +of different sizes. + We find that citation, i.e. response generation and evidence extraction in +one step, performs best for large and fine-tuned models, while additional +retrieval can help for small, prompted models. We investigate whether the "Lost +in the Middle" phenomenon exists for attribution, but do not find this. We +also find that evidence quality can predict response quality on datasets with +simple responses, but not so for complex responses, as models struggle with +providing evidence for complex claims. + +
+
+ comment: Accepted at EMNLP 2024. Code and data: + https://github.com/UKPLab/arxiv2024-attribute-or-abstain +
+
+
+
+
+ + ♻ ☆ I've Got 99 Problems But FLOPS Ain't One + + +
+ Hyperscalers dominate the landscape of large network deployments, yet they +rarely share data or insights about the challenges they face. In light of this +supremacy, what problems can we find to solve in this space? We take an +unconventional approach to find relevant research directions, starting from +public plans to build a $100 billion datacenter for machine learning +applications. Leveraging language model scaling laws, we discover what +workloads such a datacenter might carry and explore the challenges one may +encounter in doing so, with a focus on networking research. We conclude that +building the datacenter and training such models is technically possible, but +this requires novel wide-area transports for inter-DC communication, a +multipath transport and novel datacenter topologies for intra-datacenter +communication, and high-speed scale-up networks and transports, outlining a rich +research agenda for the networking community. + +
+
+
+
+
+ + ♻ ☆ Interpreting Context Look-ups in Transformers: Investigating + Attention-MLP Interactions EMNLP 2024 + + +
+ Understanding the inner workings of large language models (LLMs) is crucial +for advancing their theoretical foundations and real-world applications. While +the attention mechanism and multi-layer perceptrons (MLPs) have been studied +independently, their interactions remain largely unexplored. This study +investigates how attention heads and next-token neurons interact in LLMs to +predict new words. We propose a methodology to identify next-token neurons, +find prompts that highly activate them, and determine the upstream attention +heads responsible. We then generate and evaluate explanations for the activity +of these attention heads in an automated manner. Our findings reveal that some +attention heads recognize specific contexts relevant to predicting a token and +activate a downstream token-predicting neuron accordingly. This mechanism +provides a deeper understanding of how attention heads work with MLP neurons to +perform next-token prediction. Our approach offers a foundation for further +research into the intricate workings of LLMs and their impact on text +generation and understanding. + +
+
+ comment: Accepted to EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Few-Shot Adversarial Prompt Learning on Vision-Language Models NeurIPS 2024 + + +
+ The vulnerability of deep neural networks to imperceptible adversarial
+perturbations has attracted widespread attention. Inspired by the success of
+vision-language foundation models, previous efforts achieved zero-shot
+adversarial robustness by aligning adversarial visual features with text
+supervision. However, in practice, they are still unsatisfactory due to several
+issues, including heavy adaptation cost, suboptimal text supervision, and
+uncontrolled natural generalization capacity. In this paper, to address these
+issues, we propose a few-shot adversarial prompt framework where adapting input
+sequences with limited data yields significant improvements in adversarial
+robustness. Specifically, we achieve this by providing adversarially correlated
+text supervision that is end-to-end learned from adversarial examples. We also
+propose a novel training objective that enhances the consistency of multi-modal
+features while encouraging differentiated uni-modal features between natural
+and adversarial examples. The proposed framework enables learning adversarial
+text supervision, which provides superior cross-modal adversarial alignment and
+matches state-of-the-art zero-shot adversarial robustness with only 1% training
+data. Code is available at: https://github.com/lionel-w2/FAP.
+
+
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Have an English Accent? Evaluating and + Improving the Naturalness of Multilingual LLMs + + +
+ Current Large Language Models (LLMs) are predominantly designed with English +as the primary language, and even the few that are multilingual tend to exhibit +strong English-centric biases. Much like speakers who might produce awkward +expressions when learning a second language, LLMs often generate unnatural +outputs in non-English languages, reflecting English-centric patterns in both +vocabulary and grammar. Despite the importance of this issue, the naturalness +of multilingual LLM outputs has received limited attention. In this paper, we +address this gap by introducing novel automatic corpus-level metrics to assess +the lexical and syntactic naturalness of LLM outputs in a multilingual context. +Using our new metrics, we evaluate state-of-the-art LLMs on a curated benchmark +in French and Chinese, revealing a tendency towards English-influenced +patterns. To mitigate this issue, we also propose a simple and effective +alignment method to improve the naturalness of an LLM in a target language and +domain, achieving consistent improvements in naturalness without compromising +the performance on general-purpose benchmarks. Our work highlights the +importance of developing multilingual metrics, resources and methods for the +new wave of multilingual LLMs. + +
+
+
+
+
+ + ♻ ☆ RaTEScore: A Metric for Radiology Report Generation EMNLP 2024 + + +
+ This paper introduces a novel, entity-aware metric, termed Radiological
+Report (Text) Evaluation (RaTEScore), to assess the quality of medical reports
+generated by AI models. RaTEScore emphasizes crucial medical entities such as
+diagnostic outcomes and anatomical details, and is robust against complex
+medical synonyms and sensitive to negation expressions. Technically, we
+developed a comprehensive medical NER dataset, RaTE-NER, and trained an NER
+model specifically for this purpose. This model enables the decomposition of
+complex radiological reports into constituent medical entities. The metric
+itself is derived by comparing the similarity of entity embeddings, obtained
+from a language model, based on their types and their clinical significance.
+Our evaluations demonstrate that RaTEScore aligns more closely with human
+preference than existing metrics, validated both on established public
+benchmarks and our newly proposed RaTE-Eval benchmark.
+
+
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Can Language Models Induce Grammatical Knowledge from Indirect Evidence? EMNLP 2024 + + +
+ What kinds of data, and how much, are necessary for language models to induce
+the grammatical knowledge needed to judge sentence acceptability? Recent
+language models still have much room for improvement in data efficiency
+compared to humans. This paper investigates whether language models efficiently
+use indirect data (indirect evidence), from which they infer sentence
+acceptability. Humans, in contrast, use indirect evidence efficiently, which is
+considered one of the inductive biases contributing to efficient language
+acquisition. To explore this question, we introduce the Wug InDirect Evidence
+Test (WIDET), a dataset consisting of training instances inserted into the
+pre-training data and evaluation instances. We inject synthetic instances with
+newly coined wug words into the pretraining data and examine the model's
+behavior on evaluation data that assesses grammatical acceptability regarding
+those words. We prepare the injected instances by varying their levels of
+indirectness and quantity. Surprisingly, our experiments show that, for certain
+linguistic phenomena, language models do not induce grammatical knowledge even
+after repeated exposure to instances that share the structure of the evaluation
+instances and differ only in their lexical items. Our findings suggest a
+potential direction for future research: developing models that use latent
+indirect evidence to induce grammatical knowledge.
+
+
+
+ comment: This paper is accepted at EMNLP 2024 Main +
+
+
+
+
+ + ♻ ☆ A Review of Prominent Paradigms for LLM-Based Agents: Tool Use + (Including RAG), Planning, and Feedback Learning + + +
+ Tool use, planning, and feedback learning are currently three prominent
+paradigms for developing Large Language Model (LLM)-based agents across various
+tasks. Although numerous frameworks have been devised for each paradigm, their
+intricate workflows and inconsistent taxonomy create challenges in
+understanding and reviewing the frameworks across different paradigms. This
+survey introduces a unified taxonomy to systematically review and discuss these
+frameworks. Specifically, 1) the taxonomy defines environments/tasks, common
+LLM-profiled roles or LMPRs (policy models, evaluators, and dynamic models),
+and universally applicable workflows found in prior work, and 2) it enables a
+comparison of key perspectives on the implementations of LMPRs and workflow
+designs across different agent paradigms and frameworks. 3) Finally, we
+identify three limitations in existing workflow designs and systematically
+discuss directions for future work.
+
+
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ GRAMMAR: Grounded and Modular Methodology for Assessment of + Closed-Domain Retrieval-Augmented Language Model + + +
+ Retrieval-Augmented Generation (RAG) systems are widely used across various +industries for querying closed-domain and in-house knowledge bases. However, +evaluating these systems presents significant challenges due to the private +nature of closed-domain data and a scarcity of queries with verifiable ground +truths. Moreover, there is a lack of analytical methods to diagnose problematic +modules and identify types of failure, such as those caused by knowledge +deficits or issues with robustness. To address these challenges, we introduce +GRAMMAR (GRounded And Modular Methodology for Assessment of RAG), an evaluation +framework comprising a grounded data generation process and an evaluation +protocol that effectively pinpoints defective modules. Our validation +experiments reveal that GRAMMAR provides a reliable approach for identifying +vulnerable modules and supports hypothesis testing for textual form +vulnerabilities. An open-source tool accompanying this framework is available +in our GitHub repository (see https://github.com/xinzhel/grammar), allowing for +easy reproduction of our results and enabling reliable and modular evaluation +in closed-domain settings. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ 1-bit AI Infra: Part 1.1, Fast and Lossless BitNet b1.58 Inference on + CPUs + + +
+ Recent advances in 1-bit Large Language Models (LLMs), such as BitNet and +BitNet b1.58, present a promising approach to enhancing the efficiency of LLMs +in terms of speed and energy consumption. These developments also enable local +LLM deployment across a broad range of devices. In this work, we introduce +bitnet.cpp, a tailored software stack designed to unlock the full potential of +1-bit LLMs. Specifically, we develop a set of kernels to support fast and +lossless inference of ternary BitNet b1.58 LLMs on CPUs. Extensive experiments +demonstrate that bitnet.cpp achieves significant speedups, ranging from 2.37x +to 6.17x on x86 CPUs and from 1.37x to 5.07x on ARM CPUs, across various model +sizes. The code is available at https://github.com/microsoft/BitNet. + +
+
+
+
+
+ + ♻ ☆ From Keywords to Structured Summaries: Streamlining Scholarly + Information Access ISWC 2024 + + +
+ This paper highlights the growing importance of information retrieval (IR) +engines in the scientific community, addressing the inefficiency of traditional +keyword-based search engines due to the rising volume of publications. The +proposed solution involves structured records, underpinning advanced +information technology (IT) tools, including visualization dashboards, to +revolutionize how researchers access and filter articles, replacing the +traditional text-heavy approach. This vision is exemplified through a proof of +concept centered on the "reproductive number estimate of infectious diseases" +research theme, using a fine-tuned large language model (LLM) to automate the +creation of structured records to populate a backend database that now goes +beyond keywords. The result is a next-generation information access system as +an IR method accessible at https://orkg.org/usecases/r0-estimates. + +
+
+ comment: 8 pages, 3 figures | Accepted for publication as a poster paper at + the International Semantic Web Conference (ISWC 2024) +
+
+
+
+
+ + ♻ ☆ Learning to Plan for Retrieval-Augmented Large Language Models from + Knowledge Graphs EMNLP2024 + + +
+ Improving the performance of large language models (LLMs) in complex +question-answering (QA) scenarios has always been a research focal point. +Recent studies have attempted to enhance LLMs' performance by combining +step-wise planning with external retrieval. While effective for advanced models +like GPT-3.5, smaller LLMs face challenges in decomposing complex questions, +necessitating supervised fine-tuning. Previous work has relied on manual +annotation and knowledge distillation from teacher LLMs, which are +time-consuming and not accurate enough. In this paper, we introduce a novel +framework for enhancing LLMs' planning capabilities by using planning data +derived from knowledge graphs (KGs). LLMs fine-tuned with this data have +improved planning capabilities, better equipping them to handle complex QA +tasks that involve retrieval. Evaluations on multiple datasets, including our +newly proposed benchmark, highlight the effectiveness of our framework and the +benefits of KG-derived planning data. + +
+
+ comment: EMNLP2024 Findings +
+
+
+
+
+ + ♻ ☆ Uncertainty Estimation and Quantification for LLMs: A Simple Supervised + Approach + + +
+ In this paper, we study the problem of uncertainty estimation and calibration +for LLMs. We begin by formulating the uncertainty estimation problem, a +relevant yet underexplored area in existing literature. We then propose a +supervised approach that leverages labeled datasets to estimate the uncertainty +in LLMs' responses. Based on the formulation, we illustrate the difference +between the uncertainty estimation for LLMs and that for standard ML models and +explain why the hidden neurons of the LLMs may contain uncertainty information. +Our designed approach demonstrates the benefits of utilizing hidden activations +to enhance uncertainty estimation across various tasks and shows robust +transferability in out-of-distribution settings. We distinguish the uncertainty +estimation task from the uncertainty calibration task and show that better +uncertainty estimation leads to better calibration performance. Furthermore, +our method is easy to implement and adaptable to different levels of model +accessibility including black box, grey box, and white box. + +
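+ As a rough illustration of the supervised (grey-box) setting described above,
+the sketch below fits a simple classifier on hidden activations to predict
+response correctness and reads a confidence score off its predicted
+probability. The random features stand in for real LLM hidden states, and the
+choice of logistic regression is an assumption of this sketch, not the paper's
+method.
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+rng = np.random.default_rng(0)
+hidden = rng.normal(size=(500, 64))        # stand-in for last-token hidden states
+labels = (hidden[:, 0] + 0.3 * rng.normal(size=500) > 0).astype(int)  # "correct?"
+
+# Supervised uncertainty estimator: train on labeled responses, then score new
+# responses with the classifier's predicted probability of being correct.
+clf = LogisticRegression(max_iter=1000).fit(hidden[:400], labels[:400])
+confidence = clf.predict_proba(hidden[400:])[:, 1]
+uncertainty = 1.0 - confidence
+print(f"mean held-out uncertainty: {uncertainty.mean():.3f}")
+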
+
+ comment: 29 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Regularizing Hidden States Enables Learning Generalizable Reward Model + for LLMs NeurIPS 2024 + + +
+ Reward models trained on human preference data have been proven to +effectively align Large Language Models (LLMs) with human intent within the +framework of reinforcement learning from human feedback (RLHF). However, +current reward models have limited generalization capabilities to unseen +prompts and responses, which can lead to an unexpected phenomenon known as +reward over-optimization, resulting in a decline in actual performance due to +excessive optimization of rewards. While previous research has advocated for +constraining policy optimization, our study introduces a novel approach to +enhance the reward model's generalization ability against distribution shifts +by regularizing the hidden states. Specifically, we retain the base model's +language model head and incorporate a suite of text-generation losses to +preserve the hidden states' text-generation capabilities, while concurrently +learning a reward head behind the same hidden states. Our experimental results +demonstrate that the introduced regularization technique markedly improves the +accuracy of learned reward models across a variety of out-of-distribution (OOD) +tasks and effectively alleviates the over-optimization issue in RLHF, offering +a more reliable and robust preference learning paradigm. + +
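+ The loss structure described above can be sketched as a preference loss on a
+reward head plus an auxiliary language-modeling loss computed from the same
+hidden states. The shapes, module names, and the weighting factor alpha below
+are placeholder assumptions, and random tensors stand in for a transformer's
+outputs.
+
+import torch
+import torch.nn.functional as F
+
+def regularized_rm_loss(hidden_chosen, hidden_rejected, reward_head,
+                        lm_head, lm_labels, alpha=0.1):
+    """Preference loss on a reward head plus an auxiliary LM loss computed from
+    the same hidden states (sketch; alpha and all shapes are assumptions)."""
+    r_c = reward_head(hidden_chosen[:, -1])      # reward from last-token state
+    r_r = reward_head(hidden_rejected[:, -1])
+    pref_loss = -F.logsigmoid(r_c - r_r).mean()
+
+    logits = lm_head(hidden_chosen)              # text-generation regularizer
+    lm_loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)),
+                              lm_labels.reshape(-1))
+    return pref_loss + alpha * lm_loss
+
+B, T, H, V = 4, 16, 32, 100
+loss = regularized_rm_loss(torch.randn(B, T, H), torch.randn(B, T, H),
+                           torch.nn.Linear(H, 1), torch.nn.Linear(H, V),
+                           torch.randint(0, V, (B, T)))
+print(loss.item())
+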
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Mind's Eye of LLMs: Visualization-of-Thought Elicits Spatial Reasoning + in Large Language Models NeurIPS + 2024 + + +
+ Large language models (LLMs) have exhibited impressive performance in
+language comprehension and various reasoning tasks. However, their abilities in
+spatial reasoning, a crucial aspect of human cognition, remain relatively
+unexplored. Humans possess a remarkable ability to create mental images of
+unseen objects and actions through a process known as the Mind's Eye, enabling
+the imagination of the unseen world. Inspired by this cognitive capacity, we
+propose Visualization-of-Thought (VoT) prompting. VoT aims to elicit spatial
+reasoning in LLMs by visualizing their reasoning traces, thereby guiding
+subsequent reasoning steps. We employed VoT for multi-hop spatial reasoning
+tasks, including natural language navigation, visual navigation, and visual
+tiling in 2D grid worlds. Experimental results demonstrated that VoT
+significantly enhances the spatial reasoning abilities of LLMs. Notably, VoT
+outperformed existing multimodal large language models (MLLMs) in these tasks.
+While VoT works surprisingly well on LLMs, the ability to generate mental
+images to facilitate spatial reasoning resembles the mind's eye process,
+suggesting its potential viability in MLLMs. The dataset and code are available
+at https://microsoft.github.io/visualization-of-thought
+
+
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) +
+
+
+
+
+ + ♻ ☆ GPT-SW3: An Autoregressive Language Model for the Nordic Languages + + +
+ This paper details the process of developing the first native large +generative language model for the Nordic languages, GPT-SW3. We cover all parts +of the development process, from data collection and processing, training +configuration and instruction finetuning, to evaluation and considerations for +release strategies. We hope that this paper can serve as a guide and reference +for other researchers that undertake the development of large generative models +for smaller languages. + +
+
+
+
+
+ + ♻ ☆ Non-myopic Generation of Language Model for Reasoning and Planning + + +
+ Large Language Models have demonstrated remarkable abilities in reasoning and
+planning by breaking down complex problems into sequential steps. Despite their
+success in various domains like mathematical problem-solving and coding, LLMs
+face challenges in ensuring reliable and optimal planning due to the inherently
+myopic nature of autoregressive decoding. This paper revisits LLM reasoning
+from an optimal-control perspective, proposing a novel method,
+Predictive-Decoding, that leverages Model Predictive Control to enhance
+planning accuracy. By re-weighting LLM distributions based on foresight
+trajectories, Predictive-Decoding aims to mitigate early errors and promote
+non-myopic planning. Our experiments show significant improvements in a wide
+range of tasks for math, coding, and agents. Furthermore, Predictive-Decoding
+demonstrates computational efficiency, outperforming search baselines with
+reduced computational resources. This study provides insights into optimizing
+LLM planning capabilities.
+
+
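+ The re-weighting idea can be sketched as follows: each candidate next step is
+scored by a short foresight rollout, and the sampling distribution is
+re-weighted by those scores. The rollout and score callables below are toy
+placeholders rather than an actual LLM, and the softmax-style weighting is an
+assumption of this sketch.
+
+import math, random
+
+def predictive_decode_step(candidates, rollout, score, temperature=1.0):
+    """Re-weight candidate next steps by the score of a short foresight rollout
+    (sketch; rollout and score are toy placeholders, not a real LLM)."""
+    weights = [math.exp(score(rollout(c)) / temperature) for c in candidates]
+    total = sum(weights)
+    probs = [w / total for w in weights]
+    return random.choices(candidates, weights=probs, k=1)[0]
+
+print(predictive_decode_step(
+    ["plan A", "plan B"],
+    rollout=lambda c: c + " -> step -> done",   # pretend lookahead
+    score=lambda t: len(t)))                    # pretend trajectory reward
+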
+
+
+
+
+ + ♻ ☆ Generative AI Security: Challenges and Countermeasures + + +
+ Generative AI's expanding footprint across numerous industries has led to +both excitement and increased scrutiny. This paper delves into the unique +security challenges posed by Generative AI, and outlines potential research +directions for managing these risks. + +
+
+
+
+
+ + ♻ ☆ OpenMU: Your Swiss Army Knife for Music Understanding + + +
+ We present OpenMU-Bench, a large-scale benchmark suite for addressing the +data scarcity issue in training multimodal language models to understand music. +To construct OpenMU-Bench, we leveraged existing datasets and bootstrapped new +annotations. OpenMU-Bench also broadens the scope of music understanding by +including lyrics understanding and music tool usage. Using OpenMU-Bench, we +trained our music understanding model, OpenMU, with extensive ablations, +demonstrating that OpenMU outperforms baseline models such as MU-Llama. Both +OpenMU and OpenMU-Bench are open-sourced to facilitate future research in music +understanding and to enhance creative music production efficiency. + +
+
+ comment: Resources: https://github.com/mzhaojp22/openmu +
+
+
+
+
+ + ♻ ☆ Reinforcement Learning with Dynamic Multi-Reward Weighting for + Multi-Style Controllable Generation + + +
+ Textual style expresses a diverse set of information, including interpersonal +dynamics (e.g., formality) and the author's emotions or attitudes (e.g., +disgust). An open question is how language models can be explicitly controlled +so that they weave together target styles when generating text: for example, to +produce text that is both negative and non-toxic. One approach to such +controlled generation is multi-objective reinforcement learning (RL), but how +best to combine multiple objectives in a reward function is an open question. +In this paper, we investigate various formulations of multi-style rewards, +including calibrated outputs from discriminators and dynamic weighting by +discriminator gradient magnitudes. We find that our proposed dynamic weighting +outperforms static weighting approaches with respect to style control while +maintaining linguistic quality, and we explore its effectiveness in 2- and +3-style control. + +
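+ A minimal sketch of the dynamic-weighting idea: per-style discriminator
+rewards are combined with weights proportional to each discriminator's gradient
+magnitude. The exact weighting rule used in the paper may differ, and the
+inputs below are stand-ins.
+
+import torch
+
+def dynamic_multi_reward(rewards, grad_norms, eps=1e-8):
+    """Combine per-style discriminator rewards with weights proportional to
+    each discriminator's gradient magnitude (sketch; inputs are stand-ins)."""
+    g = torch.tensor(grad_norms, dtype=torch.float32)
+    weights = g / (g.sum() + eps)           # dynamic, renormalized weights
+    r = torch.tensor(rewards, dtype=torch.float32)
+    return (weights * r).sum()
+
+# e.g. a sentiment reward and a non-toxicity reward for 2-style control
+print(dynamic_multi_reward(rewards=[0.8, 0.3], grad_norms=[2.0, 0.5]))
+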
+
+
+
+
+ + ♻ ☆ LLMScan: Causal Scan for LLM Misbehavior Detection + + +
+ Despite the success of Large Language Models (LLMs) across various fields, +their potential to generate untruthful, biased and harmful responses poses +significant risks, particularly in critical applications. This highlights the +urgent need for systematic methods to detect and prevent such misbehavior. +While existing approaches target specific issues such as harmful responses, +this work introduces LLMScan, an innovative LLM monitoring technique based on +causality analysis, offering a comprehensive solution. LLMScan systematically +monitors the inner workings of an LLM through the lens of causal inference, +operating on the premise that the LLM's `brain' behaves differently when +misbehaving. By analyzing the causal contributions of the LLM's input tokens +and transformer layers, LLMScan effectively detects misbehavior. Extensive +experiments across various tasks and models reveal clear distinctions in the +causal distributions between normal behavior and misbehavior, enabling the +development of accurate, lightweight detectors for a variety of misbehavior +detection tasks. + +
+
+
+
+
+ + ♻ ☆ BrainTransformers: SNN-LLM + + +
+ This study introduces BrainTransformers, an innovative Large Language Model +(LLM) implemented using Spiking Neural Networks (SNN). Our key contributions +include: (1) designing SNN-compatible Transformer components such as SNNMatmul, +SNNSoftmax, and SNNSiLU; (2) implementing an SNN approximation of the SiLU +activation function; and (3) developing a Synapsis module to simulate synaptic +plasticity. Our 3-billion parameter model, BrainTransformers-3B-Chat, +demonstrates competitive performance across various benchmarks, including MMLU +(63.2), BBH (54.1), ARC-C (54.3), and GSM8K (76.3), while potentially offering +improved energy efficiency and biological plausibility. The model employs a +three-stage training approach, including SNN-specific neuronal synaptic +plasticity training. This research opens new avenues for brain-like AI systems +in natural language processing and neuromorphic computing. Future work will +focus on hardware optimization, developing specialized SNN fine-tuning tools, +and exploring practical applications in energy-efficient computing +environments. + +
+
+
+
+
+ + ♻ ☆ TSDS: Data Selection for Task-Specific Model Finetuning + + +
+ Finetuning foundation models for specific tasks is an emerging paradigm in +modern machine learning. The efficacy of task-specific finetuning largely +depends on the selection of appropriate training data. We present TSDS +(Task-Specific Data Selection), a framework to select data for task-specific +model finetuning, guided by a small but representative set of examples from the +target task. To do so, we formulate data selection for task-specific finetuning +as an optimization problem with a distribution alignment loss based on optimal +transport to capture the discrepancy between the selected data and the target +distribution. In addition, we add a regularizer to encourage the diversity of +the selected data and incorporate kernel density estimation into the +regularizer to reduce the negative effects of near-duplicates among the +candidate data. We connect our optimization problem to nearest neighbor search +and design efficient algorithms to compute the optimal solution based on +approximate nearest neighbor search techniques. We evaluate our method on data +selection for both continued pretraining and instruction tuning of language +models. We show that instruction tuning using data selected by our method with +a 1% selection ratio often outperforms using the full dataset and beats the +baseline selection methods by 1.5 points in F1 score on average. + +
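+ A much-simplified sketch of the selection idea, assuming candidates and
+target examples are already embedded: candidates are scored by proximity to
+their nearest target example (alignment) and penalized by a kernel density
+estimate to discourage near-duplicates. This is a greedy stand-in for the
+optimal-transport formulation in the abstract, with arbitrary hyperparameters.
+
+import numpy as np
+from sklearn.neighbors import NearestNeighbors, KernelDensity
+
+def select_data(candidates, targets, k=100, bandwidth=0.5, lam=0.1):
+    """Prefer candidates close to the target examples; penalize candidates in
+    dense (near-duplicate) regions of the candidate pool (sketch)."""
+    nn = NearestNeighbors(n_neighbors=1).fit(targets)
+    align = -nn.kneighbors(candidates)[0][:, 0]       # closer target -> higher
+    kde = KernelDensity(bandwidth=bandwidth).fit(candidates)
+    dup_penalty = kde.score_samples(candidates)       # log-density of candidates
+    scores = align - lam * dup_penalty
+    return np.argsort(scores)[::-1][:k]               # indices of selected data
+
+rng = np.random.default_rng(0)
+print(select_data(rng.normal(size=(1000, 8)), rng.normal(size=(50, 8)), k=10))
+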
+
+ comment: 31 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Selective Vision is the Challenge for Visual Reasoning: A Benchmark for + Visual Argument Understanding EMNLP 2024 + + +
+ Visual arguments, often used in advertising or social causes, rely on images
+to persuade viewers to do or believe something. Understanding these arguments
+requires selective vision: only specific visual stimuli within an image are
+relevant to the argument, and relevance can only be understood within the
+context of a broader argumentative structure. While visual arguments are
+readily appreciated by human audiences, we ask: are today's AI systems capable
+of similar understanding? We present VisArgs, a dataset of 1,611 images
+annotated with 5,112 visual premises (with regions), 5,574 commonsense
+premises, and reasoning trees connecting them into structured arguments. We
+propose three tasks for evaluating visual argument understanding: premise
+localization, premise identification, and conclusion deduction. Experiments
+show that 1) machines struggle to capture visual cues: GPT-4o achieved 78.5%
+accuracy, while humans reached 98.0%. Models also performed 19.5% worse when
+distinguishing between irrelevant objects within the image compared to external
+objects. 2) Providing relevant visual premises improved model performance
+significantly.
+
+
+
+ comment: 12 pages, 6 figures. Accepted as main paper in EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Do LLMs Have Political Correctness? Analyzing Ethical Biases and + Jailbreak Vulnerabilities in AI Systems + + +
+ Although large language models (LLMs) demonstrate impressive proficiency in +various tasks, they present potential safety risks, such as `jailbreaks', where +malicious inputs can coerce LLMs into generating harmful content. To address +these issues, many LLM developers have implemented various safety measures to +align these models. This alignment involves several techniques, including data +filtering during pre-training, supervised fine-tuning, reinforcement learning +from human feedback, and red-teaming exercises. These methods often introduce +deliberate and intentional biases similar to Political Correctness (PC) to +ensure the ethical behavior of LLMs. In this paper, we delve into the +intentional biases injected into LLMs for safety purposes and examine methods +to circumvent these safety alignment techniques. Notably, these intentional +biases result in a jailbreaking success rate in GPT-4o models that differs by +20% between non-binary and cisgender keywords and by 16% between white and +black keywords, even when the other parts of the prompts are identical. We +introduce the concept of PCJailbreak, highlighting the inherent risks posed by +these safety-induced biases. Additionally, we propose an efficient defense +method PCDefense, which prevents jailbreak attempts by injecting defense +prompts prior to generation. PCDefense stands as an appealing alternative to +Guard Models, such as Llama-Guard, that require additional inference cost after +text generation. Our findings emphasize the urgent need for LLM developers to +adopt a more responsible approach when designing and implementing safety +measures. + +
+
+
+
+
+ + ♻ ☆ Susu Box or Piggy Bank: Assessing Cultural Commonsense Knowledge between + Ghana and the U.S EMNLP 2024 + + +
+ Recent work has highlighted the culturally-contingent nature of commonsense
+knowledge. We introduce AMAMMER${\epsilon}$, a test set of 525 multiple-choice
+questions designed to evaluate the commonsense knowledge of English LLMs,
+relative to the cultural contexts of Ghana and the United States. To create
+AMAMMER${\epsilon}$, we select a set of multiple-choice questions (MCQs) from
+existing commonsense datasets and rewrite them in a multi-stage process
+involving surveys of Ghanaian and U.S. participants. In three rounds of
+surveys, participants from both pools are solicited to (1) write correct and
+incorrect answer choices, (2) rate individual answer choices on a 5-point
+Likert scale, and (3) select the best answer choice from the newly-constructed
+MCQ items, in a final validation step. By engaging participants at multiple
+stages, our procedure ensures that participant perspectives are incorporated
+both in the creation and validation of test items, resulting in high levels of
+agreement within each pool. We evaluate several off-the-shelf English LLMs on
+AMAMMER${\epsilon}$. Uniformly, models prefer answer choices that align with
+the preferences of U.S. annotators over Ghanaian annotators. Additionally, when
+test items specify a cultural context (Ghana or the U.S.), models exhibit some
+ability to adapt, but performance is consistently better in U.S. contexts than
+in Ghanaian ones. As large resources are devoted to the advancement of English
+LLMs, our findings underscore the need for culturally adaptable models and
+evaluations to meet the needs of diverse English-speaking populations around
+the world.
+
+
+
+ comment: Accepted to EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ A Bi-consolidating Model for Joint Relational Triple Extraction + + +
+ Current methods extract relational triples by directly making a prediction
+based on a possible entity pair in a raw sentence, without depending on entity
+recognition. The task suffers from a serious semantic overlapping problem, in
+which several relation triples may share one or two entities in a sentence. In
+this paper, based on a two-dimensional sentence representation, a
+bi-consolidating model is proposed to address this problem by simultaneously
+reinforcing the local and global semantic features relevant to a relation
+triple. This model consists of a local consolidation component and a global
+consolidation component. The first component uses a pixel difference
+convolution to enhance the semantic information of a possible triple
+representation from adjacent regions and to mitigate noise from neighbouring
+regions. The second component strengthens the triple representation based on
+channel attention and spatial attention, which has the advantage of learning
+remote semantic dependencies in a sentence. Both components help to improve the
+performance of entity identification and relation type classification in
+relation triple extraction. When evaluated on several published datasets, the
+bi-consolidating model achieves competitive performance. Analytical experiments
+demonstrate the effectiveness of our model for relational triple extraction and
+provide motivation for other natural language processing tasks.
+
+
+
+
+
+
+ + ♻ ☆ When "Competency" in Reasoning Opens the Door to Vulnerability: + Jailbreaking LLMs via Novel Complex Ciphers + + +
+ Recent advancements in the safety of Large Language Models (LLMs) have
+primarily focused on mitigating attacks crafted in natural language or in
+common encryption techniques like Base64. However, newer models, which often
+possess better reasoning capabilities, open the door to attack vectors that
+were previously non-existent in older models. This seems counter-intuitive at
+first glance, but these advanced models can decipher more complex cryptic
+queries that previous models could not, making them susceptible to attacks
+using such prompts. To exploit this vulnerability, we propose Attacks using
+Custom Encryptions (ACE), a novel method to jailbreak LLMs by leveraging custom
+encryption schemes. We evaluate the effectiveness of ACE on four
+state-of-the-art LLMs, achieving Attack Success Rates (ASR) of up to 66% on
+closed-source models and 88% on open-source models. Building upon this, we
+introduce Layered Attacks using Custom Encryptions (LACE), which employs
+multiple layers of encryption through our custom ciphers to further enhance the
+ASR. Our findings demonstrate that LACE significantly enhances the ability to
+jailbreak LLMs, increasing the ASR of GPT-4o from 40% to 78%, a 38-point
+improvement. Our results highlight that the advanced capabilities of LLMs
+introduce unforeseen vulnerabilities to complex attacks. Specifically, complex
+and layered ciphers increase the chance of jailbreaking.
+
+
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 124 + +
+
+
+ + ☆ DynamicCity: Large-Scale LiDAR Generation from Dynamic Scenes + + +
+ LiDAR scene generation has been developing rapidly in recent years. However,
+existing methods primarily focus on generating static and single-frame scenes,
+overlooking the inherently dynamic nature of real-world driving environments.
+In this work, we introduce DynamicCity, a novel 4D LiDAR generation framework
+capable of generating large-scale, high-quality LiDAR scenes that capture the
+temporal evolution of dynamic environments. DynamicCity mainly consists of two
+key models. 1) A VAE model for learning HexPlane as the compact 4D
+representation. Instead of using naive averaging operations, DynamicCity
+employs a novel Projection Module to effectively compress 4D LiDAR features
+into six 2D feature maps for HexPlane construction, which significantly
+enhances HexPlane fitting quality (up to 12.56 mIoU gain). Furthermore, we
+utilize an Expansion & Squeeze Strategy to reconstruct 3D feature volumes in
+parallel, which improves both network training efficiency and reconstruction
+accuracy compared to naively querying each 3D point (up to 7.05 mIoU gain,
+2.06x training speedup, and 70.84% memory reduction). 2) A DiT-based diffusion
+model for HexPlane generation. To make HexPlane feasible for DiT generation, a
+Padded Rollout Operation is proposed to reorganize all six feature planes of
+the HexPlane as a squared 2D feature map. In particular, various conditions can
+be introduced in the diffusion or sampling process, supporting versatile 4D
+generation applications, such as trajectory- and command-driven generation,
+inpainting, and layout-conditioned generation. Extensive experiments on the
+CarlaSC and Waymo datasets demonstrate that DynamicCity significantly
+outperforms existing state-of-the-art 4D LiDAR generation methods across
+multiple metrics. The code will be released to facilitate future research.
+
+
+
+ comment: Preprint; 29 pages, 15 figures, 7 tables; Project Page at + https://dynamic-city.github.io/ +
+
+
+
+
+ + ☆ FIPER: Generalizable Factorized Fields for Joint Image Compression and + Super-Resolution + + +
+ In this work, we propose a unified representation for Super-Resolution (SR)
+and Image Compression, termed **Factorized Fields**, motivated by the shared
+principles between these two tasks. Both single-image super-resolution (SISR)
+and image compression require recovering and preserving fine image
+details--whether by enhancing resolution or reconstructing compressed data.
+Unlike previous methods that mainly focus on network architecture, our proposed
+approach utilizes a basis-coefficient decomposition to explicitly capture
+multi-scale visual features and structural components in images, addressing the
+core challenges of both tasks. We first derive our SR model, which includes a
+Coefficient Backbone and a Basis Swin Transformer for generalizable Factorized
+Fields. Then, to further unify these two tasks, we leverage the strong
+information-recovery capabilities of the trained SR modules as priors in the
+compression pipeline, improving both compression efficiency and detail
+reconstruction. Additionally, we introduce a merged-basis compression branch
+that consolidates shared structures, further optimizing the compression
+process. Extensive experiments show that our unified representation delivers
+state-of-the-art performance, achieving an average relative improvement of
+204.4% in PSNR over the baseline in Super-Resolution (SR) and a 9.35% BD-rate
+reduction in Image Compression compared to the previous SOTA.
+
+
+
+ comment: Project page: https://jayisaking.github.io/FIPER/ +
+
+
+
+
+ + ☆ FreeVS: Generative View Synthesis on Free Driving Trajectory + + +
+ Existing reconstruction-based novel view synthesis methods for driving scenes
+focus on synthesizing camera views along the recorded trajectory of the ego
+vehicle. Their image rendering performance severely degrades on viewpoints
+falling outside the recorded trajectory, where camera rays are untrained. We
+propose FreeVS, a novel fully generative approach that can synthesize camera
+views on free new trajectories in real driving scenes. To control the
+generation results to be 3D consistent with the real scenes and accurate in
+viewpoint pose, we propose a pseudo-image representation of view priors to
+control the generation process. Viewpoint transformation simulation is applied
+to pseudo-images to simulate camera movement in each direction. Once trained,
+FreeVS can be applied to any validation sequence without a reconstruction
+process and can synthesize views on novel trajectories. Moreover, we propose
+two new challenging benchmarks tailored to driving scenes, namely novel camera
+synthesis and novel trajectory synthesis, emphasizing the freedom of
+viewpoints. Given that no ground-truth images are available on novel
+trajectories, we also propose to evaluate the consistency of images synthesized
+on novel trajectories with 3D perception models. Experiments on the Waymo Open
+Dataset show that FreeVS has strong image synthesis performance on both the
+recorded trajectories and novel trajectories. Project Page:
+https://freevs24.github.io/
+
+
+
+ comment: Project Page: https://freevs24.github.io/ +
+
+
+
+
+ + ☆ UnCLe: Unsupervised Continual Learning of Depth Completion + + +
+ We propose UnCLe, a standardized benchmark for Unsupervised Continual
+Learning of a multimodal depth estimation task: depth completion aims to infer
+a dense depth map from a synchronized pair of an RGB image and a sparse depth
+map. We benchmark depth completion models under the practical scenario of
+unsupervised learning over continuous streams of data. Existing methods are
+typically trained on a static, or stationary, dataset. However, when adapting
+to novel non-stationary distributions, they "catastrophically forget"
+previously learned information. UnCLe simulates these non-stationary
+distributions by adapting depth completion models to sequences of datasets
+containing diverse scenes captured from distinct domains using different visual
+and range sensors. We adopt representative methods from continual learning
+paradigms and translate them to enable unsupervised continual learning of depth
+completion. We benchmark these models in indoor and outdoor settings and
+investigate the degree of catastrophic forgetting through standard quantitative
+metrics. Furthermore, we introduce model inversion quality as an additional
+measure of forgetting. We find that unsupervised continual learning of depth
+completion is an open problem, and we invite researchers to leverage UnCLe as a
+development platform.
+
+
+
+ comment: Preprint +
+
+
+
+
+ + ☆ WorldSimBench: Towards Video Generation Models as World Simulators + + +
+ Recent advancements in predictive models have demonstrated exceptional
+capabilities in predicting the future state of objects and scenes. However, the
+lack of categorization based on inherent characteristics continues to hinder
+the progress of predictive model development. Additionally, existing benchmarks
+are unable to effectively evaluate higher-capability, highly embodied
+predictive models from an embodied perspective. In this work, we classify the
+functionalities of predictive models into a hierarchy and take the first step
+in evaluating World Simulators by proposing a dual evaluation framework called
+WorldSimBench. WorldSimBench includes Explicit Perceptual Evaluation and
+Implicit Manipulative Evaluation, encompassing human preference assessments
+from the visual perspective and action-level evaluations in embodied tasks,
+covering three representative embodied scenarios: Open-Ended Embodied
+Environment, Autonomous Driving, and Robot Manipulation. In the Explicit
+Perceptual Evaluation, we introduce the HF-Embodied Dataset, a video assessment
+dataset based on fine-grained human feedback, which we use to train a Human
+Preference Evaluator that aligns with human perception and explicitly assesses
+the visual fidelity of World Simulators. In the Implicit Manipulative
+Evaluation, we assess the video-action consistency of World Simulators by
+evaluating whether the generated situation-aware video can be accurately
+translated into the correct control signals in dynamic environments. Our
+comprehensive evaluation offers key insights that can drive further innovation
+in video generation models, positioning World Simulators as a pivotal
+advancement toward embodied artificial intelligence.
+
+
+
+
+
+
+ + ☆ TP-Eval: Tap Multimodal LLMs' Potential in Evaluation by Customizing + Prompts + + +
+ Recently, multimodal large language models (MLLMs) have received much +attention for their impressive capabilities. The evaluation of MLLMs is +becoming critical to analyzing attributes of MLLMs and providing valuable +insights. However, current benchmarks overlook the problem of prompt +sensitivity - minor prompt variations may lead to significant performance +fluctuations. Thus, inappropriate prompts may obscure the models' capabilities, +underestimating the models' performance. Moreover, different models have +different preferences for different prompts, and thus, using the same prompt +for all models will cause evaluation bias. This paper analyzes this deficiency +in existing benchmarks and further introduces a new evaluation framework named +TP-Eval, which introduces a prompt customization method to reduce evaluation +biases and tap models' potential. TP-Eval will rewrite the original prompts to +different customized prompts for different models. In particular, we propose +some well-designed modules for prompt customization tailored to the scenario of +MLLM evaluation. Extensive experiments demonstrate the effectiveness of our +approach to uncovering models' capabilities, and TP-Eval should benefit the +community in developing more comprehensive and convincing MLLM evaluation +benchmarks. + +
+
+
+
+
+ + ☆ SPIRE: Synergistic Planning, Imitation, and Reinforcement Learning for + Long-Horizon Manipulation + + +
+ Robot learning has proven to be a general and effective technique for
+programming manipulators. Imitation learning is able to teach robots solely
+from human demonstrations but is bottlenecked by the capabilities of the
+demonstrations. Reinforcement learning uses exploration to discover better
+behaviors; however, the space of possible improvements can be too large to
+start from scratch. And for both techniques, the learning difficulty increases
+in proportion to the length of the manipulation task. Accounting for this, we
+propose SPIRE, a system that first uses Task and Motion Planning (TAMP) to
+decompose tasks into smaller learning subproblems and second combines imitation
+and reinforcement learning to maximize their strengths. We develop novel
+strategies to train learning agents when deployed in the context of a planning
+system. We evaluate SPIRE on a suite of long-horizon and contact-rich robot
+manipulation problems. We find that SPIRE outperforms prior approaches that
+integrate imitation learning, reinforcement learning, and planning by 35% to
+50% in average task performance, is 6 times more data efficient in the number
+of human demonstrations needed to train proficient agents, and learns to
+complete tasks nearly twice as efficiently. View
+https://sites.google.com/view/spire-corl-2024 for more details.
+
+
+
+ comment: Conference on Robot Learning (CoRL) 2024 +
+
+
+
+
+ + ☆ CLEAR: Character Unlearning in Textual and Visual Modalities + + +
+ Machine Unlearning (MU) is critical for enhancing privacy and security in +deep learning models, particularly in large multimodal language models (MLLMs), +by removing specific private or hazardous information. While MU has made +significant progress in textual and visual modalities, multimodal unlearning +(MMU) remains significantly underexplored, partially due to the absence of a +suitable open-source benchmark. To address this, we introduce CLEAR, a new +benchmark designed to evaluate MMU methods. CLEAR contains 200 fictitious +individuals and 3,700 images linked with corresponding question-answer pairs, +enabling a thorough evaluation across modalities. We assess 10 MU methods, +adapting them for MMU, and highlight new challenges specific to multimodal +forgetting. We also demonstrate that simple $\ell_1$ regularization on LoRA +weights significantly mitigates catastrophic forgetting, preserving model +performance on retained data. The dataset is available at +https://huggingface.co/datasets/therem/CLEAR + +
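+ The $\ell_1$ regularizer mentioned above can be sketched as a penalty summed
+over LoRA adapter parameters and added to the unlearning objective. The
+"lora_" name filter and the weight are assumptions about how the adapters are
+registered, not the benchmark's code.
+
+import torch
+
+def l1_lora_penalty(model, weight=1e-4):
+    """Sum of absolute values over LoRA adapter weights, added to the
+    unlearning loss to damp catastrophic forgetting (sketch; the 'lora_'
+    name filter is an assumption about how adapters are registered)."""
+    penalty = torch.zeros(())
+    for name, param in model.named_parameters():
+        if "lora_" in name and param.requires_grad:
+            penalty = penalty + param.abs().sum()
+    return weight * penalty
+
+# total_loss = forget_loss + l1_lora_penalty(peft_model)
+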
+
+
+
+
+ + ☆ In-Pixel Foreground and Contrast Enhancement Circuits with Customizable + Mapping + + +
+ This paper presents an innovative in-pixel contrast enhancement circuit that +performs image processing directly within the pixel circuit. The circuit can be +tuned for different modes of operation. In foreground enhancement mode, it +suppresses low-intensity background pixels to nearly zero, isolating the +foreground for better object visibility. In contrast enhancement mode, it +improves overall image contrast. The contrast enhancement function is +customizable both during the design phase and in real-time, allowing the +circuit to adapt to specific applications and varying lighting conditions. A +model of the designed pixel circuit is developed and applied to a full pixel +array, demonstrating significant improvements in image quality. Simulations +performed in HSPICE show a nearly 6x increase in Michelson Contrast Ratio (CR) +in the foreground enhancement mode. The simulation results indicate its +potential for real-time, adaptive contrast enhancement across various imaging +environments. + +
+
+
+
+
+ + ☆ Real time anomalies detection on video + + +
+ Nowadays, many places are equipped with security cameras. Unfortunately, when
+an incident occurs, these technologies are mostly used to review past events,
+so they act more as a deterrence tool than as a detection tool. In this
+article, we propose a deep learning approach that aims to address this problem.
+The approach uses convolutional neural networks (CNNs) to extract relevant
+features from video frames; these features form time series that are then
+analyzed by LSTM/GRU models.
+
+
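+ A minimal sketch of the described architecture, assuming arbitrary layer
+sizes: a small CNN encodes each frame into a feature vector, and an LSTM
+consumes the resulting time series to produce an anomaly score per clip.
+
+import torch
+import torch.nn as nn
+
+class CnnLstmAnomalyDetector(nn.Module):
+    """Per-frame CNN features fed to an LSTM that scores each clip (sketch)."""
+    def __init__(self, feat_dim=128, hidden=64):
+        super().__init__()
+        self.cnn = nn.Sequential(                 # tiny stand-in for a real CNN
+            nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.ReLU(),
+            nn.AdaptiveAvgPool2d(4), nn.Flatten(), nn.Linear(16 * 16, feat_dim))
+        self.lstm = nn.LSTM(feat_dim, hidden, batch_first=True)
+        self.head = nn.Linear(hidden, 1)          # anomaly score per clip
+
+    def forward(self, clips):                     # clips: (B, T, 3, H, W)
+        b, t = clips.shape[:2]
+        feats = self.cnn(clips.flatten(0, 1)).view(b, t, -1)
+        out, _ = self.lstm(feats)
+        return torch.sigmoid(self.head(out[:, -1]))
+
+print(CnnLstmAnomalyDetector()(torch.randn(2, 8, 3, 64, 64)).shape)  # (2, 1)
+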
+
+
+
+
+ + ☆ Scalable Ranked Preference Optimization for Text-to-Image Generation + + +
+ Direct Preference Optimization (DPO) has emerged as a powerful approach to +align text-to-image (T2I) models with human feedback. Unfortunately, successful +application of DPO to T2I models requires a huge amount of resources to collect +and label large-scale datasets, e.g., millions of generated paired images +annotated with human preferences. In addition, these human preference datasets +can get outdated quickly as the rapid improvements of T2I models lead to higher +quality images. In this work, we investigate a scalable approach for collecting +large-scale and fully synthetic datasets for DPO training. Specifically, the +preferences for paired images are generated using a pre-trained reward +function, eliminating the need for involving humans in the annotation process, +greatly improving the dataset collection efficiency. Moreover, we demonstrate +that such datasets allow averaging predictions across multiple models and +collecting ranked preferences as opposed to pairwise preferences. Furthermore, +we introduce RankDPO to enhance DPO-based methods using the ranking feedback. +Applying RankDPO on SDXL and SD3-Medium models with our synthetically generated +preference dataset ``Syn-Pic'' improves both prompt-following (on benchmarks +like T2I-Compbench, GenEval, and DPG-Bench) and visual quality (through user +studies). This pipeline presents a practical and scalable solution to develop +better preference datasets to enhance the performance of text-to-image models. + +
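+ The dataset-construction step described above can be sketched as: sample
+several generations per prompt, score them with pre-trained reward functions
+(averaged across models), and record the induced ranking. The generate and
+reward_fns callables below are toy placeholders, not the paper's actual reward
+models.
+
+def build_ranked_preferences(prompts, generate, reward_fns, n_images=4):
+    """For each prompt, generate several images, score them with pre-trained
+    reward functions, and keep the induced ranking (sketch)."""
+    dataset = []
+    for prompt in prompts:
+        images = [generate(prompt) for _ in range(n_images)]
+        scores = [sum(r(prompt, img) for r in reward_fns) / len(reward_fns)
+                  for img in images]
+        ranking = sorted(range(n_images), key=lambda i: scores[i], reverse=True)
+        dataset.append({"prompt": prompt, "images": images, "ranking": ranking})
+    return dataset
+
+data = build_ranked_preferences(
+    ["a red bicycle"], generate=lambda p: f"img({p})",
+    reward_fns=[lambda p, x: len(x), lambda p, x: 1.0])
+print(data[0]["ranking"])
+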
+
+ comment: Project Page: https://snap-research.github.io/RankDPO/ +
+
+
+
+
+ + ☆ Characterization of the multiplicity of solutions for camera pose given + two vertically-aligned landmarks and accelerometer + + +
+ We consider the problem of recovering the position and orientation of a
+camera equipped with an accelerometer from sensor images of two labeled
+landmarks whose positions are known in a coordinate system aligned in a known
+way with gravity. This is a variant of the much-studied P$n$P problem of
+recovering camera position and orientation from $n$ points without any
+gravitational data. It is proved that in three types of singular cases there
+are infinitely many solutions, in another type of case there is one, and in a
+final type of case there are two. A precise characterization of each type of
+case is given. In particular, there is always a unique solution in the
+practically interesting case where the two landmarks are at the same altitude
+and the camera is at a different altitude. This case is studied by numerical
+simulation and an implementation on a consumer cellphone. It is also proved
+that if the two landmarks are unlabeled, then apart from the same singular
+cases, there are still always one or two solutions.
+
+
+
+ comment: 32 pages, 8 figures +
+
+
+
+
+ + ☆ A Pipeline for Segmenting and Structuring RGB-D Data for Robotics + Applications + + +
+ We introduce a novel pipeline for segmenting and structuring color and depth +(RGB-D) data. Existing processing pipelines for RGB-D data have focused on +extracting geometric information alone. This approach precludes the development +of more advanced robotic navigation and manipulation algorithms, which benefit +from a semantic understanding of their environment. Our pipeline can segment +RGB-D data into accurate semantic masks. These masks are then used to fuse raw +captured point clouds into semantically separated point clouds. We store this +information using the Universal Scene Description (USD) file format, a format +suitable for easy querying by downstream robotics algorithms, human-friendly +visualization, and robotics simulation. + +
+
+
+
+
+ + ☆ Robust Two-View Geometry Estimation with Implicit Differentiation IROS 2024 + + +
+ We present a novel two-view geometry estimation framework which is based on a +differentiable robust loss function fitting. We propose to treat the robust +fundamental matrix estimation as an implicit layer, which allows us to avoid +backpropagation through time and significantly improves the numerical +stability. To take full advantage of the information from the feature matching +stage we incorporate learnable weights that depend on the matching confidences. +In this way our solution brings together feature extraction, matching and +two-view geometry estimation in a unified end-to-end trainable pipeline. We +evaluate our approach on the camera pose estimation task in both outdoor and +indoor scenarios. The experiments on several datasets show that the proposed +method outperforms both classic and learning-based state-of-the-art methods by +a large margin. The project webpage is available at: +https://github.com/VladPyatov/ihls + +
+
+ comment: IROS 2024 Accepted +
+
+
+
+
+ + ☆ A Wavelet Diffusion GAN for Image Super-Resolution + + +
+ In recent years, diffusion models have emerged as a superior alternative to +generative adversarial networks (GANs) for high-fidelity image generation, with +wide applications in text-to-image generation, image-to-image translation, and +super-resolution. However, their real-time feasibility is hindered by slow +training and inference speeds. This study addresses this challenge by proposing +a wavelet-based conditional Diffusion GAN scheme for Single-Image +Super-Resolution (SISR). Our approach utilizes the diffusion GAN paradigm to +reduce the timesteps required by the reverse diffusion process and the Discrete +Wavelet Transform (DWT) to achieve dimensionality reduction, decreasing +training and inference times significantly. The results of an experimental +validation on the CelebA-HQ dataset confirm the effectiveness of our proposed +scheme. Our approach outperforms other state-of-the-art methodologies +successfully ensuring high-fidelity output while overcoming inherent drawbacks +associated with diffusion models in time-sensitive applications. + +
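+ The wavelet-based dimensionality reduction can be sketched with PyWavelets: a
+single-level 2D DWT halves the spatial resolution, and the four sub-bands can
+be stacked as extra channels before feeding the diffusion GAN. Stacking
+sub-bands as channels is an assumption of this sketch rather than the paper's
+exact layout.
+
+import numpy as np
+import pywt
+
+def dwt_pack(image, wavelet="haar"):
+    """One-level 2D DWT per channel; the LL, LH, HL, HH sub-bands are stacked
+    as channels, halving spatial resolution (sketch of the reduction step)."""
+    bands = []
+    for ch in image:                               # image: (C, H, W)
+        ll, (lh, hl, hh) = pywt.dwt2(ch, wavelet)
+        bands.extend([ll, lh, hl, hh])
+    return np.stack(bands)                         # (4*C, H/2, W/2)
+
+def dwt_unpack(packed, wavelet="haar"):
+    """Inverse transform back to (C, H, W)."""
+    chans = []
+    for i in range(0, packed.shape[0], 4):
+        ll, lh, hl, hh = packed[i:i + 4]
+        chans.append(pywt.idwt2((ll, (lh, hl, hh)), wavelet))
+    return np.stack(chans)
+
+x = np.random.rand(3, 64, 64)
+print(dwt_pack(x).shape, np.allclose(dwt_unpack(dwt_pack(x)), x))
+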
+
+ comment: The paper has been accepted at Italian Workshop on Neural Networks + (WIRN) 2024 +
+
+
+
+
+ + ☆ Medical Imaging Complexity and its Effects on GAN Performance ACCV + + +
+ The proliferation of machine learning models in diverse clinical applications +has led to a growing need for high-fidelity, medical image training data. Such +data is often scarce due to cost constraints and privacy concerns. Alleviating +this burden, medical image synthesis via generative adversarial networks (GANs) +emerged as a powerful method for synthetically generating photo-realistic +images based on existing sets of real medical images. However, the exact image +set size required to efficiently train such a GAN is unclear. In this work, we +experimentally establish benchmarks that measure the relationship between a +sample dataset size and the fidelity of the generated images, given the +dataset's distribution of image complexities. We analyze statistical metrics +based on delentropy, an image complexity measure rooted in Shannon's entropy in +information theory. For our pipeline, we conduct experiments with two +state-of-the-art GANs, StyleGAN 3 and SPADE-GAN, trained on multiple medical +imaging datasets with variable sample sizes. Across both GANs, general +performance improved with increasing training set size but suffered with +increasing complexity. + +
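+ The delentropy measure referenced above is, loosely, the Shannon entropy of
+the joint histogram of an image's gradients. A hedged numpy sketch is given
+below; the bin count and the absence of any scaling factor are simplifying
+assumptions, so it may differ from the paper's exact implementation.
+
+import numpy as np
+
+def delentropy(gray, bins=64):
+    """Delentropy-style complexity: Shannon entropy of the joint histogram of
+    the image gradients (deldensity). Bin count and scaling are assumptions."""
+    fx, fy = np.gradient(gray.astype(float))
+    hist, _, _ = np.histogram2d(fx.ravel(), fy.ravel(), bins=bins)
+    p = hist / hist.sum()
+    p = p[p > 0]
+    return float(-(p * np.log2(p)).sum())
+
+print(delentropy(np.random.rand(128, 128)))       # noisy image -> high entropy
+print(delentropy(np.ones((128, 128))))            # flat image  -> 0.0
+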
+
+ comment: Accepted to ACCV, Workshop on Generative AI for Synthetic Medical + Data +
+
+
+
+
+ + ☆ VR-Splatting: Foveated Radiance Field Rendering via 3D Gaussian + Splatting and Neural Points + + +
+ Recent advances in novel view synthesis (NVS), particularly neural radiance
+fields (NeRF) and Gaussian splatting (3DGS), have demonstrated impressive
+results in photorealistic scene rendering. These techniques hold great
+potential for applications in virtual tourism and teleportation, where
+immersive realism is crucial. However, the high-performance demands of virtual
+reality (VR) systems present challenges in directly utilizing even
+fast-to-render scene representations such as 3DGS, due to latency and
+computational constraints.
+ In this paper, we propose foveated rendering as a promising solution to these
+obstacles. We analyze state-of-the-art NVS methods with respect to their
+rendering performance and compatibility with the human visual system. We
+introduce a novel foveated rendering approach for virtual reality that
+leverages the sharp, detailed output of neural point rendering for the foveal
+region, fused with a smooth rendering of 3DGS for the peripheral vision.
+ Our evaluation confirms that perceived sharpness and detail-richness are
+increased by our approach compared to a standard VR-ready 3DGS configuration.
+Our system meets the necessary performance requirements for real-time VR
+interactions, ultimately enhancing the user's immersive experience.
+ Project page: https://lfranke.github.io/vr_splatting
+
+
+
+
+
+
+ + ☆ Gaze-Assisted Medical Image Segmentation NeurIPS'24 + + +
+ The annotation of patient organs is a crucial part of various diagnostic and +treatment procedures, such as radiotherapy planning. Manual annotation is +extremely time-consuming, while its automation using modern image analysis +techniques has not yet reached levels sufficient for clinical adoption. This +paper investigates the idea of semi-supervised medical image segmentation using +human gaze as interactive input for segmentation correction. In particular, we +fine-tuned the Segment Anything Model in Medical Images (MedSAM), a public +solution that uses various prompt types as additional input for semi-automated +segmentation correction. We used human gaze data from reading abdominal images +as a prompt for fine-tuning MedSAM. The model was validated on a public WORD +database, which consists of 120 CT scans of 16 abdominal organs. The results of +the gaze-assisted MedSAM were shown to be superior to the results of the +state-of-the-art segmentation models. In particular, the average Dice +coefficient for 16 abdominal organs was 85.8%, 86.7%, 81.7%, and 90.5% for +nnUNetV2, ResUNet, original MedSAM, and our gaze-assisted MedSAM model, +respectively. + +
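+ For reference, the Dice coefficient behind the per-organ scores reported
+above is 2*|P and T| / (|P| + |T|) for prediction and target masks; a quick
+numpy sketch follows.
+
+import numpy as np
+
+def dice_coefficient(pred, target, eps=1e-7):
+    """Dice = 2*|P and T| / (|P| + |T|) for binary masks."""
+    pred, target = pred.astype(bool), target.astype(bool)
+    inter = np.logical_and(pred, target).sum()
+    return (2.0 * inter + eps) / (pred.sum() + target.sum() + eps)
+
+a = np.zeros((8, 8), dtype=int); a[2:6, 2:6] = 1
+b = np.zeros((8, 8), dtype=int); b[3:7, 3:7] = 1
+print(dice_coefficient(a, b))   # 2*9 / (16 + 16) = 0.5625
+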
+
+ comment: 16 pages, 4 figures, Accepted to AIM-FM Workshop @ NeurIPS'24 +
+
+
+
+
+ + ☆ Addressing Asynchronicity in Clinical Multimodal Fusion via + Individualized Chest X-ray Generation NeurIPS-24 + + +
+ Integrating multi-modal clinical data, such as electronic health records +(EHR) and chest X-ray images (CXR), is particularly beneficial for clinical +prediction tasks. However, in a temporal setting, multi-modal data are often +inherently asynchronous. EHR can be continuously collected but CXR is generally +taken with a much longer interval due to its high cost and radiation dose. When +clinical prediction is needed, the last available CXR image might have been +outdated, leading to suboptimal predictions. To address this challenge, we +propose DDL-CXR, a method that dynamically generates an up-to-date latent +representation of the individualized CXR images. Our approach leverages latent +diffusion models for patient-specific generation strategically conditioned on a +previous CXR image and EHR time series, providing information regarding +anatomical structures and disease progressions, respectively. In this way, the +interaction across modalities could be better captured by the latent CXR +generation process, ultimately improving the prediction performance. +Experiments using MIMIC datasets show that the proposed model could effectively +address asynchronicity in multimodal fusion and consistently outperform +existing methods. + +
+
+ comment: Accepted by NeurIPS-24 +
+
+
+
+
+ + ☆ R-CoT: Reverse Chain-of-Thought Problem Generation for Geometric + Reasoning in Large Multimodal Models + + +
+ Existing Large Multimodal Models (LMMs) struggle with mathematical geometric +reasoning due to a lack of high-quality image-text paired data. Current +geometric data generation approaches, which apply preset templates to generate +geometric data or use Large Language Models (LLMs) to rephrase questions and +answers (Q&A), unavoidably limit data accuracy and diversity. To synthesize +higher-quality data, we propose a two-stage Reverse Chain-of-Thought (R-CoT) +geometry problem generation pipeline. First, we introduce GeoChain to produce +high-fidelity geometric images and corresponding descriptions highlighting +relations among geometric elements. We then design a Reverse A&Q method that +reasons step-by-step based on the descriptions and generates questions in +reverse from the reasoning results. Experiments demonstrate that the proposed +method brings significant and consistent improvements on multiple LMM +baselines, achieving new performance records in the 2B, 7B, and 8B settings. +Notably, R-CoT-8B significantly outperforms previous state-of-the-art +open-source mathematical models by 16.6% on MathVista and 9.2% on GeoQA, while +also surpassing the closed-source model GPT-4o by an average of 13% across both +datasets. The code is available at https://github.com/dle666/R-CoT. + +
+
+
+
+
+ + ☆ A utility-based spatial analysis of residential street-level conditions; + A case study of Rotterdam + + +
+ Residential location choices are traditionally modelled using factors related +to accessibility and socioeconomic environments, neglecting the importance of +local street-level conditions. Arguably, this neglect is due to data practices. +Today, however, street-level images -- which are highly effective at encoding +street-level conditions -- are widely available. Additionally, recent advances +in discrete choice models incorporating computer vision capabilities offer +opportunities to integrate street-level conditions into residential location +choice analysis. This study leverages these developments to investigate the +spatial distribution of utility derived from street-level conditions in +residential location choices on a city-wide scale. In our case study of +Rotterdam, the Netherlands, we find that the utility derived from street-level +conditions varies significantly on a highly localised scale, with conditions +rapidly changing even within neighbourhoods. Our results also reveal that the +high real-estate prices in the city centre cannot be attributed to attractive +street-level conditions. Furthermore, whereas the city centre is characterised +by relatively unattractive residential street-level conditions, neighbourhoods +in the southern part of the city -- often perceived as problematic -- exhibit +surprisingly appealing street-level environments. The methodological +contribution of this paper is that it advances the discrete choice models +incorporating computer vision capabilities by introducing a semantic +regularisation layer to the model. Thereby, it adds explainability and +eliminates the need for a separate pipeline to extract information from images, +streamlining the analysis. As such, this paper's findings and methodological +advancements pave the way for further studies to explore integrating +street-level conditions in urban planning. + +
+
+
+
+
+ + ☆ CASCRNet: An Atrous Spatial Pyramid Pooling and Shared Channel Residual + based Network for Capsule Endoscopy + + +
+ This manuscript summarizes work on the Capsule Vision Challenge 2024 by
+MISAHUB. To address the multi-class disease classification task, which is
+challenging due to the complexity and imbalance in the Capsule Vision challenge
+dataset, this paper proposes CASCRNet (Capsule endoscopy-Aspp-SCR-Network), a
+parameter-efficient and novel model that uses Shared Channel Residual (SCR)
+blocks and Atrous Spatial Pyramid Pooling (ASPP) blocks. Further, the
+performance of the proposed model is compared with other well-known approaches.
+The experimental results show that the proposed model provides better disease
+classification results. The proposed model was successful in classifying
+diseases with an F1 Score of 78.5% and a Mean AUC of 98.3%, which is promising
+given its compact architecture.
+
+
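+ For readers unfamiliar with the ASPP blocks mentioned above, a minimal PyTorch sketch of atrous spatial pyramid pooling follows; the channel counts and dilation rates are illustrative assumptions rather than the CASCRNet configuration.

```python
import torch
import torch.nn as nn

class ASPP(nn.Module):
    """Minimal Atrous Spatial Pyramid Pooling: parallel dilated convolutions
    whose outputs are concatenated and fused with a 1x1 convolution."""
    def __init__(self, in_ch: int, out_ch: int, rates=(1, 6, 12, 18)):
        super().__init__()
        self.branches = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=r, dilation=r, bias=False),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(inplace=True),
            )
            for r in rates
        ])
        self.fuse = nn.Conv2d(out_ch * len(rates), out_ch, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fuse(torch.cat([b(x) for b in self.branches], dim=1))

# Example: a feature map from a capsule-endoscopy image backbone.
feats = torch.randn(2, 256, 28, 28)
print(ASPP(256, 64)(feats).shape)  # torch.Size([2, 64, 28, 28])
```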
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Blendify -- Python rendering framework for Blender + + +
+ With the rapid growth of research fields such as computer vision and computer
+graphics, researchers require effective and user-friendly rendering tools to
+visualize results. While advanced tools like Blender offer powerful
+capabilities, they also require a significant effort to master. This technical
+report introduces Blendify, a lightweight Python-based framework that
+seamlessly integrates with Blender, providing a high-level API for scene
+creation and rendering. Blendify reduces the complexity of working with
+Blender's native API by automating object creation, handling color and material
+linking, and implementing features such as shadow-catcher objects while
+maintaining support for high-quality ray-tracing rendering output. With a focus
+on usability, Blendify enables an efficient and flexible rendering workflow for
+common computer vision and computer graphics use cases. The code is available
+at https://github.com/ptrvilya/blendify
+
+
+
+ comment: Project page: https://virtualhumans.mpi-inf.mpg.de/blendify/ +
+
+
+
+
+ + ☆ ROCKET-1: Master Open-World Interaction with Visual-Temporal Context + Prompting + + +
+ Vision-language models (VLMs) have excelled in multimodal tasks, but adapting
+them to embodied decision-making in open-world environments presents
+challenges. A key issue is the difficulty in smoothly connecting individual
+entities in low-level observations with abstract concepts required for
+planning. A common approach to address this problem is through the use of
+hierarchical agents, where VLMs serve as high-level reasoners that break down
+tasks into executable sub-tasks, typically specified using language and
+imagined observations. However, language often fails to effectively convey
+spatial information, while generating future images with sufficient accuracy
+remains challenging. To address these limitations, we propose visual-temporal
+context prompting, a novel communication protocol between VLMs and policy
+models. This protocol leverages object segmentation from both past and present
+observations to guide policy-environment interactions. Using this approach, we
+train ROCKET-1, a low-level policy that predicts actions based on concatenated
+visual observations and segmentation masks, with real-time object tracking
+provided by SAM-2. Our method unlocks the full potential of VLMs'
+visual-language reasoning abilities, enabling them to solve complex creative
+tasks, especially those heavily reliant on spatial understanding. Experiments
+in Minecraft demonstrate that our approach allows agents to accomplish
+previously unattainable tasks, highlighting the effectiveness of
+visual-temporal context prompting in embodied decision-making. Codes and demos
+will be available on the project page: https://craftjarvis.github.io/ROCKET-1.
+
+
+
+
+
+
+ + ☆ TAGE: Trustworthy Attribute Group Editing for Stable Few-shot Image + Generation + + +
+ Generative Adversarial Networks (GANs) have emerged as a prominent research
+focus for image editing tasks, leveraging the powerful image generation
+capabilities of the GAN framework to produce remarkable results. However,
+prevailing approaches are contingent upon extensive training datasets and
+explicit supervision, presenting a significant challenge in manipulating the
+diverse attributes of new image classes with limited sample availability. To
+surmount this hurdle, we introduce TAGE, an innovative image generation network
+comprising three integral modules: the Codebook Learning Module (CLM), the Code
+Prediction Module (CPM) and the Prompt-driven Semantic Module (PSM). The CLM
+delves into the semantic dimensions of category-agnostic attributes,
+encapsulating them within a discrete codebook. This module is predicated on the
+concept that images are assemblages of attributes, and thus, by editing these
+category-independent attributes, it is theoretically possible to generate
+images from unseen categories. Subsequently, the CPM module facilitates
+naturalistic image editing by predicting indices of category-independent
+attribute vectors within the codebook. Additionally, the PSM module generates
+semantic cues that are seamlessly integrated into the Transformer architecture
+of the CPM, enhancing the model's comprehension of the targeted attributes for
+editing. With these semantic cues, the model can generate images that
+accentuate desired attributes more prominently while maintaining the integrity
+of the original category, even with a limited number of samples. We have
+conducted extensive experiments utilizing the Animal Faces, Flowers, and
+VGGFaces datasets. The results of these experiments demonstrate that our
+proposed method not only achieves superior performance but also exhibits a high
+degree of stability when compared to other few-shot image generation
+techniques.
+
+
+
+ comment: Accepted by the International Conference on Signal Processing
+ Systems
+
+
+
+
+ + ☆ Few-shot NeRF by Adaptive Rendering Loss Regularization ECCV2024 + + +
+ Novel view synthesis with sparse inputs poses great challenges to Neural +Radiance Field (NeRF). Recent works demonstrate that the frequency +regularization of Positional Encoding (PE) can achieve promising results for +few-shot NeRF. In this work, we reveal that there exists an inconsistency +between the frequency regularization of PE and rendering loss. This prevents +few-shot NeRF from synthesizing higher-quality novel views. To mitigate this +inconsistency, we propose Adaptive Rendering loss regularization for few-shot +NeRF, dubbed AR-NeRF. Specifically, we present a two-phase rendering +supervision and an adaptive rendering loss weight learning strategy to align +the frequency relationship between PE and 2D-pixel supervision. In this way, +AR-NeRF can learn global structures better in the early training phase and +adaptively learn local details throughout the training process. Extensive +experiments show that our AR-NeRF achieves state-of-the-art performance on +different datasets, including object-level and complex scenes. + +
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ☆ Exploiting Text-Image Latent Spaces for the Description of Visual + Concepts ICPR + + +
+ Concept Activation Vectors (CAVs) offer insights into neural network
+decision-making by linking human-friendly concepts to the model's internal
+feature extraction process. However, when a new set of CAVs is discovered, they
+must still be translated into a human-understandable description. For
+image-based neural networks, this is typically done by visualizing the most
+relevant images of a CAV, while the determination of the concept is left to
+humans. In this work, we introduce an approach to aid the interpretation of
+newly discovered concept sets by suggesting textual descriptions for each CAV.
+This is done by mapping the most relevant images representing a CAV into a
+text-image embedding where a joint description of these relevant images can be
+computed. We propose utilizing the most relevant receptive fields instead of
+encoding full images. We demonstrate the capabilities of this approach in
+multiple experiments with and without given CAV labels, showing that the
+proposed approach provides accurate descriptions for the CAVs and reduces the
+challenge of concept interpretation.
+
+
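+ One way to realise the described mapping is to embed the most relevant crops with an off-the-shelf text-image model and score candidate descriptions by cosine similarity to their mean embedding. The sketch below uses OpenAI's CLIP purely for illustration; the candidate phrases, placeholder crops, and the choice of CLIP are assumptions, not the authors' exact pipeline.

```python
import torch
import clip  # pip install git+https://github.com/openai/CLIP.git
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Solid-colour placeholders stand in for the most relevant receptive-field
# crops of one CAV; a pool of hypothetical candidate descriptions follows.
crops_pil = [Image.new("RGB", (224, 224), c) for c in ("red", "green", "blue")]
candidates = ["striped texture", "dog ear", "wheel", "green foliage"]

with torch.no_grad():
    crops = torch.stack([preprocess(im) for im in crops_pil]).to(device)
    img_emb = model.encode_image(crops).float()
    img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
    cav_emb = img_emb.mean(dim=0, keepdim=True)  # joint embedding of the CAV's crops

    txt_emb = model.encode_text(clip.tokenize(candidates).to(device)).float()
    txt_emb = txt_emb / txt_emb.norm(dim=-1, keepdim=True)

    scores = (cav_emb @ txt_emb.T).squeeze(0)    # cosine similarity per candidate
    print(candidates[scores.argmax().item()])    # suggested description for the CAV
```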
+
+ comment: 19 pages, 7 figures, to be published in ICPR +
+
+
+
+
+ + ☆ Att2CPC: Attention-Guided Lossy Attribute Compression of Point Clouds + + +
+ With the great progress of 3D sensing and acquisition technology, the volume
+of point cloud data has grown dramatically, which urges the development of
+efficient point cloud compression methods. In this paper, we focus on the task
+of learned lossy point cloud attribute compression (PCAC). We propose an
+efficient attention-based method for lossy compression of point cloud
+attributes leveraging an autoencoder architecture. Specifically, at the
+encoding side, we conduct multiple downsampling to best exploit the local
+attribute patterns, in which an effective External Cross Attention (ECA) is
+devised to hierarchically aggregate features by integrating attributes and
+geometry contexts. At the decoding side, the attributes of the point cloud are
+progressively reconstructed based on the multi-scale representation and the
+zero-padding upsampling tactic. To the best of our knowledge, this is the first
+approach to introduce an attention mechanism to the point-based lossy PCAC
+task. We verify the compression efficiency of our model on various sequences,
+including human body frames, sparse objects, and large-scale point cloud
+scenes. Experiments show that our method achieves an average improvement of
+1.15 dB and 2.13 dB in BD-PSNR of the Y channel and YUV channels, respectively,
+when compared with the state-of-the-art point-based method Deep-PCAC. Codes of
+this paper are available at https://github.com/I2-Multimedia-Lab/Att2CPC.
+
+
+
+
+
+
+ + ☆ DREB-Net: Dual-stream Restoration Embedding Blur-feature Fusion Network + for High-mobility UAV Object Detection + + +
+ Object detection algorithms are pivotal components of unmanned aerial vehicle
+(UAV) imaging systems, extensively employed in complex fields. However, images
+captured by high-mobility UAVs often suffer from motion blur, which
+significantly impedes the performance of advanced object detection algorithms.
+To address these challenges, we propose an innovative object detection
+algorithm specifically designed for blurry images, named DREB-Net (Dual-stream
+Restoration Embedding Blur-feature Fusion Network). First, DREB-Net addresses
+the particularities of the blurry-image object detection problem by
+incorporating a Blurry image Restoration Auxiliary Branch (BRAB) during the
+training phase. Second, it fuses the extracted shallow features via a
+Multi-level Attention-Guided Feature Fusion (MAGFF) module to extract richer
+features. Here, the MAGFF module comprises local attention modules and global
+attention modules, which assign different weights to the branches. Then, during
+the inference phase, the deep feature extraction of the BRAB can be removed to
+reduce computational complexity and improve detection speed. In the loss
+function, a combined MSE and SSIM loss is added to the BRAB to restore blurry
+images. Finally, DREB-Net introduces the Fast Fourier Transform in the early
+stages of feature extraction, via a Learnable Frequency domain Amplitude
+Modulation Module (LFAMM), to adjust feature amplitude and enhance feature
+processing capability. Experimental results indicate that DREB-Net can still
+effectively perform object detection tasks under motion blur in captured
+images, showcasing excellent performance and broad application prospects. Our
+source code will be available at https://github.com/EEIC-Lab/DREB-Net.git.
+
+
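+ The LFAMM idea of adjusting feature amplitudes in the frequency domain can be sketched as follows: take an FFT of the feature map, rescale its magnitude with a learnable per-frequency weight, and transform back. DREB-Net's exact parameterisation is not given in the abstract; this is a simplified assumption.

```python
import torch
import torch.nn as nn

class FrequencyAmplitudeModulation(nn.Module):
    """Simplified learnable frequency-domain amplitude modulation.

    A learnable weight map rescales the FFT magnitude of each feature channel
    while keeping the phase, then transforms back to the spatial domain.
    """
    def __init__(self, channels: int, height: int, width: int):
        super().__init__()
        # rfft2 keeps width // 2 + 1 frequency bins along the last axis.
        self.weight = nn.Parameter(torch.ones(channels, height, width // 2 + 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        spec = torch.fft.rfft2(x, norm="ortho")
        amplitude = spec.abs() * self.weight      # modulate magnitude only
        phase = torch.angle(spec)
        spec_mod = torch.polar(amplitude, phase)  # recombine into a complex spectrum
        return torch.fft.irfft2(spec_mod, s=x.shape[-2:], norm="ortho")

feats = torch.randn(2, 64, 80, 80)
print(FrequencyAmplitudeModulation(64, 80, 80)(feats).shape)  # torch.Size([2, 64, 80, 80])
```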
+
+
+
+
+ + ☆ Deep Learning for Active Region Classification: A Systematic Study from + Convolutional Neural Networks to Vision Transformers + + +
+ A solar active region can significantly disrupt the Sun-Earth space
+environment, often leading to severe space weather events such as solar flares
+and coronal mass ejections. As a consequence, the automatic classification of
+active region groups is the crucial starting point for accurately and promptly
+predicting solar activity. This study presents our results concerning the
+application of deep learning techniques to the classification of active region
+cutouts based on the Mount Wilson classification scheme. Specifically, we have
+explored the latest advancements in image classification architectures, from
+Convolutional Neural Networks to Vision Transformers, and reported on their
+performance for the active region classification task, showing that the
+crucial point for their effectiveness is a robust training process based on the
+latest advances in the field.
+
+
+
+
+
+
+ + ☆ Learning Lossless Compression for High Bit-Depth Volumetric Medical + Image + + +
+ Recent advances in learning-based methods have markedly enhanced the +capabilities of image compression. However, these methods struggle with high +bit-depth volumetric medical images, facing issues such as degraded +performance, increased memory demand, and reduced processing speed. To address +these challenges, this paper presents the Bit-Division based Lossless +Volumetric Image Compression (BD-LVIC) framework, which is tailored for high +bit-depth medical volume compression. The BD-LVIC framework skillfully divides +the high bit-depth volume into two lower bit-depth segments: the Most +Significant Bit-Volume (MSBV) and the Least Significant Bit-Volume (LSBV). The +MSBV concentrates on the most significant bits of the volumetric medical image, +capturing vital structural details in a compact manner. This reduction in +complexity greatly improves compression efficiency using traditional codecs. +Conversely, the LSBV deals with the least significant bits, which encapsulate +intricate texture details. To compress this detailed information effectively, +we introduce an effective learning-based compression model equipped with a +Transformer-Based Feature Alignment Module, which exploits both intra-slice and +inter-slice redundancies to accurately align features. Subsequently, a Parallel +Autoregressive Coding Module merges these features to precisely estimate the +probability distribution of the least significant bit-planes. Our extensive +testing demonstrates that the BD-LVIC framework not only sets new performance +benchmarks across various datasets but also maintains a competitive coding +speed, highlighting its significant potential and practical utility in the +realm of volumetric medical image compression. + +
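+ The bit-division step itself is simple to illustrate; the sketch below splits a 16-bit container volume into an 8-bit MSBV and an 8-bit LSBV and reassembles it losslessly, assuming an 8/8 split, which may differ from the split used in BD-LVIC.

```python
import numpy as np

def bit_divide(volume: np.ndarray, low_bits: int = 8):
    """Split a high bit-depth volume into most/least significant bit volumes."""
    msbv = (volume >> low_bits).astype(np.uint8)               # structural content
    lsbv = (volume & ((1 << low_bits) - 1)).astype(np.uint8)   # texture-like residual
    return msbv, lsbv

def bit_merge(msbv: np.ndarray, lsbv: np.ndarray, low_bits: int = 8) -> np.ndarray:
    return (msbv.astype(np.uint16) << low_bits) | lsbv.astype(np.uint16)

ct = np.random.randint(0, 4096, size=(64, 256, 256), dtype=np.uint16)  # 12-bit CT volume
msbv, lsbv = bit_divide(ct)
assert np.array_equal(bit_merge(msbv, lsbv), ct)  # the division is lossless
```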
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ PGDiffSeg: Prior-Guided Denoising Diffusion Model with Parameter-Shared + Attention for Breast Cancer Segmentation + + +
+ Early detection through imaging and accurate diagnosis is crucial in
+mitigating the high mortality rate associated with breast cancer. However,
+locating tumors from low-resolution and high-noise medical images is extremely
+challenging. Therefore, this paper proposes a novel PGDiffSeg (Prior-Guided
+Diffusion Denoising Model with Parameter-Shared Attention) that applies
+diffusion denoising methods to breast cancer medical image segmentation,
+accurately recovering the affected areas from Gaussian noise. Firstly, we
+design a parallel pipeline for noise processing and semantic information
+processing and propose a multi-layer parameter-shared attention (PSA) module
+that seamlessly integrates these two pipelines. This integration empowers
+PGDiffSeg to incorporate semantic details at multiple levels during the
+denoising process, producing highly accurate segmentation maps. Secondly, we
+introduce a guided strategy that leverages prior knowledge to simulate the
+decision-making process of medical professionals, thereby enhancing the model's
+ability to locate tumor positions precisely. Finally, we provide the first-ever
+discussion on the interpretability of the generative diffusion model in the
+context of breast cancer segmentation. Extensive experiments have demonstrated
+the superiority of our model over the current state-of-the-art approaches,
+confirming its effectiveness as a flexible diffusion denoising method suitable
+for medical image research. Our code will be publicly available later.
+
+
+
+
+
+
+ + ☆ EntityCLIP: Entity-Centric Image-Text Matching via Multimodal Attentive + Contrastive Learning + + +
+ Recent advancements in image-text matching have been notable, yet prevailing
+models predominantly cater to broad queries and struggle with accommodating
+fine-grained query intention. In this paper, we work towards the
+\textbf{E}ntity-centric \textbf{I}mage-\textbf{T}ext \textbf{M}atching (EITM)
+task, in which the text and image involve specific entity-related information.
+The challenge of this task mainly lies in the larger semantic gap in entity
+association modeling compared with the general image-text matching problem. To
+narrow the huge semantic gap between the entity-centric text and the images, we
+take the fundamental CLIP as the backbone and devise a multimodal attentive
+contrastive learning framework to tame CLIP to adapt to the EITM problem,
+developing a model named EntityCLIP. The key of our multimodal attentive
+contrastive learning is to generate interpretive explanation text using Large
+Language Models (LLMs) as the bridge clues. Specifically, we proceed by
+extracting explanatory text from off-the-shelf LLMs. This explanation text,
+coupled with the image and text, is then input into our specially crafted
+Multimodal Attentive Experts (MMAE) module, which effectively integrates
+explanation texts to narrow the gap between the entity-related text and image
+in a shared semantic space. Building on the enriched features derived from
+MMAE, we further design an effective Gated Integrative Image-text Matching
+(GI-ITM) strategy. The GI-ITM employs an adaptive gating mechanism to aggregate
+MMAE's features, subsequently applying image-text matching constraints to steer
+the alignment between the text and the image. Extensive experiments are
+conducted on three social media news benchmarks including N24News, VisualNews,
+and GoodNews; the results show that our method surpasses competing methods by a
+clear margin.
+
+
+
+
+
+
+ + ☆ An Intelligent Agentic System for Complex Image Restoration Problems + + +
+ Real-world image restoration (IR) is inherently complex and often requires +combining multiple specialized models to address diverse degradations. Inspired +by human problem-solving, we propose AgenticIR, an agentic system that mimics +the human approach to image processing by following five key stages: +Perception, Scheduling, Execution, Reflection, and Rescheduling. AgenticIR +leverages large language models (LLMs) and vision-language models (VLMs) that +interact via text generation to dynamically operate a toolbox of IR models. We +fine-tune VLMs for image quality analysis and employ LLMs for reasoning, +guiding the system step by step. To compensate for LLMs' lack of specific IR +knowledge and experience, we introduce a self-exploration method, allowing the +LLM to observe and summarize restoration results into referenceable documents. +Experiments demonstrate AgenticIR's potential in handling complex IR tasks, +representing a promising path toward achieving general intelligence in visual +processing. + +
+
+
+
+
+ + ☆ GenUDC: High Quality 3D Mesh Generation with Unsigned Dual Contouring + Representation + + +
+ Generating high-quality meshes with complex structures and realistic surfaces +is the primary goal of 3D generative models. Existing methods typically employ +sequence data or deformable tetrahedral grids for mesh generation. However, +sequence-based methods have difficulty producing complex structures with many +faces due to memory limits. The deformable tetrahedral grid-based method +MeshDiffusion fails to recover realistic surfaces due to the inherent ambiguity +in deformable grids. We propose the GenUDC framework to address these +challenges by leveraging the Unsigned Dual Contouring (UDC) as the mesh +representation. UDC discretizes a mesh in a regular grid and divides it into +the face and vertex parts, recovering both complex structures and fine details. +As a result, the one-to-one mapping between UDC and mesh resolves the ambiguity +problem. In addition, GenUDC adopts a two-stage, coarse-to-fine generative +process for 3D mesh generation. It first generates the face part as a rough +shape and then the vertex part to craft a detailed shape. Extensive evaluations +demonstrate the superiority of UDC as a mesh representation and the favorable +performance of GenUDC in mesh generation. The code and trained models are +available at https://github.com/TrepangCat/GenUDC. + +
+
+ comment: ACMMM 2024, code:https://github.com/TrepangCat/GenUDC +
+
+
+
+
+ + ☆ TranSPORTmer: A Holistic Approach to Trajectory Understanding in + Multi-Agent Sports ACCV 2024 + + +
+ Understanding trajectories in multi-agent scenarios requires addressing +various tasks, including predicting future movements, imputing missing +observations, inferring the status of unseen agents, and classifying different +global states. Traditional data-driven approaches often handle these tasks +separately with specialized models. We introduce TranSPORTmer, a unified +transformer-based framework capable of addressing all these tasks, showcasing +its application to the intricate dynamics of multi-agent sports scenarios like +soccer and basketball. Using Set Attention Blocks, TranSPORTmer effectively +captures temporal dynamics and social interactions in an equivariant manner. +The model's tasks are guided by an input mask that conceals missing or +yet-to-be-predicted observations. Additionally, we introduce a CLS extra agent +to classify states along soccer trajectories, including passes, possessions, +uncontrolled states, and out-of-play intervals, contributing to an enhancement +in modeling trajectories. Evaluations on soccer and basketball datasets show +that TranSPORTmer outperforms state-of-the-art task-specific models in player +forecasting, player forecasting-imputation, ball inference, and ball +imputation. https://youtu.be/8VtSRm8oGoE + +
+
+ comment: Accepted to ACCV 2024 +
+
+
+
+
+ + ☆ ADEM-VL: Adaptive and Embedded Fusion for Efficient Vision-Language + Tuning + + +
+ Recent advancements in multimodal fusion have witnessed the remarkable +success of vision-language (VL) models, which excel in various multimodal +applications such as image captioning and visual question answering. However, +building VL models requires substantial hardware resources, where efficiency is +restricted by two key factors: the extended input sequence of the language +model with vision features demands more computational operations, and a large +number of additional learnable parameters increase memory complexity. These +challenges significantly restrict the broader applicability of such models. To +bridge this gap, we propose ADEM-VL, an efficient vision-language method that +tunes VL models based on pretrained large language models (LLMs) by adopting a +parameter-free cross-attention mechanism for similarity measurements in +multimodal fusion. This approach only requires embedding vision features into +the language space, significantly reducing the number of trainable parameters +and accelerating both training and inference speeds. To enhance representation +learning in fusion module, we introduce an efficient multiscale feature +generation scheme that requires only a single forward pass through the vision +encoder. Moreover, we propose an adaptive fusion scheme that dynamically +discards less relevant visual information for each text token based on its +attention score. This ensures that the fusion process prioritizes the most +pertinent visual features. With experiments on various tasks including visual +question answering, image captioning, and instruction-following, we demonstrate +that our framework outperforms existing approaches. Specifically, our method +surpasses existing methods by an average accuracy of 0.77% on ScienceQA +dataset, with reduced training and inference latency, demonstrating the +superiority of our framework. The code is available at +https://github.com/Hao840/ADEM-VL. + +
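+ A minimal sketch of the parameter-free cross-attention idea described above: similarity between text tokens and vision features is computed directly with dot products (no learnable projection matrices), and the attended visual context is added back to the language stream. Dimensions and normalisation choices are illustrative assumptions, not the ADEM-VL implementation.

```python
import torch
import torch.nn.functional as F

def parameter_free_cross_attention(text_tokens: torch.Tensor,
                                   vision_feats: torch.Tensor) -> torch.Tensor:
    """text_tokens: (B, T, D), vision_feats: (B, V, D), both assumed to be
    already embedded in the language space. No learnable weights are involved."""
    d = text_tokens.shape[-1]
    scores = text_tokens @ vision_feats.transpose(1, 2) / d ** 0.5  # (B, T, V)
    attn = F.softmax(scores, dim=-1)
    context = attn @ vision_feats                                   # (B, T, D)
    return text_tokens + context  # fuse visual context into the text stream

text = torch.randn(2, 16, 512)
vision = torch.randn(2, 49, 512)
print(parameter_free_cross_attention(text, vision).shape)  # torch.Size([2, 16, 512])
```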
+
+
+
+
+ + ☆ Quasi-Medial Distance Field (Q-MDF): A Robust Method for Approximating + and Discretizing Neural Medial Axis + + +
+ The medial axis, a lower-dimensional shape descriptor, plays an important
+role in the field of digital geometry processing. Despite its importance,
+robust computation of the medial axis transform from diverse inputs, especially
+point clouds with defects, remains a significant challenge. In this paper, we
+tackle the challenge by proposing a new implicit method that diverges from
+mainstream explicit medial axis computation techniques. Our key technical
+insight is that the difference between the signed distance field (SDF) and the
+medial field (MF) of a solid shape is the unsigned distance field (UDF) of the
+shape's medial axis. This allows for formulating medial axis computation as an
+implicit reconstruction problem. Utilizing a modified double covering method,
+we extract the medial axis as the zero level-set of the UDF. Extensive
+experiments show that our method has enhanced accuracy and robustness in
+learning compact medial axis transforms from thorny meshes and point clouds
+compared to existing methods.
+
+
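+ Written out, the insight above reads as follows (with $\Omega$ the solid shape and $\mathcal{M}$ its medial axis; the absolute value only reflects that a UDF is non-negative, and the paper's exact sign convention for the SDF is not restated in this abstract):
+ $$\mathrm{UDF}_{\mathcal{M}}(x) \;=\; \bigl|\,\mathrm{SDF}_{\Omega}(x) - \mathrm{MF}_{\Omega}(x)\,\bigr|,$$
+ so the medial axis is recovered as the zero level-set $\{x : \mathrm{UDF}_{\mathcal{M}}(x) = 0\}$ of the learned field.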
+
+
+
+
+ + ☆ Scaling Robot Policy Learning via Zero-Shot Labeling with Foundation + Models + + +
+ A central challenge towards developing robots that can relate human language +to their perception and actions is the scarcity of natural language annotations +in diverse robot datasets. Moreover, robot policies that follow natural +language instructions are typically trained on either templated language or +expensive human-labeled instructions, hindering their scalability. To this end, +we introduce NILS: Natural language Instruction Labeling for Scalability. NILS +automatically labels uncurated, long-horizon robot data at scale in a zero-shot +manner without any human intervention. NILS combines pretrained vision-language +foundation models in order to detect objects in a scene, detect object-centric +changes, segment tasks from large datasets of unlabelled interaction data and +ultimately label behavior datasets. Evaluations on BridgeV2, Fractal, and a +kitchen play dataset show that NILS can autonomously annotate diverse robot +demonstrations of unlabeled and unstructured datasets while alleviating several +shortcomings of crowdsourced human annotations, such as low data quality and +diversity. We use NILS to label over 115k trajectories obtained from over 430 +hours of robot data. We open-source our auto-labeling code and generated +annotations on our website: http://robottasklabeling.github.io. + +
+
+ comment: Project Website at https://robottasklabeling.github.io/ +
+
+
+
+
+ + ☆ AdaDiffSR: Adaptive Region-aware Dynamic Acceleration Diffusion Model + for Real-World Image Super-Resolution ECCV2024 + + +
+ Diffusion models (DMs) have shown promising results on single-image +super-resolution and other image-to-image translation tasks. Benefiting from +more computational resources and longer inference times, they are able to yield +more realistic images. Existing DMs-based super-resolution methods try to +achieve an overall average recovery over all regions via iterative refinement, +ignoring the consideration that different input image regions require different +timesteps to reconstruct. In this work, we notice that previous DMs-based +super-resolution methods suffer from wasting computational resources to +reconstruct invisible details. To further improve the utilization of +computational resources, we propose AdaDiffSR, a DMs-based SR pipeline with +dynamic timesteps sampling strategy (DTSS). Specifically, by introducing the +multi-metrics latent entropy module (MMLE), we can achieve dynamic perception +of the latent spatial information gain during the denoising process, thereby +guiding the dynamic selection of the timesteps. In addition, we adopt a +progressive feature injection module (PFJ), which dynamically injects the +original image features into the denoising process based on the current +information gain, so as to generate images with both fidelity and realism. +Experiments show that our AdaDiffSR achieves comparable performance over +current state-of-the-art DMs-based SR methods while consuming less +computational resources and inference time on both synthetic and real-world +datasets. + +
+
+ comment: 18 pages, 6 figures, ECCV2024 accepted +
+
+
+
+
+ + ☆ VISAGE: Video Synthesis using Action Graphs for Surgery MICCAI 2024 + + +
+ Surgical data science (SDS) is a field that analyzes patient data before, +during, and after surgery to improve surgical outcomes and skills. However, +surgical data is scarce, heterogeneous, and complex, which limits the +applicability of existing machine learning methods. In this work, we introduce +the novel task of future video generation in laparoscopic surgery. This task +can augment and enrich the existing surgical data and enable various +applications, such as simulation, analysis, and robot-aided surgery. +Ultimately, it involves not only understanding the current state of the +operation but also accurately predicting the dynamic and often unpredictable +nature of surgical procedures. Our proposed method, VISAGE (VIdeo Synthesis +using Action Graphs for Surgery), leverages the power of action scene graphs to +capture the sequential nature of laparoscopic procedures and utilizes diffusion +models to synthesize temporally coherent video sequences. VISAGE predicts the +future frames given only a single initial frame, and the action graph triplets. +By incorporating domain-specific knowledge through the action graph, VISAGE +ensures the generated videos adhere to the expected visual and motion patterns +observed in real laparoscopic procedures. The results of our experiments +demonstrate high-fidelity video generation for laparoscopy procedures, which +enables various applications in SDS. + +
+
+ comment: Accepted at MICCAI 2024 Embodied AI and Robotics for HealTHcare + (EARTH) Workshop +
+
+
+
+
+ + ☆ Efficient Neural Implicit Representation for 3D Human Reconstruction + + +
+ High-fidelity digital human representations are increasingly in demand in the +digital world, particularly for interactive telepresence, AR/VR, 3D graphics, +and the rapidly evolving metaverse. Even though they work well in small spaces, +conventional methods for reconstructing 3D human motion frequently require the +use of expensive hardware and have high processing costs. This study presents +HumanAvatar, an innovative approach that efficiently reconstructs precise human +avatars from monocular video sources. At the core of our methodology, we +integrate the pre-trained HuMoR, a model celebrated for its proficiency in +human motion estimation. This is adeptly fused with the cutting-edge neural +radiance field technology, Instant-NGP, and the state-of-the-art articulated +model, Fast-SNARF, to enhance the reconstruction fidelity and speed. By +combining these two technologies, a system is created that can render quickly +and effectively while also providing estimation of human pose parameters that +are unmatched in accuracy. We have enhanced our system with an advanced +posture-sensitive space reduction technique, which optimally balances rendering +quality with computational efficiency. In our detailed experimental analysis +using both artificial and real-world monocular videos, we establish the +advanced performance of our approach. HumanAvatar consistently equals or +surpasses contemporary leading-edge reconstruction techniques in quality. +Furthermore, it achieves these complex reconstructions in minutes, a fraction +of the time typically required by existing methods. Our models achieve a +training speed that is 110X faster than that of State-of-The-Art (SoTA) +NeRF-based models. Our technique performs noticeably better than SoTA dynamic +human NeRF methods if given an identical runtime limit. HumanAvatar can provide +effective visuals after only 30 seconds of training. + +
+
+
+
+
+ + ☆ Emotion Recognition with Facial Attention and Objective Activation + Functions + + +
+ In this paper, we study the effect of introducing channel and spatial
+attention mechanisms, namely SEN-Net, ECA-Net, and CBAM, to existing CNN
+vision-based models such as VGGNet, ResNet, and ResNetV2 to perform the Facial
+Emotion Recognition task. We show that not only can attention significantly
+improve the performance of these models, but combining it with a different
+activation function can increase their performance even further.
+
+
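+ As a concrete reference for one of the channel-attention mechanisms named above, a minimal squeeze-and-excitation (SE) block is sketched below; the reduction ratio and its placement inside the VGG or ResNet stages are illustrative assumptions.

```python
import torch
import torch.nn as nn

class SEBlock(nn.Module):
    """Squeeze-and-Excitation channel attention: global-average-pool the feature
    map, pass it through a small bottleneck MLP, and rescale each channel."""
    def __init__(self, channels: int, reduction: int = 16):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels),
            nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, c, _, _ = x.shape
        weights = self.fc(x.mean(dim=(2, 3))).view(b, c, 1, 1)  # per-channel gates
        return x * weights

feats = torch.randn(4, 128, 14, 14)
print(SEBlock(128)(feats).shape)  # torch.Size([4, 128, 14, 14])
```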
+
+
+
+
+ + ☆ New Insight in Cervical Cancer Diagnosis Using Convolution Neural + Network Architecture + + +
+ The Pap smear is a screening method for early cervical cancer diagnosis. The
+selection of the right optimizer in the convolutional neural network (CNN)
+model is key to the success of the CNN in image classification, including the
+classification of cervical cancer Pap smear images. In this study, stochastic
+gradient descent (SGD), RMSprop, Adam, AdaGrad, AdaDelta, Adamax, and Nadam
+optimizers were used to classify cervical cancer Pap smear images from the
+SipakMed dataset. Resnet-18, Resnet-34, and VGG-16 are the CNN architectures
+used in this study, and each architecture uses a transfer-learning model. Based
+on the test results, we conclude that the transfer learning model performs
+better on all CNNs and optimization techniques and that, in the transfer
+learning model, the optimizer has little influence on the training of the
+model. Adamax achieved the best accuracy for the VGG-16 and Resnet-18
+architectures, with values of 72.8% and 66.8%, respectively; on Resnet-34 it
+reached 54.0%, which is 0.034% lower than Nadam. Overall, Adamax is a suitable
+optimizer for CNNs in cervical cancer classification on the Resnet-18,
+Resnet-34, and VGG-16 architectures. This study provides new insights into the
+configuration of CNN models for Pap smear image analysis.
+
+
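+ A minimal sketch of the transfer-learning setup the study describes (an ImageNet-pretrained ResNet-18 with a replaced classification head, trained with Adamax) follows; the class count, learning rate, and data pipeline are assumptions, not the study's exact configuration.

```python
import torch
import torch.nn as nn
from torchvision import models

num_classes = 5  # assumed number of Pap smear cell categories, for illustration

# ImageNet-pretrained backbone with a replaced classification head.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model.fc = nn.Linear(model.fc.in_features, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adamax(model.parameters(), lr=1e-3)  # one of the optimizers under study

def train_step(images: torch.Tensor, labels: torch.Tensor) -> float:
    optimizer.zero_grad()
    loss = criterion(model(images), labels)
    loss.backward()
    optimizer.step()
    return loss.item()

# Toy batch standing in for real Pap smear images.
print(train_step(torch.randn(8, 3, 224, 224), torch.randint(0, num_classes, (8,))))
```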
+
+
+
+
+ + ☆ YOLO-Vehicle-Pro: A Cloud-Edge Collaborative Framework for Object + Detection in Autonomous Driving under Adverse Weather Conditions + + +
+ With the rapid advancement of autonomous driving technology, efficient and +accurate object detection capabilities have become crucial factors in ensuring +the safety and reliability of autonomous driving systems. However, in +low-visibility environments such as hazy conditions, the performance of +traditional object detection algorithms often degrades significantly, failing +to meet the demands of autonomous driving. To address this challenge, this +paper proposes two innovative deep learning models: YOLO-Vehicle and +YOLO-Vehicle-Pro. YOLO-Vehicle is an object detection model tailored +specifically for autonomous driving scenarios, employing multimodal fusion +techniques to combine image and textual information for object detection. +YOLO-Vehicle-Pro builds upon this foundation by introducing an improved image +dehazing algorithm, enhancing detection performance in low-visibility +environments. In addition to model innovation, this paper also designs and +implements a cloud-edge collaborative object detection system, deploying models +on edge devices and offloading partial computational tasks to the cloud in +complex situations. Experimental results demonstrate that on the KITTI dataset, +the YOLO-Vehicle-v1s model achieved 92.1% accuracy while maintaining a +detection speed of 226 FPS and an inference time of 12ms, meeting the real-time +requirements of autonomous driving. When processing hazy images, the +YOLO-Vehicle-Pro model achieved a high accuracy of 82.3% mAP@50 on the Foggy +Cityscapes dataset while maintaining a detection speed of 43 FPS. + +
+
+
+
+
+ + ☆ YOLOv11: An Overview of the Key Architectural Enhancements + + +
+ This study presents an architectural analysis of YOLOv11, the latest
+iteration in the YOLO (You Only Look Once) series of object detection models.
+We examine the model's architectural innovations, including the introduction of
+the C3k2 (Cross Stage Partial with kernel size 2) block, SPPF (Spatial Pyramid
+Pooling - Fast), and C2PSA (Convolutional block with Parallel Spatial
+Attention) components, which contribute to improving the model's performance in
+several ways, such as enhanced feature extraction. The paper explores YOLOv11's
+expanded capabilities across various computer vision tasks, including object
+detection, instance segmentation, pose estimation, and oriented object
+detection (OBB). We review the model's performance improvements in terms of
+mean Average Precision (mAP) and computational efficiency compared to its
+predecessors, with a focus on the trade-off between parameter count and
+accuracy. Additionally, the study discusses YOLOv11's versatility across
+different model sizes, from nano to extra-large, catering to diverse
+application needs from edge devices to high-performance computing environments.
+Our research provides insights into YOLOv11's position within the broader
+landscape of object detection and its potential impact on real-time computer
+vision applications.
+
+
+
+
+
+
+ + ☆ Continual Learning on a Data Diet + + +
+ Continual Learning (CL) methods usually learn from all available data. +However, this is not the case in human cognition which efficiently focuses on +key experiences while disregarding the redundant information. Similarly, not +all data points in a dataset have equal potential; some can be more informative +than others. This disparity may significantly impact the performance, as both +the quality and quantity of samples directly influence the model's +generalizability and efficiency. Drawing inspiration from this, we explore the +potential of learning from important samples and present an empirical study for +evaluating coreset selection techniques in the context of CL to stimulate +research in this unexplored area. We train different continual learners on +increasing amounts of selected samples and investigate the learning-forgetting +dynamics by shedding light on the underlying mechanisms driving their improved +stability-plasticity balance. We present several significant observations: +learning from selectively chosen samples (i) enhances incremental accuracy, +(ii) improves knowledge retention of previous tasks, and (iii) refines learned +representations. This analysis contributes to a deeper understanding of +selective learning strategies in CL scenarios. + +
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ☆ Longitudinal Causal Image Synthesis + + +
+ Clinical decision-making relies heavily on causal reasoning and longitudinal
+analysis. For example, for a patient with Alzheimer's disease (AD), how will
+the brain grey matter atrophy in a year if the A-beta level in cerebrospinal
+fluid is intervened on? The answer is fundamental to diagnosis and follow-up
+treatment. However, answering this kind of inquiry requires counterfactual
+medical images, which cannot be acquired by instrumental or correlation-based
+image synthesis models. Hence, a causal longitudinal image synthesis (CLIS)
+method, enabling the synthesis of such images, is highly valuable. However,
+building a CLIS model confronts three primary yet unmet challenges: mismatched
+dimensionality between high-dimensional images and low-dimensional tabular
+variables, inconsistent collection intervals of follow-up data, and inadequate
+causal modeling capability of existing causal graph methods for image data. In
+this paper, we establish a tabular-visual causal graph (TVCG) for CLIS,
+overcoming these challenges through a novel integration of generative imaging,
+continuous-time modeling, and structural causal models combined with a neural
+network. We train our CLIS model on the ADNI dataset and evaluate it on two
+other AD datasets, which illustrate the outstanding yet controllable quality of
+the synthesized images and the contributions of synthesized MRI to the
+characterization of AD progression, substantiating its reliability and utility
+in clinical practice.
+
+
+
+
+
+
+ + ☆ Deep Generative Models for 3D Medical Image Synthesis + + +
+ Deep generative modeling has emerged as a powerful tool for synthesizing +realistic medical images, driving advances in medical image analysis, disease +diagnosis, and treatment planning. This chapter explores various deep +generative models for 3D medical image synthesis, with a focus on Variational +Autoencoders (VAEs), Generative Adversarial Networks (GANs), and Denoising +Diffusion Models (DDMs). We discuss the fundamental principles, recent +advances, as well as strengths and weaknesses of these models and examine their +applications in clinically relevant problems, including unconditional and +conditional generation tasks like image-to-image translation and image +reconstruction. We additionally review commonly used evaluation metrics for +assessing image fidelity, diversity, utility, and privacy and provide an +overview of current challenges in the field. + +
+
+
+
+
+ + ☆ Surgical Scene Segmentation by Transformer With Asymmetric Feature + Enhancement + + +
+ Surgical scene segmentation is a fundamental task for robotic-assisted +laparoscopic surgery understanding. It often contains various anatomical +structures and surgical instruments, where similar local textures and +fine-grained structures make the segmentation a difficult task. Vision-specific +transformer method is a promising way for surgical scene understanding. +However, there are still two main challenges. Firstly, the absence of +inner-patch information fusion leads to poor segmentation performance. +Secondly, the specific characteristics of anatomy and instruments are not +specifically modeled. To tackle the above challenges, we propose a novel +Transformer-based framework with an Asymmetric Feature Enhancement module +(TAFE), which enhances local information and then actively fuses the improved +feature pyramid into the embeddings from transformer encoders by a multi-scale +interaction attention strategy. The proposed method outperforms the SOTA +methods in several different surgical segmentation tasks and additionally +proves its ability of fine-grained structure recognition. Code is available at +https://github.com/cyuan-sjtu/ViT-asym. + +
+
+
+
+
+ + ☆ MIA-DPO: Multi-Image Augmented Direct Preference Optimization For Large + Vision-Language Models + + +
+ Visual preference alignment involves training Large Vision-Language Models
+(LVLMs) to predict human preferences between visual inputs. This is typically
+achieved by using labeled datasets of chosen/rejected pairs and employing
+optimization algorithms like direct preference optimization (DPO). Existing
+visual alignment methods, primarily designed for single-image scenarios,
+struggle to effectively handle the complexity of multi-image tasks due to the
+scarcity of diverse training data and the high cost of annotating
+chosen/rejected pairs. We present Multi-Image Augmented Direct Preference
+Optimization (MIA-DPO), a visual preference alignment approach that effectively
+handles multi-image inputs. MIA-DPO mitigates the scarcity of diverse
+multi-image training data by extending single-image data with unrelated images
+arranged in grid collages or pic-in-pic formats, significantly reducing the
+costs associated with multi-image data annotations. Our observation reveals
+that attention values of LVLMs vary considerably across different images. We
+use attention values to identify and filter out rejected responses the model
+may have mistakenly focused on. Our attention-aware selection constructs the
+chosen/rejected pairs without relying on (i) human annotation, (ii) extra data,
+or (iii) external models or APIs. MIA-DPO is compatible with various
+architectures and outperforms existing methods on five multi-image benchmarks,
+achieving an average performance boost of 3.0% on LLaVA-v1.5 and 4.3% on the
+recent InternLM-XC2.5. Moreover, MIA-DPO has a minimal effect on the model's
+ability to understand single images.
+
+
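+ The grid-collage extension of single-image data can be sketched as follows: a labelled target image is placed alongside unrelated distractor images in a 2x2 grid, so existing single-image preference data becomes a multi-image sample; the grid size and layout here are assumptions rather than the MIA-DPO specifics.

```python
from PIL import Image
import random

def make_grid_collage(target: Image.Image,
                      distractors: list[Image.Image],
                      cell: int = 336) -> tuple[Image.Image, int]:
    """Place the target image at a random cell of a 2x2 collage filled with
    unrelated images; return the collage and the target's cell index."""
    imgs = random.sample(distractors, 3)
    slot = random.randrange(4)
    imgs.insert(slot, target)

    canvas = Image.new("RGB", (2 * cell, 2 * cell))
    for i, im in enumerate(imgs):
        x, y = (i % 2) * cell, (i // 2) * cell
        canvas.paste(im.resize((cell, cell)), (x, y))
    return canvas, slot

# Toy usage with solid-colour placeholders standing in for dataset images.
target = Image.new("RGB", (336, 336), "red")
pool = [Image.new("RGB", (336, 336), c) for c in ("blue", "green", "gray", "white")]
collage, slot = make_grid_collage(target, pool)
print(collage.size, "target in cell", slot)
```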
+
+ comment: Project URL: https://github.com/Liuziyu77/MIA-DPO +
+
+
+
+
+ + ☆ Bridging the Gaps: Utilizing Unlabeled Face Recognition Datasets to + Boost Semi-Supervised Facial Expression Recognition + + +
+ In recent years, Facial Expression Recognition (FER) has gained increasing
+attention. Most current work focuses on supervised learning, which requires a
+large amount of labeled and diverse images, while FER suffers from the scarcity
+of large, diverse datasets and annotation difficulty. To address these
+problems, we focus on utilizing large unlabeled Face Recognition (FR) datasets
+to boost semi-supervised FER. Specifically, we first perform face
+reconstruction pre-training on large-scale facial images without annotations to
+learn features of facial geometry and expression regions, followed by two-stage
+fine-tuning on FER datasets with limited labels. In addition, to further
+alleviate the scarcity of labeled and diverse images, we propose a Mixup-based
+data augmentation strategy tailored for facial images, in which the loss
+weights of real and virtual images are determined according to the
+intersection-over-union (IoU) of the faces in the two images. Experiments on
+RAF-DB, AffectNet, and FERPlus show that our method outperforms existing
+semi-supervised FER methods and achieves new state-of-the-art performance.
+Remarkably, with only 5% and 25% of the training sets, our method achieves
+64.02% on AffectNet and 88.23% on RAF-DB, which is comparable to fully
+supervised state-of-the-art methods. Codes will be made publicly available at
+https://github.com/zhelishisongjie/SSFER.
+
+
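+ The face-tailored Mixup can be sketched as below: two images are mixed, and the virtual sample's loss weight is taken from the IoU of the face boxes in the two source images; the exact weighting rule and box format are assumptions based on the abstract, not the released code.

```python
import torch

def face_iou(box_a: torch.Tensor, box_b: torch.Tensor) -> float:
    """IoU of two face boxes given as (x1, y1, x2, y2)."""
    x1, y1 = torch.maximum(box_a[:2], box_b[:2])
    x2, y2 = torch.minimum(box_a[2:], box_b[2:])
    inter = (x2 - x1).clamp(min=0) * (y2 - y1).clamp(min=0)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return float(inter / (area_a + area_b - inter + 1e-6))

def face_mixup(img_a, img_b, box_a, box_b, alpha: float = 0.2):
    """Mix two face images; the virtual sample's loss weight follows the face IoU."""
    lam = torch.distributions.Beta(alpha, alpha).sample().item()
    mixed = lam * img_a + (1 - lam) * img_b
    weight = face_iou(box_a, box_b)  # down-weight pairs whose faces barely overlap
    return mixed, lam, weight

img_a, img_b = torch.rand(3, 224, 224), torch.rand(3, 224, 224)
box_a = torch.tensor([40.0, 40.0, 180.0, 200.0])
box_b = torch.tensor([60.0, 50.0, 190.0, 210.0])
mixed, lam, w = face_mixup(img_a, img_b, box_a, box_b)
print(mixed.shape, round(lam, 3), round(w, 3))
```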
+
+
+
+
+ + ☆ ImDy: Human Inverse Dynamics from Imitated Observations + + +
+ Inverse dynamics (ID), which aims at reproducing the driven torques from +human kinematic observations, has been a critical tool for gait analysis. +However, it is hindered from wider application to general motion due to its +limited scalability. Conventional optimization-based ID requires expensive +laboratory setups, restricting its availability. To alleviate this problem, we +propose to exploit the recently progressive human motion imitation algorithms +to learn human inverse dynamics in a data-driven manner. The key insight is +that the human ID knowledge is implicitly possessed by motion imitators, though +not directly applicable. In light of this, we devise an efficient data +collection pipeline with state-of-the-art motion imitation algorithms and +physics simulators, resulting in a large-scale human inverse dynamics benchmark +as Imitated Dynamics (ImDy). ImDy contains over 150 hours of motion with joint +torque and full-body ground reaction force data. With ImDy, we train a +data-driven human inverse dynamics solver ImDyS(olver) in a fully supervised +manner, which conducts ID and ground reaction force estimation simultaneously. +Experiments on ImDy and real-world data demonstrate the impressive competency +of ImDyS in human inverse dynamics and ground reaction force estimation. +Moreover, the potential of ImDy(-S) as a fundamental motion analysis tool is +exhibited with downstream applications. The project page is +https://foruck.github.io/ImDy/. + +
+
+ comment: Yong-Lu Li and Cewu Lu are the corresponding authors +
+
+
+
+
+ + ☆ Towards Effective Data-Free Knowledge Distillation via Diverse Diffusion + Augmentation + + +
+ Data-free knowledge distillation (DFKD) has emerged as a pivotal technique in +the domain of model compression, substantially reducing the dependency on the +original training data. Nonetheless, conventional DFKD methods that employ +synthesized training data are prone to the limitations of inadequate diversity +and discrepancies in distribution between the synthesized and original +datasets. To address these challenges, this paper introduces an innovative +approach to DFKD through diverse diffusion augmentation (DDA). Specifically, we +revise the paradigm of common data synthesis in DFKD to a composite process +through leveraging diffusion models subsequent to data synthesis for +self-supervised augmentation, which generates a spectrum of data samples with +similar distributions while retaining controlled variations. Furthermore, to +mitigate excessive deviation in the embedding space, we introduce an image +filtering technique grounded in cosine similarity to maintain fidelity during +the knowledge distillation process. Comprehensive experiments conducted on +CIFAR-10, CIFAR-100, and Tiny-ImageNet datasets showcase the superior +performance of our method across various teacher-student network +configurations, outperforming the contemporary state-of-the-art DFKD methods. +Code will be available at:https://github.com/SLGSP/DDA. + +
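+ The cosine-similarity filtering step can be sketched as follows: diffusion-augmented samples whose embeddings drift too far from the synthetic sample they were derived from are discarded before distillation; the embedding source and threshold are assumptions, not the DDA settings.

```python
import torch
import torch.nn.functional as F

def filter_augmented(orig_emb: torch.Tensor,
                     aug_emb: torch.Tensor,
                     threshold: float = 0.8) -> torch.Tensor:
    """Keep only augmented samples whose embedding stays close (cosine similarity
    above `threshold`) to the embedding of the synthetic sample it came from.

    orig_emb, aug_emb: (N, D) feature embeddings of paired samples.
    Returns a boolean mask over the N augmented samples.
    """
    sims = F.cosine_similarity(orig_emb, aug_emb, dim=-1)
    return sims >= threshold

# Toy embeddings standing in for features of synthetic and diffusion-augmented images.
orig = F.normalize(torch.randn(16, 512), dim=-1)
aug = F.normalize(orig + 0.3 * torch.randn(16, 512), dim=-1)
mask = filter_augmented(orig, aug)
print(int(mask.sum()), "of", mask.numel(), "augmented samples kept")
```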
+
+
+
+
+ + ☆ PlantCamo: Plant Camouflage Detection + + +
+ Camouflaged Object Detection (COD) aims to detect objects with camouflaged
+properties. Although previous studies have focused on natural (animals and
+insects) and unnatural (artistic and synthetic) camouflage detection, plant
+camouflage has been neglected. However, plant camouflage plays a vital role in
+natural camouflage. Therefore, this paper introduces the new and challenging
+problem of Plant Camouflage Detection (PCD). To address this problem, we
+introduce the PlantCamo dataset, which comprises 1,250 images with camouflaged
+plants representing 58 object categories in various natural scenes. To
+investigate the current status of plant camouflage detection, we conduct a
+large-scale benchmark study using 20+ cutting-edge COD models on the proposed
+dataset. Due to the unique characteristics of plant camouflage, including holes
+and irregular borders, we developed a new framework, named PCNet, dedicated to
+PCD. Our PCNet achieves superior performance thanks to its multi-scale global
+feature enhancement and refinement. Finally, we discuss the potential
+applications and insights, hoping this work fills the gap in fine-grained COD
+research and facilitates further intelligent ecology research. All resources
+will be available at https://github.com/yjybuaa/PlantCamo.
+
+
+
+
+
+
+ + ☆ How to Continually Adapt Text-to-Image Diffusion Models for Flexible + Customization? NeurIPS2024 + + +
+ Custom diffusion models (CDMs) have attracted widespread attention due to +their astonishing generative ability for personalized concepts. However, most +existing CDMs unreasonably assume that personalized concepts are fixed and +cannot change over time. Moreover, they heavily suffer from catastrophic +forgetting and concept neglect on old personalized concepts when continually +learning a series of new concepts. To address these challenges, we propose a +novel Concept-Incremental text-to-image Diffusion Model (CIDM), which can +resolve catastrophic forgetting and concept neglect to learn new customization +tasks in a concept-incremental manner. Specifically, to surmount the +catastrophic forgetting of old concepts, we develop a concept consolidation +loss and an elastic weight aggregation module. They can explore task-specific +and task-shared knowledge during training, and aggregate all low-rank weights +of old concepts based on their contributions during inference. Moreover, in +order to address concept neglect, we devise a context-controllable synthesis +strategy that leverages expressive region features and noise estimation to +control the contexts of generated images according to user conditions. +Experiments validate that our CIDM surpasses existing custom diffusion models. +The source codes are available at https://github.com/JiahuaDong/CIFC. + +
+
+ comment: Accepted to NeurIPS2024 +
+
+
+
+
+ + ☆ Double Banking on Knowledge: Customized Modulation and Prototypes for + Multi-Modality Semi-supervised Medical Image Segmentation + + +
+ Multi-modality (MM) semi-supervised learning (SSL) based medical image +segmentation has recently gained increasing attention for its ability to +utilize MM data and reduce reliance on labeled images. However, current methods +face several challenges: (1) Complex network designs hinder scalability to +scenarios with more than two modalities. (2) Focusing solely on +modality-invariant representation while neglecting modality-specific features, +leads to incomplete MM learning. (3) Leveraging unlabeled data with generative +methods can be unreliable for SSL. To address these problems, we propose Double +Bank Dual Consistency (DBDC), a novel MM-SSL approach for medical image +segmentation. To address challenge (1), we propose a modality all-in-one +segmentation network that accommodates data from any number of modalities, +removing the limitation on modality count. To address challenge (2), we design +two learnable plug-in banks, Modality-Level Modulation bank (MLMB) and +Modality-Level Prototype (MLPB) bank, to capture both modality-invariant and +modality-specific knowledge. These banks are updated using our proposed +Modality Prototype Contrastive Learning (MPCL). Additionally, we design +Modality Adaptive Weighting (MAW) to dynamically adjust learning weights for +each modality, ensuring balanced MM learning as different modalities learn at +different rates. Finally, to address challenge (3), we introduce a Dual +Consistency (DC) strategy that enforces consistency at both the image and +feature levels without relying on generative methods. We evaluate our method on +a 2-to-4 modality segmentation task using three open-source datasets, and +extensive experiments show that our method outperforms state-of-the-art +approaches. + +
+
+
+
+
+ + ☆ BlurryScope: a cost-effective and compact scanning microscope for + automated HER2 scoring using deep learning on blurry image data + + +
+ We developed a rapid scanning optical microscope, termed "BlurryScope", that +leverages continuous image acquisition and deep learning to provide a +cost-effective and compact solution for automated inspection and analysis of +tissue sections. BlurryScope integrates specialized hardware with a neural +network-based model to quickly process motion-blurred histological images and +perform automated pathology classification. This device offers comparable speed +to commercial digital pathology scanners, but at a significantly lower price +point and smaller size/weight, making it ideal for fast triaging in small +clinics, as well as for resource-limited settings. To demonstrate the +proof-of-concept of BlurryScope, we implemented automated classification of +human epidermal growth factor receptor 2 (HER2) scores on immunohistochemically +(IHC) stained breast tissue sections, achieving concordant results with those +obtained from a high-end digital scanning microscope. We evaluated this +approach by scanning HER2-stained tissue microarrays (TMAs) at a continuous +speed of 5 mm/s, which introduces bidirectional motion blur artifacts. These +compromised images were then used to train our network models. Using a test set +of 284 unique patient cores, we achieved blind testing accuracies of 79.3% and +89.7% for 4-class (0, 1+, 2+, 3+) and 2-class (0/1+ , 2+/3+) HER2 score +classification, respectively. BlurryScope automates the entire workflow, from +image scanning to stitching and cropping of regions of interest, as well as +HER2 score classification. We believe BlurryScope has the potential to enhance +the current pathology infrastructure in resource-scarce environments, save +diagnostician time and bolster cancer identification and classification across +various clinical environments. + +
+
+ comment: 18 Pages, 6 Figures +
+
+
+
+
+ + ☆ Unsupervised Low-dose CT Reconstruction with One-way Conditional + Normalizing Flows + + +
+ Deep-learning methods have shown promising performance for low-dose computed +tomography (LDCT) reconstruction. However, supervised methods face the problem +of lacking labeled data in clinical scenarios, and the CNN-based unsupervised +denoising methods would cause excessive smoothing in the reconstructed image. +Recently, the normalizing flows (NFs) based methods have shown advantages in +producing detail-rich images and avoiding over-smoothing, however, there are +still issues: (1) Although the alternating optimization in the data and latent +space can well utilize the regularization and generation capabilities of NFs, +the current two-way transformation strategy of noisy images and latent +variables would cause detail loss and secondary artifacts; and (2) Training NFs +on high-resolution CT images is hard due to huge computation. Though using +conditional normalizing flows (CNFs) to learn conditional probability can +reduce the computational burden, current methods require labeled data for +conditionalization, and the unsupervised CNFs-based LDCT reconstruction remains +a problem. To tackle these problems, we propose a novel CNFs-based unsupervised +LDCT iterative reconstruction algorithm. It employs strict one-way +transformation when performing alternating optimization in the dual spaces, +thus effectively avoiding the problems of detail loss and secondary artifacts. +By proposing a novel unsupervised conditionalization strategy, we train CNFs on +high-resolution CT images, thus achieving fast and high-quality unsupervised +reconstruction. Experiments on different datasets suggest that the performance +of the proposed algorithm could surpass some state-of-the-art unsupervised and +even supervised methods. + +
+
+
+
+
+ + ☆ OVT-B: A New Large-Scale Benchmark for Open-Vocabulary Multi-Object + Tracking NeurIPS 2024 + + +
+ Open-vocabulary object perception has become an important topic in artificial +intelligence, which aims to identify objects with novel classes that have not +been seen during training. Under this setting, open-vocabulary object detection +(OVD) in a single image has been studied extensively in the literature. However, +open-vocabulary object tracking (OVT) from a video has been studied less, and +one reason is the shortage of benchmarks. In this work, we have built a new +large-scale benchmark for open-vocabulary multi-object tracking, namely OVT-B. +OVT-B contains 1,048 categories of objects and 1,973 videos with 637,608 +bounding box annotations, which is much larger than the sole open-vocabulary +tracking dataset, i.e., OVTAO-val dataset (200+ categories, 900+ videos). The +proposed OVT-B can be used as a new benchmark to pave the way for OVT research. +We also develop a simple yet effective baseline method for OVT. It integrates +motion features for object tracking, which is an important cue for MOT +but is ignored in previous OVT methods. Experimental results have verified the +usefulness of the proposed benchmark and the effectiveness of our method. We +have released the benchmark to the public at +https://github.com/Coo1Sea/OVT-B-Dataset. + +
+
+ comment: 15 pages, 6 figures, accepted at NeurIPS 2024 Dataset and Benchmark + Track +
+
+
+
+
+ + ☆ Diffusion Priors for Variational Likelihood Estimation and Image + Denoising NeurIPS2024 + + +
+ Real-world noise removal is crucial in low-level computer vision. Due to the +remarkable generation capabilities of diffusion models, recent attention has +shifted towards leveraging diffusion priors for image restoration tasks. +However, existing diffusion priors-based methods either consider simple noise +types or rely on approximate posterior estimation, limiting their effectiveness +in addressing structured and signal-dependent noise commonly found in +real-world images. In this paper, we build upon diffusion priors and propose +adaptive likelihood estimation and MAP inference during the reverse diffusion +process to tackle real-world noise. We introduce an independent, +non-identically distributed likelihood combined with the noise precision +(inverse variance) prior and dynamically infer the precision posterior using +variational Bayes during the generation process. Meanwhile, we rectify the +estimated noise variance through local Gaussian convolution. The final denoised +image is obtained by propagating intermediate MAP solutions that balance the +updated likelihood and diffusion prior. Additionally, we explore the local +diffusion prior inherent in low-resolution diffusion models, enabling direct +handling of high-resolution noisy images. Extensive experiments and analyses on +diverse real-world datasets demonstrate the effectiveness of our method. Code +is available at https://github.com/HUST-Tan/DiffusionVI. + +
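+
+ The abstract describes placing a prior on the noise precision (inverse variance)
+ and inferring the precision posterior during the reverse diffusion process. As a
+ hedged illustration only: with a per-pixel Gaussian residual and a conjugate Gamma
+ prior on precision, the posterior update has a closed form. The hyperparameters,
+ the single-observation setting, and the function name are assumptions; the paper's
+ actual variational update and local Gaussian convolution are not reproduced here.
+
+```python
+import numpy as np
+
+def update_precision_posterior(residual, alpha0=1.0, beta0=1.0):
+    """Conjugate Gamma update for per-pixel noise precision under a Gaussian likelihood.
+
+    residual: array of (noisy - current estimate) values, one observation per pixel.
+    Returns the posterior Gamma parameters and the posterior-mean precision.
+    """
+    alpha = alpha0 + 0.5                  # one observation per pixel
+    beta = beta0 + 0.5 * residual ** 2    # element-wise update
+    mean_precision = alpha / beta         # E[tau | residual]
+    return alpha, beta, mean_precision
+
+# toy usage: larger residuals imply lower inferred precision (higher noise variance)
+r = np.array([0.05, 0.5, 2.0])
+_, _, tau = update_precision_posterior(r)
+print(tau)
+```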
+
+ comment: Accepted by NeurIPS2024 as Spotlight +
+
+
+
+
+ + ☆ PathMoCo: A Novel Framework to Improve Feature Embedding in + Self-supervised Contrastive Learning for Histopathological Images + + +
+ Self-supervised contrastive learning has become a cornerstone in various +areas, particularly histopathological image analysis. Image augmentation plays +a crucial role in self-supervised contrastive learning, as it generates +variations in image samples. However, traditional image augmentation techniques +often overlook the unique characteristics of histopathological images. In this +paper, we propose a new histopathology-specific image augmentation method +called stain reconstruction augmentation (SRA). We integrate our SRA with MoCo +v3, a leading model in self-supervised contrastive learning, along with our +additional contrastive loss terms, and call the new model PathMoCo. We +demonstrate that our PathMoCo always outperforms the standard MoCo v3 across +various downstream tasks and achieves comparable or superior performance to +other foundation models pre-trained on significantly larger histopathology +datasets. + +
+
+
+
+
+ + ☆ HCDN: A Change Detection Network for Construction Housekeeping Using + Feature Fusion and Large Vision Models + + +
+ Workplace safety has received increasing attention as millions of workers +worldwide suffer from work-related accidents. Although poor housekeeping is a +significant contributor to construction accidents, there remains a notable +lack of technological research focused on improving housekeeping practices in +construction sites. Recognizing and locating poor housekeeping in a dynamic +construction site is an important task that can be improved through computer +vision approaches. Despite advances in AI and computer vision, existing methods +for detecting poor housekeeping conditions face many challenges, including +limited explainability, the inability to localize poor housekeeping, and a lack of +annotated datasets. On the other hand, change detection, which aims to detect +changed environmental conditions (e.g., changing from good to poor +housekeeping) and 'where' the change has occurred (e.g., the location of objects +causing poor housekeeping), has not been applied to the problem of +housekeeping management. To address these challenges, we propose the +Housekeeping Change Detection Network (HCDN), an advanced change detection +neural network that integrates a feature fusion module and a large vision +model, achieving state-of-the-art performance. Additionally, we introduce an +approach to establish a novel change detection dataset (named Housekeeping-CCD) +focused on housekeeping in construction sites, along with a housekeeping +segmentation dataset. Our contributions include significant performance +improvements compared to existing methods, providing an effective tool for +enhancing construction housekeeping and safety. To promote further development, +we share our source code and trained models with global researchers: +https://github.com/NUS-DBE/Housekeeping-CD. + +
+
+
+
+
+ + ☆ PLGS: Robust Panoptic Lifting with 3D Gaussian Splatting + + +
+ Previous methods utilize the Neural Radiance Field (NeRF) for panoptic +lifting, while their training and rendering speed are unsatisfactory. In +contrast, 3D Gaussian Splatting (3DGS) has emerged as a prominent technique due +to its rapid training and rendering speed. However, unlike NeRF, the +conventional 3DGS may not satisfy the basic smoothness assumption as it does +not rely on any parameterized structures to render (e.g., MLPs). Consequently, +the conventional 3DGS is, in nature, more susceptible to noisy 2D mask +supervision. In this paper, we propose a new method called PLGS that enables +3DGS to generate consistent panoptic segmentation masks from noisy 2D +segmentation masks while maintaining superior efficiency compared to NeRF-based +methods. Specifically, we build a panoptic-aware structured 3D Gaussian model +to introduce smoothness and design effective noise reduction strategies. For +the semantic field, instead of initialization with structure from motion, we +construct reliable semantic anchor points to initialize the 3D Gaussians. We +then use these anchor points as smooth regularization during training. +Additionally, we present a self-training approach using pseudo labels generated +by merging the rendered masks with the noisy masks to enhance the robustness of +PLGS. For the instance field, we project the 2D instance masks into 3D space +and match them with oriented bounding boxes to generate cross-view consistent +instance masks for supervision. Experiments on various benchmarks demonstrate +that our method outperforms previous state-of-the-art methods in terms of both +segmentation quality and speed. + +
+
+
+
+
+ + ☆ Bilateral Hippocampi Segmentation in Low Field MRIs Using Mutual Feature + Learning via Dual-Views + + +
+ Accurate hippocampus segmentation in brain MRI is critical for studying +cognitive and memory functions and diagnosing neurodevelopmental disorders. +While high-field MRIs provide detailed imaging, low-field MRIs are more +accessible and cost-effective, which eliminates the need for sedation in +children, though they often suffer from lower image quality. In this paper, we +present a novel deep-learning approach for the automatic segmentation of +bilateral hippocampi in low-field MRIs. Extending recent advancements in infant +brain segmentation to underserved communities through the use of low-field MRIs +ensures broader access to essential diagnostic tools, thereby supporting better +healthcare outcomes for all children. Inspired by our previous work, Co-BioNet, +the proposed model employs a dual-view structure to enable mutual feature +learning via high-frequency masking, enhancing segmentation accuracy by +leveraging complementary information from different perspectives. Extensive +experiments demonstrate that our method provides reliable segmentation outcomes +for hippocampal analysis in low-resource settings. The code is publicly +available at: https://github.com/himashi92/LoFiHippSeg. + +
+
+
+
+
+ + ☆ Enhancing Multimodal Medical Image Classification using Cross-Graph + Modal Contrastive Learning + + +
+ The classification of medical images is a pivotal aspect of disease +diagnosis, often enhanced by deep learning techniques. However, traditional +approaches typically focus on unimodal medical image data, neglecting the +integration of diverse non-image patient data. This paper proposes a novel +Cross-Graph Modal Contrastive Learning (CGMCL) framework for multimodal medical +image classification. The model effectively integrates both image and non-image +data by constructing cross-modality graphs and leveraging contrastive learning +to align multimodal features in a shared latent space. An inter-modality +feature scaling module further optimizes the representation learning process by +reducing the gap between heterogeneous modalities. The proposed approach is +evaluated on two datasets: a Parkinson's disease (PD) dataset and a public +melanoma dataset. Results demonstrate that CGMCL outperforms conventional +unimodal methods in accuracy, interpretability, and early disease prediction. +Additionally, the method shows superior performance in multi-class melanoma +classification. The CGMCL framework provides valuable insights into medical +image classification while offering improved disease interpretability and +predictive capabilities. + +
+
+
+
+
+ + ☆ Unsupervised Domain Adaptation for Action Recognition via + Self-Ensembling and Conditional Embedding Alignment + + +
+ Recent advancements in deep learning-based wearable human action recognition +(wHAR) have improved the capture and classification of complex motions, but +adoption remains limited due to the lack of expert annotations and domain +discrepancies from user variations. Limited annotations hinder the model's +ability to generalize to out-of-distribution samples. While data augmentation +can improve generalizability, unsupervised augmentation techniques must be +applied carefully to avoid introducing noise. Unsupervised domain adaptation +(UDA) addresses domain discrepancies by aligning conditional distributions with +labeled target samples, but vanilla pseudo-labeling can lead to error +propagation. To address these challenges, we propose $\mu$DAR, a novel joint +optimization architecture comprising three functions: (i) a consistency +regularizer between augmented samples to improve model classification +generalizability, (ii) a temporal ensemble for robust pseudo-label generation, and +(iii) conditional distribution alignment to improve domain generalizability. +The temporal ensemble works by aggregating predictions from past epochs to +smooth out noisy pseudo-label predictions, which are then used in the +conditional distribution alignment module to minimize kernel-based class-wise +conditional maximum mean discrepancy ($k$CMMD) between the source and target +feature space to learn a domain-invariant embedding. The +consistency-regularized augmentations ensure that multiple augmentations of the +same sample share the same labels; this results in (a) strong generalization +with limited source domain samples and (b) consistent pseudo-label generation +in target samples. The novel integration of these three modules in $\mu$DAR +results in a range of $\approx$ 4-12% average macro-F1 score improvement over +six state-of-the-art UDA methods on four benchmark wHAR datasets. + +
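+
+ A minimal sketch of the temporal-ensemble idea described above (aggregating
+ predictions from past epochs to smooth noisy pseudo-labels), assuming an
+ exponential moving average with bias correction in the style of classic temporal
+ ensembling; the momentum value and thresholding-free output are assumptions,
+ not the paper's exact procedure.
+
+```python
+import numpy as np
+
+def temporal_ensemble(prob_history, momentum=0.6):
+    """Aggregate per-epoch class probabilities into smoothed pseudo-labels via an EMA."""
+    ensemble = np.zeros_like(prob_history[0])
+    for t, probs in enumerate(prob_history, start=1):
+        ensemble = momentum * ensemble + (1.0 - momentum) * probs
+        corrected = ensemble / (1.0 - momentum ** t)  # bias correction for early epochs
+    pseudo = corrected.argmax(axis=1)
+    confidence = corrected.max(axis=1)
+    return pseudo, confidence
+
+# toy usage: 3 epochs of predictions for 2 target samples over 4 classes
+history = [np.random.dirichlet(np.ones(4), size=2) for _ in range(3)]
+labels, conf = temporal_ensemble(history)
+print(labels, conf)
+```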
+
+ comment: This work has been accepted to the Proceedings of the IEEE + International Conference on Data Mining, 2024 +
+
+
+
+
+ + ☆ GenDP: 3D Semantic Fields for Category-Level Generalizable Diffusion + Policy + + +
+ Diffusion-based policies have shown remarkable capability in executing +complex robotic manipulation tasks but lack explicit characterization of +geometry and semantics, which often limits their ability to generalize to +unseen objects and layouts. To enhance the generalization capabilities of +Diffusion Policy, we introduce a novel framework that incorporates explicit +spatial and semantic information via 3D semantic fields. We generate 3D +descriptor fields from multi-view RGBD observations with large foundational +vision models, then compare these descriptor fields against reference +descriptors to obtain semantic fields. The proposed method explicitly considers +geometry and semantics, enabling strong generalization capabilities in tasks +requiring category-level generalization, resolving geometric ambiguities, and +attention to subtle geometric details. We evaluate our method across eight +tasks involving articulated objects and instances with varying shapes and +textures from multiple object categories. Our method demonstrates its +effectiveness by increasing Diffusion Policy's average success rate on unseen +instances from 20% to 93%. Additionally, we provide a detailed analysis and +visualization to interpret the sources of performance gain and explain how our +method can generalize to novel instances. + +
+
+ comment: Accepted to Conference on Robot Learning (CoRL 2024). Project Page: + https://robopil.github.io/GenDP/ +
+
+
+
+
+ + ☆ Which Client is Reliable?: A Reliable and Personalized Prompt-based + Federated Learning for Medical Image Question Answering + + +
+ Conventional medical artificial intelligence (AI) models face barriers in +clinical application and ethical issues owing to their inability to handle the +privacy-sensitive characteristics of medical data. We present a novel +personalized federated learning (pFL) method for medical visual question +answering (VQA) models, addressing privacy reliability challenges in the +medical domain. Our method introduces learnable prompts into a Transformer +architecture to efficiently train it on diverse medical datasets without +massive computational costs. Then we introduce a reliable client VQA model that +incorporates Dempster-Shafer evidence theory to quantify uncertainty in +predictions, enhancing the model's reliability. Furthermore, we propose a novel +inter-client communication mechanism that uses maximum likelihood estimation to +balance accuracy and uncertainty, fostering efficient integration of insights +across clients. + +
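+
+ The abstract mentions an inter-client mechanism that balances accuracy and
+ uncertainty when integrating insights across clients. As an illustrative stand-in
+ only (not the paper's maximum-likelihood formulation), the sketch below averages
+ client prompt parameters with precision weights, so clients reporting lower
+ uncertainty contribute more; shapes and names are assumptions.
+
+```python
+import numpy as np
+
+def aggregate_client_prompts(client_prompts, client_uncertainties):
+    """Precision-weighted average of per-client prompt parameters."""
+    u = np.asarray(client_uncertainties, dtype=np.float64)
+    weights = 1.0 / (u + 1e-8)
+    weights /= weights.sum()
+    stacked = np.stack(client_prompts)             # (num_clients, prompt_len, dim)
+    return np.tensordot(weights, stacked, axes=1)  # weighted sum over clients
+
+prompts = [np.random.randn(16, 64) for _ in range(3)]
+global_prompt = aggregate_client_prompts(prompts, client_uncertainties=[0.1, 0.4, 0.9])
+print(global_prompt.shape)  # (16, 64)
+```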
+
+
+
+
+ + ♻ ☆ Pruning By Explaining Revisited: Optimizing Attribution Methods to Prune + CNNs and Transformers ECCV 2024 + + +
+ To solve ever more complex problems, Deep Neural Networks are scaled to +billions of parameters, leading to huge computational costs. An effective +approach to reduce computational requirements and increase efficiency is to +prune unnecessary components of these often over-parameterized networks. +Previous work has shown that attribution methods from the field of eXplainable +AI serve as effective means to extract and prune the least relevant network +components in a few-shot fashion. We extend the current state by proposing to +explicitly optimize hyperparameters of attribution methods for the task of +pruning, and further include transformer-based networks in our analysis. Our +approach yields higher model compression rates of large transformer- and +convolutional architectures (VGG, ResNet, ViT) compared to previous works, +while still attaining high performance on ImageNet classification tasks. Here, +our experiments indicate that transformers have a higher degree of +over-parameterization compared to convolutional neural networks. Code is +available at https://github.com/erfanhatefi/Pruning-by-eXplaining-in-PyTorch. + +
+
+ comment: Accepted as a workshop paper at ECCV 2024, 26 pages (11 pages + manuscript, 3 pages references, 12 pages appendix) +
+
+
+
+
+ + ♻ ☆ VILA-U: a Unified Foundation Model Integrating Visual Understanding and + Generation + + +
+ VILA-U is a Unified foundation model that integrates Video, Image, Language +understanding and generation. Traditional visual language models (VLMs) use +separate modules for understanding and generating visual content, which can +lead to misalignment and increased complexity. In contrast, VILA-U employs a +single autoregressive next-token prediction framework for both tasks, +eliminating the need for additional components like diffusion models. This +approach not only simplifies the model but also achieves near state-of-the-art +performance in visual language understanding and generation. The success of +VILA-U is attributed to two main factors: the unified vision tower that aligns +discrete visual tokens with textual inputs during pretraining, which enhances +visual perception, and the finding that autoregressive image generation can achieve +quality similar to that of diffusion models when trained on a high-quality dataset. +This allows VILA-U to perform comparably to more complex models using a fully token-based +autoregressive framework. + +
+
+ comment: Code: https://github.com/mit-han-lab/vila-u. The first two authors + contributed equally to this work +
+
+
+
+
+ + ♻ ☆ JointMotion: Joint Self-Supervision for Joint Motion Prediction + + +
+ We present JointMotion, a self-supervised pre-training method for joint +motion prediction in self-driving vehicles. Our method jointly optimizes a +scene-level objective connecting motion and environments, and an instance-level +objective to refine learned representations. Scene-level representations are +learned via non-contrastive similarity learning of past motion sequences and +environment context. At the instance level, we use masked autoencoding to +refine multimodal polyline representations. We complement this with an adaptive +pre-training decoder that enables JointMotion to generalize across different +environment representations, fusion mechanisms, and dataset characteristics. +Notably, our method reduces the joint final displacement error of Wayformer, +HPTR, and Scene Transformer models by 3\%, 8\%, and 12\%, respectively; and +enables transfer learning between the Waymo Open Motion and the Argoverse 2 +Motion Forecasting datasets. Code: https://github.com/kit-mrt/future-motion + +
+
+ comment: CoRL'24 camera-ready +
+
+
+
+
+ + ♻ ☆ Telling Stories for Common Sense Zero-Shot Action Recognition ACCV 2024 + + +
+ Video understanding has long suffered from reliance on large labeled +datasets, motivating research into zero-shot learning. Recent progress in +language modeling presents opportunities to advance zero-shot video analysis, +but constructing an effective semantic space relating action classes remains +challenging. We address this by introducing a novel dataset, Stories, which +contains rich textual descriptions for diverse action classes extracted from +WikiHow articles. For each class, we extract multi-sentence narratives +detailing the necessary steps, scenes, objects, and verbs that characterize the +action. This contextual data enables modeling of nuanced relationships between +actions, paving the way for zero-shot transfer. We also propose an approach +that harnesses Stories to improve feature generation for training zero-shot +classification. Without any target dataset fine-tuning, our method achieves new +state-of-the-art on multiple benchmarks, improving top-1 accuracy by up to +6.1%. We believe Stories provides a valuable resource that can catalyze +progress in zero-shot action recognition. The textual narratives forge +connections between seen and unseen classes, overcoming the bottleneck of +labeled data that has long impeded advancements in this exciting domain. The +data can be found here: https://github.com/kini5gowda/Stories . + +
+
+ comment: Accepted in ACCV 2024! +
+
+
+
+
+ + ♻ ☆ Generalizable Prompt Tuning for Vision-Language Models + + +
+ Prompt tuning for vision-language models such as CLIP involves optimizing the +text prompts used to generate image-text pairs for specific downstream tasks. +While hand-crafted or template-based prompts are generally applicable to a +wider range of unseen classes, they tend to perform poorly in downstream tasks +(i.e., seen classes). Learnable soft prompts, on the other hand, often perform +well in downstream tasks but lack generalizability. Additionally, prior +research has predominantly concentrated on the textual modality, with very few +studies attempting to explore the prompt's generalization potential from the +visual modality. Keeping these limitations in mind, we investigate how to +perform prompt tuning to obtain both competitive downstream performance and +generalization. The study shows that by treating soft and hand-crafted prompts +as dual views of the textual modality, and maximizing their mutual information, +we can better ensemble task-specific and general semantic information. +Moreover, to generate more expressive prompts, the study introduces a +class-wise augmentation from the visual modality, resulting in significant +robustness to a wider range of unseen classes. Extensive evaluations on several +benchmarks report that the proposed approach achieves competitive results in +terms of both task-specific performance and general abilities. + +
+
+ comment: in progress +
+
+
+
+
+ + ♻ ☆ Exploring the Adversarial Robustness of CLIP for AI-generated Image + Detection + + +
+ In recent years, many forensic detectors have been proposed to detect +AI-generated images and prevent their use for malicious purposes. Convolutional +neural networks (CNNs) have long been the dominant architecture in this field +and have been the subject of intense study. However, recently proposed +Transformer-based detectors have been shown to match or even outperform +CNN-based detectors, especially in terms of generalization. In this paper, we +study the adversarial robustness of AI-generated image detectors, focusing on +Contrastive Language-Image Pretraining (CLIP)-based methods that rely on Visual +Transformer (ViT) backbones and comparing their performance with CNN-based +methods. We study the robustness to different adversarial attacks under a +variety of conditions and analyze both numerical results and frequency-domain +patterns. CLIP-based detectors are found to be vulnerable to white-box attacks +just like CNN-based detectors. However, attacks do not easily transfer between +CNN-based and CLIP-based methods. This is also confirmed by the different +distribution of the adversarial noise patterns in the frequency domain. +Overall, this analysis provides new insights into the properties of forensic +detectors that can help to develop more effective strategies. + +
+
+
+
+
+ + ♻ ☆ Leveraging Hallucinations to Reduce Manual Prompt Dependency in + Promptable Segmentation NeurIPS 2024 + + +
+ Promptable segmentation typically requires instance-specific manual prompts +to guide the segmentation of each desired object. To minimize such a need, +task-generic promptable segmentation has been introduced, which employs a +single task-generic prompt to segment various images of different objects in +the same task. Current methods use Multimodal Large Language Models (MLLMs) to +reason detailed instance-specific prompts from a task-generic prompt for +improving segmentation accuracy. The effectiveness of this segmentation heavily +depends on the precision of these derived prompts. However, MLLMs often suffer from +hallucinations during reasoning, resulting in inaccurate prompting. While +existing methods focus on eliminating hallucinations to improve a model, we +argue that MLLM hallucinations can reveal valuable contextual insights when +leveraged correctly, as they represent pre-trained large-scale knowledge beyond +individual images. In this paper, we utilize hallucinations to mine +task-related information from images and verify its accuracy to enhance the +precision of the generated prompts. Specifically, we introduce an iterative +Prompt-Mask Cycle generation framework (ProMaC) with a prompt generator and a +mask generator. The prompt generator uses multi-scale chain-of-thought +prompting, initially exploring hallucinations for extracting extended +contextual knowledge on a test image. These hallucinations are then reduced to +formulate precise instance-specific prompts, directing the mask generator to +produce masks that are consistent with task semantics by mask semantic +alignment. The generated masks iteratively induce the prompt generator to focus +more on task-relevant image areas and reduce irrelevant hallucinations, +resulting jointly in better prompts and masks. Experiments on 5 benchmarks +demonstrate the effectiveness of ProMaC. Code is available at +https://lwpyh.github.io/ProMaC/. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ LocoMotion: Learning Motion-Focused Video-Language Representations ACCV 2024 + + +
+ This paper strives for motion-focused video-language representations. +Existing methods to learn video-language representations use spatial-focused +data, where identifying the objects and scene is often enough to distinguish +the relevant caption. We instead propose LocoMotion to learn from +motion-focused captions that describe the movement and temporal progression of +local object motions. We achieve this by adding synthetic motions to videos and +using the parameters of these motions to generate corresponding captions. +Furthermore, we propose verb-variation paraphrasing to increase the caption +variety and learn the link between primitive motions and high-level verbs. With +this, we are able to learn a motion-focused video-language representation. +Experiments demonstrate our approach is effective for a variety of downstream +tasks, particularly when limited data is available for fine-tuning. Code is +available: https://hazeldoughty.github.io/Papers/LocoMotion/ + +
+
+ comment: ACCV 2024 Oral +
+
+
+
+
+ + ♻ ☆ Accessible, At-Home Detection of Parkinson's Disease via Multi-task + Video Analysis + + +
+ Limited accessibility to neurological care leads to underdiagnosed +Parkinson's Disease (PD), preventing early intervention. Existing AI-based PD +detection methods primarily focus on unimodal analysis of motor or speech +tasks, overlooking the multifaceted nature of the disease. To address this, we +introduce a large-scale, multi-task video dataset consisting of 1102 sessions +(each containing videos of finger tapping, facial expression, and speech tasks +captured via webcam) from 845 participants (272 with PD). We propose a novel +Uncertainty-calibrated Fusion Network (UFNet) that leverages this multimodal +data to enhance diagnostic accuracy. UFNet employs independent task-specific +networks, trained with Monte Carlo Dropout for uncertainty quantification, +followed by self-attended fusion of features, with attention weights +dynamically adjusted based on task-specific uncertainties. To ensure +patient-centered evaluation, the participants were randomly split into three +sets: 60% for training, 20% for model selection, and 20% for final performance +evaluation. UFNet significantly outperformed single-task models in terms of +accuracy, area under the ROC curve (AUROC), and sensitivity while maintaining +non-inferior specificity. Withholding uncertain predictions further boosted the +performance, achieving 88.0+-0.3% accuracy, 93.0+-0.2% AUROC, 79.3+-0.9% +sensitivity, and 92.6+-0.3% specificity, at the expense of not being able to +predict for 2.3+-0.3% of the data (+- denotes 95% confidence interval). Further +analysis suggests that the trained model does not exhibit any detectable bias +across sex and ethnic subgroups and is most effective for individuals aged +between 50 and 80. Requiring only a webcam and microphone, our approach +facilitates accessible home-based PD screening, especially in regions with +limited healthcare resources. + +
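+
+ To picture the Monte Carlo Dropout uncertainty estimation and the
+ uncertainty-driven fusion described above, here is a simplified sketch: repeated
+ stochastic forward passes give a per-task mean and variance, and task predictions
+ are then combined with inverse-uncertainty weights. The self-attended fusion of
+ the paper is replaced by this simpler weighting, and all numbers and names below
+ are illustrative assumptions.
+
+```python
+import numpy as np
+
+def mc_dropout_stats(stochastic_forward, x, n_samples=30):
+    """Run a model with dropout active n_samples times; return mean prediction and variance."""
+    preds = np.stack([stochastic_forward(x) for _ in range(n_samples)])
+    return preds.mean(axis=0), preds.var(axis=0)
+
+def fuse_tasks(task_means, task_vars, eps=1e-8):
+    """Weight each task's prediction by its inverse uncertainty before averaging."""
+    weights = np.array([1.0 / (v.mean() + eps) for v in task_vars])
+    weights /= weights.sum()
+    return sum(w * m for w, m in zip(weights, task_means))
+
+# toy usage: three tasks (e.g., tapping, face, speech), each a noisy scalar PD score
+rng = np.random.default_rng(0)
+tasks = [lambda x, s=s: 0.6 + s * rng.normal() for s in (0.05, 0.1, 0.3)]
+stats = [mc_dropout_stats(f, x=None) for f in tasks]
+fused = fuse_tasks([m for m, _ in stats], [v for _, v in stats])
+print(fused)
+```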
+
+
+
+
+ + ♻ ☆ SCA: Highly Efficient Semantic-Consistent Unrestricted Adversarial + Attack + + +
+ Deep neural network based systems deployed in sensitive environments are +vulnerable to adversarial attacks. Unrestricted adversarial attacks typically +manipulate the semantic content of an image (e.g., color or texture) to create +adversarial examples that are both effective and photorealistic. Recent works +have utilized the diffusion inversion process to map images into a latent +space, where high-level semantics are manipulated by introducing perturbations. +However, these approaches often result in substantial semantic distortions in the denoised +output and suffer from low efficiency. In this study, we propose a novel +framework called Semantic-Consistent Unrestricted Adversarial Attacks (SCA), +which employs an inversion method to extract edit-friendly noise maps and +utilizes a Multimodal Large Language Model (MLLM) to provide semantic guidance +throughout the process. With the rich semantic information +provided by the MLLM, we perform each step of the DDPM denoising process using a +series of edit-friendly noise maps, and leverage DPM Solver++ to accelerate +this process, enabling efficient sampling with semantic consistency. Compared +to existing methods, our framework enables the efficient generation of +adversarial examples that exhibit minimal discernible semantic changes. +Consequently, we introduce, for the first time, Semantic-Consistent Adversarial +Examples (SCAE). Extensive experiments and visualizations have demonstrated the +high efficiency of SCA, particularly in being on average 12 times faster than +the state-of-the-art attacks. Our research can further draw attention to the +security of multimedia information. + +
+
+
+
+
+ + ♻ ☆ PnLCalib: Sports Field Registration via Points and Lines Optimization + + +
+ Camera calibration in broadcast sports videos presents numerous challenges +for accurate sports field registration due to multiple camera angles, varying +camera parameters, and frequent occlusions of the field. Traditional +search-based methods depend on initial camera pose estimates, which can +struggle in non-standard positions and dynamic environments. In response, we +propose an optimization-based calibration pipeline that leverages a 3D soccer +field model and a predefined set of keypoints to overcome these limitations. +Our method also introduces a novel refinement module that improves initial +calibration by using detected field lines in a non-linear optimization process. +This approach outperforms existing techniques in both multi-view and +single-view 3D camera calibration tasks, while maintaining competitive +performance in homography estimation. Extensive experimentation on real-world +soccer datasets, including SoccerNet-Calibration, WorldCup 2014, and +TS-WorldCup, highlights the robustness and accuracy of our method across +diverse broadcast scenarios. Our approach offers significant improvements in +camera calibration precision and reliability. + +
+
+ comment: Extended version of "No Bells, Just Whistles: Sports Field + Registration Leveraging Geometric Properties" +
+
+
+
+
+ + ♻ ☆ PixLore: A Dataset-driven Approach to Rich Image Captioning + + +
+ In the domain of vision-language integration, generating detailed image +captions poses a significant challenge due to the lack of curated and rich +datasets. This study introduces PixLore, a novel method that leverages Querying +Transformers through the fine-tuning of the BLIP-2 model using the LoRA method +on a standard commercial GPU. The proposed approach, which involves training on +a carefully assembled dataset from state-of-the-art Computer Vision models +combined and augmented by ChatGPT, addresses the question of whether intricate +image understanding can be achieved with an ensemble of smaller-scale models, +referred to as Knowledge Stitching. Comparative evaluations against major +models such as GPT-4 and Google Bard demonstrate that PixLore-2.7B, despite +having considerably fewer parameters, is rated higher than existing +state-of-the-art models in over half of the assessments. Specifically, PixLore +outperforms Bard and BLIP-2, which score approximately 35.18% and 27.98% lower +than PixLore in the task of image captioning. This research not only presents a +groundbreaking approach but also highlights the importance of well-curated +datasets in enhancing the performance of smaller models. + +
+
+ comment: Paper in preprint pending of publication +
+
+
+
+
+ + ♻ ☆ Denoising Diffusion Models for Inpainting of Healthy Brain Tissue MICCAI + + +
+ This paper is a contribution to the "BraTS 2023 Local Synthesis of Healthy +Brain Tissue via Inpainting Challenge". The task of this challenge is to +transform tumor tissue into healthy tissue in brain magnetic resonance (MR) +images. This idea originates from the problem that MR images can be evaluated +using automatic processing tools; however, many of these tools are optimized +for the analysis of healthy tissue. By solving the given inpainting task, we +enable the automatic analysis of images featuring lesions, and further +downstream tasks. Our approach builds on denoising diffusion probabilistic +models. We use a 2D model that is trained on slices in which healthy tissue +was cropped out and learns to inpaint it again. This allows us to use the +ground truth healthy tissue during training. In the sampling stage, we replace +the slices containing diseased tissue in the original 3D volume with the slices +containing the healthy tissue inpainting. With our approach, we achieve +comparable results to the competing methods. On the validation set, our model +achieves a mean SSIM of 0.7804, a PSNR of 20.3525, and an MSE of 0.0113. In +the future, we plan to extend our 2D model to a 3D model, allowing us to inpaint the +region of interest as a whole without losing context information from neighboring +slices. + +
+
+ comment: 12 pages, 5 figures, MICCAI challenge submission +
+
+
+
+
+ + ♻ ☆ A Multimodal Fusion Network For Student Emotion Recognition Based on + Transformer and Tensor Product + + +
+ This paper introduces a new multi-modal model based on the Transformer +architecture and tensor product fusion strategy, combining BERT's text vectors +and ViT's image vectors to classify students' psychological conditions, with an +accuracy of 93.65%. The purpose of the study is to accurately analyze the +mental health status of students from various data sources. This paper +discusses modal fusion methods, including early, late and intermediate fusion, +to overcome the challenges of integrating multi-modal information. Ablation +studies compare the performance of different models and fusion techniques, +showing that the proposed model outperforms existing methods such as CLIP and +ViLBERT in terms of accuracy and inference speed. Conclusions indicate that +while this model has significant advantages in emotion recognition, its +potential to incorporate other data modalities provides areas for future +research. + +
+
+
+
+
+ + ♻ ☆ Towards Croppable Implicit Neural Representations NeurIPS 2024 + + +
+ Implicit Neural Representations (INRs) have piqued interest in recent years +due to their ability to encode natural signals using neural networks. While +INRs allow for useful applications such as interpolating new coordinates and +signal compression, their black-box nature makes it difficult to modify them +post-training. In this paper, we explore the idea of editable INRs, and +specifically focus on the widely used cropping operation. To this end, we +present Local-Global SIRENs -- a novel INR architecture that supports cropping +by design. Local-Global SIRENs are based on combining local and global feature +extraction for signal encoding. What makes their design unique is the ability +to effortlessly remove specific portions of an encoded signal, with a +proportional weight decrease. This is achieved by eliminating the corresponding +weights from the network, without the need for retraining. We further show how +this architecture can be used to support the straightforward extension of +previously encoded signals. Beyond signal editing, we examine how the +Local-Global approach can accelerate training, enhance encoding of various +signals, improve downstream performance, and be applied to modern INRs such as +INCODE, highlighting its potential and flexibility. Code is available at +https://github.com/maorash/Local-Global-INRs. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Breaking Class Barriers: Efficient Dataset Distillation via Inter-Class + Feature Compensator + + +
+ Dataset distillation has emerged as a technique aiming to condense +informative features from large, natural datasets into a compact and synthetic +form. While recent advancements have refined this technique, its performance is +bottlenecked by the prevailing class-specific synthesis paradigm. Under this +paradigm, synthetic data is optimized exclusively for a pre-assigned one-hot +label, creating an implicit class barrier in feature condensation. This leads +to inefficient utilization of the distillation budget and oversight of +inter-class feature distributions, which ultimately limits the effectiveness +and efficiency, as demonstrated in our analysis. To overcome these constraints, +this paper presents the Inter-class Feature Compensator (INFER), an innovative +distillation approach that transcends the class-specific data-label framework +widely utilized in current dataset distillation methods. Specifically, INFER +leverages a Universal Feature Compensator (UFC) to enhance feature integration +across classes, enabling the generation of multiple additional synthetic +instances from a single UFC input. This significantly improves the efficiency +of the distillation budget. Moreover, INFER enriches inter-class interactions +during the distillation, thereby enhancing the effectiveness and +generalizability of the distilled data. By allowing for the linear +interpolation of labels similar to those in the original dataset, INFER +meticulously optimizes the synthetic data and dramatically reduces the size of +soft labels in the synthetic dataset to almost zero, establishing a new +benchmark for efficiency and effectiveness in dataset distillation. + +
+
+
+
+
+ + ♻ ☆ Exploring Stronger Transformer Representation Learning for Occluded + Person Re-Identification + + +
+ Due to some complex factors (e.g., occlusion, pose variation and diverse +camera perspectives), extracting stronger feature representations in person +re-identification remains a challenging task. In this paper, we propose a +novel transformer-based person re-identification framework that combines self-supervision and supervision, +namely SSSC-TransReID. Different from general +transformer-based person re-identification models, we design a +self-supervised contrastive learning branch, which can enhance the feature +representation for person re-identification without negative samples or +additional pre-training. In order to train the contrastive learning branch, we +also propose a novel random rectangle mask strategy to simulate the occlusion +in real scenes, so as to enhance the feature representation for occlusion. +Finally, we utilize a joint-training loss function to integrate the +advantages of supervised learning with ID tags and self-supervised contrastive +learning without negative samples, which can reinforce the ability of our model +to excavate stronger discriminative features, especially for occlusion. +Extensive experimental results on several benchmark datasets show that our proposed +model consistently obtains superior Re-ID performance and outperforms the +state-of-the-art ReID methods by large margins on the mean average precision +(mAP) and Rank-1 accuracy. + +
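+
+ The random rectangle mask strategy mentioned above is easy to picture: a randomly
+ sized and positioned rectangle of the person crop is blanked out to mimic
+ occlusion. A minimal sketch follows; the size range and fill value are assumptions,
+ not the paper's exact settings.
+
+```python
+import numpy as np
+
+def random_rectangle_mask(image, min_frac=0.1, max_frac=0.4, fill=0.0, rng=None):
+    """Occlude a random rectangle of the image to simulate real-world occlusion."""
+    rng = rng or np.random.default_rng()
+    h, w = image.shape[:2]
+    rh = int(h * rng.uniform(min_frac, max_frac))
+    rw = int(w * rng.uniform(min_frac, max_frac))
+    top = rng.integers(0, h - rh + 1)
+    left = rng.integers(0, w - rw + 1)
+    out = image.copy()
+    out[top:top + rh, left:left + rw] = fill
+    return out
+
+img = np.random.rand(256, 128, 3)          # a person crop (H x W x C)
+occluded = random_rectangle_mask(img)
+print(occluded.shape)
+```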
+
+
+
+
+ + ♻ ☆ From Real Artifacts to Virtual Reference: A Robust Framework for + Translating Endoscopic Images + + +
+ Domain adaptation, which bridges the distributions across different +modalities, plays a crucial role in multimodal medical image analysis. In +endoscopic imaging, combining pre-operative data with intra-operative imaging +is important for surgical planning and navigation. However, existing domain +adaptation methods are hampered by distribution shift caused by in vivo +artifacts, necessitating robust techniques for aligning noisy and artifact +abundant patient endoscopic videos with clean virtual images reconstructed from +pre-operative tomographic data for pose estimation during intraoperative +guidance. This paper presents an artifact-resilient image translation method +and an associated benchmark for this purpose. The method incorporates a novel +``local-global'' translation framework and a noise-resilient feature extraction +strategy. For the former, it decouples the image translation process into a +local step for feature denoising, and a global step for global style transfer. +For feature extraction, a new contrastive learning strategy is proposed, which +can extract noise-resilient features for establishing robust correspondence +across domains. Detailed validation on both public and in-house clinical +datasets has been conducted, demonstrating significantly improved performance +compared to the current state-of-the-art. + +
+
+
+
+
+ + ♻ ☆ Few-Shot Adversarial Prompt Learning on Vision-Language Models NeurIPS 2024 + + +
+ The vulnerability of deep neural networks to imperceptible adversarial +perturbations has attracted widespread attention. Inspired by the success of +vision-language foundation models, previous efforts achieved zero-shot +adversarial robustness by aligning adversarial visual features with text +supervision. However, in practice, they are still unsatisfactory due to several +issues, including heavy adaptation cost, suboptimal text supervision, and +uncontrolled natural generalization capacity. In this paper, to address these +issues, we propose a few-shot adversarial prompt framework where adapting input +sequences with limited data yields significant adversarial robustness +improvements. Specifically, we achieve this by providing adversarially +correlated text supervision that is end-to-end learned from adversarial +examples. We also propose a novel training objective that enhances the +consistency of multi-modal features while encouraging differentiated uni-modal +features between natural and adversarial examples. The proposed framework enables +learning adversarial text supervision, which provides superior +cross-modal adversarial alignment and matches state-of-the-art zero-shot +adversarial robustness with only 1% of the training data. Code is available at: +https://github.com/lionel-w2/FAP. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Interaction Modeling with Agent Selection and Physical + Coefficient for Trajectory Prediction SP + + +
+ A thorough understanding of the interaction between the target agent and +surrounding agents is a prerequisite for accurate trajectory prediction. +Although many methods have been explored, they all assign correlation +coefficients to surrounding agents in a purely learning-based manner. In this +study, we present ASPILin, which manually selects interacting agents and +calculates their correlations instead of attention scores. Surprisingly, these +simple modifications can significantly improve prediction performance and +substantially reduce computational costs. Additionally, ASPILin models the +interacting agents at each past time step separately, rather than only modeling +the interacting agents at the current time step. This clarifies the causal +chain of the target agent's historical trajectory and helps the model better +understand dynamic interactions. We intentionally simplified our model in other +aspects, such as map encoding. Remarkably, experiments conducted on the +INTERACTION, highD, and CitySim datasets demonstrate that our method is +efficient and straightforward, outperforming other state-of-the-art methods. + +
+
+ comment: code:https://github.com/kkk00714/ASPILin +
+
+
+
+
+ + ♻ ☆ Conquering the Communication Constraints to Enable Large Pre-Trained + Models in Federated Learning + + +
+ Federated learning (FL) has emerged as a promising paradigm for enabling the +collaborative training of models without centralized access to the raw data on +local devices. In the typical FL paradigm (e.g., FedAvg), model weights are +sent to and from the server each round to participating clients. Recently, the +use of small pre-trained models has been shown effective in federated learning +optimization and improving convergence. However, recent state-of-the-art +pre-trained models are getting more capable but also have more parameters. In +conventional FL, sharing the enormous model weights can quickly put a massive +communication burden on the system, especially if more capable models are +employed. Can we find a solution to enable those strong and readily-available +pre-trained models in FL to achieve excellent performance while simultaneously +reducing the communication burden? To this end, we investigate the use of +parameter-efficient fine-tuning in federated learning and thus introduce a new +framework: FedPEFT. Specifically, we systemically evaluate the performance of +FedPEFT across a variety of client stability, data distribution, and +differential privacy settings. By only locally tuning and globally sharing a +small portion of the model weights, significant reductions in the total +communication overhead can be achieved while maintaining competitive or even +better performance in a wide range of federated learning scenarios, providing +insight into a new paradigm for practical and effective federated systems. + +
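+
+ The core idea described above, locally tuning and globally sharing only a small
+ subset of the model weights, can be sketched as a single FedAvg-style round over
+ an assumed PEFT subset. The key names, learning rate, and client sizes below are
+ hypothetical; this is not the FedPEFT implementation itself.
+
+```python
+import numpy as np
+
+TRAINABLE_KEYS = ("prompt", "head.bias")   # assumed PEFT subset; the backbone stays frozen
+
+def client_update(global_peft, local_grads, lr=0.01):
+    """Local step that touches only the PEFT parameters."""
+    return {k: global_peft[k] - lr * local_grads[k] for k in global_peft}
+
+def server_aggregate(client_peft_states, client_sizes):
+    """FedAvg over the small PEFT subset only, weighted by local dataset size."""
+    total = float(sum(client_sizes))
+    keys = client_peft_states[0].keys()
+    return {
+        k: sum(n / total * state[k] for state, n in zip(client_peft_states, client_sizes))
+        for k in keys
+    }
+
+# toy round with two clients
+global_peft = {k: np.zeros(8) for k in TRAINABLE_KEYS}
+clients = [client_update(global_peft, {k: np.random.randn(8) for k in TRAINABLE_KEYS})
+           for _ in range(2)]
+new_global = server_aggregate(clients, client_sizes=[100, 300])
+print({k: v.shape for k, v in new_global.items()})
+```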
+
+
+
+
+ + ♻ ☆ Latent Noise Segmentation: How Neural Noise Leads to the Emergence of + Segmentation and Grouping ICML 2024 + + +
+ Humans are able to segment images effortlessly without supervision using +perceptual grouping. Here, we propose a counter-intuitive computational +approach to solving unsupervised perceptual grouping and segmentation: that +they arise because of neural noise, rather than in spite of it. We (1) +mathematically demonstrate that under realistic assumptions, neural noise can +be used to separate objects from each other; (2) that adding noise in a DNN +enables the network to segment images even though it was never trained on any +segmentation labels; and (3) that segmenting objects using noise results in +segmentation performance that aligns with the perceptual grouping phenomena +observed in humans, and is sample-efficient. We introduce the Good Gestalt (GG) +datasets -- six datasets designed to specifically test perceptual grouping, and +show that our DNN models reproduce many important phenomena in human +perception, such as illusory contours, closure, continuity, proximity, and +occlusion. Finally, we (4) show that our model improves performance on our GG +datasets compared to other tested unsupervised models by $24.9\%$. Together, +our results suggest a novel unsupervised segmentation method requiring few +assumptions, a new explanation for the formation of perceptual grouping, and a +novel potential benefit of neural noise. + +
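+
+ Only as an intuition for the claim that noise can drive grouping, and not the GG
+ procedure itself: if pixels belonging to the same object respond coherently to
+ injected noise, one can cluster per-pixel response vectors gathered over many
+ noise draws. The toy decoder, noise scale, and use of k-means below are all
+ assumptions for illustration.
+
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+
+def noise_driven_segmentation(decode, latent, n_samples=20, noise_std=0.1,
+                              n_segments=3, seed=0):
+    """Cluster pixels by how their outputs co-vary across latent noise draws."""
+    rng = np.random.default_rng(seed)
+    samples = np.stack([decode(latent + noise_std * rng.normal(size=latent.shape))
+                        for _ in range(n_samples)])            # (n_samples, H, W)
+    n, h, w = samples.shape
+    features = samples.reshape(n, h * w).T                     # per-pixel response vectors
+    labels = KMeans(n_clusters=n_segments, n_init=10, random_state=seed).fit_predict(features)
+    return labels.reshape(h, w)
+
+# toy decoder: each latent dimension "drives" one half of the image
+def toy_decode(z):
+    img = np.zeros((16, 16))
+    img[:, :8] = z[0]
+    img[:, 8:] = z[1]
+    return img
+
+seg = noise_driven_segmentation(toy_decode, latent=np.array([1.0, -1.0]), n_segments=2)
+print(np.unique(seg))
+```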
+
+ comment: ICML 2024 camera ready version +
+
+
+
+
+ + ♻ ☆ STBA: Towards Evaluating the Robustness of DNNs for Query-Limited + Black-box Scenario + + +
+ Many attack techniques have been proposed to explore the vulnerability of +DNNs and further help to improve their robustness. Despite the significant +progress made recently, existing black-box attack methods still suffer from +unsatisfactory performance due to the vast number of queries needed to optimize +desired perturbations. Besides, the other critical challenge is that +adversarial examples built in a noise-adding manner are abnormal and struggle +to successfully attack robust models, whose robustness is enhanced by +adversarial training against small perturbations. There is no doubt that these +two issues mentioned above will significantly increase the risk of exposure and +result in a failure to dig deeply into the vulnerability of DNNs. Hence, it is +necessary to evaluate DNNs' fragility sufficiently under query-limited settings +in a non-additional way. In this paper, we propose the Spatial Transform +Black-box Attack (STBA), a novel framework to craft formidable adversarial +examples in the query-limited scenario. Specifically, STBA introduces a flow +field to the high-frequency part of clean images to generate adversarial +examples and adopts the following two processes to enhance their naturalness +and significantly improve the query efficiency: a) we apply an estimated flow +field to the high-frequency part of clean images to generate adversarial +examples instead of introducing external noise to the benign image, and b) we +leverage an efficient gradient estimation method based on a batch of samples to +optimize such an ideal flow field under query-limited settings. Compared to +existing score-based black-box baselines, extensive experiments indicated that +STBA could effectively improve the imperceptibility of the adversarial examples +and remarkably boost the attack success rate under query-limited settings. + +
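+
+ The abstract's central operation, applying a flow field to the high-frequency part
+ of a clean image, can be sketched as follows: split the image with a Gaussian blur,
+ warp only the high-frequency residual via bilinear resampling, and recombine. The
+ blur sigma, the flow magnitude, and the absence of the query-based flow
+ optimization are assumptions; this is not the STBA attack itself.
+
+```python
+import numpy as np
+from scipy.ndimage import gaussian_filter, map_coordinates
+
+def warp_high_frequency(image, flow, sigma=2.0):
+    """Apply a small flow field to the high-frequency component of a grayscale image.
+
+    image: (H, W) array; flow: (2, H, W) per-pixel displacements (dy, dx).
+    The low-frequency part is left untouched, so the perturbation stays subtle.
+    """
+    low = gaussian_filter(image, sigma=sigma)
+    high = image - low
+    h, w = image.shape
+    yy, xx = np.meshgrid(np.arange(h), np.arange(w), indexing="ij")
+    coords = np.stack([yy + flow[0], xx + flow[1]])
+    warped_high = map_coordinates(high, coords, order=1, mode="reflect")
+    return low + warped_high
+
+img = np.random.rand(64, 64)
+flow = 0.5 * np.random.randn(2, 64, 64)    # a candidate flow field (optimized in STBA)
+adv = warp_high_frequency(img, flow)
+print(float(np.abs(adv - img).max()))
+```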
+
+ comment: Accepted by T-MM +
+
+
+
+
+ + ♻ ☆ ODTFormer: Efficient Obstacle Detection and Tracking with Stereo Cameras + Based on Transformer IROS 2024 + + +
+ Obstacle detection and tracking represent a critical component in robot +autonomous navigation. In this paper, we propose ODTFormer, a Transformer-based +model to address both obstacle detection and tracking problems. For the +detection task, our approach leverages deformable attention to construct a 3D +cost volume, which is decoded progressively in the form of voxel occupancy +grids. We further track the obstacles by matching the voxels between +consecutive frames. The entire model can be optimized in an end-to-end manner. +Through extensive experiments on DrivingStereo and KITTI benchmarks, our model +achieves state-of-the-art performance in the obstacle detection task. We also +report comparable accuracy to state-of-the-art obstacle tracking models while +requiring only a fraction of their computation cost, typically ten-fold to +twenty-fold less. The code and model weights will be publicly released. + +
+
+ comment: 8 pages. Accepted by IROS 2024 +
+
+
+
+
+ + ♻ ☆ Meteor: Mamba-based Traversal of Rationale for Large Language and Vision + Models + + +
+ The rapid development of large language and vision models (LLVMs) has been +driven by advances in visual instruction tuning. Recently, open-source LLVMs +have curated high-quality visual instruction tuning datasets and utilized +additional vision encoders or multiple computer vision models in order to +narrow the performance gap with powerful closed-source LLVMs. These +advancements are attributed to multifaceted information required for diverse +capabilities, including fundamental image understanding, real-world knowledge +about common-sense and non-object concepts (e.g., charts, diagrams, symbols, +signs, and math problems), and step-by-step procedures for solving complex +questions. Drawing from the multifaceted information, we present a new +efficient LLVM, Mamba-based traversal of rationales (Meteor), which leverages +multifaceted rationale to enhance understanding and answering capabilities. To +embed lengthy rationales containing abundant information, we employ the Mamba +architecture, capable of processing sequential data with linear time +complexity. We introduce a new concept of traversal of rationale that +facilitates efficient embedding of rationale. Subsequently, the backbone +multimodal language model (MLM) is trained to generate answers with the aid of +rationale. Through these steps, Meteor achieves significant improvements in +vision language performances across multiple evaluation benchmarks requiring +diverse capabilities, without scaling up the model size or employing additional +vision encoders and computer vision models. + +
+
+ comment: Code is available in https://github.com/ByungKwanLee/Meteor +
+
+
+
+
+ + ♻ ☆ LLaVA-MoD: Making LLaVA Tiny via MoE Knowledge Distillation + + +
+ We introduce LLaVA-MoD, a novel framework designed to enable the efficient +training of small-scale Multimodal Language Models (s-MLLM) by distilling +knowledge from large-scale MLLM (l-MLLM). Our approach tackles two fundamental +challenges in MLLM distillation. First, we optimize the network structure of +s-MLLM by integrating a sparse Mixture of Experts (MoE) architecture into the +language model, striking a balance between computational efficiency and model +expressiveness. Second, we propose a progressive knowledge transfer strategy to +ensure comprehensive knowledge migration. This strategy begins with mimic +distillation, where we minimize the Kullback-Leibler (KL) divergence between +output distributions to enable the student model to emulate the teacher +network's understanding. Following this, we introduce preference distillation +via Direct Preference Optimization (DPO), where the key lies in treating l-MLLM +as the reference model. During this phase, the s-MLLM's ability to discriminate +between superior and inferior examples is significantly enhanced beyond l-MLLM, +leading to a better student that surpasses its teacher, particularly in +hallucination benchmarks. Extensive experiments demonstrate that LLaVA-MoD +outperforms existing models across various multimodal benchmarks while +maintaining a minimal number of activated parameters and low computational +costs. Remarkably, LLaVA-MoD, with only 2B activated parameters, surpasses +Qwen-VL-Chat-7B by an average of 8.8% across benchmarks, using merely 0.3% of +the training data and 23% trainable parameters. These results underscore +LLaVA-MoD's ability to effectively distill comprehensive knowledge from its +teacher model, paving the way for the development of more efficient MLLMs. The +code will be available on: https://github.com/shufangxun/LLaVA-MoD. + +
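+
+ For the mimic-distillation stage described above (minimizing the KL divergence
+ between teacher and student output distributions), a standard token-level
+ distillation loss looks like the sketch below. The temperature, the KL direction,
+ and the tensor shapes are assumptions, and the preference-distillation (DPO) stage
+ is not shown.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def mimic_distillation_loss(student_logits, teacher_logits, temperature=1.0):
+    """KL(teacher || student) over output token distributions, averaged per batch.
+
+    student_logits, teacher_logits: (batch, seq_len, vocab) tensors.
+    """
+    t = temperature
+    log_p_student = F.log_softmax(student_logits / t, dim=-1)
+    p_teacher = F.softmax(teacher_logits / t, dim=-1)
+    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * (t * t)
+
+student = torch.randn(2, 5, 100)
+teacher = torch.randn(2, 5, 100)
+print(mimic_distillation_loss(student, teacher).item())
+```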
+
+
+
+
+ + ♻ ☆ DIP-Watermark: A Double Identity Protection Method Based on Robust + Adversarial Watermark + + +
+ The wide deployment of Face Recognition (FR) systems poses privacy risks. One +countermeasure is adversarial attack, deceiving unauthorized malicious FR, but +it also disrupts regular identity verification of trusted authorizers, +exacerbating the potential threat of identity impersonation. To address this, +we propose the first double identity protection scheme based on traceable +adversarial watermarking, termed DIP-Watermark. DIP-Watermark employs a +one-time watermark embedding to deceive unauthorized FR models and allows +authorizers to perform identity verification by extracting the watermark. +Specifically, we propose an information-guided adversarial attack against FR +models. The encoder embeds an identity-specific watermark into the deep feature +space of the carrier, guiding recognizable features of the image to deviate +from the source identity. We further adopt a collaborative meta-optimization +strategy compatible with sub-tasks, which regularizes the joint optimization +direction of the encoder and decoder. This strategy enhances the representation +of universal carrier features, mitigating multi-objective optimization +conflicts in watermarking. Experiments confirm that DIP-Watermark achieves +significant attack success rates and traceability accuracy on state-of-the-art +FR models, exhibiting remarkable robustness that outperforms the existing +privacy protection methods using adversarial attacks and deep watermarking, or +simple combinations of the two. Our work potentially opens up new insights into +proactive protection for FR privacy. + +
+
+
+
+
+ + ♻ ☆ Hierarchical Light Transformer Ensembles for Multimodal Trajectory + Forecasting + + +
+ Accurate trajectory forecasting is crucial for the performance of various
+systems, such as advanced driver-assistance systems and self-driving vehicles.
+These forecasts make it possible to anticipate events leading to collisions
+and, therefore, to mitigate them. Deep Neural Networks have excelled in motion
+forecasting, but issues like overconfidence and uncertainty quantification
+persist. Deep Ensembles address these concerns, yet applying them to multimodal
+distributions remains challenging. In this paper, we propose a novel approach
+named Hierarchical Light Transformer Ensembles (HLT-Ens), aimed at efficiently
+training an ensemble of Transformer architectures using a novel hierarchical
+loss function. HLT-Ens leverages grouped fully connected layers, inspired by
+grouped convolution techniques, to capture multimodal distributions
+effectively. Through extensive experimentation, we demonstrate that HLT-Ens
+achieves state-of-the-art performance levels, offering a promising avenue for
+improving trajectory forecasting techniques. + +
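One common way to realize grouped fully connected layers, by analogy with grouped convolutions, is a grouped 1x1 convolution. The sketch below shows that generic construction; it is not necessarily how HLT-Ens implements it, and the layer sizes are illustrative.

```python
import torch
import torch.nn as nn

class GroupedLinear(nn.Module):
    """Fully connected layer split into independent groups, implemented as a
    grouped 1x1 convolution. Each group only sees its own slice of features."""
    def __init__(self, in_features, out_features, groups):
        super().__init__()
        assert in_features % groups == 0 and out_features % groups == 0
        self.proj = nn.Conv1d(in_features, out_features, kernel_size=1, groups=groups)

    def forward(self, x):                      # x: (batch, in_features)
        return self.proj(x.unsqueeze(-1)).squeeze(-1)

y = GroupedLinear(64, 64, groups=4)(torch.randn(8, 64))   # -> (8, 64)
```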
+
+ comment: acknowledgement added +
+
+
+
+
+ + ♻ ☆ Pulling Target to Source: A New Perspective on Domain Adaptive Semantic + Segmentation + + +
+ Domain adaptive semantic segmentation aims to transfer knowledge from a
+labeled source domain to an unlabeled target domain. However, existing methods
+primarily focus on directly learning qualified target features, making it
+challenging to guarantee their discrimination in the absence of target labels.
+This work provides a new perspective. We observe that the features learned with
+source data manage to remain categorically discriminative during training,
+thereby enabling us to implicitly learn adequate target representations by
+simply \textbf{pulling target features close to source features for each
+category}. To this end, we propose T2S-DA, which we interpret as a form of
+pulling Target to Source for Domain Adaptation, encouraging the model to learn
+similar cross-domain features. Also, considering that pixel categories are
+heavily imbalanced in segmentation datasets, we come up with a dynamic
+re-weighting strategy to help the model concentrate on those underperforming
+classes. Extensive experiments confirm that T2S-DA learns a more discriminative
+and generalizable representation, significantly surpassing the
+state-of-the-art. We further show that our method is well suited to the domain
+generalization task, verifying its domain-invariant property. + +
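As a rough illustration of "pulling target features close to source features for each category", the snippet below pulls target features toward per-class source prototypes with a cosine-distance term. It is a simplified sketch under assumed inputs (target pseudo-labels, normalized features); the paper's actual objective and dynamic re-weighting are more involved.

```python
import torch.nn.functional as F

def pull_target_to_source(src_feats, src_labels, tgt_feats, tgt_pseudo_labels, num_classes):
    """Pull target features toward per-class source prototypes (cosine distance).
    src_feats/tgt_feats: (N, d) tensors; labels/pseudo-labels: (N,) class ids."""
    loss, used = 0.0, 0
    for c in range(num_classes):
        src_c = src_feats[src_labels == c]
        tgt_c = tgt_feats[tgt_pseudo_labels == c]
        if len(src_c) == 0 or len(tgt_c) == 0:
            continue  # class absent in this batch
        prototype = F.normalize(src_c.mean(dim=0), dim=0)
        tgt_c = F.normalize(tgt_c, dim=1)
        loss = loss + (1.0 - tgt_c @ prototype).mean()
        used += 1
    return loss / max(used, 1)
```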
+
+ comment: Accepted by IJCV +
+
+
+
+
+ + ♻ ☆ Improving Text Generation on Images with Synthetic Captions + + +
+ The recent emergence of latent diffusion models such as SDXL and SD 1.5 has
+shown significant capability in generating highly detailed and realistic
+images. Despite their remarkable ability to produce images, generating accurate
+text within images remains a challenging task. In this paper, we examine the
+validity of fine-tuning approaches in generating legible text within the image.
+We propose a low-cost approach by leveraging SDXL without any time-consuming
+training on large-scale datasets. The proposed strategy employs a fine-tuning
+technique that examines the effects of data refinement levels and synthetic
+captions. Moreover, our results demonstrate how our small-scale fine-tuning
+approach can improve the accuracy of text generation in different scenarios
+without the need for additional multimodal encoders. Our experiments show that
+with the addition of random letters to our raw dataset, our model's performance
+improves in producing well-formed visual text. + +
+
+ comment: 2024 16th IIAI International Congress on Advanced Applied Informatics + (IIAI-AAI) +
+
+
+
+
+ + ♻ ☆ Harmonizing Visual Text Comprehension and Generation NeurIPS 2024 + + +
+ In this work, we present TextHarmony, a unified and versatile multimodal +generative model proficient in comprehending and generating visual text. +Simultaneously generating images and texts typically results in performance +degradation due to the inherent inconsistency between vision and language +modalities. To overcome this challenge, existing approaches resort to +modality-specific data for supervised fine-tuning, necessitating distinct model +instances. We propose Slide-LoRA, which dynamically aggregates +modality-specific and modality-agnostic LoRA experts, partially decoupling the +multimodal generation space. Slide-LoRA harmonizes the generation of vision and +language within a singular model instance, thereby facilitating a more unified +generative process. Additionally, we develop a high-quality image caption +dataset, DetailedTextCaps-100K, synthesized with a sophisticated closed-source +MLLM to enhance visual text generation capabilities further. Comprehensive +experiments across various benchmarks demonstrate the effectiveness of the +proposed approach. Empowered by Slide-LoRA, TextHarmony achieves comparable +performance to modality-specific fine-tuning results with only a 2% increase in +parameters and shows an average improvement of 2.5% in visual text +comprehension tasks and 4.0% in visual text generation tasks. Our work +delineates the viability of an integrated approach to multimodal generation +within the visual text domain, setting a foundation for subsequent inquiries. +Code is available at https://github.com/bytedance/TextHarmony. + +
+
+ comment: accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Lightning-Fast Image Inversion and Editing for Text-to-Image Diffusion + Models + + +
+ Diffusion inversion is the problem of taking an image and a text prompt that
+describes it and finding a noise latent that would generate the exact same
+image. Most current deterministic inversion techniques operate by approximately
+solving an implicit equation and may converge slowly or yield poorly
+reconstructed images. We formulate the problem as finding the roots of an
+implicit equation and develop a method to solve it efficiently. Our solution is
+based on Newton-Raphson (NR), a well-known technique in numerical analysis. We
+show that a vanilla application of NR is computationally infeasible, while
+naively transforming it into a computationally tractable alternative tends to
+converge to out-of-distribution solutions, resulting in poor reconstruction and
+editing. We therefore derive an efficient guided formulation that converges
+quickly and provides high-quality reconstructions and editing. We showcase our
+method on real image editing with three popular open-source diffusion models:
+Stable Diffusion, SDXL-Turbo, and Flux with different deterministic schedulers.
+Our solution, Guided Newton-Raphson Inversion, inverts an image within 0.4 sec
+(on an A100 GPU) for few-step models (SDXL-Turbo and Flux.1), opening the door
+for interactive image editing. We further show improved results in image
+interpolation and generation of rare objects. + +
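For readers unfamiliar with the numerical backbone referenced here, the snippet below is the textbook Newton-Raphson iteration for root finding. The paper's contribution is a guided variant applied to the implicit diffusion-inversion equation; this sketch only shows the basic scheme on a scalar example.

```python
def newton_raphson(f, df, x0, tol=1e-6, max_iters=50):
    """Classic Newton-Raphson iteration: x_{k+1} = x_k - f(x_k) / f'(x_k)."""
    x = x0
    for _ in range(max_iters):
        step = f(x) / df(x)
        x -= step
        if abs(step) < tol:
            break
    return x

# Example: find the root of f(x) = x^3 - 2, i.e., the cube root of 2.
root = newton_raphson(lambda x: x**3 - 2, lambda x: 3 * x**2, x0=1.0)
```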
+
+
+
+
+ + ♻ ☆ Improving Instance Optimization in Deformable Image Registration with + Gradient Projection MICCAI 2024 + + +
+ Deformable image registration is inherently a multi-objective optimization +(MOO) problem, requiring a delicate balance between image similarity and +deformation regularity. These conflicting objectives often lead to poor +optimization outcomes, such as being trapped in unsatisfactory local minima or +experiencing slow convergence. Deep learning methods have recently gained +popularity in this domain due to their efficiency in processing large datasets +and achieving high accuracy. However, they often underperform during test time +compared to traditional optimization techniques, which further explore +iterative, instance-specific gradient-based optimization. This performance gap +is more pronounced when a distribution shift between training and test data +exists. To address this issue, we focus on the instance optimization (IO) +paradigm, which involves additional optimization for test-time instances based +on a pre-trained model. IO effectively combines the generalization capabilities +of deep learning with the fine-tuning advantages of instance-specific +optimization. Within this framework, we emphasize the use of gradient +projection to mitigate conflicting updates in MOO. This technique projects +conflicting gradients into a common space, better aligning the dual objectives +and enhancing optimization stability. We validate our method using a +state-of-the-art foundation model on the 3D Brain inter-subject registration +task (LUMIR) from the Learn2Reg 2024 Challenge. Our results show significant +improvements over standard gradient descent, leading to more accurate and +reliable registration results. + +
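The gradient-projection idea, projecting conflicting gradients into a common space, can be sketched generically as below over flattened gradients of the similarity and regularity objectives. This is an illustrative PCGrad-style sketch, not the paper's exact formulation.

```python
import torch

def combine_with_projection(g_similarity, g_regularity, eps=1e-12):
    """If the two flattened objective gradients conflict (negative dot product),
    project each onto the normal plane of the other before summing."""
    def project(g, onto):
        dot = torch.dot(g, onto)
        if dot < 0:                                  # conflicting update directions
            g = g - dot / (onto.norm() ** 2 + eps) * onto
        return g

    return project(g_similarity.clone(), g_regularity) + \
           project(g_regularity.clone(), g_similarity)
```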
+
+ comment: Learn2Reg Challenge at MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ ERX: A Fast Real-Time Anomaly Detection Algorithm for Hyperspectral Line + Scanning + + +
+ Detecting unexpected objects (anomalies) in real time has great potential for +monitoring, managing, and protecting the environment. Hyperspectral line-scan +cameras are a low-cost solution that enhance confidence in anomaly detection +over RGB and multispectral imagery. However, existing line-scan algorithms are +too slow when using small computers (e.g. those onboard a drone or small +satellite), do not adapt to changing scenery, or lack robustness against +geometric distortions. This paper introduces the Exponentially moving RX +algorithm (ERX) to address these issues, and compares it with existing RX-based +anomaly detection methods for hyperspectral line scanning. Three large and more +complex datasets are also introduced to better assess the practical challenges +when using line-scan cameras (two hyperspectral and one multispectral). ERX is +evaluated using a Jetson Xavier NX compute module, achieving the best +combination of speed and detection performance. This research paves the way for +future studies in grouping and locating anomalous objects, adaptive and +automatic threshold selection, and real-time field tests. The datasets and the +Python code are available at: https://github.com/WiseGamgee/HyperAD. + +
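The core of an RX-style line-scan detector with exponentially moving background statistics can be sketched as follows. The class name, update rule details, and smoothing factor are illustrative assumptions; the authors' implementation is in the linked repository.

```python
import numpy as np

class ExponentialMovingRX:
    """RX anomaly scores with exponentially moving mean and covariance,
    updated one scan line at a time (a simplified sketch of the ERX idea)."""
    def __init__(self, n_bands, alpha=0.05, eps=1e-6):
        self.alpha = alpha
        self.eps = eps
        self.mean = np.zeros(n_bands)
        self.cov = np.eye(n_bands)

    def score_line(self, line):                      # line: (n_pixels, n_bands)
        # Update running background statistics with the new line of pixels.
        self.mean = (1 - self.alpha) * self.mean + self.alpha * line.mean(axis=0)
        centered = line - self.mean
        line_cov = centered.T @ centered / max(len(line) - 1, 1)
        self.cov = (1 - self.alpha) * self.cov + self.alpha * line_cov
        # Mahalanobis distance of each pixel to the running background model.
        inv_cov = np.linalg.inv(self.cov + self.eps * np.eye(len(self.mean)))
        return np.einsum("ij,jk,ik->i", centered, inv_cov, centered)
```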
+
+ comment: 17 pages, 13 figures, 4 tables, code and datasets accessible at + https://github.com/WiseGamgee/HyperAD +
+
+
+
+
+ + ♻ ☆ CAT: Contrastive Adapter Training for Personalized Image Generation CVPR + + +
+ The emergence of various adapters, including Low-Rank Adaptation (LoRA)
+applied from the field of natural language processing, has allowed diffusion
+models to personalize image generation at a low cost. However, due to various
+challenges, including limited datasets and a shortage of regularization and
+computational resources, adapter training often results in unsatisfactory
+outcomes, leading to the corruption of the backbone model's prior knowledge. A
+well-known symptom is the loss of diversity in object generation, especially
+within the same class, which leads to generating almost identical objects with
+only minor variations. This limits the model's generation capabilities. To
+solve this issue, we present Contrastive Adapter Training (CAT), a simple yet
+effective strategy to enhance adapter training through the application of CAT
+loss. Our approach helps preserve the base model's original knowledge while
+adapters are trained. Furthermore, we introduce the Knowledge Preservation
+Score (KPS) to evaluate CAT's ability to retain prior knowledge. We compare
+CAT's improvements both qualitatively and quantitatively. Finally, we discuss
+CAT's potential for multi-concept adapters and further optimization. + +
+
+ comment: CVPRW 2024 +
+
+
+
+
+ + ♻ ☆ Gaussian-Informed Continuum for Physical Property Identification and + Simulation NeurIPS 2024 + + +
+ This paper studies the problem of estimating physical properties (system +identification) through visual observations. To facilitate geometry-aware +guidance in physical property estimation, we introduce a novel hybrid framework +that leverages 3D Gaussian representation to not only capture explicit shapes +but also enable the simulated continuum to render object masks as 2D shape +surrogates during training. + We propose a new dynamic 3D Gaussian framework based on motion factorization +to recover the object as 3D Gaussian point sets across different time states. + Furthermore, we develop a coarse-to-fine filling strategy to generate the +density fields of the object from the Gaussian reconstruction, allowing for the +extraction of object continuums along with their surfaces and the integration +of Gaussian attributes into these continuums. + In addition to the extracted object surfaces, the Gaussian-informed continuum +also enables the rendering of object masks during simulations, serving as +2D-shape guidance for physical property estimation. + Extensive experimental evaluations demonstrate that our pipeline achieves +state-of-the-art performance across multiple benchmarks and metrics. +Additionally, we illustrate the effectiveness of the proposed method through +real-world demonstrations, showcasing its practical utility. + Our project page is at https://jukgei.github.io/project/gic. + +
+
+ comment: 21 pages, 8 figures, NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Toward Fairer Face Recognition Datasets + + +
+ Face recognition and verification are two computer vision tasks whose +performance has progressed with the introduction of deep representations. +However, ethical, legal, and technical challenges due to the sensitive +character of face data and biases in real training datasets hinder their +development. Generative AI addresses privacy by creating fictitious identities, +but fairness problems persist. We promote fairness by introducing a demographic +attributes balancing mechanism in generated training datasets. We experiment +with an existing real dataset, three generated training datasets, and the +balanced versions of a diffusion-based dataset. We propose a comprehensive +evaluation that considers accuracy and fairness equally and includes a rigorous +regression-based statistical analysis of attributes. The analysis shows that +balancing reduces demographic unfairness. Also, a performance gap persists +despite generation becoming more accurate with time. The proposed balancing +method and comprehensive verification evaluation promote fairer and transparent +face recognition and verification. + +
+
+
+
+
+ + ♻ ☆ LVBench: An Extreme Long Video Understanding Benchmark + + +
+ Recent progress in multimodal large language models has markedly enhanced the +understanding of short videos (typically under one minute), and several +evaluation datasets have emerged accordingly. However, these advancements fall +short of meeting the demands of real-world applications such as embodied +intelligence for long-term decision-making, in-depth movie reviews and +discussions, and live sports commentary, all of which require comprehension of +long videos spanning several hours. To address this gap, we introduce LVBench, +a benchmark specifically designed for long video understanding. Our dataset +comprises publicly sourced videos and encompasses a diverse set of tasks aimed +at long video comprehension and information extraction. LVBench is designed to +challenge multimodal models to demonstrate long-term memory and extended +comprehension capabilities. Our extensive evaluations reveal that current +multimodal models still underperform on these demanding long video +understanding tasks. Through LVBench, we aim to spur the development of more +advanced models capable of tackling the complexities of long video +comprehension. Our data and code are publicly available at: +https://lvbench.github.io. + +
+
+
+
+
+ + ♻ ☆ Advancing Open-Set Domain Generalization Using Evidential Bi-Level + Hardest Domain Scheduler NeurIPS 2024 + + +
+ In Open-Set Domain Generalization (OSDG), the model is exposed to both new
+variations of data appearance (domains) and open-set conditions, where both
+known and novel categories are present at test time. The challenges of this
+task arise from the dual need to generalize across diverse domains and
+accurately quantify category novelty, which is critical for applications in
+dynamic environments. Recently, meta-learning techniques have demonstrated
+superior results in OSDG, effectively orchestrating the meta-train and -test
+tasks by employing varied random categories and predefined domain partition
+strategies. These approaches prioritize a well-designed training schedule over
+traditional methods that focus primarily on data augmentation and the
+enhancement of discriminative feature learning. The prevailing meta-learning
+models in OSDG typically utilize a predefined sequential domain scheduler to
+structure data partitions. However, a crucial aspect that remains inadequately
+explored is the influence of domain scheduling strategies during training. In
+this paper, we observe that an adaptive domain scheduler benefits OSDG more
+than predefined sequential and random domain schedulers. We propose the
+Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve an adaptive
+domain scheduler. This method strategically sequences domains by assessing
+their reliability with a follower network, trained with confidence scores
+learned in an evidential manner, regularized by max rebiasing discrepancy, and
+optimized in a bi-level manner. The results show that our method substantially
+improves OSDG performance and achieves more discriminative embeddings for both
+the seen and unseen categories. The source code is publicly available at
+https://github.com/KPeng9510/EBiL-HaDS. + +
+
+ comment: Accepted to NeurIPS 2024. The source code is publicly available at + https://github.com/KPeng9510/EBiL-HaDS +
+
+
+
+
+ + ♻ ☆ Learning to Manipulate Anywhere: A Visual Generalizable Framework For + Reinforcement Learning + + +
+ Can we endow visuomotor robots with generalization capabilities to operate in +diverse open-world scenarios? In this paper, we propose \textbf{Maniwhere}, a +generalizable framework tailored for visual reinforcement learning, enabling +the trained robot policies to generalize across a combination of multiple +visual disturbance types. Specifically, we introduce a multi-view +representation learning approach fused with Spatial Transformer Network (STN) +module to capture shared semantic information and correspondences among +different viewpoints. In addition, we employ a curriculum-based randomization +and augmentation approach to stabilize the RL training process and strengthen +the visual generalization ability. To exhibit the effectiveness of Maniwhere, +we meticulously design 8 tasks encompassing articulate objects, bi-manual, and +dexterous hand manipulation tasks, demonstrating Maniwhere's strong visual +generalization and sim2real transfer abilities across 3 hardware platforms. Our +experiments show that Maniwhere significantly outperforms existing +state-of-the-art methods. Videos are provided at +https://gemcollector.github.io/maniwhere/. + +
+
+ comment: Webpage: https://gemcollector.github.io/maniwhere/ +
+
+
+
+
+ + ♻ ☆ The Ultimate Combo: Boosting Adversarial Example Transferability by + Composing Data Augmentations + + +
+ To help adversarial examples generalize from surrogate machine-learning (ML) +models to targets, certain transferability-based black-box evasion attacks +incorporate data augmentations (e.g., random resizing). Yet, prior work has +explored limited augmentations and their composition. To fill the gap, we +systematically studied how data augmentation affects transferability. +Specifically, we explored 46 augmentation techniques originally proposed to +help ML models generalize to unseen benign samples, and assessed how they +impact transferability, when applied individually or composed. Performing +exhaustive search on a small subset of augmentation techniques and genetic +search on all techniques, we identified augmentation combinations that help +promote transferability. Extensive experiments with the ImageNet and CIFAR-10 +datasets and 18 models showed that simple color-space augmentations (e.g., +color to greyscale) attain high transferability when combined with standard +augmentations. Furthermore, we discovered that composing augmentations impacts +transferability mostly monotonically (i.e., more augmentations $\rightarrow$ +$\ge$transferability). We also found that the best composition significantly +outperformed the state of the art (e.g., 91.8% vs. $\le$82.5% average +transferability to adversarially trained targets on ImageNet). Lastly, our +theoretical analysis, backed by empirical evidence, intuitively explains why +certain augmentations promote transferability. + +
+
+ comment: Accepted by AISec'24 +
+
+
+
+
+ + ♻ ☆ Diffusion Models are Certifiably Robust Classifiers NeurIPS 2024 + + +
+ Generative learning, recognized for its effective modeling of data +distributions, offers inherent advantages in handling out-of-distribution +instances, especially for enhancing robustness to adversarial attacks. Among +these, diffusion classifiers, utilizing powerful diffusion models, have +demonstrated superior empirical robustness. However, a comprehensive +theoretical understanding of their robustness is still lacking, raising +concerns about their vulnerability to stronger future attacks. In this study, +we prove that diffusion classifiers possess $O(1)$ Lipschitzness, and establish +their certified robustness, demonstrating their inherent resilience. To achieve +non-constant Lipschitzness, thereby obtaining much tighter certified +robustness, we generalize diffusion classifiers to classify Gaussian-corrupted +data. This involves deriving the evidence lower bounds (ELBOs) for these +distributions, approximating the likelihood using the ELBO, and calculating +classification probabilities via Bayes' theorem. Experimental results show the +superior certified robustness of these Noised Diffusion Classifiers (NDCs). +Notably, we achieve over 80% and 70% certified robustness on CIFAR-10 under +adversarial perturbations with \(\ell_2\) norms less than 0.25 and 0.5, +respectively, using a single off-the-shelf diffusion model without any +additional data. + +
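The final classification step described here, turning per-class likelihood surrogates into posteriors via Bayes' theorem, can be written compactly. The sketch assumes the per-class ELBOs have already been computed and uses a uniform class prior unless one is supplied.

```python
import torch

def posterior_from_elbos(elbos, log_prior=None):
    """elbos: (batch, num_classes) tensor approximating log p(x | y).
    Returns p(y | x) via Bayes' theorem with softmax normalization."""
    if log_prior is None:
        log_prior = torch.zeros_like(elbos)      # uniform class prior
    return torch.softmax(elbos + log_prior, dim=-1)
```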
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ SemiSAM: Enhancing Semi-Supervised Medical Image Segmentation via + SAM-Assisted Consistency Regularization + + +
+ Semi-supervised learning has attracted much attention due to its lower
+dependence on abundant expert annotations compared to fully supervised methods,
+which is especially important for medical image segmentation, which typically
+requires intensive pixel/voxel-wise labeling by domain experts. Although
+semi-supervised methods can improve performance by utilizing unlabeled data,
+there are still gaps compared with fully supervised methods under extremely
+limited annotation scenarios. In this paper, we propose a simple yet efficient
+strategy to explore the usage of the Segment Anything Model (SAM) for enhancing
+semi-supervised medical image segmentation. Concretely, the segmentation model
+trained with domain knowledge provides information for localization and
+generating input prompts to the SAM. Then the generated pseudo-labels of SAM
+are utilized as additional supervision to assist in the learning procedure of
+the semi-supervised framework. Extensive experiments demonstrate that SemiSAM
+significantly improves the performance of existing semi-supervised frameworks
+when only one or a few labeled images are available and shows strong efficiency
+as a plug-and-play strategy for semi-supervised medical image segmentation. + +
+
+ comment: Accept for BIBM 2024 +
+
+
+
+
+ + ♻ ☆ RotCAtt-TransUNet++: Novel Deep Neural Network for Sophisticated Cardiac + Segmentation + + +
+ Cardiovascular disease remains a predominant global health concern, +responsible for a significant portion of mortality worldwide. Accurate +segmentation of cardiac medical imaging data is pivotal in mitigating fatality +rates associated with cardiovascular conditions. However, existing +state-of-the-art (SOTA) neural networks, including both CNN-based and +Transformer-based approaches, exhibit limitations in practical applicability +due to their inability to effectively capture inter-slice connections alongside +intra-slice information. This deficiency is particularly evident in datasets +featuring intricate, long-range details along the z-axis, such as coronary +arteries in axial views. Additionally, SOTA methods fail to differentiate +non-cardiac components from myocardium in segmentation, leading to the +"spraying" phenomenon. To address these challenges, we present +RotCAtt-TransUNet++, a novel architecture tailored for robust segmentation of +complex cardiac structures. Our approach emphasizes modeling global contexts by +aggregating multiscale features with nested skip connections in the encoder. It +integrates transformer layers to capture interactions between patches and +employs a rotatory attention mechanism to capture connectivity between multiple +slices (inter-slice information). Additionally, a channel-wise cross-attention +gate guides the fused multi-scale channel-wise information and features from +decoder stages to bridge semantic gaps. Experimental results demonstrate that +our proposed model outperforms existing SOTA approaches across four cardiac +datasets and one abdominal dataset. Importantly, coronary arteries and +myocardium are annotated with near-perfect accuracy during inference. An +ablation study shows that the rotatory attention mechanism effectively +transforms embedded vectorized patches in the semantic dimensional space, +enhancing segmentation accuracy. + +
+
+ comment: 11 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Exploring Self-Supervised Skeleton-Based Human Action Recognition under + Occlusions + + +
+ To integrate self-supervised skeleton-based action recognition methods into +autonomous robotic systems, it is crucial to consider adverse situations +involving target occlusions. Such a scenario, despite its practical relevance, +is rarely addressed in existing self-supervised skeleton-based action +recognition methods. To empower models with the capacity to address occlusion, +we propose a simple and effective method. We first pre-train using occluded +skeleton sequences, then use k-means clustering (KMeans) on sequence embeddings +to group semantically similar samples. Next, we propose KNN-Imputation to fill +in missing skeleton data based on the closest sample neighbors. Imputing +incomplete skeleton sequences to create relatively complete sequences as input +provides significant benefits to existing skeleton-based self-supervised +methods. Meanwhile, building on the state-of-the-art Partial Spatio-Temporal +Learning (PSTL), we introduce an Occluded Partial Spatio-Temporal Learning +(OPSTL) framework. This enhancement utilizes Adaptive Spatial Masking (ASM) for +better use of high-quality, intact skeletons. The new proposed method is +verified on the challenging occluded versions of the NTURGB+D 60 and NTURGB+D +120. The source code is publicly available at https://github.com/cyfml/OPSTL. + +
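The KNN-Imputation step, filling missing skeleton data from the closest sample neighbors, can be sketched as below. Tensor shapes and the averaging rule are assumptions for illustration; see the linked repository for the authors' code.

```python
import numpy as np

def knn_impute(embeddings, sequences, mask, k=5):
    """Fill occluded skeleton entries using the mean of the k nearest sequences
    in embedding space. embeddings: (N, d); sequences: (N, T, J, C);
    mask: (N, T, J) with 1 = observed, 0 = occluded."""
    dists = np.linalg.norm(embeddings[:, None] - embeddings[None, :], axis=-1)
    np.fill_diagonal(dists, np.inf)                   # exclude each sample itself
    neighbors = np.argsort(dists, axis=1)[:, :k]

    imputed = sequences.copy()
    for i in range(len(sequences)):
        donor = sequences[neighbors[i]].mean(axis=0)  # (T, J, C) neighbor average
        missing = mask[i] == 0                        # (T, J) occluded entries
        imputed[i][missing] = donor[missing]
    return imputed
```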
+
+ comment: The source code is publicly available at + https://github.com/cyfml/OPSTL +
+
+
+
+
+ + ♻ ☆ RealignDiff: Boosting Text-to-Image Diffusion Model with Coarse-to-fine + Semantic Re-alignment + + +
+ Recent advances in text-to-image diffusion models have achieved remarkable +success in generating high-quality, realistic images from textual descriptions. +However, these approaches have faced challenges in precisely aligning the +generated visual content with the textual concepts described in the prompts. In +this paper, we propose a two-stage coarse-to-fine semantic re-alignment method, +named RealignDiff, aimed at improving the alignment between text and images in +text-to-image diffusion models. In the coarse semantic re-alignment phase, a +novel caption reward, leveraging the BLIP-2 model, is proposed to evaluate the +semantic discrepancy between the generated image caption and the given text +prompt. Subsequently, the fine semantic re-alignment stage employs a local +dense caption generation module and a re-weighting attention modulation module +to refine the previously generated images from a local semantic view. +Experimental results on the MS-COCO and ViLG-300 datasets demonstrate that the +proposed two-stage coarse-to-fine semantic re-alignment method outperforms +other baseline re-alignment techniques by a substantial margin in both visual +quality and semantic similarity with the input prompt. + +
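The coarse-stage caption reward can be sketched as: caption the generated image, then score the caption's similarity to the original prompt in a text embedding space. In the sketch, `captioner` and `text_encoder` are placeholders (for example, BLIP-2 and any sentence encoder), and cosine similarity is an assumed scoring choice; the paper's exact reward formulation may differ.

```python
import torch.nn.functional as F

def caption_reward(image, prompt, captioner, text_encoder):
    """Reward = similarity between the caption of the generated image and the prompt."""
    caption = captioner(image)                 # placeholder captioning model (returns str)
    emb = text_encoder([caption, prompt])      # placeholder encoder -> (2, d) tensor
    emb = F.normalize(emb, dim=-1)
    return (emb[0] * emb[1]).sum()             # cosine similarity as the reward
```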
+
+
+
+
+ + ♻ ☆ Real-World Robot Applications of Foundation Models: A Review + + +
+ Recent developments in foundation models, like Large Language Models (LLMs) +and Vision-Language Models (VLMs), trained on extensive data, facilitate +flexible application across different tasks and modalities. Their impact spans +various fields, including healthcare, education, and robotics. This paper +provides an overview of the practical application of foundation models in +real-world robotics, with a primary emphasis on the replacement of specific +components within existing robot systems. The summary encompasses the +perspective of input-output relationships in foundation models, as well as +their role in perception, motion planning, and control within the field of +robotics. This paper concludes with a discussion of future challenges and +implications for practical robot applications. + +
+
+
+
+
+ + ♻ ☆ MMBench-Video: A Long-Form Multi-Shot Benchmark for Holistic Video + Understanding NeurIPS 2024 + + +
+ The advent of large vision-language models (LVLMs) has spurred research into
+their applications in multi-modal contexts, particularly in video
+understanding. Traditional VideoQA benchmarks, despite providing quantitative
+metrics, often fail to encompass the full spectrum of video content and
+inadequately assess models' temporal comprehension. To address these
+limitations, we introduce MMBench-Video, a quantitative benchmark designed to
+rigorously evaluate LVLMs' proficiency in video understanding. MMBench-Video
+incorporates lengthy videos from YouTube and employs free-form questions,
+mirroring practical use cases. The benchmark is meticulously crafted to probe
+the models' temporal reasoning skills, with all questions human-annotated
+according to a carefully constructed ability taxonomy. We employ GPT-4 for
+automated assessment, demonstrating superior accuracy and robustness over
+earlier LLM-based evaluations. Utilizing MMBench-Video, we have conducted
+comprehensive evaluations that include both proprietary and open-source LVLMs
+for images and videos. MMBench-Video stands as a valuable resource for the
+research community, facilitating improved evaluation of LVLMs and catalyzing
+progress in the field of video understanding. The evaluation code of
+MMBench-Video will be integrated into VLMEvalKit:
+https://github.com/open-compass/VLMEvalKit. + +
+
+ comment: Accepted in NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ♻ ☆ Selective Vision is the Challenge for Visual Reasoning: A Benchmark for + Visual Argument Understanding EMNLP 2024 + + +
+ Visual arguments, often used in advertising or social causes, rely on images +to persuade viewers to do or believe something. Understanding these arguments +requires selective vision: only specific visual stimuli within an image are +relevant to the argument, and relevance can only be understood within the +context of a broader argumentative structure. While visual arguments are +readily appreciated by human audiences, we ask: are today's AI capable of +similar understanding? We present VisArgs, a dataset of 1,611 images annotated +with 5,112 visual premises (with regions), 5,574 commonsense premises, and +reasoning trees connecting them into structured arguments. We propose three +tasks for evaluating visual argument understanding: premise localization, +premise identification, and conclusion deduction. Experiments show that 1) +machines struggle to capture visual cues: GPT-4-O achieved 78.5% accuracy, +while humans reached 98.0%. Models also performed 19.5% worse when +distinguishing between irrelevant objects within the image compared to external +objects. 2) Providing relevant visual premises improved model performance +significantly. + +
+
+ comment: 12 pages, 6 figures. Accepted as main paper in EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Can visual language models resolve textual ambiguity with visual cues? + Let visual puns tell you! EMNLP 2024 + + +
+ Humans possess multimodal literacy, allowing them to actively integrate +information from various modalities to form reasoning. Faced with challenges +like lexical ambiguity in text, we supplement this with other modalities, such +as thumbnail images or textbook illustrations. Is it possible for machines to +achieve a similar multimodal understanding capability? In response, we present +Understanding Pun with Image Explanations (UNPIE), a novel benchmark designed +to assess the impact of multimodal inputs in resolving lexical ambiguities. +Puns serve as the ideal subject for this evaluation due to their intrinsic +ambiguity. Our dataset includes 1,000 puns, each accompanied by an image that +explains both meanings. We pose three multimodal challenges with the +annotations to assess different aspects of multimodal literacy; Pun Grounding, +Disambiguation, and Reconstruction. The results indicate that various Socratic +Models and Visual-Language Models improve over the text-only models when given +visual context, particularly as the complexity of the tasks increases. + +
+
+ comment: Accepted as main paper in EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ CV-VAE: A Compatible Video VAE for Latent Generative Video Models + + +
+ Spatio-temporal compression of videos, utilizing networks such as Variational
+Autoencoders (VAE), plays a crucial role in OpenAI's SORA and numerous other
+video generative models. For instance, many LLM-like video models learn the
+distribution of discrete tokens derived from 3D VAEs within the VQVAE
+framework, while most diffusion-based video models capture the distribution of
+continuous latents extracted by 2D VAEs without quantization. The temporal
+compression is simply realized by uniform frame sampling, which results in
+unsmooth motion between consecutive frames. Currently, the research community
+lacks a commonly used continuous video (3D) VAE for latent diffusion-based
+video models. Moreover, since current diffusion-based approaches are often
+implemented using pre-trained text-to-image (T2I) models, directly training a
+video VAE without considering the compatibility with existing T2I models will
+result in a latent space gap between them, which will take huge computational
+resources to bridge even with the T2I models as initialization. To address this
+issue, we propose a method for training a video VAE of latent video models,
+namely CV-VAE, whose latent space is compatible with that of a given image VAE,
+e.g., the image VAE of Stable Diffusion (SD). The compatibility is achieved by
+the proposed novel latent space regularization, which involves formulating a
+regularization loss using the image VAE. Benefiting from the latent space
+compatibility, video models can be trained seamlessly from pre-trained T2I or
+video models in a truly spatio-temporally compressed latent space, rather than
+simply sampling video frames at equal intervals. With our CV-VAE, existing
+video models can generate four times more frames with minimal finetuning.
+Extensive experiments are conducted to demonstrate the effectiveness of the
+proposed video VAE. + +
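A minimal reading of the latent space regularization is: keep the video VAE's per-frame latents close to those of a frozen image VAE. The sketch below assumes `encode` and `encode_frames` interfaces that return latents of matching shape; these names are illustrative placeholders, not the released API, and the actual loss may differ.

```python
import torch
import torch.nn.functional as F

def latent_space_regularization(video_vae, image_vae, frames):
    """Encourage per-frame video-VAE latents to stay near the latents of a frozen
    image VAE (e.g., the SD image VAE), so the two latent spaces stay compatible."""
    with torch.no_grad():
        target_latents = image_vae.encode(frames)        # frozen reference latents
    video_latents = video_vae.encode_frames(frames)      # assumed per-frame latents
    return F.mse_loss(video_latents, target_latents)
```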
+
+ comment: Project Page: https://ailab-cvc.github.io/cvvae/index.html +
+
+
+
+
+ + ♻ ☆ GPHM: Gaussian Parametric Head Model for Monocular Head Avatar + Reconstruction + + +
+ Creating high-fidelity 3D human head avatars is crucial for applications in
+VR/AR, digital humans, and film production. Recent advances have leveraged
+morphable face models to generate animated head avatars from easily accessible
+data, representing varying identities and expressions within a low-dimensional
+parametric space. However, existing methods often struggle with modeling
+complex appearance details, e.g., hairstyles, and suffer from low rendering
+quality and efficiency. In this paper, we introduce a novel approach, the 3D
+Gaussian Parametric Head Model, which employs 3D Gaussians to accurately
+represent the complexities of the human head, allowing precise control over
+both identity and expression. The Gaussian model can handle intricate details,
+enabling realistic representations of varying appearances and complex
+expressions. Furthermore, we present a well-designed training framework to
+ensure smooth convergence, providing a robust guarantee for learning the rich
+content. Our method achieves high-quality, photo-realistic rendering with
+real-time efficiency, making it a valuable contribution to the field of
+parametric head models. Finally, we apply the 3D Gaussian Parametric Head Model
+to monocular video or few-shot head avatar reconstruction tasks, which enables
+instant reconstruction of high-quality 3D head avatars even when input data is
+extremely limited, surpassing previous methods in terms of reconstruction
+quality and training speed. + +
+
+ comment: Project page: https://yuelangx.github.io/gphmv2/ +
+
+
+
+
+ + ♻ ☆ CoIN: A Benchmark of Continual Instruction tuNing for Multimodel Large + Language Model + + +
+ Instruction tuning represents a prevalent strategy employed by Multimodal
+Large Language Models (MLLMs) to align with human instructions and adapt to new
+tasks. Nevertheless, MLLMs encounter the challenge of adapting to users'
+evolving knowledge and demands. Therefore, how to retain existing skills while
+acquiring new knowledge needs to be investigated. In this paper, we present a
+comprehensive benchmark, namely Continual Instruction tuNing (CoIN), to assess
+existing MLLMs in the sequential instruction tuning paradigm. CoIN comprises 10
+commonly used datasets spanning 8 task categories, ensuring a diverse range of
+instructions and tasks. Besides, the trained model is evaluated from two
+aspects: Instruction Following and General Knowledge, which assess the
+alignment with human intention and the knowledge preserved for reasoning,
+respectively. Experiments on CoIN demonstrate that current powerful MLLMs still
+suffer from catastrophic forgetting, and that the failure lies mainly in
+intention alignment rather than in knowledge forgetting. To this end, we
+introduce MoELoRA to MLLMs, which is effective in retaining the previous
+instruction alignment. Experimental results consistently show that this method
+reduces forgetting on CoIN. + +
+
+
+
+
+ + ♻ ☆ CD-NGP: A Fast Scalable Continual Representation for Dynamic Scenes + + +
+ Current methodologies for novel view synthesis (NVS) in dynamic scenes +encounter significant challenges in harmonizing memory consumption, model +complexity, training efficiency, and rendering fidelity. Existing offline +techniques, while delivering high-quality results, are often characterized by +substantial memory demands and limited scalability. In contrast, online methods +grapple with the challenge of balancing rapid convergence with model +compactness. To address these issues, we propose continual dynamic neural +graphics primitives (CD-NGP). Our approach synergizes features from both +temporal and spatial hash encodings to achieve high rendering quality, employs +parameter reuse to enhance scalability, and leverages a continual learning +framework to mitigate memory overhead. Furthermore, we introduce a novel +dataset comprising multi-view, exceptionally long video sequences with +substantial rigid and non-rigid motion, thereby substantiating the scalability +of our method. + +
+
+ comment: new template, editing +
+
+
+
+
+ + ♻ ☆ Hybrid Spatial Representations for Species Distribution Modeling SDM + + +
+ We address an important problem in ecology called Species Distribution +Modeling (SDM), whose goal is to predict whether a species exists at a certain +position on Earth. In particular, we tackle a challenging version of this task, +where we learn from presence-only data in a community-sourced dataset, model a +large number of species simultaneously, and do not use any additional +environmental information. Previous work has used neural implicit +representations to construct models that achieve promising results. However, +implicit representations often generate predictions of limited spatial +precision. We attribute this limitation to their inherently global formulation +and inability to effectively capture local feature variations. This issue is +especially pronounced with presence-only data and a large number of species. To +address this, we propose a hybrid embedding scheme that combines both implicit +and explicit embeddings. Specifically, the explicit embedding is implemented +with a multiresolution hashgrid, enabling our models to better capture local +information. Experiments demonstrate that our results exceed other works by a +large margin on various standard benchmarks, and that the hybrid representation +is better than both purely implicit and explicit ones. Qualitative +visualizations and comprehensive ablation studies reveal that our hybrid +representation successfully addresses the two main challenges. Our code is +open-sourced at https://github.com/Shiran-Yuan/HSR-SDM. + +
+
+ comment: Project codebase https://github.com/Shiran-Yuan/HSR-SDM +
+
+
+
+
+
+
+
+ + Information Retrieval 8 + +
+
+
+ + ☆ SimRAG: Self-Improving Retrieval-Augmented Generation for Adapting Large + Language Models to Specialized Domains + + +
+ Retrieval-augmented generation (RAG) enhances the question-answering (QA)
+abilities of large language models (LLMs) by integrating external knowledge.
+However, adapting general-purpose RAG systems to specialized fields such as
+science and medicine poses unique challenges due to distribution shifts and
+limited access to domain-specific data. To tackle this, we propose SimRAG, a
+self-training approach that equips the LLM with joint capabilities of question
+answering and question generation for domain adaptation. Our method first
+fine-tunes the LLM on instruction-following, question-answering, and
+search-related data. Then, it prompts the same LLM to generate diverse
+domain-relevant questions from unlabeled corpora, with an additional filtering
+strategy to retain high-quality synthetic examples. By leveraging these
+synthetic examples, the LLM can improve its performance on domain-specific RAG
+tasks. Experiments on 11 datasets, spanning two backbone sizes and three
+domains, demonstrate that SimRAG outperforms baselines by 1.2\%--8.6\%. + +
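One self-improvement round in the spirit described above can be sketched as: generate questions and answers from unlabeled domain text with the tuned LLM, filter for quality, and fine-tune on the survivors. In the sketch, `llm`, `quality_filter`, and `fine_tune` are placeholder callables, not the authors' interfaces, and the prompts are illustrative.

```python
def self_training_round(llm, corpus_chunks, quality_filter, fine_tune):
    """Generate synthetic QA pairs from unlabeled domain text, keep the good ones,
    and fine-tune the model on them (a simplified SimRAG-style round)."""
    synthetic = []
    for chunk in corpus_chunks:
        question = llm(f"Write a question answerable from this passage:\n{chunk}")
        answer = llm(f"Passage:\n{chunk}\n\nQuestion: {question}\nAnswer:")
        if quality_filter(chunk, question, answer):   # e.g., round-trip consistency check
            synthetic.append({"context": chunk, "question": question, "answer": answer})
    return fine_tune(llm, synthetic)
```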
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ YOLO-Vehicle-Pro: A Cloud-Edge Collaborative Framework for Object + Detection in Autonomous Driving under Adverse Weather Conditions + + +
+ With the rapid advancement of autonomous driving technology, efficient and +accurate object detection capabilities have become crucial factors in ensuring +the safety and reliability of autonomous driving systems. However, in +low-visibility environments such as hazy conditions, the performance of +traditional object detection algorithms often degrades significantly, failing +to meet the demands of autonomous driving. To address this challenge, this +paper proposes two innovative deep learning models: YOLO-Vehicle and +YOLO-Vehicle-Pro. YOLO-Vehicle is an object detection model tailored +specifically for autonomous driving scenarios, employing multimodal fusion +techniques to combine image and textual information for object detection. +YOLO-Vehicle-Pro builds upon this foundation by introducing an improved image +dehazing algorithm, enhancing detection performance in low-visibility +environments. In addition to model innovation, this paper also designs and +implements a cloud-edge collaborative object detection system, deploying models +on edge devices and offloading partial computational tasks to the cloud in +complex situations. Experimental results demonstrate that on the KITTI dataset, +the YOLO-Vehicle-v1s model achieved 92.1% accuracy while maintaining a +detection speed of 226 FPS and an inference time of 12ms, meeting the real-time +requirements of autonomous driving. When processing hazy images, the +YOLO-Vehicle-Pro model achieved a high accuracy of 82.3% mAP@50 on the Foggy +Cityscapes dataset while maintaining a detection speed of 43 FPS. + +
+
+
+
+
+ + ☆ Testing Deep Learning Recommender Systems Models on Synthetic + GAN-Generated Datasets + + +
+ The published method Generative Adversarial Networks for Recommender Systems +(GANRS) allows generating data sets for collaborative filtering recommendation +systems. The GANRS source code is available along with a representative set of +generated datasets. We have tested the GANRS method by creating multiple +synthetic datasets from three different real datasets taken as a source. +Experiments include variations in the number of users in the synthetic +datasets, as well as a different number of samples. We have also selected six +state-of-the-art collaborative filtering deep learning models to test both +their comparative performance and the GANRS method. The results show a +consistent behavior of the generated datasets compared to the source ones; +particularly, in the obtained values and trends of the precision and recall +quality measures. The tested deep learning models have also performed as +expected on all synthetic datasets, making it possible to compare the results +with those obtained from the real source data. Future work is proposed, +including different cold start scenarios, unbalanced data, and demographic +fairness. + +
+
+ comment: 10 pages, 7 figures, In press +
+
+
+
+
+ + ☆ Comprehensive Evaluation of Matrix Factorization Models for + Collaborative Filtering Recommender Systems + + +
+ Matrix factorization models are the core of current commercial collaborative
+filtering Recommender Systems. This paper tested six representative matrix
+factorization models, using four collaborative filtering datasets. Experiments
+have tested a variety of accuracy and beyond-accuracy quality measures,
+including prediction, recommendation of ordered and unordered lists, novelty,
+and diversity. Results show which matrix factorization model is most convenient
+depending on its simplicity, the required prediction quality, the necessary
+recommendation quality, the desired recommendation novelty and diversity, the
+need to explain recommendations, the adequacy of assigning semantic
+interpretations to hidden factors, the advisability of recommending to groups
+of users, and the need to obtain reliability values. To ensure the
+reproducibility of the experiments, an open framework has been used, and the
+implementation code is provided. + +
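For reference, the model family under evaluation reduces, in its plainest form, to predicting a rating as the dot product of user and item factors learned by SGD. The sketch below is a textbook baseline under assumed integer user and item ids, not the paper's exact experimental setup.

```python
import numpy as np

def mf_train(ratings, n_factors=32, lr=0.01, reg=0.05, epochs=20):
    """Plain matrix factorization with SGD: predict r_ui ~ P[u] . Q[i].
    `ratings` is a list of (user, item, rating) triples with integer ids."""
    n_users = max(u for u, _, _ in ratings) + 1
    n_items = max(i for _, i, _ in ratings) + 1
    P = 0.1 * np.random.randn(n_users, n_factors)
    Q = 0.1 * np.random.randn(n_items, n_factors)
    for _ in range(epochs):
        for u, i, r in ratings:
            p_u = P[u].copy()
            err = r - p_u @ Q[i]
            P[u] += lr * (err * Q[i] - reg * p_u)
            Q[i] += lr * (err * p_u - reg * Q[i])
    return P, Q   # predict a rating with P[u] @ Q[i]
```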
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Extending and Applying Automated HERMES Software Publication Workflows + + +
+ Research software is an important output of research and must be published
+according to the FAIR Principles for Research Software. This can be achieved by
+publishing software with metadata under a persistent identifier. HERMES is a
+tool that leverages continuous integration to automate the publication of
+software with rich metadata. In this work, we describe the HERMES workflow
+itself, and how to extend it to meet the needs of specific research software
+metadata or infrastructure. We introduce the HERMES plugin architecture and
+provide the example of creating a new HERMES plugin that harvests metadata from
+a metadata source in source code repositories. We show how to use HERMES as an
+end user, both via the command line interface and as a step in a continuous
+integration pipeline. Finally, we report three informal case studies whose
+results provide a preliminary evaluation of the feasibility and applicability
+of HERMES workflows, and the extensibility of the hermes software package. + +
+
+ comment: 17 pages, 2 figures, 2 tables, submitted to a special issue of + Electronic Communications of the EASST collecting submissions of deRSE24, + Conference for Research Software Engineers +
+
+
+
+
+ + ♻ ☆ LLM-Assisted Multi-Teacher Continual Learning for Visual Question + Answering in Robotic Surgery ICRA + + +
+ Visual question answering (VQA) is crucial for promoting surgical education. +In practice, the needs of trainees are constantly evolving, such as learning +more surgical types, adapting to different robots, and learning new surgical +instruments and techniques for various surgeries. However, patient data privacy +often restricts the availability of old data when updating the model, +necessitating an exemplar-free continual learning (CL) setup. Prior CL studies +overlooked two vital problems in the surgical domain: 1) large domain shifts +from diverse surgical operations collected from multiple sources, and 2) severe +data imbalance arising from the uneven presence of surgical instruments or +activities. This paper proposes addressing these problems with a multimodal +large language model (LLM) and an adaptive weight assignment methodology. We +first develop a new multi-teacher CL framework that leverages a multimodal LLM +as the additional teacher. The strong generalization ability of the LLM can +bridge the knowledge gap when domain shifts and data imbalances occur. We then +put forth a novel data processing method that transforms complex LLM embeddings +into logits compatible with our CL framework. We further design an adaptive +weight assignment approach that balances the generalization ability of the LLM +and the domain expertise of the old CL model. Finally, to comprehensively test +the effectiveness of our proposed method, we have also constructed two new +surgical VQA datasets that are largely different from existing ones and could +be valuable resources for future research. Extensive experimental results on +the tested datasets demonstrate the superiority of our method to other advanced +CL schemes. + +
+
+ comment: This paper has been accepted by the 2024 IEEE International
+ Conference on Robotics and Automation (ICRA) +
+
+
+
+
+ + ♻ ☆ From Keywords to Structured Summaries: Streamlining Scholarly + Information Access ISWC 2024 + + +
+ This paper highlights the growing importance of information retrieval (IR) +engines in the scientific community, addressing the inefficiency of traditional +keyword-based search engines due to the rising volume of publications. The +proposed solution involves structured records, underpinning advanced +information technology (IT) tools, including visualization dashboards, to +revolutionize how researchers access and filter articles, replacing the +traditional text-heavy approach. This vision is exemplified through a proof of +concept centered on the "reproductive number estimate of infectious diseases" +research theme, using a fine-tuned large language model (LLM) to automate the +creation of structured records to populate a backend database that now goes +beyond keywords. The result is a next-generation information access system as +an IR method accessible at https://orkg.org/usecases/r0-estimates. + +
+
+ comment: 8 pages, 3 figures | Accepted for publication as a poster paper at + the International Semantic Web Conference (ISWC 2024) +
+
+
+
+
+ + ♻ ☆ Post-Training Attribute Unlearning in Recommender Systems + + +
+ With the growing privacy concerns in recommender systems, recommendation +unlearning is getting increasing attention. Existing studies predominantly use +training data, i.e., model inputs, as unlearning target. However, attackers can +extract private information from the model even if it has not been explicitly +encountered during training. We name this unseen information as +\textit{attribute} and treat it as unlearning target. To protect the sensitive +attribute of users, Attribute Unlearning (AU) aims to make target attributes +indistinguishable. In this paper, we focus on a strict but practical setting of +AU, namely Post-Training Attribute Unlearning (PoT-AU), where unlearning can +only be performed after the training of the recommendation model is completed. +To address the PoT-AU problem in recommender systems, we propose a +two-component loss function. The first component is distinguishability loss, +where we design a distribution-based measurement to make attribute labels +indistinguishable from attackers. We further extend this measurement to handle +multi-class attribute cases with efficient computational overhead. The second +component is regularization loss, where we explore a function-space measurement +that effectively maintains recommendation performance compared to +parameter-space regularization. We use stochastic gradient descent algorithm to +optimize our proposed loss. Extensive experiments on four real-world datasets +demonstrate the effectiveness of our proposed methods. + +
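The two-component loss can be sketched as a distinguishability term that pulls per-attribute-group embedding statistics together plus a regularization term that keeps embeddings near their pre-unlearning values. The specific measurements below (centroid distances and MSE) are simplifications of the distribution-based and function-space measurements described in the abstract.

```python
import torch
import torch.nn.functional as F

def pot_au_loss(embeddings, attribute_labels, original_embeddings, lam=1.0):
    """Sketch of a two-component PoT-AU objective:
    (1) make per-attribute-group embeddings hard to distinguish, and
    (2) keep the edited embeddings close to the original ones."""
    groups = [embeddings[attribute_labels == a] for a in attribute_labels.unique()]
    centroids = torch.stack([g.mean(dim=0) for g in groups])
    if len(centroids) > 1:
        distinguishability = torch.pdist(centroids).mean()   # pull group centroids together
    else:
        distinguishability = embeddings.new_zeros(())
    regularization = F.mse_loss(embeddings, original_embeddings)
    return distinguishability + lam * regularization
```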
+
+ comment: Accepted by TOIS. arXiv admin note: text overlap with + arXiv:2310.05847 +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Prioritized Generative Replay + + +
+ Sample-efficient online reinforcement learning often uses replay buffers to +store experience for reuse when updating the value function. However, uniform +replay is inefficient, since certain classes of transitions can be more +relevant to learning. While prioritization of more useful samples is helpful, +this strategy can also lead to overfitting, as useful samples are likely to be +more rare. In this work, we instead propose a prioritized, parametric version +of an agent's memory, using generative models to capture online experience. +This paradigm enables (1) densification of past experience, with new +generations that benefit from the generative model's generalization capacity +and (2) guidance via a family of "relevance functions" that push these +generations towards more useful parts of an agent's acquired history. We show +this recipe can be instantiated using conditional diffusion models and simple +relevance functions such as curiosity- or value-based metrics. Our approach +consistently improves performance and sample efficiency in both state- and +pixel-based domains. We expose the mechanisms underlying these gains, showing +how guidance promotes diversity in our generated transitions and reduces +overfitting. We also showcase how our approach can train policies with even +higher update-to-data ratios than before, opening up avenues to better scale +online RL agents. + +
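A crude approximation of relevance-guided generative replay is to sample candidate transitions from the generative memory and keep a relevance-weighted subset. The paper instead guides generation directly with relevance functions; the snippet below is only a post-hoc filtering sketch with placeholder callables for the generator and the relevance score.

```python
import numpy as np

def sample_relevant_generations(generate, relevance_fn, n_candidates=256, n_keep=64):
    """Draw candidate transitions from a learned generative memory, then keep a
    relevance-weighted subset (relevance_fn could be a curiosity or value score)."""
    candidates = generate(n_candidates)                         # list of transitions
    scores = np.array([relevance_fn(t) for t in candidates], dtype=float)
    probs = np.exp(scores - scores.max())
    probs /= probs.sum()
    keep = np.random.choice(len(candidates), size=n_keep, replace=False, p=probs)
    return [candidates[i] for i in keep]
```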
+
+
+
+
+ + ☆ ALTA: Compiler-Based Analysis of Transformers + + +
+ We propose a new programming language called ALTA and a compiler that can map +ALTA programs to Transformer weights. ALTA is inspired by RASP, a language +proposed by Weiss et al. (2021), and Tracr (Lindner et al., 2023), a compiler +from RASP programs to Transformer weights. ALTA complements and extends this +prior work, offering the ability to express loops and to compile programs to +Universal Transformers, among other advantages. ALTA allows us to +constructively show how Transformers can represent length-invariant algorithms +for computing parity and addition, as well as a solution to the SCAN benchmark +of compositional generalization tasks, without requiring intermediate +scratchpad decoding steps. We also propose tools to analyze cases where the +expressibility of an algorithm is established, but end-to-end training on a +given training set fails to induce behavior consistent with the desired +algorithm. To this end, we explore training from ALTA execution traces as a +more fine-grained supervision signal. This enables additional experiments and +theoretical analyses relating the learnability of various algorithms to data +availability and modeling decisions, such as positional encodings. We make the +ALTA framework -- language specification, symbolic interpreter, and weight +compiler -- available to the community to enable further applications and +insights. + +
+
+
+
+
+ + ☆ Leveraging Skills from Unlabeled Prior Data for Efficient Online + Exploration + + +
+ Unsupervised pretraining has been transformative in many supervised domains. +However, applying such ideas to reinforcement learning (RL) presents a unique +challenge in that fine-tuning does not involve mimicking task-specific data, +but rather exploring and locating the solution through iterative +self-improvement. In this work, we study how unlabeled prior trajectory data +can be leveraged to learn efficient exploration strategies. While prior data +can be used to pretrain a set of low-level skills, or as additional off-policy +data for online RL, it has been unclear how to combine these ideas effectively +for online exploration. Our method SUPE (Skills from Unlabeled Prior data for +Exploration) demonstrates that a careful combination of these ideas compounds +their benefits. Our method first extracts low-level skills using a variational +autoencoder (VAE), and then pseudo-relabels unlabeled trajectories using an +optimistic reward model, transforming prior data into high-level, task-relevant +examples. Finally, SUPE uses these transformed examples as additional +off-policy data for online RL to learn a high-level policy that composes +pretrained low-level skills to explore efficiently. We empirically show that +SUPE reliably outperforms prior strategies, successfully solving a suite of +long-horizon, sparse-reward tasks. Code: https://github.com/rail-berkeley/supe. + +
+
+ comment: 23 pages, 10 figures +
+
+
+
+
+ + ☆ ProFL: Performative Robust Optimal Federated Learning + + +
+ Performative prediction (PP) is a framework that captures distribution shifts +that occur during the training of machine learning models due to their +deployment. As the trained model is used, its generated data could cause the +model to evolve, leading to deviations from the original data distribution. The +impact of such model-induced distribution shifts in the federated learning (FL) +setup remains unexplored despite being increasingly likely to transpire in +real-life use cases. Although Jin et al. (2024) recently extended PP to FL in a +straightforward manner, the resulting model only converges to a performative +stable point, which may be far from optimal. The methods in Izzo et al. (2021); +Miller et al. (2021) can find a performative optimal point in centralized +settings, but they require the performative risk to be convex and the training +data to be noiseless, assumptions often violated in realistic FL systems. This +paper overcomes all of these shortcomings and proposes Performative robust +optimal Federated Learning (ProFL), an algorithm that finds performative +optimal points in FL from noisy and contaminated data. We present the +convergence analysis under the Polyak-Lojasiewicz condition, which applies to +non-convex objectives. Extensive experiments on multiple datasets validate our +proposed algorithms' efficiency. + +
+
+ comment: 27 pages with Appendix, 18 figures. The paper has been submitted and + is currently under review +
+
+
+
+
+ + ☆ UnCLe: Unsupervised Continual Learning of Depth Completion + + +
+ We propose UnCLe, a standardized benchmark for Unsupervised Continual +Learning of a multimodal depth estimation task: Depth completion aims to infer +a dense depth map from a synchronized pair of an RGB image and a sparse depth map. +We benchmark depth completion models under the practical scenario of +unsupervised learning over continuous streams of data. Existing methods are +typically trained on a static, or stationary, dataset. However, when adapting +to novel non-stationary distributions, they "catastrophically forget" +previously learned information. UnCLe simulates these non-stationary +distributions by adapting depth completion models to sequences of datasets +containing diverse scenes captured from distinct domains using different visual +and range sensors. We adopt representative methods from continual learning +paradigms and translate them to enable unsupervised continual learning of depth +completion. We benchmark these models on indoor and outdoor scenes and investigate +the degree of catastrophic forgetting through standard quantitative metrics. +Furthermore, we introduce model inversion quality as an additional measure of +forgetting. We find that unsupervised continual learning of depth completion is +an open problem, and we invite researchers to leverage UnCLe as a development +platform. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Training Free Guided Flow Matching with Optimal Control + + +
+ Controlled generation with pre-trained Diffusion and Flow Matching models has +vast applications. One strategy for guiding ODE-based generative models is +through optimizing a target loss $R(x_1)$ while staying close to the prior +distribution. Along this line, some recent work has shown the effectiveness of +guiding flow models by differentiating through the ODE sampling process. Despite +the superior performance, the theoretical understanding of this line of methods +is still preliminary, leaving space for algorithmic improvement. Moreover, +existing methods predominantly focus on Euclidean data manifolds, and there is a +compelling need for guided flow methods on complex geometries such as SO(3), +which prevail in high-stakes scientific applications like protein design. We +present OC-Flow, a general and theoretically grounded training-free framework +for guided flow matching using optimal control. Building upon advances in +optimal control theory, we develop effective and practical algorithms for +solving optimal control in guided ODE-based generation and provide a systematic +theoretical analysis of the convergence guarantee in both Euclidean and SO(3) +settings. We show that existing backprop-through-ODE methods can be interpreted as +special cases of Euclidean OC-Flow. OC-Flow achieves superior performance in +extensive experiments on text-guided image manipulation, conditional molecule +generation, and all-atom peptide design. + +
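+ A toy PyTorch sketch of the backprop-through-ODE baseline family mentioned above
+(the Euclidean special case, not OC-Flow itself) is given below; the velocity field,
+the target loss R(x1), and all hyperparameters are illustrative assumptions.
+
+import torch
+
+# Guide an ODE-based sampler by differentiating through its sampling path.
+torch.manual_seed(0)
+
+def velocity(x, t):
+    # Placeholder "learned" flow: pushes samples toward the point (2, 2).
+    return torch.tensor([2.0, 2.0]) - x
+
+def sample(x0, steps=20):
+    x, dt = x0, 1.0 / steps
+    for i in range(steps):
+        x = x + dt * velocity(x, i * dt)   # Euler integration of the ODE
+    return x
+
+def R(x1):
+    # Target objective: prefer generated samples near (0, 3).
+    return ((x1 - torch.tensor([0.0, 3.0])) ** 2).sum()
+
+x0 = torch.randn(2, requires_grad=True)
+prior = x0.detach().clone()
+opt = torch.optim.Adam([x0], lr=0.1)
+for _ in range(100):
+    opt.zero_grad()
+    loss = R(sample(x0)) + 0.1 * ((x0 - prior) ** 2).sum()  # stay near the prior
+    loss.backward()                      # gradients flow through the sampler
+    opt.step()
+print(sample(x0).detach())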
+
+
+
+
+ + ☆ Beyond position: how rotary embeddings shape representations and memory + in autoregressive transformers + + +
+ Rotary Positional Embeddings (RoPE) enhance positional encoding in +Transformer models, yet their full impact on model dynamics remains +underexplored. This paper studies how RoPE introduces position-dependent +rotations, causing phase shifts in token embeddings that influence +higher-frequency components within the model's internal representations. +Through spectral analysis, we demonstrate that RoPE's rotation matrices induce +oscillatory behaviors in embeddings, affecting information retention across +layers and shaping temporal modeling capabilities. We show that activation +functions in feed-forward networks interact with RoPE-modulated embeddings to +generate harmonics, leading to constructive or destructive interference based +on phase alignment. Our findings reveal that phase alignment amplifies +activations and sharpens attention, while misalignment weakens activations and +disrupts focus on positional patterns. This study underscores the importance of +frequency components as intrinsic elements of model behavior, offering new +insights beyond traditional analyses. + +
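+ To make the rotation concrete, here is a short NumPy sketch of one common RoPE
+variant (the half-split form); the base frequency and shapes are conventional choices,
+not taken from this paper.
+
+import numpy as np
+
+# Each pair of embedding dimensions is rotated by a position-dependent angle,
+# so phase differences between tokens encode their relative positions.
+def rope(x, base=10000.0):
+    seq_len, dim = x.shape                                 # dim must be even
+    half = dim // 2
+    freqs = base ** (-np.arange(half) / half)              # per-pair frequencies
+    angles = np.outer(np.arange(seq_len), freqs)           # (seq_len, half)
+    cos, sin = np.cos(angles), np.sin(angles)
+    x1, x2 = x[:, :half], x[:, half:]
+    return np.concatenate([x1 * cos - x2 * sin,            # 2-D rotation of
+                           x1 * sin + x2 * cos], axis=-1)  # each dimension pair
+
+q = np.random.randn(8, 16)
+print(rope(q).shape)  # (8, 16): same shape, phase-shifted according to position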
+
+
+
+
+ + ☆ The Double-Edged Sword of Behavioral Responses in Strategic + Classification: Theory and User Studies + + +
+ When humans are subject to an algorithmic decision system, they can +strategically adjust their behavior accordingly (``game'' the system). While a +growing line of literature on strategic classification has used game-theoretic +modeling to understand and mitigate such gaming, these existing works consider +standard models of fully rational agents. In this paper, we propose a strategic +classification model that considers behavioral biases in human responses to +algorithms. We show how misperceptions of a classifier (specifically, of its +feature weights) can lead to different types of discrepancies between biased +and rational agents' responses, and identify when behavioral agents over- or +under-invest in different features. We also show that strategic agents with +behavioral biases can benefit or (perhaps unexpectedly) harm the firm compared +to fully rational strategic agents. We complement our analytical results with +user studies, which support our hypothesis of behavioral biases in human +responses to the algorithm. Together, our findings highlight the need to +account for human (cognitive) biases when designing AI systems, and when providing +explanations of them, to strategic humans in the loop. + +
+
+
+
+
+ + ☆ SPIRE: Synergistic Planning, Imitation, and Reinforcement Learning for + Long-Horizon Manipulation + + +
+ Robot learning has proven to be a general and effective technique for +programming manipulators. Imitation learning is able to teach robots solely +from human demonstrations but is bottlenecked by the capabilities of the +demonstrations. Reinforcement learning uses exploration to discover better +behaviors; however, the space of possible improvements can be too large to +start from scratch. For both techniques, the learning difficulty increases in +proportion to the length of the manipulation task. Accounting for this, we +propose SPIRE, a system that first uses Task and Motion Planning (TAMP) to +decompose tasks into smaller learning subproblems and second combines imitation +and reinforcement learning to maximize their strengths. We develop novel +strategies to train learning agents when deployed in the context of a planning +system. We evaluate SPIRE on a suite of long-horizon and contact-rich robot +manipulation problems. We find that SPIRE outperforms prior approaches that +integrate imitation learning, reinforcement learning, and planning by 35% to +50% in average task performance, is 6 times more data efficient in the number +of human demonstrations needed to train proficient agents, and learns to +complete tasks nearly twice as efficiently. See +https://sites.google.com/view/spire-corl-2024 for more details. + +
+
+ comment: Conference on Robot Learning (CoRL) 2024 +
+
+
+
+
+ + ☆ POD-Attention: Unlocking Full Prefill-Decode Overlap for Faster LLM + Inference + + +
+ Each request in LLM inference goes through two phases: compute-bound prefill +and memory-bandwidth-bound decode. To improve GPU utilization, recent systems +use hybrid batching that combines the prefill and decode phases of different +requests into the same batch. Hybrid batching works well for linear operations +as it amortizes the cost of loading model weights from HBM. However, attention +computation in hybrid batches remains inefficient because existing attention +kernels are optimized for either prefill or decode. + In this paper, we present POD-Attention -- the first GPU kernel that +efficiently computes attention for hybrid batches. POD-Attention aims to +maximize the utilization of both compute and memory bandwidth by carefully +allocating the GPU's resources such that prefill and decode operations happen +concurrently on the same multiprocessor. We integrate POD-Attention in a +state-of-the-art LLM inference scheduler Sarathi-Serve. POD-Attention speeds up +attention computation by up to 75% (mean 28%) and increases LLM serving +throughput by up to 22% in offline inference. In online inference, +POD-Attention enables lower time-to-first-token (TTFT), time-between-tokens +(TBT), and request execution latency versus Sarathi-Serve. + +
+
+
+
+
+ + ☆ Inferring stability properties of chaotic systems on autoencoders' + latent spaces + + +
+ The data-driven learning of solutions of partial differential equations can +be based on a divide-and-conquer strategy. First, the high dimensional data is +compressed to a latent space with an autoencoder; and, second, the temporal +dynamics are inferred on the latent space with a form of recurrent neural +network. In chaotic systems and turbulence, convolutional autoencoders and echo +state networks (CAE-ESN) successfully forecast the dynamics, but little is +known about whether the stability properties can also be inferred. We show that +the CAE-ESN model infers the invariant stability properties and the geometry of +the tangent space in the low-dimensional manifold (i.e. the latent space) +through Lyapunov exponents and covariant Lyapunov vectors. This work opens up +new opportunities for inferring the stability of high-dimensional chaotic +systems in latent spaces. + +
+
+
+
+
+ + ☆ Estimating the Spectral Moments of the Kernel Integral Operator from + Finite Sample Matrices + + +
+ Analyzing the structure of sampled features from an input data distribution +is challenging when constrained by limited measurements in both the number of +inputs and features. Traditional approaches often rely on the eigenvalue +spectrum of the sample covariance matrix derived from finite measurement +matrices; however, these spectra are sensitive to the size of the measurement +matrix, leading to biased insights. In this paper, we introduce a novel +algorithm that provides unbiased estimates of the spectral moments of the +kernel integral operator in the limit of infinite inputs and features from +finitely sampled measurement matrices. Our method, based upon dynamic +programming, is efficient and capable of estimating the moments of the operator +spectrum. We demonstrate the accuracy of our estimator on radial basis function +(RBF) kernels, highlighting its consistency with the theoretical spectra. +Furthermore, we showcase the practical utility and robustness of our method in +understanding the geometry of learned representations in neural networks. + +
+
+
+
+
+ + ☆ Federated Transformer: Multi-Party Vertical Federated Learning on + Practical Fuzzily Linked Data + + +
+ Federated Learning (FL) is an evolving paradigm that enables multiple parties +to collaboratively train models without sharing raw data. Among its variants, +Vertical Federated Learning (VFL) is particularly relevant in real-world, +cross-organizational collaborations, where distinct features of a shared +instance group are contributed by different parties. In these scenarios, +parties are often linked using fuzzy identifiers, leading to a common practice +termed as multi-party fuzzy VFL. Existing models generally address either +multi-party VFL or fuzzy VFL between two parties. Extending these models to +practical multi-party fuzzy VFL typically results in significant performance +degradation and increased costs for maintaining privacy. To overcome these +limitations, we introduce the Federated Transformer (FeT), a novel framework +that supports multi-party VFL with fuzzy identifiers. FeT innovatively encodes +these identifiers into data representations and employs a transformer +architecture distributed across different parties, incorporating three new +techniques to enhance performance. Furthermore, we have developed a multi-party +privacy framework for VFL that integrates differential privacy with secure +multi-party computation, effectively protecting local representations while +minimizing associated utility costs. Our experiments demonstrate that the FeT +surpasses the baseline models by up to 46\% in terms of accuracy when scaled to +50 parties. Additionally, in two-party fuzzy VFL settings, FeT also shows +improved performance and privacy over cutting-edge VFL models. + +
+
+
+
+
+ + ☆ Stick-breaking Attention + + +
+ The self-attention mechanism traditionally relies on the softmax operator, +necessitating positional embeddings like RoPE, or position biases, to account +for token order. However, methods that use them still face length-generalisation +challenges. We propose an alternative attention mechanism based on the +stick-breaking process: for each token before the current one, we determine a break +point $\beta_{i,j}$, which represents the proportion of the remaining stick to +allocate to that token. We repeat the process until the stick is fully +allocated, resulting in a sequence of attention weights. This process naturally +incorporates recency bias, which has linguistic motivations for grammar parsing +(Shen et al., 2017). We study the implications of replacing the conventional +softmax-based attention mechanism with stick-breaking attention. We then +discuss the implementation of numerically stable stick-breaking attention and adapt +Flash Attention to accommodate this mechanism. When used as a drop-in +replacement for current softmax+RoPE attention systems, we find that +stick-breaking attention performs competitively with current methods on length +generalisation and downstream tasks. In particular, a model trained with a +$2^{11}$ context window performs well at $2^{14}$, with perplexity improvements. + +
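+ A minimal NumPy sketch of the allocation process described above (ignoring numerical
+stability and the Flash Attention adaptation; shapes and the causal loop are
+illustrative):
+
+import numpy as np
+
+# Moving backwards from the most recent token, token j takes a fraction
+# beta[i, j] of the stick that is still unallocated for query i, which builds
+# in the recency bias mentioned above. No softmax or RoPE is involved.
+def stick_breaking_weights(logits):
+    n = logits.shape[0]                      # logits[i, j]: score of token j for query i
+    beta = 1.0 / (1.0 + np.exp(-logits))     # break proportions in (0, 1)
+    attn = np.zeros_like(beta)
+    for i in range(n):
+        remaining = 1.0
+        for j in range(i - 1, -1, -1):       # nearest previous token first
+            attn[i, j] = beta[i, j] * remaining
+            remaining *= 1.0 - beta[i, j]
+    return attn                               # each row sums to at most 1
+
+print(np.round(stick_breaking_weights(np.random.randn(5, 5)), 3))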
+
+
+
+
+ + ☆ metasnf: Meta Clustering with Similarity Network Fusion in R + + +
+ metasnf is an R package that enables users to apply meta clustering, a method +for efficiently searching a broad space of cluster solutions by clustering the +solutions themselves, to clustering workflows based on similarity network +fusion (SNF). SNF is a multi-modal data integration algorithm commonly used for +biomedical subtype discovery. The package also contains functions to assist +with cluster visualization, characterization, and validation. This package can +help researchers identify SNF-derived cluster solutions that are guided by +context-specific utility over context-agnostic measures of quality. + +
+
+ comment: 72 pages, 22 figures, submitted to Journal of Statistical Software +
+
+
+
+
+ + ☆ Optical Generative Models + + +
+ Generative models cover various application areas, including image, video and +music synthesis, natural language processing, and molecular design, among many +others. As digital generative models become larger, scalable inference in a +fast and energy-efficient manner becomes a challenge. Here, we present optical +generative models inspired by diffusion models, where a shallow and fast +digital encoder first maps random noise into phase patterns that serve as +optical generative seeds for a desired data distribution; a jointly-trained +free-space-based reconfigurable decoder all-optically processes these +generative seeds to create novel images (never seen before) following the +target data distribution. Except for the illumination power and the random seed +generation through a shallow encoder, these optical generative models do not +consume computing power during the synthesis of novel images. We report the +optical generation of monochrome and multi-color novel images of handwritten +digits, fashion products, butterflies, and human faces, following the data +distributions of MNIST, Fashion MNIST, Butterflies-100, and Celeb-A datasets, +respectively, achieving an overall performance comparable to digital neural +network-based generative models. To experimentally demonstrate optical +generative models, we used visible light to generate, in a snapshot, novel +images of handwritten digits and fashion products. These optical generative +models might pave the way for energy-efficient, scalable and rapid inference +tasks, further exploiting the potentials of optics and photonics for artificial +intelligence-generated content. + +
+
+ comment: 24 Pages, 9 Figures +
+
+
+
+
+ + ☆ POMDP-Driven Cognitive Massive MIMO Radar: Joint Target + Detection-Tracking In Unknown Disturbances + + +
+ The joint detection and tracking of a moving target embedded in an unknown +disturbance represents a key feature that motivates the development of the +cognitive radar paradigm. Building upon recent advancements in robust target +detection with multiple-input multiple-output (MIMO) radars, this work explores +the application of a Partially Observable Markov Decision Process (POMDP) +framework to enhance the tracking and detection tasks in a statistically +unknown environment. In the POMDP setup, the radar system is considered as an +intelligent agent that continuously senses the surrounding environment, +optimizing its actions to maximize the probability of detection $(P_D)$ and +improve the target position and velocity estimation, all while keeping a +constant probability of false alarm $(P_{FA})$. The proposed approach employs +an online algorithm that does not require any a priori knowledge of the noise +statistics, and it relies on a much more general observation model than the +traditional range-azimuth-elevation model employed by conventional tracking +algorithms. Simulation results clearly show a substantial performance improvement +of the POMDP-based algorithm compared to the State-Action-Reward-State-Action +(SARSA)-based one that has recently been investigated in the context of massive +MIMO (MMIMO) radar systems. + +
+
+ comment: The paper has been submitted to ieee Transactions on radar systems +
+
+
+
+
+ + ☆ A Time-Aware Approach to Early Detection of Anorexia: UNSL at eRisk 2024 + + +
+ The eRisk laboratory aims to address issues related to early risk detection +on the Web. In this year's edition, three tasks were proposed, of which Task 2 +concerned the early detection of signs of anorexia. Early risk detection is a problem +where precision and speed are two crucial objectives. Our research group solved +Task 2 by defining a CPI+DMC approach, addressing both objectives +independently, and a time-aware approach, where precision and speed are +considered as a single combined objective. We implemented the latter approach by +explicitly integrating time during the learning process, using the +ERDE$_{\theta}$ metric as the training objective. This also allowed us to +incorporate temporal metrics to validate and select the optimal models. We +achieved outstanding results for the ERDE50 metric and ranking-based metrics, +demonstrating consistency in solving ERD problems. + +
+
+ comment: In Conference and Labs of the Evaluation Forum (CLEF 2024), Grenoble, + France +
+
+
+
+
+ + ☆ Closed-form merging of parameter-efficient modules for Federated + Continual Learning + + +
+ Model merging has emerged as a crucial technique in Deep Learning, enabling +the integration of multiple models into a unified system while preserving +performance and scalability. In this respect, the compositional properties of +low-rank adaptation techniques (e.g., LoRA) have proven beneficial, as simply +averaging LoRA modules yields a single model that mostly integrates the +capabilities of all individual modules. Building on LoRA, we take a step +further by imposing that the merged model matches the responses of all learned +modules. Solving this objective in closed form yields an indeterminate system +with A and B as unknown variables, indicating the existence of infinitely many +closed-form solutions. To address this challenge, we introduce LoRM, an +alternating optimization strategy that trains one LoRA matrix at a time. This +allows solving for each unknown variable individually, thus finding a unique +solution. We apply our proposed methodology to Federated Class-Incremental +Learning (FCIL), ensuring alignment of model responses both between clients and +across tasks. Our method demonstrates state-of-the-art performance across a +range of FCIL scenarios. + +
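+ A toy NumPy sketch of the alternating idea follows: find one LoRA pair (B, A) whose
+response B A X matches the average response of several client modules, solving for one
+matrix at a time in closed form while the other is held fixed. The shapes, probe
+activations, and averaging target are illustrative assumptions, not the LoRM objective
+as published.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+d_out, d_in, r, K = 16, 32, 4, 3
+X = rng.normal(size=(d_in, 100))                 # probe activations (assumed)
+modules = [(rng.normal(size=(d_out, r)), rng.normal(size=(r, d_in)))
+           for _ in range(K)]
+target = np.mean([B_k @ A_k @ X for B_k, A_k in modules], axis=0)
+
+A = rng.normal(size=(r, d_in))
+B = rng.normal(size=(d_out, r))
+for _ in range(20):
+    B = target @ np.linalg.pinv(A @ X)                   # fix A, least-squares for B
+    A = np.linalg.pinv(B) @ target @ np.linalg.pinv(X)   # fix B, least-squares for A
+print("response mismatch:", np.linalg.norm(B @ A @ X - target))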
+
+
+
+
+ + ☆ Medical Imaging Complexity and its Effects on GAN Performance ACCV + + +
+ The proliferation of machine learning models in diverse clinical applications +has led to a growing need for high-fidelity, medical image training data. Such +data is often scarce due to cost constraints and privacy concerns. Alleviating +this burden, medical image synthesis via generative adversarial networks (GANs) +emerged as a powerful method for synthetically generating photo-realistic +images based on existing sets of real medical images. However, the exact image +set size required to efficiently train such a GAN is unclear. In this work, we +experimentally establish benchmarks that measure the relationship between a +sample dataset size and the fidelity of the generated images, given the +dataset's distribution of image complexities. We analyze statistical metrics +based on delentropy, an image complexity measure rooted in Shannon's entropy in +information theory. For our pipeline, we conduct experiments with two +state-of-the-art GANs, StyleGAN 3 and SPADE-GAN, trained on multiple medical +imaging datasets with variable sample sizes. Across both GANs, general +performance improved with increasing training set size but suffered with +increasing complexity. + +
+
+ comment: Accepted to ACCV, Workshop on Generative AI for Synthetic Medical + Data +
+
+
+
+
+ + ☆ MCUBERT: Memory-Efficient BERT Inference on Commodity Microcontrollers + + +
+ In this paper, we propose MCUBERT to enable language models like BERT on tiny +microcontroller units (MCUs) through network and scheduling co-optimization. We +observe the embedding table contributes to the major storage bottleneck for +tiny BERT models. Hence, at the network level, we propose an MCU-aware +two-stage neural architecture search algorithm based on clustered low-rank +approximation for embedding compression. To reduce the inference memory +requirements, we further propose a novel fine-grained MCU-friendly scheduling +strategy. Through careful computation tiling and re-ordering as well as kernel +design, we drastically increase the input sequence lengths supported on MCUs +without any latency or accuracy penalty. MCUBERT reduces the parameter size of +BERT-tiny and BERT-mini by 5.7$\times$ and 3.0$\times$ and the execution memory +by 3.5$\times$ and 4.3$\times$, respectively. MCUBERT also achieves 1.5$\times$ +latency reduction. For the first time, MCUBERT enables lightweight BERT models +on commodity MCUs and processing more than 512 tokens with less than 256KB of +memory. + +
+
+ comment: ICCAD 2024 +
+
+
+
+
+ + ☆ SimRAG: Self-Improving Retrieval-Augmented Generation for Adapting Large + Language Models to Specialized Domains + + +
+ Retrieval-augmented generation (RAG) enhances the question-answering (QA) +abilities of large language models (LLMs) by integrating external knowledge. +However, adapting general-purpose RAG systems to specialized fields such as +science and medicine poses unique challenges due to distribution shifts and +limited access to domain-specific data. To tackle this, we propose SimRAG, a +self-training approach that equips the LLM with joint capabilities of question +answering and question generation for domain adaptation. Our method first +fine-tunes the LLM on instruction-following, question-answering, and +search-related data. Then, it prompts the same LLM to generate diverse +domain-relevant questions from unlabeled corpora, with an additional filtering +strategy to retain high-quality synthetic examples. By leveraging these +synthetic examples, the LLM can improve its performance on domain-specific +RAG tasks. Experiments on 11 datasets, spanning two backbone sizes and three +domains, demonstrate that SimRAG outperforms baselines by 1.2\%--8.6\%. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ Generalized Resubstitution for Regression Error Estimation + + +
+ We propose generalized resubstitution error estimators for regression, a +broad family of estimators, each corresponding to a choice of empirical +probability measures and loss function. The usual sum of squares criterion is a +special case corresponding to the standard empirical probability measure and +the quadratic loss. Other choices of empirical probability measure lead to more +general estimators with superior bias and variance properties. We prove that +these error estimators are consistent under broad assumptions. In addition, +procedures for choosing the empirical measure based on the method of moments +and maximum pseudo-likelihood are proposed and investigated. Detailed +experimental results using polynomial regression demonstrate empirically the +superior finite-sample bias and variance properties of the proposed estimators. +The R code for the experiments is provided. + +
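+ The following small NumPy sketch illustrates the general idea on polynomial
+regression: the estimator is an expected loss under a chosen empirical probability
+measure, and uniform weights with quadratic loss recover the usual resubstitution
+(training) error. The alternative kernel weighting is a made-up example, not one of the
+paper's proposed measures.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+x = rng.uniform(-1, 1, size=200)
+y = 2 * x ** 2 + rng.normal(scale=0.1, size=200)
+coef = np.polyfit(x, y, deg=2)                 # fitted regression model
+sq_loss = (y - np.polyval(coef, x)) ** 2       # quadratic loss on training data
+
+uniform_w = np.full_like(x, 1 / len(x))        # standard empirical measure
+kernel_w = np.exp(-x ** 2)                     # illustrative alternative measure
+kernel_w /= kernel_w.sum()
+
+print("standard resubstitution:", np.sum(uniform_w * sq_loss))
+print("generalized (kernel-weighted):", np.sum(kernel_w * sq_loss))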
+
+
+
+
+ + ☆ Theoretically Grounded Pruning of Large Ground Sets for Constrained, + Discrete Optimization + + +
+ Modern instances of combinatorial optimization problems often exhibit +billion-scale ground sets, which have many uninformative or redundant elements. +In this work, we develop light-weight pruning algorithms to quickly discard +elements that are unlikely to be part of an optimal solution. Under mild +assumptions on the instance, we prove theoretical guarantees on the fraction of +the optimal value retained and the size of the resulting pruned ground set. +Through extensive experiments on real-world datasets for various applications, +we demonstrate that our algorithm, QuickPrune, efficiently prunes over 90% of +the ground set and outperforms state-of-the-art classical and machine learning +heuristics for pruning. + +
+
+
+
+
+ + ☆ Optimizing Travel Itineraries with AI Algorithms in a Microservices + Architecture: Balancing Cost, Time, Preferences, and Sustainability + + +
+ This research examines how implementing AI algorithms within a microservices +architecture improves travel itineraries with respect to cost, time, user +preferences, and environmental sustainability. The system uses machine learning +models for cost forecasting and personalization, a genetic algorithm for +itinerary optimization, and heuristics for sustainability checks. The primary +evaluation criteria are latency, the ability to satisfy user preferences, cost, +and environmental impact. Experimental results show an average response time of +4.5 seconds with 1000 concurrent users and 92% accuracy in matching user +preferences. Cost efficiency is also demonstrated, with 95% of generated trips +staying within the budget declared by the user. The system further mitigates the +negative externalities of travel: 60% of the offered travel plans incorporate +green options, yielding carbon emissions on average 15% lower than traditional +travel plans. The genetic algorithm, with time complexity O(g·p·f), converges to +the optimal solution within 100 generations, with each iteration improving +solution quality by about 5%, making it effective for optimization problems +where time is measured in seconds. Finally, the system is designed to be +fault-tolerant, with 99.9% availability that sustains service even when demand +exceeds expectations. The microservices-based architecture makes the travel +optimization platform dynamic and efficient, providing better scaling, +asynchronous communication, and real-time updates. By combining AI with +cost-control and eco-friendly strategies, the system addresses the diverse user +needs of today's travel business. + +
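+ For concreteness, a toy Python sketch of a genetic-algorithm component of this kind is
+given below: it evolves a selection of destinations under a fitness that trades off cost,
+time, and preference scores, so its runtime is O(g·p·f) for g generations, p individuals,
+and fitness cost f. The destination data, weights, and operators are made-up
+placeholders, not the system's actual configuration.
+
+import random
+
+random.seed(0)
+DEST = {"A": (120, 2, 0.9), "B": (80, 3, 0.6), "C": (200, 1, 0.8),
+        "D": (60, 4, 0.4), "E": (150, 2, 0.7),
+        "F": (90, 5, 0.5), "G": (170, 3, 0.95)}   # (cost, hours, preference)
+K = 3                                             # itinerary length (toy choice)
+
+def fitness(route):
+    cost = sum(DEST[d][0] for d in route)
+    hours = sum(DEST[d][1] for d in route)
+    pref = sum(DEST[d][2] for d in route)
+    return pref - 0.005 * cost - 0.1 * hours      # higher is better (toy weights)
+
+def crossover(a, b):
+    cut = random.randrange(1, K)
+    child = a[:cut] + [d for d in b if d not in a[:cut]]
+    return child[:K]                              # keep a valid K-destination plan
+
+def mutate(route):
+    out = random.randrange(K)
+    unused = [d for d in DEST if d not in route]
+    route[out] = random.choice(unused)            # swap in an unused destination
+    return route
+
+pop = [random.sample(list(DEST), K) for _ in range(20)]           # p individuals
+for _ in range(100):                                              # g generations
+    pop.sort(key=fitness, reverse=True)
+    parents = pop[:10]
+    pop = parents + [mutate(crossover(random.choice(parents),
+                                      random.choice(parents)))
+                     for _ in range(10)]
+best = max(pop, key=fitness)
+print(best, round(fitness(best), 3))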
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ☆ Spiking Graph Neural Network on Riemannian Manifolds NeurIPS 2024 + + +
+ Graph neural networks (GNNs) have become the dominant solution for learning +on graphs, the typical non-Euclidean structures. Conventional GNNs, constructed +with the Artificial Neural Network (ANN), have achieved impressive performance +at the cost of high computation and energy consumption. In parallel, spiking +GNNs with brain-like spiking neurons are drawing increasing research attention +owing to their energy efficiency. So far, existing spiking GNNs consider graphs +in Euclidean space, ignoring the structural geometry, and suffer from high +latency due to Back-Propagation-Through-Time (BPTT) with the surrogate +gradient. In light of these issues, we explore spiking GNNs on Riemannian +manifolds, and present a Manifold-valued Spiking GNN +(MSG). In particular, we design a new spiking neuron on geodesically complete +manifolds with the diffeomorphism, so that BPTT over the spikes is +replaced by the proposed differentiation via manifold. Theoretically, we show +that MSG approximates a solver of the manifold ordinary differential equation. +Extensive experiments on common graphs show that the proposed MSG achieves superior +performance to previous spiking GNNs and better energy efficiency than conventional +GNNs. + +
+
+ comment: Accepted by NeurIPS 2024, 30 pages +
+
+
+
+
+ + ☆ Semi-Implicit Functional Gradient Flow + + +
+ Particle-based variational inference methods (ParVIs) use non-parametric +variational families represented by particles to approximate the target +distribution according to the kernelized Wasserstein gradient flow for the +Kullback-Leibler (KL) divergence. Recent works introduce functional gradient +flows to substitute the kernel for better flexibility. However, the +deterministic updating mechanism may suffer from limited exploration and +require expensive repetitive runs for new samples. In this paper, we propose +Semi-Implicit Functional Gradient flow (SIFG), a functional gradient ParVI +method that uses perturbed particles as the approximation family. The +corresponding functional gradient flow, which can be estimated via denoising +score matching, exhibits strong theoretical convergence guarantees. We also +present an adaptive version of our method to automatically choose the suitable +noise magnitude. Extensive experiments demonstrate the effectiveness and +efficiency of the proposed framework on both simulated and real data problems. + +
+
+ comment: 31 pages, 12 figures +
+
+
+
+
+ + ☆ Retrieving snow depth distribution by downscaling ERA5 Reanalysis with + ICESat-2 laser altimetry + + +
+ Estimating the variability of seasonal snow cover, in particular snow depth +in remote areas, poses significant challenges due to limited spatial and +temporal data availability. This study uses snow depth measurements from the +ICESat-2 satellite laser altimeter, which are sparse in both space and time, +and incorporates them with climate reanalysis data into a +downscaling-calibration scheme to produce monthly gridded snow depth maps at +the microscale (10 m). Snow surface elevation measurements from ICESat-2 along +profiles are compared to a digital elevation model to determine snow depth at +each point. To efficiently turn sparse measurements into snow depth maps, a +regression model is fitted to establish a relationship between the retrieved +snow depth and the corresponding ERA5 Land snow depth. This relationship, +referred to as subgrid variability, is then applied to downscale the monthly +ERA5 Land snow depth data. The method can provide time series of monthly snow +depth maps for the entire ERA5 time range (since 1950). The validation of +downscaled snow depth data was performed at an intermediate scale (100 m x 500 +m) using datasets from airborne laser scanning (ALS) in the Hardangervidda +region of southern Norway. Results show that snow depth prediction achieved R2 +values ranging from 0.74 to 0.88 (post-calibration). The method relies on +globally available data and is applicable to other snow regions above the +treeline. Though requiring area-specific calibration, our approach has the +potential to provide snow depth maps in areas where no such data exist and can +be used to extrapolate existing snow surveys in time and over larger areas. +With this, it can offer valuable input data for hydrological, ecological or +permafrost modeling tasks. + +
+
+
+
+
+ + ☆ Multi-Continental Healthcare Modelling Using Blockchain-Enabled + Federated Learning + + +
+ One of the biggest challenges in building artificial intelligence (AI) models +for healthcare is data sharing. Since healthcare data is private, +sensitive, and heterogeneous, collecting sufficient data for modelling is +exhausting, costly, and sometimes impossible. In this paper, we propose a +framework for global healthcare modelling using datasets from multiple continents +(Europe, North America, and Asia) without sharing the local datasets, and +choose glucose management as a case study to verify its effectiveness. +Technically, blockchain-enabled federated learning is implemented and adapted +to meet the privacy and safety requirements of healthcare data, while rewarding +honest participation and penalizing malicious activities through +its on-chain incentive mechanism. Experimental results show that the proposed +framework is effective, efficient, and privacy-preserving. Its prediction +accuracy is much better than that of models trained on limited personal data and +is similar to, and even slightly better than, the results from a centralized +dataset. This work paves the way for international collaborations on healthcare +projects, where additional data is crucial for reducing bias and providing +benefits to humanity. + +
+
+ comment: Accepted by IEEE Global Blockchain Conference +
+
+
+
+
+ + ☆ Addressing Asynchronicity in Clinical Multimodal Fusion via + Individualized Chest X-ray Generation NeurIPS-24 + + +
+ Integrating multi-modal clinical data, such as electronic health records +(EHR) and chest X-ray images (CXR), is particularly beneficial for clinical +prediction tasks. However, in a temporal setting, multi-modal data are often +inherently asynchronous. EHR can be continuously collected but CXR is generally +taken with a much longer interval due to its high cost and radiation dose. When +clinical prediction is needed, the last available CXR image might have been +outdated, leading to suboptimal predictions. To address this challenge, we +propose DDL-CXR, a method that dynamically generates an up-to-date latent +representation of the individualized CXR images. Our approach leverages latent +diffusion models for patient-specific generation strategically conditioned on a +previous CXR image and EHR time series, providing information regarding +anatomical structures and disease progressions, respectively. In this way, the +interaction across modalities could be better captured by the latent CXR +generation process, ultimately improving the prediction performance. +Experiments using MIMIC datasets show that the proposed model could effectively +address asynchronicity in multimodal fusion and consistently outperform +existing methods. + +
+
+ comment: Accepted by NeurIPS-24 +
+
+
+
+
+ + ☆ regAL: Python Package for Active Learning of Regression Problems + + +
+ A growing number of research areas rely on machine learning methods to +accelerate discovery while saving resources. Machine learning models, however, +usually require large datasets of experimental or computational results, which +in certain fields, such as (bio)chemistry, materials science, or medicine, are +rarely available and often prohibitively expensive to obtain. To bypass that +obstacle, active learning methods are employed to develop machine learning +models with a desired performance while requiring the least possible number of +computational or experimental results from the domain of application. For this +purpose, the model's knowledge about certain regions of the application domain +is estimated to guide the choice of the model's training set. Although active +learning is widely studied for classification problems (discrete outcomes), +comparatively few works handle this method for regression problems (continuous +outcomes). In this work, we present our Python package regAL, which allows +users to evaluate different active learning strategies for regression problems. +With a minimal input of just the dataset in question, but many additional +customization and insight options, this package is intended for anyone who aims +to perform and understand active learning in their problem-specific scope. + +
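+ As background for readers unfamiliar with the setting, a generic pool-based
+active-learning loop for regression looks like the sketch below (written with
+scikit-learn, not with the regAL API; the ensemble-disagreement query strategy and all
+data are illustrative):
+
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+
+rng = np.random.default_rng(0)
+X_pool = rng.uniform(-3, 3, size=(500, 1))
+y_pool = np.sin(X_pool[:, 0]) + rng.normal(scale=0.1, size=500)  # acts as the "oracle"
+
+labeled = list(rng.choice(len(X_pool), size=5, replace=False))
+for _ in range(20):
+    model = RandomForestRegressor(n_estimators=50, random_state=0)
+    model.fit(X_pool[labeled], y_pool[labeled])
+    per_tree = np.stack([t.predict(X_pool) for t in model.estimators_])
+    uncertainty = per_tree.std(axis=0)             # ensemble disagreement per point
+    uncertainty[labeled] = -np.inf                 # never re-query known points
+    labeled.append(int(np.argmax(uncertainty)))    # query the most uncertain point
+print("labeled points used:", len(labeled))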
+
+
+
+
+ + ☆ Deep learning for model correction of dynamical systems with data + scarcity + + +
+ We present a deep learning framework for correcting existing dynamical system +models utilizing only a scarce high-fidelity data set. In many practical +situations, one has a low-fidelity model that can capture the dynamics +reasonably well but lacks high resolution, due to the inherent limitations of +the model and the complexity of the underlying physics. When high resolution +data become available, it is natural to seek model correction to improve the +resolution of the model predictions. We focus on the case when the amount of +high-fidelity data is so small that most of the existing data driven modeling +methods cannot be applied. In this paper, we address these challenges with a +model-correction method which only requires a scarce high-fidelity data set. +Our method first seeks a deep neural network (DNN) model to approximate the +existing low-fidelity model. By using the scarce high-fidelity data, the method +then corrects the DNN model via transfer learning (TL). After TL, an improved +DNN model with high prediction accuracy for the underlying dynamics is obtained. +One distinct feature of the proposed method is that it does not assume a +specific form of the model correction terms. Instead, it offers an inherent +correction to the low-fidelity model via TL. A set of numerical examples is +presented to demonstrate the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ Reinforcement Learning under Latent Dynamics: Toward Statistical and + Algorithmic Modularity + + +
+ Real-world applications of reinforcement learning often involve environments +where agents operate on complex, high-dimensional observations, but the +underlying (''latent'') dynamics are comparatively simple. However, outside of +restrictive settings such as small latent spaces, the fundamental statistical +requirements and algorithmic principles for reinforcement learning under latent +dynamics are poorly understood. + This paper addresses the question of reinforcement learning under +$\textit{general}$ latent dynamics from a statistical and algorithmic +perspective. On the statistical side, our main negative result shows that most +well-studied settings for reinforcement learning with function approximation +become intractable when composed with rich observations; we complement this +with a positive result, identifying latent pushforward coverability as a +general condition that enables statistical tractability. Algorithmically, we +develop provably efficient observable-to-latent reductions -- that is, +reductions that transform an arbitrary algorithm for the latent MDP into an +algorithm that can operate on rich observations -- in two settings: one where +the agent has access to hindsight observations of the latent dynamics [LADZ23], +and one where the agent can estimate self-predictive latent models [SAGHCB20]. +Together, our results serve as a first step toward a unified statistical and +algorithmic theory for reinforcement learning under latent dynamics. + +
+
+
+
+
+ + ☆ Scalable Offline Reinforcement Learning for Mean Field Games AAMAS + + +
+ Reinforcement learning algorithms for mean-field games offer a scalable +framework for optimizing policies in large populations of interacting agents. +Existing methods often depend on online interactions or access to system +dynamics, limiting their practicality in real-world scenarios where such +interactions are infeasible or difficult to model. In this paper, we present +Offline Munchausen Mirror Descent (Off-MMD), a novel mean-field RL algorithm +that approximates equilibrium policies in mean-field games using purely offline +data. By leveraging iterative mirror descent and importance sampling +techniques, Off-MMD estimates the mean-field distribution from static datasets +without relying on simulation or environment dynamics. Additionally, we +incorporate techniques from offline reinforcement learning to address common +issues like Q-value overestimation, ensuring robust policy learning even with +limited data coverage. Our algorithm scales to complex environments and +demonstrates strong performance on benchmark tasks like crowd exploration or +navigation, highlighting its applicability to real-world multi-agent systems +where online experimentation is infeasible. We empirically demonstrate the +robustness of Off-MMD to low-quality datasets and conduct experiments to +investigate its sensitivity to hyperparameter choices. + +
+
+ comment: Submitted to AAMAS +
+
+
+
+
+ + ☆ Identifiable Representation and Model Learning for Latent Dynamic + Systems + + +
+ Learning identifiable representations and models from low-level observations +is useful for an intelligent spacecraft to reliably accomplish downstream tasks. +For temporal observations, to ensure that the data generating process is +provably inverted, most existing works either assume the noise variables in the +dynamic mechanisms are (conditionally) independent, or require interventions +that can directly affect each latent variable. However, in practice, the +relationship between the exogenous inputs/interventions and the latent +variables may follow some complex deterministic mechanisms. In this work, we +study the problem of identifiable representation and model learning for latent +dynamic systems. The key idea is that we use an inductive bias inspired by +controllable canonical forms, which is invariant, sparse, and input dependent +by definition. We prove that, for linear or affine nonlinear latent dynamic +systems, it is possible to identify the representations up to scaling and +determine the models up to some simple transformations. The results have the +potential to provide some theoretical guarantees for developing more +trustworthy decision-making and control methods for intelligent spacecraft. + +
+
+
+
+
+ + ☆ AdaRankGrad: Adaptive Gradient-Rank and Moments for Memory-Efficient + LLMs Training and Fine-Tuning + + +
+ Training and fine-tuning large language models (LLMs) come with challenges +related to memory and computational requirements due to the increasing size of +the model weights and the optimizer states. Various techniques have been +developed to tackle these challenges, such as low-rank adaptation (LoRA), which +involves introducing a parallel trainable low-rank matrix to the fixed +pre-trained weights at each layer. However, these methods often fall short +compared to the full-rank weight training approach, as they restrict the +parameter search to a low-rank subspace. This limitation can disrupt training +dynamics and require a full-rank warm start to mitigate the impact. In this +paper, we introduce a new method inspired by a phenomenon we formally prove: as +training progresses, the rank of the estimated layer gradients gradually +decreases, and asymptotically approaches rank one. Leveraging this, our +approach involves adaptively reducing the rank of the gradients during Adam +optimization steps, using an efficient online-updating low-rank projections +rule. We further present a randomized SVD scheme for efficiently finding the +projection matrix. Our technique enables full-parameter fine-tuning with +adaptive low-rank gradient updates, significantly reducing overall memory +requirements during training compared to state-of-the-art methods while +improving model performance in both pretraining and fine-tuning. Finally, we +provide a convergence analysis of our method and demonstrate its merits for +training and fine-tuning language and biological foundation models. + +
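+ A heavily simplified sketch of the central mechanism follows: project each gradient
+onto a low-rank subspace obtained with a randomized SVD and step with the projected
+gradient. The rank schedule, the plain SGD update (shown in place of Adam), and all
+shapes are placeholders, not the paper's algorithm.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def randomized_svd(G, rank, oversample=5):
+    # Sketch the column space of G, then compute an SVD of the small projection.
+    sketch = G @ rng.normal(size=(G.shape[1], rank + oversample))
+    Q, _ = np.linalg.qr(sketch)
+    U_small, s, Vt = np.linalg.svd(Q.T @ G, full_matrices=False)
+    return (Q @ U_small)[:, :rank], s[:rank], Vt[:rank]
+
+W = rng.normal(size=(256, 128))              # one layer's weight matrix
+for step in range(10):
+    G = rng.normal(size=W.shape)             # stand-in for a backprop gradient
+    rank = max(1, 8 - step)                  # toy schedule: rank shrinks over time
+    U, s, Vt = randomized_svd(G, rank)
+    G_lowrank = U @ np.diag(s) @ Vt          # low-rank projection of the gradient
+    W -= 0.01 * G_lowrank                    # optimizer step with projected gradient
+print(np.linalg.norm(W))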
+
+
+
+
+ + ☆ Relaxed Equivariance via Multitask Learning + + +
+ Incorporating equivariance as an inductive bias into deep learning +architectures to take advantage of the data symmetry has been successful in +multiple applications, such as chemistry and dynamical systems. In particular, +roto-translations are crucial for effectively modeling geometric graphs and +molecules, where understanding the 3D structures enhances generalization. +However, equivariant models often pose challenges due to their high +computational complexity. In this paper, we introduce REMUL, a training +procedure for approximating equivariance with multitask learning. We show that +unconstrained models (which do not build equivariance into the architecture) +can learn approximate symmetries by minimizing an additional simple +equivariance loss. By formulating equivariance as a new learning objective, we +can control the level of approximate equivariance in the model. Our method +achieves competitive performance compared to equivariant baselines while being +$10 \times$ faster at inference and $2.5 \times$ at training. + +
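+ A minimal sketch of learning approximate equivariance through an auxiliary objective
+is shown below, using 2-D rotations, an unconstrained linear model, and a toy task; the
+loss weighting and data are assumptions, not the REMUL setup.
+
+import torch
+
+torch.manual_seed(0)
+model = torch.nn.Linear(2, 2, bias=False)            # unconstrained model
+opt = torch.optim.Adam(model.parameters(), lr=0.05)
+
+def rot(theta):
+    c, s = torch.cos(theta), torch.sin(theta)
+    return torch.stack([torch.stack([c, -s]), torch.stack([s, c])])
+
+x = torch.randn(64, 2)
+y = x @ rot(torch.tensor(0.3)).T                      # an equivariant target map
+
+for _ in range(300):
+    opt.zero_grad()
+    R = rot(torch.rand(()) * 6.2831853)               # random group element
+    task_loss = ((model(x) - y) ** 2).mean()
+    equiv_loss = ((model(x @ R.T) - model(x) @ R.T) ** 2).mean()  # equivariance residual
+    (task_loss + 1.0 * equiv_loss).backward()          # joint multitask objective
+    opt.step()
+print(model.weight.detach())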
+
+
+
+
+ + ☆ Population stratification for prediction of mortality in post-AKI + patients + + +
+ Acute kidney injury (AKI) is a serious clinical condition that affects up to +20% of hospitalised patients. AKI is associated with short term unplanned +hospital readmission and post-discharge mortality risk. Patient risk and +healthcare expenditures can be minimised by followup planning grounded on +predictive models and machine learning. Since AKI is multi-factorial, +predictive models specialised in different categories of patients can increase +accuracy of predictions. In the present article we present some results +following this approach. + +
+
+
+
+
+ + ☆ CASCRNet: An Atrous Spatial Pyramid Pooling and Shared Channel Residual + based Network for Capsule Endoscopy + + +
+ This manuscript summarizes work on the Capsule Vision Challenge 2024 by +MISAHUB. To address the multi-class disease classification task, which is +challenging due to the complexity and imbalance of the Capsule Vision challenge +dataset, this paper proposes CASCRNet (Capsule endoscopy-Aspp-SCR-Network), a +parameter-efficient and novel model that uses Shared Channel Residual (SCR) +blocks and Atrous Spatial Pyramid Pooling (ASPP) blocks. Further, the +performance of the proposed model is compared with other well-known approaches. +The experimental results show that the proposed model provides better disease +classification performance. The proposed model was successful in classifying +diseases with an F1 Score of 78.5% and a Mean AUC of 98.3%, which is promising +given its compact architecture. + +
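+ For reference, an Atrous Spatial Pyramid Pooling block of the kind named above can be
+sketched in PyTorch as follows; the channel sizes and dilation rates are common
+defaults, not the CASCRNet configuration, and the SCR blocks are not shown.
+
+import torch
+import torch.nn as nn
+
+class ASPP(nn.Module):
+    # Parallel dilated convolutions at several rates, concatenated and fused.
+    def __init__(self, in_ch, out_ch, rates=(1, 6, 12, 18)):
+        super().__init__()
+        self.branches = nn.ModuleList([
+            nn.Sequential(
+                nn.Conv2d(in_ch, out_ch, kernel_size=3 if r > 1 else 1,
+                          padding=r if r > 1 else 0, dilation=r, bias=False),
+                nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True))
+            for r in rates])
+        self.fuse = nn.Conv2d(out_ch * len(rates), out_ch, kernel_size=1)
+
+    def forward(self, x):
+        return self.fuse(torch.cat([b(x) for b in self.branches], dim=1))
+
+print(ASPP(64, 128)(torch.randn(1, 64, 32, 32)).shape)  # (1, 128, 32, 32)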
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ The Probabilistic Tsetlin Machine: A Novel Approach to Uncertainty + Quantification + + +
+ Tsetlin Machines (TMs) have emerged as a compelling alternative to +conventional deep learning methods, offering notable advantages such as smaller +memory footprint, faster inference, fault-tolerant properties, and +interpretability. Although various adaptations of TMs have expanded their +applicability across diverse domains, a fundamental gap remains in +understanding how TMs quantify uncertainty in their predictions. In response, +this paper introduces the Probabilistic Tsetlin Machine (PTM) framework, aimed +at providing a robust, reliable, and interpretable approach for uncertainty +quantification. Unlike the original TM, the PTM learns the probability of +staying on each state of each Tsetlin Automaton (TA) across all clauses. These +probabilities are updated using the feedback tables that are part of the TM +framework: Type I and Type II feedback. During inference, TAs decide their +actions by sampling states based on learned probability distributions, akin to +Bayesian neural networks when generating weight values. In our experimental +analysis, we first illustrate the spread of the probabilities across TA states +for the noisy-XOR dataset. Then we evaluate the PTM alongside benchmark models +using both simulated and real-world datasets. The experiments on the simulated +dataset reveal the PTM's effectiveness in uncertainty quantification, +particularly in delineating decision boundaries and identifying regions of high +uncertainty. Moreover, when applied to multiclass classification tasks using +the Iris dataset, the PTM demonstrates competitive performance in terms of +predictive entropy and expected calibration error, showcasing its potential as +a reliable tool for uncertainty estimation. Our findings underscore the +importance of selecting appropriate models for accurate uncertainty +quantification in predictive tasks, with the PTM offering a particularly +interpretable and effective solution. + +
+
+ comment: 12 pages, 5 figures, 6 tables, accepted and presented at ICAAI 2024, + London +
+
+
+
+
+ + ☆ Is the GPU Half-Empty or Half-Full? Practical Scheduling Techniques for + LLMs + + +
+ Serving systems for Large Language Models (LLMs) improve throughput by +processing several requests concurrently. However, multiplexing hardware +resources between concurrent requests involves non-trivial scheduling +decisions. Practical serving systems typically implement these decisions at two +levels: First, a load balancer routes requests to different servers which each +hold a replica of the LLM. Then, on each server, an engine-level scheduler +decides when to run a request, or when to queue or preempt it. Improved +scheduling policies may benefit a wide range of LLM deployments and can often +be implemented as "drop-in replacements" to a system's current policy. In this +work, we survey scheduling techniques from the literature and from practical +serving systems. We find that schedulers from the literature often achieve good +performance but introduce significant complexity. In contrast, schedulers in +practical deployments often leave easy performance gains on the table but are +easy to implement, deploy and configure. This finding motivates us to introduce +two new scheduling techniques, which are both easy to implement, and outperform +current techniques on production workload traces. + +
+
+ comment: 12 pages, 11 figures +
+
+
+
+
+ + ☆ Optimal Streaming Algorithms for Multi-Armed Bandits + + +
+ This paper studies two variants of the best arm identification (BAI) problem +under the streaming model, where we have a stream of $n$ arms with reward +distributions supported on $[0,1]$ with unknown means. The arms in the stream +arrive one by one, and the algorithm cannot access an arm unless it is +stored in a limited-size memory. + We first study the streaming $\epsilon$-top-$k$ arms identification problem, +which asks for $k$ arms whose reward means are lower than that of the $k$-th +best arm by at most $\epsilon$ with probability at least $1-\delta$. For general +$\epsilon \in (0,1)$, the existing solution for this problem assumes $k = 1$ and +achieves the optimal sample complexity $O(\frac{n}{\epsilon^2} \log \frac{1}{\delta})$ +using $O(\log^*(n))$ memory ($\log^*(n)$ is the number of times the logarithm +function must be applied to $n$ before the result is no more than 1) and a +single pass of the stream. We propose an algorithm +that works for any $k$ and achieves the optimal sample complexity +$O(\frac{n}{\epsilon^2} \log\frac{k}{\delta})$ using a single-arm memory and a +single pass of the stream. + Second, we study the streaming BAI problem, where the objective is to +identify the arm with the maximum reward mean with probability at least $1-\delta$, +using a single-arm memory and as few passes of the input stream as +possible. We present a single-arm-memory algorithm that achieves a near +instance-dependent optimal sample complexity within $O(\log \Delta_2^{-1})$ +passes, where $\Delta_2$ is the gap between the mean of the best arm and that +of the second best arm. + +
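+ To illustrate only the single-arm-memory constraint (not the paper's sample-optimal
+algorithms or their confidence schedules), a toy Python simulation can keep one champion
+arm in memory and compare each arriving arm against it with a fixed number of pulls:
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+true_means = rng.uniform(0, 1, size=1000)            # stream of Bernoulli arms
+pulls_per_arm = 200
+
+# Only one arm's estimate is ever kept in memory.
+champion = 0
+champion_est = rng.binomial(pulls_per_arm, true_means[0]) / pulls_per_arm
+for arm in range(1, len(true_means)):                # arms arrive one by one
+    est = rng.binomial(pulls_per_arm, true_means[arm]) / pulls_per_arm
+    if est > champion_est:                           # keep only the better arm
+        champion, champion_est = arm, est
+print("picked arm mean:", true_means[champion], "best mean:", true_means.max())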
+
+ comment: 24pages +
+
+
+
+
+ + ☆ Non-intrusive Speech Quality Assessment with Diffusion Models Trained on + Clean Speech + + +
+ Diffusion models have found great success in generating high quality, natural +samples of speech, but their potential for density estimation for speech has so +far remained largely unexplored. In this work, we leverage an unconditional +diffusion model trained only on clean speech for the assessment of speech +quality. We show that the quality of a speech utterance can be assessed by +estimating the likelihood of a corresponding sample in the terminating Gaussian +distribution, obtained via a deterministic noising process. The resulting +method is purely unsupervised, trained only on clean speech, and therefore does +not rely on annotations. Our diffusion-based approach leverages clean speech +priors to assess quality based on how the input relates to the learned +distribution of clean data. Our proposed log-likelihoods show promising +results, correlating well with intrusive speech quality metrics such as POLQA +and SI-SDR. + +
+
+
+
+
+ + ☆ Att2CPC: Attention-Guided Lossy Attribute Compression of Point Clouds + + +
+ With the great progress of 3D sensing and acquisition technology, the volume +of point cloud data has grown dramatically, which urges the development of +efficient point cloud compression methods. In this paper, we focus on the task +of learned lossy point cloud attribute compression (PCAC). We propose an +efficient attention-based method for lossy compression of point cloud +attributes leveraging an autoencoder architecture. Specifically, at the +encoding side, we conduct multiple downsampling to best exploit the local +attribute patterns, in which an effective External Cross Attention (ECA) is +devised to hierarchically aggregate features by integrating attributes and +geometry contexts. At the decoding side, the attributes of the point cloud are +progressively reconstructed based on the multi-scale representation and the +zero-padding upsampling tactic. To the best of our knowledge, this is the first +approach to introduce an attention mechanism to the point-based lossy PCAC task. We +verify the compression efficiency of our model on various sequences, including +human body frames, sparse objects, and large-scale point cloud scenes. +Experiments show that our method achieves an average improvement of 1.15 dB and +2.13 dB in BD-PSNR of the Y channel and YUV channels, respectively, compared +with the state-of-the-art point-based method Deep-PCAC. Codes of this paper are +available at https://github.com/I2-Multimedia-Lab/Att2CPC. + +
+
+
+
+
+ + ☆ Learning Lossless Compression for High Bit-Depth Volumetric Medical + Image + + +
+ Recent advances in learning-based methods have markedly enhanced the +capabilities of image compression. However, these methods struggle with high +bit-depth volumetric medical images, facing issues such as degraded +performance, increased memory demand, and reduced processing speed. To address +these challenges, this paper presents the Bit-Division based Lossless +Volumetric Image Compression (BD-LVIC) framework, which is tailored for high +bit-depth medical volume compression. The BD-LVIC framework skillfully divides +the high bit-depth volume into two lower bit-depth segments: the Most +Significant Bit-Volume (MSBV) and the Least Significant Bit-Volume (LSBV). The +MSBV concentrates on the most significant bits of the volumetric medical image, +capturing vital structural details in a compact manner. This reduction in +complexity greatly improves compression efficiency using traditional codecs. +Conversely, the LSBV deals with the least significant bits, which encapsulate +intricate texture details. To compress this detailed information effectively, +we introduce an effective learning-based compression model equipped with a +Transformer-Based Feature Alignment Module, which exploits both intra-slice and +inter-slice redundancies to accurately align features. Subsequently, a Parallel +Autoregressive Coding Module merges these features to precisely estimate the +probability distribution of the least significant bit-planes. Our extensive +testing demonstrates that the BD-LVIC framework not only sets new performance +benchmarks across various datasets but also maintains a competitive coding +speed, highlighting its significant potential and practical utility in the +realm of volumetric medical image compression. + +
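+ A minimal NumPy sketch of the bit-division step described above: split a 16-bit volume into an 8-bit most-significant and an 8-bit least-significant volume, and reconstruct losslessly. The 8/8 split point is an illustrative assumption, not necessarily the exact split used in BD-LVIC.
+
+ ```python
+ import numpy as np
+
+ # Toy 16-bit volumetric image (depth x height x width).
+ vol = np.random.randint(0, 2**16, size=(4, 64, 64), dtype=np.uint16)
+
+ # Most Significant Bit-Volume (MSBV): top 8 bits, compact structural content.
+ msbv = (vol >> 8).astype(np.uint8)
+ # Least Significant Bit-Volume (LSBV): bottom 8 bits, texture-like detail.
+ lsbv = (vol & 0xFF).astype(np.uint8)
+
+ # Lossless reconstruction from the two lower bit-depth segments.
+ recon = (msbv.astype(np.uint16) << 8) | lsbv.astype(np.uint16)
+ assert np.array_equal(recon, vol)
+ ```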
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ A Comprehensive Analysis on the Learning Curve in Kernel Ridge + Regression + + +
+ This paper conducts a comprehensive study of the learning curves of kernel +ridge regression (KRR) under minimal assumptions. Our contributions are +three-fold: 1) we analyze the role of key properties of the kernel, such as its +spectral eigen-decay, the characteristics of the eigenfunctions, and the +smoothness of the kernel; 2) we demonstrate the validity of the Gaussian +Equivalent Property (GEP), which states that the generalization performance of +KRR remains the same when the whitened features are replaced by standard +Gaussian vectors, thereby shedding light on the success of previous analyses +under the Gaussian Design Assumption; 3) we derive novel bounds that improve +over existing bounds across a broad range of settings, such as (in)dependent +feature vectors and various combinations of eigen-decay rates in the +over/underparameterized regimes. + +
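+ For reference, a standard form of the KRR estimator whose learning curve such analyses typically study; the notation here is illustrative and may differ from the paper's.
+
+ ```latex
+ % Kernel ridge regression with kernel k, data (x_1,y_1),...,(x_n,y_n),
+ % kernel matrix K_{ij} = k(x_i, x_j), and ridge parameter \lambda > 0:
+ \hat{f}_\lambda(x) = k(x, X)\,\bigl(K + n\lambda I\bigr)^{-1} y,
+ \qquad k(x, X) = \bigl(k(x, x_1), \dots, k(x, x_n)\bigr).
+ % The learning curve tracks the excess risk
+ % \mathbb{E}\bigl[\hat{f}_\lambda(x) - f^*(x)\bigr]^2 as a function of n.
+ ```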
+
+
+
+
+ + ☆ Enhancing Federated Learning Convergence with Dynamic Data Queue and + Data Entropy-driven Participant Selection + + +
+ Federated Learning (FL) is a decentralized approach for collaborative model +training on edge devices. This distributed method of model training offers +advantages in privacy, security, regulatory compliance, and cost-efficiency. +Our emphasis in this research lies in addressing statistical complexity in FL, +especially when the data stored locally across devices is not identically and +independently distributed (non-IID). We observe an accuracy reduction of +approximately 10\% to 30\%, particularly in skewed scenarios where each +edge device trains with only one class of data. This reduction is attributed to +weight divergence, quantified using the Euclidean distance between device-level +class distributions and the population distribution, resulting in a bias term +(\(\delta_k\)). As a solution, we present a method to improve convergence in FL +by creating a global subset of data on the server and dynamically distributing +it across devices using Dynamic Data queue-driven Federated Learning (DDFL). +Next, we leverage Data Entropy metrics to observe the process during each +training round and enable reasonable device selection for aggregation. +Furthermore, we provide a convergence analysis of our proposed DDFL to justify +its viability in practical FL scenarios, aiming for better device selection, +a non-sub-optimal global model, and faster convergence. We observe that our +approach results in a substantial accuracy boost of approximately 5\% for the +MNIST dataset, around 18\% for CIFAR-10, and 20\% for CIFAR-100 with a 10\% +global subset of data, outperforming the state-of-the-art (SOTA) aggregation +algorithms. + +
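+ A minimal sketch of entropy-driven participant selection: compute the Shannon entropy of each client's local label distribution and prefer high-entropy (less skewed) clients for aggregation. The selection rule below is an illustrative assumption, not the exact DDFL criterion.
+
+ ```python
+ import numpy as np
+
+ def label_entropy(labels: np.ndarray, num_classes: int) -> float:
+     """Shannon entropy (in nats) of a client's empirical label distribution."""
+     counts = np.bincount(labels, minlength=num_classes).astype(float)
+     p = counts / counts.sum()
+     p = p[p > 0]
+     return float(-(p * np.log(p)).sum())
+
+ def select_clients(client_labels: list, num_classes: int, k: int) -> list:
+     """Pick the k clients whose local data is least skewed (highest entropy)."""
+     scores = [label_entropy(lbl, num_classes) for lbl in client_labels]
+     return list(np.argsort(scores)[::-1][:k])
+
+ # Example: 5 clients, 10 classes; client 0 holds a single class (highly skewed).
+ rng = np.random.default_rng(0)
+ clients = [np.zeros(100, dtype=int)] + [rng.integers(0, 10, 100) for _ in range(4)]
+ print(select_clients(clients, num_classes=10, k=2))
+ ```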
+
+ comment: Submitted to IEEE Transactions on the Internet of Things +
+
+
+
+
+ + ☆ Large Language Models Engineer Too Many Simple Features For Tabular Data + + +
+ Tabular machine learning problems often require time-consuming and +labor-intensive feature engineering. Recent efforts have focused on using large +language models (LLMs) to capitalize on their potential domain knowledge. At +the same time, researchers have observed ethically concerning negative biases +in other LLM-related use cases, such as text generation. These developments +motivated us to investigate whether LLMs exhibit a bias that negatively impacts +the performance of feature engineering. While not ethically concerning, such a +bias could hinder practitioners from fully utilizing LLMs for automated data +science. Therefore, we propose a method to detect potential biases by detecting +anomalies in the frequency of operators (e.g., adding two features) suggested +by LLMs when engineering new features. Our experiments evaluate the bias of +four LLMs, two big frontier and two small open-source models, across 27 tabular +datasets. Our results indicate that LLMs are biased toward simple operators, +such as addition, and can fail to utilize more complex operators, such as +grouping followed by aggregations. Furthermore, the bias can negatively impact +the predictive performance when using LLM-generated features. Our results call +for mitigating bias when using LLMs for feature engineering. + +
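+ A minimal sketch of the kind of frequency check described above: count how often each operator appears in LLM-suggested features and flag operators that are strongly over- or under-represented relative to a uniform baseline. The operator vocabulary, threshold factor, and example counts are illustrative assumptions, not the paper's detection method.
+
+ ```python
+ from collections import Counter
+
+ OPERATORS = ["add", "subtract", "multiply", "divide", "groupby_then_aggregate"]
+
+ def operator_bias(suggestions: list, factor: float = 2.0) -> dict:
+     """Flag operators whose observed frequency deviates from the uniform
+     expectation by more than `factor` in either direction."""
+     counts = Counter(suggestions)
+     expected = len(suggestions) / len(OPERATORS)
+     report = {}
+     for op in OPERATORS:
+         observed = counts.get(op, 0)
+         if observed > factor * expected:
+             report[op] = "over-represented"
+         elif observed < expected / factor:
+             report[op] = "under-represented"
+     return report
+
+ # Hypothetical batch of operators extracted from LLM feature suggestions.
+ suggested = (["add"] * 50 + ["subtract"] * 30 + ["multiply"] * 15
+              + ["divide"] * 4 + ["groupby_then_aggregate"])
+ print(operator_bias(suggested))
+ ```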
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Scaling Robot Policy Learning via Zero-Shot Labeling with Foundation + Models + + +
+ A central challenge towards developing robots that can relate human language +to their perception and actions is the scarcity of natural language annotations +in diverse robot datasets. Moreover, robot policies that follow natural +language instructions are typically trained on either templated language or +expensive human-labeled instructions, hindering their scalability. To this end, +we introduce NILS: Natural language Instruction Labeling for Scalability. NILS +automatically labels uncurated, long-horizon robot data at scale in a zero-shot +manner without any human intervention. NILS combines pretrained vision-language +foundation models in order to detect objects in a scene, detect object-centric +changes, segment tasks from large datasets of unlabelled interaction data and +ultimately label behavior datasets. Evaluations on BridgeV2, Fractal, and a +kitchen play dataset show that NILS can autonomously annotate diverse robot +demonstrations of unlabeled and unstructured datasets while alleviating several +shortcomings of crowdsourced human annotations, such as low data quality and +diversity. We use NILS to label over 115k trajectories obtained from over 430 +hours of robot data. We open-source our auto-labeling code and generated +annotations on our website: http://robottasklabeling.github.io. + +
+
+ comment: Project Website at https://robottasklabeling.github.io/ +
+
+
+
+
+ + ☆ Locating Information in Large Language Models via Random Matrix Theory + + +
+ As large language models (LLMs) become central to AI applications, gaining a +deeper understanding of their inner workings is increasingly important. In this +work, we analyze the weight matrices of pretrained transformer models -- +specifically BERT and Llama -- using random matrix theory (RMT) as a +zero-information hypothesis. While randomly initialized weights perfectly agree +with RMT predictions, deviations emerge after training, allowing us to locate +learned structures within the models. We identify layer-type specific behaviors +that are consistent across all blocks and architectures considered. By +pinpointing regions that deviate from RMT predictions, we highlight areas of +feature learning and confirm this through comparisons with the activation +covariance matrices of the corresponding layers. Our method provides a +diagnostic tool for identifying relevant regions in transformer weights using +only the trained matrices. Additionally, we address the ongoing debate +regarding the significance of small singular values in the context of +fine-tuning and alignment in LLMs. Our findings reveal that, after fine-tuning, +small singular values play a crucial role in the models' capabilities, +suggesting that removing them in an already aligned transformer can be +detrimental, as it may compromise model alignment. + +
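+ A minimal NumPy sketch of the zero-information baseline used in this kind of analysis: the eigenvalue spectrum of an i.i.d. Gaussian weight matrix should follow the Marchenko-Pastur law, so eigenvalues of a trained matrix that escape the predicted support are candidate signatures of learned structure. The matrix shape and entry variance below are illustrative assumptions.
+
+ ```python
+ import numpy as np
+
+ n, m, sigma = 200, 1000, 1.0           # matrix shape n x m and entry std
+ q = n / m                               # aspect ratio (0 < q <= 1)
+
+ # "Zero-information" weights: i.i.d. Gaussian, as at random initialization.
+ W = np.random.normal(0.0, sigma, size=(n, m))
+ eigs = np.linalg.eigvalsh(W @ W.T / m)  # eigenvalues of the sample covariance
+
+ # Marchenko-Pastur support edges for this aspect ratio.
+ lam_minus = sigma**2 * (1 - np.sqrt(q)) ** 2
+ lam_plus = sigma**2 * (1 + np.sqrt(q)) ** 2
+
+ print(f"empirical spectrum: [{eigs.min():.3f}, {eigs.max():.3f}]")
+ print(f"MP prediction:      [{lam_minus:.3f}, {lam_plus:.3f}]")
+ # For a trained layer, eigenvalues outside [lam_minus, lam_plus] deviate from
+ # the random-matrix baseline and point to regions of feature learning.
+ ```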
+
+ comment: 17 pages, 14 figures +
+
+
+
+
+ + ☆ Faster Language Models with Better Multi-Token Prediction Using Tensor + Decomposition + + +
+ We propose a new model for multi-token prediction in transformers, aiming to +enhance sampling efficiency without compromising accuracy. Motivated by recent +work that predicts the probabilities of subsequent tokens using multiple heads, +we connect this approach to rank-$1$ canonical tensor decomposition. By +generalizing it to a rank-$r$ canonical probability decomposition, we develop +an improved model that predicts multiple tokens simultaneously. This model can +also be interpreted as a mixture of experts, allowing us to leverage successful +techniques from that domain for efficient and robust training. Importantly, the +overall overhead for training and sampling remains low. Our method demonstrates +significant improvements in inference speed for both text and code generation +tasks, proving particularly beneficial within the self-speculative decoding +paradigm. It maintains its effectiveness across various model sizes and +training epochs, highlighting its robustness and scalability. + +
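+ One way to write the rank-$r$ idea sketched above is as a mixture-of-experts-style factorization of the joint distribution over the next two tokens; the exact parameterization in the paper may differ, so treat this as an illustrative reading of the abstract.
+
+ ```latex
+ % Rank-1 multi-head prediction treats the future tokens as independent:
+ P(t_{i+1} = a,\; t_{i+2} = b \mid x) \approx p(a \mid x)\, q(b \mid x).
+ % A rank-r canonical decomposition mixes r such factors, with mixture
+ % weights w_s(x) \ge 0 and \sum_s w_s(x) = 1:
+ P(t_{i+1} = a,\; t_{i+2} = b \mid x)
+   \approx \sum_{s=1}^{r} w_s(x)\, p_s(a \mid x)\, q_s(b \mid x).
+ ```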
+
+
+
+
+ + ☆ Beyond Backpropagation: Optimization with Multi-Tangent Forward + Gradients + + +
+ The gradients used to train neural networks are typically computed using +backpropagation. While an efficient way to obtain exact gradients, +backpropagation is computationally expensive, hinders parallelization, and is +biologically implausible. Forward gradients are an approach to approximate the +gradients from directional derivatives along random tangents computed by +forward-mode automatic differentiation. So far, research has focused on using a +single tangent per step. This paper provides an in-depth analysis of +multi-tangent forward gradients and introduces an improved approach to +combining the forward gradients from multiple tangents based on orthogonal +projections. We demonstrate that increasing the number of tangents improves +both approximation quality and optimization performance across various tasks. + +
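+ A minimal NumPy sketch of a multi-tangent forward gradient on a toy quadratic. The directional derivatives are computed analytically here rather than by forward-mode automatic differentiation, and combining orthonormalized tangents by summing their projections is one simple choice; the paper's projection-based combination may differ.
+
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ d, k = 50, 8                               # parameter dimension, number of tangents
+
+ A = rng.normal(size=(d, d)); A = A @ A.T   # SPD matrix for f(w) = 0.5 w^T A w
+ w = rng.normal(size=d)
+ true_grad = A @ w                          # exact gradient, for reference only
+
+ # Random tangents, orthonormalized so their projections do not overlap.
+ V, _ = np.linalg.qr(rng.normal(size=(d, k)))    # columns are orthonormal tangents
+
+ # Forward-mode AD would return the directional derivative <grad, v> per tangent;
+ # here we use the analytic value to keep the sketch self-contained.
+ dir_derivs = V.T @ true_grad                    # shape (k,)
+
+ # Multi-tangent forward gradient: the gradient projected onto span of the tangents.
+ forward_grad = V @ dir_derivs
+
+ cos = forward_grad @ true_grad / (np.linalg.norm(forward_grad) * np.linalg.norm(true_grad))
+ print(f"cosine similarity to true gradient with k={k}: {cos:.3f}")
+ # Increasing k enlarges the projected subspace and improves the approximation.
+ ```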
+
+
+
+
+ + ☆ Anomaly Resilient Temporal QoS Prediction using Hypergraph Convoluted + Transformer Network + + +
+ Quality-of-Service (QoS) prediction is a critical task in the service +lifecycle, enabling precise and adaptive service recommendations by +anticipating performance variations over time in response to evolving network +uncertainties and user preferences. However, contemporary QoS prediction +methods frequently encounter data sparsity and cold-start issues, which hinder +accurate QoS predictions and limit the ability to capture diverse user +preferences. Additionally, these methods often assume QoS data reliability, +neglecting potential credibility issues such as outliers and the presence of +greysheep users and services with atypical invocation patterns. Furthermore, +traditional approaches fail to leverage diverse features, including +domain-specific knowledge and complex higher-order patterns, essential for +accurate QoS predictions. In this paper, we introduce a real-time, trust-aware +framework for temporal QoS prediction to address the aforementioned challenges, +featuring an end-to-end deep architecture called the Hypergraph Convoluted +Transformer Network (HCTN). HCTN combines a hypergraph structure with graph +convolution over hyper-edges to effectively address high-sparsity issues by +capturing complex, high-order correlations. Complementing this, the transformer +network utilizes multi-head attention along with parallel 1D convolutional +layers and fully connected dense blocks to capture both fine-grained and +coarse-grained dynamic patterns. Additionally, our approach includes a +sparsity-resilient solution for detecting greysheep users and services, +incorporating their unique characteristics to improve prediction accuracy. +Trained with a robust loss function resistant to outliers, HCTN demonstrated +state-of-the-art performance on the large-scale WSDREAM-2 datasets for response +time and throughput. + +
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ☆ Topology meets Machine Learning: An Introduction using the Euler + Characteristic Transform + + +
+ This overview article makes the case for how topological concepts can enrich +research in machine learning. Using the Euler Characteristic Transform (ECT), a +geometrical-topological invariant, as a running example, I present different +use cases that result in more efficient models for analyzing point clouds, +graphs, and meshes. Moreover, I outline a vision for how topological concepts +could be used in the future, comprising (1) the learning of functions on +topological spaces, (2) the building of hybrid models that imbue neural +networks with knowledge about the topological information in data, and (3) the +analysis of qualitative properties of neural networks. With current research +already addressing some of these aspects, this article thus serves as an +introduction and invitation to this nascent area of research. + +
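+ A minimal sketch of the Euler characteristic computation underlying the ECT, for a graph embedded in the plane: for one direction $v$ and increasing thresholds $t$, count the vertices and edges contained in the sublevel set and take $\chi = |V| - |E|$. The toy graph and direction below are illustrative; the full ECT sweeps over many directions.
+
+ ```python
+ import numpy as np
+
+ # Toy graph embedded in R^2: a unit square with one diagonal.
+ coords = np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
+ edges = [(0, 1), (1, 2), (2, 3), (3, 0), (0, 2)]
+
+ def euler_curve(coords, edges, direction, thresholds):
+     """chi(t) = #vertices - #edges of the sublevel set {x : <x, direction> <= t}."""
+     heights = coords @ direction
+     curve = []
+     for t in thresholds:
+         v_in = heights <= t
+         e_in = sum(v_in[i] and v_in[j] for i, j in edges)
+         curve.append(int(v_in.sum()) - e_in)
+     return curve
+
+ direction = np.array([1.0, 0.0])
+ print(euler_curve(coords, edges, direction, thresholds=[-0.5, 0.0, 0.5, 1.0, 1.5]))
+ # -> [0, 1, 1, -1, -1]: the curve records how topology changes along the direction.
+ ```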
+
+
+
+
+ + ☆ Escaping the Forest: Sparse Interpretable Neural Networks for Tabular + Data + + +
+ Tabular datasets are widely used in scientific disciplines such as biology. +While these disciplines have already adopted AI methods to enhance their +findings and analysis, they mainly use tree-based methods due to their +interpretability. At the same time, artificial neural networks have been shown +to offer superior flexibility and depth for rich and complex non-tabular +problems, but they fall behind tree-based models for tabular data in +terms of performance and interpretability. Although sparsity has been shown to +improve the interpretability and performance of ANN models for complex +non-tabular datasets, enforcing sparsity structurally and formatively for +tabular data before training the model remains an open question. To address +this question, we establish a method that infuses sparsity into neural networks +by utilising attention mechanisms to capture the features' importance in +tabular datasets. We show that our models, Sparse TABular NET or sTAB-Net with +attention mechanisms, are more effective than tree-based models, reaching the +state-of-the-art on biological datasets. They further permit the extraction of +insights from these datasets and achieve better performance than post-hoc +methods like SHAP. + +
+
+
+
+
+ + ☆ VISAGE: Video Synthesis using Action Graphs for Surgery MICCAI 2024 + + +
+ Surgical data science (SDS) is a field that analyzes patient data before, +during, and after surgery to improve surgical outcomes and skills. However, +surgical data is scarce, heterogeneous, and complex, which limits the +applicability of existing machine learning methods. In this work, we introduce +the novel task of future video generation in laparoscopic surgery. This task +can augment and enrich the existing surgical data and enable various +applications, such as simulation, analysis, and robot-aided surgery. +Ultimately, it involves not only understanding the current state of the +operation but also accurately predicting the dynamic and often unpredictable +nature of surgical procedures. Our proposed method, VISAGE (VIdeo Synthesis +using Action Graphs for Surgery), leverages the power of action scene graphs to +capture the sequential nature of laparoscopic procedures and utilizes diffusion +models to synthesize temporally coherent video sequences. VISAGE predicts the +future frames given only a single initial frame, and the action graph triplets. +By incorporating domain-specific knowledge through the action graph, VISAGE +ensures the generated videos adhere to the expected visual and motion patterns +observed in real laparoscopic procedures. The results of our experiments +demonstrate high-fidelity video generation for laparoscopy procedures, which +enables various applications in SDS. + +
+
+ comment: Accepted at MICCAI 2024 Embodied AI and Robotics for HealTHcare + (EARTH) Workshop +
+
+
+
+
+ + ☆ Can Uncertainty Quantification Enable Better Learning-based Index + Tuning? + + +
+ Index tuning is crucial for optimizing database performance by selecting +optimal indexes based on workload. The key to this process lies in an accurate +and efficient benefit estimator. Traditional methods relying on what-if tools +often suffer from inefficiency and inaccuracy. In contrast, learning-based +models provide a promising alternative but face challenges such as instability, +lack of interpretability, and complex management. To overcome these +limitations, we adopt a novel approach: quantifying the uncertainty in +learning-based models' results, thereby combining the strengths of both +traditional and learning-based methods for reliable index tuning. We propose +Beauty, the first uncertainty-aware framework that enhances learning-based +models with uncertainty quantification and uses what-if tools as a +complementary mechanism to improve reliability and reduce management +complexity. Specifically, we introduce a novel method that combines AutoEncoder +and Monte Carlo Dropout to jointly quantify uncertainty, tailored to the +characteristics of benefit estimation tasks. In experiments involving sixteen +models, our approach outperformed existing uncertainty quantification methods +in the majority of cases. We also conducted index tuning tests on six datasets. +By applying the Beauty framework, we eliminated worst-case scenarios and more +than tripled the occurrence of best-case scenarios. + +
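+ A minimal NumPy sketch of the Monte Carlo Dropout half of the idea: keep dropout active at inference, run several stochastic forward passes, and read the spread of the predictions as an uncertainty signal. The tiny fixed network and dropout rate are illustrative assumptions, not the Beauty architecture.
+
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ W1 = rng.normal(size=(16, 8))    # fixed weights of a toy two-layer regressor
+ W2 = rng.normal(size=(8, 1))
+
+ def mc_dropout_predict(x, n_samples=100, p_drop=0.2):
+     """Repeated stochastic forward passes with dropout kept on at inference;
+     return the predictive mean and standard deviation (uncertainty)."""
+     preds = []
+     for _ in range(n_samples):
+         h = np.maximum(x @ W1, 0.0)                 # ReLU hidden layer
+         mask = rng.random(h.shape) > p_drop         # Bernoulli dropout mask
+         h = h * mask / (1.0 - p_drop)               # inverted dropout scaling
+         preds.append(h @ W2)
+     preds = np.stack(preds)
+     return preds.mean(axis=0), preds.std(axis=0)
+
+ x = rng.normal(size=(1, 16))
+ mean, std = mc_dropout_predict(x)
+ print(f"benefit estimate: {mean.item():.3f} +/- {std.item():.3f}")
+ ```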
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ☆ Learning Versatile Skills with Curriculum Masking NeurIPS 2024 + + +
+ Masked prediction has emerged as a promising pretraining paradigm in offline +reinforcement learning (RL) due to its versatile masking schemes, enabling +flexible inference across various downstream tasks with a unified model. +Despite the versatility of masked prediction, it remains unclear how to balance +the learning of skills at different levels of complexity. To address this, we +propose CurrMask, a curriculum masking pretraining paradigm for sequential +decision making. Motivated by how humans learn by organizing knowledge in a +curriculum, CurrMask adjusts its masking scheme during pretraining for learning +versatile skills. Through extensive experiments, we show that CurrMask exhibits +superior zero-shot performance on skill prompting tasks, goal-conditioned +planning tasks, and competitive finetuning performance on offline RL tasks. +Additionally, our analysis of training dynamics reveals that CurrMask gradually +acquires skills of varying complexity by dynamically adjusting its masking +scheme. + +
+
+ comment: NeurIPS 2024 poster, 21 pages, 7 figures +
+
+
+
+
+ + ☆ Continual Learning on a Data Diet + + +
+ Continual Learning (CL) methods usually learn from all available data. +However, this is not the case in human cognition which efficiently focuses on +key experiences while disregarding the redundant information. Similarly, not +all data points in a dataset have equal potential; some can be more informative +than others. This disparity may significantly impact the performance, as both +the quality and quantity of samples directly influence the model's +generalizability and efficiency. Drawing inspiration from this, we explore the +potential of learning from important samples and present an empirical study for +evaluating coreset selection techniques in the context of CL to stimulate +research in this unexplored area. We train different continual learners on +increasing amounts of selected samples and investigate the learning-forgetting +dynamics by shedding light on the underlying mechanisms driving their improved +stability-plasticity balance. We present several significant observations: +learning from selectively chosen samples (i) enhances incremental accuracy, +(ii) improves knowledge retention of previous tasks, and (iii) refines learned +representations. This analysis contributes to a deeper understanding of +selective learning strategies in CL scenarios. + +
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ☆ Beware of Calibration Data for Pruning Large Language Models + + +
+ As large language models (LLMs) are widely applied across various fields, +model compression has become increasingly crucial for reducing costs and +improving inference efficiency. Post-training pruning is a promising method +that does not require resource-intensive iterative training and only needs a +small amount of calibration data to assess the importance of parameters. +Previous research has primarily focused on designing advanced pruning methods, +while the impact of different calibration data on pruning performance still lacks +systematic exploration. We fill this gap and surprisingly observe that the +choice of calibration data can matter even more than the design of advanced pruning +strategies, especially for high sparsity. Our preliminary exploration also +discloses that using calibration data similar to the training data can yield +better performance. As pre-training data is usually inaccessible for advanced +LLMs, we further provide a self-generating calibration data synthesis strategy +to construct feasible calibration data. We conduct experiments on recent +strong open-source LLMs (e.g., DCLM and LLaMA-3), and the results show that +the proposed method outperforms commonly used calibration data and can +effectively enhance strong pruning methods (e.g., Wanda, OWL). + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Scalable Random Feature Latent Variable Models + + +
+ Random feature latent variable models (RFLVMs) represent the state-of-the-art +in latent variable models, capable of handling non-Gaussian likelihoods and +effectively uncovering patterns in high-dimensional data. However, their heavy +reliance on Monte Carlo sampling results in scalability issues, which make it +difficult to use these models for datasets with a massive number of +observations. To scale up RFLVMs, we turn to the optimization-based variational +Bayesian inference (VBI) algorithm, which is known for its scalability compared +to sampling-based methods. However, implementing VBI for RFLVMs poses +challenges, such as the lack of explicit probability distribution functions +(PDFs) for the Dirichlet process (DP) in the kernel learning component, and the +incompatibility of existing VBI algorithms with RFLVMs. To address these +issues, we introduce a stick-breaking construction for the DP to obtain an explicit +PDF and a novel VBI algorithm called ``block coordinate descent variational +inference'' (BCD-VI). This enables the development of a scalable version of +RFLVMs, or in short, SRFLVM. Our proposed method shows scalability, +computational efficiency, superior performance in generating informative latent +representations, and the ability to impute missing data across various +real-world datasets, outperforming state-of-the-art competitors. + +
+
+
+
+
+ + ☆ Optimizing Load Scheduling in Power Grids Using Reinforcement Learning + and Markov Decision Processes + + +
+ Power grid load scheduling is a critical task that ensures the balance +between electricity generation and consumption while minimizing operational +costs and maintaining grid stability. Traditional optimization methods often +struggle with the dynamic and stochastic nature of power systems, especially +when faced with renewable energy sources and fluctuating demand. This paper +proposes a reinforcement learning (RL) approach using a Markov Decision Process +(MDP) framework to address the challenges of dynamic load scheduling. The MDP +is defined by a state space representing grid conditions, an action space +covering control operations like generator adjustments and storage management, +and a reward function balancing economic efficiency and system reliability. We +investigate the application of various RL algorithms, from basic Q-Learning to +more advanced Deep Q-Networks (DQN) and Actor-Critic methods, to determine +optimal scheduling policies. The proposed approach is evaluated through a +simulated power grid environment, demonstrating its potential to improve +scheduling efficiency and adapt to variable demand patterns. Our results show +that the RL-based method provides a robust and scalable solution for real-time +load scheduling, contributing to the efficient management of modern power +grids. + +
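+ A minimal sketch of the tabular Q-Learning baseline in this setting: a toy MDP where the state is a discretized demand level, the action is a dispatch level, and the reward penalizes supply-demand imbalance plus a generation cost. The environment dynamics, reward weights, and hyperparameters are illustrative assumptions, not the paper's setup.
+
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ n_levels = 5                       # discretized demand / dispatch levels 0..4
+ Q = np.zeros((n_levels, n_levels)) # Q[state, action]
+ alpha, gamma, eps = 0.1, 0.95, 0.1
+ cost_weight = 0.2
+
+ def step(demand, action):
+     """Reward penalizes imbalance and generation cost; demand does a random walk."""
+     reward = -abs(demand - action) - cost_weight * action
+     next_demand = int(np.clip(demand + rng.integers(-1, 2), 0, n_levels - 1))
+     return reward, next_demand
+
+ demand = 2
+ for _ in range(50_000):
+     # Epsilon-greedy action selection.
+     action = rng.integers(n_levels) if rng.random() < eps else int(Q[demand].argmax())
+     reward, next_demand = step(demand, action)
+     # Standard Q-Learning temporal-difference update.
+     Q[demand, action] += alpha * (reward + gamma * Q[next_demand].max() - Q[demand, action])
+     demand = next_demand
+
+ print("greedy dispatch per demand level:", Q.argmax(axis=1))
+ ```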
+
+
+
+
+ + ☆ PETAH: Parameter Efficient Task Adaptation for Hybrid Transformers in a + resource-limited Context + + +
+ Following their success in natural language processing (NLP), there has been +a shift towards transformer models in computer vision. While transformers +perform well and offer promising multi-tasking performance, due to their high +compute requirements, many resource-constrained applications still rely on +convolutional or hybrid models that combine the benefits of convolution and +attention layers and achieve the best results in the sub 100M parameter range. +Simultaneously, task adaptation techniques that allow for the use of one shared +transformer backbone for multiple downstream tasks, resulting in great storage +savings at negligible cost in performance, have not yet been adopted for hybrid +transformers. In this work, we investigate how to achieve the best +task-adaptation performance and introduce PETAH: Parameter Efficient Task +Adaptation for Hybrid Transformers. We further combine PETAH adaptation with +pruning to achieve highly performant and storage friendly models for +multi-tasking. In our extensive evaluation on classification and other vision +tasks, we demonstrate that our PETAH-adapted hybrid models outperform +established task-adaptation techniques for ViTs while requiring fewer +parameters and being more efficient on mobile hardware. + +
+
+
+
+
+ + ☆ Mapping the Media Landscape: Predicting Factual Reporting and Political + Bias Through Web Interactions + + +
+ Bias assessment of news sources is paramount for professionals, +organizations, and researchers who rely on truthful evidence for information +gathering and reporting. While certain bias indicators are discernible from +content analysis, descriptors like political bias and fake news pose greater +challenges. In this paper, we propose an extension to a recently presented news +media reliability estimation method that focuses on modeling outlets and their +longitudinal web interactions. Concretely, we assess the classification +performance of four reinforcement learning strategies on a large news media +hyperlink graph. Our experiments, targeting two challenging bias descriptors, +factual reporting and political bias, showed a significant performance +improvement at the source media level. Additionally, we validate our methods on +the CLEF 2023 CheckThat! Lab challenge, outperforming the reported results in +both F1-score and the official MAE metric. Furthermore, we contribute by +releasing the largest annotated dataset of news source media, categorized with +factual reporting and political bias labels. Our findings suggest that +profiling news media sources based on their hyperlink interactions over time is +feasible, offering a bird's-eye view of evolving media landscapes. + +
+
+ comment: Accepted to CLEF 2024 +
+
+
+
+
+ + ☆ Towards Active Participant-Centric Vertical Federated Learning: Some + Representations May Be All You Need + + +
+ Vertical Federated Learning (VFL) enables collaborative model training across +different participants with distinct features and common samples, while +preserving data privacy. Existing VFL methodologies often struggle with +realistic data partitions, typically incurring high communication costs and +significant operational complexity. In this work, we introduce a novel, +simplified approach to VFL, Active Participant-Centric VFL (APC-VFL), that, to +the best of our knowledge, is the first to require only a single communication +round between participants, and allows the active participant to perform inference +in a non-collaborative fashion. This method integrates unsupervised +representation learning with knowledge distillation to achieve accuracy comparable +to traditional VFL methods based on vertical split learning in +classical settings, reducing required communication rounds by up to +$4200\times$, while being more flexible. Our approach also shows improvements +compared to non-federated local models, as well as a comparable VFL proposal, +VFedTrans, offering an efficient and flexible solution for collaborative +learning. + +
+
+
+
+
+ + ☆ Entity-based Reinforcement Learning for Autonomous Cyber Defence CCS 2024 + + +
+ A significant challenge for autonomous cyber defence is ensuring a defensive +agent's ability to generalise across diverse network topologies and +configurations. + This capability is necessary for agents to remain effective when deployed in +dynamically changing environments, such as an enterprise network where devices +may frequently join and leave. + Standard approaches to deep reinforcement learning, where policies are +parameterised using a fixed-input multi-layer perceptron (MLP), expect +fixed-size observation and action spaces. In autonomous cyber defence, this +makes it hard to develop agents that generalise to environments with network +topologies different from those trained on, as the number of nodes affects the +natural size of the observation and action spaces. To overcome this limitation, +we reframe the problem of autonomous network defence using entity-based +reinforcement learning, where the observation and action space of an agent are +decomposed into a collection of discrete entities. This framework enables the +use of policy parameterisations specialised in compositional generalisation. +Namely, we train a Transformer-based policy on the Yawning Titan cyber-security +simulation environment and test its generalisation capabilities across various +network topologies. We demonstrate that this approach significantly outperforms +an MLP-based policy on fixed networks, and is capable of zero-shot +generalisation to networks of a different size from those seen in training. + These findings highlight the potential for entity-based reinforcement +learning to advance the field of autonomous cyber defence by providing more +generalisable policies capable of handling variations in real-world network +environments. + +
+
+ comment: Material to appear in the proceedings of the 1st International + Workshop on Autonomous Cybersecurity at ACM CCS 2024 +
+
+
+
+
+ + ☆ Exploring structure diversity in atomic resolution microscopy with graph + neural networks + + +
+ The emergence of deep learning (DL) has provided great opportunities for the +high-throughput analysis of atomic-resolution micrographs. However, DL +models trained on fixed-size image patches generally lack efficiency and +flexibility when processing micrographs containing diversified atomic +configurations. Herein, inspired by the similarity between atomic +structures and graphs, we describe a few-shot learning framework based on an +equivariant graph neural network (EGNN) to analyze a library of atomic +structures (e.g., vacancies, phases, grain boundaries, doping, etc.), showing +significantly improved robustness and a three-orders-of-magnitude reduction in +computing parameters compared to image-driven DL models, which is +especially evident for aggregated vacancy lines with flexible lattice +distortion. Moreover, the intuitiveness of graphs enables quantitative and +straightforward extraction of atomic-scale structural features in batches, +thus statistically unveiling the self-assembly dynamics of vacancy lines under +electron beam irradiation. A versatile model toolkit is established by +integrating EGNN sub-models for single-structure recognition to process images +involving varied configurations in the form of a task chain, leading to the +discovery of novel doping configurations with superior electrocatalytic +properties for hydrogen evolution reactions. This work provides a powerful tool +to explore structure diversity in a fast, accurate, and intelligent manner. + +
+
+
+
+
+ + ☆ Feature Learning in Attention Mechanisms Is More Compact and Stable Than + in Convolution + + +
+ Attention and convolution are fundamental techniques in machine learning. +While they learn features in different ways - attention mechanisms +capture both global and local data relationships, whereas convolutional layers +focus on local patterns - both methods are effective for various tasks. +Although the feature learning of both models is well-studied individually, +there has not been a direct comparison of their feature learning dynamics. In +this paper, we compare their Lipschitz continuity with respect to the +Wasserstein distance and covering numbers under similar settings. We +demonstrate that attention processes data in a more compact and stable manner. +Compactness refers to the lower variance and intrinsic dimensionality of the +activation outputs, while stability refers to the changes between inputs and +outputs. We validate our findings through experiments using topological data +analysis, measuring the 1-, 2-, and infinity-Wasserstein distances between the +outputs of each layer from both models. Furthermore, we extend our comparison +to Vision Transformers (ViTs) and ResNets, showing that while ViTs have higher +output variance, their feature learning is more stable than that of ResNets. + +
+
+
+
+
+ + ☆ Incremental Learning of Affordances using Markov Logic Networks + + +
+ Affordances enable robots to have a semantic understanding of their +surroundings. This allows them to have more acting flexibility when completing +a given task. Capturing object affordances in a machine learning model is a +difficult task, because of their dependence on contextual information. Markov +Logic Networks (MLN) combine probabilistic reasoning with logic that is able to +capture such context. Mobile robots operate in partially known environments +wherein unseen object affordances can be observed. This new information must be +incorporated into the existing knowledge, without having to retrain the MLN +from scratch. We introduce the MLN Cumulative Learning Algorithm (MLN-CLA). +MLN-CLA learns new relations in various knowledge domains by retaining +knowledge and only updating the changed knowledge, for which the MLN is +retrained. We show that MLN-CLA is effective for accumulative learning and +zero-shot affordance inference, outperforming strong baselines. + +
+
+ comment: accepted at IEEE IRC 2024 +
+
+
+
+
+ + ☆ Self-Supervised Graph Neural Networks for Enhanced Feature Extraction in + Heterogeneous Information Networks + + +
+ This paper explores the applications and challenges of graph neural networks +(GNNs) in processing complex graph data brought about by the rapid development +of the Internet. Given the heterogeneity and redundancy problems that graph +data often have, traditional GNN methods may be overly dependent on the initial +structure and attribute information of the graph, which limits their ability to +accurately simulate more complex relationships and patterns in the graph. +Therefore, this study proposes a graph neural network model under a +self-supervised learning framework, which can flexibly combine different types +of additional information of the attribute graph and its nodes, so as to better +mine the deep features in the graph data. By introducing a self-supervisory +mechanism, it is expected to improve the adaptability of existing models to the +diversity and complexity of graph data and improve the overall performance of +the model. + +
+
+
+
+
+ + ☆ A Kernel Perspective on Distillation-based Collaborative Learning NeurIPS 2024 + + +
+ Over the past decade, there has been growing interest in collaborative learning +that can enhance the AI models of multiple parties. However, it is still +challenging to enhance their performance without sharing private data and models +from individual parties. One recent promising approach is to develop +distillation-based algorithms that exploit unlabeled public data, but the +results are still unsatisfactory in both theory and practice. To tackle this +problem, we rigorously analyze a representative distillation-based algorithm from +the viewpoint of kernel regression. This work provides the first theoretical results +to prove the (nearly) minimax optimality of a nonparametric collaborative +learning algorithm that does not directly share local data or models in +massively distributed, statistically heterogeneous environments. Inspired by our +theoretical results, we also propose a practical distillation-based +collaborative learning algorithm based on a neural network architecture. Our +algorithm successfully bridges the gap between our theoretical assumptions and +practical settings with neural networks through feature kernel matching. We +simulate various regression tasks to verify our theory and demonstrate the +practical feasibility of our proposed algorithm. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Challenge on Sound Scene Synthesis: Evaluating Text-to-Audio Generation NeurIPS 2024 + + +
+ Despite significant advancements in neural text-to-audio generation, +challenges persist in controllability and evaluation. This paper addresses +these issues through the Sound Scene Synthesis challenge held as part of the +Detection and Classification of Acoustic Scenes and Events 2024. We present an +evaluation protocol combining an objective metric, namely the Fr\'echet Audio +Distance, with perceptual assessments, utilizing a structured prompt format to +enable diverse captions and effective evaluation. Our analysis reveals varying +performance across sound categories and model architectures, with larger models +generally excelling but innovative lightweight approaches also showing promise. +The strong correlation between objective metrics and human ratings validates +our evaluation approach. We discuss outcomes in terms of audio quality, +controllability, and architectural considerations for text-to-audio +synthesizers, providing direction for future research. + +
+
+ comment: accepted to NeurIPS 2024 Workshop: Audio Imagination +
+
+
+
+
+ + ☆ Predicting Company Growth by Econophysics informed Machine Learning + + +
+ Predicting company growth is crucial for strategic adjustment, operational +decision-making, risk assessment, and loan eligibility reviews. Traditional +models for company growth often focus too much on theory, overlooking practical +forecasting, or they rely solely on time series forecasting techniques, +ignoring interpretability and the inherent mechanisms of company growth. In +this paper, we propose a machine learning-based prediction framework that +incorporates an econophysics model for company growth. Our model captures both +the intrinsic growth mechanisms of companies led by scaling laws and the +fluctuations influenced by random factors and individual decisions, +demonstrating superior predictive performance compared with methods that use +time series techniques alone. Its advantages are more pronounced in long-range +prediction tasks. By explicitly modeling the baseline growth and volatility +components, our model is more interpretable. + +
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ☆ Bonsai: Gradient-free Graph Distillation for Node Classification + + +
+ Graph distillation has emerged as a promising avenue to enable scalable +training of GNNs by compressing the training dataset while preserving essential +graph characteristics. Our study uncovers significant shortcomings in current +graph distillation techniques. First, the majority of the algorithms +paradoxically require training on the full dataset to perform distillation. +Second, due to their gradient-emulating approach, these methods require fresh +distillation for any change in hyperparameters or GNN architecture, limiting +their flexibility and reusability. Finally, they fail to achieve substantial +size reduction because they synthesize fully-connected, edge-weighted graphs. To +address these challenges, we present Bonsai, a novel graph distillation method +empowered by the observation that \textit{computation trees} form the +fundamental processing units of message-passing GNNs. Bonsai distills datasets +by encoding a careful selection of \textit{exemplar} trees that maximize the +representation of all computation trees in the training set. This unique +approach makes Bonsai the first linear-time, model-agnostic graph +distillation algorithm for node classification that outperforms existing +baselines across $6$ real-world datasets on accuracy, while being $22$ times +faster on average. Bonsai is grounded in rigorous mathematical guarantees on +the adopted approximation strategies, making it robust to GNN architectures, +datasets, and parameters. + +
+
+
+
+
+ + ☆ Adversarial Domain Adaptation for Metal Cutting Sound Detection: + Leveraging Abundant Lab Data for Scarce Industry Data + + +
+ Cutting state monitoring in the milling process is crucial for improving +manufacturing efficiency and tool life. Cutting sound detection using machine +learning (ML) models, inspired by experienced machinists, can be employed as a +cost-effective and non-intrusive monitoring method in a complex manufacturing +environment. However, labeling industry data for training is costly and +time-consuming. Moreover, industry data is often scarce. In this study, we +propose a novel adversarial domain adaptation (DA) approach to leverage +abundant lab data to learn from scarce industry data, both labeled, for +training a cutting-sound detection model. Rather than adapting the features +from separate domains directly, we first project them into two separate latent +spaces that jointly work as the feature space for learning domain-independent +representations. We also analyze two different mechanisms for adversarial +learning, where the discriminator works as an adversary and a critic in separate +settings, enabling our model to learn expressive domain-invariant and +domain-ingrained features, respectively. We collected cutting sound data from +multiple sensors in different locations, prepared datasets from the lab and +industry domains, and evaluated our learning models on them. Experiments showed +that our models outperformed multi-layer perceptron-based vanilla domain +adaptation models in labeling tasks on the curated datasets, achieving nearly +92%, 82%, and 85% accuracy, respectively, for three different sensors installed in +industry settings. + +
+
+ comment: 8 pages, 3 figures, 3 tables. The first two named authors contributed + equally (co-first authors) +
+
+
+
+
+ + ☆ Securing Federated Learning Against Novel and Classic Backdoor Threats + During Foundation Model Integration + + +
+ Federated learning (FL) enables decentralized model training while preserving +privacy. Recently, integrating Foundation Models (FMs) into FL has boosted +performance but also introduced a novel backdoor attack mechanism. Attackers +can exploit the FM's capabilities to embed backdoors into synthetic data +generated by FMs used for model fusion, subsequently infecting all client +models through knowledge sharing without involvement in the long-lasting FL +process. These novel attacks render existing FL backdoor defenses ineffective, +as they primarily detect anomalies among client updates, which may appear +uniformly malicious under this attack. Our work proposes a novel data-free +defense strategy by constraining abnormal activations in the hidden feature +space during model aggregation on the server. The activation constraints, +optimized using synthetic data alongside FL training, mitigate the attack while +barely affecting model performance, as the parameters remain untouched. +Extensive experiments demonstrate its effectiveness against both novel and +classic backdoor attacks, outperforming existing defenses while maintaining +model performance. + +
+
+
+
+
+ + ♻ ☆ Pruning By Explaining Revisited: Optimizing Attribution Methods to Prune + CNNs and Transformers ECCV 2024 + + +
+ To solve ever more complex problems, Deep Neural Networks are scaled to +billions of parameters, leading to huge computational costs. An effective +approach to reduce computational requirements and increase efficiency is to +prune unnecessary components of these often over-parameterized networks. +Previous work has shown that attribution methods from the field of eXplainable +AI serve as effective means to extract and prune the least relevant network +components in a few-shot fashion. We extend the current state by proposing to +explicitly optimize hyperparameters of attribution methods for the task of +pruning, and further include transformer-based networks in our analysis. Our +approach yields higher model compression rates of large transformer- and +convolutional architectures (VGG, ResNet, ViT) compared to previous works, +while still attaining high performance on ImageNet classification tasks. Here, +our experiments indicate that transformers have a higher degree of +over-parameterization compared to convolutional neural networks. Code is +available at https://github.com/erfanhatefi/Pruning-by-eXplaining-in-PyTorch. + +
+
+ comment: Accepted as a workshop paper at ECCV 2024, 26 pages (11 pages + manuscript, 3 pages references, 12 pages appendix) +
+
+
+
+
+ + ♻ ☆ Correlated Proxies: A New Definition and Improved Mitigation for Reward + Hacking + + +
+ Because it is difficult to precisely specify complex objectives, +reinforcement learning policies are often optimized using flawed proxy rewards +that seem to capture the true objective. However, optimizing proxy rewards +frequently leads to reward hacking: the optimized reward function ceases to be +a good proxy, and the resulting policy performs poorly with respect to the +unspecified true reward. Principled solutions to reward hacking have been +impeded by the lack of a good definition for the problem. To address this, we +introduce a definition of reward hacking based on the correlation between proxy +and true rewards for states and actions seen by a "base policy" that breaks +down under optimization. We show that this definition captures reward hacking +behavior across several realistic settings, including in reinforcement learning +from human feedback (RLHF). We then show theoretically that regularization to +the base policy can effectively prevent reward hacking. While current RLHF +approaches apply a KL penalty between the action distributions of policies, our +theory suggests that it is more effective to regularize using the $\chi^2$ +divergence between the policies' occupancy measures. We intuitively show why +this type of regularization is superior and demonstrate that it better +mitigates reward hacking in practice across four realistic domains, including +RLHF for LLMs. Our code is available at https://github.com/cassidylaidlaw/orpo. + +
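+ For reference, the two divergences contrasted above, written for occupancy measures $d^{\pi}$ and $d^{\pi_0}$ of the learned and base policies. These are the standard definitions; the exact regularized objective in the paper may differ.
+
+ ```latex
+ % KL penalty (what current RLHF pipelines approximate on action distributions):
+ D_{\mathrm{KL}}\bigl(d^{\pi} \,\|\, d^{\pi_0}\bigr)
+   = \sum_{s,a} d^{\pi}(s,a) \log \frac{d^{\pi}(s,a)}{d^{\pi_0}(s,a)}.
+ % Chi-squared divergence between occupancy measures (the suggested regularizer):
+ \chi^2\bigl(d^{\pi} \,\|\, d^{\pi_0}\bigr)
+   = \sum_{s,a} \frac{\bigl(d^{\pi}(s,a) - d^{\pi_0}(s,a)\bigr)^2}{d^{\pi_0}(s,a)}.
+ ```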
+
+
+
+
+ + ♻ ☆ Conditional Language Policy: A General Framework for Steerable + Multi-Objective Finetuning EMNLP 2024 + + +
+ Reward-based finetuning is crucial for aligning language policies with +intended behaviors (e.g., creativity and safety). A key challenge is to develop +steerable language models that trade off multiple (conflicting) objectives in a +flexible and efficient manner. This paper presents Conditional Language Policy +(CLP), a general framework for finetuning language models on multiple +objectives. Building on techniques from multi-task training and +parameter-efficient finetuning, CLP learns steerable models that effectively +trade off conflicting objectives at inference time. Notably, this does not +require training or maintaining multiple models to achieve different trade-offs +between the objectives. Through extensive experiments and ablations on two +summarization datasets, we show that CLP learns steerable language models that +outperform and Pareto-dominate the existing approaches for multi-objective +finetuning. + +
+
+ comment: 40 pages. Findings of EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ VILA-U: a Unified Foundation Model Integrating Visual Understanding and + Generation + + +
+ VILA-U is a Unified foundation model that integrates Video, Image, Language +understanding and generation. Traditional visual language models (VLMs) use +separate modules for understanding and generating visual content, which can +lead to misalignment and increased complexity. In contrast, VILA-U employs a +single autoregressive next-token prediction framework for both tasks, +eliminating the need for additional components like diffusion models. This +approach not only simplifies the model but also achieves near state-of-the-art +performance in visual language understanding and generation. The success of +VILA-U is attributed to two main factors: the unified vision tower that aligns +discrete visual tokens with textual inputs during pretraining, which enhances +visual perception, and autoregressive image generation can achieve similar +quality as diffusion models with high-quality dataset. This allows VILA-U to +perform comparably to more complex models using a fully token-based +autoregressive framework. + +
+
+ comment: Code: https://github.com/mit-han-lab/vila-u. The first two authors + contributed equally to this work +
+
+
+
+
+ + ♻ ☆ Counter-Current Learning: A Biologically Plausible Dual Network Approach + for Deep Learning NeurIPS 2024 + + +
+ Despite its widespread use in neural networks, error backpropagation has +faced criticism for its lack of biological plausibility, suffering from issues +such as the backward locking problem and the weight transport problem. These +limitations have motivated researchers to explore more biologically plausible +learning algorithms that could potentially shed light on how biological neural +systems adapt and learn. Inspired by the counter-current exchange mechanisms +observed in biological systems, we propose counter-current learning (CCL), a +biologically plausible framework for credit assignment in neural networks. This +framework employs a feedforward network to process input data and a feedback +network to process targets, with each network enhancing the other through +anti-parallel signal propagation. By leveraging the more informative signals +from the bottom layer of the feedback network to guide the updates of the top +layer of the feedforward network and vice versa, CCL enables the simultaneous +transformation of source inputs to target outputs and the dynamic mutual +influence of these transformations. Experimental results on MNIST, +FashionMNIST, CIFAR10, and CIFAR100 datasets using multi-layer perceptrons and +convolutional neural networks demonstrate that CCL achieves comparable +performance to other biologically plausible algorithms while offering a more +biologically realistic learning mechanism. Furthermore, we showcase the +applicability of our approach to an autoencoder task, underscoring its +potential for unsupervised representation learning. Our work presents a +direction for biologically inspired and plausible learning algorithms, offering +an alternative mechanism of learning and adaptation in neural networks. + +
+
+ comment: Accepted at NeurIPS 2024. Code available at + https://github.com/IandRover/CCL-NeurIPS24 +
+
+
+
+
+ + ♻ ☆ Proof of Thought : Neurosymbolic Program Synthesis allows Robust and + Interpretable Reasoning NeurIPS + 2024 + + +
+ Large Language Models (LLMs) have revolutionized natural language processing, +yet they struggle with inconsistent reasoning, particularly in novel domains +and complex logical sequences. This research introduces Proof of Thought, a +framework that enhances the reliability and transparency of LLM outputs. Our +approach bridges LLM-generated ideas with formal logic verification, employing +a custom interpreter to convert LLM outputs into First Order Logic constructs +for theorem prover scrutiny. Central to our method is an intermediary +JSON-based Domain-Specific Language, which by design balances precise logical +structures with intuitive human concepts. This hybrid representation enables +both rigorous validation and accessible human comprehension of LLM reasoning +processes. Key contributions include a robust type system with sort management +for enhanced logical integrity, explicit representation of rules for clear +distinction between factual and inferential knowledge, and a flexible +architecture that allows for easy extension to various domain-specific +applications. We demonstrate Proof of Thought's effectiveness through +benchmarking on StrategyQA and a novel multimodal reasoning task, showing +improved performance in open-ended scenarios. By providing verifiable and +interpretable results, our technique addresses critical needs for AI system +accountability and sets a foundation for human-in-the-loop oversight in +high-stakes domains. + +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) System 2 Reasoning At Scale Workshop +
+
+
+
+
+ + ♻ ☆ AlleNoise: large-scale text classification benchmark dataset with + real-world label noise + + +
+ Label noise remains a challenge for training robust classification models. +Most methods for mitigating label noise have been benchmarked using primarily +datasets with synthetic noise. While the need for datasets with realistic noise +distribution has partially been addressed by web-scraped benchmarks such as +WebVision and Clothing1M, those benchmarks are restricted to the computer +vision domain. With the growing importance of Transformer-based models, it is +crucial to establish text classification benchmarks for learning with noisy +labels. In this paper, we present AlleNoise, a new curated text classification +benchmark dataset with real-world instance-dependent label noise, containing +over 500,000 examples across approximately 5,600 classes, complemented with a +meaningful, hierarchical taxonomy of categories. The noise distribution comes +from actual users of a major e-commerce marketplace, so it realistically +reflects the semantics of human mistakes. In addition to the noisy labels, we +provide human-verified clean labels, which help to get a deeper insight into +the noise distribution, unlike web-scraped datasets typically used in the +field. We demonstrate that a representative selection of established methods +for learning with noisy labels is inadequate to handle such real-world noise. +In addition, we show evidence that these algorithms do not alleviate excessive +memorization. As such, with AlleNoise, we set the bar high for the development +of label noise methods that can handle real-world label noise in text +classification tasks. The code and dataset are available for download at +https://github.com/allegro/AlleNoise. + +
+
+
+
+
+ + ♻ ☆ CondTSF: One-line Plugin of Dataset Condensation for Time Series + Forecasting NeurIPS 2024 + + +
+ Dataset condensation is a recently introduced technique that generates a small dataset +that can be used to train deep neural networks at lower cost. The +objective of dataset condensation is to ensure that the model trained with the +synthetic dataset can perform comparably to the model trained with the full +dataset. However, existing methods predominantly concentrate on classification +tasks, posing challenges in their adaptation to time series forecasting +(TS-forecasting). This challenge arises from disparities in the evaluation of +synthetic data. In classification, the synthetic data is considered +well-distilled if the model trained with the full dataset and the model trained +with the synthetic dataset yield identical labels for the same input, +regardless of variations in the output logit distribution. Conversely, in +TS-forecasting, the effectiveness of synthetic data distillation is determined +by the distance between predictions of the two models. The synthetic data is +deemed well-distilled only when all data points within the predictions are +similar. Consequently, TS-forecasting has a more rigorous evaluation +methodology compared to classification. To mitigate this gap, we theoretically +analyze the optimization objective of dataset condensation for TS-forecasting +and propose a new one-line plugin for dataset condensation, designated Dataset +Condensation for Time Series Forecasting (CondTSF), based on our analysis. +Plugging CondTSF into previous dataset condensation methods facilitates a +reduction in the distance between the predictions of the model trained with the +full dataset and the model trained with the synthetic dataset, thereby +enhancing performance. We conduct extensive experiments on eight commonly used +time series datasets. CondTSF consistently improves the performance of all +previous dataset condensation methods across all datasets, particularly at low +condensing ratios. + +
+
+ comment: Accepted by NeurIPS 2024, the project can be found at + https://github.com/RafaDD/CondTSF +
+
+
+
+
+ + ♻ ☆ Learning a quantum computer's capability + + +
+ Accurately predicting a quantum computer's capability -- which circuits it +can run and how well it can run them -- is a foundational goal of quantum +characterization and benchmarking. As modern quantum computers become +increasingly hard to simulate, we must develop accurate and scalable predictive +capability models to help researchers and stakeholders decide which quantum +computers to build and use. In this work, we propose a hardware-agnostic method +to efficiently construct scalable predictive models of a quantum computer's +capability for almost any class of circuits, and demonstrate our method using +convolutional neural networks (CNNs). Our CNN-based approach works by +efficiently representing a circuit as a three-dimensional tensor and then using +a CNN to predict its success rate. Our CNN capability models obtain +approximately a $1\%$ average absolute prediction error when modeling +processors experiencing both Markovian and non-Markovian stochastic Pauli +errors. We also apply our CNNs to model the capabilities of cloud-access +quantum computing systems, obtaining moderate prediction accuracy (average +absolute error around $2-5\%$), and we highlight the challenges to building +better neural network capability models. + +
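A minimal sketch of the general recipe described in the abstract: encode a circuit as a 3D tensor and regress its success rate with a small CNN. The tensor layout (gate-type channels x qubits x depth) and the network shape are illustrative assumptions, not the paper's exact architecture.

```python
# Hedged sketch: toy circuit-tensor encoding and a small CNN success-rate regressor.
import torch
import torch.nn as nn

N_QUBITS, DEPTH, N_GATE_TYPES = 5, 16, 4  # assumed dimensions

class CapabilityCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(N_GATE_TYPES, 32, kernel_size=3, padding=1), nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
        )
        self.head = nn.Sequential(nn.Flatten(), nn.Linear(32, 1), nn.Sigmoid())

    def forward(self, circuit_tensor):
        # circuit_tensor: (batch, gate_types, qubits, depth), e.g. one-hot gate encoding
        return self.head(self.features(circuit_tensor)).squeeze(-1)

model = CapabilityCNN()
circuits = torch.zeros(8, N_GATE_TYPES, N_QUBITS, DEPTH)  # placeholder circuits
success_rates = torch.rand(8)                              # placeholder targets
loss = nn.functional.mse_loss(model(circuits), success_rates)
loss.backward()
```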
+
+ comment: 20 pages, 11 figures, plus appendices +
+
+
+
+
+ + ♻ ☆ Federated Class-Incremental Learning with Hierarchical Generative + Prototypes + + +
+ Federated Learning (FL) aims at unburdening the training of deep models by +distributing computation across multiple devices (clients) while safeguarding +data privacy. On top of that, Federated Continual Learning (FCL) also accounts +for data distribution evolving over time, mirroring the dynamic nature of +real-world environments. While previous studies have identified Catastrophic +Forgetting and Client Drift as primary causes of performance degradation in +FCL, we shed light on the importance of Incremental Bias and Federated Bias, +which cause models to prioritize classes that are recently introduced or +locally predominant, respectively. Our proposal constrains both biases in the +last layer by efficiently finetuning a pre-trained backbone using learnable +prompts, resulting in clients that produce less biased representations and more +biased classifiers. Therefore, instead of solely relying on parameter +aggregation, we leverage generative prototypes to effectively balance the +predictions of the global model. Our method significantly improves the current +State Of The Art, providing an average increase of +7.8% in accuracy. Code to +reproduce the results is provided in the suppl. material. + +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Truly Grasp Mathematics? An Empirical + Exploration + + +
+ Despite their proficiency in math tasks, the mechanisms underlying LLMs' +mathematical reasoning abilities remain a subject of debate. Recent studies +suggest that chain-of-thought (CoT) prompts can bolster mathematical reasoning +by encouraging LLMs to employ human-like logical reasoning (System 2), enabling +them to excel on the Cognitive Reflection Test (CRT). To assess whether LLMs +genuinely possess System 2-like logical reasoning, we introduced targeted +modifications to CRT problems. Our findings reveal that, despite the use of CoT +prompts, mainstream LLMs, including the latest o1-preview model, continue to +exhibit a significant error rate. Further analysis indicates that they +predominantly rely on System 1-like intuitive reasoning and pattern matching +derived from training data, rather than demonstrating mastery of mathematical +thinking. This discovery challenges the prevailing notion that LLMs possess +genuine logical reasoning abilities and that CoT can enhance them. +Consequently, this work may temper overly optimistic projections regarding +LLMs' advancement toward artificial general intelligence. + +
+
+
+
+
+ + ♻ ☆ On the potential of Optimal Transport in Geospatial Data Science + + +
+ Prediction problems in geographic information science and transportation are +often motivated by the possibility to enhance operational efficiency and +thereby reduce emissions. Examples range from predicting car sharing demand for +relocation planning to forecasting traffic congestion for navigation purposes. +However, conventional accuracy metrics ignore the spatial distribution of the +errors, despite its relevance for operations. Here, we put forward a spatially +aware evaluation metric and loss function based on Optimal Transport (OT). Our +framework leverages partial OT and can minimize relocation costs in any spatial +prediction problem. We showcase the advantages of OT-based evaluation over +conventional metrics and further demonstrate the application of an OT loss +function for improving forecasts of bike sharing demand and charging station +occupancy. Thus, our framework not only aligns with operational considerations, +but also signifies a step forward in refining predictions within geospatial +applications. All code is available at https://github.com/mie-lab/geospatialOT. + +
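A hedged illustration of the spatially aware evaluation idea: compare a conventional error metric with the Earth Mover's Distance between predicted and observed demand over station locations, computed with the POT library. The station coordinates and demand values are made up, and the paper's partial-OT formulation and loss function are not reproduced here.

```python
import numpy as np
import ot  # pip install pot

coords = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])  # station locations
pred   = np.array([10.0, 0.0, 5.0, 5.0])   # predicted demand per station
true   = np.array([5.0, 5.0, 5.0, 5.0])    # observed demand per station

# Normalise to probability masses and build a ground-cost matrix (relocation cost).
a, b = pred / pred.sum(), true / true.sum()
M = ot.dist(coords, coords, metric="euclidean")

ot_cost = ot.emd2(a, b, M)                 # cost of moving predicted mass onto the truth
mae = np.abs(pred - true).mean()           # conventional metric ignores *where* errors occur
print(f"OT cost: {ot_cost:.3f}, MAE: {mae:.3f}")
```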
+
+
+
+
+ + ♻ ☆ Quantum Architecture Search with Unsupervised Representation Learning + + +
+ Unsupervised representation learning presents new opportunities for advancing +Quantum Architecture Search (QAS) on Noisy Intermediate-Scale Quantum (NISQ) +devices. QAS is designed to optimize quantum circuits for Variational Quantum +Algorithms (VQAs). Most QAS algorithms tightly couple the search space and +search algorithm, typically requiring the evaluation of numerous quantum +circuits, resulting in high computational costs and limiting scalability to +larger quantum circuits. Predictor-based QAS algorithms mitigate this issue by +estimating circuit performance based on structure or embedding. However, these +methods often demand time-intensive labeling to optimize gate parameters across +many circuits, which is crucial for training accurate predictors. Inspired by +the classical neural architecture search algorithm Arch2vec, we investigate the +potential of unsupervised representation learning for QAS without relying on +predictors. Our framework decouples unsupervised architecture representation +learning from the search process, enabling the learned representations to be +applied across various downstream tasks. Additionally, it integrates an +improved quantum circuit graph encoding scheme, addressing the limitations of +existing representations and enhancing search efficiency. This predictor-free +approach removes the need for large labeled datasets. During the search, we +employ REINFORCE and Bayesian Optimization to explore the latent representation +space and compare their performance against baseline methods. Our results +demonstrate that the framework efficiently identifies high-performing quantum +circuits with fewer search iterations. + +
+
+ comment: 9 Pages, quantum architecture search, unsupervised representation + learning +
+
+
+
+
+ + ♻ ☆ MOTIVE: A Drug-Target Interaction Graph For Inductive Link Prediction + + +
+ Drug-target interaction (DTI) prediction is crucial for identifying new +therapeutics and detecting mechanisms of action. While structure-based methods +accurately model physical interactions between a drug and its protein target, +cell-based assays such as Cell Painting can better capture complex DTI +interactions. This paper introduces MOTIVE, a Morphological cOmpound Target +Interaction Graph dataset comprising Cell Painting features for 11,000 genes +and 3,600 compounds, along with their relationships extracted from seven +publicly available databases. We provide random, cold-source (new drugs), and +cold-target (new genes) data splits to enable rigorous evaluation under +realistic use cases. Our benchmark results show that graph neural networks that +use Cell Painting features consistently outperform those that learn from graph +structure alone, feature-based models, and topological heuristics. MOTIVE +accelerates both graph ML research and drug discovery by promoting the +development of more reliable DTI prediction models. MOTIVE resources are +available at https://github.com/carpenter-singh-lab/motive. + +
+
+
+
+
+ + ♻ ☆ Bounded KRnet and its applications to density estimation and + approximation + + +
+ In this paper, we develop an invertible mapping, called B-KRnet, on a bounded +domain and apply it to density estimation/approximation for data or the +solutions of PDEs such as the Fokker-Planck equation and the Keller-Segel +equation. Similar to KRnet, the structure of B-KRnet adapts the +pseudo-triangular structure into a normalizing flow model. The main difference +between B-KRnet and KRnet is that B-KRnet is defined on a hypercube while KRnet +is defined on the whole space, in other words, a new mechanism is introduced in +B-KRnet to maintain the exact invertibility. Using B-KRnet as a transport map, +we obtain an explicit probability density function (PDF) model that corresponds +to the pushforward of a prior (uniform) distribution on the hypercube. It can +be directly applied to density estimation when only data are available. By +coupling KRnet and B-KRnet, we define a deep generative model on a +high-dimensional domain where some dimensions are bounded and other dimensions +are unbounded. A typical case is the solution of the stationary kinetic +Fokker-Planck equation, which is a PDF of position and momentum. Based on +B-KRnet, we develop an adaptive learning approach to approximate partial +differential equations whose solutions are PDFs or can be treated as PDFs. A +variety of numerical experiments is presented to demonstrate the effectiveness +of B-KRnet. + +
+
+ comment: 26 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Linear Adversarial Concept Erasure ICML 2022 + + +
+ Modern neural models trained on textual data rely on pre-trained
+representations that emerge without direct supervision. As these
+representations are increasingly being used in real-world applications, the
+inability to control their content becomes an increasingly important
+problem. We formulate the problem of identifying and erasing a linear subspace
+that corresponds to a given concept, in order to prevent linear predictors from
+recovering the concept. We model this problem as a constrained, linear maximin
+game, and show that existing solutions are generally not optimal for this task.
+We derive a closed-form solution for certain objectives, and propose a convex
+relaxation that works well for others. When evaluated in the context
+of binary gender removal, the method recovers a low-dimensional subspace whose
+removal mitigates bias, as measured by both intrinsic and extrinsic evaluation.
+We show that the method is highly expressive, effectively mitigating bias in
+deep nonlinear classifiers while maintaining tractability and interpretability.
+
&#13;
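A minimal sketch of the underlying idea: remove a concept direction from representations with an orthogonal projection. This is a single-direction nullspace projection in the spirit of the prior work the paper builds on, not the paper's closed-form maximin solution or its convex relaxation.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 64))                     # representations
z = (X[:, 0] + 0.1 * rng.normal(size=1000) > 0)     # binary concept label (toy data)

probe = LogisticRegression(max_iter=1000).fit(X, z)
w = probe.coef_ / np.linalg.norm(probe.coef_)       # concept direction, shape (1, 64)

P = np.eye(X.shape[1]) - w.T @ w                    # projection onto the nullspace of w
X_erased = X @ P

# A fresh linear probe on the erased representations should lose most accuracy.
acc = LogisticRegression(max_iter=1000).fit(X_erased, z).score(X_erased, z)
print(f"probe accuracy after erasure: {acc:.2f}")
```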
+
+ comment: Accepted in ICML 2022; a revised version +
+
+
+
+
+ + ♻ ☆ Accessible, At-Home Detection of Parkinson's Disease via Multi-task + Video Analysis + + +
+ Limited accessibility to neurological care leads to underdiagnosed
+Parkinson's Disease (PD), preventing early intervention. Existing AI-based PD
+detection methods primarily focus on unimodal analysis of motor or speech
+tasks, overlooking the multifaceted nature of the disease. To address this, we
+introduce a large-scale, multi-task video dataset consisting of 1102 sessions
+(each containing videos of finger tapping, facial expression, and speech tasks
+captured via webcam) from 845 participants (272 with PD). We propose a novel
+Uncertainty-calibrated Fusion Network (UFNet) that leverages this multimodal
+data to enhance diagnostic accuracy. UFNet employs independent task-specific
+networks, trained with Monte Carlo Dropout for uncertainty quantification,
+followed by self-attended fusion of features, with attention weights
+dynamically adjusted based on task-specific uncertainties. To ensure
+patient-centered evaluation, the participants were randomly split into three
+sets: 60% for training, 20% for model selection, and 20% for final performance
+evaluation. UFNet significantly outperformed single-task models in terms of
+accuracy, area under the ROC curve (AUROC), and sensitivity while maintaining
+non-inferior specificity. Withholding uncertain predictions further boosted the
+performance, achieving 88.0+-0.3% accuracy, 93.0+-0.2% AUROC, 79.3+-0.9%
+sensitivity, and 92.6+-0.3% specificity, at the expense of not being able to
+predict for 2.3+-0.3% of the data (+- denotes 95% confidence interval). Further
+analysis suggests that the trained model does not exhibit any detectable bias
+across sex and ethnic subgroups and is most effective for individuals aged
+between 50 and 80. Requiring only a webcam and microphone, our approach
+facilitates accessible home-based PD screening, especially in regions with
+limited healthcare resources.
+
&#13;
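A hedged sketch of two ingredients mentioned above, Monte Carlo Dropout uncertainty estimation and selective prediction: keep dropout active at test time, average several stochastic passes, and withhold predictions whose spread exceeds a threshold. The network, threshold, and task head are placeholders, not the paper's UFNet.

```python
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Dropout(0.3), nn.Linear(64, 1))

def mc_dropout_predict(model, x, n_samples=30):
    model.train()  # keep dropout stochastic at inference time
    with torch.no_grad():
        probs = torch.stack([torch.sigmoid(model(x)) for _ in range(n_samples)])
    return probs.mean(dim=0), probs.std(dim=0)

x = torch.randn(16, 32)                      # placeholder task features
mean_p, std_p = mc_dropout_predict(net, x)
confident = std_p.squeeze(-1) < 0.15         # withhold uncertain predictions
preds = (mean_p.squeeze(-1) > 0.5)[confident]
print(f"kept {confident.sum().item()}/{len(x)} predictions")
```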
+
+
+
+
+ + ♻ ☆ Certifiably Robust Policies for Uncertain Parametric Environments + + +
+ We present a data-driven approach for producing policies that are provably
+robust across unknown stochastic environments. Existing approaches can learn
+models of a single environment as an interval Markov decision process (IMDP)
+and produce a robust policy with a probably approximately correct (PAC)
+guarantee on its performance. However, these are unable to reason about the
+impact of environmental parameters underlying the uncertainty. We propose a
+framework based on parametric Markov decision processes (MDPs) with unknown
+distributions over parameters. We learn and analyse IMDPs for a set of unknown
+sample environments induced by parameters. The key challenge is then to produce
+meaningful performance guarantees that combine the two layers of uncertainty:
+(1) multiple environments induced by parameters with an unknown distribution;
+(2) unknown induced environments which are approximated by IMDPs. We present a
+novel approach based on scenario optimisation that yields a single PAC
+guarantee quantifying the risk level for which a specified performance level
+can be assured in unseen environments, plus a means to trade off risk and
+performance. We implement and evaluate our framework using multiple robust
+policy generation methods on a range of benchmarks. We show that our approach
+produces tight bounds on a policy's performance with high confidence.
+
&#13;
+
+
+
+
+ + ♻ ☆ On provable privacy vulnerabilities of graph representations + + +
+ Graph representation learning (GRL) is critical for extracting insights from
+complex network structures, but it also raises security concerns due to
+potential privacy vulnerabilities in these representations. This paper
+investigates the structural vulnerabilities in graph neural models where
+sensitive topological information can be inferred through edge reconstruction
+attacks. Our research primarily addresses the theoretical underpinnings of
+similarity-based edge reconstruction attacks (SERA), furnishing a
+non-asymptotic analysis of their reconstruction capacities. Moreover, we
+present empirical corroboration indicating that such attacks can perfectly
+reconstruct sparse graphs as graph size increases. Conversely, we establish
+that sparsity is a critical factor for SERA's effectiveness, as demonstrated
+through analysis and experiments on (dense) stochastic block models. Finally,
+we explore the resilience of private graph representations produced via the
+noisy aggregation (NAG) mechanism against SERA. Through theoretical analysis
+and empirical assessments, we affirm the mitigation of SERA using NAG. In
+parallel, we also empirically delineate instances wherein SERA demonstrates
+both efficacy and deficiency in its capacity to function as an instrument for
+elucidating the trade-off between privacy and utility.
+
&#13;
+
+
+
+
+ + ♻ ☆ Adaptive Variance Reduction for Stochastic Optimization under Weaker + Assumptions + + +
+ This paper explores adaptive variance reduction methods for stochastic +optimization based on the STORM technique. Existing adaptive extensions of +STORM rely on strong assumptions like bounded gradients and bounded function +values, or suffer an additional $\mathcal{O}(\log T)$ term in the convergence +rate. To address these limitations, we introduce a novel adaptive STORM method +that achieves an optimal convergence rate of $\mathcal{O}(T^{-1/3})$ for +non-convex functions with our newly designed learning rate strategy. Compared +with existing approaches, our method requires weaker assumptions and attains +the optimal convergence rate without the additional $\mathcal{O}(\log T)$ term. +We also extend the proposed technique to stochastic compositional optimization, +obtaining the same optimal rate of $\mathcal{O}(T^{-1/3})$. Furthermore, we +investigate the non-convex finite-sum problem and develop another innovative +adaptive variance reduction method that achieves an optimal convergence rate of +$\mathcal{O}(n^{1/4} T^{-1/2} )$, where $n$ represents the number of component +functions. Numerical experiments across various tasks validate the +effectiveness of our method. + +
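A hedged sketch of the STORM-style recursive variance-reduced gradient estimator that this line of work builds on: d_t = g(x_t; xi_t) + (1 - a_t)(d_{t-1} - g(x_{t-1}; xi_t)), where the same sample xi_t is evaluated at two consecutive iterates. The momentum and step-size schedules below are illustrative; the paper's adaptive learning-rate strategy is not reproduced.

```python
import numpy as np

rng = np.random.default_rng(0)

def stoch_grad(x, xi):
    # toy noisy gradient of f(x) = 0.5 * ||x||^2
    return x + xi

x = rng.normal(size=10)
d, x_prev = None, None
for t in range(1, 200):
    xi = 0.1 * rng.normal(size=10)          # fresh sample, reused at x and x_prev
    a_t = min(1.0, 1.0 / t ** (2 / 3))      # illustrative momentum schedule
    g_curr = stoch_grad(x, xi)
    if d is None:
        d = g_curr
    else:
        d = g_curr + (1 - a_t) * (d - stoch_grad(x_prev, xi))
    x_prev = x.copy()
    eta = 0.1 / t ** (1 / 3)                 # illustrative step size
    x = x - eta * d
print("final ||x||:", np.linalg.norm(x))
```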
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2406.00489 +
+
+
+
+
+ + ♻ ☆ Posterior Sampling-based Online Learning for Episodic POMDPs + + +
+ Learning in POMDPs is known to be significantly harder than in MDPs. In this +paper, we consider the online learning problem for episodic POMDPs with unknown +transition and observation models. We propose a Posterior Sampling-based +reinforcement learning algorithm for POMDPs (PS4POMDPs), which is much simpler +and more implementable compared to state-of-the-art optimism-based online +learning algorithms for POMDPs. We show that the Bayesian regret of the +proposed algorithm scales as the square root of the number of episodes and is +polynomial in the other parameters. In a general setting, the regret scales +exponentially in the horizon length $H$, and we show that this is inevitable by +providing a lower bound. However, when the POMDP is undercomplete and weakly +revealing (a common assumption in the recent literature), we establish a +polynomial Bayesian regret bound. We finally propose a posterior sampling +algorithm for multi-agent POMDPs, and show it too has sublinear regret. + +
+
+ comment: 41 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Efficient Sign-Based Optimization: Accelerating Convergence via Variance + Reduction + + +
+ Sign stochastic gradient descent (signSGD) is a communication-efficient +method that transmits only the sign of stochastic gradients for parameter +updating. Existing literature has demonstrated that signSGD can achieve a +convergence rate of $\mathcal{O}(d^{1/2}T^{-1/4})$, where $d$ represents the +dimension and $T$ is the iteration number. In this paper, we improve this +convergence rate to $\mathcal{O}(d^{1/2}T^{-1/3})$ by introducing the +Sign-based Stochastic Variance Reduction (SSVR) method, which employs variance +reduction estimators to track gradients and leverages their signs to update. +For finite-sum problems, our method can be further enhanced to achieve a +convergence rate of $\mathcal{O}(m^{1/4}d^{1/2}T^{-1/2})$, where $m$ denotes +the number of component functions. Furthermore, we investigate the +heterogeneous majority vote in distributed settings and introduce two novel +algorithms that attain improved convergence rates of +$\mathcal{O}(d^{1/2}T^{-1/2} + dn^{-1/2})$ and $\mathcal{O}(d^{1/4}T^{-1/4})$ +respectively, outperforming the previous results of $\mathcal{O}(dT^{-1/4} + +dn^{-1/2})$ and $\mathcal{O}(d^{3/8}T^{-1/8})$, where $n$ represents the number +of nodes. Numerical experiments across different tasks validate the +effectiveness of our proposed methods. + +
+
+
+
+
+ + ♻ ☆ Anomaly Prediction: A Novel Approach with Explicit Delay and Horizon + + +
+ Anomaly detection in time series data is a critical challenge across various +domains. Traditional methods typically focus on identifying anomalies in +immediate subsequent steps, often underestimating the significance of temporal +dynamics such as delay time and horizons of anomalies, which generally require +extensive post-analysis. This paper introduces a novel approach for time series +anomaly prediction, incorporating temporal information directly into the +prediction results. We propose a new dataset specifically designed to evaluate +this approach and conduct comprehensive experiments using several +state-of-the-art methods. Our results demonstrate the efficacy of our approach +in providing timely and accurate anomaly predictions, setting a new benchmark +for future research in this field. + +
+
+
+
+
+ + ♻ ☆ Acquiring Better Load Estimates by Combining Anomaly and Change Point + Detection in Power Grid Time-series Measurements + + +
+ In this paper we present novel methodology for automatic anomaly and switch +event filtering to improve load estimation in power grid systems. By leveraging +unsupervised methods with supervised optimization, our approach prioritizes +interpretability while ensuring robust and generalizable performance on unseen +data. Through experimentation, a combination of binary segmentation for change +point detection and statistical process control for anomaly detection emerges +as the most effective strategy, specifically when ensembled in a novel +sequential manner. Results indicate the clear wasted potential when filtering +is not applied. The automatic load estimation is also fairly accurate, with +approximately 90% of estimates falling within a 10% error margin, with only a +single significant failure in both the minimum and maximum load estimates +across 60 measurements in the test set. Our methodology's interpretability +makes it particularly suitable for critical infrastructure planning, thereby +enhancing decision-making processes. + +
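A minimal sketch of the two ingredients described above: binary segmentation for change (switch-event) points via the ruptures library, followed by a simple 3-sigma statistical-process-control rule within each segment. The toy signal, the number of breakpoints, and the thresholds are illustrative, not the paper's tuned pipeline.

```python
import numpy as np
import ruptures as rpt  # pip install ruptures

rng = np.random.default_rng(0)
load = np.concatenate([rng.normal(10, 1, 200), rng.normal(25, 1, 200)])
load[310] = 60.0  # injected anomaly

# 1) Change point detection (switch events) with binary segmentation.
breakpoints = rpt.Binseg(model="l2").fit(load).predict(n_bkps=1)

# 2) Per-segment anomaly filtering with 3-sigma control limits.
clean = load.copy()
start = 0
for end in breakpoints:
    seg = load[start:end]
    mu, sigma = seg.mean(), seg.std()
    outliers = np.abs(seg - mu) > 3 * sigma
    clean[start:end][outliers] = np.nan     # filtered before load estimation
    start = end

print("detected switch events at:", breakpoints[:-1])
print("filtered samples:", int(np.isnan(clean).sum()))
```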
+
+ comment: All code can be found at: https://github.com/RoelBouman/StormPhase2 +
+
+
+
+
+ + ♻ ☆ A spring-block theory of feature learning in deep neural networks + + +
+ Feature-learning deep nets progressively collapse data to a regular +low-dimensional geometry. How this phenomenon emerges from collective action of +nonlinearity, noise, learning rate, and other choices that shape the dynamics, +has eluded first-principles theories built from microscopic neuronal dynamics. +We exhibit a noise-nonlinearity phase diagram that identifies regimes where +shallow or deep layers learn more effectively. We then propose a macroscopic +mechanical theory that reproduces the diagram, explaining why some DNNs are +lazy and some active, and linking feature learning across layers to +generalization. + +
+
+
+
+
+ + ♻ ☆ CPE-Pro: A Structure-Sensitive Deep Learning Method for Protein + Representation and Origin Evaluation + + +
+ Protein structures are important for understanding their functions and
+interactions. Currently, many protein structure prediction methods are
+enriching the structure database. Discriminating the origin of structures is
+crucial for distinguishing between experimentally resolved and computationally
+predicted structures, evaluating the reliability of prediction methods, and
+guiding downstream biological studies. Building on prior work in structure
+prediction, we developed a structure-sensitive supervised deep learning model,
+Crystal vs Predicted Evaluator for Protein Structure (CPE-Pro), to represent
+and discriminate the origin of protein structures. CPE-Pro learns the
+structural information of proteins and captures inter-structural differences to
+achieve accurate traceability on four data classes, and is expected to be
+extended to more. Simultaneously, we utilized Foldseek to encode protein
+structures into "structure-sequences" and trained a protein Structural Sequence
+Language Model, SSLM. Preliminary experiments demonstrated that, compared to
+large-scale protein language models pre-trained on vast amounts of amino acid
+sequences, the "structure-sequence" enables the language model to learn more
+informative protein features, enhancing and optimizing structural
+representations. We have provided the code, model weights, and all related
+materials at https://github.com/GouWenrui/CPE-Pro-main.git.
+
&#13;
+
+
+
+
+ + ♻ ☆ Towards Croppable Implicit Neural Representations NeurIPS 2024 + + +
+ Implicit Neural Representations (INRs) have piqued interest in recent years
+due to their ability to encode natural signals using neural networks. While
+INRs allow for useful applications such as interpolating new coordinates and
+signal compression, their black-box nature makes it difficult to modify them
+post-training. In this paper we explore the idea of editable INRs, and
+specifically focus on the widely used cropping operation. To this end, we
+present Local-Global SIRENs -- a novel INR architecture that supports cropping
+by design. Local-Global SIRENs are based on combining local and global feature
+extraction for signal encoding. What makes their design unique is the ability
+to effortlessly remove specific portions of an encoded signal, with a
+proportional weight decrease. This is achieved by eliminating the corresponding
+weights from the network, without the need for retraining. We further show how
+this architecture can be used to support the straightforward extension of
+previously encoded signals. Beyond signal editing, we examine how the
+Local-Global approach can accelerate training, enhance encoding of various
+signals, improve downstream performance, and be applied to modern INRs such as
+INCODE, highlighting its potential and flexibility. Code is available at
+https://github.com/maorash/Local-Global-INRs.
+
&#13;
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Breaking Class Barriers: Efficient Dataset Distillation via Inter-Class + Feature Compensator + + +
+ Dataset distillation has emerged as a technique aiming to condense +informative features from large, natural datasets into a compact and synthetic +form. While recent advancements have refined this technique, its performance is +bottlenecked by the prevailing class-specific synthesis paradigm. Under this +paradigm, synthetic data is optimized exclusively for a pre-assigned one-hot +label, creating an implicit class barrier in feature condensation. This leads +to inefficient utilization of the distillation budget and oversight of +inter-class feature distributions, which ultimately limits the effectiveness +and efficiency, as demonstrated in our analysis. To overcome these constraints, +this paper presents the Inter-class Feature Compensator (INFER), an innovative +distillation approach that transcends the class-specific data-label framework +widely utilized in current dataset distillation methods. Specifically, INFER +leverages a Universal Feature Compensator (UFC) to enhance feature integration +across classes, enabling the generation of multiple additional synthetic +instances from a single UFC input. This significantly improves the efficiency +of the distillation budget. Moreover, INFER enriches inter-class interactions +during the distillation, thereby enhancing the effectiveness and +generalizability of the distilled data. By allowing for the linear +interpolation of labels similar to those in the original dataset, INFER +meticulously optimizes the synthetic data and dramatically reduces the size of +soft labels in the synthetic dataset to almost zero, establishing a new +benchmark for efficiency and effectiveness in dataset distillation. + +
+
+
+
+
+ + ♻ ☆ I've Got 99 Problems But FLOPS Ain't One + + +
+ Hyperscalers dominate the landscape of large network deployments, yet they +rarely share data or insights about the challenges they face. In light of this +supremacy, what problems can we find to solve in this space? We take an +unconventional approach to find relevant research directions, starting from +public plans to build a $100 billion datacenter for machine learning +applications. Leveraging the language models scaling laws, we discover what +workloads such a datacenter might carry and explore the challenges one may +encounter in doing so, with a focus on networking research. We conclude that +building the datacenter and training such models is technically possible, but +this requires novel wide-area transports for inter-DC communication, a +multipath transport and novel datacenter topologies for intra-datacenter +communication, high speed scale-up networks and transports, outlining a rich +research agenda for the networking community. + +
+
+
+
+
+ + ♻ ☆ TargetCall: Eliminating the Wasted Computation in Basecalling via + Pre-Basecalling Filtering + + +
+ Basecalling is an essential step in nanopore sequencing analysis where the
+raw signals of nanopore sequencers are converted into nucleotide sequences,
+i.e., reads. State-of-the-art basecallers employ complex deep learning models
+to achieve high basecalling accuracy. This makes basecalling computationally
+inefficient and memory-hungry, bottlenecking the entire genome analysis
+pipeline. However, for many applications, the majority of reads do not match
+the reference genome of interest (i.e., target reference) and thus are
+discarded in later steps in the genomics pipeline, wasting the basecalling
+computation. To overcome this issue, we propose TargetCall, the first
+pre-basecalling filter to eliminate the wasted computation in basecalling.
+TargetCall's key idea is to discard reads that will not match the target
+reference (i.e., off-target reads) prior to basecalling. TargetCall consists of
+two main components: (1) LightCall, a lightweight neural network basecaller
+that produces noisy reads; and (2) Similarity Check, which labels each of these
+noisy reads as on-target or off-target by matching them to the target
+reference. Our thorough experimental evaluations show that TargetCall 1)
+improves the end-to-end basecalling runtime performance of the state-of-the-art
+basecaller by 3.31x while maintaining high (98.88%) recall in keeping on-target
+reads, 2) maintains high accuracy in downstream analysis, and 3) achieves
+better runtime performance, throughput, recall, precision, and generality
+compared to prior works. TargetCall is available at
+https://github.com/CMU-SAFARI/TargetCall.
+
&#13;
+
+
+
+
+ + ♻ ☆ Interpreting Context Look-ups in Transformers: Investigating + Attention-MLP Interactions EMNLP 2024 + + +
+ Understanding the inner workings of large language models (LLMs) is crucial +for advancing their theoretical foundations and real-world applications. While +the attention mechanism and multi-layer perceptrons (MLPs) have been studied +independently, their interactions remain largely unexplored. This study +investigates how attention heads and next-token neurons interact in LLMs to +predict new words. We propose a methodology to identify next-token neurons, +find prompts that highly activate them, and determine the upstream attention +heads responsible. We then generate and evaluate explanations for the activity +of these attention heads in an automated manner. Our findings reveal that some +attention heads recognize specific contexts relevant to predicting a token and +activate a downstream token-predicting neuron accordingly. This mechanism +provides a deeper understanding of how attention heads work with MLP neurons to +perform next-token prediction. Our approach offers a foundation for further +research into the intricate workings of LLMs and their impact on text +generation and understanding. + +
+
+ comment: Accepted to EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Ornstein-Uhlenbeck Adaptation as a Mechanism for Learning in Brains and + Machines + + +
+ Learning is a fundamental property of intelligent systems, observed across
+biological organisms and engineered systems. While modern intelligent systems
+typically rely on gradient descent for learning, the need for exact gradients
+and complex information flow makes its implementation in biological and
+neuromorphic systems challenging. This has motivated the exploration of
+alternative learning mechanisms that can operate locally and do not rely on
+exact gradients. In this work, we introduce a novel approach that leverages
+noise in the parameters of the system and global reinforcement signals. Using
+an Ornstein-Uhlenbeck process with adaptive dynamics, our method balances
+exploration and exploitation during learning, driven by deviations from error
+predictions, akin to reward prediction error. Operating in continuous time,
+Ornstein-Uhlenbeck adaptation (OUA) is proposed as a general mechanism for
+learning in dynamic, time-evolving environments. We validate our approach
+across diverse tasks, including supervised learning and reinforcement learning
+in feedforward and recurrent systems. Additionally, we demonstrate that it can
+perform meta-learning, adjusting hyper-parameters autonomously. Our results
+indicate that OUA provides a viable alternative to traditional gradient-based
+methods, with potential applications in neuromorphic computing. It also hints
+at a possible mechanism for noise-driven learning in the brain, where
+stochastic neurotransmitter release may guide synaptic adjustments.
+
&#13;
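A rough, hedged sketch of the general recipe described above: perturb parameters with Ornstein-Uhlenbeck noise and reinforce perturbations whose reward beats a slowly updated reward prediction (a reward-prediction-error signal). The coefficients and the quadratic toy task are illustrative, not the paper's OUA formulation.

```python
import numpy as np

rng = np.random.default_rng(0)
target = np.array([1.0, -2.0, 0.5])

def reward(theta):
    return -np.sum((theta - target) ** 2)   # toy task: match the target vector

theta = np.zeros(3)
noise = np.zeros(3)
reward_pred = reward(theta)
dt, tau, sigma, lr = 0.1, 1.0, 0.3, 0.5

for step in range(2000):
    # OU noise: mean-reverting exploration around the current parameters.
    noise += (-noise / tau) * dt + sigma * np.sqrt(dt) * rng.normal(size=3)
    r = reward(theta + noise)
    rpe = r - reward_pred                    # reward prediction error
    theta += lr * dt * rpe * noise           # reinforce helpful perturbations
    reward_pred += dt * (r - reward_pred)    # slow running estimate of reward

print("learned:", np.round(theta, 2), "target:", target)
```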
+
+
+
+
+ + ♻ ☆ FOOGD: Federated Collaboration for Both Out-of-distribution + Generalization and Detection NeurIPS 2024 + + +
+ Federated learning (FL) is a promising machine learning paradigm that
+collaborates with client models to capture global knowledge. However, deploying
+FL models in real-world scenarios remains unreliable due to the coexistence of
+in-distribution data and unexpected out-of-distribution (OOD) data, such as
+covariate-shift and semantic-shift data. Current FL research typically
+addresses either covariate-shift data through OOD generalization or
+semantic-shift data via OOD detection, overlooking the simultaneous occurrence
+of various OOD shifts. In this work, we propose FOOGD, a method that estimates
+the probability density of each client and obtains a reliable global
+distribution as guidance for the subsequent FL process. Firstly, SM3D in FOOGD
+estimates a score model for arbitrary distributions without prior constraints,
+and effectively detects semantic-shift data. Then SAG in FOOGD provides
+invariant yet diverse knowledge for both local covariate-shift generalization
+and client performance generalization. In empirical validations, FOOGD offers
+three main advantages: (1) reliably estimating non-normalized decentralized
+distributions, (2) detecting semantic shift data via score values, and (3)
+generalizing to covariate-shift data by regularizing the feature extractor. The
+project is available at https://github.com/XeniaLLL/FOOGD-main.git.
+
&#13;
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Few-Shot Adversarial Prompt Learning on Vision-Language Models NeurIPS 2024 + + +
+ The vulnerability of deep neural networks to imperceptible adversarial
+perturbations has attracted widespread attention. Inspired by the success of
+vision-language foundation models, previous efforts achieved zero-shot
+adversarial robustness by aligning adversarial visual features with text
+supervision. However, in practice, they are still unsatisfactory due to several
+issues, including heavy adaptation cost, suboptimal text supervision, and
+uncontrolled natural generalization capacity. In this paper, to address these
+issues, we propose a few-shot adversarial prompt framework where adapting input
+sequences with limited data yields significant adversarial robustness
+improvements. Specifically, we achieve this by providing adversarially
+correlated text supervision that is end-to-end learned from adversarial
+examples. We also propose a novel training objective that enhances the
+consistency of multi-modal features while encouraging differentiated uni-modal
+features between natural and adversarial examples. The proposed framework
+enables learning adversarial text supervision, which provides superior
+cross-modal adversarial alignment and matches state-of-the-art zero-shot
+adversarial robustness with only 1% training data. Code is available at:
+https://github.com/lionel-w2/FAP.
+
&#13;
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Optimal Design for Reward Modeling in RLHF + + +
+ Reinforcement Learning from Human Feedback (RLHF) has become a popular
+approach to align language models (LMs) with human preferences. This method
+involves collecting a large dataset of human pairwise preferences across
+various text generations and using it to infer (implicitly or explicitly) a
+reward model. Numerous methods have been proposed to learn the reward model and
+align an LM with it. However, the costly process of collecting human
+preferences has received little attention and could benefit from theoretical
+insights. This paper addresses this issue and aims to formalize the reward
+training model in RLHF. We frame the selection of an effective dataset as a
+simple regret minimization task, using a linear contextual dueling bandit
+method. Given the potentially large number of arms, this approach is more
+coherent than the best-arm identification setting. We then propose an offline
+framework for solving this problem. Under appropriate assumptions - linearity
+of the reward model in the embedding space, and boundedness of the reward
+parameter - we derive bounds on the simple regret. Finally, we provide a lower
+bound that matches our upper bound up to constant and logarithmic terms. To our
+knowledge, this is the first theoretical contribution in this area to provide
+an offline approach as well as worst-case guarantees.
+
&#13;
+
+
+
+
+ + ♻ ☆ Reducing Variance in Meta-Learning via Laplace Approximation for + Regression Tasks + + +
+ Given a finite set of sample points, meta-learning algorithms aim to learn an +optimal adaptation strategy for new, unseen tasks. Often, this data can be +ambiguous as it might belong to different tasks concurrently. This is +particularly the case in meta-regression tasks. In such cases, the estimated +adaptation strategy is subject to high variance due to the limited amount of +support data for each task, which often leads to sub-optimal generalization +performance. In this work, we address the problem of variance reduction in +gradient-based meta-learning and formalize the class of problems prone to this, +a condition we refer to as \emph{task overlap}. Specifically, we propose a +novel approach that reduces the variance of the gradient estimate by weighing +each support point individually by the variance of its posterior over the +parameters. To estimate the posterior, we utilize the Laplace approximation, +which allows us to express the variance in terms of the curvature of the loss +landscape of our meta-learner. Experimental results demonstrate the +effectiveness of the proposed method and highlight the importance of variance +reduction in meta-learning. + +
+
+
+
+
+ + ♻ ☆ Conquering the Communication Constraints to Enable Large Pre-Trained + Models in Federated Learning + + +
+ Federated learning (FL) has emerged as a promising paradigm for enabling the +collaborative training of models without centralized access to the raw data on +local devices. In the typical FL paradigm (e.g., FedAvg), model weights are +sent to and from the server each round to participating clients. Recently, the +use of small pre-trained models has been shown effective in federated learning +optimization and improving convergence. However, recent state-of-the-art +pre-trained models are getting more capable but also have more parameters. In +conventional FL, sharing the enormous model weights can quickly put a massive +communication burden on the system, especially if more capable models are +employed. Can we find a solution to enable those strong and readily-available +pre-trained models in FL to achieve excellent performance while simultaneously +reducing the communication burden? To this end, we investigate the use of +parameter-efficient fine-tuning in federated learning and thus introduce a new +framework: FedPEFT. Specifically, we systemically evaluate the performance of +FedPEFT across a variety of client stability, data distribution, and +differential privacy settings. By only locally tuning and globally sharing a +small portion of the model weights, significant reductions in the total +communication overhead can be achieved while maintaining competitive or even +better performance in a wide range of federated learning scenarios, providing +insight into a new paradigm for practical and effective federated systems. + +
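A hedged sketch of the parameter-efficient federated fine-tuning idea: clients freeze a shared pre-trained backbone, locally fine-tune only a small set of parameters (here, a tiny adapter head), and the server averages just those parameters each round. The model, adapter, and data below are placeholders, not the paper's FedPEFT configuration.

```python
import copy
import torch
import torch.nn as nn

backbone = nn.Sequential(nn.Linear(32, 64), nn.ReLU())    # stands in for a pre-trained model
for p in backbone.parameters():
    p.requires_grad = False                                # frozen, never communicated

def make_adapter():
    return nn.Linear(64, 2)                                # the only trainable/shared part

def local_train(adapter, x, y, steps=5):
    opt = torch.optim.SGD(adapter.parameters(), lr=0.1)
    for _ in range(steps):
        loss = nn.functional.cross_entropy(adapter(backbone(x)), y)
        opt.zero_grad(); loss.backward(); opt.step()
    return adapter.state_dict()

global_adapter = make_adapter()
for rnd in range(3):
    client_states = []
    for _ in range(4):                                     # four clients per round
        x, y = torch.randn(16, 32), torch.randint(0, 2, (16,))
        adapter = make_adapter()
        adapter.load_state_dict(copy.deepcopy(global_adapter.state_dict()))
        client_states.append(local_train(adapter, x, y))
    # FedAvg over the adapter weights only -- a tiny fraction of the full model.
    avg = {k: torch.stack([s[k] for s in client_states]).mean(0) for k in client_states[0]}
    global_adapter.load_state_dict(avg)
```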
+
+
+
+
+ + ♻ ☆ Generative AI Models for Different Steps in Architectural Design: A + Literature Review + + +
+ Recent advances in generative artificial intelligence (AI) technologies have +been significantly driven by models such as generative adversarial networks +(GANs), variational autoencoders (VAEs), and denoising diffusion probabilistic +models (DDPMs). Although architects recognize the potential of generative AI in +design, personal barriers often restrict their access to the latest +technological developments, thereby causing the application of generative AI in +architectural design to lag behind. Therefore, it is essential to comprehend +the principles and advancements of generative AI models and analyze their +relevance in architecture applications. This paper first provides an overview +of generative AI technologies, with a focus on probabilistic diffusion models +(DDPMs), 3D generative models, and foundation models, highlighting their recent +developments and main application scenarios. Then, the paper explains how the +abovementioned models could be utilized in architecture. We subdivide the +architectural design process into six steps and review related research +projects in each step from 2020 to the present. Lastly, this paper discusses +potential future directions for applying generative AI in the architectural +design steps. This research can help architects quickly understand the +development and latest progress of generative AI and contribute to the further +development of intelligent architecture. + +
+
+ comment: 34 pages, 14 figures, accepted by Frontiers of Architectural Research +
+
+
+
+
+ + ♻ ☆ Simplifying Deep Temporal Difference Learning + + +
+ Q-learning played a foundational role in the field of reinforcement learning
+(RL). However, TD algorithms with off-policy data, such as Q-learning, or with
+nonlinear function approximation like deep neural networks require several
+additional tricks to stabilise training, primarily a replay buffer and target
+networks. Unfortunately, the delayed updating of frozen network parameters in
+the target network harms the sample efficiency and, similarly, the replay
+buffer introduces memory and implementation overheads. In this paper, we
+investigate whether it is possible to accelerate and simplify TD training while
+maintaining its stability. Our key theoretical result demonstrates for the
+first time that regularisation techniques such as LayerNorm can yield provably
+convergent TD algorithms without the need for a target network, even with
+off-policy data. Empirically, we find that online, parallelised sampling
+enabled by vectorised environments stabilises training without the need for a
+replay buffer. Motivated by these findings, we propose PQN, our simplified deep
+online Q-learning algorithm. Surprisingly, this simple algorithm is competitive
+with more complex methods like Rainbow in Atari, R2D2 in Hanabi, QMix in Smax,
+and PPO-RNN in Craftax, and can be up to 50x faster than traditional DQN
+without sacrificing sample efficiency. In an era where PPO has become the go-to
+RL algorithm, PQN reestablishes Q-learning as a viable alternative.
+
&#13;
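A minimal sketch of the simplified TD recipe described above: a Q-network with LayerNorm, no target network and no replay buffer, trained online on a batch of transitions from parallel environments. The shapes and hyperparameters are illustrative and the toy transitions are random placeholders, not a real vectorised environment.

```python
import torch
import torch.nn as nn

obs_dim, n_actions, n_envs, gamma = 8, 4, 32, 0.99

q_net = nn.Sequential(
    nn.Linear(obs_dim, 128), nn.LayerNorm(128), nn.ReLU(),
    nn.Linear(128, 128), nn.LayerNorm(128), nn.ReLU(),
    nn.Linear(128, n_actions),
)
opt = torch.optim.Adam(q_net.parameters(), lr=3e-4)

for step in range(100):
    # One synchronous step of n_envs parallel environments (placeholder data).
    obs = torch.randn(n_envs, obs_dim)
    actions = torch.randint(0, n_actions, (n_envs,))
    rewards = torch.randn(n_envs)
    next_obs = torch.randn(n_envs, obs_dim)
    dones = torch.zeros(n_envs)

    with torch.no_grad():  # bootstrap from the *same* online network, no target copy
        target = rewards + gamma * (1 - dones) * q_net(next_obs).max(dim=1).values
    q_sa = q_net(obs).gather(1, actions.unsqueeze(1)).squeeze(1)
    loss = nn.functional.mse_loss(q_sa, target)
    opt.zero_grad(); loss.backward(); opt.step()
```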
+
+
+
+
+ + ♻ ☆ Stable generative modeling using Schrödinger bridges + + +
+ We consider the problem of sampling from an unknown distribution for which +only a sufficiently large number of training samples are available. Such +settings have recently drawn considerable interest in the context of generative +modelling and Bayesian inference. In this paper, we propose a generative model +combining Schr\"odinger bridges and Langevin dynamics. Schr\"odinger bridges +over an appropriate reversible reference process are used to approximate the +conditional transition probability from the available training samples, which +is then implemented in a discrete-time reversible Langevin sampler to generate +new samples. By setting the kernel bandwidth in the reference process to match +the time step size used in the unadjusted Langevin algorithm, our method +effectively circumvents any stability issues typically associated with the +time-stepping of stiff stochastic differential equations. Moreover, we +introduce a novel split-step scheme, ensuring that the generated samples remain +within the convex hull of the training samples. Our framework can be naturally +extended to generate conditional samples and to Bayesian inference problems. We +demonstrate the performance of our proposed scheme through experiments on +synthetic datasets with increasing dimensions and on a stochastic subgrid-scale +parametrization conditional sampling problem as well as generating sample +trajectories of a dynamical system using conditional sampling. + +
+
+
+
+
+ + ♻ ☆ Causality-Aware Spatiotemporal Graph Neural Networks for Spatiotemporal + Time Series Imputation CIKM'2024 + + +
+ Spatiotemporal time series are usually collected via monitoring sensors +placed at different locations, which usually contain missing values due to +various failures, such as mechanical damages and Internet outages. Imputing the +missing values is crucial for analyzing time series. When recovering a specific +data point, most existing methods consider all the information relevant to that +point regardless of the cause-and-effect relationship. During data collection, +it is inevitable that some unknown confounders are included, e.g., background +noise in time series and non-causal shortcut edges in the constructed sensor +network. These confounders could open backdoor paths and establish non-causal +correlations between the input and output. Over-exploiting these non-causal +correlations could cause overfitting. In this paper, we first revisit +spatiotemporal time series imputation from a causal perspective and show how to +block the confounders via the frontdoor adjustment. Based on the results of +frontdoor adjustment, we introduce a novel Causality-Aware Spatiotemporal Graph +Neural Network (Casper), which contains a novel Prompt Based Decoder (PBD) and +a Spatiotemporal Causal Attention (SCA). PBD could reduce the impact of +confounders and SCA could discover the sparse causal relationships among +embeddings. Theoretical analysis reveals that SCA discovers causal +relationships based on the values of gradients. We evaluate Casper on three +real-world datasets, and the experimental results show that Casper could +outperform the baselines and could effectively discover causal relationships. + +
+
+ comment: Accepted by CIKM'2024. Fixed typos +
+
+
+
+
+ + ♻ ☆ MoC-System: Efficient Fault Tolerance for Sparse Mixture-of-Experts + Model Training + + +
+ As large language models continue to scale up, distributed training systems +have expanded beyond 10k nodes, intensifying the importance of fault tolerance. +Checkpoint has emerged as the predominant fault tolerance strategy, with +extensive studies dedicated to optimizing its efficiency. However, the advent +of the sparse Mixture-of-Experts (MoE) model presents new challenges due to the +substantial increase in model size, despite comparable computational demands to +dense models. + In this work, we propose the Mixture-of-Checkpoint System (MoC-System) to +orchestrate the vast array of checkpoint shards produced in distributed +training systems. MoC-System features a novel Partial Experts Checkpointing +(PEC) mechanism, an algorithm-system co-design that strategically saves a +selected subset of experts, effectively reducing the MoE checkpoint size to +levels comparable with dense models. Incorporating hybrid parallel strategies, +MoC-System involves fully sharded checkpointing strategies to evenly distribute +the workload across distributed ranks. Furthermore, MoC-System introduces a +two-level checkpointing management method that asynchronously handles in-memory +snapshots and persistence processes. + We build MoC-System upon the Megatron-DeepSpeed framework, achieving up to a +98.9% reduction in overhead for each checkpointing process compared to the +original method, during MoE model training with ZeRO-2 data parallelism and +expert parallelism. Additionally, extensive empirical analyses substantiate +that our methods enhance efficiency while maintaining comparable model +accuracy, even achieving an average accuracy increase of 1.08% on downstream +tasks. + +
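A hedged sketch of the Partial Experts Checkpointing idea described above: at each checkpoint, persist the full non-expert (dense) state but only a rotating subset of the experts, so that checkpoint size stays close to that of a dense model. The state layout and the round-robin selection rule are illustrative assumptions, not MoC-System's implementation.

```python
import torch

def pec_checkpoint(dense_state, expert_states, step, experts_per_ckpt=2, path="ckpt.pt"):
    n_experts = len(expert_states)
    # Round-robin selection so every expert is persisted within a few checkpoints.
    start = (step * experts_per_ckpt) % n_experts
    selected = [(start + i) % n_experts for i in range(experts_per_ckpt)]
    torch.save(
        {
            "step": step,
            "dense": dense_state,                                  # always saved
            "experts": {i: expert_states[i] for i in selected},    # partial save
        },
        path,
    )
    return selected

# Toy usage: 8 experts, only 2 saved per checkpoint.
dense = {"embed.weight": torch.randn(4, 4)}
experts = [{"w": torch.randn(4, 4)} for _ in range(8)]
print("saved experts:", pec_checkpoint(dense, experts, step=3))
```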
+
+
+
+
+ + ♻ ☆ Automated Contrastive Learning Strategy Search for Time Series CIKM'2024 + + +
+ In recent years, Contrastive Learning (CL) has become a predominant +representation learning paradigm for time series. Most existing methods +manually build specific CL Strategies (CLS) by human heuristics for certain +datasets and tasks. However, manually developing CLS usually requires excessive +prior knowledge about the data, and massive experiments to determine the +detailed CL configurations. In this paper, we present an Automated Machine +Learning (AutoML) practice at Microsoft, which automatically learns CLS for +time series datasets and tasks, namely Automated Contrastive Learning (AutoCL). +We first construct a principled search space of size over $3\times10^{12}$, +covering data augmentation, embedding transformation, contrastive pair +construction, and contrastive losses. Further, we introduce an efficient +reinforcement learning algorithm, which optimizes CLS from the performance on +the validation tasks, to obtain effective CLS within the space. Experimental +results on various real-world datasets demonstrate that AutoCL could +automatically find the suitable CLS for the given dataset and task. From the +candidate CLS found by AutoCL on several public datasets/tasks, we compose a +transferable Generally Good Strategy (GGS), which has a strong performance for +other datasets. We also provide empirical analysis as a guide for the future +design of CLS. + +
+
+ comment: Accepted by CIKM'2024. Fixed typos +
+
+
+
+
+ + ♻ ☆ Bayesian Analysis of Combinatorial Gaussian Process Bandits + + +
+ We consider the combinatorial volatile Gaussian process (GP) semi-bandit +problem. Each round, an agent is provided a set of available base arms and must +select a subset of them to maximize the long-term cumulative reward. We study +the Bayesian setting and provide novel Bayesian cumulative regret bounds for +three GP-based algorithms: GP-UCB, GP-BayesUCB and GP-TS. Our bounds extend +previous results for GP-UCB and GP-TS to the infinite, volatile and +combinatorial setting, and to the best of our knowledge, we provide the first +regret bound for GP-BayesUCB. Volatile arms encompass other widely considered +bandit problems such as contextual bandits. Furthermore, we employ our +framework to address the challenging real-world problem of online +energy-efficient navigation, where we demonstrate its effectiveness compared to +the alternatives. + +
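A minimal sketch of the GP-UCB acquisition rule that the analysed algorithms share: fit a GP to observed (arm, reward) pairs and pick the available arm maximising mean plus a confidence bonus. The combinatorial/volatile structure of the paper's setting is not reproduced; this is the single-arm special case with an illustrative kernel and beta.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

rng = np.random.default_rng(0)

def true_reward(x):
    return np.sin(3 * x) + 0.1 * rng.normal()

X_obs, y_obs = [], []
beta = 2.0
for t in range(20):
    candidates = rng.uniform(0, 2, size=(50, 1))      # arms available this round
    if X_obs:
        gp = GaussianProcessRegressor(kernel=RBF(length_scale=0.3), alpha=1e-2)
        gp.fit(np.array(X_obs), np.array(y_obs))
        mu, std = gp.predict(candidates, return_std=True)
        pick = candidates[np.argmax(mu + beta * std)]  # upper confidence bound
    else:
        pick = candidates[0]
    X_obs.append(pick)
    y_obs.append(true_reward(pick[0]))

print("best observed reward:", max(y_obs))
```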
+
+ comment: 32 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ On the explainability of quantum neural networks based on variational + quantum circuits + + +
+ Ridge functions are used to describe and study the lower bound of the
+approximation done by neural networks which can be written as a linear
+combination of activation functions. If the activation functions are also ridge
+functions, these networks are called explainable neural networks.
+ In this brief paper, we first show that quantum neural networks which are
+based on variational quantum circuits can be written as a linear combination of
+ridge functions using matrix notation. Consequently, we show that the
+interpretability and explainability of such quantum neural networks can be
+directly considered and studied as an approximation with the linear combination
+of ridge functions.
+
&#13;
+
+ comment: a brief paper,a few missing references have been added +
+
+
+
+
+ + ♻ ☆ Conformal Prediction for Causal Effects of Continuous Treatments + + +
+ Uncertainty quantification of causal effects is crucial for safety-critical +applications such as personalized medicine. A powerful approach for this is +conformal prediction, which has several practical benefits due to +model-agnostic finite-sample guarantees. Yet, existing methods for conformal +prediction of causal effects are limited to binary/discrete treatments and make +highly restrictive assumptions such as known propensity scores. In this work, +we provide a novel conformal prediction method for potential outcomes of +continuous treatments. We account for the additional uncertainty introduced +through propensity estimation so that our conformal prediction intervals are +valid even if the propensity score is unknown. Our contributions are +three-fold: (1) We derive finite-sample prediction intervals for potential +outcomes of continuous treatments. (2) We provide an algorithm for calculating +the derived intervals. (3) We demonstrate the effectiveness of the conformal +prediction intervals in experiments on synthetic and real-world datasets. To +the best of our knowledge, we are the first to propose conformal prediction for +continuous treatments when the propensity score is unknown and must be +estimated from data. + +
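A hedged sketch of plain split conformal prediction for a regression outcome, to illustrate the finite-sample coverage idea only; the paper's method additionally reweights by an estimated propensity of the continuous treatment, which is not reproduced here. The data-generating process and model are toy placeholders.

```python
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.default_rng(0)
n = 2000
treatment = rng.uniform(0, 1, n)
X = np.column_stack([rng.normal(size=n), treatment])
y = 2 * treatment + X[:, 0] + rng.normal(scale=0.5, size=n)  # toy outcome

train, calib = np.arange(0, 1000), np.arange(1000, 2000)
model = GradientBoostingRegressor().fit(X[train], y[train])

alpha = 0.1
residuals = np.abs(y[calib] - model.predict(X[calib]))
k = int(np.ceil((len(calib) + 1) * (1 - alpha)))
q = np.sort(residuals)[k - 1]            # conformal quantile of calibration residuals

x_new = np.array([[0.0, 0.7]])           # query: unit with treatment dose 0.7
pred = model.predict(x_new)[0]
print(f"~{100*(1-alpha):.0f}% interval: [{pred - q:.2f}, {pred + q:.2f}]")
```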
+
+
+
+
+ + ♻ ☆ Over-the-Air Federated Learning in Cell-Free MIMO with Long-term Power + Constraint + + +
+ Wireless networks supporting artificial intelligence have gained significant +attention, with Over-the-Air Federated Learning emerging as a key application +due to its unique transmission and distributed computing characteristics. This +paper derives error bounds for Over-the-Air Federated Learning in a Cell-free +MIMO system and formulates an optimization problem to minimize optimality gap +via joint optimization of power control and beamforming. We introduce the +MOP-LOFPC algorithm, which employs Lyapunov optimization to decouple long-term +constraints across rounds while requiring only causal channel state +information. Experimental results demonstrate that MOP-LOFPC achieves a better +and more flexible trade-off between the model's training loss and adherence to +long-term power constraints compared to existing baselines. + +
+
+
+
+
+ + ♻ ☆ Improve Value Estimation of Q Function and Reshape Reward with Monte + Carlo Tree Search + + +
+ Reinforcement learning has achieved remarkable success in perfect information +games such as Go and Atari, enabling agents to compete at the highest levels +against human players. However, research in reinforcement learning for +imperfect information games has been relatively limited due to the more complex +game structures and randomness. Traditional methods face challenges in training +and improving performance in imperfect information games due to issues like +inaccurate Q value estimation and reward sparsity. In this paper, we focus on +Uno, an imperfect information game, and aim to address these problems by +reducing Q value overestimation and reshaping reward function. We propose a +novel algorithm that utilizes Monte Carlo Tree Search to average the value +estimations in Q function. Even though we choose Double Deep Q Learning as the +foundational framework in this paper, our method can be generalized and used in +any algorithm which needs Q value estimation, such as the Actor-Critic. +Additionally, we employ Monte Carlo Tree Search to reshape the reward structure +in the game environment. We compare our algorithm with several traditional +methods applied to games such as Double Deep Q Learning, Deep Monte Carlo and +Neural Fictitious Self Play, and the experiments demonstrate that our algorithm +consistently outperforms these approaches, especially as the number of players +in Uno increases, indicating a higher level of difficulty. + +
+
+
+
+
+ + ♻ ☆ Using Stochastic Gradient Descent to Smooth Nonconvex Functions: + Analysis of Implicit Graduated Optimization + + +
+ The graduated optimization approach is a heuristic method for finding global +optimal solutions for nonconvex functions by using a function smoothing +operation with stochastic noise. We show that stochastic noise in stochastic +gradient descent (SGD) has the effect of smoothing the objective function, the +degree of which is determined by the learning rate, batch size, and variance of +the stochastic gradient. Using this finding, we propose and analyze a new +graduated optimization algorithm that varies the degree of smoothing by varying +the learning rate and batch size, and provide experimental results on image +classification tasks with ResNets that support our theoretical findings. We +further show that there is an interesting correlation between the degree of +smoothing by SGD's stochastic noise, the well-studied ``sharpness'' indicator, +and the generalization performance of the model. + +
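A loose, hedged sketch of the implicit-smoothing schedule suggested above: since the smoothing induced by SGD noise grows with the learning rate and shrinks with the batch size, a graduated schedule starts with a large learning rate and small batches (heavy smoothing) and gradually reduces the learning rate while enlarging batches. The toy objective, noise model, and stage values are illustrative, not the paper's analysed settings.

```python
import numpy as np

rng = np.random.default_rng(0)

def noisy_grad(x, batch_size):
    # toy nonconvex objective f(x) = x^4 - 3x^2 + x; minibatch noise shrinks as 1/sqrt(B)
    g = 4 * x**3 - 6 * x + 1
    return g + rng.normal(scale=2.0 / np.sqrt(batch_size))

x = 2.0
stages = [(0.05, 8), (0.02, 64), (0.005, 512)]  # (lr, batch): progressively less smoothing
for lr, batch in stages:
    for _ in range(2000):
        x -= lr * noisy_grad(x, batch)
print("final x:", round(x, 3))
```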
+
+ comment: The latest version was updated in October 2024. Under review +
+
+
+
+
+ + ♻ ☆ Do causal predictors generalize better to new domains? NeurIPS'24 + + +
+ We study how well machine learning models trained on causal features +generalize across domains. We consider 16 prediction tasks on tabular datasets +covering applications in health, employment, education, social benefits, and +politics. Each dataset comes with multiple domains, allowing us to test how +well a model trained in one domain performs in another. For each prediction +task, we select features that have a causal influence on the target of +prediction. Our goal is to test the hypothesis that models trained on causal +features generalize better across domains. Without exception, we find that +predictors using all available features, regardless of causality, have better +in-domain and out-of-domain accuracy than predictors using causal features. +Moreover, even the absolute drop in accuracy from one domain to the other is no +better for causal predictors than for models that use all features. In +addition, we show that recent causal machine learning methods for domain +generalization do not perform better in our evaluation than standard predictors +trained on the set of causal features. Likewise, causal discovery algorithms +either fail to run or select causal variables that perform no better than our +selection. Extensive robustness checks confirm that our findings are stable +under variable misclassification. + +
+
+ comment: 118 pages, 55 figures, accepted at NeurIPS'24 +
+
+
+
+
+ + ♻ ☆ Explainable Hierarchical Urban Representation Learning for Commuting + Flow Prediction + + +
+ Commuting flow prediction is an essential task for municipal operations in +the real world. Previous studies have revealed that it is feasible to estimate +the commuting origin-destination (OD) demand within a city using multiple +auxiliary data. However, most existing methods are not suitable to deal with a +similar task at a large scale, namely within a prefecture or the whole nation, +owing to the increased number of geographical units that need to be maintained. +In addition, region representation learning is a universal approach for gaining +urban knowledge for diverse metropolitan downstream tasks. Although many +researchers have developed comprehensive frameworks to describe urban units +from multi-source data, they have not clarified the relationship between the +selected geographical elements. Furthermore, metropolitan areas naturally +preserve ranked structures, like cities and their inclusive districts, which +makes elucidating relations between cross-level urban units necessary. +Therefore, we develop a heterogeneous graph-based model to generate meaningful +region embeddings at multiple spatial resolutions for predicting different +types of inter-level OD flows. To demonstrate the effectiveness of the proposed +method, extensive experiments were conducted using real-world aggregated mobile +phone datasets collected from Shizuoka Prefecture, Japan. The results indicate +that our proposed model outperforms existing models in terms of a uniform urban +structure. We extend the understanding of predicted results using reasonable +explanations to enhance the credibility of the model. + +
+
+
+
+
+ + ♻ ☆ Generative Forests NeurIPS'24 + + +
+ We focus on generative AI for a type of data that still represents one of the most prevalent forms of data: tabular data. Our paper introduces two key contributions: a new powerful class of forest-based models fit for such tasks and a simple training algorithm with strong convergence guarantees in a boosting model that parallels that of the original weak / strong supervised learning setting. This algorithm can be implemented with a few tweaks to the most popular induction scheme for decision trees (i.e. supervised learning) with two classes. Experiments on the quality of generated data display substantial improvements compared to the state of the art. The losses our algorithm minimizes and the structure of our models make them practical for related tasks that require fast estimation of a density given a generative model and an observation (even partially specified): such tasks include missing data imputation and density estimation. Additional experiments on these tasks reveal that our models can be notably good contenders against diverse state-of-the-art methods, relying on models as diverse as (or mixing elements of) trees, neural nets, kernels or graphical models.
+ 
+
+ comment: NeurIPS'24 +
+
+
+
+
+ + ♻ ☆ Probabilistic ML Verification via Weighted Model Integration + + +
+ In machine learning (ML) verification, the majority of procedures are +non-quantitative and therefore cannot be used for verifying probabilistic +models, or be applied in domains where hard guarantees are practically +unachievable. The probabilistic formal verification (PFV) of ML models is in +its infancy, with the existing approaches limited to specific ML models, +properties, or both. This contrasts with standard formal methods techniques, +whose successful adoption in real-world scenarios is also due to their support +for a wide range of properties and diverse systems. We propose a unifying +framework for the PFV of ML systems based on Weighted Model Integration (WMI), +a relatively recent formalism for probabilistic inference with algebraic and +logical constraints. Crucially, reducing the PFV of ML models to WMI enables +the verification of many properties of interest over a wide range of systems, +addressing multiple limitations of deterministic verification and ad-hoc +algorithms. We substantiate the generality of the approach on prototypical +tasks involving the verification of group fairness, monotonicity, robustness to +noise, probabilistic local robustness and equivalence among predictors. We +characterize the challenges related to the scalability of the approach and, +through our WMI-based perspective, we show how successful scaling techniques in +the ML verification literature can be generalized beyond their original scope. + +
+
+
+
+
+ + ♻ ☆ Multi-Excitation Projective Simulation with a Many-Body Physics Inspired + Inductive Bias + + +
+ With the impressive progress of deep learning, applications relying on +machine learning are increasingly being integrated into daily life. However, +most deep learning models have an opaque, oracle-like nature making it +difficult to interpret and understand their decisions. This problem led to the +development of the field known as eXplainable Artificial Intelligence (XAI). +One method in this field known as Projective Simulation (PS) models a +chain-of-thought as a random walk of a particle on a graph with vertices that +have concepts attached to them. While this description has various benefits, +including the possibility of quantization, it cannot be naturally used to model +thoughts that combine several concepts simultaneously. To overcome this +limitation, we introduce Multi-Excitation Projective Simulation (mePS), a +generalization that considers a chain-of-thought to be a random walk of several +particles on a hypergraph. A definition for a dynamic hypergraph is put forward +to describe the agent's training history along with applications to AI and +hypergraph visualization. An inductive bias inspired by the remarkably +successful few-body interaction models used in quantum many-body physics is +formalized for our classical mePS framework and employed to tackle the +exponential complexity associated with naive implementations of hypergraphs. We +prove that our inductive bias reduces the complexity from exponential to +polynomial, with the exponent representing the cutoff on how many particles can +interact. We numerically apply our method to two toy environments and a more +complex scenario modelling the diagnosis of a broken computer. These +environments demonstrate the resource savings provided by an appropriate choice +of inductive bias, as well as showcasing aspects of interpretability. A quantum +model for mePS is also briefly outlined and some future directions for it are +discussed. + +
+
+ comment: 26 pages, 8 figures; Code repository at + https://github.com/MariusKrumm/ManyBodyMEPS. Reorganized main text for better + readability +
+
+
+
+
+ + ♻ ☆ Uncertainty Estimation and Quantification for LLMs: A Simple Supervised + Approach + + +
+ In this paper, we study the problem of uncertainty estimation and calibration +for LLMs. We begin by formulating the uncertainty estimation problem, a +relevant yet underexplored area in existing literature. We then propose a +supervised approach that leverages labeled datasets to estimate the uncertainty +in LLMs' responses. Based on the formulation, we illustrate the difference +between the uncertainty estimation for LLMs and that for standard ML models and +explain why the hidden neurons of the LLMs may contain uncertainty information. +Our designed approach demonstrates the benefits of utilizing hidden activations +to enhance uncertainty estimation across various tasks and shows robust +transferability in out-of-distribution settings. We distinguish the uncertainty +estimation task from the uncertainty calibration task and show that better +uncertainty estimation leads to better calibration performance. Furthermore, +our method is easy to implement and adaptable to different levels of model +accessibility including black box, grey box, and white box. + +
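+ A minimal sketch of the supervised recipe the abstract outlines: fit a lightweight probe on hidden activations to predict whether a response is correct, and read the probe's probability as a confidence (one minus it as uncertainty). The random features and labels below are placeholders for real LLM hidden states and a labeled evaluation set; the probe choice is an assumption, not the paper's exact model.

```python
# Hedged sketch: supervised uncertainty estimation from hidden activations.
# Random arrays stand in for LLM hidden states and "response was correct" labels.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
hidden = rng.standard_normal((2000, 256))          # placeholder hidden activations
w_true = rng.standard_normal(256)
labels = (hidden @ w_true + rng.standard_normal(2000) > 0).astype(int)  # placeholder correctness

X_tr, X_te, y_tr, y_te = train_test_split(hidden, labels, test_size=0.25, random_state=0)
probe = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)

confidence = probe.predict_proba(X_te)[:, 1]       # estimated P(correct); 1 - confidence is the uncertainty
print("AUROC of the uncertainty probe:", round(roc_auc_score(y_te, confidence), 3))
```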
+
+ comment: 29 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Towards Foundation Model for Chemical Reactor Modeling: Meta-Learning + with Physics-Informed Adaptation + + +
+ In this work, we present a novel application of foundation models for +chemical reactor modeling. Accurate modeling of real-world chemical reactors +through first-principles is often challenging, and the process of rebuilding +and retraining models for each new chemical process is inefficient. This raises +a critical question: can we develop a single, universal neural network (i.e., a +foundation model) that can rapidly adapt to any new chemical process in a +reactor? To address this, we propose a foundation model for chemical reactor +modeling that employs a meta-learning approach, followed by physics-informed +fine-tuning on new tasks with only a few data samples. Our model is designed to +generalize across three classic reactor types: continuous stirred tank +reactors, batch reactors, and plug flow reactors. Compared to conventional +methods such as data-driven learning, physics-informed learning, transfer +learning, and meta-learning, our approach demonstrates superior performance in +few-shot scenarios. Specifically, it shows rapid adaptation to unseen reactions +with varying integer orders across different reactor set-ups, requiring minimal +data for fine-tuning. Source code is available at +https://github.com/killingbear999/chemical-reactor-foundation-model. + +
+
+
+
+
+ + ♻ ☆ xLSTM-Mixer: Multivariate Time Series Forecasting by Mixing via Scalar + Memories + + +
+ Time series data is prevalent across numerous fields, necessitating the +development of robust and accurate forecasting models. Capturing patterns both +within and between temporal and multivariate components is crucial for reliable +predictions. We introduce xLSTM-Mixer, a model designed to effectively +integrate temporal sequences, joint time-variate information, and multiple +perspectives for robust forecasting. Our approach begins with a linear forecast +shared across variates, which is then refined by xLSTM blocks. These blocks +serve as key elements for modeling the complex dynamics of challenging time +series data. xLSTM-Mixer ultimately reconciles two distinct views to produce +the final forecast. Our extensive evaluations demonstrate xLSTM-Mixer's +superior long-term forecasting performance compared to recent state-of-the-art +methods. A thorough model analysis provides further insights into its key +components and confirms its robustness and effectiveness. This work contributes +to the resurgence of recurrent models in time series forecasting. + +
+
+
+
+
+ + ♻ ☆ The Art of Imitation: Learning Long-Horizon Manipulation Tasks from Few + Demonstrations + + +
+ Task Parametrized Gaussian Mixture Models (TP-GMM) are a sample-efficient +method for learning object-centric robot manipulation tasks. However, there are +several open challenges to applying TP-GMMs in the wild. In this work, we +tackle three crucial challenges synergistically. First, end-effector velocities +are non-Euclidean and thus hard to model using standard GMMs. We thus propose +to factorize the robot's end-effector velocity into its direction and +magnitude, and model them using Riemannian GMMs. Second, we leverage the +factorized velocities to segment and sequence skills from complex demonstration +trajectories. Through the segmentation, we further align skill trajectories and +hence leverage time as a powerful inductive bias. Third, we present a method to +automatically detect relevant task parameters per skill from visual +observations. Our approach enables learning complex manipulation tasks from +just five demonstrations while using only RGB-D observations. Extensive +experimental evaluations on RLBench demonstrate that our approach achieves +state-of-the-art performance with 20-fold improved sample efficiency. Our +policies generalize across different environments, object instances, and object +positions, while the learned skills are reusable. + +
+
+
+
+
+ + ♻ ☆ Integral Operator Approaches for Scattered Data Fitting on Spheres + + +
+ This paper focuses on scattered data fitting problems on spheres. We study the approximation performance of a class of weighted spectral filter algorithms, including Tikhonov regularization, Landweber iteration, spectral cut-off, and iterated Tikhonov, in fitting noisy data with possibly unbounded random noise. For the analysis, we develop an integral operator approach that can be regarded as an extension of the widely used sampling inequality approach and norming set method in the community of scattered data fitting. After providing an equivalence between the operator differences and quadrature rules, we succeed in deriving optimal Sobolev-type error estimates for weighted spectral filter algorithms. Our derived error estimates do not suffer from the saturation phenomenon of Tikhonov regularization reported in the literature or the native-space barrier of existing error analyses, and they adapt to different embedding spaces. We also propose a divide-and-conquer scheme that equips weighted spectral filter algorithms to reduce their computational burden, and we present the optimal approximation error bounds.
+ 
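+ For readers unfamiliar with the spectral filter family mentioned here, the Tikhonov member reduces, in its most familiar kernel-based form, to regularized least squares over samples $\{(x_i, y_i)\}_{i=1}^n$ with kernel $K$; the generic expression below is a standard textbook form and only a hedged reference point, not the paper's sphere-specific construction.

$$
f_{\lambda} \;=\; \arg\min_{f \in \mathcal{H}_K} \; \frac{1}{n}\sum_{i=1}^{n}\bigl(f(x_i)-y_i\bigr)^2 \;+\; \lambda \lVert f\rVert_{\mathcal{H}_K}^2,
\qquad
f_{\lambda}(x) \;=\; \sum_{j=1}^{n} \alpha_j K(x, x_j), \quad
\boldsymbol{\alpha} \;=\; (\mathbf{K} + \lambda n \mathbf{I})^{-1}\mathbf{y}.
$$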
+
+
+
+
+ + ♻ ☆ Hadamard Representations: Augmenting Hyperbolic Tangents in RL + + +
+ Activation functions are one of the key components of a deep neural network. The most commonly used activation functions can be classed into the categories of continuously differentiable functions (e.g. tanh) and linear-unit functions (e.g. ReLU), both having their own strengths and drawbacks with respect to downstream performance and representation capacity through learning (e.g. measured by the number of dead neurons and the effective rank). In reinforcement learning, the performance of continuously differentiable activations often falls short compared to linear-unit functions. We provide insights into the vanishing gradients associated with the former, and show that the dying neuron problem is not exclusive to ReLUs. To alleviate vanishing gradients and the resulting dying neuron problem occurring with continuously differentiable activations, we propose a Hadamard representation. Using deep Q-networks and proximal policy optimization in the Atari domain, we show faster learning, a reduction in dead neurons and increased effective rank.
+ 
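+ The abstract does not define the Hadamard representation precisely; one plausible, hedged reading is an element-wise (Hadamard) product of two smooth activation branches, sketched below in PyTorch. The two parallel linear maps and the layer sizes are illustrative assumptions, not the paper's exact construction.

```python
# Hedged sketch of a "Hadamard representation" layer: the element-wise product
# of two tanh branches. An illustrative reading of the idea, not the paper's
# verified architecture.
import torch
import torch.nn as nn

class HadamardTanh(nn.Module):
    def __init__(self, in_dim: int, out_dim: int):
        super().__init__()
        self.branch_a = nn.Linear(in_dim, out_dim)
        self.branch_b = nn.Linear(in_dim, out_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Hadamard (element-wise) product of two smooth activations
        return torch.tanh(self.branch_a(x)) * torch.tanh(self.branch_b(x))

layer = HadamardTanh(64, 32)
print(layer(torch.randn(8, 64)).shape)   # torch.Size([8, 32])
```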
+
+ comment: 24 pages, 19 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Binarized Simplicial Convolutional Neural Networks + + +
+ Graph Neural Networks have the limitation of solely processing features on graph nodes, neglecting data on higher-dimensional structures such as edges and triangles. Simplicial Convolutional Neural Networks (SCNN) represent higher-order structures using simplicial complexes to break this limitation, albeit still lacking time efficiency. In this paper, we propose a novel neural network architecture on simplicial complexes named Binarized Simplicial Convolutional Neural Networks (Bi-SCNN) based on the combination of simplicial convolution with a binary-sign forward propagation strategy. The usage of the Hodge Laplacian on a binary-sign forward propagation enables Bi-SCNN to efficiently and effectively represent simplicial features that have higher-order structures than traditional graph node representations. Compared to the previous Simplicial Convolutional Neural Networks, the reduced model complexity of Bi-SCNN shortens the execution time without sacrificing the prediction performance and is less prone to the over-smoothing effect. Experiments with real-world citation and ocean-drifter data confirm that our proposed Bi-SCNN is efficient and accurate.
+ 
+
+
+
+
+ + ♻ ☆ Understanding Gradient Boosting Classifier: Training, Prediction, and + the Role of $γ_j$ + + +
+ The Gradient Boosting Classifier (GBC) is a widely used machine learning +algorithm for binary classification, which builds decision trees iteratively to +minimize prediction errors. This document explains the GBC's training and +prediction processes, focusing on the computation of terminal node values +$\gamma_j$, which are crucial to optimizing the logistic loss function. We +derive $\gamma_j$ through a Taylor series approximation and provide a +step-by-step pseudocode for the algorithm's implementation. The guide explains +the theory of GBC and its practical application, demonstrating its +effectiveness in binary classification tasks. We provide a step-by-step example +in the appendix to help readers understand. + +
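+ As a reference for readers, the terminal-node value that the Newton/Taylor-series step yields for the logistic (binomial deviance) loss is the standard expression below, where the sums run over the samples in leaf $R_j$ and $p_i$ is the probability predicted by the current ensemble; this follows the usual Friedman-style derivation, though the paper's own notation may differ slightly.

$$
\gamma_j \;=\; \frac{\sum_{x_i \in R_j} (y_i - p_i)}{\sum_{x_i \in R_j} p_i\,(1 - p_i)}.
$$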
+
+
+
+
+ + ♻ ☆ Statistical Efficiency of Distributional Temporal Difference Learning NeurIPS 2024 + + +
+ Distributional reinforcement learning (DRL) has achieved empirical success in +various domains. One core task in the field of DRL is distributional policy +evaluation, which involves estimating the return distribution $\eta^\pi$ for a +given policy $\pi$. The distributional temporal difference learning has been +accordingly proposed, which is an extension of the temporal difference learning +(TD) in the classic RL area. In the tabular case, \citet{rowland2018analysis} +and \citet{rowland2023analysis} proved the asymptotic convergence of two +instances of distributional TD, namely categorical temporal difference learning +(CTD) and quantile temporal difference learning (QTD), respectively. In this +paper, we go a step further and analyze the finite-sample performance of +distributional TD. To facilitate theoretical analysis, we propose +non-parametric distributional TD learning (NTD). For a $\gamma$-discounted +infinite-horizon tabular Markov decision process, we show that for NTD we need +$\tilde{O}\left(\frac{1}{\varepsilon^{2p}(1-\gamma)^{2p+1}}\right)$ iterations +to achieve an $\varepsilon$-optimal estimator with high probability, when the +estimation error is measured by the $p$-Wasserstein distance. This sample +complexity bound is minimax optimal up to logarithmic factors in the case of +the $1$-Wasserstein distance. To achieve this, we establish a novel Freedman's +inequality in Hilbert spaces, which would be of independent interest. In +addition, we revisit CTD, showing that the same non-asymptotic convergence +bounds hold for CTD in the case of the $p$-Wasserstein distance for $p\geq 1$. + +
+
+ comment: NeurIPS 2024 (oral) +
+
+
+
+
+ + ♻ ☆ On the Design and Performance of Machine Learning Based Error Correcting + Decoders + + +
+ This paper analyzes the design and competitiveness of four neural network +(NN) architectures recently proposed as decoders for forward error correction +(FEC) codes. We first consider the so-called single-label neural network (SLNN) +and the multi-label neural network (MLNN) decoders which have been reported to +achieve near maximum likelihood (ML) performance. Here, we show analytically +that SLNN and MLNN decoders can always achieve ML performance, regardless of +the code dimensions -- although at the cost of computational complexity -- and +no training is in fact required. We then turn our attention to two +transformer-based decoders: the error correction code transformer (ECCT) and +the cross-attention message passing transformer (CrossMPT). We compare their +performance against traditional decoders, and show that ordered statistics +decoding outperforms these transformer-based decoders. The results in this +paper cast serious doubts on the application of NN-based FEC decoders in the +short and medium block length regime. + +
+
+ comment: 6 pages, 4 figures, submitted for possible presentation in a + conference (v2: Pre-FEC BER curves are corrected) +
+
+
+
+
+ + ♻ ☆ Adaptive Spatio-temporal Estimation on the Graph Edges via Line Graph + Transformation + + +
+ Spatio-temporal estimation of signals on graph edges is challenging because +most conventional Graph Signal Processing techniques are defined on the graph +nodes. Leveraging the Line Graph transform, the Line Graph Least Mean Square +(LGLMS) algorithm is proposed to conduct adaptive estimation of time-varying +edge signals by projecting the edge signals from edge space to node space. +LGLMS is an adaptive algorithm analogous to the classical LMS algorithm but +applied to graph edges. Unlike edge-specific methods, LGLMS retains all GSP +concepts and techniques originally designed for graph nodes, without the need +for redefinition on the edges. Experimenting with transportation graphs and +meteorological graphs, with the signal observations having noisy and missing +values, we confirmed that LGLMS is suitable for the online prediction of +time-varying edge signals. + +
+
+
+
+
+ + ♻ ☆ Understanding Transfer Learning via Mean-field Analysis + + +
+ We propose a novel framework for exploring generalization errors of transfer +learning through the lens of differential calculus on the space of probability +measures. In particular, we consider two main transfer learning scenarios, +$\alpha$-ERM and fine-tuning with the KL-regularized empirical risk +minimization and establish generic conditions under which the generalization +error and the population risk convergence rates for these scenarios are +studied. Based on our theoretical results, we show the benefits of transfer +learning with a one-hidden-layer neural network in the mean-field regime under +some suitable integrability and regularity assumptions on the loss and +activation functions. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Universal approximation results for neural networks with non-polynomial + activation function over non-compact domains + + +
+ In this paper, we generalize the universal approximation property of +single-hidden-layer feed-forward neural networks beyond the classical +formulation over compact domains. More precisely, by assuming that the +activation function is non-polynomial, we derive universal approximation +results for neural networks within function spaces over non-compact subsets of +a Euclidean space, e.g., weighted spaces, $L^p$-spaces, and (weighted) Sobolev +spaces over unbounded domains, where the latter includes the approximation of +the (weak) derivatives. Furthermore, we provide some dimension-independent +rates for approximating a function with sufficiently regular and integrable +Fourier transform by neural networks with non-polynomial activation function. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2312.08410 +
+
+
+
+
+ + ♻ ☆ Generative AI Security: Challenges and Countermeasures + + +
+ Generative AI's expanding footprint across numerous industries has led to +both excitement and increased scrutiny. This paper delves into the unique +security challenges posed by Generative AI, and outlines potential research +directions for managing these risks. + +
+
+
+
+
+ + ♻ ☆ Empirical investigation of multi-source cross-validation in clinical ECG + classification + + +
+ Traditionally, machine learning-based clinical prediction models have been trained and evaluated on patient data from a single source, such as a hospital. Cross-validation methods can be used to estimate the accuracy of such models on new patients originating from the same source, by repeated random splitting of the data. However, such estimates tend to be highly overoptimistic when compared to accuracy obtained from deploying models to sources not represented in the dataset, such as a new hospital. The increasing availability of multi-source medical datasets provides new opportunities for obtaining more comprehensive and realistic evaluations of expected accuracy through source-level cross-validation designs.
+ In this study, we present a systematic empirical evaluation of standard K-fold cross-validation and leave-source-out cross-validation methods in a multi-source setting. We consider the task of electrocardiogram-based cardiovascular disease classification, combining and harmonizing the openly available PhysioNet CinC Challenge 2021 and the Shandong Provincial Hospital datasets for our study.
+ Our results show that K-fold cross-validation, both on single-source and multi-source data, systematically overestimates prediction performance when the end goal is to generalize to new sources. Leave-source-out cross-validation provides more reliable performance estimates, with close to zero bias though larger variability. The evaluation highlights the dangers of obtaining misleading cross-validation results on medical data and demonstrates how these issues can be mitigated when having access to multi-source data.
+ 
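+ The two evaluation designs being contrasted are easy to picture in code; the hedged sketch below runs both on placeholder data with scikit-learn, where the features, labels, classifier, and synthetic "hospital" IDs are illustrative assumptions rather than the study's actual setup.

```python
# Hedged sketch: standard K-fold vs. leave-source-out cross-validation.
# Synthetic features, labels, and source (hospital) IDs stand in for real ECG data.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, LeaveOneGroupOut, cross_val_score

rng = np.random.default_rng(0)
X = rng.standard_normal((600, 20))
y = rng.integers(0, 2, 600)
sources = rng.integers(0, 4, 600)          # which "hospital" each record came from

clf = RandomForestClassifier(n_estimators=100, random_state=0)

# K-fold ignores the source structure and mixes hospitals across folds
kfold_scores = cross_val_score(clf, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=0))
# Leave-source-out holds out one entire hospital per fold
loso_scores = cross_val_score(clf, X, y, groups=sources, cv=LeaveOneGroupOut())

print("K-fold accuracy:          ", kfold_scores.mean().round(3))
print("Leave-source-out accuracy:", loso_scores.mean().round(3))
```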
+
+ comment: 19 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ P1-KAN an effective Kolmogorov Arnold Network for function approximation + + +
+ A new Kolmogorov-Arnold network (KAN) is proposed to approximate potentially +irregular functions in high dimension. We show that it outperforms multilayer +perceptrons in terms of accuracy and converges faster. We also compare it with +several proposed KAN networks: the original spline-based KAN network appears to +be more effective for smooth functions, while the P1-KAN network is more +effective for irregular functions. + +
+
+
+
+
+ + ♻ ☆ Masked Clinical Modelling: A Framework for Synthetic and Augmented + Survival Data Generation + + +
+ Access to real clinical data is often restricted due to privacy obligations, +creating significant barriers for healthcare research. Synthetic datasets +provide a promising solution, enabling secure data sharing and model +development. However, most existing approaches focus on data realism rather +than utility -- ensuring that models trained on synthetic data yield clinically +meaningful insights comparable to those trained on real data. In this paper, we +present Masked Clinical Modelling (MCM), a framework inspired by masked +language modelling, designed for both data synthesis and conditional data +augmentation. We evaluate this prototype on the WHAS500 dataset using Cox +Proportional Hazards models, focusing on the preservation of hazard ratios as +key clinical metrics. Our results show that data generated using the MCM +framework improves both discrimination and calibration in survival analysis, +outperforming existing methods. MCM demonstrates strong potential to support +survival data analysis and broader healthcare applications. + +
+
+ comment: Re-archived due to incorrect ORCiD. Last edited: 2024-10-23 +
+
+
+
+
+ + ♻ ☆ Advancing Open-Set Domain Generalization Using Evidential Bi-Level + Hardest Domain Scheduler NeurIPS 2024 + + +
+ In Open-Set Domain Generalization (OSDG), the model is exposed to both new +variations of data appearance (domains) and open-set conditions, where both +known and novel categories are present at test time. The challenges of this +task arise from the dual need to generalize across diverse domains and +accurately quantify category novelty, which is critical for applications in +dynamic environments. Recently, meta-learning techniques have demonstrated +superior results in OSDG, effectively orchestrating the meta-train and -test +tasks by employing varied random categories and predefined domain partition +strategies. These approaches prioritize a well-designed training schedule over +traditional methods that focus primarily on data augmentation and the +enhancement of discriminative feature learning. The prevailing meta-learning +models in OSDG typically utilize a predefined sequential domain scheduler to +structure data partitions. However, a crucial aspect that remains inadequately +explored is the influence brought by strategies of domain schedulers during +training. In this paper, we observe that an adaptive domain scheduler benefits +more in OSDG compared with prefixed sequential and random domain schedulers. We +propose the Evidential Bi-Level Hardest Domain Scheduler (EBiL-HaDS) to achieve +an adaptive domain scheduler. This method strategically sequences domains by +assessing their reliabilities in utilizing a follower network, trained with +confidence scores learned in an evidential manner, regularized by max rebiasing +discrepancy, and optimized in a bi-level manner. The results show that our +method substantially improves OSDG performance and achieves more discriminative +embeddings for both the seen and unseen categories. The source code is publicly +available at https://github.com/KPeng9510/EBiL-HaDS. + +
+
+ comment: Accepted to NeurIPS 2024. The source code is publicly available at + https://github.com/KPeng9510/EBiL-HaDS +
+
+
+
+
+ + ♻ ☆ Diffusion-Reward Adversarial Imitation Learning + + +
+ Imitation learning aims to learn a policy from observing expert +demonstrations without access to reward signals from environments. Generative +adversarial imitation learning (GAIL) formulates imitation learning as +adversarial learning, employing a generator policy learning to imitate expert +behaviors and discriminator learning to distinguish the expert demonstrations +from agent trajectories. Despite its encouraging results, GAIL training is +often brittle and unstable. Inspired by the recent dominance of diffusion +models in generative modeling, we propose Diffusion-Reward Adversarial +Imitation Learning (DRAIL), which integrates a diffusion model into GAIL, +aiming to yield more robust and smoother rewards for policy learning. +Specifically, we propose a diffusion discriminative classifier to construct an +enhanced discriminator, and design diffusion rewards based on the classifier's +output for policy learning. Extensive experiments are conducted in navigation, +manipulation, and locomotion, verifying DRAIL's effectiveness compared to prior +imitation learning methods. Moreover, additional experimental results +demonstrate the generalizability and data efficiency of DRAIL. Visualized +learned reward functions of GAIL and DRAIL suggest that DRAIL can produce more +robust and smoother rewards. Project page: +https://nturobotlearninglab.github.io/DRAIL/ + +
+
+
+
+
+ + ♻ ☆ UCB Exploration for Fixed-Budget Bayesian Best Arm Identification + + +
+ We study best-arm identification (BAI) in the fixed-budget setting. Adaptive allocations based on upper confidence bounds (UCBs), such as UCBE, are known to work well in BAI. However, it is well known that their optimal regret is theoretically instance-dependent, which we show to be an artifact in many fixed-budget BAI problems. In this paper, we propose a UCB exploration algorithm that is both theoretically and empirically efficient for the fixed-budget BAI problem under a Bayesian setting. The key idea is to learn prior information, which can enhance the performance of UCB-based BAI algorithms, as it does in the cumulative regret minimization problem. We establish bounds on the failure probability and the simple regret for the Bayesian BAI problem, providing upper bounds of order $\tilde{O}(\sqrt{K/n})$, up to logarithmic factors, where $n$ represents the budget and $K$ denotes the number of arms. Furthermore, we demonstrate through empirical results that our approach consistently outperforms state-of-the-art baselines.
+ 
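+ As background, the UCB-E-style allocation the abstract builds on can be sketched as follows: over a fixed budget of $n$ rounds, pull the arm maximizing an empirical mean plus an exploration bonus, then recommend the arm with the best empirical mean. The Gaussian arms and the exploration constant `a` are illustrative assumptions, and the paper's Bayesian prior-learning component is deliberately not reproduced here.

```python
# Hedged toy of fixed-budget best-arm identification with a UCB-E style rule.
import numpy as np

rng = np.random.default_rng(0)
means = np.array([0.3, 0.5, 0.55, 0.7])   # unknown arm means (toy)
K, n, a = len(means), 400, 2.0            # arms, budget, exploration constant

counts = np.zeros(K)
sums = np.zeros(K)
for t in range(n):
    if t < K:                              # pull each arm once to initialize
        arm = t
    else:
        ucb = sums / counts + np.sqrt(a / counts)   # mean + exploration bonus
        arm = int(np.argmax(ucb))
    sums[arm] += rng.normal(means[arm], 1.0)
    counts[arm] += 1

recommended = int(np.argmax(sums / counts))          # recommend best empirical mean
print("recommended arm:", recommended, "| true best arm:", int(np.argmax(means)))
```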
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Challenge on Sound Scene Synthesis: Evaluating Text-to-Audio Generation NeurIPS 2024 + + +
+ Despite significant advancements in neural text-to-audio generation, challenges persist in controllability and evaluation. This paper addresses these issues through the Sound Scene Synthesis challenge held as part of the Detection and Classification of Acoustic Scenes and Events 2024. We present an evaluation protocol combining an objective metric, namely Fréchet Audio Distance, with perceptual assessments, utilizing a structured prompt format to enable diverse captions and effective evaluation. Our analysis reveals varying performance across sound categories and model architectures, with larger models generally excelling but innovative lightweight approaches also showing promise. The strong correlation between objective metrics and human ratings validates our evaluation approach. We discuss outcomes in terms of audio quality, controllability, and architectural considerations for text-to-audio synthesizers, providing direction for future research.
+ 
+
+ comment: accepted to NeurIPS 2024 Workshop: Audio Imagination +
+
+
+
+
+ + ♻ ☆ LocoMotion: Learning Motion-Focused Video-Language Representations ACCV 2024 + + +
+ This paper strives for motion-focused video-language representations. +Existing methods to learn video-language representations use spatial-focused +data, where identifying the objects and scene is often enough to distinguish +the relevant caption. We instead propose LocoMotion to learn from +motion-focused captions that describe the movement and temporal progression of +local object motions. We achieve this by adding synthetic motions to videos and +using the parameters of these motions to generate corresponding captions. +Furthermore, we propose verb-variation paraphrasing to increase the caption +variety and learn the link between primitive motions and high-level verbs. With +this, we are able to learn a motion-focused video-language representation. +Experiments demonstrate our approach is effective for a variety of downstream +tasks, particularly when limited data is available for fine-tuning. Code is +available: https://hazeldoughty.github.io/Papers/LocoMotion/ + +
+
+ comment: ACCV 2024 Oral +
+
+
+
+
+ + ♻ ☆ Multi-Track MusicLDM: Towards Versatile Music Generation with Latent + Diffusion Model + + +
+ Diffusion models have shown promising results in cross-modal generation tasks +involving audio and music, such as text-to-sound and text-to-music generation. +These text-controlled music generation models typically focus on generating +music by capturing global musical attributes like genre and mood. However, +music composition is a complex, multilayered task that often involves musical +arrangement as an integral part of the process. This process involves composing +each instrument to align with existing ones in terms of beat, dynamics, +harmony, and melody, requiring greater precision and control over tracks than +text prompts usually provide. In this work, we address these challenges by +extending the MusicLDM, a latent diffusion model for music, into a multi-track +generative model. By learning the joint probability of tracks sharing a +context, our model is capable of generating music across several tracks that +correspond well to each other, either conditionally or unconditionally. +Additionally, our model is capable of arrangement generation, where the model +can generate any subset of tracks given the others (e.g., generating a piano +track complementing given bass and drum tracks). We compared our model with an +existing multi-track generative model and demonstrated that our model achieves +considerable improvements across objective metrics for both total and +arrangement generation tasks. + +
+
+
+
+
+ + ♻ ☆ Dual Inverse Degradation Network for Real-World SDRTV-to-HDRTV + Conversion + + +
+ In this study, we address the emerging necessity of converting Standard +Dynamic Range Television (SDRTV) content into High Dynamic Range Television +(HDRTV) in light of the limited number of native HDRTV content. A principal +technical challenge in this conversion is the exacerbation of coding artifacts +inherent in SDRTV, which detrimentally impacts the quality of the resulting +HDRTV. To address this issue, our method introduces a novel approach that +conceptualizes the SDRTV-to-HDRTV conversion as a composite task involving dual +degradation restoration. This encompasses inverse tone mapping in conjunction +with video restoration. We propose Dual Inversion Downgraded SDRTV to HDRTV +Network (DIDNet), which can accurately perform inverse tone mapping while +preventing encoding artifacts from being amplified, thereby significantly +improving visual quality. DIDNet integrates an intermediate auxiliary loss +function to effectively separate the dual degradation restoration tasks and +efficient learning of both artifact reduction and inverse tone mapping during +end-to-end training. Additionally, DIDNet introduces a spatio-temporal feature +alignment module for video frame fusion, which augments texture quality and +reduces artifacts. The architecture further includes a dual-modulation +convolution mechanism for optimized inverse tone mapping. Recognizing the +richer texture and high-frequency information in HDRTV compared to SDRTV, we +further introduce a wavelet attention module to enhance frequency features. Our +approach demonstrates marked superiority over existing state-of-the-art +techniques in terms of quantitative performance and visual quality. + +
+
+
+
+
+ + ♻ ☆ OpenMU: Your Swiss Army Knife for Music Understanding + + +
+ We present OpenMU-Bench, a large-scale benchmark suite for addressing the +data scarcity issue in training multimodal language models to understand music. +To construct OpenMU-Bench, we leveraged existing datasets and bootstrapped new +annotations. OpenMU-Bench also broadens the scope of music understanding by +including lyrics understanding and music tool usage. Using OpenMU-Bench, we +trained our music understanding model, OpenMU, with extensive ablations, +demonstrating that OpenMU outperforms baseline models such as MU-Llama. Both +OpenMU and OpenMU-Bench are open-sourced to facilitate future research in music +understanding and to enhance creative music production efficiency. + +
+
+ comment: Resources: https://github.com/mzhaojp22/openmu +
+
+
+
+
+ + ♻ ☆ Exploring Self-Supervised Skeleton-Based Human Action Recognition under + Occlusions + + +
+ To integrate self-supervised skeleton-based action recognition methods into +autonomous robotic systems, it is crucial to consider adverse situations +involving target occlusions. Such a scenario, despite its practical relevance, +is rarely addressed in existing self-supervised skeleton-based action +recognition methods. To empower models with the capacity to address occlusion, +we propose a simple and effective method. We first pre-train using occluded +skeleton sequences, then use k-means clustering (KMeans) on sequence embeddings +to group semantically similar samples. Next, we propose KNN-Imputation to fill +in missing skeleton data based on the closest sample neighbors. Imputing +incomplete skeleton sequences to create relatively complete sequences as input +provides significant benefits to existing skeleton-based self-supervised +methods. Meanwhile, building on the state-of-the-art Partial Spatio-Temporal +Learning (PSTL), we introduce an Occluded Partial Spatio-Temporal Learning +(OPSTL) framework. This enhancement utilizes Adaptive Spatial Masking (ASM) for +better use of high-quality, intact skeletons. The new proposed method is +verified on the challenging occluded versions of the NTURGB+D 60 and NTURGB+D +120. The source code is publicly available at https://github.com/cyfml/OPSTL. + +
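+ A minimal sketch of the imputation step described above, using scikit-learn on placeholder data: cluster sequence embeddings with k-means, then fill missing joint coordinates from nearest neighbors within the same cluster. The array shapes, number of clusters, and flattening of skeleton sequences are illustrative assumptions rather than the repository's actual pipeline.

```python
# Hedged sketch: KMeans grouping of sequence embeddings followed by KNN
# imputation of occluded skeleton coordinates within each cluster.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.impute import KNNImputer

rng = np.random.default_rng(0)
n_seq, emb_dim, seq_dim = 200, 64, 25 * 3            # e.g. 25 joints x 3 coords, flattened
embeddings = rng.standard_normal((n_seq, emb_dim))   # placeholder pre-trained embeddings
skeletons = rng.standard_normal((n_seq, seq_dim))    # placeholder flattened skeleton data
skeletons[rng.random((n_seq, seq_dim)) < 0.2] = np.nan   # simulate occlusions

clusters = KMeans(n_clusters=8, n_init=10, random_state=0).fit_predict(embeddings)

imputed = skeletons.copy()
for c in np.unique(clusters):
    idx = np.where(clusters == c)[0]
    imputer = KNNImputer(n_neighbors=min(5, len(idx)))   # impute from semantically close neighbors
    imputed[idx] = imputer.fit_transform(skeletons[idx])

print("remaining NaNs after imputation:", int(np.isnan(imputed).sum()))
```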
+
+ comment: The source code is publicly available at + https://github.com/cyfml/OPSTL +
+
+
+
+
+ + ♻ ☆ MMBench-Video: A Long-Form Multi-Shot Benchmark for Holistic Video + Understanding NeurIPS 2024 + + +
+ The advent of large vision-language models (LVLMs) has spurred research into their applications in multi-modal contexts, particularly in video understanding. Traditional VideoQA benchmarks, despite providing quantitative metrics, often fail to encompass the full spectrum of video content and inadequately assess models' temporal comprehension. To address these limitations, we introduce MMBench-Video, a quantitative benchmark designed to rigorously evaluate LVLMs' proficiency in video understanding. MMBench-Video incorporates lengthy videos from YouTube and employs free-form questions, mirroring practical use cases. The benchmark is meticulously crafted to probe the models' temporal reasoning skills, with all questions human-annotated according to a carefully constructed ability taxonomy. We employ GPT-4 for automated assessment, demonstrating superior accuracy and robustness over earlier LLM-based evaluations. Utilizing MMBench-Video, we have conducted comprehensive evaluations that include both proprietary and open-source LVLMs for images and videos. MMBench-Video stands as a valuable resource for the research community, facilitating improved evaluation of LVLMs and catalyzing progress in the field of video understanding. The evaluation code of MMBench-Video will be integrated into VLMEvalKit: https://github.com/open-compass/VLMEvalKit.
+ 
+
+ comment: Accepted in NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 55 + +
+
+
+ + ☆ Do Robot Snakes Dream like Electric Sheep? Investigating the Effects of + Architectural Inductive Biases on Hallucination + + +
+ The growth in prominence of large language models (LLMs) in everyday life can +be largely attributed to their generative abilities, yet some of this is also +owed to the risks and costs associated with their use. On one front is their +tendency to \textit{hallucinate} false or misleading information, limiting +their reliability. On another is the increasing focus on the computational +limitations associated with traditional self-attention based LLMs, which has +brought about new alternatives, in particular recurrent models, meant to +overcome them. Yet it remains uncommon to consider these two concerns +simultaneously. Do changes in architecture exacerbate/alleviate existing +concerns about hallucinations? Do they affect how and where they occur? Through +an extensive evaluation, we study how these architecture-based inductive biases +affect the propensity to hallucinate. While hallucination remains a general +phenomenon not limited to specific architectures, the situations in which they +occur and the ease with which specific types of hallucinations can be induced +can significantly differ based on the model architecture. These findings +highlight the need for better understanding both these problems in conjunction +with each other, as well as consider how to design more universal techniques +for handling hallucinations. + +
+
+
+
+
+ + ☆ Decoding Time Series with LLMs: A Multi-Agent Framework for Cross-Domain + Annotation + + +
+ Time series data is ubiquitous across various domains, including +manufacturing, finance, and healthcare. High-quality annotations are essential +for effectively understanding time series and facilitating downstream tasks; +however, obtaining such annotations is challenging, particularly in +mission-critical domains. In this paper, we propose TESSA, a multi-agent system +designed to automatically generate both general and domain-specific annotations +for time series data. TESSA introduces two agents: a general annotation agent +and a domain-specific annotation agent. The general agent captures common +patterns and knowledge across multiple source domains, leveraging both +time-series-wise and text-wise features to generate general annotations. +Meanwhile, the domain-specific agent utilizes limited annotations from the +target domain to learn domain-specific terminology and generate targeted +annotations. Extensive experiments on multiple synthetic and real-world +datasets demonstrate that TESSA effectively generates high-quality annotations, +outperforming existing methods. + +
+
+ comment: 23 pages, 9 figures, 24 tables +
+
+
+
+
+ + ☆ Interação entre robôs humanoides: desenvolvendo a + colaboração e comunicação autônoma + + +
+ This study investigates the interaction between humanoid robots NAO and Pepper, emphasizing their potential applications in educational settings. NAO, widely used in education, and Pepper, designed for social interactions, offer new opportunities for autonomous communication and collaboration. Through a series of programmed interactions, the robots demonstrated their ability to communicate and coordinate actions autonomously, highlighting their potential as tools for enhancing learning environments. The research also explores the integration of emerging technologies, such as artificial intelligence, into these systems, allowing robots to learn from each other and adapt their behavior. The findings suggest that NAO and Pepper can significantly contribute to both technical learning and the development of social and emotional skills in students, offering innovative pedagogical approaches through the use of humanoid robotics.
+ 
+
+ comment: in Portuguese language +
+
+
+
+
+ + ☆ In Context Learning and Reasoning for Symbolic Regression with Large + Language Models + + +
+ Large Language Models (LLMs) are transformer-based machine learning models +that have shown remarkable performance in tasks for which they were not +explicitly trained. Here, we explore the potential of LLMs to perform symbolic +regression -- a machine-learning method for finding simple and accurate +equations from datasets. We prompt GPT-4 to suggest expressions from data, +which are then optimized and evaluated using external Python tools. These +results are fed back to GPT-4, which proposes improved expressions while +optimizing for complexity and loss. Using chain-of-thought prompting, we +instruct GPT-4 to analyze the data, prior expressions, and the scientific +context (expressed in natural language) for each problem before generating new +expressions. We evaluated the workflow in rediscovery of five well-known +scientific equations from experimental data, and on an additional dataset +without a known equation. GPT-4 successfully rediscovered all five equations, +and in general, performed better when prompted to use a scratchpad and consider +scientific context. We also demonstrate how strategic prompting improves the +model's performance and how the natural language interface simplifies +integrating theory with data. Although this approach does not outperform +established SR programs where target equations are more complex, LLMs can +nonetheless iterate toward improved solutions while following instructions and +incorporating scientific context in natural language. + +
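+ The "optimize and evaluate using external Python tools" step is easy to picture; the hedged sketch below fits the constants of one hypothetical LLM-proposed expression to data with SciPy and reports the loss that would be fed back to the model. The candidate expression, the toy data, and the loss are illustrative assumptions, and the GPT-4 call itself is omitted.

```python
# Hedged sketch of the "optimize and evaluate" step for one candidate expression
# proposed by an LLM. Expression, data, and loss below are placeholders.
import numpy as np
from scipy.optimize import curve_fit

# Suppose the LLM proposed:  y = c0 * exp(-c1 * x) + c2
def candidate(x, c0, c1, c2):
    return c0 * np.exp(-c1 * x) + c2

rng = np.random.default_rng(0)
x = np.linspace(0, 5, 100)
y = 2.0 * np.exp(-0.7 * x) + 0.3 + 0.02 * rng.standard_normal(x.size)  # toy "experimental" data

params, _ = curve_fit(candidate, x, y, p0=[1.0, 1.0, 0.0])   # fit the free constants
mse = float(np.mean((candidate(x, *params) - y) ** 2))

# This summary (fitted constants, loss, complexity) is what would be returned
# to the LLM when requesting an improved expression.
print("fitted constants:", np.round(params, 3), "| mse:", round(mse, 5))
```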
+
+
+
+
+ + ☆ Evaluating AI-Generated Essays with GRE Analytical Writing Assessment + + +
+ The recent revolutionary advance in generative AI enables the generation of +realistic and coherent texts by large language models (LLMs). Despite many +existing evaluation metrics on the quality of the generated texts, there is +still a lack of rigorous assessment of how well LLMs perform in complex and +demanding writing assessments. This study examines essays generated by ten +leading LLMs for the analytical writing assessment of the Graduate Record Exam +(GRE). We assessed these essays using both human raters and the e-rater +automated scoring engine as used in the GRE scoring pipeline. Notably, the +top-performing GPT-4o received an average score of 4.67, falling between +"generally thoughtful, well-developed analysis of the issue and conveys meaning +clearly" and "presents a competent analysis of the issue and conveys meaning +with acceptable clarity" according to the GRE scoring guideline. We also +evaluated the detection accuracy of these essays, with detectors trained on +essays generated by the same and different LLMs. + +
+
+ comment: 20 pages, 6 figures +
+
+
+
+
+ + ☆ Artificial Intelligence in Brazilian News: A Mixed-Methods Analysis + + +
+ The current surge in Artificial Intelligence (AI) interest, reflected in +heightened media coverage since 2009, has sparked significant debate on AI's +implications for privacy, social justice, workers' rights, and democracy. The +media plays a crucial role in shaping public perception and acceptance of AI +technologies. However, research into how AI appears in media has primarily +focused on anglophone contexts, leaving a gap in understanding how AI is +represented globally. This study addresses this gap by analyzing 3,560 news +articles from Brazilian media published between July 1, 2023, and February 29, +2024, from 13 popular online news outlets. Using Computational Grounded Theory +(CGT), the study applies Latent Dirichlet Allocation (LDA), BERTopic, and +Named-Entity Recognition to investigate the main topics in AI coverage and the +entities represented. The findings reveal that Brazilian news coverage of AI is +dominated by topics related to applications in the workplace and product +launches, with limited space for societal concerns, which mostly focus on +deepfakes and electoral integrity. The analysis also highlights a significant +presence of industry-related entities, indicating a strong influence of +corporate agendas in the country's news. This study underscores the need for a +more critical and nuanced discussion of AI's societal impacts in Brazilian +media. + +
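+ For context on the topic-modelling stage of such a pipeline, the sketch below runs scikit-learn's LDA on a handful of placeholder headlines; the tiny corpus, number of topics, and vectorizer settings are illustrative assumptions rather than the study's actual configuration, which also used BERTopic and named-entity recognition.

```python
# Hedged sketch of the LDA step of a computational grounded theory pipeline.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = [
    "company launches AI assistant for customer service",
    "deepfake videos raise concerns ahead of elections",
    "startup uses AI to screen job applications",
    "regulators debate rules for generative AI products",
    "new AI chip announced for data centers",
]

vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(docs)

lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)
terms = vectorizer.get_feature_names_out()
for k, topic in enumerate(lda.components_):
    top = [terms[i] for i in topic.argsort()[-5:][::-1]]   # top words per topic
    print(f"topic {k}: {top}")
```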
+
+ comment: 18 pages, 8 figures, 3 tables +
+
+
+
+
+ + ☆ Scalable Influence and Fact Tracing for Large Language Model Pretraining + + +
+ Training data attribution (TDA) methods aim to attribute model outputs back +to specific training examples, and the application of these methods to large +language model (LLM) outputs could significantly advance model transparency and +data curation. However, it has been challenging to date to apply these methods +to the full scale of LLM pretraining. In this paper, we refine existing +gradient-based methods to work effectively at scale, allowing us to retrieve +influential examples for an 8B-parameter language model from a pretraining +corpus of over 160B tokens with no need for subsampling or pre-filtering. Our +method combines several techniques, including optimizer state correction, a +task-specific Hessian approximation, and normalized encodings, which we find to +be critical for performance at scale. In quantitative evaluations on a fact +tracing task, our method performs best at identifying examples that influence +model predictions, but classical, model-agnostic retrieval methods such as BM25 +still perform better at finding passages which explicitly contain relevant +facts. These results demonstrate a misalignment between factual attribution and +causal influence. With increasing model size and training tokens, we find that +influence more closely aligns with attribution. Finally, we examine different +types of examples identified as influential by our method, finding that while +many directly entail a particular fact, others support the same output by +reinforcing priors on relation types, common entities, and names. + +
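+ The gradient-based attribution family referenced above can be illustrated with a much-simplified, TracIn-style dot product between per-example gradients; the tiny model and random data below are placeholders, and none of the paper's scaling machinery (optimizer state correction, Hessian approximation, normalized encodings) is reproduced.

```python
# Hedged sketch: gradient dot-product influence of training examples on a test
# prediction (TracIn-style), far simpler than the paper's refined method.
import torch

torch.manual_seed(0)
model = torch.nn.Linear(10, 1)
loss_fn = torch.nn.BCEWithLogitsLoss()

def grad_vector(x, y):
    """Flattened gradient of the loss on a single (x, y) pair."""
    model.zero_grad()
    loss_fn(model(x).squeeze(-1), y).backward()
    return torch.cat([p.grad.flatten() for p in model.parameters()]).detach()

train_x, train_y = torch.randn(50, 10), torch.randint(0, 2, (50,)).float()
test_x, test_y = torch.randn(1, 10), torch.tensor([1.0])

g_test = grad_vector(test_x, test_y)
scores = torch.stack([g_test @ grad_vector(train_x[i:i + 1], train_y[i:i + 1])
                      for i in range(len(train_x))])

print("most influential training indices (toy):", torch.topk(scores, k=5).indices.tolist())
```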
+
+
+
+
+ + ☆ AdvWeb: Controllable Black-box Attacks on VLM-powered Web Agents + + +
+ Vision Language Models (VLMs) have revolutionized the creation of generalist +web agents, empowering them to autonomously complete diverse tasks on +real-world websites, thereby boosting human efficiency and productivity. +However, despite their remarkable capabilities, the safety and security of +these agents against malicious attacks remain critically underexplored, raising +significant concerns about their safe deployment. To uncover and exploit such +vulnerabilities in web agents, we provide AdvWeb, a novel black-box attack +framework designed against web agents. AdvWeb trains an adversarial prompter +model that generates and injects adversarial prompts into web pages, misleading +web agents into executing targeted adversarial actions such as inappropriate +stock purchases or incorrect bank transactions, actions that could lead to +severe real-world consequences. With only black-box access to the web agent, we +train and optimize the adversarial prompter model using DPO, leveraging both +successful and failed attack strings against the target agent. Unlike prior +approaches, our adversarial string injection maintains stealth and control: (1) +the appearance of the website remains unchanged before and after the attack, +making it nearly impossible for users to detect tampering, and (2) attackers +can modify specific substrings within the generated adversarial string to +seamlessly change the attack objective (e.g., purchasing stocks from a +different company), enhancing attack flexibility and efficiency. We conduct +extensive evaluations, demonstrating that AdvWeb achieves high success rates in +attacking SOTA GPT-4V-based VLM agent across various web tasks. Our findings +expose critical vulnerabilities in current LLM/VLM-based agents, emphasizing +the urgent need for developing more reliable web agents and effective defenses. +Our code and data are available at https://ai-secure.github.io/AdvWeb/ . + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Do Vision-Language Models Represent Space and How? Evaluating Spatial + Frame of Reference Under Ambiguities NeurIPS 2024 + + +
+ Spatial expressions in situated communication can be ambiguous, as their +meanings vary depending on the frames of reference (FoR) adopted by speakers +and listeners. While spatial language understanding and reasoning by +vision-language models (VLMs) have gained increasing attention, potential +ambiguities in these models are still under-explored. To address this issue, we +present the COnsistent Multilingual Frame Of Reference Test (COMFORT), an +evaluation protocol to systematically assess the spatial reasoning capabilities +of VLMs. We evaluate nine state-of-the-art VLMs using COMFORT. Despite showing +some alignment with English conventions in resolving ambiguities, our +experiments reveal significant shortcomings of VLMs: notably, the models (1) +exhibit poor robustness and consistency, (2) lack the flexibility to +accommodate multiple FoRs, and (3) fail to adhere to language-specific or +culture-specific conventions in cross-lingual tests, as English tends to +dominate other languages. With a growing effort to align vision-language models +with human cognitive intuitions, we call for more attention to the ambiguous +nature and cross-cultural diversity of spatial reasoning. + +
+
+ comment: Accepted to Pluralistic Alignment @ NeurIPS 2024 | Project page: + https://spatial-comfort.github.io/ +
+
+
+
+
+ + ☆ AMUSD: Asynchronous Multi-Device Speculative Decoding for LLM + Acceleration + + +
+ Large language models typically generate tokens autoregressively, using each +token as input for the next. Recent work on Speculative Decoding has sought to +accelerate this process by employing a smaller, faster draft model to more +quickly generate candidate tokens. These candidates are then verified in +parallel by the larger (original) verify model, resulting in overall speedup +compared to using the larger model by itself in an autoregressive fashion. In +this work, we introduce AMUSD (Asynchronous Multi-device Speculative Decoding), +a system that further accelerates generation by decoupling the draft and verify +phases into a continuous, asynchronous approach. Unlike conventional +speculative decoding, where only one model (draft or verify) performs token +generation at a time, AMUSD enables both models to perform predictions +independently on separate devices (e.g., GPUs). We evaluate our approach over +multiple datasets and show that AMUSD achieves an average 29% improvement over +speculative decoding and up to 1.96$\times$ speedup over conventional +autoregressive decoding, while achieving identical output quality. Our system +is open-source and available at https://github.com/BradMcDanel/AMUSD/. + +
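+ For readers new to the draft-and-verify pattern that AMUSD decouples, the sketch below shows plain synchronous speculative decoding over toy character-level "models"; the acceptance rule is simplified to exact agreement rather than the usual probabilistic rejection sampling, and both models are made-up placeholders rather than anything from the AMUSD repository.

```python
# Hedged toy of synchronous speculative decoding: a fast draft model proposes k
# tokens, a slow verify model checks them, and the longest agreeing prefix is kept.
def draft_model(prefix: str) -> str:          # fast but occasionally wrong
    cycle = "abcabx"
    return cycle[len(prefix) % len(cycle)]

def verify_model(prefix: str) -> str:         # slow but authoritative
    cycle = "abcabc"
    return cycle[len(prefix) % len(cycle)]

def speculative_decode(steps: int = 12, k: int = 4) -> str:
    out = ""
    while len(out) < steps:
        # draft phase: propose k tokens autoregressively with the cheap model
        proposed, ctx = [], out
        for _ in range(k):
            tok = draft_model(ctx)
            proposed.append(tok)
            ctx += tok
        # verify phase: accept the longest agreeing prefix, then take the verify
        # model's own token at the first disagreement
        ctx = out
        for tok in proposed:
            expected = verify_model(ctx)
            if tok == expected:
                ctx += tok
            else:
                ctx += expected
                break
        out = ctx
    return out[:steps]

print(speculative_decode())   # "abcabcabcabc"
```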
+
+ comment: 4 pages, 5 figures, 1 table, 1 algorithm +
+
+
+
+
+ + ☆ All Entities are Not Created Equal: Examining the Long Tail for + Fine-Grained Entity Typing + + +
+ Pre-trained language models (PLMs) are trained on large amounts of data,
+which helps capture world knowledge alongside linguistic competence. Due to
+this, they are extensively used for ultra-fine entity typing tasks, where they
+provide the entity knowledge held in their parameter space. Given that PLMs
+learn from co-occurrence patterns, they likely contain more or less knowledge
+about entities depending on how frequently those entities appear in the
+pre-training data. In this work, we probe PLMs to elicit encoded entity
+probabilities and demonstrate that they highly correlate with their frequency
+in large-scale internet data. Then, we demonstrate that entity-typing
+approaches that rely on PLMs struggle with entities at the long tail of the
+distribution. Our findings suggest that we need to go beyond PLMs to produce
+solutions that perform well for rare, new, or infrequent entities.
+
+
+
+
+
+
+ + ☆ Captions Speak Louder than Images (CASLIE): Generalizing Foundation + Models for E-commerce from High-quality Multimodal Instruction Data + + +
+ Leveraging multimodal data to drive breakthroughs in e-commerce applications +through Multimodal Foundation Models (MFMs) is gaining increasing attention +from the research community. However, there are significant challenges that +hinder the optimal use of multimodal e-commerce data by foundation models: (1) +the scarcity of large-scale, high-quality multimodal benchmark datasets; and +(2) the lack of effective multimodal information integration methods. To +address these challenges, in this paper, we introduce MMECInstruct, the +first-ever, large-scale, and high-quality multimodal instruction dataset for +e-commerce. We also develop CASLIE, a simple, lightweight, yet effective +framework for integrating multimodal information for e-commerce. Leveraging +MMECInstruct, we fine-tune a series of e-commerce MFMs within CASLIE, denoted +as CASLIE models. Our comprehensive evaluation demonstrates that CASLIE models +substantially outperform 5 categories of advanced baseline models in the +in-domain evaluation. Moreover, CASLIE models show strong generalizability to +out-of-domain settings. MMECInstruct and CASLIE models are publicly accessible +through https://ninglab.github.io/CASLIE/. + +
+
+ comment: Xinyi Ling and Bo Peng contributed equally to this paper +
+
+
+
+
+ + ☆ Are Large Language Models Ready for Travel Planning? + + +
+ While large language models (LLMs) show promise in hospitality and tourism, +their ability to provide unbiased service across demographic groups remains +unclear. This paper explores gender and ethnic biases when LLMs are utilized as +travel planning assistants. To investigate this issue, we apply machine +learning techniques to analyze travel suggestions generated from three +open-source LLMs. Our findings reveal that the performance of race and gender +classifiers substantially exceeds random chance, indicating differences in how +LLMs engage with varied subgroups. Specifically, outputs align with cultural +expectations tied to certain races and genders. To minimize the effect of these +stereotypes, we used a stop-word classification strategy, which decreased +identifiable differences, with no disrespectful terms found. However, +hallucinations related to African American and gender minority groups were +noted. In conclusion, while LLMs can generate travel plans seemingly free from +bias, it remains essential to verify the accuracy and appropriateness of their +recommendations. + +
+
+
+
+
+ + ☆ Literature Meets Data: A Synergistic Approach to Hypothesis Generation + + +
+ AI holds promise for transforming scientific processes, including hypothesis +generation. Prior work on hypothesis generation can be broadly categorized into +theory-driven and data-driven approaches. While both have proven effective in +generating novel and plausible hypotheses, it remains an open question whether +they can complement each other. To address this, we develop the first method +that combines literature-based insights with data to perform LLM-powered +hypothesis generation. We apply our method on five different datasets and +demonstrate that integrating literature and data outperforms other baselines +(8.97\% over few-shot, 15.75\% over literature-based alone, and 3.37\% over +data-driven alone). Additionally, we conduct the first human evaluation to +assess the utility of LLM-generated hypotheses in assisting human +decision-making on two challenging tasks: deception detection and AI generated +content detection. Our results show that human accuracy improves significantly +by 7.44\% and 14.19\% on these tasks, respectively. These findings suggest that +integrating literature-based and data-driven approaches provides a +comprehensive and nuanced framework for hypothesis generation and could open +new avenues for scientific inquiry. + +
+
+ comment: 30 pages, 7 figures, code link: + https://github.com/ChicagoHAI/hypothesis-generation +
+
+
+
+
+ + ☆ Altogether: Image Captioning via Re-aligning Alt-text EMNLP 2024 + + +
+ This paper focuses on creating synthetic data to improve the quality of image
+captions. Existing works typically have two shortcomings. First, they caption
+images from scratch, ignoring existing alt-text metadata, and second, they
+lack transparency if the captioners' training data (e.g., GPT) is unknown. In
+this paper, we study a principled approach, Altogether, based on the key idea
+of editing and re-aligning existing alt-texts associated with the images. To
+generate training data, we perform human annotation where annotators start
+with the existing alt-text and re-align it to the image content in multiple
+rounds, consequently constructing captions with rich visual concepts. This
+differs from prior work that carries out human annotation as a one-time
+description task solely based on images and annotator knowledge. We train a
+captioner on this data that generalizes the process of re-aligning alt-texts
+at scale. Our results show our Altogether approach leads to richer image
+captions that also improve text-to-image generation and zero-shot image
+classification tasks.
+
+
+
+ comment: accepted by EMNLP 2024; MetaCLIPv2 +
+
+
+
+
+ + ☆ JMMMU: A Japanese Massive Multi-discipline Multimodal Understanding + Benchmark for Culture-aware Evaluation + + +
+ Accelerating research on Large Multimodal Models (LMMs) in non-English
+languages is crucial for enhancing user experiences across broader populations.
+In this paper, we introduce JMMMU (Japanese MMMU), the first large-scale
+Japanese benchmark designed to evaluate LMMs on expert-level tasks based on the
+Japanese cultural context. To facilitate comprehensive culture-aware
+evaluation, JMMMU features two complementary subsets: (i) a culture-agnostic
+(CA) subset, where the culture-independent subjects (e.g., Math) are selected
+and translated into Japanese, enabling one-to-one comparison with its English
+counterpart MMMU; and (ii) a culture-specific (CS) subset, comprising newly
+crafted subjects that reflect Japanese cultural context. Using the CA subset,
+we observe a performance drop in many LMMs when evaluated in Japanese, which is
+purely attributable to language variation. Using the CS subset, we reveal their
+inadequate Japanese cultural understanding. Further, by combining both subsets,
+we identify that some LMMs perform well on the CA subset but not on the CS
+subset, exposing a shallow understanding of the Japanese language that lacks
+depth in cultural understanding. We hope this work will not only help advance
+LMM performance in Japanese but also serve as a guideline to create
+high-standard, culturally diverse benchmarks for multilingual LMM development.
+The project page is https://mmmu-japanese-benchmark.github.io/JMMMU/.
+
+
+
+ comment: Project page: https://mmmu-japanese-benchmark.github.io/JMMMU/ +
+
+
+
+
+ + ☆ PyramidDrop: Accelerating Your Large Vision-Language Models via Pyramid + Visual Redundancy Reduction + + +
+ In large vision-language models (LVLMs), images serve as inputs that carry a
+wealth of information. As the idiom "A picture is worth a thousand words"
+implies, representing a single image in current LVLMs can require hundreds or
+even thousands of tokens. This results in significant computational costs,
+which grow quadratically as input image resolution increases, thereby severely
+impacting the efficiency of both training and inference. Previous approaches
+have attempted to reduce the number of image tokens either before or within the
+early layers of LVLMs. However, these strategies inevitably result in the loss
+of crucial image information, ultimately diminishing model performance. To
+address this challenge, we conduct an empirical study revealing that all visual
+tokens are necessary for LVLMs in the shallow layers, and token redundancy
+progressively increases in the deeper layers of the model. To this end, we
+propose PyramidDrop, a visual redundancy reduction strategy for LVLMs to boost
+their efficiency in both training and inference with negligible performance
+loss. Specifically, we partition the LVLM into several stages and drop part of
+the image tokens at the end of each stage with a pre-defined ratio, creating
+pyramid-like visual tokens across model layers. The dropping is based on a
+lightweight similarity calculation with a negligible time overhead. Extensive
+experiments demonstrate that PyramidDrop can achieve a 40% training time and
+55% inference FLOPs acceleration of LLaVA-NeXT with comparable performance.
+Moreover, PyramidDrop can also serve as a plug-and-play strategy for
+inference acceleration without training, with better performance and lower
+inference cost than counterparts. We hope that the insights and approach
+introduced by PyramidDrop will inspire future research to further investigate
+the role of image tokens in LVLMs.
+
+
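+
+ A rough sketch of the staged visual-token dropping described above; the
+cosine-similarity ranking against a single query token and the per-stage keep
+ratios are illustrative assumptions, not PyramidDrop's exact criterion.
+
+import torch
+
+def drop_visual_tokens(img_tokens: torch.Tensor,
+                       query_token: torch.Tensor,
+                       keep_ratio: float) -> torch.Tensor:
+    """Keep the visual tokens most similar to a query vector (e.g., the last
+    instruction token) and drop the rest, preserving the original order."""
+    sims = torch.nn.functional.cosine_similarity(
+        img_tokens, query_token.unsqueeze(0), dim=-1)
+    k = max(1, int(keep_ratio * img_tokens.size(0)))
+    idx = sims.topk(k).indices.sort().values
+    return img_tokens[idx]
+
+# pyramid-like schedule: apply the drop at the end of each model stage
+tokens = torch.randn(576, 64)          # e.g., 576 visual tokens of dim 64
+query = torch.randn(64)
+for stage_keep in (0.75, 0.5, 0.25):   # hypothetical per-stage ratios
+    tokens = drop_visual_tokens(tokens, query, stage_keep)
+    print(tokens.shape)
+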
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Towards Reliable Evaluation of Behavior Steering Interventions in LLMs NeurIPS 2024 + + +
+ Representation engineering methods have recently shown promise for enabling +efficient steering of model behavior. However, evaluation pipelines for these +methods have primarily relied on subjective demonstrations, instead of +quantitative, objective metrics. We aim to take a step towards addressing this +issue by advocating for four properties missing from current evaluations: (i) +contexts sufficiently similar to downstream tasks should be used for assessing +intervention quality; (ii) model likelihoods should be accounted for; (iii) +evaluations should allow for standardized comparisons across different target +behaviors; and (iv) baseline comparisons should be offered. We introduce an +evaluation pipeline grounded in these criteria, offering both a quantitative +and visual analysis of how effectively a given method works. We use this +pipeline to evaluate two representation engineering methods on how effectively +they can steer behaviors such as truthfulness and corrigibility, finding that +some interventions are less effective than previously reported. + +
+
+ comment: Accepted to the NeurIPS 2024 - Workshop on Foundation Model + Interventions +
+
+
+
+
+ + ☆ SELA: Tree-Search Enhanced LLM Agents for Automated Machine Learning + + +
+ Automated Machine Learning (AutoML) approaches encompass traditional methods +that optimize fixed pipelines for model selection and ensembling, as well as +newer LLM-based frameworks that autonomously build pipelines. While LLM-based +agents have shown promise in automating machine learning tasks, they often +generate low-diversity and suboptimal code, even after multiple iterations. To +overcome these limitations, we introduce Tree-Search Enhanced LLM Agents +(SELA), an innovative agent-based system that leverages Monte Carlo Tree Search +(MCTS) to optimize the AutoML process. By representing pipeline configurations +as trees, our framework enables agents to conduct experiments intelligently and +iteratively refine their strategies, facilitating a more effective exploration +of the machine learning solution space. This novel approach allows SELA to +discover optimal pathways based on experimental feedback, improving the overall +quality of the solutions. In an extensive evaluation across 20 machine learning +datasets, we compare the performance of traditional and agent-based AutoML +methods, demonstrating that SELA achieves a win rate of 65% to 80% against each +baseline across all datasets. These results underscore the significant +potential of agent-based strategies in AutoML, offering a fresh perspective on +tackling complex machine learning challenges. + +
+
+ comment: The code is available at https://github.com/geekan/MetaGPT +
+
+
+
+
+ + ☆ Large Language Models Empowered Personalized Web Agents + + +
+ Web agents have emerged as a promising direction to automate Web task +completion based on user instructions, significantly enhancing user experience. +Recently, Web agents have evolved from traditional agents to Large Language +Models (LLMs)-based Web agents. Despite their success, existing LLM-based Web +agents overlook the importance of personalized data (e.g., user profiles and +historical Web behaviors) in assisting the understanding of users' personalized +instructions and executing customized actions. To overcome the limitation, we +first formulate the task of LLM-empowered personalized Web agents, which +integrate personalized data and user instructions to personalize instruction +comprehension and action execution. To address the absence of a comprehensive +evaluation benchmark, we construct a Personalized Web Agent Benchmark +(PersonalWAB), featuring user instructions, personalized user data, Web +functions, and two evaluation paradigms across three personalized Web tasks. +Moreover, we propose a Personalized User Memory-enhanced Alignment (PUMA) +framework to adapt LLMs to the personalized Web agent task. PUMA utilizes a +memory bank with a task-specific retrieval strategy to filter relevant +historical Web behaviors. Based on the behaviors, PUMA then aligns LLMs for +personalized action execution through fine-tuning and direct preference +optimization. Extensive experiments validate the superiority of PUMA over +existing Web agents on PersonalWAB. + +
+
+ comment: The code and data are available on the project website + https://hongrucai.github.io/PersonalWAB/ +
+
+
+
+
+ + ☆ Automated Spinal MRI Labelling from Reports Using a Large Language Model MICCAI 2024 + + +
+ We propose a general pipeline to automate the extraction of labels from +radiology reports using large language models, which we validate on spinal MRI +reports. The efficacy of our labelling method is measured on five distinct +conditions: spinal cancer, stenosis, spondylolisthesis, cauda equina +compression and herniation. Using open-source models, our method equals or +surpasses GPT-4 on a held-out set of reports. Furthermore, we show that the +extracted labels can be used to train imaging models to classify the identified +conditions in the accompanying MR scans. All classifiers trained using +automated labels achieve comparable performance to models trained using scans +manually annotated by clinicians. Code can be found at +https://github.com/robinyjpark/AutoLabelClassifier. + +
+
+ comment: Accepted to Medical Image Computing and Computer Assisted + Intervention (MICCAI 2024, Spotlight). 11 pages plus appendix +
+
+
+
+
+ + ☆ Fine-Tuning Large Language Models to Appropriately Abstain with Semantic + Entropy NeurIPS + + +
+ Large Language Models (LLMs) are known to hallucinate, whereby they generate +plausible but inaccurate text. This phenomenon poses significant risks in +critical applications, such as medicine or law, necessitating robust +hallucination mitigation strategies. While recent works have proposed +fine-tuning methods to teach LLMs to abstain from answering questions beyond +their knowledge or capabilities, these methods rely on the existence of +ground-truth labels or are limited to short-form responses. To address these +limitations, we propose fine-tuning using semantic entropy, an uncertainty +measure derived from introspection into the model which does not require +external labels. We demonstrate that our approach matches or outperforms models +fine-tuned using prior work and achieves strong performance for both short and +long-form generations on a range of datasets. + +
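+
+ A toy illustration of the semantic entropy measure mentioned above: sampled
+answers are clustered into meaning-equivalence classes and the entropy of the
+cluster distribution is returned. The `equivalent` predicate stands in for the
+bidirectional-entailment (NLI-based) check used in practice; the string
+matching below is only a placeholder.
+
+import math
+from typing import Callable, List
+
+def semantic_entropy(answers: List[str],
+                     equivalent: Callable[[str, str], bool]) -> float:
+    """Cluster sampled answers by meaning and compute the entropy (in nats)
+    of the resulting cluster distribution; high entropy suggests abstaining."""
+    clusters: List[List[str]] = []
+    for a in answers:
+        for c in clusters:
+            if equivalent(a, c[0]):
+                c.append(a)
+                break
+        else:
+            clusters.append([a])
+    n = len(answers)
+    return -sum((len(c) / n) * math.log(len(c) / n) for c in clusters)
+
+# toy equivalence: case-insensitive match ignoring trailing periods
+samples = ["Paris", "paris", "Paris.", "Lyon"]
+print(semantic_entropy(
+    samples, lambda x, y: x.strip(".").lower() == y.strip(".").lower()))
+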
+
+ comment: Accepted to NeurIPS Safe Generative AI Workshop 2024 +
+
+
+
+
+ + ☆ Dhoroni: Exploring Bengali Climate Change and Environmental Views with a + Multi-Perspective News Dataset and Natural Language Processing + + +
+ Climate change poses critical challenges globally, disproportionately
+affecting low-income countries that often lack resources and linguistic
+representation on the international stage. Despite Bangladesh's status as one
+of the most vulnerable nations to climate impacts, research gaps persist in
+Bengali-language studies related to climate change and NLP. To address this
+disparity, we introduce Dhoroni, a novel Bengali (Bangla) climate change and
+environmental news dataset, comprising 2,300 annotated Bangla news articles,
+offering multiple perspectives such as political influence,
+scientific/statistical data, authenticity, stance detection, and stakeholder
+involvement. Furthermore, we present an in-depth exploratory analysis of
+Dhoroni and introduce the BanglaBERT-Dhoroni family, a novel baseline model
+family for climate and environmental opinion detection in Bangla, fine-tuned on
+our dataset. This research contributes significantly to enhancing accessibility
+and analysis of climate discourse in Bengali (Bangla), addressing crucial
+communication and research gaps in climate-impacted regions like Bangladesh,
+home to 180 million people.
+
+
+
+ comment: In Review +
+
+
+
+
+ + ☆ Context-aware Prompt Tuning: Advancing In-Context Learning with + Adversarial Methods + + +
+ Fine-tuning Large Language Models (LLMs) typically involves updating at least
+a few billion parameters. A more parameter-efficient approach is Prompt Tuning
+(PT), which updates only a few learnable tokens; In-Context Learning (ICL), by
+contrast, adapts the model to a new task by simply including examples in the
+input without any training. When applying optimization-based methods, such as
+fine-tuning and PT for few-shot learning, the model is specifically adapted to
+the small set of training examples, whereas ICL leaves the model unchanged.
+This distinction makes traditional learning methods more prone to overfitting;
+in contrast, ICL is less sensitive to the few-shot scenario. While ICL is not
+prone to overfitting, it does not fully extract the information that exists in
+the training examples. This work introduces Context-aware Prompt Tuning (CPT),
+a method inspired by ICL, PT, and adversarial attacks. We build on the ICL
+strategy of concatenating examples before the input, but we extend this with
+PT-like learning, refining the context embedding through iterative optimization
+to extract deeper insights from the training examples. We carefully modify
+specific context tokens, considering the unique structure of input and output
+formats. Inspired by adversarial attacks, we adjust the input based on the
+labels present in the context, focusing on minimizing, rather than maximizing,
+the loss. Moreover, we apply a projected gradient descent algorithm to keep
+token embeddings close to their original values, under the assumption that the
+user-provided data is inherently valuable. Our method achieves superior
+accuracy across multiple classification tasks with various LLMs.
+
+
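+
+ A minimal sketch of the projected gradient descent step mentioned above:
+context-token embeddings are optimized against a task loss while staying
+inside a small ball around their original values. The radius, step size, and
+toy objective are illustrative assumptions, not CPT's actual settings.
+
+import torch
+
+def refine_context(context_emb: torch.Tensor, loss_fn,
+                   epsilon: float = 0.1, steps: int = 20, lr: float = 0.01):
+    """Refine in-context example embeddings by gradient descent, projecting
+    each embedding back into an L2 ball of radius epsilon around its
+    original value after every step."""
+    original = context_emb.detach().clone()
+    emb = context_emb.detach().clone().requires_grad_(True)
+    opt = torch.optim.SGD([emb], lr=lr)
+    for _ in range(steps):
+        opt.zero_grad()
+        loss_fn(emb).backward()
+        opt.step()
+        with torch.no_grad():            # project back into the epsilon ball
+            delta = emb - original
+            norm = delta.norm(dim=-1, keepdim=True).clamp_min(1e-12)
+            emb.copy_(original + delta * (epsilon / norm).clamp(max=1.0))
+    return emb.detach()
+
+# toy objective: pull four 32-dim context embeddings toward a target
+target = torch.randn(4, 32)
+ctx = torch.randn(4, 32)
+out = refine_context(ctx, lambda e: ((e - target) ** 2).mean())
+print((out - ctx).norm(dim=-1))          # each row moved by at most epsilon
+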
+
+
+
+
+ + ☆ Creativity in AI: Progresses and Challenges + + +
+ Creativity is the ability to produce novel, useful, and surprising ideas, and
+has been widely studied as a crucial aspect of human cognition. Machine
+creativity, on the other hand, has been a long-standing challenge. With the
+rise of advanced generative AI, there has been renewed interest and debate
+regarding AI's creative capabilities. Therefore, it is imperative to revisit
+the state of creativity in AI and identify key progresses and remaining
+challenges. In this work, we survey leading works studying the creative
+capabilities of AI systems, focusing on creative problem-solving, linguistic,
+artistic, and scientific creativity. Our review suggests that while the latest
+AI models are largely capable of producing linguistically and artistically
+creative outputs such as poems, images, and musical pieces, they struggle with
+tasks that require creative problem-solving, abstract thinking, and
+compositionality, and their generations suffer from a lack of diversity and
+originality as well as from long-range incoherence and hallucinations. We also
+discuss key questions concerning copyright and authorship issues with
+generative models. Furthermore, we highlight the need for a comprehensive
+evaluation of creativity that is process-driven and considers several
+dimensions of creativity. Finally, we propose future research directions to
+improve the creativity of AI outputs, drawing inspiration from cognitive
+science and psychology.
+
+
+
+ comment: 44 pages +
+
+
+
+
+ + ☆ MiniPLM: Knowledge Distillation for Pre-Training Language Models + + +
+ Knowledge distillation (KD) is widely used to train small, high-performing +student language models (LMs) using large teacher LMs. While effective in +fine-tuning, KD during pre-training faces challenges in efficiency, +flexibility, and effectiveness. Existing methods either incur high +computational costs due to online teacher inference, require tokenization +matching between teacher and student LMs, or risk losing the difficulty and +diversity of the teacher-generated training data. To address these issues, we +propose MiniPLM, a KD framework for pre-training LMs by refining the training +data distribution with the teacher's knowledge. For efficiency, MiniPLM +performs offline teacher LM inference, allowing KD for multiple student LMs +without adding training-time costs. For flexibility, MiniPLM operates solely on +the training corpus, enabling KD across model families. For effectiveness, +MiniPLM leverages the differences between large and small LMs to enhance the +difficulty and diversity of the training data, helping student LMs acquire +versatile and sophisticated knowledge. Extensive experiments demonstrate that +MiniPLM boosts the student LMs' performance on 9 widely used downstream tasks, +improves the language modeling capabilities, and reduces pre-training +computation. The benefit of MiniPLM extends to large pre-training scales, +evidenced by the extrapolation of the scaling curves. Further analysis reveals +that MiniPLM supports KD across model families and enhances the utilization of +pre-training data. Our model, code, and data are available at +https://github.com/thu-coai/MiniPLM. + +
+
+
+
+
+ + ☆ Exploring Possibilities of AI-Powered Legal Assistance in Bangladesh + through Large Language Modeling + + +
+ Purpose: Bangladesh's legal system struggles with major challenges like +delays, complexity, high costs, and millions of unresolved cases, which deter +many from pursuing legal action due to lack of knowledge or financial +constraints. This research seeks to develop a specialized Large Language Model +(LLM) to assist in the Bangladeshi legal system. Methods: We created +UKIL-DB-EN, an English corpus of Bangladeshi legal documents, by collecting and +scraping data on various legal acts. We fine-tuned the GPT-2 model on this +dataset to develop GPT2-UKIL-EN, an LLM focused on providing legal assistance +in English. Results: The model was rigorously evaluated using semantic +assessments, including case studies supported by expert opinions. The +evaluation provided promising results, demonstrating the potential for the +model to assist in legal matters within Bangladesh. Conclusion: Our work +represents the first structured effort toward building an AI-based legal +assistant for Bangladesh. While the results are encouraging, further +refinements are necessary to improve the model's accuracy, credibility, and +safety. This is a significant step toward creating a legal AI capable of +serving the needs of a population of 180 million. + +
+
+ comment: In Review +
+
+
+
+
+ + ☆ Audio-to-Score Conversion Model Based on Whisper methodology + + +
+ This thesis develops a Transformer model based on Whisper, which extracts +melodies and chords from music audio and records them into ABC notation. A +comprehensive data processing workflow is customized for ABC notation, +including data cleansing, formatting, and conversion, and a mutation mechanism +is implemented to increase the diversity and quality of training data. This +thesis innovatively introduces the "Orpheus' Score", a custom notation system +that converts music information into tokens, designs a custom vocabulary +library, and trains a corresponding custom tokenizer. Experiments show that +compared to traditional algorithms, the model has significantly improved +accuracy and performance. While providing a convenient audio-to-score tool for +music enthusiasts, this work also provides new ideas and tools for research in +music information processing. + +
+
+ comment: 5 pages, 7 figures +
+
+
+
+
+ + ☆ VoiceBench: Benchmarking LLM-Based Voice Assistants + + +
+ Building on the success of large language models (LLMs), recent advancements
+such as GPT-4o have enabled real-time speech interactions through LLM-based
+voice assistants, offering a significantly improved user experience compared to
+traditional text-based interactions. However, the absence of benchmarks
+designed to evaluate these speech interaction capabilities has hindered
+progress in the development of LLM-based voice assistants. Current evaluations
+focus primarily on automatic speech recognition (ASR) or general knowledge
+evaluation with clean speech, neglecting the more intricate, real-world
+scenarios that involve diverse speaker characteristics as well as environmental
+and content factors. To address this, we introduce VoiceBench, the first
+benchmark designed to provide a multi-faceted evaluation of LLM-based voice
+assistants. VoiceBench includes both real and synthetic spoken instructions
+that incorporate the above three key real-world variations. Extensive
+experiments reveal the limitations of current LLM-based voice assistant models
+and offer valuable insights for future research and development in this field.
+
+
+
+ comment: Work in progress. Data is available at + https://github.com/MatthewCYM/VoiceBench +
+
+
+
+
+ + ☆ From Attention to Activation: Unravelling the Enigmas of Large Language + Models + + +
+ We study two strange phenomena in auto-regressive Transformers: (1) the
+dominance of the first token in attention heads; (2) the occurrence of large
+outlier activations in the hidden states. We find that popular large language
+models, such as Llama, attend maximally to the first token in 98% of attention
+heads, a behaviour we attribute to the softmax function. To mitigate this
+issue, we propose a reformulation of softmax to softmax-1. Furthermore, we
+identify adaptive optimisers, e.g. Adam, as the primary contributor to the
+large outlier activations and introduce OrthoAdam, a novel optimiser that
+utilises orthogonal matrices to transform gradients, to address this issue.
+Finally, not only do our methods prevent these phenomena from occurring, but
+additionally, they enable Transformers to sustain their performance when
+quantised using basic algorithms, something that standard methods are unable to
+do. In summary, our methods reduce the attention proportion on the first token
+from 65% to 3.3%, the activation kurtosis in the hidden states from 1657 to
+3.1, and the perplexity penalty under 4-bit weight quantisation from 3565 to
+0.3.
+
+
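+
+ The softmax-1 reformulation mentioned above is commonly written as
+exp(x_i) / (1 + sum_j exp(x_j)), letting a head assign near-zero total
+attention instead of parking probability mass on the first token. The paper's
+exact formulation may differ; a numerically stable sketch of this common form:
+
+import numpy as np
+
+def softmax(x: np.ndarray) -> np.ndarray:
+    e = np.exp(x - x.max())
+    return e / e.sum()
+
+def softmax_1(x: np.ndarray) -> np.ndarray:
+    """Softmax with an extra 1 in the denominator: weights can sum to < 1,
+    so a head may effectively attend to nothing."""
+    e = np.exp(x - x.max())
+    return e / (e.sum() + np.exp(-x.max()))
+
+scores = np.array([2.0, 1.0, 0.5])
+print(softmax(scores), softmax(scores).sum())      # sums to 1
+print(softmax_1(scores), softmax_1(scores).sum())  # sums to < 1
+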
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Self-calibration for Language Model Quantization and Pruning + + +
+ Quantization and pruning are fundamental approaches for model compression, +enabling efficient inference for language models. In a post-training setting, +state-of-the-art quantization and pruning methods require calibration data, a +small set of unlabeled examples. Conventionally, randomly sampled web text is +used, aiming to reflect the model training data. However, this poses two key +problems: (1) unrepresentative calibration examples can harm model performance, +and (2) organizations increasingly avoid releasing model training data. In this +paper, we propose self-calibration as a solution. Our approach requires no +external data, instead leveraging the model itself to generate synthetic +calibration data as a better approximation of the pre-training data +distribution. We extensively compare the performance of self-calibration with +several baselines, across a variety of models, compression methods, and tasks. +Our approach proves consistently competitive in maximizing downstream task +performance, frequently outperforming even using real data. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Interchangeable Token Embeddings for Extendable Vocabulary and + Alpha-Equivalence + + +
+ We propose a novel approach for learning interchangeable tokens in language +models to obtain an extendable vocabulary that can generalize to new tokens. +Our method is designed to address alpha-equivalence, the principle that +renaming bound variables in a syntactic expression preserves semantics. This +property arises in many formal languages such as temporal logics, in which all +proposition symbols represent the same concept but are distinguishable from +each other. To handle such tokens, we develop a dual-part embedding approach. +The first part is shared across all interchangeable tokens, thereby enforcing +that they represent the same core concept. The second part is randomly +generated for each token, which enables distinguishability. We evaluate our +method in a Transformer encoder-decoder model on two tasks: solving linear +temporal logic formulae and copying with extendable vocabulary. Our method +demonstrates promising generalization capabilities in addition to introducing a +favorable inductive bias for alpha-equivalence. + +
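+
+ A minimal sketch of the dual-part embedding described above; whether the two
+parts are concatenated or combined differently, and whether the per-token
+random part is frozen, are assumptions based on the abstract.
+
+import torch
+
+class DualPartEmbedding(torch.nn.Module):
+    """Embedding for interchangeable tokens: one learned vector shared by all
+    such tokens (same core concept) concatenated with a fixed random vector
+    per token (distinguishability). New tokens can be supported by drawing a
+    fresh random part, extending the vocabulary without retraining."""
+    def __init__(self, num_tokens: int, shared_dim: int, random_dim: int):
+        super().__init__()
+        self.shared = torch.nn.Parameter(torch.randn(shared_dim))
+        self.register_buffer("random_parts", torch.randn(num_tokens, random_dim))
+
+    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
+        shared = self.shared.expand(token_ids.shape[0], -1)
+        return torch.cat([shared, self.random_parts[token_ids]], dim=-1)
+
+emb = DualPartEmbedding(num_tokens=10, shared_dim=48, random_dim=16)
+print(emb(torch.tensor([0, 3, 3, 7])).shape)   # -> torch.Size([4, 64])
+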
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Improving Pinterest Search Relevance Using Large Language Models CIKM 2024 + + +
+ To improve relevance scoring on Pinterest Search, we integrate Large Language +Models (LLMs) into our search relevance model, leveraging carefully designed +text representations to predict the relevance of Pins effectively. Our approach +uses search queries alongside content representations that include captions +extracted from a generative visual language model. These are further enriched +with link-based text data, historically high-quality engaged queries, +user-curated boards, Pin titles and Pin descriptions, creating robust models +for predicting search relevance. We use a semi-supervised learning approach to +efficiently scale up the amount of training data, expanding beyond the +expensive human labeled data available. By utilizing multilingual LLMs, our +system extends training data to include unseen languages and domains, despite +initial data and annotator expertise being confined to English. Furthermore, we +distill from the LLM-based model into real-time servable model architectures +and features. We provide comprehensive offline experimental validation for our +proposed techniques and demonstrate the gains achieved through the final +deployed system at scale. + +
+
+ comment: CIKM 2024 Workshop on Industrial Recommendation Systems +
+
+
+
+
+ + ☆ Can General-Purpose Large Language Models Generalize to English-Thai + Machine Translation ? EMNLP 2024 + + +
+ Large language models (LLMs) perform well on common tasks but struggle with +generalization in low-resource and low-computation settings. We examine this +limitation by testing various LLMs and specialized translation models on +English-Thai machine translation and code-switching datasets. Our findings +reveal that under more strict computational constraints, such as 4-bit +quantization, LLMs fail to translate effectively. In contrast, specialized +models, with comparable or lower computational requirements, consistently +outperform LLMs. This underscores the importance of specialized models for +maintaining performance under resource constraints. + +
+
+ comment: Accepted in GenBench EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ NVLM: Open Frontier-Class Multimodal LLMs + + +
+ We introduce NVLM 1.0, a family of frontier-class multimodal large language +models (LLMs) that achieve state-of-the-art results on vision-language tasks, +rivaling the leading proprietary models (e.g., GPT-4o) and open-access models +(e.g., Llama 3-V 405B and InternVL 2). Remarkably, NVLM 1.0 shows improved +text-only performance over its LLM backbone after multimodal training. In terms +of model design, we perform a comprehensive comparison between decoder-only +multimodal LLMs (e.g., LLaVA) and cross-attention-based models (e.g., +Flamingo). Based on the strengths and weaknesses of both approaches, we propose +a novel architecture that enhances both training efficiency and multimodal +reasoning capabilities. Furthermore, we introduce a 1-D tile-tagging design for +tile-based dynamic high-resolution images, which significantly boosts +performance on multimodal reasoning and OCR-related tasks. Regarding training +data, we meticulously curate and provide detailed information on our multimodal +pretraining and supervised fine-tuning datasets. Our findings indicate that +dataset quality and task diversity are more important than scale, even during +the pretraining phase, across all architectures. Notably, we develop +production-grade multimodality for the NVLM-1.0 models, enabling them to excel +in vision-language tasks while maintaining and even improving text-only +performance compared to their LLM backbones. To achieve this, we craft and +integrate a high-quality text-only dataset into multimodal training, alongside +a substantial amount of multimodal math and reasoning data, leading to enhanced +math and coding capabilities across modalities. To advance research in the +field, we release the model weights at https://huggingface.co/nvidia/NVLM-D-72B +and will open-source the training code for the community soon. + +
+
+ comment: Fixed the typos. For more information, please visit our project page + at: https://research.nvidia.com/labs/adlr/NVLM-1 +
+
+
+
+
+ + ♻ ☆ How to Evaluate Reward Models for RLHF + + +
+ We introduce a new benchmark for reward models that quantifies their ability
+to produce strong language models through RLHF (Reinforcement Learning from
+Human Feedback). The gold-standard approach is to run a full RLHF training
+pipeline and directly probe downstream LLM performance. However, this process
+is prohibitively expensive. To address this, we build a predictive model of
+downstream LLM performance by evaluating the reward model on proxy tasks. These
+proxy tasks consist of a large-scale human preference dataset and a verifiable
+correctness preference dataset, in which we measure 12 metrics across 12
+domains. To investigate which reward model metrics are most correlated to
+gold-standard RLHF outcomes, we launch an end-to-end RLHF experiment on a
+large-scale crowdsourced human preference platform to view real reward model
+downstream performance as ground truth. Ultimately, we compile our data and
+findings into Preference Proxy Evaluations (PPE), the first reward model
+benchmark explicitly linked to post-RLHF real-world human preference
+performance, which we open-source for public use and further development. Our
+code and evaluations can be found at https://github.com/lmarena/PPE .
+
+
+
+
+
+
+ + ♻ ☆ A Comparative Study on Reasoning Patterns of OpenAI's o1 Model + + +
+ Enabling Large Language Models (LLMs) to handle a wider range of complex +tasks (e.g., coding, math) has drawn great attention from many researchers. As +LLMs continue to evolve, merely increasing the number of model parameters +yields diminishing performance improvements and heavy computational costs. +Recently, OpenAI's o1 model has shown that inference strategies (i.e., +Test-time Compute methods) can also significantly enhance the reasoning +capabilities of LLMs. However, the mechanisms behind these methods are still +unexplored. In our work, to investigate the reasoning patterns of o1, we +compare o1 with existing Test-time Compute methods (BoN, Step-wise BoN, Agent +Workflow, and Self-Refine) by using OpenAI's GPT-4o as a backbone on general +reasoning benchmarks in three domains (i.e., math, coding, commonsense +reasoning). Specifically, first, our experiments show that the o1 model has +achieved the best performance on most datasets. Second, as for the methods of +searching diverse responses (e.g., BoN), we find the reward models' capability +and the search space both limit the upper boundary of these methods. Third, as +for the methods that break the problem into many sub-problems, the Agent +Workflow has achieved better performance than Step-wise BoN due to the +domain-specific system prompt for planning better reasoning processes. Fourth, +it is worth mentioning that we have summarized six reasoning patterns of o1, +and provided a detailed analysis on several reasoning benchmarks. + +
+
+
+
+
+ + ♻ ☆ NutriBench: A Dataset for Evaluating Large Language Models in + Carbohydrate Estimation from Meal Descriptions + + +
+ Accurate nutrition estimation helps people make informed dietary choices and +is essential in the prevention of serious health complications. We present +NutriBench, the first publicly available natural language meal description +nutrition benchmark. NutriBench consists of 11,857 meal descriptions generated +from real-world global dietary intake data. The data is human-verified and +annotated with macro-nutrient labels, including carbohydrates, proteins, fats, +and calories. We conduct an extensive evaluation of NutriBench on the task of +carbohydrate estimation, testing twelve leading Large Language Models (LLMs), +including GPT-4o, Llama3.1, Qwen2, Gemma2, and OpenBioLLM models, using +standard, Chain-of-Thought and Retrieval-Augmented Generation strategies. +Additionally, we present a study involving professional nutritionists, finding +that LLMs can provide more accurate and faster estimates. Finally, we perform a +real-world risk assessment by simulating the effect of carbohydrate predictions +on the blood glucose levels of individuals with diabetes. Our work highlights +the opportunities and challenges of using LLMs for nutrition estimation, +demonstrating their potential to aid professionals and laypersons and improve +health outcomes. Our benchmark is publicly available at: +https://mehak126.github.io/nutribench.html + +
+
+
+
+
+ + ♻ ☆ Characterizing the Accuracy -- Efficiency Trade-off of Low-rank + Decomposition in Language Models + + +
+ Recent large language models (LLMs) employ billions of parameters to enable +broad problem-solving capabilities. Such language models also tend to be +memory-bound because of the dominance of matrix-vector and matrix-matrix +multiplications with low arithmetic intensity. Therefore, optimizing the memory +footprint and traffic is an important optimization direction for LLMs today. +Model compression methods such as quantization and parameter pruning have been +actively explored to achieve memory footprint and traffic optimization. +However, the accuracy-efficiency trade-off of rank pruning (i.e., low-rank +decomposition) for LLMs is not well-understood yet. Therefore, in this work, we +characterize the accuracy-efficiency trade-off of a low-rank decomposition +method, specifically Tucker decomposition, on recent language models, including +an open-source LLM, Llama 2. We formalize the low-rank decomposition design +space and show that the decomposition design space is enormous (e.g., +O($2^{39}$) for Llama2-7B). To navigate such a vast design space, we formulate +it and perform thorough case studies of accuracy-efficiency trade-offs using +six widely used LLM benchmarks on BERT and Llama 2 models. Our results show +that we can achieve a 9\% model size reduction with minimal accuracy drops, +which range from 4\%p (\%p refers to "percentage point," which refers to the +absolute difference between two percentage numbers; 74\% -> 78\% = 4\%p +increase) to 10\%p, depending on the difficulty of the benchmark, without any +retraining to recover accuracy after decomposition. The results show that +low-rank decomposition can be a promising direction for LLM-based applications +that require real-time service at scale (e.g., AI agent and real-time coding +assistant), where the latency is as important as the model accuracy. + +
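+
+ The paper characterizes Tucker decomposition on full models; the toy sketch
+below instead uses truncated SVD on a single weight matrix, the simplest
+low-rank special case, just to make the rank versus parameter-count versus
+reconstruction-error trade-off concrete.
+
+import numpy as np
+
+def low_rank_factorize(W: np.ndarray, rank: int):
+    """Replace a dense weight with two thin factors via truncated SVD."""
+    U, S, Vt = np.linalg.svd(W, full_matrices=False)
+    A = U[:, :rank] * S[:rank]          # shape (out_dim, rank)
+    B = Vt[:rank, :]                    # shape (rank, in_dim)
+    return A, B
+
+rng = np.random.default_rng(0)
+W = rng.standard_normal((1024, 1024))
+for rank in (16, 64, 256):
+    A, B = low_rank_factorize(W, rank)
+    params = (A.size + B.size) / W.size
+    err = np.linalg.norm(W - A @ B) / np.linalg.norm(W)
+    print(f"rank={rank:4d}  params={params:.2f}x  rel_error={err:.3f}")
+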
+
+
+
+
+ + ♻ ☆ LLM Processes: Numerical Predictive Distributions Conditioned on Natural + Language + + +
+ Machine learning practitioners often face significant challenges in formally +integrating their prior knowledge and beliefs into predictive models, limiting +the potential for nuanced and context-aware analyses. Moreover, the expertise +needed to integrate this prior knowledge into probabilistic modeling typically +limits the application of these models to specialists. Our goal is to build a +regression model that can process numerical data and make probabilistic +predictions at arbitrary locations, guided by natural language text which +describes a user's prior knowledge. Large Language Models (LLMs) provide a +useful starting point for designing such a tool since they 1) provide an +interface where users can incorporate expert insights in natural language and +2) provide an opportunity for leveraging latent problem-relevant knowledge +encoded in LLMs that users may not have themselves. We start by exploring +strategies for eliciting explicit, coherent numerical predictive distributions +from LLMs. We examine these joint predictive distributions, which we call LLM +Processes, over arbitrarily-many quantities in settings such as forecasting, +multi-dimensional regression, black-box optimization, and image modeling. We +investigate the practical details of prompting to elicit coherent predictive +distributions, and demonstrate their effectiveness at regression. Finally, we +demonstrate the ability to usefully incorporate text into numerical +predictions, improving predictive performance and giving quantitative structure +that reflects qualitative descriptions. This lets us begin to explore the rich, +grounded hypothesis space that LLMs implicitly encode. + +
+
+
+
+
+ + ♻ ☆ Token-wise Influential Training Data Retrieval for Large Language Models ACL 2024 + + +
+ Given a Large Language Model (LLM) generation, how can we identify which
+training data led to this generation? In this paper, we propose RapidIn, a
+scalable framework adapted to LLMs for estimating the influence of each piece
+of training data. The proposed framework consists of two stages: caching and
+retrieval. First, we compress the gradient vectors by over 200,000x, allowing
+them to be cached on disk or in GPU/CPU memory. Then, given a generation,
+RapidIn efficiently traverses the cached gradients to estimate the influence
+within minutes, achieving over a 6,326x speedup. Moreover, RapidIn supports
+multi-GPU parallelization to substantially accelerate caching and retrieval.
+Our empirical results confirm the efficiency and effectiveness of RapidIn.
+
+
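+
+ The cache-then-retrieve pattern described above can be sketched as follows;
+the random-projection compression and inner-product influence score are
+stand-ins for illustration, not RapidIn's actual compression scheme or
+influence estimator.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def compress(grad: np.ndarray, proj: np.ndarray) -> np.ndarray:
+    """Compress a flattened per-example gradient with a random projection."""
+    return proj @ grad
+
+# caching stage: store one compressed gradient per training example
+d, k, n_train = 10_000, 64, 1_000
+proj = rng.standard_normal((k, d)) / np.sqrt(k)
+train_grads = rng.standard_normal((n_train, d))   # placeholder gradients
+cache = np.stack([compress(g, proj) for g in train_grads])
+
+# retrieval stage: approximate the influence of each training example on a
+# given generation by the inner product with the generation's gradient
+gen_grad = compress(rng.standard_normal(d), proj)
+scores = cache @ gen_grad
+print("most influential training examples:", np.argsort(scores)[-5:][::-1])
+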
+
+ comment: Accepted to ACL 2024. Keywords: Influence Function, Influence + Estimation, Training Data Attribution +
+
+
+
+
+ + ♻ ☆ Learning to Poison Large Language Models During Instruction Tuning + + +
+ The advent of Large Language Models (LLMs) has marked significant +achievements in language processing and reasoning capabilities. Despite their +advancements, LLMs face vulnerabilities to data poisoning attacks, where +adversaries insert backdoor triggers into training data to manipulate outputs +for malicious purposes. This work further identifies additional security risks +in LLMs by designing a new data poisoning attack tailored to exploit the +instruction tuning process. We propose a novel gradient-guided backdoor trigger +learning (GBTL) algorithm to identify adversarial triggers efficiently, +ensuring an evasion of detection by conventional defenses while maintaining +content integrity. Through experimental validation across various tasks, +including sentiment analysis, domain generation, and question answering, our +poisoning strategy demonstrates a high success rate in compromising various +LLMs' outputs. We further propose two defense strategies against data poisoning +attacks, including in-context learning (ICL) and continuous learning (CL), +which effectively rectify the behavior of LLMs and significantly reduce the +decline in performance. Our work highlights the significant security risks +present during the instruction tuning of LLMs and emphasizes the necessity of +safeguarding LLMs against data poisoning attacks. + +
+
+
+
+
+ + ♻ ☆ On the Diversity of Synthetic Data and its Impact on Training Large + Language Models + + +
+ The rise of Large Language Models (LLMs) has accentuated the need for +diverse, high-quality pre-training data. Synthetic data emerges as a viable +solution to the challenges of data scarcity and inaccessibility. While previous +literature has focused predominantly on the quality and quantity of real data, +our work enables the measurement of diversity in synthetic data and explores +its impact on LLM performance. We study the downstream effects of synthetic +data diversity during both the pre-training and fine-tuning stages by +introducing a new diversity metric, \textit{LLM cluster-agent}, designed to +evaluate the diversity of synthetic datasets. Through a series of controlled +experiments with models of 350M and 1.4B parameters, we demonstrate that the +proposed cluster-based LLM scoring of diversity correlates positively with both +pre-training and supervised fine-tuning performance. Our findings also reveal +that synthetic data diversity in pre-training affects supervised fine-tuning +more significantly than pre-training itself, even for smaller models. We hope +this study advances our understanding of the optimal use of synthetic data in +LLM training and opens new avenues for efficient data generation processes. + +
+
+
+
+
+ + ♻ ☆ CDQuant: Greedy Coordinate Descent for Accurate LLM Quantization + + +
+ Large language models (LLMs) have recently demonstrated remarkable +performance across diverse language tasks. But their deployment is often +constrained by their substantial computational and storage requirements. +Quantization has emerged as a key technique for addressing this challenge, +enabling the compression of large models with minimal impact on performance. +The recent GPTQ algorithm, a post-training quantization (PTQ) method, has +proven highly effective for compressing LLMs, sparking a wave of research that +leverages GPTQ as a core component. Recognizing the pivotal role of GPTQ in the +PTQ landscape, we introduce CDQuant, a simple and scalable alternative to GPTQ +with improved performance. CDQuant uses greedy coordinate descent to minimize +the layer-wise reconstruction loss to achieve high-quality quantized weights. +Our algorithm is easy to implement and scales efficiently to models with +hundreds of billions of parameters. We perform extensive evaluation on Gemma, +and PaLM2 model families, and demonstrate that CDQuant consistently outperforms +GPTQ in 2-4 bit weight quantization. Moreover, CDQuant improves the performance +of state-of-the-art PTQ techniques such as QuIP and FrameQuant when used as a +replacement for their GPTQ component, resulting in further gains in quality. + +
+
+
+
+
+ + ♻ ☆ S2-Attention: Hardware-Aware Context Sharding Among Attention Heads + + +
+ Sparse attention, which selectively attends to a subset of tokens in the
+context, was expected to be efficient. However, its theoretical reduction in
+FLOPs has rarely translated into wall-clock speed-up over its dense attention
+counterparts due to the lack of hardware-aware optimizations like
+FlashAttention. Meanwhile, it remains unclear whether sparse attention can
+maintain the model's quality at the scale of today's large language models
+(LLMs), and if so, how. This paper presents Sparsely-Sharded (S2) Attention, a
+Triton library that provides kernel optimization for sparse attention
+customizable at both per-head and per-context-range levels. S2-Attention
+enables the exploration of novel and high-performance sparse attention
+techniques, which we demonstrate through extensive ablations across a wide
+range of sparse attention designs at various model scales. From these insights,
+we present several basic guidelines to design sparse attention that can achieve
+not only practical efficiency improvements, but also strong downstream
+performance. To achieve high parallelization and optimized memory IO, sparse
+attention should shard the context heterogeneously across attention heads,
+where each head attends to a different subset of tokens while collectively
+covering the full context. Meanwhile, we find hybrid architectures combining
+sparse and dense attention particularly beneficial in practice. S2-Attention
+achieves wall-clock speedups of 8.79X, 15.87X, and 25.3X compared to the strong
+FlashAttention-2 baseline with strong downstream performance on par with full
+attention and perfect retrieval performance at a 128k context length. At
+inference, for 7B models, our model, with the help of our S2-Attention kernel,
+achieves a 4.5x speed-up compared to dense counterparts. S2-Attention is
+released with easy-to-customize APIs for direct usage in Megatron and vLLM.
+
+
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ FLAG: Financial Long Document Classification via AMR-based GNN + + +
+ The advent of large language models (LLMs) has initiated much research into +their various financial applications. However, in applying LLMs on long +documents, semantic relations are not explicitly incorporated, and a full or +arbitrarily sparse attention operation is employed. In recent years, progress +has been made in Abstract Meaning Representation (AMR), which is a graph-based +representation of text to preserve its semantic relations. Since AMR can +represent semantic relationships at a deeper level, it can be beneficially +utilized by graph neural networks (GNNs) for constructing effective +document-level graph representations built upon LLM embeddings to predict +target metrics in the financial domain. We propose FLAG: Financial Long +document classification via AMR-based GNN, an AMR graph based framework to +generate document-level embeddings for long financial document classification. +We construct document-level graphs from sentence-level AMR graphs, endow them +with specialized LLM word embeddings in the financial domain, apply a deep +learning mechanism that utilizes a GNN, and examine the efficacy of our +AMR-based approach in predicting labeled target data from long financial +documents. Extensive experiments are conducted on a dataset of quarterly +earnings calls transcripts of companies in various sectors of the economy, as +well as on a corpus of more recent earnings calls of companies in the S&P 1500 +Composite Index. We find that our AMR-based approach outperforms fine-tuning +LLMs directly on text in predicting stock price movement trends at different +time horizons in both datasets. Our work also outperforms previous work +utilizing document graphs and GNNs for text classification. + +
+
+ comment: 8 pages, 3 figures, to be published in CIFEr Conference 2024 as + "Semantic Graph Learning for Trend Prediction from Long Financial Documents" +
+
+
+
+
+ + ♻ ☆ BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive + Retrieval + + +
+ Existing retrieval benchmarks primarily consist of information-seeking
+queries (e.g., aggregated questions from search engines) where keyword or
+semantic-based retrieval is usually sufficient. However, many complex
+real-world queries require in-depth reasoning to identify relevant documents
+that go beyond surface form matching. For example, finding documentation for a
+coding question requires understanding the logic and syntax of the functions
+involved. To better benchmark retrieval on such challenging queries, we
+introduce BRIGHT, the first text retrieval benchmark that requires intensive
+reasoning to retrieve relevant documents. Our dataset consists of 1,384
+real-world queries spanning diverse domains, such as economics, psychology,
+mathematics, and coding. These queries are drawn from naturally occurring and
+carefully curated human data. Extensive evaluation reveals that even
+state-of-the-art retrieval models perform poorly on BRIGHT. The leading model
+on the MTEB leaderboard (Muennighoff et al., 2023), which achieves a score of
+59.0 nDCG@10, scores only 18.3 nDCG@10 on BRIGHT. We show that incorporating
+explicit reasoning about the query improves retrieval performance by up to 12.2
+points. Moreover, incorporating retrieved documents from the top-performing
+retriever boosts question-answering performance by over 6.6 points. We believe
+that BRIGHT paves the way for future research on retrieval systems in more
+realistic and challenging settings.
+
+
+
+ comment: 48 pages +
+
+
+
+
+ + ♻ ☆ Context-Parametric Inversion: Why Instruction Finetuning May Not + Actually Improve Context Reliance + + +
+ A standard practice when using large language models is for users to
+supplement their instruction with an input context containing new information
+for the model to process. However, models struggle to reliably follow the input
+context, especially when it conflicts with their parametric knowledge from
+pretraining. In principle, one would expect models to adapt to the user context
+better after instruction finetuning, particularly when handling knowledge
+conflicts. However, we observe a surprising failure mode: during instruction
+tuning, the context reliance under knowledge conflicts initially increases as
+expected, but then gradually decreases as instruction finetuning progresses.
+This happens even as performance on standard benchmarks keeps increasing well
+after this drop. We call this phenomenon context-parametric inversion and
+observe it across multiple general purpose instruction tuning datasets such as
+TULU, Alpaca and Ultrachat, across different model families like Llama,
+Mistral, and Pythia. We perform various controlled studies and theoretical
+analysis to show that context-parametric inversion occurs due to examples in
+the instruction finetuning data where the input context provides information
+that aligns with the model's parametric knowledge. Our analysis suggests some
+natural mitigation strategies with limited but insightful gains, and serves as
+a useful starting point in addressing this deficiency in instruction
+finetuning.
+
+
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ The Causal Influence of Grammatical Gender on Distributional Semantics + + +
+ How much meaning influences gender assignment across languages is an active +area of research in linguistics and cognitive science. We can view current +approaches as aiming to determine where gender assignment falls on a spectrum, +from being fully arbitrarily determined to being largely semantically +determined. For the latter case, there is a formulation of the neo-Whorfian +hypothesis, which claims that even inanimate noun gender influences how people +conceive of and talk about objects (using the choice of adjective used to +modify inanimate nouns as a proxy for meaning). We offer a novel, causal +graphical model that jointly represents the interactions between a noun's +grammatical gender, its meaning, and adjective choice. In accordance with past +results, we find a significant relationship between the gender of nouns and the +adjectives that modify them. However, when we control for the meaning of the +noun, the relationship between grammatical gender and adjective choice is near +zero and insignificant. + +
+
+
+
+
+ + ♻ ☆ Can Large Language Models Identify Authorship? EMNLP 2024 + + +
+ The ability to accurately identify authorship is crucial for verifying
+content authenticity and mitigating misinformation. Large Language Models
+(LLMs) have demonstrated an exceptional capacity for reasoning and
+problem-solving. However, their potential in authorship analysis remains
+under-explored. Traditional studies have depended on hand-crafted stylistic
+features, whereas state-of-the-art approaches leverage text embeddings from
+pre-trained language models. These methods, which typically require fine-tuning
+on labeled data, often suffer from performance degradation in cross-domain
+applications and provide limited explainability. This work seeks to address
+three research questions: (1) Can LLMs perform zero-shot, end-to-end authorship
+verification effectively? (2) Are LLMs capable of accurately attributing
+authorship among multiple candidate authors (e.g., 10 and 20)? (3) Can LLMs
+provide explainability in authorship analysis, particularly through the role of
+linguistic features? Moreover, we investigate the integration of explicit
+linguistic features to guide LLMs in their reasoning processes. Our assessment
+demonstrates LLMs' proficiency in both tasks without the need for
+domain-specific fine-tuning, providing explanations for their decision making
+via a detailed analysis of linguistic features. This establishes a new
+benchmark for future research on LLM-based authorship analysis.
+
+
+
+ comment: Accepted to EMNLP 2024 Findings. The main paper is 9 pages long, with + 16 pages total. The code, results, dataset, and additional resources are + available on the project website: https://llm-authorship.github.io/ +
+
+
+
+
+ + ♻ ☆ The Impact of Large Language Models in Academia: from Writing to + Speaking + + +
+ Large language models (LLMs) are increasingly impacting human society, +particularly in textual information. Based on more than 30,000 papers and 1,000 +presentations from machine learning conferences, we examined and compared the +words used in writing and speaking, representing the first large-scale study of +how LLMs influence the two main modes of verbal communication and expression +within the same group of people. Our empirical results show that LLM-style +words such as "significant" have been used more frequently in abstracts and +oral presentations. The impact on speaking is beginning to emerge and is likely +to grow in the future, calling attention to the implicit influence and ripple +effect of LLMs on human society. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ♻ ☆ Levels of AI Agents: from Rules to Large Language Models + + +
+ AI agents are defined as artificial entities that perceive the environment,
+make decisions and take actions. Inspired by the six levels of autonomous
+driving defined by the Society of Automotive Engineers, AI agents are likewise
+categorized by utility and capability into the following levels: L0, no AI,
+with tools covering perception plus actions; L1, using rule-based AI; L2,
+replacing rule-based AI with IL/RL-based AI and adding reasoning and decision
+making; L3, applying LLM-based AI instead of IL/RL-based AI and additionally
+setting up memory and reflection; L4, building on L3 by facilitating autonomous
+learning and generalization; L5, building on L4 by adding personality (emotion
+and character) and collaborative behavior with multiple agents.
+
+
+
+
+
+ + ♻ ☆ LLMs left, right, and center: Assessing GPT's capabilities to label + political bias from web domains + + +
+ This research investigates whether OpenAI's GPT-4, a state-of-the-art large
+language model, can accurately classify the political bias of news sources
+based solely on their URLs. Given the subjective nature of political labels,
+third-party bias ratings like those from Ad Fontes Media, AllSides, and Media
+Bias/Fact Check (MBFC) are often used in research to analyze news source
+diversity. This study aims to determine if GPT-4 can replicate these human
+ratings on a seven-degree scale ("far-left" to "far-right"). The analysis
+compares GPT-4's classifications against MBFC's, and controls for website
+popularity using Open PageRank scores. Findings reveal a high correlation
+($\text{Spearman's } \rho = .89$, $n = 5,877$, $p < 0.001$) between GPT-4's and
+MBFC's ratings, indicating the model's potential reliability. However, GPT-4
+abstained from classifying approximately $\frac{2}{3}$ of the dataset. It is
+more likely to abstain from rating unpopular websites, which also suffer from
+less accurate assessments. The LLM tends to avoid classifying sources that MBFC
+considers to be centrist, resulting in more polarized outputs. Finally, this
+analysis shows a slight leftward skew in GPT's classifications compared to
+MBFC's. Therefore, while this paper suggests that GPT-4 can be a scalable,
+cost-effective tool for political bias classification of news websites, it
+should be used as a complement to human judgment to mitigate biases.
+
+
+ comment: 12 pages, 4 figures +
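+ To make the headline statistic concrete, the following minimal Python sketch
+(not the paper's code) computes Spearman's rho between two sets of seven-point
+bias labels; the label-to-score mapping and the toy data are illustrative
+assumptions only.
+
+from scipy.stats import spearmanr
+
+# Hypothetical mapping of the seven-degree scale to ordinal scores.
+SCALE = {"far-left": -3, "left": -2, "lean-left": -1, "center": 0,
+         "lean-right": 1, "right": 2, "far-right": 3}
+
+gpt_labels = ["left", "center", "far-right", "lean-left", "right"]   # toy data
+mbfc_labels = ["lean-left", "center", "right", "left", "right"]      # toy data
+
+rho, p_value = spearmanr([SCALE[l] for l in gpt_labels],
+                         [SCALE[l] for l in mbfc_labels])
+print(f"Spearman's rho = {rho:.2f}, p = {p_value:.3f}")
+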
+
+
+
+
+ + ♻ ☆ Toolshed: Scale Tool-Equipped Agents with Advanced RAG-Tool Fusion and + Tool Knowledge Bases + + +
+ Recent advancements in tool-equipped Agents (LLMs) have enabled complex tasks +like secure database interactions and multi-agent code development. However, +scaling tool capacity beyond agent reasoning or model limits remains a +challenge. In this paper, we address these challenges by introducing Toolshed +Knowledge Bases, a tool knowledge base (vector database) designed to store +enhanced tool representations and optimize tool selection for large-scale +tool-equipped Agents. Additionally, we propose Advanced RAG-Tool Fusion, a +novel ensemble of tool-applied advanced retrieval-augmented generation (RAG) +techniques across the pre-retrieval, intra-retrieval, and post-retrieval +phases, without requiring model fine-tuning. During pre-retrieval, tool +documents are enhanced with key information and stored in the Toolshed +Knowledge Base. Intra-retrieval focuses on query planning and transformation to +increase retrieval accuracy. Post-retrieval refines the retrieved tool +documents and enables self-reflection. Furthermore, by varying both the total +number of tools (tool-M) an Agent has access to and the tool selection +threshold (top-k), we address trade-offs between retrieval accuracy, agent +performance, and token cost. Our approach achieves 46%, 56%, and 47% absolute +improvements on the ToolE single-tool, ToolE multi-tool and Seal-Tools +benchmark datasets, respectively (Recall@5). + +
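+ The RAG-Tool Fusion pipeline itself is not reproduced here; the sketch below
+only illustrates the underlying retrieval step of a tool knowledge base, namely
+ranking tool documents by cosine similarity to a query and keeping the top-k.
+The toy hashing "embedder" and the example tool documents are assumptions, not
+part of Toolshed.
+
+import numpy as np
+
+def embed(text: str, dim: int = 64) -> np.ndarray:
+    """Toy bag-of-words hashing embedding; a real system would call a
+    text-embedding model instead."""
+    vec = np.zeros(dim)
+    for token in text.lower().split():
+        vec[hash(token) % dim] += 1.0
+    norm = np.linalg.norm(vec)
+    return vec / norm if norm else vec
+
+tool_docs = {
+    "sql_query": "run a read-only SQL query against the analytics database",
+    "send_email": "send an email to a given address with subject and body",
+    "web_search": "search the web and return the top result snippets",
+}
+names = list(tool_docs)
+doc_matrix = np.stack([embed(d) for d in tool_docs.values()])
+
+def retrieve_tools(query: str, top_k: int = 2) -> list:
+    scores = doc_matrix @ embed(query)        # cosine similarity of unit vectors
+    return [names[i] for i in np.argsort(-scores)[:top_k]]
+
+print(retrieve_tools("look up revenue numbers in the warehouse"))
+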
+
+
+
+
+ + ♻ ☆ NaturalBench: Evaluating Vision-Language Models on Natural Adversarial + Samples NeurIPS 24 + + +
+ Vision-language models (VLMs) have made significant progress in recent +visual-question-answering (VQA) benchmarks that evaluate complex +visio-linguistic reasoning. However, are these models truly effective? In this +work, we show that VLMs still struggle with natural images and questions that +humans can easily answer, which we term natural adversarial samples. We also +find it surprisingly easy to generate these VQA samples from natural image-text +corpora using off-the-shelf models like CLIP and ChatGPT. We propose a +semi-automated approach to collect a new benchmark, NaturalBench, for reliably +evaluating VLMs with 10,000 human-verified VQA samples. Crucially, we adopt a +$\textbf{vision-centric}$ design by pairing each question with two images that +yield different answers, preventing blind solutions from answering without +using the images. This makes NaturalBench more challenging than previous +benchmarks that can be solved with commonsense priors. We evaluate 53 +state-of-the-art VLMs on NaturalBench, showing that models like +LLaVA-OneVision, Cambrian-1, Llama3.2-Vision, Molmo, Qwen2-VL, and even GPT-4o +lag 50%-70% behind human performance (over 90%). We analyze why NaturalBench is +hard from two angles: (1) Compositionality: Solving NaturalBench requires +diverse visio-linguistic skills, including understanding attribute bindings, +object relationships, and advanced reasoning like logic and counting. To this +end, unlike prior work that uses a single tag per sample, we tag each +NaturalBench sample with 1 to 8 skill tags for fine-grained evaluation. (2) +Biases: NaturalBench exposes severe biases in VLMs, as models often choose the +same answer regardless of the image. Lastly, we apply our benchmark curation +method to diverse data sources, including long captions (over 100 words) and +non-English languages like Chinese and Hindi, highlighting its potential for +dynamic evaluations of VLMs. + +
+
+ comment: Accepted to NeurIPS 24; We open-source our dataset at: + https://huggingface.co/datasets/BaiqiL/NaturalBench ; Project page at: + https://linzhiqiu.github.io/papers/naturalbench/ +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 26 + +
+
+
+ + ☆ LongVU: Spatiotemporal Adaptive Compression for Long Video-Language + Understanding + + +
+ Multimodal Large Language Models (MLLMs) have shown promising progress in
+understanding and analyzing video content. However, processing long videos
+remains a significant challenge constrained by the LLM's context size. To
+address this limitation, we propose LongVU, a spatiotemporal adaptive
+compression mechanism that reduces the number of video tokens while preserving
+visual details of long videos. Our idea is based on leveraging cross-modal
+query and inter-frame dependencies to adaptively reduce temporal and spatial
+redundancy in videos. Specifically, we leverage DINOv2 features to remove
+redundant frames that exhibit high similarity. Then we utilize text-guided
+cross-modal query for selective frame feature reduction. Further, we perform
+spatial token reduction across frames based on their temporal dependencies. Our
+adaptive compression strategy effectively processes a large number of frames
+with little visual information loss within a given context length. Our LongVU
+consistently surpasses existing methods across a variety of video understanding
+benchmarks, especially on hour-long video understanding tasks such as VideoMME
+and MLVU. Given a light-weight LLM, our LongVU also scales effectively to a
+smaller size with state-of-the-art video understanding performance.
+
+
+ comment: Project page: https://vision-cair.github.io/LongVU +
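+ As a minimal sketch of one ingredient described above (pruning near-duplicate
+frames by feature similarity), the following Python snippet keeps a frame only
+if its feature vector is sufficiently dissimilar from the last kept frame. The
+random features and the 0.85 threshold are placeholders; the paper uses DINOv2
+features and its own selection criteria.
+
+import numpy as np
+
+def prune_redundant_frames(features: np.ndarray, sim_threshold: float = 0.85):
+    """features: (num_frames, dim) array, one feature vector per frame."""
+    normed = features / np.linalg.norm(features, axis=1, keepdims=True)
+    kept = [0]
+    for i in range(1, len(normed)):
+        # keep frame i only if it is not too similar to the last kept frame
+        if normed[i] @ normed[kept[-1]] < sim_threshold:
+            kept.append(i)
+    return kept
+
+rng = np.random.default_rng(0)
+frame_feats = rng.normal(size=(100, 384))     # stand-in for per-frame features
+print(prune_redundant_frames(frame_feats))
+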
+
+
+
+
+ + ☆ SigCLR: Sigmoid Contrastive Learning of Visual Representations + + +
+ We propose SigCLR: Sigmoid Contrastive Learning of Visual Representations.
+SigCLR utilizes a logistic loss that operates only on pairs and does not
+require a global view, unlike the cross-entropy loss used in SimCLR. We show
+that the logistic loss shows competitive performance on CIFAR-10, CIFAR-100,
+and Tiny-IN compared to other established SSL objectives. Our findings verify
+the importance of a learnable bias, as in the case of SigLIP; however, SigCLR
+requires a fixed temperature, as in SimCLR, to excel. Overall, SigCLR is a
+promising replacement for SimCLR, which is ubiquitous and has shown tremendous
+success in various domains.
+
+
+ comment: Neurips 2024 SSL Workshop +
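+ The official SigCLR code is not shown here; the following PyTorch sketch only
+illustrates the general shape of a pairwise sigmoid (logistic) contrastive loss
+over two augmented views, with a fixed temperature and a learnable bias.
+Hyperparameters and the exact formulation are assumptions for illustration.
+
+import torch
+import torch.nn.functional as F
+
+def sigmoid_contrastive_loss(z1, z2, bias, temperature=0.1):
+    """z1, z2: (batch, dim) embeddings of two views of the same images."""
+    z1, z2 = F.normalize(z1, dim=-1), F.normalize(z2, dim=-1)
+    logits = z1 @ z2.t() / temperature + bias              # (batch, batch)
+    # +1 on the diagonal (matching pairs), -1 elsewhere (non-matching pairs)
+    labels = 2 * torch.eye(z1.size(0), device=z1.device) - 1
+    return -F.logsigmoid(labels * logits).mean()
+
+z1, z2 = torch.randn(8, 128), torch.randn(8, 128)
+bias = torch.nn.Parameter(torch.tensor(-10.0))             # learnable bias term
+loss = sigmoid_contrastive_loss(z1, z2, bias)
+loss.backward()
+print(float(loss), float(bias.grad))
+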
+
+
+
+
+ + ☆ AG-SLAM: Active Gaussian Splatting SLAM + + +
+ We present AG-SLAM, the first active SLAM system utilizing 3D Gaussian
+Splatting (3DGS) for online scene reconstruction. In recent years, radiance
+field scene representations, including 3DGS, have been widely used in SLAM and
+exploration, but actively planning trajectories for robotic exploration remains
+largely unexplored. In particular, many exploration methods assume precise
+localization and thus do not mitigate the significant risk of constructing
+trajectories that are difficult for a SLAM system to operate on. This can cause
+camera tracking failure and lead to failures in real-world robotic
+applications. Our method leverages Fisher Information to balance the dual
+objectives of maximizing the information gain for the environment while
+minimizing the cost of localization errors. Experiments conducted on the Gibson
+and Habitat-Matterport 3D datasets demonstrate state-of-the-art results of the
+proposed method.
+
+
+
+
+
+ + ☆ Geometric Graph Neural Network Modeling of Human Interactions in Crowded + Environments + + +
+ Modeling human trajectories in crowded environments is challenging due to the +complex nature of pedestrian behavior and interactions. This paper proposes a +geometric graph neural network (GNN) architecture that integrates domain +knowledge from psychological studies to model pedestrian interactions and +predict future trajectories. Unlike prior studies using complete graphs, we +define interaction neighborhoods using pedestrians' field of view, motion +direction, and distance-based kernel functions to construct graph +representations of crowds. Evaluations across multiple datasets demonstrate +improved prediction accuracy through reduced average and final displacement +error metrics. Our findings underscore the importance of integrating domain +knowledge with data-driven approaches for effective modeling of human +interactions in crowds. + +
+
+ comment: \c{opyright} 2024 the authors. This work has been accepted to IFAC + for publication under a Creative Commons Licence CC-BY-NC-ND +
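+ A small Python sketch of the neighborhood construction described above (not
+the paper's implementation): pedestrian j is connected to i only if j lies
+within i's field of view and within a distance cutoff. The 120-degree angle and
+5-meter radius are arbitrary illustrative values.
+
+import numpy as np
+
+def interaction_edges(pos, vel, fov_deg=120.0, radius=5.0):
+    """pos, vel: (N, 2) arrays of pedestrian positions and velocities."""
+    edges = []
+    for i in range(len(pos)):
+        heading = vel[i] / (np.linalg.norm(vel[i]) + 1e-8)
+        for j in range(len(pos)):
+            if i == j:
+                continue
+            offset = pos[j] - pos[i]
+            dist = np.linalg.norm(offset)
+            if dist > radius:
+                continue
+            cos_angle = heading @ (offset / (dist + 1e-8))
+            if cos_angle >= np.cos(np.deg2rad(fov_deg / 2)):
+                edges.append((i, j))            # j is in i's interaction set
+    return edges
+
+rng = np.random.default_rng(1)
+print(interaction_edges(rng.uniform(0, 10, (6, 2)), rng.normal(size=(6, 2))))
+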
+
+
+
+
+ + ☆ Efficient Feature Extraction Using Light-Weight CNN Attention-Based Deep + Learning Architectures for Ultrasound Fetal Plane Classification + + +
+ Ultrasound fetal imaging is beneficial for monitoring prenatal development
+because it is affordable and non-intrusive. Nevertheless, fetal plane
+classification (FPC) remains challenging and time-consuming for obstetricians
+since it depends on nuanced clinical aspects, which increases the difficulty in
+identifying relevant features of the fetal anatomy. Thus, to assist with
+accurate feature extraction, a lightweight artificial intelligence architecture
+leveraging convolutional neural networks and attention mechanisms is proposed
+to classify the largest benchmark ultrasound dataset. The approach fine-tunes
+lightweight EfficientNet feature-extraction backbones pre-trained on ImageNet1k
+to classify key fetal planes such as the brain, femur, thorax, cervix, and
+abdomen. Our methodology incorporates an attention mechanism to refine features
+and a 3-layer perceptron for classification, achieving superior performance
+with the highest Top-1 accuracy of 96.25%, Top-2 accuracy of 99.80% and
+F1-Score of 0.9576. Importantly, the model has 40x fewer trainable parameters
+than existing benchmark ensemble or transformer pipelines, facilitating easy
+deployment on edge devices to help clinical practitioners with real-time FPC.
+The findings are also interpreted using GradCAM to carry out clinical
+correlation, aid doctors with diagnostics, and improve treatment plans for
+expectant mothers.
+
+
+ comment: Submitted to Computers in Biology and Medicine journal +
+
+
+
+
+ + ☆ Denoise-I2W: Mapping Images to Denoising Words for Accurate Zero-Shot + Composed Image Retrieval IJCAI 2024 + + +
+ Zero-Shot Composed Image Retrieval (ZS-CIR) supports diverse tasks with a
+broad range of visual content manipulation intentions that can relate to
+domain, scene, object, and attribute. A key challenge for ZS-CIR is to
+accurately map image representation to a pseudo-word token that captures the
+manipulation-intention-relevant image information for generalized CIR.
+However, in existing methods, the discrepancy between the retrieval and
+pre-training stages leads to significant redundancy in the pseudo-word tokens.
+In this paper, we propose a novel denoising image-to-word mapping approach,
+named Denoise-I2W, for mapping images into denoising pseudo-word tokens that,
+without intention-irrelevant visual information, enable accurate ZS-CIR.
+Specifically, a pseudo triplet construction module first automatically
+constructs pseudo triplets (\textit{i.e.,} a pseudo-reference image, a
+pseudo-manipulation text, and a target image) for pre-training the denoising
+mapping network. Then, a pseudo-composed mapping module maps the
+pseudo-reference image to a pseudo-word token and combines it with the
+pseudo-manipulation text that carries the manipulation intention. This
+combination aligns with the target image, facilitating the denoising of
+intention-irrelevant visual information during mapping. Our proposed
+Denoise-I2W is a model-agnostic and annotation-free approach. It demonstrates
+strong generalization capabilities across three state-of-the-art ZS-CIR models
+on four benchmark datasets. By integrating Denoise-I2W with existing best
+models, we obtain consistent and significant performance boosts ranging from
+1.45\% to 4.17\% over the best methods without increasing inference costs,
+achieving new state-of-the-art results on ZS-CIR. Our code is available at
+\url{https://github.com/Pter61/denoise-i2w-tmm}.
+
+
+ comment: This work was submitted to IJCAI 2024, with a score of weak accept + and borderline accept +
+
+
+
+
+ + ☆ Do Vision-Language Models Represent Space and How? Evaluating Spatial + Frame of Reference Under Ambiguities NeurIPS 2024 + + +
+ Spatial expressions in situated communication can be ambiguous, as their +meanings vary depending on the frames of reference (FoR) adopted by speakers +and listeners. While spatial language understanding and reasoning by +vision-language models (VLMs) have gained increasing attention, potential +ambiguities in these models are still under-explored. To address this issue, we +present the COnsistent Multilingual Frame Of Reference Test (COMFORT), an +evaluation protocol to systematically assess the spatial reasoning capabilities +of VLMs. We evaluate nine state-of-the-art VLMs using COMFORT. Despite showing +some alignment with English conventions in resolving ambiguities, our +experiments reveal significant shortcomings of VLMs: notably, the models (1) +exhibit poor robustness and consistency, (2) lack the flexibility to +accommodate multiple FoRs, and (3) fail to adhere to language-specific or +culture-specific conventions in cross-lingual tests, as English tends to +dominate other languages. With a growing effort to align vision-language models +with human cognitive intuitions, we call for more attention to the ambiguous +nature and cross-cultural diversity of spatial reasoning. + +
+
+ comment: Accepted to Pluralistic Alignment @ NeurIPS 2024 | Project page: + https://spatial-comfort.github.io/ +
+
+
+
+
+ + ☆ PtychoFormer: A Transformer-based Model for Ptychographic Phase + Retrieval + + +
+ Ptychography is a computational method of microscopy that recovers +high-resolution transmission images of samples from a series of diffraction +patterns. While conventional phase retrieval algorithms can iteratively recover +the images, they require oversampled diffraction patterns, incur significant +computational costs, and struggle to recover the absolute phase of the sample's +transmission function. Deep learning algorithms for ptychography are a +promising approach to resolving the limitations of iterative algorithms. We +present PtychoFormer, a hierarchical transformer-based model for data-driven +single-shot ptychographic phase retrieval. PtychoFormer processes subsets of +diffraction patterns, generating local inferences that are seamlessly stitched +together to produce a high-quality reconstruction. Our model exhibits tolerance +to sparsely scanned diffraction patterns and achieves up to 3600 times faster +imaging speed than the extended ptychographic iterative engine (ePIE). We also +propose the extended-PtychoFormer (ePF), a hybrid approach that combines the +benefits of PtychoFormer with the ePIE. ePF minimizes global phase shifts and +significantly enhances reconstruction quality, achieving state-of-the-art phase +retrieval in ptychography. + +
+
+ comment: 20 pages, 12 figures +
+
+
+
+
+ + ☆ Image-aware Evaluation of Generated Medical Reports + + +
+ The paper proposes a novel evaluation metric for automatic medical report +generation from X-ray images, VLScore. It aims to overcome the limitations of +existing evaluation methods, which either focus solely on textual similarities, +ignoring clinical aspects, or concentrate only on a single clinical aspect, the +pathology, neglecting all other factors. The key idea of our metric is to +measure the similarity between radiology reports while considering the +corresponding image. We demonstrate the benefit of our metric through +evaluation on a dataset where radiologists marked errors in pairs of reports, +showing notable alignment with radiologists' judgments. In addition, we provide +a new dataset for evaluating metrics. This dataset includes well-designed +perturbations that distinguish between significant modifications (e.g., removal +of a diagnosis) and insignificant ones. It highlights the weaknesses in current +evaluation metrics and provides a clear framework for analysis. + +
+
+
+
+
+ + ☆ Offline Evaluation of Set-Based Text-to-Image Generation + + +
+ Text-to-Image (TTI) systems often support people during ideation, the early +stages of a creative process when exposure to a broad set of relevant images +can help explore the design space. Since ideation is an important subclass of +TTI tasks, understanding how to quantitatively evaluate TTI systems according +to how well they support ideation is crucial to promoting research and +development for these users. However, existing evaluation metrics for TTI +remain focused on distributional similarity metrics like Fr\'echet Inception +Distance (FID). We take an alternative approach and, based on established +methods from ranking evaluation, develop TTI evaluation metrics with explicit +models of how users browse and interact with sets of spatially arranged +generated images. Our proposed offline evaluation metrics for TTI not only +capture how relevant generated images are with respect to the user's ideation +need but also take into consideration the diversity and arrangement of the set +of generated images. We analyze our proposed family of TTI metrics using human +studies on image grids generated by three different TTI systems based on +subsets of the widely used benchmarks such as MS-COCO captions and Localized +Narratives as well as prompts used in naturalistic settings. Our results +demonstrate that grounding metrics in how people use systems is an important +and understudied area of benchmark design. + +
+
+
+
+
+ + ☆ Altogether: Image Captioning via Re-aligning Alt-text EMNLP 2024 + + +
+ This paper focuses on creating synthetic data to improve the quality of image +captions. Existing works typically have two shortcomings. First, they caption +images from scratch, ignoring existing alt-text metadata, and second, lack +transparency if the captioners' training data (e.g. GPT) is unknown. In this +paper, we study a principled approach Altogether based on the key idea to edit +and re-align existing alt-texts associated with the images. To generate +training data, we perform human annotation where annotators start with the +existing alt-text and re-align it to the image content in multiple rounds, +consequently constructing captions with rich visual concepts. This differs from +prior work that carries out human annotation as a one-time description task +solely based on images and annotator knowledge. We train a captioner on this +data that generalizes the process of re-aligning alt-texts at scale. Our +results show our Altogether approach leads to richer image captions that also +improve text-to-image generation and zero-shot image classification tasks. + +
+
+ comment: accepted by EMNLP 2024; MetaCLIPv2 +
+
+
+
+
+ + ☆ SpectroMotion: Dynamic 3D Reconstruction of Specular Scenes + + +
+ We present SpectroMotion, a novel approach that combines 3D Gaussian +Splatting (3DGS) with physically-based rendering (PBR) and deformation fields +to reconstruct dynamic specular scenes. Previous methods extending 3DGS to +model dynamic scenes have struggled to accurately represent specular surfaces. +Our method addresses this limitation by introducing a residual correction +technique for accurate surface normal computation during deformation, +complemented by a deformable environment map that adapts to time-varying +lighting conditions. We implement a coarse-to-fine training strategy that +significantly enhances both scene geometry and specular color prediction. We +demonstrate that our model outperforms prior methods for view synthesis of +scenes containing dynamic specular objects and that it is the only existing +3DGS method capable of synthesizing photorealistic real-world dynamic specular +scenes, outperforming state-of-the-art methods in rendering complex, dynamic, +and specular scenes. + +
+
+ comment: Project page: https://cdfan0627.github.io/spectromotion/ +
+
+
+
+
+ + ☆ JMMMU: A Japanese Massive Multi-discipline Multimodal Understanding + Benchmark for Culture-aware Evaluation + + +
+ Accelerating research on Large Multimodal Models (LMMs) in non-English +languages is crucial for enhancing user experiences across broader populations. +In this paper, we introduce JMMMU (Japanese MMMU), the first large-scale +Japanese benchmark designed to evaluate LMMs on expert-level tasks based on the +Japanese cultural context. To facilitate comprehensive culture-aware +evaluation, JMMMU features two complementary subsets: (i) culture-agnostic (CA) +subset, where the culture-independent subjects (e.g., Math) are selected and +translated into Japanese, enabling one-to-one comparison with its English +counterpart MMMU; and (ii) culture-specific (CS) subset, comprising newly +crafted subjects that reflect Japanese cultural context. Using the CA subset, +we observe performance drop in many LMMs when evaluated in Japanese, which is +purely attributable to language variation. Using the CS subset, we reveal their +inadequate Japanese cultural understanding. Further, by combining both subsets, +we identify that some LMMs perform well on the CA subset but not on the CS +subset, exposing a shallow understanding of the Japanese language that lacks +depth in cultural understanding. We hope this work will not only help advance +LMM performance in Japanese but also serve as a guideline to create +high-standard, culturally diverse benchmarks for multilingual LMM development. +The project page is https://mmmu-japanese-benchmark.github.io/JMMMU/. + +
+
+ comment: Project page: https://mmmu-japanese-benchmark.github.io/JMMMU/ +
+
+
+
+
+ + ☆ PyramidDrop: Accelerating Your Large Vision-Language Models via Pyramid + Visual Redundancy Reduction + + +
+ In large vision-language models (LVLMs), images serve as inputs that carry a
+wealth of information. As the idiom "A picture is worth a thousand words"
+implies, representing a single image in current LVLMs can require hundreds or
+even thousands of tokens. This results in significant computational costs,
+which grow quadratically as input image resolution increases, thereby severely
+impacting the efficiency of both training and inference. Previous approaches
+have attempted to reduce the number of image tokens either before or within the
+early layers of LVLMs. However, these strategies inevitably result in the loss
+of crucial image information, ultimately diminishing model performance. To
+address this challenge, we conduct an empirical study revealing that all visual
+tokens are necessary for LVLMs in the shallow layers, and token redundancy
+progressively increases in the deeper layers of the model. To this end, we
+propose PyramidDrop, a visual redundancy reduction strategy for LVLMs to boost
+their efficiency in both training and inference with negligible performance
+loss. Specifically, we partition the LVLM into several stages and drop part of
+the image tokens at the end of each stage with a pre-defined ratio, creating
+pyramid-like visual tokens across model layers. The dropping is based on a
+lightweight similarity calculation with a negligible time overhead. Extensive
+experiments demonstrate that PyramidDrop can achieve a 40% training time and
+55% inference FLOPs acceleration of LLaVA-NeXT with comparable performance.
+Besides, PyramidDrop can also serve as a plug-and-play strategy for inference
+acceleration without training, with better performance and lower inference cost
+than counterparts. We hope that the insights and approach introduced by
+PyramidDrop will inspire future research to further investigate the role of
+image tokens in LVLMs.
+
+
+ comment: 10 pages +
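+ As a rough sketch of the stage-wise dropping idea (not the released
+implementation), the snippet below keeps a decreasing fraction of visual tokens
+at each stage, ranked by a lightweight similarity score against a query vector.
+The keep ratios and the cosine-similarity proxy are assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def pyramid_drop(visual_tokens, query, keep_ratios=(1.0, 0.6, 0.35, 0.2)):
+    """visual_tokens: (num_tokens, dim); query: (dim,)."""
+    tokens = visual_tokens
+    for ratio in keep_ratios:
+        k = max(1, int(tokens.size(0) * ratio))
+        scores = F.cosine_similarity(tokens, query.unsqueeze(0), dim=-1)
+        keep = scores.topk(k).indices.sort().values   # preserve token order
+        tokens = tokens[keep]
+        # ... a stage of transformer layers would process `tokens` here ...
+    return tokens
+
+print(pyramid_drop(torch.randn(576, 1024), torch.randn(1024)).shape)
+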
+
+
+
+
+ + ☆ Breaking the Memory Barrier: Near Infinite Batch Size Scaling for + Contrastive Loss + + +
+ Contrastive loss is a powerful approach for representation learning, where +larger batch sizes enhance performance by providing more negative samples to +better distinguish between similar and dissimilar data. However, scaling batch +sizes is constrained by the quadratic growth in GPU memory consumption, +primarily due to the full instantiation of the similarity matrix. To address +this, we propose a tile-based computation strategy that partitions the +contrastive loss calculation into arbitrary small blocks, avoiding full +materialization of the similarity matrix. Furthermore, we introduce a +multi-level tiling strategy to leverage the hierarchical structure of +distributed systems, employing ring-based communication at the GPU level to +optimize synchronization and fused kernels at the CUDA core level to reduce I/O +overhead. Experimental results show that the proposed method scales batch sizes +to unprecedented levels. For instance, it enables contrastive training of a +CLIP-ViT-L/14 model with a batch size of 4M or 12M using 8 or 32 A800 80GB +without sacrificing any accuracy. Compared to SOTA memory-efficient solutions, +it achieves a two-order-of-magnitude reduction in memory while maintaining +comparable speed. The code will be made publicly available. + +
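+ The released kernels are not reproduced here; the Python sketch below only
+conveys the core trick of computing the per-row log-sum-exp of an InfoNCE-style
+loss tile by tile, so the full batch-by-batch similarity matrix is never held
+in memory at once. Block size and temperature are illustrative choices.
+
+import torch
+import torch.nn.functional as F
+
+def tiled_infonce(z, y, temperature=0.07, block=256):
+    """z, y: (B, dim) paired embeddings; row i of z matches row i of y."""
+    z, y = F.normalize(z, dim=-1), F.normalize(y, dim=-1)
+    B = z.size(0)
+    running_max = torch.full((B,), float("-inf"))
+    running_sum = torch.zeros(B)
+    for start in range(0, B, block):
+        sims = z @ y[start:start + block].t() / temperature   # (B, block) tile
+        new_max = torch.maximum(running_max, sims.max(dim=1).values)
+        running_sum = (running_sum * torch.exp(running_max - new_max)
+                       + torch.exp(sims - new_max.unsqueeze(1)).sum(dim=1))
+        running_max = new_max
+    logsumexp = running_max + running_sum.log()
+    positives = (z * y).sum(dim=-1) / temperature
+    return (logsumexp - positives).mean()
+
+print(float(tiled_infonce(torch.randn(1024, 128), torch.randn(1024, 128))))
+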
+
+
+
+
+ + ☆ LVSM: A Large View Synthesis Model with Minimal 3D Inductive Bias + + +
+ We propose the Large View Synthesis Model (LVSM), a novel transformer-based +approach for scalable and generalizable novel view synthesis from sparse-view +inputs. We introduce two architectures: (1) an encoder-decoder LVSM, which +encodes input image tokens into a fixed number of 1D latent tokens, functioning +as a fully learned scene representation, and decodes novel-view images from +them; and (2) a decoder-only LVSM, which directly maps input images to +novel-view outputs, completely eliminating intermediate scene representations. +Both models bypass the 3D inductive biases used in previous methods -- from 3D +representations (e.g., NeRF, 3DGS) to network designs (e.g., epipolar +projections, plane sweeps) -- addressing novel view synthesis with a fully +data-driven approach. While the encoder-decoder model offers faster inference +due to its independent latent representation, the decoder-only LVSM achieves +superior quality, scalability, and zero-shot generalization, outperforming +previous state-of-the-art methods by 1.5 to 3.5 dB PSNR. Comprehensive +evaluations across multiple datasets demonstrate that both LVSM variants +achieve state-of-the-art novel view synthesis quality. Notably, our models +surpass all previous methods even with reduced computational resources (1-2 +GPUs). Please see our website for more details: +https://haian-jin.github.io/projects/LVSM/ . + +
+
+ comment: project page: https://haian-jin.github.io/projects/LVSM/ +
+
+
+
+
+ + ♻ ☆ NVLM: Open Frontier-Class Multimodal LLMs + + +
+ We introduce NVLM 1.0, a family of frontier-class multimodal large language +models (LLMs) that achieve state-of-the-art results on vision-language tasks, +rivaling the leading proprietary models (e.g., GPT-4o) and open-access models +(e.g., Llama 3-V 405B and InternVL 2). Remarkably, NVLM 1.0 shows improved +text-only performance over its LLM backbone after multimodal training. In terms +of model design, we perform a comprehensive comparison between decoder-only +multimodal LLMs (e.g., LLaVA) and cross-attention-based models (e.g., +Flamingo). Based on the strengths and weaknesses of both approaches, we propose +a novel architecture that enhances both training efficiency and multimodal +reasoning capabilities. Furthermore, we introduce a 1-D tile-tagging design for +tile-based dynamic high-resolution images, which significantly boosts +performance on multimodal reasoning and OCR-related tasks. Regarding training +data, we meticulously curate and provide detailed information on our multimodal +pretraining and supervised fine-tuning datasets. Our findings indicate that +dataset quality and task diversity are more important than scale, even during +the pretraining phase, across all architectures. Notably, we develop +production-grade multimodality for the NVLM-1.0 models, enabling them to excel +in vision-language tasks while maintaining and even improving text-only +performance compared to their LLM backbones. To achieve this, we craft and +integrate a high-quality text-only dataset into multimodal training, alongside +a substantial amount of multimodal math and reasoning data, leading to enhanced +math and coding capabilities across modalities. To advance research in the +field, we release the model weights at https://huggingface.co/nvidia/NVLM-D-72B +and will open-source the training code for the community soon. + +
+
+ comment: Fixed the typos. For more information, please visit our project page + at: https://research.nvidia.com/labs/adlr/NVLM-1 +
+
+
+
+
+ + ♻ ☆ MoRE: Multi-Modal Contrastive Pre-training with Transformers on X-Rays, + ECGs, and Diagnostic Report + + +
+ In this paper, we introduce a novel Multi-Modal Contrastive Pre-training +Framework that synergistically combines X-rays, electrocardiograms (ECGs), and +radiology/cardiology reports. Our approach leverages transformers to encode +these diverse modalities into a unified representation space, aiming to enhance +diagnostic accuracy and facilitate comprehensive patient assessments. We +utilize LoRA-Peft to significantly reduce trainable parameters in the LLM and +incorporate recent linear attention dropping strategy in the Vision +Transformer(ViT) for smoother attention. Furthermore, we provide novel +multimodal attention explanations and retrieval for our model. To the best of +our knowledge, we are the first to propose an integrated model that combines +X-ray, ECG, and Radiology/Cardiology Report with this approach. By utilizing +contrastive loss, MoRE effectively aligns modality-specific features into a +coherent embedding, which supports various downstream tasks such as zero-shot +classification and multimodal retrieval. Employing our proposed methodology, we +achieve state-of-the-art (SOTA) on the Mimic-IV, CheXpert, Edema Severity, and +PtbXl downstream datasets, surpassing existing multimodal approaches. Our +proposed framework shows significant improvements in capturing intricate +inter-modal relationships and its robustness in medical diagnosis that +establishes a framework for future research in multimodal learning in the +healthcare sector. + +
+
+ comment: 10 pages, 5 figures, 9 tables. Supplementary detail in Appendix. Code + made available in Github for reproducibility +
+
+
+
+
+ + ♻ ☆ RetriBooru: Leakage-Free Retrieval of Conditions from Reference Images + for Subject-Driven Generation + + +
+ Diffusion-based methods have demonstrated remarkable capabilities in
+generating a diverse array of high-quality images, sparking interest in styled
+avatars, virtual try-on, and more. Previous methods use the same reference
+image as the target. An overlooked aspect is the leakage of the target's
+spatial information, style, etc. from the reference, harming the generated
+diversity and causing shortcuts. However, this approach persists because widely
+available datasets usually consist of single images not grouped by identity,
+and it is expensive to recollect large-scale same-identity data. Moreover,
+existing metrics adopt decoupled evaluation of text alignment and identity
+preservation, which fails to distinguish between balanced outputs and those
+that overfit to one aspect. In this paper, we propose a multi-level,
+same-identity dataset, RetriBooru, which groups anime characters by both face
+and clothing identities. RetriBooru enables adopting reference images of the
+same character and outfits as the target, while keeping flexible gestures and
+actions. We benchmark previous methods on our dataset, and demonstrate the
+effectiveness of training with a reference image different from the target (but
+of the same identity). We introduce a new concept composition task, where the
+conditioning encoder learns to retrieve different concepts from several
+reference images, and modify a baseline network, RetriNet, for the new task.
+Finally, we introduce a novel class of metrics named Similarity Weighted
+Diversity (SWD) to measure the overlooked diversity and better evaluate the
+alignment between similarity and diversity.
+
+
+
+
+
+ + ♻ ☆ AUPIMO: Redefining Visual Anomaly Detection Benchmarks with High Speed + and Low Tolerance BMVC 2024 + + +
+ Recent advances in visual anomaly detection research have seen AUROC and +AUPRO scores on public benchmark datasets such as MVTec and VisA converge +towards perfect recall, giving the impression that these benchmarks are +near-solved. However, high AUROC and AUPRO scores do not always reflect +qualitative performance, which limits the validity of these metrics in +real-world applications. We argue that the artificial ceiling imposed by the +lack of an adequate evaluation metric restrains progression of the field, and +it is crucial that we revisit the evaluation metrics used to rate our +algorithms. In response, we introduce Per-IMage Overlap (PIMO), a novel metric +that addresses the shortcomings of AUROC and AUPRO. PIMO retains the +recall-based nature of the existing metrics but introduces two distinctions: +the assignment of curves (and respective area under the curve) is per-image, +and its X-axis relies solely on normal images. Measuring recall per image +simplifies instance score indexing and is more robust to noisy annotations. As +we show, it also accelerates computation and enables the usage of statistical +tests to compare models. By imposing low tolerance for false positives on +normal images, PIMO provides an enhanced model validation procedure and +highlights performance variations across datasets. Our experiments demonstrate +that PIMO offers practical advantages and nuanced performance insights that +redefine anomaly detection benchmarks -- notably challenging the perception +that MVTec AD and VisA datasets have been solved by contemporary models. +Available on GitHub: https://github.com/jpcbertoldo/aupimo. + +
+
+ comment: Accepted to BMVC 2024. Official implementation: + https://github.com/jpcbertoldo/aupimo. Integrated in anomalib + https://github.com/openvinotoolkit/anomalib. This research was conducted + during Google Summer of Code 2023 (GSoC 2023) with the anomalib team from + Intel's OpenVINO Toolkit +
+
+
+
+
+ + ♻ ☆ XReal: Realistic Anatomy and Pathology-Aware X-ray Generation via + Controllable Diffusion Model + + +
+ Large-scale generative models have demonstrated impressive capabilities in +producing visually compelling images, with increasing applications in medical +imaging. However, they continue to grapple with hallucination challenges and +the generation of anatomically inaccurate outputs. These limitations are mainly +due to the reliance on textual inputs and lack of spatial control over the +generated images, hindering the potential usefulness of such models in +real-life settings. In this work, we present XReal, a novel controllable +diffusion model for generating realistic chest X-ray images through precise +anatomy and pathology location control. Our lightweight method comprises an +Anatomy Controller and a Pathology Controller to introduce spatial control over +anatomy and pathology in a pre-trained Text-to-Image Diffusion Model, +respectively, without fine-tuning the model. XReal outperforms state-of-the-art +X-ray diffusion models in quantitative metrics and radiologists' ratings, +showing significant gains in anatomy and pathology realism. Our model holds +promise for advancing generative models in medical imaging, offering greater +precision and adaptability while inviting further exploration in this evolving +field. The code and pre-trained model weights are publicly available at +https://github.com/BioMedIA-MBZUAI/XReal. + +
+
+
+
+
+ + ♻ ☆ Fine-tuning with Very Large Dropout + + +
+ It is impossible today to pretend that the practice of machine learning is +compatible with the idea that training and testing data follow the same +distribution. Several authors have recently used ensemble techniques to show +how scenarios involving multiple data distributions are best served by +representations that are both richer than those obtained by regularizing for +the best in-distribution performance, and richer than those obtained under the +influence of the implicit sparsity bias of common stochastic gradient +procedures. + This contribution investigates the use of very high dropout rates instead of +ensembles to obtain such rich representations. Although training a deep network +from scratch using such dropout rates is virtually impossible, fine-tuning a +large pre-trained model under such conditions is not only possible but also +achieves out-of-distribution performances that exceed those of both ensembles +and weight averaging methods such as model soups. This result has practical +significance because the importance of the fine-tuning scenario has +considerably grown in recent years. This result also provides interesting +insights on the nature of rich representations and on the intrinsically linear +nature of fine-tuning a large network using a comparatively small dataset. + +
+
+ comment: Fine-tuning with very large dropout outperforms weight-averaging and + ensemble on ResNet and large vision transformer +
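+ A minimal recipe sketch of the setup described above, assuming a ResNet-50
+backbone and a 90% dropout rate on the penultimate features; model, rate, and
+dataset shapes are illustrative and not taken from the paper.
+
+import torch
+import torch.nn as nn
+from torchvision import models
+
+# Pretrained backbone; the first call downloads ImageNet weights.
+backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
+in_features = backbone.fc.in_features
+backbone.fc = nn.Sequential(
+    nn.Dropout(p=0.9),              # very large dropout on penultimate features
+    nn.Linear(in_features, 10),     # new head for the downstream task
+)
+
+optimizer = torch.optim.SGD(backbone.parameters(), lr=1e-3, momentum=0.9)
+criterion = nn.CrossEntropyLoss()
+
+images = torch.randn(4, 3, 224, 224)            # stand-in fine-tuning batch
+labels = torch.randint(0, 10, (4,))
+loss = criterion(backbone(images), labels)
+loss.backward()
+optimizer.step()
+print(float(loss))
+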
+
+
+
+
+ + ♻ ☆ D2S: Representing sparse descriptors and 3D coordinates for camera + relocalization + + +
+ State-of-the-art visual localization methods mostly rely on complex +procedures to match local descriptors and 3D point clouds. However, these +procedures can incur significant costs in terms of inference, storage, and +updates over time. In this study, we propose a direct learning-based approach +that utilizes a simple network named D2S to represent complex local descriptors +and their scene coordinates. Our method is characterized by its simplicity and +cost-effectiveness. It solely leverages a single RGB image for localization +during the testing phase and only requires a lightweight model to encode a +complex sparse scene. The proposed D2S employs a combination of a simple loss +function and graph attention to selectively focus on robust descriptors while +disregarding areas such as clouds, trees, and several dynamic objects. This +selective attention enables D2S to effectively perform a binary-semantic +classification for sparse descriptors. Additionally, we propose a simple +outdoor dataset to evaluate the capabilities of visual localization methods in +scene-specific generalization and self-updating from unlabeled observations. +Our approach outperforms the previous regression-based methods in both indoor +and outdoor environments. It demonstrates the ability to generalize beyond +training data, including scenarios involving transitions from day to night and +adapting to domain shifts. The source code, trained models, dataset, and demo +videos are available at the following link: https://thpjp.github.io/d2s. + +
+
+ comment: Accepted to IEEE Robotics and Automation Letters +
+
+
+
+
+ + ♻ ☆ DEAR: Disentangled Environment and Agent Representations for + Reinforcement Learning without Reconstruction IROS 2024 + + +
+ Reinforcement Learning (RL) algorithms can learn robotic control tasks from +visual observations, but they often require a large amount of data, especially +when the visual scene is complex and unstructured. In this paper, we explore +how the agent's knowledge of its shape can improve the sample efficiency of +visual RL methods. We propose a novel method, Disentangled Environment and +Agent Representations (DEAR), that uses the segmentation mask of the agent as +supervision to learn disentangled representations of the environment and the +agent through feature separation constraints. Unlike previous approaches, DEAR +does not require reconstruction of visual observations. These representations +are then used as an auxiliary loss to the RL objective, encouraging the agent +to focus on the relevant features of the environment. We evaluate DEAR on two +challenging benchmarks: Distracting DeepMind control suite and Franka Kitchen +manipulation tasks. Our findings demonstrate that DEAR surpasses +state-of-the-art methods in sample efficiency, achieving comparable or superior +performance with reduced parameters. Our results indicate that integrating +agent knowledge into visual RL methods has the potential to enhance their +learning efficiency and robustness. + +
+
+ comment: 6 pages, 7 figures, 2 tables. Accepted at 2024 IEEE/RSJ International + Conference on Intelligent Robots and Systems (IROS 2024) +
+
+
+
+
+ + ♻ ☆ Counter-Hypothetical Particle Filters for Single Object Pose Tracking ICRA + + +
+ Particle filtering is a common technique for six degrees of freedom (6D) pose +estimation due to its ability to tractably represent belief over object pose. +However, the particle filter is prone to particle deprivation due to the +high-dimensional nature of 6D pose. When particle deprivation occurs, it can +cause mode collapse of the underlying belief distribution during importance +sampling. If the region surrounding the true state suffers from mode collapse, +recovering its belief is challenging since the area is no longer represented in +the probability mass formed by the particles. Previous methods mitigate this +problem by randomizing and resetting particles in the belief distribution, but +determining the frequency of reinvigoration has relied on hand-tuning abstract +heuristics. In this paper, we estimate the necessary reinvigoration rate at +each time step by introducing a Counter-Hypothetical likelihood function, which +is used alongside the standard likelihood. Inspired by the notions of +plausibility and implausibility from Evidential Reasoning, the addition of our +Counter-Hypothetical likelihood function assigns a level of doubt to each +particle. The competing cumulative values of confidence and doubt across the +particle set are used to estimate the level of failure within the filter, in +order to determine the portion of particles to be reinvigorated. We demonstrate +the effectiveness of our method on the rigid body object 6D pose tracking task. + +
+
+ comment: International Conference on Robotics and Automation (ICRA) 2023 +
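+ A rough sketch of the general notion only (not the paper's algorithm): each
+particle accumulates a confidence weight from a standard likelihood and a doubt
+weight from a counter-hypothetical likelihood, and the doubt share sets the
+fraction of particles to reinvigorate. Both likelihoods below are random
+placeholders.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def reinvigoration_rate(confidence, doubt):
+    """Fraction of particles to reset, from cumulative confidence vs. doubt."""
+    return doubt.sum() / (confidence.sum() + doubt.sum())
+
+n = 200
+particles = rng.uniform(-1, 1, size=(n, 6))     # toy 6-DoF pose particles
+confidence = rng.uniform(0, 1, size=n)          # stand-in standard likelihoods
+doubt = rng.uniform(0, 1, size=n)               # stand-in counter-hypothetical
+
+rate = reinvigoration_rate(confidence, doubt)
+n_reset = int(rate * n)
+reset_idx = rng.choice(n, size=n_reset, replace=False)
+particles[reset_idx] = rng.uniform(-1, 1, size=(n_reset, 6))
+print(f"reinvigorated {n_reset}/{n} particles (rate={rate:.2f})")
+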
+
+
+
+
+ + ♻ ☆ EF-3DGS: Event-Aided Free-Trajectory 3D Gaussian Splatting + + +
+ Scene reconstruction from casually captured videos has wide applications in +real-world scenarios. With recent advancements in differentiable rendering +techniques, several methods have attempted to simultaneously optimize scene +representations (NeRF or 3DGS) and camera poses. Despite recent progress, +existing methods relying on traditional camera input tend to fail in high-speed +(or equivalently low-frame-rate) scenarios. Event cameras, inspired by +biological vision, record pixel-wise intensity changes asynchronously with high +temporal resolution, providing valuable scene and motion information in blind +inter-frame intervals. In this paper, we introduce the event camera to aid +scene construction from a casually captured video for the first time, and +propose Event-Aided Free-Trajectory 3DGS, called EF-3DGS, which seamlessly +integrates the advantages of event cameras into 3DGS through three key +components. First, we leverage the Event Generation Model (EGM) to fuse events +and frames, supervising the rendered views observed by the event stream. +Second, we adopt the Contrast Maximization (CMax) framework in a piece-wise +manner to extract motion information by maximizing the contrast of the Image of +Warped Events (IWE), thereby calibrating the estimated poses. Besides, based on +the Linear Event Generation Model (LEGM), the brightness information encoded in +the IWE is also utilized to constrain the 3DGS in the gradient domain. Third, +to mitigate the absence of color information of events, we introduce +photometric bundle adjustment (PBA) to ensure view consistency across events +and frames. We evaluate our method on the public Tanks and Temples benchmark +and a newly collected real-world dataset, RealEv-DAVIS. Our project page is +https://lbh666.github.io/ef-3dgs/. + +
+
+ comment: Project Page: https://lbh666.github.io/ef-3dgs/ +
+
+
+
+
+
+
+
+ + Information Retrieval 22 + +
+
+
+ + ☆ ProveRAG: Provenance-Driven Vulnerability Analysis with Automated + Retrieval-Augmented LLMs + + +
+ In cybersecurity, security analysts face the challenge of mitigating newly +discovered vulnerabilities in real-time, with over 300,000 Common +Vulnerabilities and Exposures (CVEs) identified since 1999. The sheer volume of +known vulnerabilities complicates the detection of patterns for unknown +threats. While LLMs can assist, they often hallucinate and lack alignment with +recent threats. Over 25,000 vulnerabilities have been identified so far in +2024, which are introduced after popular LLMs' (e.g., GPT-4) training data +cutoff. This raises a major challenge of leveraging LLMs in cybersecurity, +where accuracy and up-to-date information are paramount. In this work, we aim +to improve the adaptation of LLMs in vulnerability analysis by mimicking how +analysts perform such tasks. We propose ProveRAG, an LLM-powered system +designed to assist in rapidly analyzing CVEs with automated retrieval +augmentation of web data while self-evaluating its responses with verifiable +evidence. ProveRAG incorporates a self-critique mechanism to help alleviate +omission and hallucination common in the output of LLMs applied in +cybersecurity applications. The system cross-references data from verifiable +sources (NVD and CWE), giving analysts confidence in the actionable insights +provided. Our results indicate that ProveRAG excels in delivering verifiable +evidence to the user with over 99% and 97% accuracy in exploitation and +mitigation strategies, respectively. This system outperforms direct prompting +and chunking retrieval in vulnerability analysis by overcoming temporal and +context-window limitations. ProveRAG guides analysts to secure their systems +more effectively while documenting the process for future audits. + +
+
+
+
+
+ + ☆ Captions Speak Louder than Images (CASLIE): Generalizing Foundation + Models for E-commerce from High-quality Multimodal Instruction Data + + +
+ Leveraging multimodal data to drive breakthroughs in e-commerce applications +through Multimodal Foundation Models (MFMs) is gaining increasing attention +from the research community. However, there are significant challenges that +hinder the optimal use of multimodal e-commerce data by foundation models: (1) +the scarcity of large-scale, high-quality multimodal benchmark datasets; and +(2) the lack of effective multimodal information integration methods. To +address these challenges, in this paper, we introduce MMECInstruct, the +first-ever, large-scale, and high-quality multimodal instruction dataset for +e-commerce. We also develop CASLIE, a simple, lightweight, yet effective +framework for integrating multimodal information for e-commerce. Leveraging +MMECInstruct, we fine-tune a series of e-commerce MFMs within CASLIE, denoted +as CASLIE models. Our comprehensive evaluation demonstrates that CASLIE models +substantially outperform 5 categories of advanced baseline models in the +in-domain evaluation. Moreover, CASLIE models show strong generalizability to +out-of-domain settings. MMECInstruct and CASLIE models are publicly accessible +through https://ninglab.github.io/CASLIE/. + +
+
+ comment: Xinyi Ling and Bo Peng contributed equally to this paper +
+
+
+
+
+ + ☆ Offline Evaluation of Set-Based Text-to-Image Generation + + +
+ Text-to-Image (TTI) systems often support people during ideation, the early +stages of a creative process when exposure to a broad set of relevant images +can help explore the design space. Since ideation is an important subclass of +TTI tasks, understanding how to quantitatively evaluate TTI systems according +to how well they support ideation is crucial to promoting research and +development for these users. However, existing evaluation metrics for TTI +remain focused on distributional similarity metrics like Fr\'echet Inception +Distance (FID). We take an alternative approach and, based on established +methods from ranking evaluation, develop TTI evaluation metrics with explicit +models of how users browse and interact with sets of spatially arranged +generated images. Our proposed offline evaluation metrics for TTI not only +capture how relevant generated images are with respect to the user's ideation +need but also take into consideration the diversity and arrangement of the set +of generated images. We analyze our proposed family of TTI metrics using human +studies on image grids generated by three different TTI systems based on +subsets of the widely used benchmarks such as MS-COCO captions and Localized +Narratives as well as prompts used in naturalistic settings. Our results +demonstrate that grounding metrics in how people use systems is an important +and understudied area of benchmark design. + +
+
+
+
+
+ + ☆ Large Language Models Empowered Personalized Web Agents + + +
+ Web agents have emerged as a promising direction to automate Web task +completion based on user instructions, significantly enhancing user experience. +Recently, Web agents have evolved from traditional agents to Large Language +Models (LLMs)-based Web agents. Despite their success, existing LLM-based Web +agents overlook the importance of personalized data (e.g., user profiles and +historical Web behaviors) in assisting the understanding of users' personalized +instructions and executing customized actions. To overcome the limitation, we +first formulate the task of LLM-empowered personalized Web agents, which +integrate personalized data and user instructions to personalize instruction +comprehension and action execution. To address the absence of a comprehensive +evaluation benchmark, we construct a Personalized Web Agent Benchmark +(PersonalWAB), featuring user instructions, personalized user data, Web +functions, and two evaluation paradigms across three personalized Web tasks. +Moreover, we propose a Personalized User Memory-enhanced Alignment (PUMA) +framework to adapt LLMs to the personalized Web agent task. PUMA utilizes a +memory bank with a task-specific retrieval strategy to filter relevant +historical Web behaviors. Based on the behaviors, PUMA then aligns LLMs for +personalized action execution through fine-tuning and direct preference +optimization. Extensive experiments validate the superiority of PUMA over +existing Web agents on PersonalWAB. + +
+
+ comment: The code and data are available on the project website + https://hongrucai.github.io/PersonalWAB/ +
+
+
+
+
+ + ☆ Improving Pinterest Search Relevance Using Large Language Models CIKM 2024 + + +
+ To improve relevance scoring on Pinterest Search, we integrate Large Language +Models (LLMs) into our search relevance model, leveraging carefully designed +text representations to predict the relevance of Pins effectively. Our approach +uses search queries alongside content representations that include captions +extracted from a generative visual language model. These are further enriched +with link-based text data, historically high-quality engaged queries, +user-curated boards, Pin titles and Pin descriptions, creating robust models +for predicting search relevance. We use a semi-supervised learning approach to +efficiently scale up the amount of training data, expanding beyond the +expensive human labeled data available. By utilizing multilingual LLMs, our +system extends training data to include unseen languages and domains, despite +initial data and annotator expertise being confined to English. Furthermore, we +distill from the LLM-based model into real-time servable model architectures +and features. We provide comprehensive offline experimental validation for our +proposed techniques and demonstrate the gains achieved through the final +deployed system at scale. + +
+
+ comment: CIKM 2024 Workshop on Industrial Recommendation Systems +
+
+
+
+
+ + ☆ TELII: Temporal Event Level Inverted Indexing for Cohort Discovery on a + Large Covid-19 EHR Dataset + + +
+ Cohort discovery is a crucial step in clinical research on Electronic Health +Record (EHR) data. Temporal queries, which are common in cohort discovery, can +be time-consuming and prone to errors when processed on large EHR datasets. In +this work, we introduce TELII, a temporal event level inverted indexing method +designed for cohort discovery on large EHR datasets. TELII is engineered to +pre-compute and store the relations along with the time difference between +events, thereby providing fast and accurate temporal query capabilities. We +implemented TELII for the OPTUM de-identified COVID-19 EHR dataset, which +contains data from 8.87 million patients. We demonstrate four common temporal +query tasks and their implementation using TELII with a MongoDB backend. Our +results show that the temporal query speed for TELII is up to 2000 times faster +than that of existing non-temporal inverted indexes. TELII achieves +millisecond-level response times, enabling users to quickly explore event +relations and find preliminary evidence for their research questions. Not only +is TELII practical and straightforward to implement, but it also offers easy +adaptability to other EHR datasets. These advantages underscore TELII's +potential to serve as the query engine for EHR-based applications, ensuring +fast, accurate, and user-friendly query responses. + +
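+ TELII's MongoDB implementation is not reproduced here; the toy Python sketch
+below only shows the core idea of pre-computing event-pair relations with time
+differences so that a temporal cohort query becomes a lookup rather than a
+per-patient scan. The event codes and records are invented examples.
+
+from collections import defaultdict
+from datetime import date
+
+# patient_id -> list of (event_code, event_date)
+records = {
+    "p1": [("covid_dx", date(2021, 1, 5)), ("hospitalized", date(2021, 1, 20))],
+    "p2": [("covid_dx", date(2021, 3, 1)), ("hospitalized", date(2021, 6, 1))],
+}
+
+# (event_a, event_b) -> [(patient_id, days_from_a_to_b)]
+index = defaultdict(list)
+for pid, events in records.items():
+    for code_a, date_a in events:
+        for code_b, date_b in events:
+            if code_a != code_b:
+                index[(code_a, code_b)].append((pid, (date_b - date_a).days))
+
+def cohort(event_a, event_b, min_days=0, max_days=30):
+    """Patients with event_b between min_days and max_days after event_a."""
+    return {pid for pid, days in index[(event_a, event_b)]
+            if min_days <= days <= max_days}
+
+print(cohort("covid_dx", "hospitalized"))       # -> {'p1'}
+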
+
+
+
+
+ + ☆ Enhancing Answer Attribution for Faithful Text Generation with Large + Language Models + + +
+ The increasing popularity of Large Language Models (LLMs) in recent years has +changed the way users interact with and pose questions to AI-based +conversational systems. An essential aspect for increasing the trustworthiness +of generated LLM answers is the ability to trace the individual claims from +responses back to relevant sources that support them, the process known as +answer attribution. While recent work has started exploring the task of answer +attribution in LLMs, some challenges still remain. In this work, we first +perform a case study analyzing the effectiveness of existing answer attribution +methods, with a focus on subtasks of answer segmentation and evidence +retrieval. Based on the observed shortcomings, we propose new methods for +producing more independent and contextualized claims for better retrieval and +attribution. The new methods are evaluated and shown to improve the performance +of answer attribution components. We end with a discussion and outline of +future directions for the task. + +
+
+ comment: Accepted to KDIR 2024 (part of IC3K 2024) +
+
+
+
+
+ + ☆ Bridging the Modality Gap: Dimension Information Alignment and Sparse + Spatial Constraint for Image-Text Matching + + +
+ Many contrastive-learning-based models have achieved advanced performance in
+image-text matching tasks. The key to these models lies in analyzing the
+correlation between image-text pairs, which involves cross-modal interaction of
+embeddings in corresponding dimensions. However, the embeddings of different
+modalities come from different models or modules, and there is a significant
+modality gap. Directly interacting with such embeddings is poorly grounded and
+may capture inaccurate correlations. Therefore, we propose a novel method called
+DIAS to bridge the modality gap from two aspects: (1) we align the information
+representation of embeddings from different modalities in corresponding
+dimensions to ensure that the correlation calculation is based on interactions
+of similar information; (2) spatial constraints on inter- and intra-modality
+unmatched pairs are introduced to ensure the effectiveness of the model's
+semantic alignment. Besides, a sparse correlation algorithm is proposed to
+select strongly correlated spatial relationships, enabling the model to learn
+more significant features and avoid being misled by weak correlations. Extensive
+experiments demonstrate the superiority of DIAS, achieving 4.3\%-10.2\% rSum
+improvements on the Flickr30k and MSCOCO benchmarks.
+
+
+
+
+
+
+ + ☆ Neural Collaborative Filtering Classification Model to Obtain Prediction + Reliabilities + + +
+ Neural collaborative filtering is the state of the art in the recommender
+systems area; it provides models that obtain accurate predictions and
+recommendations. These models are regression-based and return only rating
+predictions. This paper proposes the use of a classification-based approach,
+returning both rating predictions and their reliabilities. The extra
+information (prediction reliabilities) can be used in a variety of relevant
+collaborative filtering areas, such as detection of shilling attacks,
+recommendation explanation, or navigational tools to show user and item
+dependencies. Additionally, recommendation reliabilities can be gracefully
+presented to users: "probably you will like this film", "almost certainly you
+will like this song", etc. This paper presents the proposed neural
+architecture and shows that the quality of its recommendation results is
+as good as that of state-of-the-art baselines. Remarkably, individual rating
+predictions are improved by the proposed architecture compared to baselines.
+Experiments have been performed on four popular public datasets, showing
+generalizable quality results. Overall, the proposed architecture improves
+individual rating prediction quality, maintains recommendation quality, and
+opens the door to a set of relevant collaborative filtering fields.
+
+
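+
+ A minimal sketch of the classification-based idea, assuming a softmax head over
+ discrete rating classes and reliability read off as the predicted class probability
+ (layer sizes and the model name are placeholders, not the paper's architecture):
+
+```python
+import torch
+import torch.nn as nn
+
+class ClassificationNCF(nn.Module):
+    """Classification-based collaborative filtering sketch: instead of
+    regressing a single rating, predict a distribution over rating classes."""
+    def __init__(self, n_users, n_items, n_classes=5, dim=32):
+        super().__init__()
+        self.user_emb = nn.Embedding(n_users, dim)
+        self.item_emb = nn.Embedding(n_items, dim)
+        self.mlp = nn.Sequential(
+            nn.Linear(2 * dim, 64), nn.ReLU(), nn.Linear(64, n_classes)
+        )
+
+    def forward(self, users, items):
+        x = torch.cat([self.user_emb(users), self.item_emb(items)], dim=-1)
+        return self.mlp(x)  # unnormalized scores over rating classes
+
+model = ClassificationNCF(n_users=100, n_items=50)
+logits = model(torch.tensor([0, 1]), torch.tensor([10, 20]))
+probs = logits.softmax(dim=-1)
+rating = probs.argmax(dim=-1) + 1          # predicted rating (1..5)
+reliability = probs.max(dim=-1).values     # confidence attached to that rating
+print(rating, reliability)
+```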
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ Bridging Search and Recommendation in Generative Retrieval: Does One + Task Help the Other? RecSys'24 + + +
+ Generative retrieval for search and recommendation is a promising paradigm +for retrieving items, offering an alternative to traditional methods that +depend on external indexes and nearest-neighbor searches. Instead, generative +models directly associate inputs with item IDs. Given the breakthroughs of +Large Language Models (LLMs), these generative systems can play a crucial role +in centralizing a variety of Information Retrieval (IR) tasks in a single model +that performs tasks such as query understanding, retrieval, recommendation, +explanation, re-ranking, and response generation. Despite the growing interest +in such a unified generative approach for IR systems, the advantages of using a +single, multi-task model over multiple specialized models are not well +established in the literature. This paper investigates whether and when such a +unified approach can outperform task-specific models in the IR tasks of search +and recommendation, broadly co-existing in multiple industrial online +platforms, such as Spotify, YouTube, and Netflix. Previous work shows that (1) +the latent representations of items learned by generative recommenders are +biased towards popularity, and (2) content-based and +collaborative-filtering-based information can improve an item's +representations. Motivated by this, our study is guided by two hypotheses: [H1] +the joint training regularizes the estimation of each item's popularity, and +[H2] the joint training regularizes the item's latent representations, where +search captures content-based aspects of an item and recommendation captures +collaborative-filtering aspects. Our extensive experiments with both simulated +and real-world data support both [H1] and [H2] as key contributors to the +effectiveness improvements observed in the unified search and recommendation +generative models over the single-task approaches. + +
+
+ comment: Accepted for publication in the 18th ACM Conference on Recommender + Systems (RecSys'24) +
+
+
+
+
+ + ☆ Beyond Retrieval: Generating Narratives in Conversational Recommender + Systems + + +
+ Recent advances in Large Language Models' generation and reasoning
+capabilities present an opportunity to develop truly conversational
+recommendation systems. However, effectively integrating recommender system
+knowledge into LLMs for natural language generation tailored towards
+recommendation tasks remains a challenge. This paper addresses this challenge
+by making two key contributions.
+ First, we introduce a new dataset (REGEN) for natural language generation
+tasks in conversational recommendations. REGEN (Reviews Enhanced with
+GEnerative Narratives) extends the Amazon Product Reviews dataset with rich
+user narratives, including personalized explanations of product preferences,
+product endorsements for recommended items, and summaries of user purchase
+history. REGEN is made publicly available to facilitate further research.
+Furthermore, we establish benchmarks using well-known generative metrics and
+perform an automated evaluation of the new dataset using a rater LLM. Second,
+the paper introduces a fusion architecture (a CF model with an LLM) that serves
+as a baseline for REGEN and, to the best of our knowledge, represents the first
+attempt to analyze the capabilities of LLMs in understanding recommender
+signals and generating rich narratives. We demonstrate that LLMs can
+effectively learn from simple fusion architectures utilizing interaction-based
+CF embeddings, and this can be further enhanced using the metadata and
+personalization data associated with items. Our experiments show that combining
+CF and content embeddings leads to improvements of 4-12% in key language
+metrics compared to using either type of embedding individually. We also
+provide an analysis to interpret how CF and content embeddings contribute to
+this new generative task.
+
+
+
+
+
+
+ + ☆ Coarse-to-fine Dynamic Uplift Modeling for Real-time Video + Recommendation + + +
+ With the rise of short video platforms, video recommendation technology faces
+more complex challenges. Currently, there are multiple non-personalized modules
+in the video recommendation pipeline that urgently need personalized modeling
+techniques for improvement. Inspired by the success of uplift modeling in
+online marketing, we attempt to implement uplift modeling in the video
+recommendation scenario. However, we face two main challenges: 1) design and
+utilization of treatments, and 2) capture of users' real-time interest. To
+address them, we design the treatment as adjusting the distribution of videos
+of varying durations and propose Coarse-to-fine Dynamic Uplift Modeling
+(CDUM) for real-time video recommendation. CDUM consists of two modules, CPM
+and FIC. The former fully utilizes users' offline features to model their
+long-term preferences, while the latter leverages online real-time contextual
+features and request-level candidates to model users' real-time interests.
+These two modules work together to dynamically identify and target specific
+user groups and apply treatments effectively. Further, we conduct comprehensive
+experiments on offline public and industrial datasets and an online A/B test,
+demonstrating the superiority and effectiveness of the proposed CDUM. CDUM is
+eventually fully deployed on the Kuaishou platform, serving hundreds of millions
+of users every day. The source code will be provided after the paper is accepted.
+
+
+
+ comment: 9 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ Cutting Through the Confusion and Hype: Understanding the True Potential + of Generative AI + + +
+ This paper explores the nuanced landscape of generative AI (genAI),
+particularly focusing on neural network-based models like Large Language Models
+(LLMs). While genAI garners both optimistic enthusiasm and sceptical criticism,
+this work seeks to provide a balanced examination of its capabilities,
+limitations, and the profound impact it may have on societal functions and
+personal interactions. The first section demystifies language-based genAI
+through detailed discussions on how LLMs learn, their computational needs,
+distinguishing features from supporting technologies, and the inherent
+limitations in their accuracy and reliability. Real-world examples illustrate
+the practical applications and implications of these technologies. The latter
+part of the paper adopts a systems perspective, evaluating how the integration
+of LLMs with existing technologies can enhance productivity and address
+emerging concerns. It highlights the need for significant investment to
+understand the implications of recent advancements, advocating for a
+well-informed dialogue to ethically and responsibly integrate genAI into
+diverse sectors. The paper concludes with prospective developments and
+recommendations, emphasizing a forward-looking approach to harnessing genAI's
+potential while mitigating its risks.
+
+
+
+
+
+
+ + ☆ Distill-SynthKG: Distilling Knowledge Graph Synthesis Workflow for + Improved Coverage and Efficiency + + +
+ Knowledge graphs (KGs) generated by large language models (LLMs) are becoming +increasingly valuable for Retrieval-Augmented Generation (RAG) applications +that require knowledge-intensive reasoning. However, existing KG extraction +methods predominantly rely on prompt-based approaches, which are inefficient +for processing large-scale corpora. These approaches often suffer from +information loss, particularly with long documents, due to the lack of +specialized design for KG construction. Additionally, there is a gap in +evaluation datasets and methodologies for ontology-free KG construction. To +overcome these limitations, we propose SynthKG, a multi-step, document-level +ontology-free KG synthesis workflow based on LLMs. By fine-tuning a smaller LLM +on the synthesized document-KG pairs, we streamline the multi-step process into +a single-step KG generation approach called Distill-SynthKG, substantially +reducing the number of LLM inference calls. Furthermore, we re-purpose existing +question-answering datasets to establish KG evaluation datasets and introduce +new evaluation metrics. Using KGs produced by Distill-SynthKG, we also design a +novel graph-based retrieval framework for RAG. Experimental results demonstrate +that Distill-SynthKG not only surpasses all baseline models in KG quality -- +including models up to eight times larger -- but also consistently excels in +retrieval and question-answering tasks. Our proposed graph retrieval framework +also outperforms all KG-retrieval methods across multiple benchmark datasets. +We release the SynthKG dataset and Distill-SynthKG model publicly to support +further research and development. + +
+
+
+
+
+ + ☆ Optimizing LLMs with Direct Preferences: A Data Efficiency Perspective + + +
+ Aligning the output of Large Language Models (LLMs) with human preferences
+(e.g., by means of reinforcement learning with human feedback, or RLHF) is
+essential for ensuring their effectiveness in real-world scenarios. Despite
+significant advancements in LLM alignment techniques, the impact of different
+types of preference data on model performance has yet to be systematically
+explored. In this study, we investigate the scalability, data efficiency, and
+effectiveness of Direct Preference Optimization (DPO) in fine-tuning
+pre-trained LLMs, aiming to reduce their dependency on extensive amounts of
+preference data, which is expensive to collect. We (1) systematically compare
+the performance of models fine-tuned with varying percentages of a combined
+preference judgement dataset to define the improvement curve of DPO and assess
+its effectiveness in data-constrained environments; and (2) provide insights
+for the development of an optimal approach for selective preference data usage.
+Our study reveals that increasing the amount of data used for training
+generally enhances and stabilizes model performance. Moreover, the use of a
+combination of diverse datasets significantly improves model effectiveness.
+Furthermore, when models are trained separately using different types of
+prompts, models trained with conversational prompts outperform those trained
+with question answering prompts.
+
+
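+
+ For reference, the per-pair DPO objective can be written compactly on pre-computed
+ sequence log-probabilities (a generic sketch of the standard loss; the variable names,
+ beta value, and toy numbers below are illustrative, not the paper's setup):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def dpo_loss(policy_logp_chosen, policy_logp_rejected,
+             ref_logp_chosen, ref_logp_rejected, beta=0.1):
+    """Direct Preference Optimization loss on sequence log-probabilities.
+
+    Each argument is a tensor of shape (batch,) holding the summed
+    log-probability of the chosen / rejected response under the trainable
+    policy or the frozen reference model.
+    """
+    chosen_rewards = beta * (policy_logp_chosen - ref_logp_chosen)
+    rejected_rewards = beta * (policy_logp_rejected - ref_logp_rejected)
+    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()
+
+# Toy example: the policy already prefers the chosen response slightly.
+loss = dpo_loss(torch.tensor([-10.0]), torch.tensor([-12.0]),
+                torch.tensor([-11.0]), torch.tensor([-11.5]))
+print(loss)
+```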
+
+
+
+
+ + ♻ ☆ Token-wise Influential Training Data Retrieval for Large Language Models ACL 2024 + + +
+ Given a Large Language Model (LLM) generation, how can we identify which
+training data led to this generation? In this paper, we propose RapidIn, a
+scalable framework adapted to LLMs for estimating the influence of each
+training data item. The proposed framework consists of two stages: caching and
+retrieval. First, we compress the gradient vectors by over 200,000x, allowing
+them to be cached on disk or in GPU/CPU memory. Then, given a generation,
+RapidIn efficiently traverses the cached gradients to estimate the influence
+within minutes, achieving over a 6,326x speedup. Moreover, RapidIn supports
+multi-GPU parallelization to substantially accelerate caching and retrieval.
+Our empirical results confirm the efficiency and effectiveness of RapidIn.
+
+
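+
+ The two-stage compress-and-cache / dot-product-retrieval idea can be illustrated with a
+ toy sketch (a simple random sign projection stands in for RapidIn's actual compression,
+ and the dimensions and data are made up):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+d, k = 10_000, 128                   # full gradient dim vs. compressed dim
+projection = rng.choice([-1.0, 1.0], size=(k, d)) / np.sqrt(k)
+
+# Caching stage: compress each training example's gradient once and store it.
+train_grads = [rng.standard_normal(d) for _ in range(200)]
+cache = np.stack([projection @ g for g in train_grads])   # (200, k)
+
+# Retrieval stage: for a given generation, compress its gradient and score
+# every cached training example by an inner product, a common first-order
+# proxy for influence.
+test_grad = projection @ rng.standard_normal(d)
+influence = cache @ test_grad
+top_influential = np.argsort(-influence)[:10]
+print(top_influential)
+```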
+
+ comment: Accepted to ACL 2024. Keywords: Influence Function, Influence + Estimation, Training Data Attribution +
+
+
+
+
+ + ♻ ☆ BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive + Retrieval + + +
+ Existing retrieval benchmarks primarily consist of information-seeking
+queries (e.g., aggregated questions from search engines) where keyword or
+semantic-based retrieval is usually sufficient. However, many complex
+real-world queries require in-depth reasoning to identify relevant documents
+that go beyond surface form matching. For example, finding documentation for a
+coding question requires understanding the logic and syntax of the functions
+involved. To better benchmark retrieval on such challenging queries, we
+introduce BRIGHT, the first text retrieval benchmark that requires intensive
+reasoning to retrieve relevant documents. Our dataset consists of 1,384
+real-world queries spanning diverse domains, such as economics, psychology,
+mathematics, and coding. These queries are drawn from naturally occurring and
+carefully curated human data. Extensive evaluation reveals that even
+state-of-the-art retrieval models perform poorly on BRIGHT. The leading model
+on the MTEB leaderboard (Muennighoff et al., 2023), which achieves an nDCG@10
+of 59.0 there, scores only 18.3 nDCG@10 on BRIGHT. We show that
+incorporating explicit reasoning about the query improves retrieval performance
+by up to 12.2 points. Moreover, incorporating retrieved documents from the
+top-performing retriever boosts question-answering performance by over 6.6
+points. We believe that BRIGHT paves the way for future research on retrieval
+systems in more realistic and challenging settings.
+
+
+
+ comment: 48 pages +
+
+
+
+
+ + ♻ ☆ RecPrompt: A Self-tuning Prompting Framework for News Recommendation + Using Large Language Models + + +
+ News recommendations heavily rely on Natural Language Processing (NLP) +methods to analyze, understand, and categorize content, enabling personalized +suggestions based on user interests and reading behaviors. Large Language +Models (LLMs) like GPT-4 have shown promising performance in understanding +natural language. However, the extent of their applicability to news +recommendation systems remains to be validated. This paper introduces +RecPrompt, the first self-tuning prompting framework for news recommendation, +leveraging the capabilities of LLMs to perform complex news recommendation +tasks. This framework incorporates a news recommender and a prompt optimizer +that applies an iterative bootstrapping process to enhance recommendations +through automatic prompt engineering. Extensive experimental results with 400 +users show that RecPrompt can achieve an improvement of 3.36% in AUC, 10.49% in +MRR, 9.64% in nDCG@5, and 6.20% in nDCG@10 compared to deep neural models. +Additionally, we introduce TopicScore, a novel metric to assess explainability +by evaluating LLM's ability to summarize topics of interest for users. The +results show LLM's effectiveness in accurately identifying topics of interest +and delivering comprehensive topic-based explanations. + +
+
+ comment: 5 pages, 2 figures, and 2 tables +
+
+
+
+
+
+ ♻ ☆ Research on Travel Route Planning Problems Based on Greedy Algorithm
+
+
+
+ The route planning problem based on the greedy algorithm represents a method +of identifying the optimal or near-optimal route between a given start point +and end point. In this paper, the PCA method is employed initially to downscale +the city evaluation indexes, extract the key principal components, and then +downscale the data using the KMO and TOPSIS algorithms, all of which are based +on the MindSpore framework. Secondly, for the dataset that does not pass the +KMO test, the entropy weight method and TOPSIS method will be employed for +comprehensive evaluation. Finally, a route planning algorithm is proposed and +optimised based on the greedy algorithm, which provides personalised route +customisation according to the different needs of tourists. In addition, the +local travelling efficiency, the time required to visit tourist attractions and +the necessary daily breaks are considered in order to reduce the cost and avoid +falling into the locally optimal solution. + +
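+
+ The greedy step can be sketched as repeatedly taking the highest-value-per-hour stop that
+ still fits a daily time budget (a minimal illustration only; the attraction scores would come
+ from the PCA / entropy-weight / TOPSIS evaluation above, and all numbers here are placeholders):
+
+```python
+# Toy attractions: (name, composite evaluation score, hours needed including
+# local travel). Scores and durations are made-up placeholders.
+attractions = [
+    ("Museum", 0.9, 3.0), ("Old Town", 0.8, 2.0),
+    ("Lake Park", 0.6, 1.5), ("Tower", 0.7, 2.5), ("Market", 0.4, 1.0),
+]
+
+def greedy_route(attractions, hours_per_day=8.0, days=2):
+    """Greedily fill each day with the best remaining value-per-hour stop."""
+    remaining = sorted(attractions, key=lambda a: a[1] / a[2], reverse=True)
+    plan = []
+    for _ in range(days):
+        day, left = [], hours_per_day
+        for stop in list(remaining):
+            name, score, hours = stop
+            if hours <= left:          # only take stops that still fit today
+                day.append(name)
+                left -= hours
+                remaining.remove(stop)
+        plan.append(day)
+    return plan
+
+print(greedy_route(attractions))
+```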
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Generate and Instantiate What You Prefer: Text-Guided Diffusion for + Sequential Recommendation + + +
+ Recent advancements in generative recommendation systems, particularly in the
+realm of sequential recommendation tasks, have shown promise in enhancing
+generalization to new items. Among these approaches, diffusion-based generative
+recommendation has emerged as an effective tool, leveraging its ability to
+capture data distributions and generate high-quality samples. Despite their
+effectiveness, two primary challenges have been identified: 1) the lack of
+consistent modeling of data distribution for oracle items; and 2) the
+difficulty in scaling to more informative control signals beyond historical
+interactions. These issues stem from the uninformative nature of ID embeddings,
+which necessitate random initialization and limit the incorporation of
+additional control signals. To address these limitations, we propose iDreamRec,
+which incorporates more concrete prior knowledge to establish item embeddings,
+particularly through detailed item text descriptions and advanced Text
+Embedding Models (TEM). More importantly, by converting item descriptions into
+embeddings aligned with TEM, we enable the integration of intention
+instructions as control signals to guide the generation of oracle items.
+Experimental results on four datasets demonstrate that iDreamRec not only
+outperforms existing diffusion-based generative recommenders but also
+facilitates the incorporation of intention instructions for more precise and
+effective recommendation generation.
+
+
+
+
+
+
+ + ♻ ☆ A Text is Worth Several Tokens: Text Embedding from LLMs Secretly Aligns + Well with The Key Tokens + + +
+ Text embeddings from large language models (LLMs) have achieved excellent
+results in tasks such as information retrieval, semantic textual similarity,
+etc. In this work, we show an interesting finding: when a text is fed into an
+embedding LLM, the resulting text embedding aligns with the key tokens in the
+input text. We first fully analyze this phenomenon on eight embedding LLMs and
+show that this phenomenon is universal and is not affected by model
+architecture, training strategy, or embedding method. With a deeper
+analysis, we then find that the main change in embedding space between the
+embedding LLMs and their original generative LLMs is in the first principal
+component. By adjusting the first principal component, we can align the text
+embedding with the key tokens. Finally, we give several examples to demonstrate
+the vast application potential of this finding: (1) we propose a simple and
+practical sparse retrieval method based on the aligned tokens, which achieves
+80\% of the dense retrieval effectiveness of the same model while significantly
+reducing computation; (2) we show that our findings provide a fresh
+perspective to help understand fuzzy concepts (e.g., semantic relatedness vs.
+semantic similarity) and emerging technologies (e.g., instruction-following
+embedding) in this field.
+
+
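+
+ The alignment observation can be probed by decoding a text embedding against a token
+ embedding matrix and reading off the highest-scoring tokens (a toy sketch with random
+ matrices and a tiny vocabulary; real key tokens come from the embedding LLM itself):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+vocab = ["the", "cat", "sat", "mat", "dog", "ran"]
+# Stand-ins for the decoder's token embedding matrix and a text embedding;
+# in the paper both come from the same embedding LLM.
+token_matrix = rng.standard_normal((len(vocab), 64))
+text_embedding = 0.6 * token_matrix[1] + 0.4 * token_matrix[3]  # "cat" + "mat"
+
+# Score the text embedding against the vocabulary and read off "key" tokens.
+scores = token_matrix @ text_embedding
+key_tokens = [vocab[i] for i in np.argsort(-scores)[:2]]
+print(key_tokens)  # expected to surface 'cat' and 'mat'
+
+# A sparse-retrieval flavour of the finding: represent texts by their key-token
+# weights and score documents by weighted token overlap.
+def sparse_vector(embedding, top_k=2):
+    s = token_matrix @ embedding
+    idx = np.argsort(-s)[:top_k]
+    return {vocab[i]: float(s[i]) for i in idx}
+
+print(sparse_vector(text_embedding))
+```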
+
+ comment: Work in Progress +
+
+
+
+
+ + ♻ ☆ Automatic Search of Multiword Place Names on Historical Maps SP + + +
+ Historical maps are invaluable sources of information about the past, and +scanned historical maps are increasingly accessible in online libraries. To +retrieve maps from these large libraries that contain specific places of +interest, previous work has applied computer vision techniques to recognize +words on historical maps, enabling searches for maps that contain specific +place names. However, searching for multiword place names is challenging due to +complex layouts of text labels on historical maps. This paper proposes an +efficient query method for searching a given multiword place name on historical +maps. Using existing methods to recognize words on historical maps, we link +single-word text labels into potential multiword phrases by constructing +minimum spanning trees. These trees aim to link pairs of text labels that are +spatially close and have similar height, angle, and capitalization. We then +query these trees for the given multiword place name. We evaluate the proposed +method in two experiments: 1) to evaluate the accuracy of the minimum spanning +tree approach at linking multiword place names and 2) to evaluate the number +and time range of maps retrieved by the query approach. The resulting maps +reveal how places using multiword names have changed on a large number of maps +from across history. + +
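+
+ The linking step can be sketched as a minimum spanning tree over word detections, with an
+ edge cost that grows with spatial distance and with differences in text height (a simplified
+ illustration; the paper's cost also uses angle and capitalization, and the coordinates and
+ threshold below are toy values):
+
+```python
+import numpy as np
+from scipy.sparse.csgraph import minimum_spanning_tree
+
+# Toy word detections: (text, x, y, text_height) from a map-text recognizer.
+words = [("New", 10, 10, 5), ("York", 22, 11, 5), ("River", 300, 200, 3)]
+
+n = len(words)
+cost = np.zeros((n, n))
+for i in range(n):
+    for j in range(i + 1, n):
+        _, xi, yi, hi = words[i]
+        _, xj, yj, hj = words[j]
+        dist = np.hypot(xi - xj, yi - yj)
+        # Penalize pairs whose heights differ; a fuller cost would also
+        # account for angle and capitalization.
+        cost[i, j] = cost[j, i] = dist * (1.0 + abs(hi - hj))
+
+mst = minimum_spanning_tree(cost).toarray()
+# Keep only cheap MST edges as candidate multiword links.
+links = [(words[i][0], words[j][0])
+         for i in range(n) for j in range(n)
+         if 0 < mst[i, j] < 50]
+print(links)  # [('New', 'York')] under these toy thresholds
+```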
+
+ comment: 4 pages, 4 figures, and 2 tables. To be published in proceedings ACM + SIGSPATIAL 2024 GeoSearch Workshop +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Personalized Playback Technology: How Short Video Services Create + Excellent User Experience + + +
+ Short-form video content has become increasingly popular and influential in
+recent years. Its concise yet engaging format aligns well with today's
+fast-paced and on-the-go lifestyles, making it a dominant trend in the
+digital world. As one of the front-runners in the short video platform space,
+ByteDance has been highly successful in delivering a one-of-a-kind short video
+experience and attracting billions of users worldwide. One key contributing
+factor is its advanced end-to-end personalized short video playback technology,
+a technical field we have pioneered and developed over the past five years to
+optimize user experience. This paper introduces the major concepts and
+methodologies of this personalized video playback technology that distinguish
+it from traditional multimedia technologies. More details, including goal
+setting, iterative process, modeling, experimental methods and required
+supporting systems, are also provided to encourage deeper research in this
+area.
+
+
+
+
+
+
+ + ♻ ☆ NVLM: Open Frontier-Class Multimodal LLMs + + +
+ We introduce NVLM 1.0, a family of frontier-class multimodal large language +models (LLMs) that achieve state-of-the-art results on vision-language tasks, +rivaling the leading proprietary models (e.g., GPT-4o) and open-access models +(e.g., Llama 3-V 405B and InternVL 2). Remarkably, NVLM 1.0 shows improved +text-only performance over its LLM backbone after multimodal training. In terms +of model design, we perform a comprehensive comparison between decoder-only +multimodal LLMs (e.g., LLaVA) and cross-attention-based models (e.g., +Flamingo). Based on the strengths and weaknesses of both approaches, we propose +a novel architecture that enhances both training efficiency and multimodal +reasoning capabilities. Furthermore, we introduce a 1-D tile-tagging design for +tile-based dynamic high-resolution images, which significantly boosts +performance on multimodal reasoning and OCR-related tasks. Regarding training +data, we meticulously curate and provide detailed information on our multimodal +pretraining and supervised fine-tuning datasets. Our findings indicate that +dataset quality and task diversity are more important than scale, even during +the pretraining phase, across all architectures. Notably, we develop +production-grade multimodality for the NVLM-1.0 models, enabling them to excel +in vision-language tasks while maintaining and even improving text-only +performance compared to their LLM backbones. To achieve this, we craft and +integrate a high-quality text-only dataset into multimodal training, alongside +a substantial amount of multimodal math and reasoning data, leading to enhanced +math and coding capabilities across modalities. To advance research in the +field, we release the model weights at https://huggingface.co/nvidia/NVLM-D-72B +and will open-source the training code for the community soon. + +
+
+ comment: Fixed the typos. For more information, please visit our project page + at: https://research.nvidia.com/labs/adlr/NVLM-1 +
+
+
+
+
+ + ♻ ☆ AIM 2024 Challenge on Compressed Video Quality Assessment: Methods and + Results + + +
+ Video quality assessment (VQA) is a crucial task in the development of video
+compression standards, as it directly impacts the viewer experience. This paper
+presents the results of the Compressed Video Quality Assessment challenge, held
+in conjunction with the Advances in Image Manipulation (AIM) workshop at ECCV
+2024. The challenge aimed to evaluate the performance of VQA methods on a
+diverse dataset of 459 videos, encoded with 14 codecs of various compression
+standards (AVC/H.264, HEVC/H.265, AV1, and VVC/H.266) and containing a
+comprehensive collection of compression artifacts. To measure the methods'
+performance, we employed traditional correlation coefficients between their
+predictions and subjective scores, which were collected via large-scale
+crowdsourced pairwise human comparisons. For training purposes, participants
+were provided with the Compressed Video Quality Assessment Dataset (CVQAD), a
+previously developed dataset of 1022 videos. Up to 30 participating teams
+registered for the challenge, and we report the results of the 6 teams that
+submitted valid final solutions and code for reproducing the results. Moreover,
+we calculate and present the performance of state-of-the-art VQA methods on
+the developed dataset, providing a comprehensive benchmark for future research.
+The dataset, results, and online leaderboard are publicly available at
+https://challenges.videoprocessing.ai/challenges/compressedvideo-quality-assessment.html.
+
+
+
+
+
+
+ + ♻ ☆ WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio + Language Modeling + + +
+ Language models have been effectively applied to modeling natural signals,
+such as images, video, speech, and audio. A crucial component of these models
+is the codec tokenizer, which compresses high-dimensional natural signals into
+lower-dimensional discrete tokens. In this paper, we introduce WavTokenizer,
+which offers several advantages over previous SOTA acoustic codec models in the
+audio domain: 1) extreme compression. By compressing the layers of quantizers
+and the temporal dimension of the discrete codec, one-second audio of 24kHz
+sampling rate requires only a single quantizer with 40 or 75 tokens. 2) improved
+subjective quality. Despite the reduced number of tokens, WavTokenizer achieves
+state-of-the-art reconstruction quality with outstanding UTMOS scores and
+inherently contains richer semantic information. Specifically, we achieve these
+results by designing a broader VQ space, extended contextual windows, and
+improved attention networks, as well as introducing a powerful multi-scale
+discriminator and an inverse Fourier transform structure. We conducted
+extensive reconstruction experiments in the domains of speech, audio, and
+music. WavTokenizer exhibited strong performance across various objective and
+subjective metrics compared to state-of-the-art models. We also tested semantic
+information, VQ utilization, and adaptability to generative models.
+Comprehensive ablation studies confirm the necessity of each module in
+WavTokenizer. The related code, demos, and pre-trained models are available at
+https://github.com/jishengpeng/WavTokenizer.
+
+
+
+
+ comment: Work in progress
+
+
+
+
+
+ + ♻ ☆ Reducing Hallucinations in Vision-Language Models via Latent Space + Steering + + +
+ Hallucination poses a challenge to the deployment of large vision-language +models (LVLMs) in applications. Unlike in large language models (LLMs), +hallucination in LVLMs often arises from misalignments between visual inputs +and textual outputs. This paper investigates the underlying mechanisms of +hallucination, focusing on the unique structure of LVLMs that distinguishes +them from large language models (LLMs). We identify that hallucinations often +arise from the sensitivity of text decoders to vision inputs, a natural +phenomenon when image encoders and text decoders are pre-trained separately. +Inspired by this, we introduce Visual and Textual Intervention (VTI), a novel +technique designed to reduce hallucinations by steering latent space +representations during inference to enhance the stability of vision features. +As a task-agnostic test-time intervention, VTI can be easily applied to any +problem without additional cost. Extensive experiments demonstrate that it can +effectively reduce hallucinations and outperform baseline methods across +multiple metrics, highlighting the critical role of vision feature stability in +LVLMs. + +
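+
+ The test-time intervention amounts to shifting intermediate activations along a
+ pre-computed steering direction via a forward hook (a generic sketch with a toy layer
+ and a random direction; VTI's actual direction estimation and target layers differ):
+
+```python
+import torch
+import torch.nn as nn
+
+# Stand-in for one transformer block of a vision-language model; in practice
+# the hook would be registered on the real model's chosen layers.
+layer = nn.Linear(16, 16)
+
+# A steering direction, assumed pre-computed offline (e.g., a mean difference
+# between hidden states from stable and perturbed visual inputs). Random here.
+steering_vec = torch.randn(16)
+alpha = 0.5  # intervention strength
+
+def steer(module, inputs, output):
+    # Shift the layer's activations along the steering direction at inference.
+    return output + alpha * steering_vec
+
+handle = layer.register_forward_hook(steer)
+hidden = torch.randn(4, 16)
+steered = layer(hidden)       # hook applied, activations shifted
+handle.remove()
+unsteered = layer(hidden)     # hook removed, original behaviour
+print(torch.allclose(steered - unsteered, alpha * steering_vec.expand(4, 16)))
+```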
+
+ comment: 21 pages +
+
+
+
+
+ + ♻ ☆ Designing Network Algorithms via Large Language Models + + +
+ We introduce NADA, the first framework to autonomously design network +algorithms by leveraging the generative capabilities of large language models +(LLMs). Starting with an existing algorithm implementation, NADA enables LLMs +to create a wide variety of alternative designs in the form of code blocks. It +then efficiently identifies the top-performing designs through a series of +filtering techniques, minimizing the need for full-scale evaluations and +significantly reducing computational costs. Using adaptive bitrate (ABR) +streaming as a case study, we demonstrate that NADA produces novel ABR +algorithms -- previously unknown to human developers -- that consistently +outperform the original algorithm in diverse network environments, including +broadband, satellite, 4G, and 5G. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ STAR: A Simple Training-free Approach for Recommendations using Large + Language Models + + +
+ Recent progress in large language models (LLMs) offers promising new +approaches for recommendation system (RecSys) tasks. While the current +state-of-the-art methods rely on fine-tuning LLMs to achieve optimal results, +this process is costly and introduces significant engineering complexities. +Conversely, methods that bypass fine-tuning and use LLMs directly are less +resource-intensive but often fail to fully capture both semantic and +collaborative information, resulting in sub-optimal performance compared to +their fine-tuned counterparts. In this paper, we propose a Simple Training-free +Approach for Recommendation (STAR), a framework that utilizes LLMs and can be +applied to various recommendation tasks without the need for fine-tuning. Our +approach involves a retrieval stage that uses semantic embeddings from LLMs +combined with collaborative user information to retrieve candidate items. We +then apply an LLM for pairwise ranking to enhance next-item prediction. +Experimental results on the Amazon Review dataset show competitive performance +for next item prediction, even with our retrieval stage alone. Our full method +achieves Hits@10 performance of +23.8% on Beauty, +37.5% on Toys and Games, and +-1.8% on Sports and Outdoors relative to the best supervised models. This +framework offers an effective alternative to traditional supervised models, +highlighting the potential of LLMs in recommendation systems without extensive +training or custom architectures. + +
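+
+ The training-free retrieval stage can be sketched as interpolating an LLM-embedding
+ similarity with a collaborative co-occurrence signal before LLM re-ranking (a toy
+ illustration only; the interpolation weight, toy data, and scoring details are
+ assumptions, not the paper's exact formulation):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_items, dim = 6, 8
+item_emb = rng.standard_normal((n_items, dim))            # LLM item embeddings
+item_emb /= np.linalg.norm(item_emb, axis=1, keepdims=True)
+co_counts = rng.integers(0, 20, size=(n_items, n_items))  # user co-interactions
+
+def retrieve(history, k=3, alpha=0.5):
+    """Score items by semantic similarity to the history plus co-occurrence."""
+    semantic = item_emb @ item_emb[history].mean(axis=0)
+    collaborative = co_counts[history].mean(axis=0)
+    collaborative = collaborative / (collaborative.max() + 1e-9)
+    scores = alpha * semantic + (1 - alpha) * collaborative
+    scores[history] = -np.inf                # do not re-recommend seen items
+    return np.argsort(-scores)[:k]
+
+candidates = retrieve(history=[0, 2])
+print(candidates)  # these candidates would then be pairwise re-ranked by an LLM
+```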
+
+
+
+
+ + ☆ Limpeh ga li gong: Challenges in Singlish Annotations + + +
+ Singlish, or Colloquial Singapore English, is a language formed from oral and +social communication within multicultural Singapore. In this work, we work on a +fundamental Natural Language Processing (NLP) task: Parts-Of-Speech (POS) +tagging of Singlish sentences. For our analysis, we build a parallel Singlish +dataset containing direct English translations and POS tags, with translation +and POS annotation done by native Singlish speakers. Our experiments show that +automatic transition- and transformer- based taggers perform with only $\sim +80\%$ accuracy when evaluated against human-annotated POS labels, suggesting +that there is indeed room for improvement on computation analysis of the +language. We provide an exposition of challenges in Singlish annotation: its +inconsistencies in form and semantics, the highly context-dependent particles +of the language, its structural unique expressions, and the variation of the +language on different mediums. Our task definition, resultant labels and +results reflects the challenges in analysing colloquial languages formulated +from a variety of dialects, and paves the way for future studies beyond POS +tagging. + +
+
+
+
+
+ + ☆ PODTILE: Facilitating Podcast Episode Browsing with Auto-generated + Chapters CIKM + + +
+ Listeners of long-form talk-audio content, such as podcast episodes, often +find it challenging to understand the overall structure and locate relevant +sections. A practical solution is to divide episodes into +chapters--semantically coherent segments labeled with titles and timestamps. +Since most episodes on our platform at Spotify currently lack creator-provided +chapters, automating the creation of chapters is essential. Scaling the +chapterization of podcast episodes presents unique challenges. First, episodes +tend to be less structured than written texts, featuring spontaneous +discussions with nuanced transitions. Second, the transcripts are usually +lengthy, averaging about 16,000 tokens, which necessitates efficient processing +that can preserve context. To address these challenges, we introduce PODTILE, a +fine-tuned encoder-decoder transformer to segment conversational data. The +model simultaneously generates chapter transitions and titles for the input +transcript. To preserve context, each input text is augmented with global +context, including the episode's title, description, and previous chapter +titles. In our intrinsic evaluation, PODTILE achieved an 11% improvement in +ROUGE score over the strongest baseline. Additionally, we provide insights into +the practical benefits of auto-generated chapters for listeners navigating +episode content. Our findings indicate that auto-generated chapters serve as a +useful tool for engaging with less popular podcasts. Finally, we present +empirical evidence that using chapter titles can enhance effectiveness of +sparse retrieval in search tasks. + +
+
+ comment: 9 pages, 4 figures, CIKM industry track 2024 +
+
+
+
+
+ + ☆ Unleashing the Potential of Multi-Channel Fusion in Retrieval for + Personalized Recommendations + + +
+ Recommender systems (RS) are pivotal in managing information overload in +modern digital services. A key challenge in RS is efficiently processing vast +item pools to deliver highly personalized recommendations under strict latency +constraints. Multi-stage cascade ranking addresses this by employing +computationally efficient retrieval methods to cover diverse user interests, +followed by more precise ranking models to refine the results. In the retrieval +stage, multi-channel retrieval is often used to generate distinct item subsets +from different candidate generators, leveraging the complementary strengths of +these methods to maximize coverage. However, forwarding all retrieved items +overwhelms downstream rankers, necessitating truncation. Despite advancements +in individual retrieval methods, multi-channel fusion, the process of +efficiently merging multi-channel retrieval results, remains underexplored. We +are the first to identify and systematically investigate multi-channel fusion +in the retrieval stage. Current industry practices often rely on heuristic +approaches and manual designs, which often lead to suboptimal performance. +Moreover, traditional gradient-based methods like SGD are unsuitable for this +task due to the non-differentiable nature of the selection process. In this +paper, we explore advanced channel fusion strategies by assigning +systematically optimized weights to each channel. We utilize black-box +optimization techniques, including the Cross Entropy Method and Bayesian +Optimization for global weight optimization, alongside policy gradient-based +approaches for personalized merging. Our methods enhance both personalization +and flexibility, achieving significant performance improvements across multiple +datasets and yielding substantial gains in real-world deployments, offering a +scalable solution for optimizing multi-channel fusion in retrieval. + +
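+
+ A minimal cross-entropy-method loop for tuning per-channel merge weights against an
+ offline objective (the objective below is a synthetic placeholder standing in for logged
+ recall after truncation, and all constants are assumptions):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_channels = 4
+
+def offline_recall(weights):
+    """Placeholder objective: recall of a weighted, truncated merge measured
+    on logged data. A synthetic concave function stands in for it here."""
+    target = np.array([0.5, 0.2, 0.2, 0.1])
+    w = weights / weights.sum()
+    return -np.sum((w - target) ** 2)
+
+# Cross-Entropy Method: sample weights, keep the elite fraction, refit.
+mean, std = np.full(n_channels, 0.25), np.full(n_channels, 0.2)
+for _ in range(30):
+    samples = np.abs(rng.normal(mean, std, size=(200, n_channels))) + 1e-6
+    scores = np.array([offline_recall(s) for s in samples])
+    elite = samples[np.argsort(-scores)[:20]]
+    mean, std = elite.mean(axis=0), elite.std(axis=0) + 1e-3
+
+print(mean / mean.sum())  # learned merge weights, roughly [0.5, 0.2, 0.2, 0.1]
+```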
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Surprising Patterns in Musical Influence Networks + + +
+ Analyzing musical influence networks, such as those formed by artist +influence or sampling, has provided valuable insights into contemporary Western +music. Here, computational methods like centrality rankings help identify +influential artists. However, little attention has been given to how influence +changes over time. In this paper, we apply Bayesian Surprise to track the +evolution of musical influence networks. Using two networks -- one of artist +influence and another of covers, remixes, and samples -- our results reveal +significant periods of change in network structure. Additionally, we +demonstrate that Bayesian Surprise is a flexible framework for testing various +hypotheses on network evolution with real-world data. + +
+
+ comment: To appear in the Latin American Musical Information Retrieval + Workshop +
+
+
+
+
+ + ☆ Developing Retrieval Augmented Generation (RAG) based LLM Systems from + PDFs: An Experience Report + + +
+ This paper presents an experience report on the development of Retrieval
+Augmented Generation (RAG) systems using PDF documents as the primary data
+source. The RAG architecture combines the generative capabilities of Large
+Language Models (LLMs) with the precision of information retrieval. This
+approach has the potential to redefine how we interact with and augment both
+structured and unstructured knowledge in generative models to enhance
+transparency, accuracy, and contextuality of responses. The paper details the
+end-to-end pipeline, from data collection and preprocessing to retrieval
+indexing and response generation, highlighting technical challenges and
+practical solutions. We aim to offer insights to researchers and practitioners
+developing similar systems using two distinct approaches: OpenAI's Assistant
+API with GPT Series and Llama's open-source models. The practical implications
+of this research lie in enhancing the reliability of generative AI systems in
+various sectors where domain-specific knowledge and real-time information
+retrieval are important. The Python code used in this work is also available at:
+https://github.com/GPT-Laboratory/RAG-LLM-Development-Guidebook-from-PDFs.
+
+
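+
+ A minimal end-to-end sketch of such a pipeline, using pypdf for extraction and TF-IDF
+ retrieval standing in for an embedding index (the file path is a placeholder and the
+ generation call is left as a stub, since it depends on the chosen LLM API; this is not
+ the paper's implementation):
+
+```python
+from pypdf import PdfReader
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+def load_chunks(pdf_path, chunk_chars=1000):
+    """Extract text from a PDF and split it into fixed-size chunks."""
+    text = " ".join(page.extract_text() or "" for page in PdfReader(pdf_path).pages)
+    return [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
+
+def retrieve(query, chunks, k=3):
+    """Rank chunks by TF-IDF cosine similarity to the query."""
+    vectorizer = TfidfVectorizer().fit(chunks + [query])
+    chunk_vecs = vectorizer.transform(chunks)
+    query_vec = vectorizer.transform([query])
+    sims = cosine_similarity(query_vec, chunk_vecs)[0]
+    return [chunks[i] for i in sims.argsort()[::-1][:k]]
+
+if __name__ == "__main__":
+    chunks = load_chunks("report.pdf")          # path is a placeholder
+    question = "What are the key findings?"
+    context = "\n\n".join(retrieve(question, chunks))
+    prompt = f"Answer using only this context:\n{context}\n\nQuestion: {question}"
+    # The prompt would then be sent to the chosen LLM (an OpenAI Assistant or a
+    # local Llama model in the two setups compared above).
+    print(prompt[:500])
+```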
+
+ comment: 36 pages, 8 figures, 2 tables, and python code snippets +
+
+
+
+
+ + ☆ Centrality-aware Product Retrieval and Ranking EMNLP 2024 + + +
+ This paper addresses the challenge of improving user experience on e-commerce +platforms by enhancing product ranking relevant to users' search queries. +Ambiguity and complexity of user queries often lead to a mismatch between the +user's intent and retrieved product titles or documents. Recent approaches have +proposed the use of Transformer-based models, which need millions of annotated +query-title pairs during the pre-training stage, and this data often does not +take user intent into account. To tackle this, we curate samples from existing +datasets at eBay, manually annotated with buyer-centric relevance scores and +centrality scores, which reflect how well the product title matches the users' +intent. We introduce a User-intent Centrality Optimization (UCO) approach for +existing models, which optimises for the user intent in semantic product +search. To that end, we propose a dual-loss based optimisation to handle hard +negatives, i.e., product titles that are semantically relevant but do not +reflect the user's intent. Our contributions include curating challenging +evaluation sets and implementing UCO, resulting in significant product ranking +efficiency improvements observed for different evaluation metrics. Our work +aims to ensure that the most buyer-centric titles for a query are ranked +higher, thereby, enhancing the user experience on e-commerce platforms. + +
+
+ comment: EMNLP 2024: Industry track +
+
+
+
+
+
+ ☆ Using GPT Models for Qualitative and Quantitative News Analytics in the
+ 2024 US Presidential Election Process
+
+
+
+ The paper considers an approach that uses the Google Search API and the GPT-4o
+model for qualitative and quantitative analyses of news through
+retrieval-augmented generation (RAG). This approach was applied to analyze news
+about the 2024 US presidential election process. Different news sources for
+different time periods have been analyzed. Quantitative scores generated by the
+GPT model have been analyzed using Bayesian regression to derive trend lines.
+The distributions found for the regression parameters allow for the analysis of
+uncertainty in the election process. The obtained results demonstrate that, by
+using GPT models for news analysis, one can obtain informative analytics and
+key insights that can be applied in further analyses of election processes.
+
+
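+
+ For illustration, a conjugate Bayesian linear regression turns per-day scores into a trend
+ line with uncertainty (toy synthetic scores, prior scale, and noise variance are assumptions;
+ this is only a generic sketch of the technique, not the paper's model):
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+days = np.arange(30, dtype=float)
+scores = 0.03 * days + rng.normal(0, 0.2, size=days.size)  # toy daily GPT scores
+
+# Bayesian linear regression with Gaussian prior w ~ N(0, tau^2 I) and known
+# noise variance sigma^2; the posterior mean and covariance are closed-form.
+X = np.column_stack([np.ones_like(days), days])
+sigma2, tau2 = 0.2 ** 2, 10.0
+precision = X.T @ X / sigma2 + np.eye(2) / tau2
+cov = np.linalg.inv(precision)
+mean = cov @ X.T @ scores / sigma2
+
+slope, slope_std = mean[1], np.sqrt(cov[1, 1])
+print(f"trend slope ~ {slope:.3f} +/- {slope_std:.3f}")  # uncertainty in the trend
+```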
+
+
+
+
+ + ☆ Improve Dense Passage Retrieval with Entailment Tuning EMNLP 2024 + + +
+ A retrieval module can be plugged into many downstream NLP tasks to improve
+their performance, such as open-domain question answering and
+retrieval-augmented generation. The key to a retrieval system is to calculate
+relevance scores for query-passage pairs. However, the definition of
+relevance is often ambiguous. We observe that a major class of relevance
+aligns with the concept of entailment in NLI tasks. Based on this observation,
+we design a method called entailment tuning to improve the embeddings of dense
+retrievers. Specifically, we unify the form of retrieval data and NLI data
+using existence claims as a bridge. Then, we train retrievers to predict the
+claims entailed in a passage with a variant of the masked prediction task. Our
+method can be efficiently plugged into current dense retrieval methods, and
+experiments show its effectiveness.
+
+
+
+ comment: EMNLP 2024 Main +
+
+
+
+
+ + ☆ Who's Who: Large Language Models Meet Knowledge Conflicts in Practice EMNLP 2024 + + +
+ Retrieval-augmented generation (RAG) methods are viable solutions for
+addressing the static memory limits of pre-trained language models.
+Nevertheless, encountering conflicting sources of information within the
+retrieval context is an inevitable practical challenge. In such situations, the
+language models are recommended to transparently inform users about the
+conflicts rather than autonomously deciding what to present based on their
+inherent biases. To analyze how current large language models (LLMs) align with
+our recommendation, we introduce WhoQA, a public benchmark dataset to examine
+models' behavior in knowledge conflict situations. We induce conflicts by
+asking about a common property among entities having the same name, resulting
+in questions with up to 8 distinct answers. The WhoQA evaluation set includes
+5K questions across 13 Wikidata property types and 150K Wikipedia entities. Our
+experiments show that despite the simplicity of WhoQA questions, knowledge
+conflicts significantly degrade LLMs' performance in RAG settings.
+
+
+
+ comment: Accepted to EMNLP 2024 Findings +
+
+
+
+
+ + ☆ A Survey of Conversational Search + + +
+ As a cornerstone of modern information access, search engines have become +indispensable in everyday life. With the rapid advancements in AI and natural +language processing (NLP) technologies, particularly large language models +(LLMs), search engines have evolved to support more intuitive and intelligent +interactions between users and systems. Conversational search, an emerging +paradigm for next-generation search engines, leverages natural language +dialogue to facilitate complex and precise information retrieval, thus +attracting significant attention. Unlike traditional keyword-based search +engines, conversational search systems enhance user experience by supporting +intricate queries, maintaining context over multi-turn interactions, and +providing robust information integration and processing capabilities. Key +components such as query reformulation, search clarification, conversational +retrieval, and response generation work in unison to enable these sophisticated +interactions. In this survey, we explore the recent advancements and potential +future directions in conversational search, examining the critical modules that +constitute a conversational search system. We highlight the integration of LLMs +in enhancing these systems and discuss the challenges and opportunities that +lie ahead in this dynamic field. Additionally, we provide insights into +real-world applications and robust evaluations of current conversational search +systems, aiming to guide future research and development in conversational +search. + +
+
+ comment: 35 pages, 8 figures, continue to update +
+
+
+
+
+ + ♻ ☆ OAEI-LLM: A Benchmark Dataset for Understanding Large Language Model + Hallucinations in Ontology Matching + + +
+ Hallucinations of large language models (LLMs) commonly occur in +domain-specific downstream tasks, with no exception in ontology matching (OM). +The prevalence of using LLMs for OM raises the need for benchmarks to better +understand LLM hallucinations. The OAEI-LLM dataset is an extended version of +the Ontology Alignment Evaluation Initiative (OAEI) datasets that evaluate +LLM-specific hallucinations in OM tasks. We outline the methodology used in +dataset construction and schema extension, and provide examples of potential +use cases. + +
+
+ comment: 5 pages, 1 figure, 1 table +
+
+
+
+
+ + ♻ ☆ LLM-based SPARQL Query Generation from Natural Language over Federated + Knowledge Graphs + + +
+ We introduce a Retrieval-Augmented Generation (RAG) system for translating +user questions into accurate federated SPARQL queries over bioinformatics +knowledge graphs (KGs) leveraging Large Language Models (LLMs). To enhance +accuracy and reduce hallucinations in query generation, our system utilises +metadata from the KGs, including query examples and schema information, and +incorporates a validation step to correct generated queries. The system is +available online at chat.expasy.org. + +
+
+
+
+
+ + ♻ ☆ QUIDS: Query Intent Generation via Dual Space Modeling + + +
+ Query understanding is a crucial component of Information Retrieval (IR), +aimed at identifying the underlying search intent of textual queries. However, +most existing approaches oversimplify this task into query classification or +clustering, which fails to fully capture the nuanced intent behind the query. +In this paper, we address the task of query intent generation: to automatically +generate detailed and precise intent descriptions for search queries using +relevant and irrelevant documents given a query. These intent descriptions can +help users understand why the search engine considered the top-ranked documents +relevant, and provide more transparency to the retrieval process. We propose a +dual-space model that uses semantic relevance and irrelevance information in +the returned documents to explain the understanding of the query intent. +Specifically, in the encoding process, we project, separate, and distinguish +relevant and irrelevant documents in the representation space. Then, we +introduce a semantic decoupling model in the novel disentangling space, where +the semantics of irrelevant information are removed from the relevant space, +ensuring that only the essential and relevant intent is captured. This process +refines the understanding of the query and provides more accurate explanations +for the search results. Experiments on benchmark data demonstrate that our +methods produce high-quality query intent descriptions, outperforming existing +methods for this task, as well as state-of-the-art query-based summarization +methods. A token-level visualization of attention scores reveals that our model +effectively reduces the focus on irrelevant intent topics. Our findings open up +promising research and application directions for query intent generation, +particularly in exploratory search. + +
+
+
+
+
+ + ♻ ☆ Lightweight Correlation-Aware Table Compression NeurIPS 2024 + + +
+ The growing adoption of data lakes for managing relational data necessitates +efficient, open storage formats that provide high scan performance and +competitive compression ratios. While existing formats achieve fast scans +through lightweight encoding techniques, they have reached a plateau in terms +of minimizing storage footprint. Recently, correlation-aware compression +schemes have been shown to reduce file sizes further. Yet, current approaches +either incur significant scan overheads or require manual specification of +correlations, limiting their practicability. We present $\texttt{Virtual}$, a +framework that integrates seamlessly with existing open formats to +automatically leverage data correlations, achieving substantial compression +gains while having minimal scan performance overhead. Experiments on data-gov +datasets show that $\texttt{Virtual}$ reduces file sizes by up to 40% compared +to Apache Parquet. + +
+
+ comment: Third Table Representation Learning Workshop (TRL @ NeurIPS 2024) +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ AlignVSR: Audio-Visual Cross-Modal Alignment for Visual Speech + Recognition + + +
+ Visual Speech Recognition (VSR) aims to recognize corresponding text by
+analyzing visual information from lip movements. Due to the high variability
+and limited information content of lip movements, VSR tasks require effectively
+utilizing information from any source and at any level. In this paper, we
+propose a VSR method based on audio-visual cross-modal alignment, named
+AlignVSR. The method leverages the audio modality as an auxiliary information
+source and utilizes the global and local correspondence between the audio and
+visual modalities to improve visual-to-text inference. Specifically, the method
+first captures global alignment between video and audio through a cross-modal
+attention mechanism from video frames to a bank of audio units. Then, based on
+the temporal correspondence between audio and video, a frame-level local
+alignment loss is introduced to refine the global alignment, improving the
+utility of the audio information. Experimental results on the LRS2 and
+CNVSRC.Single datasets consistently show that AlignVSR outperforms several
+mainstream VSR methods, demonstrating its superior and robust performance.
+
+
+
+
+
+
+ + ☆ Enhancing Multimodal Affective Analysis with Learned Live Comment + Features + + +
+ Live comments, also known as Danmaku, are user-generated messages that are +synchronized with video content. These comments overlay directly onto streaming +videos, capturing viewer emotions and reactions in real-time. While prior work +has leveraged live comments in affective analysis, its use has been limited due +to the relative rarity of live comments across different video platforms. To +address this, we first construct the Live Comment for Affective Analysis +(LCAffect) dataset which contains live comments for English and Chinese videos +spanning diverse genres that elicit a wide spectrum of emotions. Then, using +this dataset, we use contrastive learning to train a video encoder to produce +synthetic live comment features for enhanced multimodal affective content +analysis. Through comprehensive experimentation on a wide range of affective +analysis tasks (sentiment, emotion recognition, and sarcasm detection) in both +English and Chinese, we demonstrate that these synthetic live comment features +significantly improve performance over state-of-the-art methods. + +
+
+
+
+
+ + ☆ Shorter Is Different: Characterizing the Dynamics of Short-Form Video + Platforms + + +
+ Emerging short-form video platforms have been growing tremendously and have
+recently become one of the leading forms of social media. Although the growing
+popularity of these platforms has attracted increasing research attention,
+there has been a lack of understanding of whether and how they deviate from
+traditional long-form video-sharing platforms such as YouTube and Bilibili. To
+address this, we conduct a large-scale data-driven analysis of Kuaishou, one of
+the largest short-form video platforms in China. Based on 248 million videos
+uploaded to the platform across all categories, we identify their notable
+differences from long-form video platforms through a comparison study with
+Bilibili, a leading long-form video platform in China. We find that videos on
+Kuaishou are several times shorter, with distinctive categorical distributions
+in which life-related rather than interest-based videos are over-represented.
+Users interact with videos less per view, but top videos can even more
+effectively acquire users' collective attention. More importantly, ordinary
+content creators have higher probabilities of producing hit videos. Our results
+shed light on the uniqueness of short-form video platforms and pave the way for
+future research and design for a better short-form video ecosystem.
+
+
+
+
+
+
+ + ☆ Modelling Concurrent RTP Flows for End-to-end Predictions of QoS in Real + Time Communications + + +
+ Real-time Transport Protocol (RTP)-based real-time communications (RTC)
+applications, exemplified by video conferencing, have experienced an
+unparalleled surge in popularity and development in recent years. To optimize
+their performance, predicting Quality of Service (QoS) metrics has emerged as
+a pivotal endeavor, bolstering network monitoring and proactive solutions.
+However, contemporary approaches are confined to individual RTP flows and
+metrics, falling short in capturing cross-flow relationships and in
+computational efficiency. To this end, we propose Packet-to-Prediction (P2P),
+a novel deep learning (DL) framework that operates on raw packets to
+simultaneously process concurrent RTP flows and perform end-to-end prediction
+of multiple QoS metrics. Specifically, we implement a streamlined
+architecture, namely a length-free Transformer with cross and neighbourhood
+attention, capable of handling an unlimited number of RTP flows, and employ a
+multi-task learning paradigm to forecast four key metrics in a single shot.
+Our work is based on extensive traffic collected during real video calls, and
+P2P conclusively outperforms comparative models in both prediction
+performance and temporal efficiency.
+
+
+
+
+
+ + ♻ ☆ Proceedings of The second international workshop on eXplainable AI for + the Arts (XAIxArts) + + +
+ This second international workshop on explainable AI for the Arts (XAIxArts)
+brought together a community of researchers in HCI, Interaction Design, AI,
+explainable AI (XAI), and digital arts to explore the role of XAI for the
+Arts. The workshop was held at the 16th ACM Conference on Creativity and
+Cognition (C&C 2024), Chicago, USA.
+
+
+ comment: Proceedings of The second international workshop on eXplainable AI + for the Arts (XAIxArts) +
+
+
+
+
+ + ♻ ☆ CinePile: A Long Video Question Answering Dataset and Benchmark + + +
+ Current datasets for long-form video understanding often fall short of +providing genuine long-form comprehension challenges, as many tasks derived +from these datasets can be successfully tackled by analyzing just one or a few +random frames from a video. To address this issue, we present a novel dataset +and benchmark, CinePile, specifically designed for authentic long-form video +understanding. This paper details our innovative approach for creating a +question-answer dataset, utilizing advanced LLMs with human-in-the-loop and +building upon human-generated raw data. Our comprehensive dataset comprises +305,000 multiple-choice questions (MCQs), covering various visual and +multimodal aspects, including temporal comprehension, understanding +human-object interactions, and reasoning about events or actions within a +scene. Additionally, we fine-tuned open-source Video-LLMs on the training split +and evaluated both open-source and proprietary video-centric LLMs on the test +split of our dataset. The findings indicate that although current models +underperform compared to humans, fine-tuning these models can lead to +significant improvements in their performance. + +
+
+ comment: Project page with all the artifacts - + https://ruchitrawal.github.io/cinepile/. Updated version with adversarial + refinement pipeline and more model evaluations +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ ConTReGen: Context-driven Tree-structured Retrieval for Open-domain + Long-form Text Generation EMNLP'24 + + +
+ Open-domain long-form text generation requires generating coherent, +comprehensive responses that address complex queries with both breadth and +depth. This task is challenging due to the need to accurately capture diverse +facets of input queries. Existing iterative retrieval-augmented generation +(RAG) approaches often struggle to delve deeply into each facet of complex +queries and integrate knowledge from various sources effectively. This paper +introduces ConTReGen, a novel framework that employs a context-driven, +tree-structured retrieval approach to enhance the depth and relevance of +retrieved content. ConTReGen integrates a hierarchical, top-down in-depth +exploration of query facets with a systematic bottom-up synthesis, ensuring +comprehensive coverage and coherent integration of multifaceted information. +Extensive experiments on multiple datasets, including LFQA and ODSUM, alongside +a newly introduced dataset, ODSUM-WikiHow, demonstrate that ConTReGen +outperforms existing state-of-the-art RAG models. + +
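As a rough illustration of the context-driven, tree-structured idea (top-down facet exploration followed by bottom-up synthesis), here is a hedged Python sketch in which `retrieve`, `decompose_query`, and `synthesize` are placeholders for the retriever and LLM calls, not ConTReGen's actual components:

```python
def retrieve(query, k=5):
    # placeholder: return the top-k passages for `query` from any retriever
    return [f"passage about {query} #{i}" for i in range(k)]

def decompose_query(query):
    # placeholder: an LLM would propose sub-facets of `query` here
    return [f"{query} - facet {i}" for i in range(2)]

def synthesize(query, child_summaries, passages):
    # placeholder: an LLM would fuse child summaries and passages bottom-up
    return f"summary({query}; {len(child_summaries)} facets, {len(passages)} passages)"

def tree_retrieve(query, depth=0, max_depth=2):
    """Top-down, in-depth exploration of query facets, then a bottom-up
    synthesis pass -- a sketch of tree-structured retrieval-augmented generation."""
    passages = retrieve(query)
    if depth >= max_depth:
        return synthesize(query, [], passages)
    child_summaries = [tree_retrieve(facet, depth + 1, max_depth)
                       for facet in decompose_query(query)]
    return synthesize(query, child_summaries, passages)

print(tree_retrieve("how do heat pumps work?"))
```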
+
+ comment: Accepted at EMNLP'24 Findings +
+
+
+
+
+ + ☆ Deep Class-guided Hashing for Multi-label Cross-modal Retrieval + + +
+ Deep hashing, owing to its low cost and efficient retrieval, is widely valued
+in cross-modal retrieval. However, existing cross-modal hashing methods either
+explore the relationships between data points, which inevitably leads to
+intra-class dispersion, or explore the relationships between data points and
+categories while ignoring the preservation of inter-class structural
+relationships, resulting in suboptimal hash codes. To maintain both
+intra-class aggregation and inter-class structural relationships, this paper
+proposes the DCGH method. Specifically, we use a proxy loss as the mainstay to
+maintain intra-class aggregation of data, combined with a pairwise loss to
+preserve inter-class structural relationships, and on this basis we further
+propose a variance constraint to address the semantic bias introduced by the
+combination. Extensive comparative experiments on three benchmark datasets
+show that DCGH achieves comparable or better performance than existing
+cross-modal retrieval methods. The implementation of our DCGH framework is
+available at https://github.com/donnotnormal/DCGH.
+
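A minimal PyTorch sketch of how such a combined objective could look; the temperature, loss weights, and exact formulations below are illustrative assumptions rather than the released DCGH code:

```python
import torch
import torch.nn.functional as F

def proxy_loss(codes, labels, proxies):
    """Pull each hash code toward its class proxy (intra-class aggregation)."""
    logits = F.normalize(codes) @ F.normalize(proxies).t()      # (B, C)
    return F.cross_entropy(logits / 0.1, labels)

def pairwise_loss(codes_a, codes_b, sim):
    """Preserve inter-class structure: similar pairs get close codes, dissimilar far."""
    cos = F.cosine_similarity(codes_a, codes_b)                 # (B,)
    return ((cos - sim) ** 2).mean()

def variance_constraint(codes):
    """Discourage collapsed, biased codes by keeping per-bit variance up."""
    return F.relu(1.0 - codes.var(dim=0)).mean()

def combined_objective(img_codes, txt_codes, labels, sim, proxies,
                       alpha=1.0, beta=0.1):
    loss = proxy_loss(img_codes, labels, proxies) + proxy_loss(txt_codes, labels, proxies)
    loss = loss + alpha * pairwise_loss(img_codes, txt_codes, sim)
    loss = loss + beta * (variance_constraint(img_codes) + variance_constraint(txt_codes))
    return loss

# toy usage
B, bits, C = 16, 64, 10
img, txt = torch.tanh(torch.randn(B, bits)), torch.tanh(torch.randn(B, bits))
labels = torch.randint(0, C, (B,))
sim = torch.ones(B)                 # matched image-text pairs are treated as similar
proxies = torch.randn(C, bits)
loss = combined_objective(img, txt, labels, sim, proxies)
```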
+
+
+
+
+ + ☆ Performance-Driven QUBO for Recommender Systems on Quantum Annealers + + +
+ We propose Counterfactual Analysis Quadratic Unconstrained Binary +Optimization (CAQUBO) to solve QUBO problems for feature selection in +recommender systems. CAQUBO leverages counterfactual analysis to measure the +impact of individual features and feature combinations on model performance and +employs the measurements to construct the coefficient matrix for a quantum +annealer to select the optimal feature combinations for recommender systems, +thereby improving their final recommendation performance. By establishing +explicit connections between features and the recommendation performance, the +proposed approach demonstrates superior performance compared to the +state-of-the-art quantum annealing methods. Extensive experiments indicate that +integrating quantum computing with counterfactual analysis holds great promise +for addressing these challenges. + +
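To make the counterfactual-to-QUBO step concrete, here is a hedged numpy sketch in which `eval_with_features` stands in for retraining and evaluating the recommender on a feature subset; the exact way impacts are turned into coefficients is an assumption for illustration, not the paper's formulation:

```python
import numpy as np

def eval_with_features(feature_subset):
    # placeholder: train/evaluate the recommender using only `feature_subset`
    # and return a performance score (higher is better)
    return np.random.rand()

def counterfactual_qubo(n_features):
    """Build a QUBO coefficient matrix from counterfactual impact measurements:
    diagonal terms from single-feature removal, off-diagonal terms from the
    extra interaction observed when removing feature pairs (minimization form)."""
    full = set(range(n_features))
    base = eval_with_features(full)
    Q = np.zeros((n_features, n_features))
    # diagonal: impact of removing each feature alone (negated for minimization)
    for i in range(n_features):
        Q[i, i] = -(base - eval_with_features(full - {i}))
    # off-diagonal: pairwise interaction beyond the two individual impacts
    for i in range(n_features):
        for j in range(i + 1, n_features):
            pair_impact = base - eval_with_features(full - {i, j})
            interaction = pair_impact - (-Q[i, i]) - (-Q[j, j])
            Q[i, j] = Q[j, i] = -interaction
    return Q

Q = counterfactual_qubo(6)   # Q would then be handed to a quantum annealer / QUBO solver
```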
+
+
+
+
+ + ☆ HyQE: Ranking Contexts with Hypothetical Query Embeddings + + +
+ In retrieval-augmented systems, context ranking techniques are commonly +employed to reorder the retrieved contexts based on their relevance to a user +query. A standard approach is to measure this relevance through the similarity +between contexts and queries in the embedding space. However, such similarity +often fails to capture the relevance. Alternatively, large language models +(LLMs) have been used for ranking contexts. However, they can encounter +scalability issues when the number of candidate contexts grows and the context +window sizes of the LLMs remain constrained. Additionally, these approaches +require fine-tuning LLMs with domain-specific data. In this work, we introduce +a scalable ranking framework that combines embedding similarity and LLM +capabilities without requiring LLM fine-tuning. Our framework uses a +pre-trained LLM to hypothesize the user query based on the retrieved contexts +and ranks the context based on the similarity between the hypothesized queries +and the user query. Our framework is efficient at inference time and is +compatible with many other retrieval and ranking techniques. Experimental +results show that our method improves the ranking performance across multiple +benchmarks. The complete code and data are available at +https://github.com/zwc662/hyqe + +
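The ranking idea can be sketched in a few lines of Python: an LLM hypothesizes the query each retrieved context would answer, and the contexts are reordered by the similarity between those hypothetical queries and the real user query. `embed` and `hypothesize_query` below are placeholders, not the paper's models:

```python
import numpy as np

def embed(text):
    # placeholder for any sentence-embedding model returning a unit vector
    rng = np.random.default_rng(abs(hash(text)) % (2 ** 32))
    v = rng.normal(size=384)
    return v / np.linalg.norm(v)

def hypothesize_query(context):
    # placeholder: a pre-trained LLM would generate the question that
    # this context best answers
    return "hypothetical question for: " + context[:60]

def rank_contexts(user_query, contexts):
    """Rank contexts by similarity between the user query and the hypothetical
    queries generated from each context (no LLM fine-tuning involved)."""
    q = embed(user_query)
    scores = [float(q @ embed(hypothesize_query(c))) for c in contexts]
    order = np.argsort(scores)[::-1]
    return [contexts[i] for i in order], [scores[i] for i in order]

ranked, scores = rank_contexts(
    "how to renew a passport",
    ["Passport renewal requires form DS-82 and a recent photo.",
     "The history of travel documents dates back centuries."])
```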
+
+
+
+
+ + ♻ ☆ STaRK: Benchmarking LLM Retrieval on Textual and Relational Knowledge + Bases NeurIPS 2024 + + +
+ Answering real-world complex queries, such as complex product search, often
+requires accurate retrieval from semi-structured knowledge bases that involve
+a blend of unstructured (e.g., textual descriptions of products) and
+structured (e.g., entity relations of products) information. However, many
+previous works studied textual and relational retrieval tasks as separate
+topics. To address the gap, we develop STARK, a large-scale semi-structured
+retrieval benchmark on Textual and Relational Knowledge Bases. Our benchmark
+covers three domains: product search, academic paper search, and queries in
+precision medicine. We design a novel pipeline to synthesize realistic user
+queries that integrate diverse relational information and complex textual
+properties, together with their ground-truth answers (items). We conduct
+rigorous human evaluation to validate the quality of our synthesized queries.
+We further enhance the benchmark with high-quality human-generated queries to
+provide an authentic reference. STARK serves as a comprehensive testbed for
+evaluating the performance of retrieval systems driven by large language
+models (LLMs). Our experiments suggest that STARK presents significant
+challenges to current retrieval and LLM systems, highlighting the need for
+more capable semi-structured retrieval systems. The benchmark data and code
+are available at https://github.com/snap-stanford/STaRK.
+
+
+ comment: NeurIPS 2024 Track on Datasets and Benchmarks. 26 Pages, 6 Figures. + Website: https://stark.stanford.edu/ +
+
+
+
+
+ + ♻ ☆ Diffusion-based Contrastive Learning for Sequential Recommendation + + +
+ Contrastive learning has been effectively utilized to enhance the training of +sequential recommendation models by leveraging informative self-supervised +signals. Most existing approaches generate augmented views of the same user +sequence through random augmentation and subsequently maximize their agreement +in the representation space. However, these methods often neglect the +rationality of the augmented samples. Due to significant uncertainty, random +augmentation can disrupt the semantic information and interest evolution +patterns inherent in the original user sequences. Moreover, pulling +semantically inconsistent sequences closer in the representation space can +render the user sequence embeddings insensitive to variations in user +preferences, which contradicts the primary objective of sequential +recommendation. To address these limitations, we propose the Context-aware +Diffusion-based Contrastive Learning for Sequential Recommendation, named +CaDiRec. The core idea is to leverage context information to generate more +reasonable augmented views. Specifically, CaDiRec employs a context-aware +diffusion model to generate alternative items for the given positions within a +sequence. These generated items are aligned with their respective context +information and can effectively replace the corresponding original items, +thereby generating a positive view of the original sequence. By considering two +different augmentations of the same user sequence, we can construct a pair of +positive samples for contrastive learning. To ensure representation cohesion, +we train the entire framework in an end-to-end manner, with shared item +embeddings between the diffusion model and the recommendation model. Extensive +experiments on five benchmark datasets demonstrate the advantages of our +proposed method over existing baselines. + +
+
+
+
+
+ + ♻ ☆ Parenting: Optimizing Knowledge Selection of Retrieval-Augmented + Language Models with Parameter Decoupling and Tailored Tuning + + +
+ Retrieval-Augmented Generation (RAG) offers an effective solution to the +issues faced by Large Language Models (LLMs) in hallucination generation and +knowledge obsolescence by incorporating externally retrieved knowledge. +However, existing methods lack effective control mechanisms for integrating +internal and external knowledge. Inspired by human cognitive processes, we +propose Parenting, a novel framework that decouples, identifies, and +purposefully optimizes parameter subspaces related to adherence and robustness. +Specifically, Parenting utilizes a key parameter mining method that combines +forward and backward propagation signals to localize subspaces representing +different capabilities. Then, Parenting employs a type-tailored tuning +strategy, applying specific and appropriate optimizations to different +subspaces, aiming to achieve a balanced enhancement of both adherence and +robustness. Extensive experiments on various datasets and models validate the +effectiveness and generalizability of our method. + +
+
+
+
+
+ + ♻ ☆ UniRAG: Universal Retrieval Augmentation for Multi-Modal Large Language + Models + + +
+ Recently, Multi-Modal (MM) Large Language Models (LLMs) have unlocked many +complex use-cases that require MM understanding (e.g., image captioning or +visual question answering) and MM generation (e.g., text-guided image +generation or editing) capabilities. To further improve the output fidelity of +MM-LLMs we introduce UniRAG, a plug-and-play technique that adds relevant +retrieved information to prompts as few-shot examples during inference. Unlike +the common belief that Retrieval Augmentation (RA) mainly improves generation +or understanding of uncommon entities, our evaluation results on the MSCOCO +dataset with common entities show that both proprietary models like GPT-4o and +Gemini-Pro and smaller open-source models like LLaVA, LaVIT, and Emu2 +significantly enhance their generation quality when their input prompts are +augmented with relevant information retrieved by MM retrievers like UniIR +models. + +
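The plug-and-play retrieval augmentation amounts to prepending retrieved image-caption pairs as few-shot examples before the actual request; the sketch below uses placeholder retriever output and prompt wording, not UniRAG's exact format:

```python
def retrieve_examples(query, k=2):
    # placeholder for an MM retriever (e.g. a UniIR-style model) returning
    # (image_path, caption) pairs relevant to the query
    return [("img_001.jpg", "A brown dog catching a frisbee."),
            ("img_002.jpg", "A child throwing a frisbee in a park.")][:k]

def build_augmented_prompt(query):
    """Prepend retrieved image-caption pairs as few-shot examples, then append
    the actual request -- the retrieval-augmentation step at inference time."""
    lines = ["You are given example image-caption pairs, then a new request."]
    for path, caption in retrieve_examples(query):
        lines.append(f"Example image: {path}\nExample caption: {caption}")
    lines.append(f"Request: {query}")
    return "\n\n".join(lines)

print(build_augmented_prompt("Caption an image of a dog playing frisbee."))
```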
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Multi-modal clothing recommendation model based on large model and VAE + enhancement + + +
+ Accurately recommending products has long been a subject requiring in-depth +research. This study proposes a multimodal paradigm for clothing +recommendations. Specifically, it designs a multimodal analysis method that +integrates clothing description texts and images, utilizing a pre-trained large +language model to deeply explore the hidden meanings of users and products. +Additionally, a variational encoder is employed to learn the relationship +between user information and products to address the cold start problem in +recommendation systems. This study also validates the significant performance +advantages of this method over various recommendation system methods through +extensive ablation experiments, providing crucial practical guidance for the +comprehensive optimization of recommendation systems. + +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ EVA: An Embodied World Model for Future Video Anticipation + + +
+ World models integrate raw data from various modalities, such as images and
+language, to simulate comprehensive interactions in the world, and thereby
+play crucial roles in fields like mixed reality and robotics. Yet applying a
+world model to accurate video prediction is quite challenging due to the
+complex and dynamic intentions of the various scenes encountered in practice.
+In this paper, inspired by the human rethinking process, we decompose complex
+video prediction into four meta-tasks that enable the world model to handle
+the problem in a more fine-grained manner. Alongside these tasks, we introduce
+a new benchmark named the Embodied Video Anticipation Benchmark (EVA-Bench) to
+provide a well-rounded evaluation. EVA-Bench focuses on evaluating the video
+prediction ability of human and robot actions, presenting significant
+challenges for both the language model and the generation model. Targeting
+embodied video prediction, we propose the Embodied Video Anticipator (EVA), a
+unified framework for video understanding and generation. EVA integrates a
+video generation model with a visual language model, effectively combining
+reasoning capabilities with high-quality generation. Moreover, to enhance the
+generalization of our framework, we tailor-design a multi-stage pretraining
+paradigm that adaptively ensembles LoRA to produce high-fidelity results.
+Extensive experiments on EVA-Bench highlight the potential of EVA to
+significantly improve performance in embodied scenes, paving the way for
+large-scale pre-trained models in real-world prediction tasks.
+
+
+
+
+
+ + ☆ Scene Graph Generation with Role-Playing Large Language Models NeurIPS 2024 + + +
+ Current approaches for open-vocabulary scene graph generation (OVSGG) use
+vision-language models such as CLIP and follow a standard zero-shot pipeline --
+computing similarity between the query image and the text embeddings for each
+category (i.e., text classifiers). In this work, we argue that the text
+classifiers adopted by existing OVSGG methods, i.e., category-/part-level
+prompts, are scene-agnostic as they remain unchanged across contexts. Such
+fixed text classifiers not only struggle to model visual relations with high
+variance, but also fall short in adapting to distinct contexts. To address
+these intrinsic shortcomings, we devise SDSGG, a scene-specific
+description-based OVSGG framework in which the weights of the text classifiers
+are adaptively adjusted according to the visual content. In particular, to
+generate comprehensive and diverse descriptions oriented to the scene, an LLM
+is asked to play different roles (e.g., biologist and engineer) to analyze and
+discuss the descriptive features of a given scene from different views. Unlike
+previous efforts that simply treat the generated descriptions as mutually
+equivalent text classifiers, SDSGG is equipped with an advanced
+renormalization mechanism that adjusts the influence of each text classifier
+based on its relevance to the presented scene (this is what the term
+"specific" means). Furthermore, to capture the complicated interplay between
+subjects and objects, we propose a new lightweight module called the mutual
+visual adapter. It refines CLIP's ability to recognize relations by learning
+an interaction-aware semantic space. Extensive experiments on prevalent
+benchmarks show that SDSGG outperforms top-leading methods by a clear margin.
+
+
+ comment: NeurIPS 2024. Code: https://github.com/guikunchen/SDSGG +
+
+
+
+
+ + ☆ ContextDet: Temporal Action Detection with Adaptive Context Aggregation + + +
+ Temporal action detection (TAD), which locates and recognizes action
+segments, remains a challenging task in video understanding due to variable
+segment lengths and ambiguous boundaries. Existing methods treat the
+neighboring contexts of an action segment indiscriminately, leading to
+imprecise boundary predictions. We introduce the single-stage ContextDet
+framework, which makes use of large-kernel convolutions in TAD for the first
+time. Our model features a pyramid adaptive context aggregation (ACA)
+architecture, capturing long context and improving action discriminability.
+Each ACA level consists of two novel modules. The context attention module
+(CAM) identifies salient contextual information, encourages context diversity,
+and preserves context integrity through a context gating block (CGB). The long
+context module (LCM) makes use of a mixture of large- and small-kernel
+convolutions to adaptively gather long-range context and fine-grained local
+features. Additionally, by varying the length of these large kernels across
+the ACA pyramid, our model provides lightweight yet effective context
+aggregation and action discrimination. We conducted extensive experiments and
+compared our model with a number of advanced TAD methods on six challenging
+TAD benchmarks: MultiThumos, Charades, FineAction, EPIC-Kitchens 100,
+Thumos14, and HACS, demonstrating superior accuracy with reduced inference
+time.
+
+
+
+
+
+ + ☆ GSSF: Generalized Structural Sparse Function for Deep Cross-modal Metric + Learning + + +
+ Cross-modal metric learning is a prominent research topic that bridges the +semantic heterogeneity between vision and language. Existing methods frequently +utilize simple cosine or complex distance metrics to transform the pairwise +features into a similarity score, which suffers from an inadequate or +inefficient capability for distance measurements. Consequently, we propose a +Generalized Structural Sparse Function to dynamically capture thorough and +powerful relationships across modalities for pair-wise similarity learning +while remaining concise but efficient. Specifically, the distance metric +delicately encapsulates two formats of diagonal and block-diagonal terms, +automatically distinguishing and highlighting the cross-channel relevancy and +dependency inside a structured and organized topology. Hence, it thereby +empowers itself to adapt to the optimal matching patterns between the paired +features and reaches a sweet spot between model complexity and capability. +Extensive experiments on cross-modal and two extra uni-modal retrieval tasks +(image-text retrieval, person re-identification, fine-grained image retrieval) +have validated its superiority and flexibility over various popular retrieval +frameworks. More importantly, we further discover that it can be seamlessly +incorporated into multiple application scenarios, and demonstrates promising +prospects from Attention Mechanism to Knowledge Distillation in a plug-and-play +manner. Our code is publicly available at: https://github.com/Paranioar/GSSF. + +
+
+ comment: 12 pages, 9 figures, Accepted by TIP2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Crafting Tomorrow: The Influence of Design Choices on Fresh Content in + Social Media Recommendation + + +
+ The rise in popularity of social media platforms has resulted in millions of
+new content pieces being created every day. This surge in content creation
+underscores the need to pay attention to our design choices, as they can
+greatly impact how long content remains relevant. In today's landscape, where
+regularly recommending new content is crucial, particularly in the absence of
+detailed information, a variety of factors such as UI features, algorithms,
+and system settings shape the journey of content across the platform. While
+previous research has focused on how new content affects users' experiences,
+this study takes a different approach by analyzing these decisions from the
+perspective of the content itself.
+ Through a series of carefully crafted experiments, we explore how seemingly
+small decisions can influence the longevity of content, measured by metrics
+such as Content Progression (CVP) and Content Survival (CSR). We also
+emphasize the importance of recognizing the stages that content goes through,
+underscoring the need to tailor strategies for each stage, as a
+one-size-fits-all approach may not be effective. Additionally, we argue for a
+departure from traditional experimental setups in the study of content
+lifecycles to avoid potential misunderstandings, and we propose advanced
+techniques to achieve greater precision and accuracy in the evaluation
+process.
+
+
+
+
+
+ + ☆ Mining Asymmetric Intertextuality + + +
+ This paper introduces a new task in Natural Language Processing (NLP) and
+Digital Humanities (DH): Mining Asymmetric Intertextuality. Asymmetric
+intertextuality refers to one-sided relationships between texts, where one
+text cites, quotes, or borrows from another without reciprocation. These
+relationships are common in literature and historical texts, where a later
+work references a classical or older text that remains static.
+ We propose a scalable and adaptive approach for mining asymmetric
+intertextuality, leveraging a split-normalize-merge paradigm. In this
+approach, documents are split into smaller chunks, normalized into structured
+data using LLM-assisted metadata extraction, and merged during querying to
+detect both explicit and implicit intertextual relationships. Our system
+handles intertextuality at various levels, from direct quotation to
+paraphrasing and cross-document influence, using a combination of metadata
+filtering, vector similarity search, and LLM-based verification.
+ This method is particularly well suited to dynamically growing corpora, such
+as expanding literary archives or historical databases. By enabling the
+continuous integration of new documents, the system can scale efficiently,
+making it highly valuable for digital humanities practitioners in literary
+studies, historical research, and related fields.
+
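A hedged Python sketch of the split-normalize-merge pipeline described above; every helper here (`split`, `normalize`, `vector_search`, `llm_verify`) is an illustrative placeholder for the chunking, LLM metadata extraction, retrieval, and verification stages, not the paper's code:

```python
def split(document, size=400):
    # chunk the incoming document into fixed-size pieces
    return [document[i:i + size] for i in range(0, len(document), size)]

def normalize(chunk):
    # placeholder: an LLM would extract structured metadata here
    return {"text": chunk, "entities": [], "quotes": [], "era": "unknown"}

def vector_search(query_text, records, k=5):
    # placeholder: embed `query_text` and return the k nearest corpus records
    return records[:k]

def llm_verify(source_chunk, candidate_chunk):
    # placeholder: an LLM judges whether `source_chunk` borrows from `candidate_chunk`
    return {"is_intertextual": True, "type": "paraphrase"}

def mine_asymmetric_links(new_document, corpus_records):
    """Split-normalize-merge: new chunks are normalized, matched against the
    existing corpus, and verified, so the corpus can grow incrementally."""
    links = []
    for chunk in split(new_document):
        record = normalize(chunk)
        for candidate in vector_search(record["text"], corpus_records):
            verdict = llm_verify(record["text"], candidate["text"])
            if verdict["is_intertextual"]:
                links.append((record, candidate, verdict["type"]))
    return links

corpus = [{"text": "For man is a giddy thing, and this is my conclusion."}]
links = mine_asymmetric_links("He later quips that man is a giddy thing indeed.", corpus)
```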
+
+
+
+
+ + ☆ Incorporating Group Prior into Variational Inference for Tail-User + Behavior Modeling in CTR Prediction + + +
+ User behavior modeling -- which aims to extract user interests from
+behavioral data -- has shown great power in click-through rate (CTR)
+prediction, a key component of recommendation systems. Recently,
+attention-based algorithms have become a promising direction, as attention
+mechanisms emphasize the relevant interactions within rich behaviors. However,
+these methods struggle to capture the preferences of tail users with sparse
+interaction histories. To address this problem, we propose a novel variational
+inference approach, namely Group Prior Sampler Variational Inference (GPSVI),
+which introduces group preferences as priors to refine latent user interests
+for tail users. In GPSVI, the extent of the adjustment depends on the
+estimated uncertainty of individual preference modeling. In addition, we
+further enhance the expressive power of variational inference with a
+volume-preserving flow. An appealing property of GPSVI is its ability to
+revert to traditional attention for head users with rich behavioral data while
+consistently enhancing performance for long-tail users with sparse behaviors.
+Rigorous analysis and extensive experiments demonstrate that GPSVI
+consistently improves the performance of tail users. Moreover, online A/B
+testing on a large-scale real-world recommender system further confirms the
+effectiveness of our proposed approach.
+
+
+
+
+
+ + ☆ A Recommendation Model Utilizing Separation Embedding and Self-Attention + for Feature Mining + + +
+ With the explosive growth of Internet data, users face the problem of
+information overload, which makes it challenging to efficiently obtain the
+resources they need. Recommendation systems have emerged in this context. By
+filtering massive amounts of information, they provide users with content that
+meets their needs, playing a key role in scenarios such as advertising
+recommendation and product recommendation. However, traditional click-through
+rate prediction and top-K recommendation mechanisms are gradually unable to
+meet the recommendation needs of modern life scenarios due to high
+computational complexity, large memory consumption, long feature selection
+times, and insufficient feature interaction. This paper proposes a
+recommendation system model based on a separation embedding cross-network. The
+model uses an embedding neural network layer to transform sparse feature
+vectors into dense embedding vectors and can independently perform feature
+cross operations on different dimensions, thereby improving the accuracy and
+depth of feature mining. Experimental results show that the model exhibits
+stronger adaptability and higher prediction accuracy on complex datasets,
+effectively addressing the problems of existing models.
+
+
+
+
+
+ + ☆ Transit Pulse: Utilizing Social Media as a Source for Customer Feedback + and Information Extraction with Large Language Model + + +
+ Users of transit systems flood social networks daily with messages that
+contain valuable insights crucial for improving service quality. These posts
+help transit agencies quickly identify emerging issues, and parsing their
+topics and sentiments is key to gaining comprehensive insights that foster
+service excellence. However, the volume of messages makes manual analysis
+impractical, and standard NLP techniques such as Term Frequency-Inverse
+Document Frequency (TF-IDF) fall short in nuanced interpretation. Traditional
+sentiment analysis separates topics and sentiments before integrating them,
+often missing the interaction between the two. This incremental approach
+complicates classification and reduces analytical productivity. To address
+these challenges, we propose a novel approach to extracting and analyzing
+transit-related information, including sentiment and sarcasm detection,
+identification of unusual system problems, and location data from social
+media. Our method employs Large Language Models (LLMs), specifically Llama 3,
+for a streamlined analysis free from pre-established topic labels. To enhance
+the model's domain-specific knowledge, we utilize Retrieval-Augmented
+Generation (RAG), integrating external knowledge sources into the information
+extraction pipeline. We validated our method through extensive experiments
+comparing its performance with traditional NLP approaches on user tweet data
+from a real-world transit system. Our results demonstrate the potential of
+LLMs to transform social media data analysis in the public transit domain,
+providing actionable insights and enhancing transit agencies' responsiveness
+by extracting a broader range of information.
+
+
+ comment: 17 pages, 21 figures +
+
+
+
+
+ + ☆ Visual Navigation of Digital Libraries: Retrieval and Classification of + Images in the National Library of Norway's Digitised Book Collection + + +
+ Digital tools for text analysis have long been essential for the +searchability and accessibility of digitised library collections. Recent +computer vision advances have introduced similar capabilities for visual +materials, with deep learning-based embeddings showing promise for analysing +visual heritage. Given that many books feature visuals in addition to text, +taking advantage of these breakthroughs is critical to making library +collections open and accessible. In this work, we present a proof-of-concept +image search application for exploring images in the National Library of +Norway's pre-1900 books, comparing Vision Transformer (ViT), Contrastive +Language-Image Pre-training (CLIP), and Sigmoid loss for Language-Image +Pre-training (SigLIP) embeddings for image retrieval and classification. Our +results show that the application performs well for exact image retrieval, with +SigLIP embeddings slightly outperforming CLIP and ViT in both retrieval and +classification tasks. Additionally, SigLIP-based image classification can aid +in cleaning image datasets from a digitisation pipeline. + +
+
+ comment: 13 pages, 2 figures, 4 tables, Accepted to the 2024 Computational + Humanities Research Conference (CHR) +
+
+
+
+
+ + ♻ ☆ Enhancing Short-Text Topic Modeling with LLM-Driven Context Expansion + and Prefix-Tuned VAEs EMNLP + + +
+ Topic modeling is a powerful technique for uncovering hidden themes within a +collection of documents. However, the effectiveness of traditional topic models +often relies on sufficient word co-occurrence, which is lacking in short texts. +Therefore, existing approaches, whether probabilistic or neural, frequently +struggle to extract meaningful patterns from such data, resulting in incoherent +topics. To address this challenge, we propose a novel approach that leverages +large language models (LLMs) to extend short texts into more detailed sequences +before applying topic modeling. To further improve the efficiency and solve the +problem of semantic inconsistency from LLM-generated texts, we propose to use +prefix tuning to train a smaller language model coupled with a variational +autoencoder for short-text topic modeling. Our method significantly improves +short-text topic modeling performance, as demonstrated by extensive experiments +on real-world datasets with extreme data sparsity, outperforming current +state-of-the-art topic models. + +
+
+ comment: EMNLP Findings 2024. arXiv admin note: substantial text overlap with + arXiv:2310.15420 +
+
+
+
+
+ + ♻ ☆ A Survey on Intent-aware Recommender Systems + + +
+ Many modern online services feature personalized recommendations. A central +challenge when providing such recommendations is that the reason why an +individual user accesses the service may change from visit to visit or even +during an ongoing usage session. To be effective, a recommender system should +therefore aim to take the users' probable intent of using the service at a +certain point in time into account. In recent years, researchers have thus +started to address this challenge by incorporating intent-awareness into +recommender systems. Correspondingly, a number of technical approaches were put +forward, including diversification techniques, intent prediction models or +latent intent modeling approaches. In this paper, we survey and categorize +existing approaches to building the next generation of Intent-Aware Recommender +Systems (IARS). Based on an analysis of current evaluation practices, we +outline open gaps and possible future directions in this area, which in +particular include the consideration of additional interaction signals and +contextual information to further improve the effectiveness of such systems. + +
+
+
+
+
+ + ♻ ☆ Dreaming User Multimodal Representation Guided by The Platonic + Representation Hypothesis for Micro-Video Recommendation + + +
+ The proliferation of online micro-video platforms has underscored the
+necessity for advanced recommender systems to mitigate information overload
+and deliver tailored content. Despite advancements, accurately and promptly
+capturing dynamic user interests remains a formidable challenge. Inspired by
+the Platonic Representation Hypothesis, which posits that different data
+modalities converge towards a shared statistical model of reality, we
+introduce DreamUMM (Dreaming User Multi-Modal Representation), a novel
+approach leveraging user historical behaviors to create real-time user
+representations in a multimodal space. DreamUMM employs a closed-form solution
+correlating user video preferences with multimodal similarity, hypothesizing
+that user interests can be effectively represented in a unified multimodal
+space. Additionally, we propose Candidate-DreamUMM for scenarios lacking
+recent user behavior data, inferring interests from candidate videos alone.
+Extensive online A/B tests demonstrate significant improvements in user
+engagement metrics, including active days and play count. The successful
+deployment of DreamUMM on two micro-video platforms with hundreds of millions
+of daily active users illustrates its practical efficacy and scalability in
+personalized micro-video content delivery. Our work contributes to the ongoing
+exploration of representational convergence by providing empirical evidence
+that user interest representations can reside in a multimodal space.
+
+
+ comment: 4 Figure; 2 Table +
+
+
+
+
+ + ♻ ☆ PromptReps: Prompting Large Language Models to Generate Dense and Sparse + Representations for Zero-Shot Document Retrieval EMNLP2024 + + +
+ Utilizing large language models (LLMs) for zero-shot document ranking is done +in one of two ways: (1) prompt-based re-ranking methods, which require no +further training but are only feasible for re-ranking a handful of candidate +documents due to computational costs; and (2) unsupervised contrastive trained +dense retrieval methods, which can retrieve relevant documents from the entire +corpus but require a large amount of paired text data for contrastive training. +In this paper, we propose PromptReps, which combines the advantages of both +categories: no need for training and the ability to retrieve from the whole +corpus. Our method only requires prompts to guide an LLM to generate query and +document representations for effective document retrieval. Specifically, we +prompt the LLMs to represent a given text using a single word, and then use the +last token's hidden states and the corresponding logits associated with the +prediction of the next token to construct a hybrid document retrieval system. +The retrieval system harnesses both dense text embedding and sparse +bag-of-words representations given by the LLM. Our experimental evaluation on +the MSMARCO, TREC deep learning and BEIR zero-shot document retrieval datasets +illustrates that this simple prompt-based LLM retrieval method can achieve a +similar or higher retrieval effectiveness than state-of-the-art LLM embedding +methods that are trained with large amounts of unsupervised data, especially +when using a larger LLM. + +
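A rough sketch of the single-prompt representation idea using a small stand-in model; the prompt wording is an assumption, and the dense/sparse construction below only approximates what the paper describes:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"   # small stand-in; the paper uses larger instruction-tuned LLMs
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

def prompt_reps(text):
    """One forward pass yields a dense vector (last token's hidden state) and a
    sparse bag-of-words vector (next-token logits over the vocabulary)."""
    prompt = f'Passage: "{text}"\nRepresent the passage above with one word:'
    inputs = tok(prompt, return_tensors="pt")
    with torch.no_grad():
        out = model(**inputs, output_hidden_states=True)
    dense = out.hidden_states[-1][0, -1]        # (hidden_size,)
    sparse = torch.relu(out.logits[0, -1])      # (vocab_size,), keep positive weights
    return dense, sparse

d_q, s_q = prompt_reps("how do solar panels generate electricity")
d_p, s_p = prompt_reps("Photovoltaic cells convert sunlight into electric current.")
# illustrative hybrid score combining the dense and sparse channels
score = torch.dot(d_q, d_p) + torch.dot(s_q, s_p)
```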
+
+ comment: EMNLP2024 main +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ DiffuseST: Unleashing the Capability of the Diffusion Model for Style + Transfer + + +
+ Style transfer aims to fuse the artistic representation of a style image with
+the structural information of a content image. Existing methods train specific
+networks or utilize pre-trained models to learn content and style features.
+However, they rely solely on textual or spatial representations, which are
+inadequate for achieving a balance between content and style. In this work, we
+propose a novel, training-free approach for style transfer that combines
+textual embeddings with spatial features and separates the injection of
+content and style. Specifically, we adopt the BLIP-2 encoder to extract the
+textual representation of the style image. We utilize the DDIM inversion
+technique to extract intermediate embeddings in the content and style branches
+as spatial features. Finally, we harness the step-by-step property of
+diffusion models by separating the injection of content and style in the
+target branch, which improves the balance between content preservation and
+style fusion. Extensive experiments have demonstrated the effectiveness and
+robustness of our proposed DiffuseST in achieving balanced and controllable
+style transfer results, as well as its potential to extend to other tasks.
+
+
+ comment: Accepted to ACMMM Asia 2024. Code is available at + https://github.com/I2-Multimedia-Lab/DiffuseST +
+
+
+
+
+ + ☆ Testing and validation of innovative eXtended Reality technologies for + astronaut training in a partial-gravity parabolic flight campaign + + +
+ The use of eXtended Reality (XR) technologies in the space domain has +increased significantly over the past few years as it can offer many advantages +when simulating complex and challenging environments. Space agencies are +currently using these disruptive tools to train astronauts for Extravehicular +Activities (EVAs), to test equipment and procedures, and to assess spacecraft +and hardware designs. With the Moon being the current focus of the next +generation of space exploration missions, simulating its harsh environment is +one of the key areas where XR can be applied, particularly for astronaut +training. Peculiar lunar lighting conditions in combination with reduced +gravity levels will highly impact human locomotion especially for movements +such as walking, jumping, and running. In order to execute operations on the +lunar surface and to safely live on the Moon for an extended period of time, +innovative training methodologies and tools such as XR are becoming paramount +to perform pre-mission validation and certification. This research work +presents the findings of the experiments aimed at exploring the integration of +XR technology and parabolic flight activities for astronaut training. In +addition, the study aims to consolidate these findings into a set of guidelines +that can assist future researchers who wish to incorporate XR technology into +lunar training and preparation activities, including the use of such XR tools +during long duration missions. + +
+
+ comment: 75th International Astronautical Congress (IAC), Milan, Italy, 14-18 + October 2024 +
+
+
+
+
+ + ♻ ☆ Towards Multimodal Emotional Support Conversation Systems + + +
+ The integration of conversational artificial intelligence (AI) into mental +health care promises a new horizon for therapist-client interactions, aiming to +closely emulate the depth and nuance of human conversations. Despite the +potential, the current landscape of conversational AI is markedly limited by +its reliance on single-modal data, constraining the systems' ability to +empathize and provide effective emotional support. This limitation stems from a +paucity of resources that encapsulate the multimodal nature of human +communication essential for therapeutic counseling. To address this gap, we +introduce the Multimodal Emotional Support Conversation (MESC) dataset, a +first-of-its-kind resource enriched with comprehensive annotations across text, +audio, and video modalities. This dataset captures the intricate interplay of +user emotions, system strategies, system emotion, and system responses, setting +a new precedent in the field. Leveraging the MESC dataset, we propose a general +Sequential Multimodal Emotional Support framework (SMES) grounded in +Therapeutic Skills Theory. Tailored for multimodal dialogue systems, the SMES +framework incorporates an LLM-based reasoning model that sequentially generates +user emotion recognition, system strategy prediction, system emotion +prediction, and response generation. Our rigorous evaluations demonstrate that +this framework significantly enhances the capability of AI systems to mimic +therapist behaviors with heightened empathy and strategic responsiveness. By +integrating multimodal data in this innovative manner, we bridge the critical +gap between emotion recognition and emotional support, marking a significant +advancement in conversational AI for mental health support. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 19 + +
+
+
+ + ☆ The S2 Hierarchical Discrete Global Grid as a Nexus for Data + Representation, Integration, and Querying Across Geospatial Knowledge Graphs + + +
+ Geospatial Knowledge Graphs (GeoKGs) have become integral to the growing +field of Geospatial Artificial Intelligence. Initiatives like the U.S. National +Science Foundation's Open Knowledge Network program aim to create an ecosystem +of nation-scale, cross-disciplinary GeoKGs that provide AI-ready geospatial +data aligned with FAIR principles. However, building this infrastructure +presents key challenges, including 1) managing large volumes of data, 2) the +computational complexity of discovering topological relations via SPARQL, and +3) conflating multi-scale raster and vector data. Discrete Global Grid Systems +(DGGS) help tackle these issues by offering efficient data integration and +representation strategies. The KnowWhereGraph utilizes Google's S2 Geometry -- +a DGGS framework -- to enable efficient multi-source data processing, +qualitative spatial querying, and cross-graph integration. This paper outlines +the implementation of S2 within KnowWhereGraph, emphasizing its role in +topologically enriching and semantically compressing data. Ultimately, this +work demonstrates the potential of DGGS frameworks, particularly S2, for +building scalable GeoKGs. + +
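For intuition, the sketch below shows how point observations can be keyed to S2 cells using the `s2sphere` Python port of the S2 library; the choice of library and of level 13 is an illustrative assumption, not necessarily what KnowWhereGraph deploys:

```python
import s2sphere

def s2_token(lat, lng, level=13):
    """Map a WGS84 point to the token of its containing S2 cell at `level`;
    shared tokens give a cheap, precomputed notion of spatial co-location."""
    cell = s2sphere.CellId.from_lat_lng(s2sphere.LatLng.from_degrees(lat, lng))
    return cell.parent(level).to_token()

# two nearby observations typically share a level-13 cell, so a "co-located"
# relation can be materialized without an expensive topological SPARQL join
print(s2_token(40.7128, -74.0060))
print(s2_token(40.7130, -74.0058))
```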
+
+
+
+
+ + ☆ SIMformer: Single-Layer Vanilla Transformer Can Learn Free-Space + Trajectory Similarity + + +
+ Free-space trajectory similarity measures, e.g., DTW, Hausdorff, and Frechet,
+often incur quadratic time complexity, so learning-based methods have been
+proposed to accelerate the computation. The core idea is to train an encoder
+that transforms trajectories into representation vectors and then to compute
+vector similarity to approximate the ground truth. However, existing methods
+face dual challenges of effectiveness and efficiency: 1) they all use
+Euclidean distance to compute representation similarity, which leads to a
+severe curse-of-dimensionality issue -- reducing the distinguishability among
+representations and significantly affecting the accuracy of subsequent
+similarity search tasks; 2) most of them are trained in a triplet manner and
+often require additional information, which degrades efficiency; 3) previous
+studies, while emphasizing scalability in terms of efficiency, overlooked the
+deterioration of effectiveness as the dataset size grows. To cope with these
+issues, we propose a simple yet accurate, fast, and scalable model that uses
+only a single-layer vanilla transformer encoder as the feature extractor and
+employs tailored representation similarity functions to approximate various
+ground-truth similarity measures. Extensive experiments demonstrate that our
+model significantly mitigates the curse-of-dimensionality issue and
+outperforms state-of-the-art methods in effectiveness, efficiency, and
+scalability.
+
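A minimal PyTorch sketch of the single-layer vanilla transformer encoder over trajectory points; the dimensions, mean pooling, and the use of cosine similarity as a stand-in for the tailored similarity function are assumptions for illustration:

```python
import torch
import torch.nn as nn

class SingleLayerTrajEncoder(nn.Module):
    """One vanilla TransformerEncoderLayer over (x, y) trajectory points,
    mean-pooled into a fixed-size representation vector."""
    def __init__(self, d_model=128, nhead=8):
        super().__init__()
        self.point_proj = nn.Linear(2, d_model)
        self.encoder = nn.TransformerEncoderLayer(d_model, nhead, batch_first=True)

    def forward(self, traj):                  # traj: (B, L, 2)
        h = self.encoder(self.point_proj(traj))
        return h.mean(dim=1)                  # (B, d_model)

enc = SingleLayerTrajEncoder()
a, b = torch.randn(4, 50, 2), torch.randn(4, 60, 2)
ra, rb = enc(a), enc(b)
# a similarity on representations approximates the ground-truth trajectory measure
sim = torch.cosine_similarity(ra, rb)         # stand-in for the tailored function
```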
+
+
+
+
+ + ☆ Enhancing AI Accessibility in Veterinary Medicine: Linking Classifiers + and Electronic Health Records + + +
+ In the rapidly evolving landscape of veterinary healthcare, integrating +machine learning (ML) clinical decision-making tools with electronic health +records (EHRs) promises to improve diagnostic accuracy and patient care. +However, the seamless integration of ML classifiers into existing EHRs in +veterinary medicine is frequently hindered by the rigidity of EHR systems or +the limited availability of IT resources. To address this shortcoming, we +present Anna, a freely-available software solution that provides ML classifier +results for EHR laboratory data in real-time. + +
+
+
+
+
+ + ☆ DiSCo Meets LLMs: A Unified Approach for Sparse Retrieval and Contextual + Distillation in Conversational Search + + +
+ Conversational Search (CS) is the task of retrieving relevant documents from +a corpus within a conversational context, combining retrieval with +conversational context modeling. With the explosion of Large Language Models +(LLMs), the CS field has seen major improvements with LLMs rewriting user +queries, accounting for conversational context. However, engaging LLMs at +inference time harms efficiency. Current methods address this by distilling +embeddings from human-rewritten queries to learn the context modeling task. +Yet, these approaches predominantly focus on context modeling, and only treat +the contrastive component of the retrieval task within a +distillation-independent loss term. To address these limitations, we propose a +new distillation method, as a relaxation of the previous objective, unifying +retrieval and context modeling. We relax the existing training objectives by +distilling similarity scores between conversations and documents, rather than +relying solely on representation learning. Our proposed distillation objective +allows for more freedom in the representation space and leverages the +contrastive nature of document relevance. Through experiments on Learned Sparse +Retrieval (LSR) across 5 CS datasets, our approach demonstrates substantial +improvements in both in-domain and out-of-domain retrieval performance, +outperforming state-of-the-art with gains of up to 6 points in recall for +out-of-domain datasets. Additionally, through the relaxation of the objective, +we propose a multi-teacher distillation, using multiple LLMs as teachers, +yielding additional gains, and outperforming the teachers themselves in +in-domain experiments. Finally, analysis of the sparsity of the models reveals +that our distillation allows for better control over the sparsity of the +trained models. + +
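To illustrate the relaxation from embedding distillation to similarity-score distillation, here is a minimal PyTorch sketch; the dot-product scoring, MSE objective, and uniform teacher averaging are assumptions for illustration, not the paper's exact training loss:

```python
import torch
import torch.nn.functional as F

def score_distillation_loss(student_conv_emb, doc_embs, teacher_scores):
    """Distill conversation-document similarity scores instead of matching
    embeddings directly, leaving the representation space unconstrained."""
    student_scores = student_conv_emb @ doc_embs.t()        # (B, N) dot-product scores
    return F.mse_loss(student_scores, teacher_scores)

# teacher scores could come from one or several LLM teachers (multi-teacher setting)
B, N, d = 8, 32, 256
student_conv = torch.randn(B, d, requires_grad=True)
docs = torch.randn(N, d)
teachers = [torch.randn(B, N), torch.randn(B, N)]
loss = sum(score_distillation_loss(student_conv, docs, t) for t in teachers) / len(teachers)
loss.backward()
```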
+
+
+
+
+ + ☆ RAG-ConfusionQA: A Benchmark for Evaluating LLMs on Confusing Questions + + +
+ Conversational AI agents use Retrieval Augmented Generation (RAG) to provide
+verifiable document-grounded responses to user inquiries. However, many
+natural questions do not have good answers: about 25% contain false
+assumptions (Yu et al., 2023, CREPE), and over 50% are ambiguous (Min et al.,
+2020, AmbigQA). RAG agents need high-quality data to improve their responses
+to confusing questions. This paper presents a novel synthetic data generation
+method to efficiently create a diverse set of context-grounded confusing
+questions from a given document corpus. We conduct an empirical comparative
+evaluation of several large language models as RAG agents to measure the
+accuracy of confusion detection and appropriate response generation. We
+contribute a benchmark dataset to the public domain.
+
+
+ comment: under review +
+
+
+
+
+ + ☆ SPFresh: Incremental In-Place Update for Billion-Scale Vector Search SOSP 23 + + +
+ Approximate Nearest Neighbor Search (ANNS) is now widely used in various
+applications, ranging from information retrieval, question answering, and
+recommendation, to search for similar high-dimensional vectors. As the amount
+of vector data grows continuously, it becomes important to support updates to
+the vector index, the enabling technique that allows for efficient and
+accurate ANNS over vectors. Because of the curse of high dimensionality, it is
+often costly to identify the right neighbors of a single new vector, a
+necessary step for index updates. To amortize update costs, existing systems
+maintain a secondary index to accumulate updates, which are periodically
+merged into the main index by globally rebuilding the entire index. However,
+this approach causes large fluctuations in search latency and accuracy, not to
+mention that rebuilds require substantial resources and are extremely
+time-consuming. We introduce SPFresh, a system that supports in-place vector
+updates. At the heart of SPFresh is LIRE, a lightweight incremental
+rebalancing protocol that splits vector partitions and reassigns vectors in
+nearby partitions to adapt to data distribution shift. LIRE achieves
+low-overhead vector updates by reassigning only the vectors at the boundaries
+between partitions, of which there are few in a high-quality vector index.
+With LIRE, SPFresh provides query latency and accuracy superior to solutions
+based on global rebuilds, while needing only 1% of the DRAM and less than 10%
+of the cores at peak compared to the state-of-the-art, on a billion-scale
+vector index with a 1% daily vector update rate.
+
+
+ comment: SOSP 23 +
+
+
+
+
+ + ☆ ChartifyText: Automated Chart Generation from Data-Involved Texts via + LLM + + +
+ Text documents with numerical values involved are widely used in various +applications such as scientific research, economy, public health and +journalism. However, it is difficult for readers to quickly interpret such +data-involved texts and gain deep insights. To fill this research gap, this +work aims to automatically generate charts to accurately convey the underlying +data and ideas to readers, which is essentially a challenging task. The +challenges originate from text ambiguities, intrinsic sparsity and uncertainty +of data in text documents, and subjective sentiment differences. Specifically, +we propose ChartifyText, a novel fully-automated approach that leverages Large +Language Models (LLMs) to convert complex data-involved texts to expressive +charts. It consists of two major modules: tabular data inference and expressive +chart generation. The tabular data inference module employs systematic prompt +engineering to guide the LLM (e.g., GPT-4) to infer table data, where data +ranges, uncertainties, missing data values and corresponding subjective +sentiments are explicitly considered. The expressive chart generation module +augments standard charts with intuitive visual encodings and concise texts to +accurately convey the underlying data and insights. We extensively evaluate the +effectiveness of ChartifyText on real-world data-involved text documents +through case studies, in-depth interviews with three visualization experts, and +a carefully-designed user study with 15 participants. The results demonstrate +the usefulness and effectiveness of ChartifyText in helping readers efficiently +and effectively make sense of data-involved texts. + +
+
+
+
+
+ + ☆ Graph Neural Patching for Cold-Start Recommendations + + +
+ The cold start problem in recommender systems remains a critical challenge. +Current solutions often train hybrid models on auxiliary data for both cold and +warm users/items, potentially degrading the experience for the latter. This +drawback limits their viability in practical scenarios where the satisfaction +of existing warm users/items is paramount. Although graph neural networks +(GNNs) excel at warm recommendations by effective collaborative signal +modeling, they haven't been effectively leveraged for the cold-start issue +within a user-item graph, which is largely due to the lack of initial +connections for cold user/item entities. Addressing this requires a GNN adept +at cold-start recommendations without sacrificing performance for existing +ones. To this end, we introduce Graph Neural Patching for Cold-Start +Recommendations (GNP), a customized GNN framework with dual functionalities: +GWarmer for modeling collaborative signal on existing warm users/items and +Patching Networks for simulating and enhancing GWarmer's performance on +cold-start recommendations. Extensive experiments on three benchmark datasets +confirm GNP's superiority in recommending both warm and cold users/items. + +
+
+ comment: 13 pages, accepted by Australasian Database Conference 2024. arXiv + admin note: substantial text overlap with arXiv:2209.12215 +
+
+
+
+
+ + ☆ Personalized Image Generation with Large Multimodal Models + + +
+ Personalized content filtering, such as recommender systems, has become a +critical infrastructure to alleviate information overload. However, these +systems merely filter existing content and are constrained by its limited +diversity, making it difficult to meet users' varied content needs. To address +this limitation, personalized content generation has emerged as a promising +direction with broad applications. Nevertheless, most existing research focuses +on personalized text generation, with relatively little attention given to +personalized image generation. The limited work in personalized image +generation faces challenges in accurately capturing users' visual preferences +and needs from noisy user-interacted images and complex multimodal +instructions. Worse still, there is a lack of supervised data for training +personalized image generation models. + To overcome the challenges, we propose a Personalized Image Generation +Framework named Pigeon, which adopts exceptional large multimodal models with +three dedicated modules to capture users' visual preferences and needs from +noisy user history and multimodal instructions. To alleviate the data scarcity, +we introduce a two-stage preference alignment scheme, comprising masked +preference reconstruction and pairwise preference alignment, to align Pigeon +with the personalized image generation task. We apply Pigeon to personalized +sticker and movie poster generation, where extensive quantitative results and +human evaluation highlight its superiority over various generative baselines. + +
+
+
+
+
+ + ☆ Optimizing Retrieval-Augmented Generation with Elasticsearch for + Enhanced Question-Answering Systems + + +
+ This study aims to improve the accuracy and quality of large-scale language
+models (LLMs) in answering questions by integrating Elasticsearch into the
+Retrieval Augmented Generation (RAG) framework. The experiments use the
+Stanford Question Answering Dataset (SQuAD) version 2.0 as the test dataset
+and compare the performance of different retrieval methods, including
+traditional methods based on keyword matching or semantic similarity
+calculation, BM25-RAG and TF-IDF-RAG, and the newly proposed ES-RAG scheme.
+The results show that ES-RAG not only has clear advantages in retrieval
+efficiency but also performs well on key indicators such as accuracy, where it
+is 0.51 percentage points higher than TF-IDF-RAG. In addition, Elasticsearch's
+powerful search capabilities and rich configuration options enable the entire
+question-answering system to better handle complex queries and to provide more
+flexible and efficient responses based on the diverse needs of users. Future
+research can further explore how to optimize the interaction mechanism between
+Elasticsearch and the LLM, such as introducing higher-level semantic
+understanding and context-awareness capabilities, to achieve a more
+intelligent and humanized question-answering experience.
+
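A hedged sketch of the ES-RAG retrieval step using the standard Elasticsearch 8.x Python client: candidate passages are fetched with a full-text query and packed into an LLM prompt. The index name, field names, running local cluster, and prompt wording are all assumptions for illustration:

```python
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")            # assumed local cluster

def retrieve_context(question, index="squad_passages", k=3):
    """Full-text retrieval step of the RAG pipeline backed by Elasticsearch."""
    resp = es.search(index=index,
                     query={"match": {"text": question}},
                     size=k)
    return [hit["_source"]["text"] for hit in resp["hits"]["hits"]]

def build_prompt(question):
    """Assemble the grounded prompt that is then sent to the LLM."""
    context = "\n".join(retrieve_context(question))
    return (f"Answer the question using only the context below.\n"
            f"Context:\n{context}\n\nQuestion: {question}\nAnswer:")

print(build_prompt("When was the Eiffel Tower completed?"))
```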
+
+
+
+
+ + ☆ Towards Robust Transcription: Exploring Noise Injection Strategies for + Training Data Augmentation + + +
+ Recent advancements in Automatic Piano Transcription (APT) have significantly +improved system performance, but the impact of noisy environments on the system +performance remains largely unexplored. This study investigates the impact of +white noise at various Signal-to-Noise Ratio (SNR) levels on state-of-the-art +APT models and evaluates the performance of the Onsets and Frames model when +trained on noise-augmented data. We hope this research provides valuable +insights as preliminary work toward developing transcription models that +maintain consistent performance across a range of acoustic conditions. + +
+
+ comment: Accepted to the Late-Breaking Demo Session of the 25th International + Society for Music Information Retrieval (ISMIR) Conference, 2024 +
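+
+ A minimal version of the noise-injection augmentation the abstract describes
+ might look as follows; the Gaussian white-noise model and the example SNR
+ values are assumptions, and any APT training loop could call this on clean
+ audio before feature extraction.
+
+ ```python
+ import numpy as np
+
+ def add_white_noise(audio: np.ndarray, snr_db: float, rng=None) -> np.ndarray:
+     """Mix white Gaussian noise into `audio` at the requested signal-to-noise ratio (dB)."""
+     rng = np.random.default_rng() if rng is None else rng
+     signal_power = np.mean(audio ** 2)
+     noise_power = signal_power / (10.0 ** (snr_db / 10.0))
+     noise = rng.normal(0.0, np.sqrt(noise_power), size=audio.shape)
+     return audio + noise
+
+ # Example: augment clean piano recordings at a few SNR levels (values illustrative).
+ # noisy = [add_white_noise(x, snr_db=s) for x in clean_audio for s in (0, 10, 20)]
+ ```
+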
+
+
+
+
+ + ♻ ☆ Retrieval-Enhanced Machine Learning: Synthesis and Opportunities + + +
+ In the field of language modeling, models augmented with retrieval components +have emerged as a promising solution to address several challenges faced in the +natural language processing (NLP) field, including knowledge grounding, +interpretability, and scalability. Despite the primary focus on NLP, we posit +that the paradigm of retrieval-enhancement can be extended to a broader +spectrum of machine learning (ML) such as computer vision, time series +prediction, and computational biology. Therefore, this work introduces a formal +framework of this paradigm, Retrieval-Enhanced Machine Learning (REML), by +synthesizing the literature in various domains in ML with consistent notations +which is missing from the current literature. Also, we found that while a +number of studies employ retrieval components to augment their models, there is +a lack of integration with foundational Information Retrieval (IR) research. We +bridge this gap between the seminal IR research and contemporary REML studies +by investigating each component that comprises the REML framework. Ultimately, +the goal of this work is to equip researchers across various disciplines with a +comprehensive, formally structured framework of retrieval-enhanced models, +thereby fostering interdisciplinary future research. + +
+
+
+
+
+ + ♻ ☆ EasyRec: Simple yet Effective Language Models for Recommendation + + +
+ Deep neural networks have become a powerful technique for learning +representations from user-item interaction data in collaborative filtering (CF) +for recommender systems. However, many existing methods heavily rely on unique +user and item IDs, which limits their ability to perform well in practical +zero-shot learning scenarios where sufficient training data may be unavailable. +Inspired by the success of language models (LMs) and their strong +generalization capabilities, a crucial question arises: How can we harness the +potential of language models to empower recommender systems and elevate its +generalization capabilities to new heights? In this study, we propose EasyRec - +an effective and easy-to-use approach that seamlessly integrates text-based +semantic understanding with collaborative signals. EasyRec employs a +text-behavior alignment framework, which combines contrastive learning with +collaborative language model tuning, to ensure a strong alignment between the +text-enhanced semantic space and the collaborative behavior information. +Extensive empirical evaluations across diverse real-world datasets demonstrate +the superior performance of EasyRec compared to state-of-the-art alternative +models, particularly in the challenging text-based zero-shot recommendation +scenarios. Furthermore, the study highlights the potential of seamlessly +integrating EasyRec as a plug-and-play component into text-enhanced +collaborative filtering frameworks, thereby empowering existing recommender +systems to elevate their recommendation performance and adapt to the evolving +user preferences in dynamic environments. For better result reproducibility of +our EasyRec framework, the model implementation details, source code, and +datasets are available at the link: https://github.com/HKUDS/EasyRec. + +
+
+
+
+
+ + ♻ ☆ Improving Retrieval in Sponsored Search by Leveraging Query Context + Signals EMNLP 2024 + + +
+ Accurately retrieving relevant bid keywords for user queries is critical in +Sponsored Search but remains challenging, particularly for short, ambiguous +queries. Existing dense and generative retrieval models often fail to capture +nuanced user intent in these cases. To address this, we propose an approach to +enhance query understanding by augmenting queries with rich contextual signals +derived from web search results and large language models, stored in an online +cache. Specifically, we use web search titles and snippets to ground queries in +real-world information and utilize GPT-4 to generate query rewrites and +explanations that clarify user intent. These signals are efficiently integrated +through a Fusion-in-Decoder based Unity architecture, enabling both dense and +generative retrieval with serving costs on par with traditional context-free +models. To address scenarios where context is unavailable in the cache, we +introduce context glancing, a curriculum learning strategy that improves model +robustness and performance even without contextual signals during inference. +Extensive offline experiments demonstrate that our context-aware approach +substantially outperforms context-free models. Furthermore, online A/B testing +on a prominent search engine across 160+ countries shows significant +improvements in user engagement and revenue. + +
+
+ comment: Accepted to EMNLP 2024 Industry Track. 10 pages, 10 tables, 1 figure +
+
+
+
+
+ + ♻ ☆ Revisiting BPR: A Replicability Study of a Common Recommender System + Baseline RecSys + '24 + + +
+ Bayesian Personalized Ranking (BPR), a collaborative filtering approach based
+ on matrix factorization, frequently serves as a benchmark for recommender
+ systems research. However, numerous studies often overlook the nuances of BPR
+ implementation, claiming that it performs worse than newly proposed methods
+ across various tasks. In this paper, we thoroughly examine the features of the
+ BPR model, indicating their impact on its performance, and investigate
+ open-source BPR implementations. Our analysis reveals inconsistencies between
+ these implementations and the original BPR paper, leading to a significant
+ decrease in performance of up to 50% for specific implementations. Furthermore,
+ through extensive experiments on real-world datasets under modern evaluation
+ settings, we demonstrate that with proper tuning of its hyperparameters, the
+ BPR model can achieve performance levels close to state-of-the-art methods on
+ top-n recommendation tasks and even outperform them on specific datasets.
+ Specifically, on the Million Song Dataset, the BPR model with hyperparameter
+ tuning statistically significantly outperforms Mult-VAE by 10% in NDCG@100 with
+ a binary relevance function.
+
+ comment: This paper is accepted at the Reproducibility track of the ACM RecSys + '24 conference +
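+
+ For reference, the pairwise objective under discussion is compact enough to
+ restate. The sketch below is a generic PyTorch version of the BPR loss; the
+ regularisation weight and the way negatives are drawn are illustrative choices,
+ and are exactly the kind of implementation details the study shows can swing
+ results.
+
+ ```python
+ import torch.nn.functional as F
+
+ def bpr_loss(user_emb, pos_item_emb, neg_item_emb, reg: float = 1e-4):
+     """BPR: maximise log sigmoid(score(u, i+) - score(u, i-)) with L2 regularisation.
+     `reg` and the negative-sampling scheme are illustrative, not fixed by BPR itself."""
+     pos_scores = (user_emb * pos_item_emb).sum(dim=-1)
+     neg_scores = (user_emb * neg_item_emb).sum(dim=-1)
+     ranking_loss = -F.logsigmoid(pos_scores - neg_scores).mean()
+     l2 = user_emb.pow(2).sum() + pos_item_emb.pow(2).sum() + neg_item_emb.pow(2).sum()
+     return ranking_loss + reg * l2 / user_emb.size(0)
+ ```
+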
+
+
+
+
+ + ♻ ☆ Conversational Recommender System and Large Language Model Are Made for + Each Other in E-commerce Pre-sales Dialogue EMNLP 2023 + + +
+ E-commerce pre-sales dialogue aims to understand and elicit user needs and +preferences for the items they are seeking so as to provide appropriate +recommendations. Conversational recommender systems (CRSs) learn user +representation and provide accurate recommendations based on dialogue context, +but rely on external knowledge. Large language models (LLMs) generate responses +that mimic pre-sales dialogues after fine-tuning, but lack domain-specific +knowledge for accurate recommendations. Intuitively, the strengths of LLM and +CRS in E-commerce pre-sales dialogues are complementary, yet no previous work +has explored this. This paper investigates the effectiveness of combining LLM +and CRS in E-commerce pre-sales dialogues, proposing two collaboration methods: +CRS assisting LLM and LLM assisting CRS. We conduct extensive experiments on a +real-world dataset of Ecommerce pre-sales dialogues. We analyze the impact of +two collaborative approaches with two CRSs and two LLMs on four tasks of +Ecommerce pre-sales dialogue. We find that collaborations between CRS and LLM +can be very effective in some cases. + +
+
+ comment: EMNLP 2023 Findings +
+
+
+
+
+ + ♻ ☆ Starbucks: Improved Training for 2D Matryoshka Embeddings + + +
+ Effective approaches that can scale embedding model depth (i.e., layers) and
+ embedding size allow for the creation of models that are highly scalable across
+ different computational resources and task requirements. While the recently
+ proposed 2D Matryoshka training approach can efficiently produce a single
+ embedding model such that its sub-layers and sub-dimensions can measure text
+ similarity, its effectiveness is significantly worse than if smaller models
+ were trained separately. To address this issue, we propose Starbucks, a new
+ training strategy for Matryoshka-like embedding models, which encompasses both
+ the fine-tuning and pre-training phases. For the fine-tuning phase, we discover
+ that, rather than sampling a random sub-layer and sub-dimension for each
+ training step, providing a fixed list of layer-dimension pairs, from small
+ sizes to large sizes, and computing the loss across all pairs significantly
+ improves the effectiveness of 2D Matryoshka embedding models, bringing them on
+ par with their separately trained counterparts. To further enhance performance,
+ we introduce a new pre-training strategy, which applies masked autoencoder
+ language modelling to sub-layers and sub-dimensions during pre-training,
+ resulting in a stronger backbone for subsequent fine-tuning of the embedding
+ model. Experimental results on both semantic text similarity and retrieval
+ benchmarks demonstrate that the proposed pre-training and fine-tuning
+ strategies significantly improve effectiveness over 2D Matryoshka models,
+ enabling Starbucks models to perform more efficiently and effectively than
+ separately trained models.
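+
+ The fine-tuning recipe above, i.e. a fixed list of layer-dimension pairs instead
+ of random sampling, can be sketched roughly as follows; the particular
+ (layer, dim) grid and the pooling of hidden states are illustrative assumptions,
+ not the paper's exact configuration.
+
+ ```python
+ import torch.nn.functional as F
+
+ # Illustrative small-to-large grid of (sub-layer, sub-dimension) pairs.
+ LAYER_DIM_PAIRS = [(2, 64), (4, 128), (6, 256), (12, 768)]
+
+ def starbucks_style_loss(pooled_a, pooled_b, labels, loss_fn):
+     """Average the training loss over every (layer, dim) pair rather than sampling one.
+     `pooled_a[l]` / `pooled_b[l]` are [batch, dim] pooled outputs of layer l for the two
+     text inputs; `loss_fn` stands in for the model's contrastive/similarity objective."""
+     total = 0.0
+     for layer, dim in LAYER_DIM_PAIRS:
+         a = F.normalize(pooled_a[layer][:, :dim], dim=-1)  # truncate embedding dimensions
+         b = F.normalize(pooled_b[layer][:, :dim], dim=-1)
+         total = total + loss_fn(a, b, labels)
+     return total / len(LAYER_DIM_PAIRS)
+ ```
+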
+
+
+
+
+ + ♻ ☆ Graph Neural Network Enhanced Retrieval for Question Answering of LLMs + + +
+ Retrieval augmented generation has revolutionized large language model (LLM) +outputs by providing factual supports. Nevertheless, it struggles to capture +all the necessary knowledge for complex reasoning questions. Existing retrieval +methods typically divide reference documents into passages, treating them in +isolation. These passages, however, are often interrelated, such as passages +that are contiguous or share the same keywords. Therefore, it is crucial to +recognize such relatedness for enhancing the retrieval process. In this paper, +we propose a novel retrieval method, called GNN-Ret, which leverages graph +neural networks (GNNs) to enhance retrieval by exploiting the relatedness +between passages. Specifically, we first construct a graph of passages by +connecting passages that are structure-related or keyword-related. A graph +neural network (GNN) is then leveraged to exploit the relationships between +passages and improve the retrieval of supporting passages. Furthermore, we +extend our method to handle multi-hop reasoning questions using a recurrent +graph neural network (RGNN), named RGNN-Ret. At each step, RGNN-Ret integrates +the graphs of passages from previous steps, thereby enhancing the retrieval of +supporting passages. Extensive experiments on benchmark datasets demonstrate +that GNN-Ret achieves higher accuracy for question answering with a single +query of LLMs than strong baselines that require multiple queries, and RGNN-Ret +further improves accuracy and achieves state-of-the-art performance, with up to +10.4% accuracy improvement on the 2WikiMQA dataset. + +
+
+ comment: Under review +
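+
+ The passage graph the method builds can be pictured in a few lines; the sketch
+ below connects contiguous passages and passages that share keywords, with
+ keyword extraction and the GNN itself left abstract. The graph library and edge
+ attributes are illustrative choices, not the paper's implementation.
+
+ ```python
+ import itertools
+ import networkx as nx
+
+ def build_passage_graph(passages: list[str], keywords: list[set]) -> nx.Graph:
+     """Nodes are passages; edges link contiguous passages (structure-related) and
+     passages sharing at least one keyword (keyword-related). A GNN would then
+     propagate information over this graph during retrieval."""
+     g = nx.Graph()
+     g.add_nodes_from(range(len(passages)))
+     for i in range(len(passages) - 1):
+         g.add_edge(i, i + 1, kind="structure")
+     for i, j in itertools.combinations(range(len(passages)), 2):
+         if keywords[i] & keywords[j]:
+             g.add_edge(i, j, kind="keyword")
+     return g
+ ```
+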
+
+
+
+
+ + ♻ ☆ FINED: Feed Instance-Wise Information Need with Essential and + Disentangled Parametric Knowledge from the Past + + +
+ Recommender models play a vital role in various industrial scenarios, but are
+ often faced with the catastrophic forgetting problem caused by fast-shifting
+ data distributions. To alleviate this problem, a common approach is to reuse
+ knowledge from historical data. However, preserving the vast and
+ fast-accumulating data is hard, which causes dramatic storage overhead.
+ Memorizing old data through a parametric knowledge base is then proposed, which
+ compresses the vast amount of raw data into model parameters. Despite its
+ flexibility, it is challenging to improve the memorization and generalization
+ capabilities of the parametric knowledge base while suiting the flexible
+ information need of each instance. In this paper, we propose FINED to Feed
+ INstance-wise information need with Essential and Disentangled parametric
+ knowledge from past data for recommendation enhancement. Concretely, we train a
+ knowledge extractor that extracts knowledge patterns of arbitrary order from
+ past data and a knowledge encoder that memorizes the arbitrary-order patterns,
+ which serve as the retrieval key generator and memory network, respectively, in
+ the following knowledge reusing phase. The whole process is regularized by the
+ two proposed constraints, which improve the capabilities of the parametric
+ knowledge base without increasing its size. The essential principle helps to
+ compress the input into representative vectors that capture the task-relevant
+ information and filter out the noisy information. The disentanglement principle
+ reduces the redundancy of stored information and pushes the knowledge base to
+ focus on capturing the disentangled invariant patterns. These two rules
+ together promote rational compression of information for robust and generalized
+ knowledge representations. Extensive experiments on two datasets justify the
+ effectiveness of the proposed method.
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ Parallel Backpropagation for Inverse of a Convolution with Application + to Normalizing Flows + + +
+ Inverse of an invertible convolution is an important operation that comes up +in Normalizing Flows, Image Deblurring, etc. The naive algorithm for +backpropagation of this operation using Gaussian elimination has running time +$O(n^3)$ where $n$ is the number of pixels in the image. We give a fast +parallel backpropagation algorithm with running time $O(\sqrt{n})$ for a square +image and provide a GPU implementation of the same. Inverse Convolutions are +usually used in Normalizing Flows in the sampling pass, making them slow. We +propose to use Inverse Convolutions in the forward (image to latent vector) +pass of the Normalizing flow. Since the sampling pass is the inverse of the +forward pass, it will use convolutions only, resulting in efficient sampling +times. We use our parallel backpropagation algorithm for optimizing the inverse +convolution layer resulting in fast training times also. We implement this +approach in various Normalizing Flow backbones, resulting in our Inverse-Flow +models. We benchmark Inverse-Flow on standard datasets and show significantly +improved sampling times with similar bits per dimension compared to previous +models. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ RA-BLIP: Multimodal Adaptive Retrieval-Augmented Bootstrapping + Language-Image Pre-training + + +
+ Multimodal Large Language Models (MLLMs) have recently received substantial +interest, which shows their emerging potential as general-purpose models for +various vision-language tasks. MLLMs involve significant external knowledge +within their parameters; however, it is challenging to continually update these +models with the latest knowledge, which involves huge computational costs and +poor interpretability. Retrieval augmentation techniques have proven to be +effective plugins for both LLMs and MLLMs. In this study, we propose multimodal +adaptive Retrieval-Augmented Bootstrapping Language-Image Pre-training +(RA-BLIP), a novel retrieval-augmented framework for various MLLMs. Considering +the redundant information within vision modality, we first leverage the +question to instruct the extraction of visual information through interactions +with one set of learnable queries, minimizing irrelevant interference during +retrieval and generation. Besides, we introduce a pre-trained multimodal +adaptive fusion module to achieve question text-to-multimodal retrieval and +integration of multimodal knowledge by projecting visual and language +modalities into a unified semantic space. Furthermore, we present an Adaptive +Selection Knowledge Generation (ASKG) strategy to train the generator to +autonomously discern the relevance of retrieved knowledge, which realizes +excellent denoising performance. Extensive experiments on open multimodal +question-answering datasets demonstrate that RA-BLIP achieves significant +performance and surpasses the state-of-the-art retrieval-augmented models. + +
+
+ comment: 10 pages, 6 figures, Journal +
+
+
+
+
+ + ♻ ☆ An automatic mixing speech enhancement system for multi-track audio + + +
+ We propose a speech enhancement system for multitrack audio. The system
+ minimizes auditory masking while allowing one to hear multiple simultaneous
+ speakers. The system can be used in multiple communication scenarios, e.g.,
+ teleconferencing, in-game voice chat, and live streaming. The ITU-R BS.1387
+ Perceptual Evaluation of Audio Quality (PEAQ) model is used to evaluate the
+ amount of masking in the audio signals. Different audio effects, e.g., level
+ balance, equalization, dynamic range compression, and spatialization, are
+ applied via an iterative harmony search algorithm that aims to minimize the
+ masking. In the subjective listening test, the designed system can compete with
+ mixes by professional sound engineers and outperforms mixes by existing
+ auto-mixing systems.
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ Movie101v2: Improved Movie Narration Benchmark + + +
+ Automatic movie narration aims to generate video-aligned plot descriptions to +assist visually impaired audiences. Unlike standard video captioning, it +involves not only describing key visual details but also inferring plots that +unfold across multiple movie shots, presenting distinct and complex challenges. +To advance this field, we introduce Movie101v2, a large-scale, bilingual +dataset with enhanced data quality specifically designed for movie narration. +Revisiting the task, we propose breaking down the ultimate goal of automatic +movie narration into three progressive stages, offering a clear roadmap with +corresponding evaluation metrics. Based on our new benchmark, we baseline a +range of large vision-language models, including GPT-4V, and conduct an +in-depth analysis of the challenges in narration generation. Our findings +highlight that achieving applicable movie narration generation is a fascinating +goal that requires significant research. + +
+
+
+
+
+ + ♻ ☆ Perceptual Quality Assessment of Octree-RAHT Encoded 3D Point Clouds + + +
+ No-reference bitstream-layer point cloud quality assessment (PCQA) can be
+ deployed without full decoding at any network node to achieve real-time quality
+ monitoring. In this work, we focus on the PCQA problem dedicated to the
+ Octree-RAHT encoding mode. First, to address the issue that existing PCQA
+ databases have a small scale and limited distortion levels, we establish the
+ WPC5.0 database, which is the first one dedicated to the Octree-RAHT encoding
+ mode, with a scale of 400 distorted point clouds (PCs) covering 4 geometry
+ distortion levels multiplied by 5 attribute distortion levels. Then, we propose
+ the first PCQA model dedicated to the Octree-RAHT encoding mode by parsing PC
+ bitstreams without full decoding. The model introduces the texture bitrate
+ (TBPP) to predict texture complexity (TC) and further derives the texture
+ distortion factor. In addition, the Geometric Quantization Parameter (PQS) is
+ used to estimate the geometric distortion factor, which is then integrated into
+ the model along with the texture distortion factor to obtain the proposed PCQA
+ model named streamPCQ-OR. The proposed model has been compared with other
+ advanced PCQA methods on the WPC5.0, BASICS and M-PCCD databases, and
+ experimental results show that our model has excellent performance while having
+ very low computational complexity, providing a reliable choice for
+ time-critical applications. To facilitate subsequent research, the database and
+ source code will be publicly released at
+ https://github.com/qdushl/Waterloo-Point-Cloud-Database-5.0.
+
+
+
+
+ + ♻ ☆ Evaluating Semantic Variation in Text-to-Image Synthesis: A Causal + Perspective + + +
+ Accurate interpretation and visualization of human instructions are crucial +for text-to-image (T2I) synthesis. However, current models struggle to capture +semantic variations from word order changes, and existing evaluations, relying +on indirect metrics like text-image similarity, fail to reliably assess these +challenges. This often obscures poor performance on complex or uncommon +linguistic patterns by the focus on frequent word combinations. To address +these deficiencies, we propose a novel metric called SemVarEffect and a +benchmark named SemVarBench, designed to evaluate the causality between +semantic variations in inputs and outputs in T2I synthesis. Semantic variations +are achieved through two types of linguistic permutations, while avoiding +easily predictable literal variations. Experiments reveal that the +CogView-3-Plus and Ideogram 2 performed the best, achieving a score of 0.2/1. +Semantic variations in object relations are less understood than attributes, +scoring 0.07/1 compared to 0.17-0.19/1. We found that cross-modal alignment in +UNet or Transformers plays a crucial role in handling semantic variations, a +factor previously overlooked by a focus on textual encoders. Our work +establishes an effective evaluation framework that advances the T2I synthesis +community's exploration of human instruction understanding. Our benchmark and +code are available at https://github.com/zhuxiangru/SemVarBench . + +
+
+ comment: The only change in the current version update is the replacement of + the template with a more precise one +
+
+
+
+
+ + ♻ ☆ MixEval-X: Any-to-Any Evaluations from Real-World Data Mixtures + + +
+ Perceiving and generating diverse modalities are crucial for AI models to +effectively learn from and engage with real-world signals, necessitating +reliable evaluations for their development. We identify two major issues in +current evaluations: (1) inconsistent standards, shaped by different +communities with varying protocols and maturity levels; and (2) significant +query, grading, and generalization biases. To address these, we introduce +MixEval-X, the first any-to-any, real-world benchmark designed to optimize and +standardize evaluations across diverse input and output modalities. We propose +multi-modal benchmark mixture and adaptation-rectification pipelines to +reconstruct real-world task distributions, ensuring evaluations generalize +effectively to real-world use cases. Extensive meta-evaluations show our +approach effectively aligns benchmark samples with real-world task +distributions. Meanwhile, MixEval-X's model rankings correlate strongly with +that of crowd-sourced real-world evaluations (up to 0.98) while being much more +efficient. We provide comprehensive leaderboards to rerank existing models and +organizations and offer insights to enhance understanding of multi-modal +evaluations and inform future research. + +
+
+
+
+
+ + ♻ ☆ Synthesizing Sentiment-Controlled Feedback For Multimodal Text and Image + Data + + +
+ The ability to generate sentiment-controlled feedback in response to +multimodal inputs comprising text and images addresses a critical gap in +human-computer interaction. This capability allows systems to provide +empathetic, accurate, and engaging responses, with useful applications in +education, healthcare, marketing, and customer service. To this end, we have +constructed a large-scale Controllable Multimodal Feedback Synthesis (CMFeed) +dataset and propose a controllable feedback synthesis system. The system +features an encoder, decoder, and controllability block for textual and visual +inputs. It extracts features using a transformer and Faster R-CNN networks, +combining them to generate feedback. The CMFeed dataset includes images, texts, +reactions to the posts, human comments with relevance scores, and reactions to +these comments. These reactions train the model to produce feedback with +specified sentiments, achieving a sentiment classification accuracy of 77.23\%, +which is 18.82\% higher than the accuracy without controllability. The system +also incorporates a similarity module for assessing feedback relevance through +rank-based metrics and an interpretability technique to analyze the +contributions of textual and visual features during feedback generation. Access +to the CMFeed dataset and the system's code is available at +https://github.com/MIntelligence-Group/CMFeed. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 32 + +
+
+
+ + ☆ Best in Tau@LLMJudge: Criteria-Based Relevance Evaluation with Llama3 + + +
+ Traditional evaluation of information retrieval (IR) systems relies on +human-annotated relevance labels, which can be both biased and costly at scale. +In this context, large language models (LLMs) offer an alternative by allowing +us to directly prompt them to assign relevance labels for passages associated +with each query. In this study, we explore alternative methods to directly +prompt LLMs for assigned relevance labels, by exploring two hypotheses: + Hypothesis 1 assumes that it is helpful to break down "relevance" into +specific criteria - exactness, coverage, topicality, and contextual fit. We +explore different approaches that prompt large language models (LLMs) to obtain +criteria-level grades for all passages, and we consider various ways to +aggregate criteria-level grades into a relevance label. Hypothesis 2 assumes +that differences in linguistic style between queries and passages may +negatively impact the automatic relevance label prediction. We explore whether +improvements can be achieved by first synthesizing a summary of the passage in +the linguistic style of a query, and then using this summary in place of the +passage to assess its relevance. + We include an empirical evaluation of our approaches based on data from the +LLMJudge challenge run in Summer 2024, where our "Four Prompts" approach +obtained the highest scores in Kendall's tau. + +
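+
+ The criteria-then-aggregate idea of Hypothesis 1 can be sketched as follows;
+ the prompt wording, the 0-3 grade scale, and the averaging rule are
+ illustrative stand-ins rather than the exact prompts used in the submission.
+
+ ```python
+ CRITERIA = ["exactness", "coverage", "topicality", "contextual fit"]
+
+ def grade_criterion(llm, query: str, passage: str, criterion: str) -> int:
+     """Ask an LLM (any text-completion callable) for a single-criterion grade."""
+     prompt = (
+         f"Rate the {criterion} of the passage with respect to the query on a 0-3 scale. "
+         "Answer with a single digit.\n\n"
+         f"Query: {query}\n\nPassage: {passage}\n\nGrade:"
+     )
+     return int(llm(prompt).strip()[0])
+
+ def relevance_label(llm, query: str, passage: str) -> int:
+     """Aggregate criteria-level grades into one relevance label (here: rounded mean)."""
+     grades = [grade_criterion(llm, query, passage, c) for c in CRITERIA]
+     return round(sum(grades) / len(grades))
+ ```
+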
+
+
+
+
+ + ☆ Efficient Retrieval of Temporal Event Sequences from Textual + Descriptions + + +
+ Retrieving temporal event sequences from textual descriptions is essential +for applications such as analyzing e-commerce behavior, monitoring social media +activities, and tracking criminal incidents. In this paper, we introduce +TPP-LLM-Embedding, a unified model for efficiently embedding and retrieving +event sequences based on natural language descriptions. Built on the TPP-LLM +framework, which integrates large language models with temporal point +processes, our model encodes both event types and times, generating a +sequence-level representation through pooling. Textual descriptions are +embedded using the same architecture, ensuring a shared embedding space for +both sequences and descriptions. We optimize a contrastive loss based on +similarity between these embeddings, bringing matching pairs closer and +separating non-matching ones. TPP-LLM-Embedding enables efficient retrieval and +demonstrates superior performance compared to baseline models across diverse +datasets. + +
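+
+ The contrastive objective described above follows a standard in-batch pattern;
+ a plausible sketch, with the temperature value and the symmetric form as
+ assumptions rather than the paper's exact formulation, is:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def sequence_text_contrastive_loss(seq_emb, text_emb, tau: float = 0.05):
+     """In-batch contrastive loss: matching (event sequence, description) pairs lie on
+     the diagonal of the cosine-similarity matrix; non-matching pairs are pushed apart."""
+     seq_emb = F.normalize(seq_emb, dim=-1)
+     text_emb = F.normalize(text_emb, dim=-1)
+     logits = seq_emb @ text_emb.t() / tau
+     labels = torch.arange(seq_emb.size(0), device=seq_emb.device)
+     return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))
+ ```
+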
+
+
+
+
+ + ☆ FinQAPT: Empowering Financial Decisions with End-to-End LLM-driven + Question Answering Pipeline + + +
+ Financial decision-making hinges on the analysis of relevant information +embedded in the enormous volume of documents in the financial domain. To +address this challenge, we developed FinQAPT, an end-to-end pipeline that +streamlines the identification of relevant financial reports based on a query, +extracts pertinent context, and leverages Large Language Models (LLMs) to +perform downstream tasks. To evaluate the pipeline, we experimented with +various techniques to optimize the performance of each module using the FinQA +dataset. We introduced a novel clustering-based negative sampling technique to +enhance context extraction and a novel prompting method called Dynamic N-shot +Prompting to boost the numerical question-answering capabilities of LLMs. At +the module level, we achieved state-of-the-art accuracy on FinQA, attaining an +accuracy of 80.6\%. However, at the pipeline level, we observed decreased +performance due to challenges in extracting relevant context from financial +reports. We conducted a detailed error analysis of each module and the +end-to-end pipeline, pinpointing specific challenges that must be addressed to +develop a robust solution for handling complex financial tasks. + +
+
+ comment: Accepted in ICAIF 2024, 8 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ Identifying High Consideration E-Commerce Search Queries EMNLP 2024 + + +
+ In e-commerce, high consideration search missions typically require careful +and elaborate decision making, and involve a substantial research investment +from customers. We consider the task of identifying High Consideration (HC) +queries. Identifying such queries enables e-commerce sites to better serve user +needs using targeted experiences such as curated QA widgets that help users +reach purchase decisions. We explore the task by proposing an Engagement-based +Query Ranking (EQR) approach, focusing on query ranking to indicate potential +engagement levels with query-related shopping knowledge content during product +search. Unlike previous studies on predicting trends, EQR prioritizes +query-level features related to customer behavior, finance, and catalog +information rather than popularity signals. We introduce an accurate and +scalable method for EQR and present experimental results demonstrating its +effectiveness. Offline experiments show strong ranking performance. Human +evaluation shows a precision of 96% for HC queries identified by our model. The +model was commercially deployed, and shown to outperform human-selected queries +in terms of downstream customer impact, as measured through engagement. + +
+
+ comment: Accepted by EMNLP 2024 (Industry Track) +
+
+
+
+
+ + ☆ Knowledge-Aware Query Expansion with Large Language Models for Textual + and Relational Retrieval + + +
+ Large language models (LLMs) have been used to generate query expansions +augmenting original queries for improving information search. Recent studies +also explore providing LLMs with initial retrieval results to generate query +expansions more grounded to document corpus. However, these methods mostly +focus on enhancing textual similarities between search queries and target +documents, overlooking document relations. For queries like "Find me a highly +rated camera for wildlife photography compatible with my Nikon F-Mount lenses", +existing methods may generate expansions that are semantically similar but +structurally unrelated to user intents. To handle such semi-structured queries +with both textual and relational requirements, in this paper we propose a +knowledge-aware query expansion framework, augmenting LLMs with structured +document relations from knowledge graph (KG). To further address the limitation +of entity-based scoring in existing KG-based methods, we leverage document +texts as rich KG node representations and use document-based relation filtering +for our Knowledge-Aware Retrieval (KAR). Extensive experiments on three +datasets of diverse domains show the advantages of our method compared against +state-of-the-art baselines on textual and relational semi-structured retrieval. + +
+
+
+
+
+ + ☆ Disjointness Violations in Wikidata + + +
+ Disjointness checks are among the most important constraint checks in a +knowledge base and can be used to help detect and correct incorrect statements +and internal contradictions. Wikidata is a very large, community-managed +knowledge base. Because of both its size and construction, Wikidata contains +many incorrect statements and internal contradictions. We analyze the current +modeling of disjointness on Wikidata, identify patterns that cause these +disjointness violations and categorize them. We use SPARQL queries to identify +each ``culprit'' causing a disjointness violation and lay out formulas to +identify and fix conflicting information. We finally discuss how disjointness +information could be better modeled and expanded in Wikidata in the future. + +
+
+ comment: Sixth International Knowledge Graph and Semantic Web Conference +
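+
+ For a concrete flavour of the kind of SPARQL check involved, the sketch below
+ asks the public Wikidata endpoint for items asserted to be instances of two
+ classes one would expect to be disjoint. The class pair (human vs. taxon) and
+ the query shape are illustrative, not the paper's actual culprit-identification
+ formulas.
+
+ ```python
+ import requests
+
+ QUERY = """
+ SELECT ?item WHERE {
+   ?item wdt:P31/wdt:P279* wd:Q5 .       # instance of (a subclass of) human (Q5)
+   ?item wdt:P31/wdt:P279* wd:Q16521 .   # ...and also of taxon (Q16521)
+ }
+ LIMIT 50
+ """
+
+ resp = requests.get(
+     "https://query.wikidata.org/sparql",
+     params={"query": QUERY, "format": "json"},
+     headers={"User-Agent": "disjointness-check-example/0.1"},
+ )
+ for binding in resp.json()["results"]["bindings"]:
+     print(binding["item"]["value"])
+ ```
+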
+
+
+
+
+ + ☆ Pessimistic Evaluation + + +
+ Traditional evaluation of information access systems has focused primarily on +average utility across a set of information needs (information retrieval) or +users (recommender systems). In this work, we argue that evaluating only with +average metric measurements assumes utilitarian values not aligned with +traditions of information access based on equal access. We advocate for +pessimistic evaluation of information access systems focusing on worst case +utility. These methods are (a) grounded in ethical and pragmatic concepts, (b) +theoretically complementary to existing robustness and fairness methods, and +(c) empirically validated across a set of retrieval and recommendation tasks. +These results suggest that pessimistic evaluation should be included in +existing experimentation processes to better understand the behavior of +systems, especially when concerned with principles of social good. + +
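+
+ One concrete reading of "worst case utility" is to report the minimum or a low
+ quantile of per-user metric values alongside the mean; the sketch below does
+ exactly that, with the 5% quantile as an arbitrary illustrative choice rather
+ than the paper's estimator.
+
+ ```python
+ import numpy as np
+
+ def pessimistic_report(per_user_utility: np.ndarray, q: float = 0.05) -> dict:
+     """Summarise a per-user (or per-query) metric with average and worst-case views."""
+     return {
+         "mean": float(per_user_utility.mean()),
+         "min": float(per_user_utility.min()),
+         f"quantile_{q:g}": float(np.quantile(per_user_utility, q)),
+     }
+
+ # ndcg = np.array([...])  # e.g. one NDCG value per user
+ # print(pessimistic_report(ndcg))
+ ```
+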
+
+
+
+
+ + ☆ Large Language Models as Narrative-Driven Recommenders + + +
+ Narrative-driven recommenders aim to provide personalized suggestions for +user requests expressed in free-form text such as "I want to watch a thriller +with a mind-bending story, like Shutter Island." Although large language models +(LLMs) have been shown to excel in processing general natural language queries, +their effectiveness for handling such recommendation requests remains +relatively unexplored. To close this gap, we compare the performance of 38 +open- and closed-source LLMs of various sizes, such as LLama 3.2 and GPT-4o, in +a movie recommendation setting. For this, we utilize a gold-standard, +crowdworker-annotated dataset of posts from reddit's movie suggestion community +and employ various prompting strategies, including zero-shot, identity, and +few-shot prompting. Our findings demonstrate the ability of LLMs to generate +contextually relevant movie recommendations, significantly outperforming other +state-of-the-art approaches, such as doc2vec. While we find that closed-source +and large-parameterized models generally perform best, medium-sized open-source +models remain competitive, being only slightly outperformed by their more +computationally expensive counterparts. Furthermore, we observe no significant +differences across prompting strategies for most models, underscoring the +effectiveness of simple approaches such as zero-shot prompting for +narrative-driven recommendations. Overall, this work offers valuable insights +for recommender system researchers as well as practitioners aiming to integrate +LLMs into real-world recommendation tools. + +
+
+ comment: Under review; 19 pages +
+
+
+
+
+ + ☆ Cross-Domain Sequential Recommendation via Neural Process + + +
+ Cross-Domain Sequential Recommendation (CDSR) is a hot topic in
+ sequence-based user interest modeling, which aims at utilizing a single model
+ to predict the next items for different domains. To tackle CDSR, many methods
+ focus on fitting the behaviors of domain-overlapped users, relying heavily on
+ the collaborative signals in the same user's item sequences from different
+ domains to capture the synergy of cross-domain item-item correlation. However,
+ these overlapped users make up only a small fraction of the entire user set,
+ which introduces a strong assumption that this small group is enough to
+ represent the behavior characteristics of all users. Intuitively, such an
+ assumption is biased, and the insufficient learning of non-overlapped users
+ will inevitably limit model performance. Further, it is not trivial to model
+ non-overlapped user behaviors in CDSR because there are no behaviors from other
+ domains to collaborate with, making it hard for the observed single-domain
+ behavior sequences to contribute to cross-domain knowledge mining. Considering
+ this phenomenon, we raise a challenging and unexplored question: how can we
+ unleash the potential of non-overlapped users' behaviors to empower CDSR?
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Context-aware adaptive personalised recommendation: a meta-hybrid + + +
+ Recommenders are deployed across a wide range of e-commerce systems, reducing
+ the problem of information overload. The most common approach is to choose a
+ single recommender for the system to make predictions. However, users vary from
+ each other; thus, a one-size-fits-all approach seems sub-optimal. In this
+ paper, we propose a meta-hybrid recommender that uses machine learning to
+ predict an optimal algorithm, so that the best-performing recommender is used
+ for each specific session and user. This selection depends on contextual and
+ preferential information collected about the user. We use the standard
+ MovieLens and The Movie DB datasets for offline evaluation. We show that, based
+ on the proposed model, it is possible to predict which recommender will provide
+ the most precise recommendations to a user. The theoretical performance of our
+ meta-hybrid outperforms the separate approaches by 20-50% in normalized
+ Discounted Cumulative Gain and Root Mean Square Error metrics. However, it is
+ hard to obtain the optimal performance based on the widely used standard
+ information stored about users.
+
+
+
+
+ + ☆ Comparing the Utility, Preference, and Performance of Course Material + Search Functionality and Retrieval-Augmented Generation Large Language Model + (RAG-LLM) AI Chatbots in Information-Seeking Tasks + + +
+ Providing sufficient support for students requires substantial resources, +especially considering the growing enrollment numbers. Students need help in a +variety of tasks, ranging from information-seeking to requiring support with +course assignments. To explore the utility of recent large language models +(LLMs) as a support mechanism, we developed an LLM-powered AI chatbot that +augments the answers that are produced with information from the course +materials. To study the effect of the LLM-powered AI chatbot, we conducted a +lab-based user study (N=14), in which the participants worked on tasks from a +web software development course. The participants were divided into two groups, +where one of the groups first had access to the chatbot and then to a more +traditional search functionality, while another group started with the search +functionality and was then given the chatbot. We assessed the participants' +performance and perceptions towards the chatbot and the search functionality +and explored their preferences towards the support functionalities. Our +findings highlight that both support mechanisms are seen as useful and that +support mechanisms work well for specific tasks, while less so for other tasks. +We also observe that students tended to prefer the second support mechanism +more, where students who were first given the chatbot tended to prefer the +search functionality and vice versa. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ SBI-RAG: Enhancing Math Word Problem Solving for Students through + Schema-Based Instruction and Retrieval-Augmented Generation NeurIPS'24 + + +
+ Many students struggle with math word problems (MWPs), often finding it
+ difficult to identify key information and select the appropriate mathematical
+ operations. Schema-based instruction (SBI) is an evidence-based strategy that
+ helps students categorize problems based on their structure, improving
+ problem-solving accuracy. Building on this, we propose a Schema-Based
+ Instruction Retrieval-Augmented Generation (SBI-RAG) framework that
+ incorporates a large language model (LLM). Our approach emphasizes step-by-step
+ reasoning by leveraging schemas to guide solution generation. We evaluate its
+ performance on the GSM8K dataset, comparing it with GPT-4 and GPT-3.5 Turbo,
+ and introduce a "reasoning score" metric to assess solution quality. Our
+ findings suggest that SBI-RAG enhances reasoning clarity and problem-solving
+ accuracy, potentially providing educational benefits for students.
+
+ comment: Accepted to the 4th MATH-AI Workshop at NeurIPS'24 +
+
+
+
+
+ + ☆ Disentangling Likes and Dislikes in Personalized Generative Explainable + Recommendation + + +
+ Recent research on explainable recommendation generally frames the task as a +standard text generation problem, and evaluates models simply based on the +textual similarity between the predicted and ground-truth explanations. +However, this approach fails to consider one crucial aspect of the systems: +whether their outputs accurately reflect the users' (post-purchase) sentiments, +i.e., whether and why they would like and/or dislike the recommended items. To +shed light on this issue, we introduce new datasets and evaluation methods that +focus on the users' sentiments. Specifically, we construct the datasets by +explicitly extracting users' positive and negative opinions from their +post-purchase reviews using an LLM, and propose to evaluate systems based on +whether the generated explanations 1) align well with the users' sentiments, +and 2) accurately identify both positive and negative opinions of users on the +target items. We benchmark several recent models on our datasets and +demonstrate that achieving strong performance on existing metrics does not +ensure that the generated explanations align well with the users' sentiments. +Lastly, we find that existing models can provide more sentiment-aware +explanations when the users' (predicted) ratings for the target items are +directly fed into the models as input. We will release our code and datasets +upon acceptance. + +
+
+
+
+
+ + ☆ MixEHR-Nest: Identifying Subphenotypes within Electronic Health Records + through Hierarchical Guided-Topic Modeling + + +
+ Automatic subphenotyping from electronic health records (EHRs)provides +numerous opportunities to understand diseases with unique subgroups and enhance +personalized medicine for patients. However, existing machine learning +algorithms either focus on specific diseases for better interpretability or +produce coarse-grained phenotype topics without considering nuanced disease +patterns. In this study, we propose a guided topic model, MixEHR-Nest, to infer +sub-phenotype topics from thousands of disease using multi-modal EHR data. +Specifically, MixEHR-Nest detects multiple subtopics from each phenotype topic, +whose prior is guided by the expert-curated phenotype concepts such as +Phenotype Codes (PheCodes) or Clinical Classification Software (CCS) codes. We +evaluated MixEHR-Nest on two EHR datasets: (1) the MIMIC-III dataset consisting +of over 38 thousand patients from intensive care unit (ICU) from Beth Israel +Deaconess Medical Center (BIDMC) in Boston, USA; (2) the healthcare +administrative database PopHR, comprising 1.3 million patients from Montreal, +Canada. Experimental results demonstrate that MixEHR-Nest can identify +subphenotypes with distinct patterns within each phenotype, which are +predictive for disease progression and severity. Consequently, MixEHR-Nest +distinguishes between type 1 and type 2 diabetes by inferring subphenotypes +using CCS codes, which do not differentiate these two subtype concepts. +Additionally, MixEHR-Nest not only improved the prediction accuracy of +short-term mortality of ICU patients and initial insulin treatment in diabetic +patients but also revealed the contributions of subphenotypes. For longitudinal +analysis, MixEHR-Nest identified subphenotypes of distinct age prevalence under +the same phenotypes, such as asthma, leukemia, epilepsy, and depression. The +MixEHR-Nest software is available at GitHub: +https://github.com/li-lab-mcgill/MixEHR-Nest. + +
+
+
+
+
+ + ☆ Transformers4NewsRec: A Transformer-based News Recommendation Framework + + +
+ Pre-trained transformer models have shown great promise in various natural
+ language processing tasks, including personalized news recommendations. To
+ harness the power of these models, we introduce Transformers4NewsRec, a new
+ Python framework built on the Transformers library. This framework is
+ designed to unify and compare the performance of various news recommendation
+ models, including deep neural networks and graph-based models.
+ Transformers4NewsRec offers flexibility in terms of model selection, data
+ preprocessing, and evaluation, allowing both quantitative and qualitative
+ analysis.
+
+
+
+
+ + ☆ Retrieval-Enhanced Named Entity Recognition + + +
+ When combined with In-Context Learning, a technique that enables models to +adapt to new tasks by incorporating task-specific examples or demonstrations +directly within the input prompt, autoregressive language models have achieved +good performance in a wide range of tasks and applications. However, this +combination has not been properly explored in the context of named entity +recognition, where the structure of this task poses unique challenges. We +propose RENER (Retrieval-Enhanced Named Entity Recognition), a technique for +named entity recognition using autoregressive language models based on +In-Context Learning and information retrieval techniques. When presented with +an input text, RENER fetches similar examples from a dataset of training +examples that are used to enhance a language model to recognize named entities +from this input text. RENER is modular and independent of the underlying +language model and information retrieval algorithms. Experimental results show +that in the CrossNER collection we achieve state-of-the-art performance with +the proposed technique and that information retrieval can increase the F-score +by up to 11 percentage points. + +
+
+ comment: 13 pages, 6 figures, 3 tables +
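+
+ In spirit, the retrieve-then-prompt loop could look like the sketch below,
+ which uses a sentence-transformer retriever purely as an example; RENER is
+ explicitly modular, so the retriever, the demonstration pool, and the label
+ format here are all assumptions.
+
+ ```python
+ from sentence_transformers import SentenceTransformer, util
+
+ # Hypothetical demonstration pool of annotated training examples.
+ train_texts = ["Alan Turing was born in London.", "Apple released the iPhone in 2007."]
+ train_labels = ["Alan Turing [PER]; London [LOC]", "Apple [ORG]; iPhone [PRODUCT]; 2007 [DATE]"]
+
+ encoder = SentenceTransformer("all-MiniLM-L6-v2")
+ train_vecs = encoder.encode(train_texts, convert_to_tensor=True)
+
+ def build_ner_prompt(input_text: str, k: int = 2) -> str:
+     """Fetch the k most similar training examples and prepend them as in-context demos."""
+     query_vec = encoder.encode(input_text, convert_to_tensor=True)
+     hits = util.semantic_search(query_vec, train_vecs, top_k=k)[0]
+     demos = "\n\n".join(
+         f"Text: {train_texts[h['corpus_id']]}\nEntities: {train_labels[h['corpus_id']]}"
+         for h in hits
+     )
+     return f"{demos}\n\nText: {input_text}\nEntities:"
+ ```
+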
+
+
+
+
+ + ☆ Preference Diffusion for Recommendation + + +
+ Recommender systems predict personalized item rankings based on user
+ preference distributions derived from historical behavior data. Recently,
+ diffusion models (DMs) have gained attention in recommendation for their
+ ability to model complex distributions, yet current DM-based recommenders often
+ rely on traditional objectives like mean squared error (MSE) or recommendation
+ objectives, which are not optimized for personalized ranking tasks or fail to
+ fully leverage DMs' generative potential. To address this, we propose
+ PreferDiff, a tailored optimization objective for DM-based recommenders.
+ PreferDiff transforms BPR into a log-likelihood ranking objective and
+ integrates multiple negative samples to better capture user preferences.
+ Specifically, we employ variational inference to handle the intractability by
+ minimizing the variational upper bound, and we replace MSE with cosine error to
+ improve alignment with recommendation tasks. Finally, we balance learning
+ generation and preference to enhance the training stability of DMs. PreferDiff
+ offers three key benefits: it is the first personalized ranking loss designed
+ specifically for DM-based recommenders; it improves ranking quality and speeds
+ up convergence by addressing hard negatives; and, as we prove, it is
+ theoretically connected to Direct Preference Optimization, which indicates its
+ potential to align user preferences in DM-based recommenders via generative
+ modeling. Extensive experiments across three benchmarks validate its superior
+ recommendation performance and commendable general sequential recommendation
+ capabilities. Our codes are available at
+ https://github.com/lswhim/PreferDiff.
+
+
+
+
+ + ♻ ☆ Improving Bilingual Lexicon Induction with Cross-Encoder Reranking EMNLP 2022 + + +
+ Bilingual lexicon induction (BLI) with limited bilingual supervision is a +crucial yet challenging task in multilingual NLP. Current state-of-the-art BLI +methods rely on the induction of cross-lingual word embeddings (CLWEs) to +capture cross-lingual word similarities; such CLWEs are obtained 1) via +traditional static models (e.g., VecMap), or 2) by extracting type-level CLWEs +from multilingual pretrained language models (mPLMs), or 3) through combining +the former two options. In this work, we propose a novel semi-supervised +post-hoc reranking method termed BLICEr (BLI with Cross-Encoder Reranking), +applicable to any precalculated CLWE space, which improves their BLI +capability. The key idea is to 'extract' cross-lingual lexical knowledge from +mPLMs, and then combine it with the original CLWEs. This crucial step is done +via 1) creating a word similarity dataset, comprising positive word pairs +(i.e., true translations) and hard negative pairs induced from the original +CLWE space, and then 2) fine-tuning an mPLM (e.g., mBERT or XLM-R) in a +cross-encoder manner to predict the similarity scores. At inference, we 3) +combine the similarity score from the original CLWE space with the score from +the BLI-tuned cross-encoder. BLICEr establishes new state-of-the-art results on +two standard BLI benchmarks spanning a wide spectrum of diverse languages: it +substantially outperforms a series of strong baselines across the board. We +also validate the robustness of BLICEr with different CLWEs. + +
+
+ comment: Findings of EMNLP 2022 +
+
+
+
+
+ + ♻ ☆ The Moral Case for Using Language Model Agents for Recommendation + + +
+ Our information and communication environment has fallen short of the ideals +that networked global communication might have served. Identifying all the +causes of its pathologies is difficult, but existing recommender systems very +likely play a contributing role. In this paper, which draws on the normative +tools of philosophy of computing, informed by empirical and technical insights +from natural language processing and recommender systems, we make the moral +case for an alternative approach. We argue that existing recommenders +incentivise mass surveillance, concentrate power, fall prey to narrow +behaviourism, and compromise user agency. Rather than just trying to avoid +algorithms entirely, or to make incremental improvements to the current +paradigm, researchers and engineers should explore an alternative paradigm: the +use of language model (LM) agents to source and curate content that matches +users' preferences and values, expressed in natural language. The use of LM +agents for recommendation poses its own challenges, including those related to +candidate generation, computational efficiency, preference modelling, and +prompt injection. Nonetheless, if implemented successfully LM agents could: +guide us through the digital public sphere without relying on mass +surveillance; shift power away from platforms towards users; optimise for what +matters instead of just for behavioural proxies; and scaffold our agency +instead of undermining it. + +
+
+
+
+
+ + ♻ ☆ Improving Word Translation via Two-Stage Contrastive Learning ACL 2022 + + +
+ Word translation or bilingual lexicon induction (BLI) is a key cross-lingual +task, aiming to bridge the lexical gap between different languages. In this +work, we propose a robust and effective two-stage contrastive learning +framework for the BLI task. At Stage C1, we propose to refine standard +cross-lingual linear maps between static word embeddings (WEs) via a +contrastive learning objective; we also show how to integrate it into the +self-learning procedure for even more refined cross-lingual maps. In Stage C2, +we conduct BLI-oriented contrastive fine-tuning of mBERT, unlocking its word +translation capability. We also show that static WEs induced from the +`C2-tuned' mBERT complement static WEs from Stage C1. Comprehensive experiments +on standard BLI datasets for diverse languages and different experimental +setups demonstrate substantial gains achieved by our framework. While the BLI +method from Stage C1 already yields substantial gains over all state-of-the-art +BLI methods in our comparison, even stronger improvements are met with the full +two-stage framework: e.g., we report gains for 112/112 BLI setups, spanning 28 +language pairs. + +
+
+ comment: ACL 2022 Main +
+
+
+
+
+ + ♻ ☆ Into the Unknown Unknowns: Engaged Human Learning through Participation + in Language Model Agent Conversations EMNLP 2024 + + +
+ While language model (LM)-powered chatbots and generative search engines +excel at answering concrete queries, discovering information in the terrain of +unknown unknowns remains challenging for users. To emulate the common +educational scenario where children/students learn by listening to and +participating in conversations of their parents/teachers, we create +Collaborative STORM (Co-STORM). Unlike QA systems that require users to ask all +the questions, Co-STORM lets users observe and occasionally steer the discourse +among several LM agents. The agents ask questions on the user's behalf, +allowing the user to discover unknown unknowns serendipitously. To facilitate +user interaction, Co-STORM assists users in tracking the discourse by +organizing the uncovered information into a dynamic mind map, ultimately +generating a comprehensive report as takeaways. For automatic evaluation, we +construct the WildSeek dataset by collecting real information-seeking records +with user goals. Co-STORM outperforms baseline methods on both discourse trace +and report quality. In a further human evaluation, 70% of participants prefer +Co-STORM over a search engine, and 78% favor it over a RAG chatbot. + +
+
+ comment: EMNLP 2024 Main +
+
+
+
+
+ + ♻ ☆ Mixed-initiative Query Rewriting in Conversational Passage Retrieval + + +
+ In this paper, we report our methods and experiments for the TREC +Conversational Assistance Track (CAsT) 2022. In this work, we aim to reproduce +multi-stage retrieval pipelines and explore one of the potential benefits of +involving mixed-initiative interaction in conversational passage retrieval +scenarios: reformulating raw queries. Before the first ranking stage of a +multi-stage retrieval pipeline, we propose a mixed-initiative query rewriting +module, which achieves query rewriting based on the mixed-initiative +interaction between the users and the system, as the replacement for the neural +rewriting method. Specifically, we design an algorithm to generate appropriate +questions related to the ambiguities in raw queries, and another algorithm to +reformulate raw queries by parsing users' feedback and incorporating it into +the raw query. For the first ranking stage of our multi-stage pipelines, we +adopt a sparse ranking function: BM25, and a dense retrieval method: +TCT-ColBERT. For the second-ranking step, we adopt a pointwise reranker: +MonoT5, and a pairwise reranker: DuoT5. Experiments on both TREC CAsT 2021 and +TREC CAsT 2022 datasets show the effectiveness of our mixed-initiative-based +query rewriting (or query reformulation) method on improving retrieval +performance compared with two popular reformulators: a neural reformulator: +CANARD-T5 and a rule-based reformulator: historical query reformulator(HQE). + +
+
+ comment: https://trec.nist.gov/pubs/trec31/papers/udel_fang.C.pdf +
+
+
+
+
+ + ♻ ☆ Behavior Alignment: A New Perspective of Evaluating LLM-based + Conversational Recommender Systems SIGIR + + +
+ Large Language Models (LLMs) have demonstrated great potential in
+ Conversational Recommender Systems (CRS). However, the application of LLMs to
+ CRS has exposed a notable discrepancy in behavior between LLM-based CRS and
+ human recommenders: LLMs often appear inflexible and passive, frequently
+ rushing to complete the recommendation task without sufficient inquiry. This
+ behavior discrepancy can lead to decreased accuracy in recommendations and
+ lower user satisfaction. Despite its importance, existing studies in CRS lack a
+ way to measure such behavior discrepancy. To fill this gap, we propose Behavior
+ Alignment, a new evaluation metric that measures how well the recommendation
+ strategies made by an LLM-based CRS are consistent with human recommenders'.
+ Our experimental results show that the new metric is better aligned with human
+ preferences and can better differentiate how systems perform than existing
+ evaluation metrics. As Behavior Alignment requires explicit and costly human
+ annotations on the recommendation strategies, we also propose a
+ classification-based method to implicitly measure Behavior Alignment based on
+ the responses. The evaluation results confirm the robustness of the method.
+
+ comment: Accepted by the 47th International ACM SIGIR Conference on Research + and Development in Information Retrieval +
+
+
+
+
+ + ♻ ☆ ZeQR: Zero-shot Query Reformulation for Conversational Search SIGIR + + +
+ As the popularity of voice assistants continues to surge, conversational +search has gained increased attention in Information Retrieval. However, data +sparsity issues in conversational search significantly hinder the progress of +supervised conversational search methods. Consequently, researchers are +focusing more on zero-shot conversational search approaches. Nevertheless, +existing zero-shot methods face three primary limitations: they are not +universally applicable to all retrievers, their effectiveness lacks sufficient +explainability, and they struggle to resolve common conversational ambiguities +caused by omission. To address these limitations, we introduce a novel +Zero-shot Query Reformulation (or Query Rewriting) (ZeQR) framework that +reformulates queries based on previous dialogue contexts without requiring +supervision from conversational search data. Specifically, our framework +utilizes language models designed for machine reading comprehension tasks to +explicitly resolve two common ambiguities: coreference and omission, in raw +queries. In comparison to existing zero-shot methods, our approach is +universally applicable to any retriever without additional adaptation or +indexing. It also provides greater explainability and effectively enhances +query intent understanding because ambiguities are explicitly and proactively +resolved. Through extensive experiments on four TREC conversational datasets, +we demonstrate the effectiveness of our method, which consistently outperforms +state-of-the-art baselines. + +
+
+ comment: Accepted by the 9th ACM SIGIR International Conference on the Theory + of Information Retrieval +
+
+
+
+
+ + ♻ ☆ Dataset Condensation for Recommendation + + +
+ Training recommendation models on large datasets requires significant time
+and resources. It is desirable to construct concise yet informative datasets
+for efficient training. Recent advances in dataset condensation show promise in
+addressing this problem by synthesizing small datasets. However, applying
+existing methods of dataset condensation to recommendation has limitations: (1)
+they fail to generate discrete user-item interactions, and (2) they cannot
+preserve users' potential preferences. To address these limitations, we propose
+a lightweight condensation framework tailored for recommendation (DConRec),
+focusing on condensing user-item historical interaction sets. Specifically, we
+model the discrete user-item interactions via a probabilistic approach and
+design a pre-augmentation module to incorporate the potential preferences of
+users into the condensed datasets. Because the substantial size of datasets
+makes optimization costly, we propose a lightweight policy gradient estimation
+to accelerate the data synthesis. Experimental results on multiple real-world
+datasets have demonstrated the effectiveness and efficiency of our framework.
+Besides, we provide a theoretical analysis of the provable convergence of
+DConRec. Our implementation is available at:
+https://github.com/JiahaoWuGit/DConRec.
+
+
+ comment: Accepted by IEEE TKDE. Also titled as "Condensing Pre-augmented + Recommendation Data via Lightweight Policy Gradient Estimation" +
+
+
+
+
+ + ♻ ☆ Mixed-Precision Embeddings for Large-Scale Recommendation Models + + +
+ Embedding techniques have become essential components of large databases in
+the deep learning era. By encoding discrete entities, such as words, items, or
+graph nodes, into continuous vector spaces, embeddings facilitate more
+efficient storage, retrieval, and processing in large databases. Especially in
+the domain of recommender systems, millions of categorical features are encoded
+as unique embedding vectors, which facilitates the modeling of similarities and
+interactions among features. However, numerous embedding vectors can result in
+significant storage overhead. In this paper, we aim to compress the embedding
+table through quantization techniques. Given that features vary in importance
+levels, we seek to identify an appropriate precision for each feature to
+balance model accuracy and memory usage. To this end, we propose a novel
+embedding compression method, termed Mixed-Precision Embeddings (MPE).
+Specifically, to reduce the size of the search space, we first group features
+by frequency and then search for a suitable precision for each feature group.
+MPE further learns the probability distribution over precision levels for each
+feature group, which can be used to identify the most suitable precision with a
+specially designed sampling strategy. Extensive experiments on three public
+datasets demonstrate that MPE significantly outperforms existing embedding
+compression methods. Remarkably, MPE achieves about 200x compression on the
+Criteo dataset without compromising the prediction accuracy.
+
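+ A simplified numpy sketch of the mixed-precision idea follows: features are
+grouped by frequency and each group's embedding rows are uniformly quantized to
+a hard-coded bit-width. MPE additionally learns a distribution over precision
+levels per group, which is not reproduced here; all numbers are synthetic.
+
+import numpy as np
+
+def quantize(rows, bits):
+    """Uniformly quantize embedding rows to `bits` bits and dequantize back."""
+    lo, hi = rows.min(), rows.max()
+    levels = 2 ** bits - 1
+    q = np.round((rows - lo) / (hi - lo + 1e-12) * levels)   # integer codes in [0, levels]
+    return q / levels * (hi - lo) + lo                       # dequantized values
+
+rng = np.random.default_rng(0)
+emb = rng.normal(size=(1000, 16)).astype(np.float32)         # embedding table
+freq = rng.zipf(2.0, size=1000)                              # per-feature access frequency
+
+# Group features by frequency and assign a bit-width to each group (hard-coded here).
+groups = {8: freq >= 10, 4: (freq >= 3) & (freq < 10), 2: freq < 3}
+compressed = emb.copy()
+for bits, mask in groups.items():
+    compressed[mask] = quantize(emb[mask], bits)
+
+print({bits: float(np.abs(emb[m] - compressed[m]).mean()) for bits, m in groups.items()})
+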
+
+ comment: under submision +
+
+
+
+
+ + ♻ ☆ Chatbot-Based Ontology Interaction Using Large Language Models and + Domain-Specific Standards + + +
+ The following contribution introduces a concept that employs Large Language +Models (LLMs) and a chatbot interface to enhance SPARQL query generation for +ontologies, thereby facilitating intuitive access to formalized knowledge. +Utilizing natural language inputs, the system converts user inquiries into +accurate SPARQL queries that strictly query the factual content of the +ontology, effectively preventing misinformation or fabrication by the LLM. To +enhance the quality and precision of outcomes, additional textual information +from established domain-specific standards is integrated into the ontology for +precise descriptions of its concepts and relationships. An experimental study +assesses the accuracy of generated SPARQL queries, revealing significant +benefits of using LLMs for querying ontologies and highlighting areas for +future research. + +
+
+ comment: © 2024 IEEE. Personal use of this material is permitted.
+ Permission from IEEE must be obtained for all other uses, in any current or
+ future media, including reprinting/republishing this material for advertising
+ or promotional purposes, creating new collective works, for resale or
+ redistribution to servers or lists, or reuse of any copyrighted component of
+ this work in other works.
+
+
+
+
+ + ♻ ☆ DiffATR: Diffusion-based Generative Modeling for Audio-Text Retrieval + + +
+ Existing audio-text retrieval (ATR) methods are essentially discriminative
+models that aim to maximize the conditional likelihood, represented as
+p(candidates|query). Nevertheless, this methodology fails to consider the
+intrinsic data distribution p(query), leading to difficulties in discerning
+out-of-distribution data. In this work, we attempt to tackle this constraint
+through a generative perspective and model the relationship between audio and
+text as their joint probability p(candidates,query). To this end, we present a
+diffusion-based ATR framework (DiffATR), which models ATR as an iterative
+procedure that progressively generates the joint distribution from noise.
+Throughout its training phase, DiffATR is optimized from both generative and
+discriminative viewpoints: the generator is refined through a generation loss,
+while the feature extractor benefits from a contrastive loss, thus combining
+the merits of both methodologies. Experiments on the AudioCaps and Clotho
+datasets show superior performance and verify the effectiveness of our
+approach. Notably, without any alterations, our DiffATR consistently exhibits
+strong performance in out-of-domain retrieval settings.
+
+
+ comment: Accepted by Interspeech2024 +
+
+
+
+
+ + ♻ ☆ Probability Distribution Learning: A theoretical framework for Deep + Learning + + +
+ This paper introduces probability distribution learning (PD learning), a +novel theoretical learning framework. Departing from the traditional +statistical learning framework, PD learning focuses on learning the underlying +probability distribution, which is modeled as a random variable within the +probability simplex. Within this framework, the learning error is decomposed +into uncertainty, estimation error, and the model's fitting error. +Subsequently, we present the methodology for calculating uncertainty, along +with optimization strategies for both estimation error and fitting error. Given +that minimizing the fitting error typically constitutes a non-convex +optimization problem, we introduce a standard loss function and the gradient +structural control (GSC) algorithm, and demonstrate that by employing this +function, the optima of fitting error minimization can be approached by +reducing the gradient norm and structural error. Furthermore, we apply the PD +learning framework to deep learning, elucidating the mechanisms by which +techniques such as random parameter initialization, over-parameterization, +bias-variance trade-off, and dropout influence deep model training. Finally, +experimental results on various models validate the effectiveness of the +proposed framework. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2105.04026 by other
+ authors
+
+
+
+
+ + ♻ ☆ NFT1000: A Cross-Modal Dataset for Non-Fungible Token Retrieval + + +
+ With the rise of the "Metaverse" and "Web 3.0", the Non-Fungible Token (NFT)
+has emerged as a pivotal kind of digital asset, garnering significant
+attention. By the end of March 2024, more than 1.7 billion NFTs have been
+minted across various blockchain platforms. To effectively locate a desired
+NFT, conducting searches within a vast array of NFTs is essential. The
+challenge in NFT retrieval is heightened due to the high degree of similarity
+among different NFTs, in both regional and semantic aspects. In this paper, we
+introduce a benchmark dataset named "NFT Top1000 Visual-Text Dataset"
+(NFT1000), containing 7.56 million image-text pairs collected from the 1000
+most famous PFP NFT collections by sales volume on the Ethereum blockchain.
+Based on this dataset and leveraging the CLIP series of pre-trained models as
+our foundation, we propose the dynamic masking fine-tuning scheme. This
+innovative approach results in a 7.4\% improvement in the top-1 accuracy rate,
+while utilizing merely 13\% of the total training data (0.79 million vs. 6.1
+million). We also propose a robust metric, the Comprehensive Variance Index
+(CVI), to assess the similarity and retrieval difficulty of visual-text pair
+data. The dataset will be released as an open-source resource. For more
+details, please refer to: https://github.com/ShuxunoO/NFT-Net.git.
+
+
+ comment: 11 pages, 12 figures, to be published in ACM Multimedia 2024, see
+ https://openreview.net/forum?id=xUtNrKH8iB&noteId=xUtNrKH8iB
+
+
+
+
+ + ♻ ☆ RAGEval: Scenario Specific RAG Evaluation Dataset Generation Framework + + +
+ Retrieval-Augmented Generation (RAG) is a powerful approach that enables
+large language models (LLMs) to incorporate external knowledge. However,
+evaluating the effectiveness of RAG systems in specialized scenarios remains
+challenging due to the high costs of data construction and the lack of suitable
+evaluation metrics. This paper introduces RAGEval, a framework designed to
+assess RAG systems across diverse scenarios by generating high-quality
+documents, questions, answers, and references through a schema-based pipeline.
+With a focus on factual accuracy, we propose three novel metrics --
+Completeness, Hallucination, and Irrelevance -- to rigorously evaluate
+LLM-generated responses. Experimental results show that RAGEval outperforms
+zero-shot and one-shot methods in terms of clarity, safety, conformity, and
+richness of generated samples. Furthermore, the use of LLMs for scoring the
+proposed metrics demonstrates a high level of consistency with human
+evaluations. RAGEval establishes a new paradigm for evaluating RAG systems in
+real-world applications.
+
+
+ comment: https://github.com/OpenBMB/RAGEval +
+
+
+
+
+ + ♻ ☆ A Theory for Token-Level Harmonization in Retrieval-Augmented Generation + + +
+ Retrieval-augmented generation (RAG) utilizes retrieved texts to enhance
+large language models (LLMs). Studies show that while RAG provides valuable
+external information (benefit), it may also mislead LLMs (detriment) with noisy
+or incorrect retrieved texts. Although many existing methods attempt to
+preserve benefit and avoid detriment, they lack a theoretical explanation for
+RAG. The benefit and detriment in the next-token prediction of RAG remain a
+black box that cannot be quantified or compared in an explainable manner, so
+existing methods are data-driven and require additional utility evaluators or
+post-hoc processing. This paper takes the first step towards providing a theory
+to explain and trade off the benefit and detriment in RAG. First, we model RAG
+as the fusion between the distribution of the LLM's knowledge and the
+distribution of the retrieved texts. Then, we formalize the trade-off between
+the value of external knowledge (benefit) and its potential risk of misleading
+the LLM (detriment) in the next-token prediction of RAG by the distribution
+difference in this fusion. Finally, we prove that the actual effect of RAG on a
+token, which is the comparison between benefit and detriment, can be predicted
+without any training or access to the utility of retrieval. Based on our
+theory, we propose a practical novel method, Tok-RAG, which achieves
+collaborative generation between the pure LLM and RAG at the token level to
+preserve benefit and avoid detriment. Experiments on real-world tasks using
+LLMs such as OPT, LLaMA-2, and Mistral show the effectiveness of our method and
+support our theoretical findings.
+
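+ The sort of per-token comparison the theory formalizes can be illustrated
+with a toy numpy example (below); the divergence threshold and decision rule
+are invented stand-ins for exposition, not Tok-RAG's actual criterion.
+
+import numpy as np
+
+vocab = ["Paris", "Lyon", "banana"]
+p_llm = np.array([0.55, 0.35, 0.10])   # pure-LLM next-token distribution
+p_rag = np.array([0.15, 0.05, 0.80])   # retrieval-conditioned distribution (noisy passage)
+
+def kl(p, q):
+    return float(np.sum(p * np.log((p + 1e-12) / (q + 1e-12))))
+
+# Crude proxy: if conditioning on the retrieved text moves the distribution very far from
+# the LLM's prior, treat it as potential detriment and fall back to the pure LLM here.
+divergence = kl(p_rag, p_llm)
+use_rag = divergence < 1.0
+chosen = vocab[int(np.argmax(p_rag if use_rag else p_llm))]
+print(f"KL(p_rag || p_llm) = {divergence:.2f}, use_rag = {use_rag}, token = {chosen}")
+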
+
+ comment: 25 pages +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Improving Multi-modal Large Language Model through Boosting Vision + Capabilities + + +
+ We focus on improving the visual understanding capability for boosting
+vision-language models. We propose \textbf{Arcana}, a multimodal language
+model, which introduces two crucial techniques. First, we present Multimodal
+LoRA (MM-LoRA), a module designed to enhance the decoder. Unlike traditional
+language-driven decoders, MM-LoRA consists of two parallel LoRAs -- one for
+vision and one for language -- each with its own parameters. This disentangled
+parameter design allows for more specialized learning in each modality and
+better integration of multimodal information. Second, we introduce the Query
+Ladder adapter (QLadder) to improve the visual encoder. QLadder employs a
+learnable ``\textit{ladder}'' structure to deeply aggregate the intermediate
+representations from the frozen pretrained visual encoder (e.g., the CLIP image
+encoder). This enables the model to learn new and informative visual features
+while retaining the powerful capabilities of the pretrained visual encoder.
+These techniques collectively enhance Arcana's visual perception power,
+enabling it to leverage improved visual information for more accurate and
+contextually relevant outputs across various multimodal scenarios. Extensive
+experiments and ablation studies demonstrate the effectiveness and
+generalization capability of our Arcana. The code and re-annotated data are
+available at \url{https://arcana-project-page.github.io}.
+
+
+
+
+
+ + ☆ Multimodal growth and development assessment model + + +
+ With the development of the social economy and the improvement of people's
+attention to health, the growth and development of children and adolescents
+have become an important indicator of the level of national health. Therefore,
+accurate and timely assessment of children's growth and development has become
+increasingly important. At the same time, global health inequalities,
+especially child malnutrition and stunting in developing countries, urgently
+require effective assessment tools for monitoring and intervention. In recent
+years, the rapid development of technologies such as big data, artificial
+intelligence, and cloud computing, and the cross-integration of multiple
+disciplines such as biomedicine, statistics, and computer science, have
+promoted the rapid development of large-scale models for growth and development
+assessment. However, problems remain, such as overly narrow evaluation factors,
+inaccurate diagnostic results, and an inability to give accurate and reasonable
+recommendations. Our multimodal growth and development assessment model uses
+the public dataset of the RSNA (Radiological Society of North America) as the
+training set and the dataset of the Department of Pediatrics of Huaibei
+People's Hospital as the open-source test set. The embedded ICL module enables
+the model to quickly adapt and identify the tasks to be done, ensuring that,
+while considering multiple evaluation factors, accurate diagnostic results and
+reasonable medical recommendations are given, thereby addressing the above
+problems and promoting the development of the medical field.
+
+
+ comment: 7 Pages 7 Figures +
+
+
+
+
+ + ☆ MeloTrans: A Text to Symbolic Music Generation Model Following Human + Composition Habit + + +
+ At present, neural network models show powerful sequence prediction ability
+and are used in many automatic composition models. In comparison, the way
+humans compose music is very different. Composers usually start by creating
+musical motifs and then develop them into music through a series of rules. This
+process ensures that the music has a specific structure and changing pattern.
+However, it is difficult for neural network models to learn these composition
+rules from training data, which results in a lack of musicality and diversity
+in the generated music. This paper posits that integrating the learning
+capabilities of neural networks with human-derived knowledge may lead to better
+results. To achieve this, we develop the POP909$\_$M dataset, the first to
+include labels for musical motifs and their variants, providing a basis for
+mimicking human compositional habits. Building on this, we propose MeloTrans, a
+text-to-music composition model that employs principles of motif development
+rules. Our experiments demonstrate that MeloTrans excels beyond existing music
+generation models and even surpasses Large Language Models (LLMs) like
+ChatGPT-4. This highlights the importance of merging human insights with neural
+network capabilities to achieve superior symbolic music generation.
+
+
+
+
+
+ + ☆ Remember, Retrieve and Generate: Understanding Infinite Visual Concepts + as Your Personalized Assistant + + +
+ The development of large language models (LLMs) has significantly enhanced
+the capabilities of multimodal LLMs (MLLMs) as general assistants. However, a
+lack of user-specific knowledge still restricts their application in people's
+daily lives. In this paper, we introduce the Retrieval Augmented
+Personalization (RAP) framework for MLLMs' personalization. Starting from a
+general MLLM, we turn it into a personalized assistant in three steps. (a)
+Remember: We design a key-value database to store user-related information,
+e.g., the user's name, avatar, and other attributes. (b) Retrieve: When the
+user initiates a conversation, RAP will retrieve relevant information from the
+database using a multimodal retriever. (c) Generate: The input query and the
+retrieved concepts' information are fed into MLLMs to generate personalized,
+knowledge-augmented responses. Unlike previous methods, RAP allows real-time
+concept editing via updating the external database. To further improve
+generation quality and alignment with user-specific information, we design a
+pipeline for data collection and create a specialized dataset for personalized
+training of MLLMs. Based on the dataset, we train a series of MLLMs as
+personalized multimodal assistants. By pretraining on a large-scale dataset,
+RAP-MLLMs can generalize to infinite visual concepts without additional
+finetuning. Our models demonstrate outstanding flexibility and generation
+quality across a variety of tasks, such as personalized image captioning,
+question answering and visual recognition. The code, data and models are
+available at https://github.com/Hoar012/RAP-MLLM.
+
+
+
+
+
+ + ♻ ☆ Beyond Coarse-Grained Matching in Video-Text Retrieval ACCV 2024 + + +
+ Video-text retrieval has seen significant advancements, yet the ability of +models to discern subtle differences in captions still requires verification. +In this paper, we introduce a new approach for fine-grained evaluation. Our +approach can be applied to existing datasets by automatically generating hard +negative test captions with subtle single-word variations across nouns, verbs, +adjectives, adverbs, and prepositions. We perform comprehensive experiments +using four state-of-the-art models across two standard benchmarks (MSR-VTT and +VATEX) and two specially curated datasets enriched with detailed descriptions +(VLN-UVO and VLN-OOPS), resulting in a number of novel insights: 1) our +analyses show that the current evaluation benchmarks fall short in detecting a +model's ability to perceive subtle single-word differences, 2) our fine-grained +evaluation highlights the difficulty models face in distinguishing such subtle +variations. To enhance fine-grained understanding, we propose a new baseline +that can be easily combined with current methods. Experiments on our +fine-grained evaluations demonstrate that this approach enhances a model's +ability to understand fine-grained differences. + +
+
+ comment: Accepted to ACCV 2024 +
+
+
+
+
+ + ♻ ☆ SaMoye: Zero-shot Singing Voice Conversion Model Based on Feature + Disentanglement and Enhancement + + +
+ Singing voice conversion (SVC) aims to convert a singer's voice to another
+singer's from a reference audio while keeping the original semantics. However,
+existing SVC methods can hardly perform zero-shot conversion due to incomplete
+feature disentanglement or dependence on the speaker look-up table. We propose
+the first open-source high-quality zero-shot SVC model, SaMoye, that can
+convert singing to both human and non-human timbre. SaMoye disentangles the
+singing voice's features into content, timbre, and pitch features, where we
+combine multiple ASR models and compress the content features to reduce timbre
+leakage. Besides, we enhance the timbre features by unfreezing the speaker
+encoder and mixing the speaker embedding with the top-3 most similar speakers.
+We also establish an unparalleled large-scale dataset to guarantee zero-shot
+performance, which comprises more than 1,815 hours of pure singing voice and
+6,367 speakers. We conduct objective and subjective experiments and find that
+SaMoye outperforms other models in zero-shot SVC tasks even under extreme
+conditions like converting singing to animals' timbre. The weights, code,
+dataset, and documents of SaMoye are publicly available at
+\url{https://github.com/CarlWangChina/SaMoye-SVC}.
+
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Leveraging LLM Embeddings for Cross Dataset Label Alignment and Zero + Shot Music Emotion Prediction + + +
+ In this work, we present a novel method for music emotion recognition that
+leverages Large Language Model (LLM) embeddings for label alignment across
+multiple datasets and zero-shot prediction on novel categories. First, we
+compute LLM embeddings for emotion labels and apply non-parametric clustering
+to group similar labels across multiple datasets containing disjoint labels. We
+use these cluster centers to map music features (MERT) to the LLM embedding
+space. To further enhance the model, we introduce an alignment regularization
+that enables dissociation of MERT embeddings from different clusters. This
+further enhances the model's ability to adapt to unseen datasets. We
+demonstrate the effectiveness of our approach by performing zero-shot inference
+on a new dataset, showcasing its ability to generalize to unseen labels without
+additional training.
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 22 + +
+
+
+ + ☆ Is Semantic Chunking Worth the Computational Cost? + + +
+ Recent advances in Retrieval-Augmented Generation (RAG) systems have +popularized semantic chunking, which aims to improve retrieval performance by +dividing documents into semantically coherent segments. Despite its growing +adoption, the actual benefits over simpler fixed-size chunking, where documents +are split into consecutive, fixed-size segments, remain unclear. This study +systematically evaluates the effectiveness of semantic chunking using three +common retrieval-related tasks: document retrieval, evidence retrieval, and +retrieval-based answer generation. The results show that the computational +costs associated with semantic chunking are not justified by consistent +performance gains. These findings challenge the previous assumptions about +semantic chunking and highlight the need for more efficient chunking strategies +in RAG systems. + +
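+ For reference, a minimal sketch of the two chunking strategies being compared
+is given below; embed() is a placeholder for any sentence-embedding model, and
+the cosine-similarity split rule is one common semantic-chunking heuristic
+rather than necessarily the exact variant evaluated in the study.
+
+import numpy as np
+
+def embed(sentence):                               # placeholder embedding function
+    rng = np.random.default_rng(abs(hash(sentence)) % 2**32)
+    return rng.normal(size=32)
+
+def fixed_size_chunks(text, size=200):
+    """Split into consecutive fixed-size character segments."""
+    return [text[i:i + size] for i in range(0, len(text), size)]
+
+def semantic_chunks(sentences, threshold=0.3):
+    """Start a new chunk whenever adjacent sentences are dissimilar enough."""
+    chunks, current = [], [sentences[0]]
+    for prev, cur in zip(sentences, sentences[1:]):
+        a, b = embed(prev), embed(cur)
+        sim = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
+        if sim < threshold:                        # topic shift -> new chunk
+            chunks.append(" ".join(current))
+            current = []
+        current.append(cur)
+    chunks.append(" ".join(current))
+    return chunks
+
+sents = ["RAG retrieves passages before generation.",
+         "Chunking decides passage boundaries.",
+         "Pandas are native to China."]
+print(len(fixed_size_chunks(" ".join(sents), 60)), "fixed-size chunks")
+print(len(semantic_chunks(sents)), "semantic chunks")
+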
+
+
+
+
+ + ☆ Supply Chain Network Extraction and Entity Classification Leveraging + Large Language Models + + +
+ Supply chain networks are critical to the operational efficiency of +industries, yet their increasing complexity presents significant challenges in +mapping relationships and identifying the roles of various entities. +Traditional methods for constructing supply chain networks rely heavily on +structured datasets and manual data collection, limiting their scope and +efficiency. In contrast, recent advancements in Natural Language Processing +(NLP) and large language models (LLMs) offer new opportunities for discovering +and analyzing supply chain networks using unstructured text data. This paper +proposes a novel approach that leverages LLMs to extract and process raw +textual information from publicly available sources to construct a +comprehensive supply chain graph. We focus on the civil engineering sector as a +case study, demonstrating how LLMs can uncover hidden relationships among +companies, projects, and other entities. Additionally, we fine-tune an LLM to +classify entities within the supply chain graph, providing detailed insights +into their roles and relationships. The results show that domain-specific +fine-tuning improves classification accuracy, highlighting the potential of +LLMs for industry-specific supply chain analysis. Our contributions include the +development of a supply chain graph for the civil engineering sector, as well +as a fine-tuned LLM model that enhances entity classification and understanding +of supply chain networks. + +
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ☆ LLM Confidence Evaluation Measures in Zero-Shot CSS Classification + + +
+ Assessing classification confidence is critical for leveraging large language
+models (LLMs) in automated labeling tasks, especially in the sensitive domains
+presented by Computational Social Science (CSS) tasks. In this paper, we make
+three key contributions: (1) we propose an uncertainty quantification (UQ)
+performance measure tailored for data annotation tasks, (2) we compare, for the
+first time, five different UQ strategies across three distinct LLMs and CSS
+data annotation tasks, (3) we introduce a novel UQ aggregation strategy that
+effectively identifies low-confidence LLM annotations and disproportionately
+uncovers data incorrectly labeled by the LLMs. Our results demonstrate that our
+proposed UQ aggregation strategy improves upon existing methods and can be used
+to significantly improve human-in-the-loop data annotation processes.
+
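+ A toy sketch of the general aggregation idea -- combining several
+per-annotation uncertainty signals into one confidence score and flagging
+low-confidence labels for review -- is shown below; the two signals and the
+weights are illustrative choices, not the paper's specific UQ strategies.
+
+import numpy as np
+
+def normalized_entropy(label_probs):
+    """Entropy of the label distribution, scaled to [0, 1]."""
+    p = np.asarray(label_probs, dtype=float)
+    p = p / p.sum()
+    return float(-(p * np.log(p + 1e-12)).sum() / np.log(len(p)))
+
+def self_consistency(samples):
+    """Fraction of repeated generations that agree with the majority label."""
+    _, counts = np.unique(samples, return_counts=True)
+    return float(counts.max() / counts.sum())
+
+annotations = [
+    {"id": 1, "probs": [0.9, 0.05, 0.05], "samples": ["hate", "hate", "hate"]},
+    {"id": 2, "probs": [0.4, 0.35, 0.25], "samples": ["hate", "none", "offense"]},
+]
+
+for ann in annotations:
+    confidence = 0.5 * (1 - normalized_entropy(ann["probs"])) \
+                 + 0.5 * self_consistency(ann["samples"])
+    print(ann["id"], round(confidence, 2), "review" if confidence < 0.6 else "keep")
+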
+
+
+
+
+ + ☆ LFOSum: Summarizing Long-form Opinions with Large Language Models + + +
+ Online reviews play a pivotal role in influencing consumer decisions across +various domains, from purchasing products to selecting hotels or restaurants. +However, the sheer volume of reviews -- often containing repetitive or +irrelevant content -- leads to information overload, making it challenging for +users to extract meaningful insights. Traditional opinion summarization models +face challenges in handling long inputs and large volumes of reviews, while +newer Large Language Model (LLM) approaches often fail to generate accurate and +faithful summaries. To address those challenges, this paper introduces (1) a +new dataset of long-form user reviews, each entity comprising over a thousand +reviews, (2) two training-free LLM-based summarization approaches that scale to +long inputs, and (3) automatic evaluation metrics. Our dataset of user reviews +is paired with in-depth and unbiased critical summaries by domain experts, +serving as a reference for evaluation. Additionally, our novel reference-free +evaluation metrics provide a more granular, context-sensitive assessment of +summary faithfulness. We benchmark several open-source and closed-source LLMs +using our methods. Our evaluation reveals that LLMs still face challenges in +balancing sentiment and format adherence in long-form summaries, though +open-source models can narrow the gap when relevant information is retrieved in +a focused manner. + +
+
+
+
+
+ + ☆ Towards Computational Analysis of Pansori Singing + + +
+ Pansori is one of the most representative vocal genres of Korean traditional
+music, featuring an elaborate vocal melody line with strong vibrato. Although
+the music is transmitted orally without any music notation, transcribing
+pansori music into Western staff notation has been introduced for several
+purposes, such as documentation of the music, education, or research. In this
+paper, we introduce a computational analysis of pansori based on both audio and
+the corresponding transcriptions, showing how modern Music Information
+Retrieval tasks can be used to analyze traditional music and how they reveal
+the distinct audio characteristics of pansori.
+
+
+ comment: Late-Breaking Demo Session of the 25th International Society for + Music Information Retrieval (ISMIR) Conference, 2024 +
+
+
+
+
+ + ☆ RosePO: Aligning LLM-based Recommenders with Human Values + + +
+ Recently, there has been a growing interest in leveraging Large Language +Models (LLMs) for recommendation systems, which usually adapt a pre-trained LLM +to the recommendation scenario through supervised fine-tuning (SFT). However, +both the pre-training and SFT stages fail to explicitly model the comparative +relationships of a user's preferences on different items. To construct a +"helpful and harmless" LLM-based recommender, we propose a general framework -- +Recommendation with smoothing personalized Preference Optimization (RosePO), +which better aligns with customized human values during the post-training +stage. Specifically, in addition to the input and chosen response that +naturally align with SFT data, we design a rejected sampling strategy tailored +for enhancing helpfulness, along with two strategies aimed at mitigating biases +to promote harmlessness. To ensure robustness against uncertain labels present +in automatically constructed preference data, we introduce a personalized +smoothing factor predicted by a preference oracle into the optimization +objective. Evaluation on three real-world datasets demonstrates the +effectiveness of our method, showcasing not only improved recommendation +performance but also mitigation of semantic hallucination and popularity bias. + +
+
+
+
+
+ + ☆ P4GCN: Vertical Federated Social Recommendation with Privacy-Preserving + Two-Party Graph Convolution Networks + + +
+ In recent years, graph neural networks (GNNs) have been commonly utilized for +social recommendation systems. However, real-world scenarios often present +challenges related to user privacy and business constraints, inhibiting direct +access to valuable social information from other platforms. While many existing +methods have tackled matrix factorization-based social recommendations without +direct social data access, developing GNN-based federated social recommendation +models under similar conditions remains largely unexplored. To address this +issue, we propose a novel vertical federated social recommendation method +leveraging privacy-preserving two-party graph convolution networks (P4GCN) to +enhance recommendation accuracy without requiring direct access to sensitive +social information. First, we introduce a Sandwich-Encryption module to ensure +comprehensive data privacy during the collaborative computing process. Second, +we provide a thorough theoretical analysis of the privacy guarantees, +considering the participation of both curious and honest parties. Extensive +experiments on four real-world datasets demonstrate that P4GCN outperforms +state-of-the-art methods in terms of recommendation accuracy. The code is +available at https://github.com/WwZzz/P4GCN. + +
+
+
+
+
+ + ☆ Unifying Economic and Language Models for Enhanced Sentiment Analysis of + the Oil Market + + +
+ Crude oil, a critical component of the global economy, has its prices +influenced by various factors such as economic trends, political events, and +natural disasters. Traditional prediction methods based on historical data have +their limits in forecasting, but recent advancements in natural language +processing bring new possibilities for event-based analysis. In particular, +Language Models (LM) and their advancement, the Generative Pre-trained +Transformer (GPT), have shown potential in classifying vast amounts of natural +language. However, these LMs often have difficulty with domain-specific +terminology, limiting their effectiveness in the crude oil sector. Addressing +this gap, we introduce CrudeBERT, a fine-tuned LM specifically for the crude +oil market. The results indicate that CrudeBERT's sentiment scores align more +closely with the WTI Futures curve and significantly enhance price predictions, +underscoring the crucial role of integrating economic principles into LMs. + +
+
+
+
+
+ + ☆ Mitigating Dual Latent Confounding Biases in Recommender Systems + + +
+ Recommender systems are extensively utilised across various areas to predict +user preferences for personalised experiences and enhanced user engagement and +satisfaction. Traditional recommender systems, however, are complicated by +confounding bias, particularly in the presence of latent confounders that +affect both item exposure and user feedback. Existing debiasing methods often +fail to capture the complex interactions caused by latent confounders in +interaction data, especially when dual latent confounders affect both the user +and item sides. To address this, we propose a novel debiasing method that +jointly integrates the Instrumental Variables (IV) approach and identifiable +Variational Auto-Encoder (iVAE) for Debiased representation learning in +Recommendation systems, referred to as IViDR. Specifically, IViDR leverages the +embeddings of user features as IVs to address confounding bias caused by latent +confounders between items and user feedback, and reconstructs the embedding of +items to obtain debiased interaction data. Moreover, IViDR employs an +Identifiable Variational Auto-Encoder (iVAE) to infer identifiable +representations of latent confounders between item exposure and user feedback +from both the original and debiased interaction data. Additionally, we provide +theoretical analyses of the soundness of using IV and the identifiability of +the latent representations. Extensive experiments on both synthetic and +real-world datasets demonstrate that IViDR outperforms state-of-the-art models +in reducing bias and providing reliable recommendations. + +
+
+
+
+
+ + ☆ REFINE on Scarce Data: Retrieval Enhancement through Fine-Tuning via + Model Fusion of Embedding Models + + +
+ Retrieval augmented generation (RAG) pipelines are commonly used in tasks
+such as question-answering (QA), relying on retrieving relevant documents from
+a vector store computed using a pretrained embedding model. However, if the
+retrieved context is inaccurate, the answers generated using the large language
+model (LLM) may contain errors or hallucinations. Although pretrained embedding
+models have advanced, adapting them to new domains remains challenging.
+Fine-tuning is a potential solution, but industry settings often lack the
+necessary fine-tuning data. To address these challenges, we propose REFINE, a
+novel technique that generates synthetic data from available documents and then
+uses a model fusion approach to fine-tune embeddings for improved retrieval
+performance in new domains, while preserving out-of-domain capability. We
+conducted experiments on two public datasets, SQuAD and RAG-12000, and a
+proprietary TOURISM dataset. Results demonstrate that even standard fine-tuning
+with the proposed data augmentation technique outperforms the vanilla
+pretrained model. Furthermore, when combined with model fusion, the proposed
+approach achieves superior performance, with a 5.76% improvement in recall on
+the TOURISM dataset, and 6.58% and 0.32% enhancements on SQuAD and RAG-12000,
+respectively.
+
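+ One common reading of "model fusion" is simple weight interpolation between
+the pretrained and the fine-tuned embedding model; the PyTorch sketch below
+illustrates that generic strategy under this assumption, and REFINE's actual
+fusion procedure may differ.
+
+import torch
+
+def fuse_state_dicts(pretrained_sd, finetuned_sd, alpha=0.5):
+    """Return alpha * finetuned + (1 - alpha) * pretrained for every parameter tensor."""
+    return {k: alpha * finetuned_sd[k] + (1 - alpha) * pretrained_sd[k]
+            for k in pretrained_sd}
+
+# Tiny stand-in models; in practice these would be the pretrained embedding model and
+# the copy fine-tuned on synthetic in-domain data.
+pretrained = torch.nn.Linear(8, 4)
+finetuned = torch.nn.Linear(8, 4)
+
+fused = torch.nn.Linear(8, 4)
+fused.load_state_dict(fuse_state_dicts(pretrained.state_dict(),
+                                        finetuned.state_dict(), alpha=0.7))
+print(fused.weight.shape)  # fused model keeps the original architecture
+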
+
+ comment: Accepted in AJCAI'24 +
+
+
+
+
+ + ☆ Multi-Cause Deconfounding for Recommender Systems with Latent + Confounders + + +
+ In recommender systems, various latent confounding factors (e.g., user social +environment and item public attractiveness) can affect user behavior, item +exposure, and feedback in distinct ways. These factors may directly or +indirectly impact user feedback and are often shared across items or users, +making them multi-cause latent confounders. However, existing methods typically +fail to account for latent confounders between users and their feedback, as +well as those between items and user feedback simultaneously. To address the +problem of multi-cause latent confounders, we propose a multi-cause +deconfounding method for recommender systems with latent confounders (MCDCF). +MCDCF leverages multi-cause causal effect estimation to learn substitutes for +latent confounders associated with both users and items, using user behaviour +data. Specifically, MCDCF treats the multiple items that users interact with +and the multiple users that interact with items as treatment variables, +enabling it to learn substitutes for the latent confounders that influence the +estimation of causality between users and their feedback, as well as between +items and user feedback. Additionally, we theoretically demonstrate the +soundness of our MCDCF method. Extensive experiments on three real-world +datasets demonstrate that our MCDCF method effectively recovers latent +confounders related to users and items, reducing bias and thereby improving +recommendation accuracy. + +
+
+
+
+
+ + ☆ Comprehending Knowledge Graphs with Large Language Models for + Recommender Systems + + +
+ Recently, the introduction of knowledge graphs (KGs) has significantly +advanced recommender systems by facilitating the discovery of potential +associations between items. However, existing methods still face several +limitations. First, most KGs suffer from missing facts or limited scopes. This +can lead to biased knowledge representations, thereby constraining the model's +performance. Second, existing methods typically convert textual information +into IDs, resulting in the loss of natural semantic connections between +different items. Third, existing methods struggle to capture high-order +relationships in global KGs due to their inefficient layer-by-layer information +propagation mechanisms, which are prone to introducing significant noise. To +address these limitations, we propose a novel method called CoLaKG, which +leverages large language models (LLMs) for knowledge-aware recommendation. The +extensive world knowledge and remarkable reasoning capabilities of LLMs enable +them to supplement KGs. Additionally, the strong text comprehension abilities +of LLMs allow for a better understanding of semantic information. Based on +this, we first extract subgraphs centered on each item from the KG and convert +them into textual inputs for the LLM. The LLM then outputs its comprehension of +these item-centered subgraphs, which are subsequently transformed into semantic +embeddings. Furthermore, to utilize the global information of the KG, we +construct an item-item graph using these semantic embeddings, which can +directly capture higher-order associations between items. Both the semantic +embeddings and the structural information from the item-item graph are +effectively integrated into the recommendation model through our designed +representation alignment and neighbor augmentation modules. Extensive +experiments on four real-world datasets demonstrate the superiority of our +method. + +
+
+
+
+
+ + ☆ Triple Modality Fusion: Aligning Visual, Textual, and Graph Data with + Large Language Models for Multi-Behavior Recommendations + + +
+ Integrating diverse data modalities is crucial for enhancing the performance
+of personalized recommendation systems. Traditional models, which often rely on
+singular data sources, lack the depth needed to accurately capture the
+multifaceted nature of item features and user behaviors. This paper introduces
+a novel framework for multi-behavior recommendations, leveraging the fusion of
+triple modalities -- visual, textual, and graph data -- through alignment with
+large language models (LLMs). By incorporating visual information, we capture
+contextual and aesthetic item characteristics; textual data provides detailed
+insights into user interests and item features; and graph data elucidates
+relationships within the item-behavior heterogeneous graphs. Our proposed
+model, called Triple Modality Fusion (TMF), utilizes the power of LLMs to align
+and integrate these three modalities, achieving a comprehensive representation
+of user behaviors. The LLM models the user's interactions, including behaviors
+and item features, in natural language. Initially, the LLM is warmed up using
+only natural language-based prompts. We then devise a modality fusion module
+based on cross-attention and self-attention mechanisms to integrate different
+modalities from other models into the same embedding space and incorporate them
+into the LLM. Extensive experiments demonstrate the effectiveness of our
+approach in improving recommendation accuracy. Further ablation studies
+validate the effectiveness of our model design and the benefits of TMF.
+
+
+
+
+
+ + ☆ AT-RAG: An Adaptive RAG Model Enhancing Query Efficiency with Topic + Filtering and Iterative Reasoning + + +
+ Recent advancements in QA with LLMs, like GPT-4, have shown limitations in
+handling complex multi-hop queries. We propose AT-RAG, a novel multistep RAG
+model incorporating topic modeling for efficient document retrieval and
+reasoning. Using BERTopic, our model dynamically assigns topics to queries,
+improving retrieval accuracy and efficiency. We evaluated AT-RAG on multi-hop
+QA benchmark datasets and a medical case-study QA dataset. Results show
+significant improvements in correctness, completeness, and relevance compared
+to existing methods. AT-RAG reduces retrieval time while maintaining high
+precision, making it suitable both for general QA tasks and for complex
+domain-specific challenges such as medical QA. The integration of topic
+filtering and iterative reasoning enables our model to handle intricate queries
+efficiently, which makes it suitable for applications that require nuanced
+information retrieval and decision-making.
+
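+ The topic-filtering step can be sketched with BERTopic as below; load_corpus()
+is a hypothetical helper standing in for whatever loads the document
+collection, and the reranking and iterative-reasoning components of AT-RAG are
+omitted.
+
+from bertopic import BERTopic
+
+corpus = load_corpus()                      # hypothetical helper returning a list of strings
+topic_model = BERTopic()                    # assumes a reasonably large corpus to fit on
+doc_topics, _ = topic_model.fit_transform(corpus)
+
+query = "Which gene mutations are associated with early-onset Alzheimer's?"
+query_topic = topic_model.transform([query])[0][0]
+
+# Keep only documents whose topic matches the query's topic before dense retrieval.
+candidates = [doc for doc, t in zip(corpus, doc_topics) if t == query_topic]
+print(f"retrieving over {len(candidates)} of {len(corpus)} documents")
+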
+
+
+
+
+ + ♻ ☆ LitSearch: A Retrieval Benchmark for Scientific Literature Search EMNLP 2024 + + +
+ Literature search questions, such as "Where can I find research on the +evaluation of consistency in generated summaries?" pose significant challenges +for modern search engines and retrieval systems. These questions often require +a deep understanding of research concepts and the ability to reason across +entire articles. In this work, we introduce LitSearch, a retrieval benchmark +comprising 597 realistic literature search queries about recent ML and NLP +papers. LitSearch is constructed using a combination of (1) questions generated +by GPT-4 based on paragraphs containing inline citations from research papers +and (2) questions manually written by authors about their recently published +papers. All LitSearch questions were manually examined or edited by experts to +ensure high quality. We extensively benchmark state-of-the-art retrieval models +and also evaluate two LLM-based reranking pipelines. We find a significant +performance gap between BM25 and state-of-the-art dense retrievers, with a +24.8% absolute difference in recall@5. The LLM-based reranking strategies +further improve the best-performing dense retriever by 4.4%. Additionally, +commercial search engines and research tools like Google Search perform poorly +on LitSearch, lagging behind the best dense retriever by up to 32 recall +points. Taken together, these results show that LitSearch is an informative new +testbed for retrieval systems while catering to a real-world use case. + +
+
+ comment: Accepted by EMNLP 2024. Dataset and code are available at + https://github.com/princeton-nlp/LitSearch +
+
+
+
+
+ + ♻ ☆ PromptDSI: Prompt-based Rehearsal-free Instance-wise Incremental + Learning for Document Retrieval + + +
+ Differentiable Search Index (DSI) utilizes Pre-trained Language Models (PLMs)
+for efficient document retrieval without relying on external indexes. However,
+DSI needs full re-training to handle updates in dynamic corpora, causing
+significant computational inefficiencies. We introduce PromptDSI, a
+prompt-based, rehearsal-free approach for instance-wise incremental learning in
+document retrieval. PromptDSI attaches prompts to the frozen PLM encoder of
+DSI, leveraging its powerful representation to efficiently index new corpora
+while maintaining a balance between stability and plasticity. We eliminate the
+initial forward pass of prompt-based continual learning methods that doubles
+training and inference time. Moreover, we propose a topic-aware prompt pool
+that employs neural topic embeddings as fixed keys. This strategy ensures
+diverse and effective prompt usage, addressing the challenge of parameter
+underutilization caused by the collapse of the query-key matching mechanism.
+Our empirical evaluations demonstrate that BERT-based PromptDSI matches IncDSI
+in managing forgetting while improving new corpora performance by more than 4%
+Hits@10 on NQ320k and up to 3% MRR@10 on MS MARCO 300k.
+
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ DIRAS: Efficient LLM Annotation of Document Relevance in Retrieval + Augmented Generation + + +
+ Retrieval Augmented Generation (RAG) is widely employed to ground responses +to queries on domain-specific documents. But do RAG implementations leave out +important information when answering queries that need an integrated analysis +of information (e.g., Tell me good news in the stock market today.)? To address +these concerns, RAG developers need to annotate information retrieval (IR) data +for their domain of interest, which is challenging because (1) domain-specific +queries usually need nuanced definitions of relevance beyond shallow semantic +relevance; and (2) human or GPT-4 annotation is costly and cannot cover all +(query, document) pairs (i.e., annotation selection bias), thus harming the +effectiveness in evaluating IR recall. To address these challenges, we propose +DIRAS (Domain-specific Information Retrieval Annotation with Scalability), a +manual-annotation-free schema that fine-tunes open-sourced LLMs to consider +nuanced relevance definition and annotate (partial) relevance labels with +calibrated relevance scores. Extensive evaluation shows that DIRAS enables +smaller (8B) LLMs to achieve GPT-4-level performance on annotating and ranking +unseen (query, document) pairs, and is helpful for real-world RAG development. +All code, LLM generations, and human annotations can be found in +\url{https://github.com/EdisonNi-hku/DIRAS}. + +
+
+
+
+
+ + ♻ ☆ FLEX: Expert-level False-Less EXecution Metric for Reliable Text-to-SQL + Benchmark + + +
+ Text-to-SQL systems have become crucial for translating natural language into +SQL queries in various industries, enabling non-technical users to perform +complex data operations. The need for accurate evaluation methods has increased +as these systems have grown more sophisticated. However, the Execution Accuracy +(EX), the most prevalent evaluation metric, still shows many false positives +and negatives. Thus, this paper introduces FLEX (False-Less EXecution), a novel +approach to evaluating text-to-SQL systems using large language models (LLMs) +to emulate human expert-level evaluation of SQL queries. Our metric improves +agreement with human experts (from 62 to 87.04 in Cohen's kappa) with +comprehensive context and sophisticated criteria. Our extensive experiments +yield several key insights: (1) Models' performance increases by over 2.6 +points on average, substantially affecting rankings on Spider and BIRD +benchmarks; (2) The underestimation of models in EX primarily stems from +annotation quality issues; and (3) Model performance on particularly +challenging questions tends to be overestimated. This work contributes to a +more accurate and nuanced evaluation of text-to-SQL systems, potentially +reshaping our understanding of state-of-the-art performance in this field. + +
+
+ comment: preprint, under review +
+
+
+
+
+ + ♻ ☆ A Practice-Friendly LLM-Enhanced Paradigm with Preference Parsing for + Sequential Recommendation + + +
+ The training paradigm integrating large language models (LLMs) is gradually
+reshaping sequential recommender systems (SRS) and has shown promising results.
+However, most existing LLM-enhanced methods rely on rich textual information on
+the item side and instance-level supervised fine-tuning (SFT) to inject
+collaborative information into the LLM, which is inefficient and limited in
+many applications. To alleviate these problems, this paper proposes a
+practice-friendly LLM-enhanced paradigm with preference parsing (P2Rec) for
+SRS. Specifically, in the information reconstruction stage, we design a new
+user-level SFT task for collaborative information injection with the assistance
+of a pre-trained SRS model, which is more efficient and compatible with limited
+text information. Our goal is to let the LLM learn to reconstruct a
+corresponding prior preference distribution from each user's interaction
+sequence, where the LLM needs to effectively parse the latent category of each
+item and the relationship between different items to accomplish this task. In
+the information augmentation stage, we feed each item into the LLM to obtain a
+set of enhanced embeddings that combine collaborative information and LLM
+inference capabilities. These embeddings can then be used to help train various
+future SRS models. Finally, we verify the effectiveness and efficiency of our
+P2Rec on three SRS benchmark datasets.
+
+
+
+
+
+ + ♻ ☆ Knowledge Circuits in Pretrained Transformers NeurIPS 2024 + + +
+ The remarkable capabilities of modern large language models are rooted in +their vast repositories of knowledge encoded within their parameters, enabling +them to perceive the world and engage in reasoning. The inner workings of how +these models store knowledge have long been a subject of intense interest and +investigation among researchers. To date, most studies have concentrated on +isolated components within these models, such as the Multilayer Perceptrons and +attention head. In this paper, we delve into the computation graph of the +language model to uncover the knowledge circuits that are instrumental in +articulating specific knowledge. The experiments, conducted with GPT2 and +TinyLLAMA, have allowed us to observe how certain information heads, relation +heads, and Multilayer Perceptrons collaboratively encode knowledge within the +model. Moreover, we evaluate the impact of current knowledge editing techniques +on these knowledge circuits, providing deeper insights into the functioning and +constraints of these editing methodologies. Finally, we utilize knowledge +circuits to analyze and interpret language model behaviors such as +hallucinations and in-context learning. We believe the knowledge circuits hold +potential for advancing our understanding of Transformers and guiding the +improved design of knowledge editing. Code and data are available in +https://github.com/zjunlp/KnowledgeCircuits. + +
+
+ comment: NeurIPS 2024, 32 pages +
+
+
+
+
+ + ♻ ☆ $\textit{lucie}$: An Improved Python Package for Loading Datasets from + the UCI Machine Learning Repository + + +
+ The University of California--Irvine (UCI) Machine Learning (ML) Repository +(UCIMLR) is consistently cited as one of the most popular dataset repositories, +hosting hundreds of high-impact datasets. However, a significant portion, +including 28.4% of the top 250, cannot be imported via the $\textit{ucimlrepo}$ +package that is provided and recommended by the UCIMLR website. Instead, they +are hosted as .zip files, containing nonstandard formats that are difficult to +import without additional ad hoc processing. To address this issue, here we +present $\textit{lucie}$ -- $\underline{l}oad$ $\underline{U}niversity$ +$\underline{C}alifornia$ $\underline{I}rvine$ $\underline{e}xamples$ -- a +utility that automatically determines the data format and imports many of these +previously non-importable datasets, while preserving as much of a tabular data +structure as possible. $\textit{lucie}$ was designed using the top 100 most +popular datasets and benchmarked on the next 130, where it resulted in a +success rate of 95.4% vs. 73.1% for $\textit{ucimlrepo}$. $\textit{lucie}$ is +available as a Python package on PyPI with 98% code coverage. + +
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Beyond Inter-Item Relations: Dynamic Adaption for Enhancing LLM-Based + Sequential Recommendation + + +
+ Sequential recommender systems (SRS) predict the next items that users may +prefer based on user historical interaction sequences. Inspired by the rise of +large language models (LLMs) in various AI applications, there is a surge of +work on LLM-based SRS. Despite their attractive performance, existing LLM-based +SRS still exhibit some limitations, including neglecting intra-item relations, +ignoring long-term collaborative knowledge and using inflexible architecture +designs for adaption. To alleviate these issues, we propose an LLM-based +sequential recommendation model named DARec. Built on top of coarse-grained +adaption for capturing inter-item relations, DARec is further enhanced with (1) +context masking that models intra-item relations to help LLM better understand +token and item semantics in the context of SRS, (2) collaborative knowledge +injection that helps LLM incorporate long-term collaborative knowledge, and (3) +a dynamic adaption mechanism that uses Bayesian optimization to flexibly choose +layer-wise adapter architectures in order to better incorporate different +sequential information. Extensive experiments demonstrate that DARec can +effectively handle sequential recommendation in a dynamic and adaptive manner. + +
+
+ comment: 11 pages, 14 figures +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Self-Comparison for Dataset-Level Membership Inference in Large + (Vision-)Language Models + + +
+ Large Language Models (LLMs) and Vision-Language Models (VLMs) have made
+significant advancements in a wide range of natural language processing and
+vision-language tasks. Access to large web-scale datasets has been a key factor
+in their success. However, concerns have been raised about the unauthorized use
+of copyrighted materials and potential copyright infringement. Existing
+methods, such as sample-level Membership Inference Attacks (MIA) and
+distribution-based dataset inference, distinguish member data (data used for
+training) and non-member data by leveraging the common observation that models
+tend to memorize and show greater confidence in member data. Nevertheless,
+these methods face challenges when applied to LLMs and VLMs, such as the
+requirement for ground-truth member data or non-member data that shares the
+same distribution as the test data. In this paper, we propose a novel
+dataset-level membership inference method based on Self-Comparison. We find
+that a member prefix followed by a non-member suffix (paraphrased from a member
+suffix) can further trigger the model's memorization of training data. Instead
+of directly comparing member and non-member data, we introduce paraphrasing to
+the second half of the sequence and evaluate how the likelihood changes before
+and after paraphrasing. Unlike prior approaches, our method does not require
+access to ground-truth member data or non-member data in an identical
+distribution, making it more practical. Extensive experiments demonstrate that
+our proposed method outperforms traditional MIA and dataset inference
+techniques across various datasets and models, including public models,
+fine-tuned models, and API-based commercial models.
+
+
+
+
+
+ + ☆ MuVi: Video-to-Music Generation with Semantic Alignment and Rhythmic + Synchronization + + +
+ Generating music that aligns with the visual content of a video has been a
+challenging task, as it requires a deep understanding of visual semantics and
+involves generating music whose melody, rhythm, and dynamics harmonize with the
+visual narratives. This paper presents MuVi, a novel framework that effectively
+addresses these challenges to enhance the cohesion and immersive experience of
+audio-visual content. MuVi analyzes video content through a specially designed
+visual adaptor to extract contextually and temporally relevant features. These
+features are used to generate music that not only matches the video's mood and
+theme but also its rhythm and pacing. We also introduce a contrastive
+music-visual pre-training scheme to ensure synchronization, based on the
+periodic nature of music phrases. In addition, we demonstrate that our
+flow-matching-based music generator has in-context learning ability, allowing
+us to control the style and genre of the generated music. Experimental results
+show that MuVi demonstrates superior performance in both audio quality and
+temporal synchronization. The generated music video samples are available at
+https://muvi-v2m.github.io.
+
+
+ comment: Work in progress
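+
+Illustrative sketch (not MuVi's training code): a contrastive music-visual
+pre-training objective of the kind described above can be written as a
+symmetric InfoNCE loss over paired segment embeddings; the visual adaptor and
+music encoder that would produce these embeddings are assumed to exist
+elsewhere.
+
+import torch
+import torch.nn.functional as F
+
+def contrastive_alignment_loss(video_emb, music_emb, temperature=0.07):
+    # video_emb, music_emb: (B, D) embeddings of temporally paired segments.
+    v = F.normalize(video_emb, dim=-1)
+    m = F.normalize(music_emb, dim=-1)
+    logits = v @ m.t() / temperature            # (B, B) cosine-similarity matrix
+    targets = torch.arange(v.size(0), device=v.device)
+    # Matching pairs sit on the diagonal; off-diagonal entries act as negatives.
+    return 0.5 * (F.cross_entropy(logits, targets) +
+                  F.cross_entropy(logits.t(), targets))
+
+loss = contrastive_alignment_loss(torch.randn(8, 512), torch.randn(8, 512))
+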
+
+
+
+
+ + ☆ Embedding an Ethical Mind: Aligning Text-to-Image Synthesis via + Lightweight Value Optimization + + +
+ Recent advancements in diffusion models trained on large-scale data have
+enabled the generation of human-level images that are hard to distinguish from
+real ones, yet these models often produce harmful content misaligned with human
+values, e.g., social bias and offensive content. Despite extensive research on
+Large Language Models (LLMs), the challenge of Text-to-Image (T2I) model
+alignment remains largely unexplored. To address this problem, we propose LiVO
+(Lightweight Value Optimization), a novel lightweight method for aligning T2I
+models with human values. LiVO only optimizes a plug-and-play value encoder to
+integrate a specified value principle with the input prompt, allowing the
+generated images to be controlled over both semantics and values. Specifically,
+we design a diffusion-model-tailored preference optimization loss, which
+theoretically approximates the Bradley-Terry model used in LLM alignment but
+provides a more flexible trade-off between image quality and value conformity.
+To optimize the value encoder, we also develop a framework to automatically
+construct a text-image preference dataset of 86k (prompt, aligned image,
+violating image, value principle) samples. Without updating most model
+parameters, and through adaptive value selection from the input prompt, LiVO
+significantly reduces harmful outputs and achieves faster convergence,
+surpassing several strong baselines and taking an initial step towards
+ethically aligned T2I models.
+
+
+ comment: Accepted by ACM Multimedia 2024. The dataset and code can be found at + https://github.com/achernarwang/LiVO +
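+
+Illustrative sketch (not LiVO's actual loss): the abstract relates the proposed
+objective to the Bradley-Terry model from LLM alignment. A generic
+Bradley-Terry / DPO-style preference loss over per-pair scores looks as
+follows; in the paper the scores would come from the diffusion model's
+(reference-adjusted) likelihoods of the aligned vs. violating image, which is
+not reproduced here.
+
+import torch
+import torch.nn.functional as F
+
+def bradley_terry_preference_loss(score_aligned, score_violating, beta=1.0):
+    # -log sigmoid(beta * (s_aligned - s_violating)): pushes the model to prefer
+    # the value-aligned image over the value-violating one.
+    return -F.logsigmoid(beta * (score_aligned - score_violating)).mean()
+
+# Toy usage on a batch of 4 preference pairs with random stand-in scores:
+loss = bradley_terry_preference_loss(torch.randn(4), torch.randn(4))
+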
+
+
+
+
+ + ☆ Rethinking Bjøntegaard Delta for Compression Efficiency Evaluation: + Are We Calculating It Precisely and Reliably? + + +
+ For decades, the Bjøntegaard Delta (BD) has been the metric for evaluating
+codec Rate-Distortion (R-D) performance. Yet, in most studies, BD is determined
+using just 4-5 R-D data points; is this sufficient? As codecs and quality
+metrics advance, does the conventional BD estimation still hold up? Crucially,
+are the performance improvements of new codecs and tools genuine, or merely
+artifacts of estimation flaws? This paper addresses these concerns by
+reevaluating BD estimation. We present a novel approach employing a
+parameterized deep neural network to model R-D curves with high precision
+across various metrics, accompanied by a comprehensive R-D dataset. This
+approach both assesses the reliability of BD calculations and serves as a
+precise BD estimator. Our findings advocate for the adoption of rigorous R-D
+sampling and reliability metrics in future compression research to ensure the
+validity and reliability of results.
+
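+
+For context, the conventional BD-rate computation that the abstract questions
+fits a cubic polynomial to log-rate as a function of quality (e.g., PSNR) for
+each codec and integrates over the overlapping quality range. The sketch below
+follows that standard recipe (not the neural estimator proposed in the paper);
+the example R-D points are made up.
+
+import numpy as np
+
+def bd_rate(rate_anchor, psnr_anchor, rate_test, psnr_test):
+    # Classical Bjontegaard Delta-rate in percent (negative = bitrate savings).
+    p_a = np.polyfit(psnr_anchor, np.log10(rate_anchor), 3)
+    p_t = np.polyfit(psnr_test, np.log10(rate_test), 3)
+    lo = max(min(psnr_anchor), min(psnr_test))
+    hi = min(max(psnr_anchor), max(psnr_test))
+    int_a = np.polyval(np.polyint(p_a), hi) - np.polyval(np.polyint(p_a), lo)
+    int_t = np.polyval(np.polyint(p_t), hi) - np.polyval(np.polyint(p_t), lo)
+    avg_log_diff = (int_t - int_a) / (hi - lo)
+    return (10 ** avg_log_diff - 1) * 100.0
+
+print(bd_rate([100, 200, 400, 800], [32.0, 35.1, 37.8, 40.2],
+              [ 90, 180, 360, 720], [32.2, 35.4, 38.1, 40.5]))
+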
+
+
+
+
+ + ☆ OmnixR: Evaluating Omni-modality Language Models on Reasoning across + Modalities + + +
+ We introduce OmnixR, an evaluation suite designed to benchmark state-of-the-art
+Omni-modality Language Models (OLMs), such as GPT-4o and Gemini. Evaluating
+OLMs, which integrate multiple modalities such as text, vision, and audio,
+presents unique challenges. In particular, the user message often consists of
+multiple modalities, so that OLMs have to establish holistic understanding and
+reasoning across modalities to accomplish the task. Existing benchmarks are
+limited to single-modality or dual-modality tasks, overlooking comprehensive
+multi-modal assessments of model reasoning. To address this, OmnixR offers two
+evaluation variants: (1) a synthetic subset, generated automatically by
+translating text into multiple modalities -- audio, images, video, and hybrids
+(Omnify); and (2) a realistic subset, a real-world dataset manually curated and
+annotated by experts for evaluating cross-modal reasoning in natural settings.
+OmnixR provides a unique evaluation for assessing OLMs over a diverse mix of
+modalities, such as a question that involves video, audio, and text, offering a
+rigorous cross-modal reasoning testbed unlike any existing benchmark. Our
+experiments find that all state-of-the-art OLMs struggle with OmnixR questions
+that require integrating information from multiple modalities to answer.
+Further analysis highlights differences in reasoning behavior, underscoring the
+challenges of omni-modal AI alignment.
+
+
+ comment: 19 pages, 6 figures, 12 tables +
+
+
+
+
+ + ☆ Test-time adaptation for image compression with distribution + regularization + + +
+ Current test- or compression-time adaptation approaches for image compression
+(TTA-IC), which leverage both latent and decoder refinements as a two-step
+adaptation scheme, can enhance the rate-distortion (R-D) performance of learned
+image compression models on cross-domain compression tasks, e.g., from natural
+to screen-content images. However, while many decoder refinement variants have
+emerged, latent refinement, though an inseparable ingredient, is barely
+tailored to cross-domain scenarios. To this end, we aim to develop an advanced
+latent refinement method by extending the effective hybrid latent refinement
+(HLR) method, which is designed to improve in-domain inference but shows a
+noticeable increase in rate cost on cross-domain tasks. Specifically, we first
+provide theoretical analyses, via a marginalization approximation from in- to
+cross-domain scenarios, showing that vanilla HLR suffers from an underlying
+mismatch between the refined Gaussian conditional and the hyperprior
+distributions, which degrades the joint-probability approximation of the
+marginal distribution and increases rate consumption. To remedy this issue, we
+introduce a simple, Bayesian-approximation-based distribution regularization
+that encourages learning a better joint-probability approximation in a
+plug-and-play manner. Extensive experiments on six in- and cross-domain
+datasets demonstrate that our proposed method not only improves R-D performance
+compared with other latent refinement counterparts, but can also be flexibly
+integrated into existing TTA-IC methods with incremental benefits.
+
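+
+Illustrative sketch (not the paper's method): test-time latent refinement in
+learned image compression optimizes the latent of a single input against a
+rate-distortion objective, and the idea above adds a regularizer that keeps the
+refined latent consistent with the (hyper)prior. All modules below are tiny
+stand-ins, and the specific regularizer is a schematic moment-matching term,
+not the Bayesian-approximation formulation from the paper.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+enc = nn.Conv2d(3, 8, 5, stride=4, padding=2)             # toy analysis transform
+dec = nn.ConvTranspose2d(8, 3, 5, stride=4, padding=2, output_padding=3)
+for p in dec.parameters():
+    p.requires_grad_(False)                                # only the latent is adapted
+
+x = torch.rand(1, 3, 64, 64)                               # "cross-domain" test image
+y = enc(x).detach().clone().requires_grad_(True)           # latent to refine
+mu, sigma = torch.zeros_like(y), torch.ones_like(y)        # stand-in prior parameters
+
+opt = torch.optim.Adam([y], lr=1e-2)
+lam, beta = 0.01, 0.1
+for step in range(50):
+    x_hat = dec(y)
+    distortion = F.mse_loss(x_hat, x)
+    rate = (0.5 * ((y - mu) / sigma) ** 2).mean()          # proxy rate: NLL under prior
+    reg = (y.mean() - mu.mean()) ** 2 + (y.std() - sigma.mean()) ** 2
+    loss = distortion + lam * rate + beta * reg
+    opt.zero_grad()
+    loss.backward()
+    opt.step()
+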
+
+
+
+
+ + ♻ ☆ Video-to-Audio Generation with Hidden Alignment + + +
+ Generating semantically and temporally aligned audio content in accordance
+with video input has become a focal point for researchers, particularly
+following the remarkable breakthrough in text-to-video generation. In this
+work, we aim to offer insights into the video-to-audio generation paradigm,
+focusing on three crucial aspects: vision encoders, auxiliary embeddings, and
+data augmentation techniques. Beginning with a foundational model built on a
+simple yet surprisingly effective intuition, we explore various vision encoders
+and auxiliary embeddings through ablation studies. Employing a comprehensive
+evaluation pipeline that emphasizes generation quality and video-audio
+synchronization alignment, we demonstrate that our model exhibits
+state-of-the-art video-to-audio generation capabilities. Furthermore, we
+provide critical insights into the impact of different data augmentation
+methods on enhancing the generation framework's overall capacity. We also
+showcase ways to advance synchronized audio generation from both semantic and
+temporal perspectives. We hope these insights will serve as a stepping stone
+toward developing more realistic and accurate audio-visual generation models.
+
+
+ comment: https://sites.google.com/view/vta-ldm +
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse all <details> entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the chosen theme
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the chosen theme
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`