diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/cache.json b/cache.json
new file mode 100644
index 00000000..3d278b79
--- /dev/null
+++ b/cache.json
@@ -0,0 +1 @@
+{"2024-11-07T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2406.00922v3","updated":"2024-11-07T18:59:30Z","published":"2024-06-03T01:32:52Z","title":"MediQ: Question-Asking LLMs and a Benchmark for Reliable Interactive\n Clinical Reasoning","summary":" Users typically engage with LLMs interactively, yet most existing benchmarks\nevaluate them in a static, single-turn format, posing reliability concerns in\ninteractive scenarios. We identify a key obstacle towards reliability: LLMs are\ntrained to answer any question, even with incomplete context or insufficient\nknowledge. In this paper, we propose to change the static paradigm to an\ninteractive one, develop systems that proactively ask questions to gather more\ninformation and respond reliably, and introduce an benchmark - MediQ - to\nevaluate question-asking ability in LLMs. MediQ simulates clinical interactions\nconsisting of a Patient System and an adaptive Expert System; with potentially\nincomplete initial information, the Expert refrains from making diagnostic\ndecisions when unconfident, and instead elicits missing details via follow-up\nquestions. We provide a pipeline to convert single-turn medical benchmarks into\nan interactive format. Our results show that directly prompting\nstate-of-the-art LLMs to ask questions degrades performance, indicating that\nadapting LLMs to proactive information-seeking settings is nontrivial. We\nexperiment with abstention strategies to better estimate model confidence and\ndecide when to ask questions, improving diagnostic accuracy by 22.3%; however,\nperformance still lags compared to an (unrealistic in practice) upper bound\nwith complete information upfront. Further analyses show improved interactive\nperformance with filtering irrelevant contexts and reformatting conversations.\nOverall, we introduce a novel problem towards LLM reliability, an interactive\nMediQ benchmark and a novel question-asking system, and highlight directions to\nextend LLMs' information-seeking abilities in critical domains.\n","authors":["Shuyue Stella Li","Vidhisha Balachandran","Shangbin Feng","Jonathan S. Ilgen","Emma Pierson","Pang Wei Koh","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2406.00922v3.pdf","comment":"29 pages, 12 figures"},{"id":"http://arxiv.org/abs/2411.05001v1","updated":"2024-11-07T18:59:28Z","published":"2024-11-07T18:59:28Z","title":"Analyzing The Language of Visual Tokens","summary":" With the introduction of transformer-based models for vision and language\ntasks, such as LLaVA and Chameleon, there has been renewed interest in the\ndiscrete tokenized representation of images. These models often treat image\npatches as discrete tokens, analogous to words in natural language, learning\njoint alignments between visual and human languages. However, little is known\nabout the statistical behavior of these visual languages - whether they follow\nsimilar frequency distributions, grammatical structures, or topologies as\nnatural languages. In this paper, we take a natural-language-centric approach\nto analyzing discrete visual languages and uncover striking similarities and\nfundamental differences. 
We demonstrate that, although visual languages adhere\nto Zipfian distributions, higher token innovation drives greater entropy and\nlower compression, with tokens predominantly representing object parts,\nindicating intermediate granularity. We also show that visual languages lack\ncohesive grammatical structures, leading to higher perplexity and weaker\nhierarchical organization compared to natural languages. Finally, we\ndemonstrate that, while vision models align more closely with natural languages\nthan other models, this alignment remains significantly weaker than the\ncohesion found within natural languages. Through these experiments, we\ndemonstrate how understanding the statistical properties of discrete visual\nlanguages can inform the design of more effective computer vision models.\n","authors":["David M. Chan","Rodolfo Corona","Joonyong Park","Cheol Jun Cho","Yutong Bai","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2411.05001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05000v1","updated":"2024-11-07T18:59:27Z","published":"2024-11-07T18:59:27Z","title":"Needle Threading: Can LLMs Follow Threads through Near-Million-Scale\n Haystacks?","summary":" As the context limits of Large Language Models (LLMs) increase, the range of\npossible applications and downstream functions broadens. In many real-world\ntasks, decisions depend on details scattered across collections of often\ndisparate documents containing mostly irrelevant information. Long-context LLMs\nappear well-suited to this form of complex information retrieval and reasoning,\nwhich has traditionally proven costly and time-consuming. However, although the\ndevelopment of longer context models has seen rapid gains in recent years, our\nunderstanding of how effectively LLMs use their context has not kept pace. To\naddress this, we conduct a set of retrieval experiments designed to evaluate\nthe capabilities of 17 leading LLMs, such as their ability to follow threads of\ninformation through the context window. Strikingly, we find that many models\nare remarkably threadsafe: capable of simultaneously following multiple threads\nwithout significant loss in performance. Still, for many models, we find the\neffective context limit is significantly shorter than the supported context\nlength, with accuracy decreasing as the context window grows. Our study also\nhighlights the important point that token counts from different tokenizers\nshould not be directly compared -- they often correspond to substantially\ndifferent numbers of written characters. We release our code and long-context\nexperimental data.\n","authors":["Jonathan Roberts","Kai Han","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2411.05000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04997v1","updated":"2024-11-07T18:59:16Z","published":"2024-11-07T18:59:16Z","title":"LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation","summary":" CLIP is one of the most important multimodal foundational models today. What\npowers CLIP's capabilities? The rich supervision signals provided by natural\nlanguage, the carrier of human knowledge, shape a powerful cross-modal\nrepresentation space. However, with the rapid advancements in large language\nmodels LLMs like GPT-4 and LLaMA, the boundaries of language comprehension and\ngeneration are continually being pushed. This raises an intriguing question:\ncan the capabilities of LLMs be harnessed to further improve multimodal\nrepresentation learning? 
The potential benefits of incorporating LLMs into CLIP\nare clear. LLMs' strong textual understanding can fundamentally improve CLIP's\nability to handle image captions, drastically enhancing its ability to process\nlong and complex texts, a well-known limitation of vanilla CLIP. Moreover, LLMs\nare trained on a vast corpus of text, possessing open-world knowledge. This\nallows them to expand on caption information during training, increasing the\nefficiency of the learning process. In this paper, we propose LLM2CLIP, a novel\napproach that embraces the power of LLMs to unlock CLIP's potential. By\nfine-tuning the LLM in the caption space with contrastive learning, we extract\nits textual capabilities into the output embeddings, significantly improving\nthe output layer's textual discriminability. We then design an efficient\ntraining process where the fine-tuned LLM acts as a powerful teacher for CLIP's\nvisual encoder. Thanks to the LLM's presence, we can now incorporate longer and\nmore complex captions without being restricted by vanilla CLIP's text encoder's\ncontext window and ability limitations. Our experiments demonstrate that this\napproach brings substantial improvements in cross-modal tasks.\n","authors":["Weiquan Huang","Aoqi Wu","Yifan Yang","Xufang Luo","Yuqing Yang","Liang Hu","Qi Dai","Xiyang Dai","Dongdong Chen","Chong Luo","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2411.04997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04996v1","updated":"2024-11-07T18:59:06Z","published":"2024-11-07T18:59:06Z","title":"Mixture-of-Transformers: A Sparse and Scalable Architecture for\n Multi-Modal Foundation Models","summary":" The development of large language models (LLMs) has expanded to multi-modal\nsystems capable of processing text, images, and speech within a unified\nframework. Training these models demands significantly larger datasets and\ncomputational resources compared to text-only LLMs. To address the scaling\nchallenges, we introduce Mixture-of-Transformers (MoT), a sparse multi-modal\ntransformer architecture that significantly reduces pretraining computational\ncosts. MoT decouples non-embedding parameters of the model by modality --\nincluding feed-forward networks, attention matrices, and layer normalization --\nenabling modality-specific processing with global self-attention over the full\ninput sequence. We evaluate MoT across multiple settings and model scales. In\nthe Chameleon 7B setting (autoregressive text-and-image generation), MoT\nmatches the dense baseline's performance using only 55.8\\% of the FLOPs. When\nextended to include speech, MoT reaches speech performance comparable to the\ndense baseline with only 37.2\\% of the FLOPs. In the Transfusion setting, where\ntext and image are trained with different objectives, a 7B MoT model matches\nthe image modality performance of the dense baseline with one third of the\nFLOPs, and a 760M MoT model outperforms a 1.4B dense baseline across key image\ngeneration metrics. 
System profiling further highlights MoT's practical\nbenefits, achieving dense baseline image quality in 47.2\\% of the wall-clock\ntime and text quality in 75.6\\% of the wall-clock time (measured on AWS\np4de.24xlarge instances with NVIDIA A100 GPUs).\n","authors":["Weixin Liang","Lili Yu","Liang Luo","Srinivasan Iyer","Ning Dong","Chunting Zhou","Gargi Ghosh","Mike Lewis","Wen-tau Yih","Luke Zettlemoyer","Xi Victoria Lin"],"pdf_url":"https://arxiv.org/pdf/2411.04996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04986v1","updated":"2024-11-07T18:55:09Z","published":"2024-11-07T18:55:09Z","title":"The Semantic Hub Hypothesis: Language Models Share Semantic\n Representations Across Languages and Modalities","summary":" Modern language models can process inputs across diverse languages and\nmodalities. We hypothesize that models acquire this capability through learning\na shared representation space across heterogeneous data types (e.g., different\nlanguages and modalities), which places semantically similar inputs near one\nanother, even if they are from different modalities/languages. We term this the\nsemantic hub hypothesis, following the hub-and-spoke model from neuroscience\n(Patterson et al., 2007) which posits that semantic knowledge in the human\nbrain is organized through a transmodal semantic \"hub\" which integrates\ninformation from various modality-specific \"spokes\" regions. We first show that\nmodel representations for semantically equivalent inputs in different languages\nare similar in the intermediate layers, and that this space can be interpreted\nusing the model's dominant pretraining language via the logit lens. This\ntendency extends to other data types, including arithmetic expressions, code,\nand visual/audio inputs. Interventions in the shared representation space in\none data type also predictably affect model outputs in other data types,\nsuggesting that this shared representations space is not simply a vestigial\nbyproduct of large-scale training on broad data, but something that is actively\nutilized by the model during input processing.\n","authors":["Zhaofeng Wu","Xinyan Velocity Yu","Dani Yogatama","Jiasen Lu","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2411.04986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04975v1","updated":"2024-11-07T18:49:33Z","published":"2024-11-07T18:49:33Z","title":"SuffixDecoding: A Model-Free Approach to Speeding Up Large Language\n Model Inference","summary":" We present SuffixDecoding, a novel model-free approach to accelerating large\nlanguage model (LLM) inference through speculative decoding. Unlike existing\nmethods that rely on draft models or specialized decoding heads, SuffixDecoding\nleverages suffix trees built from previously generated outputs to efficiently\npredict candidate token sequences. Our approach enables flexible\ntree-structured speculation without the overhead of maintaining and\norchestrating additional models. SuffixDecoding builds and dynamically updates\nsuffix trees to capture patterns in the generated text, using them to construct\nspeculation trees through a principled scoring mechanism based on empirical\ntoken frequencies. SuffixDecoding requires only CPU memory which is plentiful\nand underutilized on typical LLM serving nodes. We demonstrate that\nSuffixDecoding achieves competitive speedups compared to model-based approaches\nacross diverse workloads including open-domain chat, code generation, and\ntext-to-SQL tasks. 
For open-ended chat and code generation tasks,\nSuffixDecoding achieves up to $1.4\\times$ higher output throughput than\nSpecInfer and up to $1.1\\times$ lower time-per-token (TPOT) latency. For a\nproprietary multi-LLM text-to-SQL application, SuffixDecoding achieves up to\n$2.9\\times$ higher output throughput and $3\\times$ lower latency than\nspeculative decoding. Our evaluation shows that SuffixDecoding maintains high\nacceptance rates even with small reference corpora of 256 examples, while\ncontinuing to improve performance as more historical outputs are incorporated.\n","authors":["Gabriele Oliaro","Zhihao Jia","Daniel Campos","Aurick Qiao"],"pdf_url":"https://arxiv.org/pdf/2411.04975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04965v1","updated":"2024-11-07T18:41:50Z","published":"2024-11-07T18:41:50Z","title":"BitNet a4.8: 4-bit Activations for 1-bit LLMs","summary":" Recent research on the 1-bit Large Language Models (LLMs), such as BitNet\nb1.58, presents a promising direction for reducing the inference cost of LLMs\nwhile maintaining their performance. In this work, we introduce BitNet a4.8,\nenabling 4-bit activations for 1-bit LLMs. BitNet a4.8 employs a hybrid\nquantization and sparsification strategy to mitigate the quantization errors\nintroduced by the outlier channels. Specifically, we utilize 4-bit activations\nfor inputs to the attention and feed-forward network layers, while sparsifying\nintermediate states followed with 8-bit quantization. Extensive experiments\ndemonstrate that BitNet a4.8 achieves performance comparable to BitNet b1.58\nwith equivalent training costs, while being faster in inference with enabling\n4-bit (INT4/FP4) kernels. Additionally, BitNet a4.8 activates only 55% of\nparameters and supports 3-bit KV cache, further enhancing the efficiency of\nlarge-scale LLM deployment and inference.\n","authors":["Hongyu Wang","Shuming Ma","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2411.04965v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2411.04962v1","updated":"2024-11-07T18:39:04Z","published":"2024-11-07T18:39:04Z","title":"Position Paper On Diagnostic Uncertainty Estimation from Large Language\n Models: Next-Word Probability Is Not Pre-test Probability","summary":" Large language models (LLMs) are being explored for diagnostic decision\nsupport, yet their ability to estimate pre-test probabilities, vital for\nclinical decision-making, remains limited. This study evaluates two LLMs,\nMistral-7B and Llama3-70B, using structured electronic health record data on\nthree diagnosis tasks. We examined three current methods of extracting LLM\nprobability estimations and revealed their limitations. We aim to highlight the\nneed for improved techniques in LLM confidence estimation.\n","authors":["Yanjun Gao","Skatje Myers","Shan Chen","Dmitriy Dligach","Timothy A Miller","Danielle Bitterman","Guanhua Chen","Anoop Mayampurath","Matthew Churpek","Majid Afshar"],"pdf_url":"https://arxiv.org/pdf/2411.04962v1.pdf","comment":"Accepted to GenAI4Health Workshop at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.04952v1","updated":"2024-11-07T18:29:38Z","published":"2024-11-07T18:29:38Z","title":"M3DocRAG: Multi-modal Retrieval is What You Need for Multi-page\n Multi-document Understanding","summary":" Document visual question answering (DocVQA) pipelines that answer questions\nfrom documents have broad applications. 
Existing methods focus on handling\nsingle-page documents with multi-modal language models (MLMs), or rely on\ntext-based retrieval-augmented generation (RAG) that uses text extraction tools\nsuch as optical character recognition (OCR). However, there are difficulties in\napplying these methods in real-world scenarios: (a) questions often require\ninformation across different pages or documents, where MLMs cannot handle many\nlong documents; (b) documents often have important information in visual\nelements such as figures, but text extraction tools ignore them. We introduce\nM3DocRAG, a novel multi-modal RAG framework that flexibly accommodates various\ndocument contexts (closed-domain and open-domain), question hops (single-hop\nand multi-hop), and evidence modalities (text, chart, figure, etc.). M3DocRAG\nfinds relevant documents and answers questions using a multi-modal retriever\nand an MLM, so that it can efficiently handle single or many documents while\npreserving visual information. Since previous DocVQA datasets ask questions in\nthe context of a specific document, we also present M3DocVQA, a new benchmark\nfor evaluating open-domain DocVQA over 3,000+ PDF documents with 40,000+ pages.\nIn three benchmarks (M3DocVQA/MMLongBench-Doc/MP-DocVQA), empirical results\nshow that M3DocRAG with ColPali and Qwen2-VL 7B achieves superior performance\nthan many strong baselines, including state-of-the-art performance in\nMP-DocVQA. We provide comprehensive analyses of different indexing, MLMs, and\nretrieval models. Lastly, we qualitatively show that M3DocRAG can successfully\nhandle various scenarios, such as when relevant information exists across\nmultiple pages and when answer evidence only exists in images.\n","authors":["Jaemin Cho","Debanjan Mahata","Ozan Irsoy","Yujie He","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2411.04952v1.pdf","comment":"Project webpage: https://m3docrag.github.io"},{"id":"http://arxiv.org/abs/2411.04950v1","updated":"2024-11-07T18:28:40Z","published":"2024-11-07T18:28:40Z","title":"Estimating the Influence of Sequentially Correlated Literary Properties\n in Textual Classification: A Data-Centric Hypothesis-Testing Approach","summary":" Stylometry aims to distinguish authors by analyzing literary traits assumed\nto reflect semi-conscious choices distinct from elements like genre or theme.\nHowever, these components often overlap, complicating text classification based\nsolely on feature distributions. While some literary properties, such as\nthematic content, are likely to manifest as correlations between adjacent text\nunits, others, like authorial style, may be independent thereof. We introduce a\nhypothesis-testing approach to evaluate the influence of sequentially\ncorrelated literary properties on text classification, aiming to determine when\nthese correlations drive classification. Using a multivariate binary\ndistribution, our method models sequential correlations between text units as a\nstochastic process, assessing the likelihood of clustering across varying\nadjacency scales. This enables us to examine whether classification is\ndominated by sequentially correlated properties or remains independent. In\nexperiments on a diverse English prose corpus, our analysis integrates\ntraditional and neural embeddings within supervised and unsupervised\nframeworks. 
Results demonstrate that our approach effectively identifies when\ntextual classification is not primarily influenced by sequentially correlated\nliterary properties, particularly in cases where texts differ in authorial\nstyle or genre rather than by a single author within a similar genre.\n","authors":["Gideon Yoffe","Nachum Dershowitz","Ariel Vishne","Barak Sober"],"pdf_url":"https://arxiv.org/pdf/2411.04950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14894v2","updated":"2024-11-07T18:15:23Z","published":"2024-06-21T06:30:16Z","title":"Talking the Talk Does Not Entail Walking the Walk: On the Limits of\n Large Language Models in Lexical Entailment Recognition","summary":" Verbs form the backbone of language, providing the structure and meaning to\nsentences. Yet, their intricate semantic nuances pose a longstanding challenge.\nUnderstanding verb relations through the concept of lexical entailment is\ncrucial for comprehending sentence meanings and grasping verb dynamics. This\nwork investigates the capabilities of eight Large Language Models in\nrecognizing lexical entailment relations among verbs through differently\ndevised prompting strategies and zero-/few-shot settings over verb pairs from\ntwo lexical databases, namely WordNet and HyperLex. Our findings unveil that\nthe models can tackle the lexical entailment recognition task with moderately\ngood performance, although at varying degree of effectiveness and under\ndifferent conditions. Also, utilizing few-shot prompting can enhance the\nmodels' performance. However, perfectly solving the task arises as an unmet\nchallenge for all examined LLMs, which raises an emergence for further research\ndevelopments on this topic.\n","authors":["Candida M. Greco","Lucio La Cava","Andrea Tagarelli"],"pdf_url":"https://arxiv.org/pdf/2406.14894v2.pdf","comment":"Accepted for publication at The 2024 Conference on Empirical Methods\n in Natural Language Processing (EMNLP-2024) - Findings"},{"id":"http://arxiv.org/abs/2411.04920v1","updated":"2024-11-07T17:57:03Z","published":"2024-11-07T17:57:03Z","title":"GPTKB: Building Very Large Knowledge Bases from Language Models","summary":" General-domain knowledge bases (KB), in particular the \"big three\" --\nWikidata, Yago and DBpedia -- are the backbone of many intelligent\napplications. While these three have seen steady development, comprehensive KB\nconstruction at large has seen few fresh attempts. In this work, we propose to\nbuild a large general-domain KB entirely from a large language model (LLM). We\ndemonstrate the feasibility of large-scale KB construction from LLMs, while\nhighlighting specific challenges arising around entity recognition, entity and\nproperty canonicalization, and taxonomy construction. As a prototype, we use\nGPT-4o-mini to construct GPTKB, which contains 105 million triples for more\nthan 2.9 million entities, at a cost 100x less than previous KBC projects. Our\nwork is a landmark for two fields: For NLP, for the first time, it provides\n\\textit{constructive} insights into the knowledge (or beliefs) of LLMs. For the\nSemantic Web, it shows novel ways forward for the long-standing challenge of\ngeneral-domain KB construction. 
GPTKB is accessible at https://gptkb.org.\n","authors":["Yujia Hu","Shrestha Ghosh","Tuan-Phong Nugyen","Simon Razniewski"],"pdf_url":"https://arxiv.org/pdf/2411.04920v1.pdf","comment":"11 pages, 4 tables"},{"id":"http://arxiv.org/abs/2406.15586v2","updated":"2024-11-07T17:56:19Z","published":"2024-06-21T18:41:22Z","title":"TinyStyler: Efficient Few-Shot Text Style Transfer with Authorship\n Embeddings","summary":" The goal of text style transfer is to transform the style of texts while\npreserving their original meaning, often with only a few examples of the target\nstyle. Existing style transfer methods generally rely on the few-shot\ncapabilities of large language models or on complex controllable text\ngeneration approaches that are inefficient and underperform on fluency metrics.\nWe introduce TinyStyler, a lightweight but effective approach, which leverages\na small language model (800M params) and pre-trained authorship embeddings to\nperform efficient, few-shot text style transfer. We evaluate on the challenging\ntask of authorship style transfer and find TinyStyler outperforms strong\napproaches such as GPT-4. We also evaluate TinyStyler's ability to perform text\nattribute style transfer (formal $\\leftrightarrow$ informal) with automatic and\nhuman evaluations and find that the approach outperforms recent controllable\ntext generation methods. Our model has been made publicly available at\nhttps://huggingface.co/tinystyler/tinystyler .\n","authors":["Zachary Horvitz","Ajay Patel","Kanishk Singh","Chris Callison-Burch","Kathleen McKeown","Zhou Yu"],"pdf_url":"https://arxiv.org/pdf/2406.15586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04914v1","updated":"2024-11-07T17:53:47Z","published":"2024-11-07T17:53:47Z","title":"GASE: Generatively Augmented Sentence Encoding","summary":" We propose an approach to enhance sentence embeddings by applying generative\ntext models for data augmentation at inference time. Unlike conventional data\naugmentation that utilises synthetic training data, our approach does not\nrequire access to model parameters or the computational resources typically\nrequired for fine-tuning state-of-the-art models. Generatively Augmented\nSentence Encoding uses diverse linguistic synthetic variants of input texts\ngenerated by paraphrasing, summarising, or extracting keywords, followed by\npooling the original and synthetic embeddings. Experimental results on the\nMassive Text Embedding Benchmark for Semantic Textual Similarity (STS)\ndemonstrate performance improvements across a range of embedding models using\ndifferent generative models for augmentation. We find that generative\naugmentation leads to larger performance improvements for embedding models with\nlower baseline performance. These findings suggest that integrating generative\naugmentation at inference time adds semantic diversity and can enhance the\nrobustness and generalizability of sentence embeddings for embedding models.\nOur results show that the degree to which generative augmentation can improve\nSTS performance depends not only on the embedding model but also on the\ndataset. 
From a broader perspective, the approach allows trading training for\ninference compute.\n","authors":["Manuel Frank","Haithem Afli"],"pdf_url":"https://arxiv.org/pdf/2411.04914v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.04905v1","updated":"2024-11-07T17:47:25Z","published":"2024-11-07T17:47:25Z","title":"OpenCoder: The Open Cookbook for Top-Tier Code Large Language Models","summary":" Large language models (LLMs) for code have become indispensable in various\ndomains, including code generation, reasoning tasks and agent systems.While\nopen-access code LLMs are increasingly approaching the performance levels of\nproprietary models, high-quality code LLMs suitable for rigorous scientific\ninvestigation, particularly those with reproducible data processing pipelines\nand transparent training protocols, remain limited. The scarcity is due to\nvarious challenges, including resource constraints, ethical considerations, and\nthe competitive advantages of keeping models advanced. To address the gap, we\nintroduce OpenCoder, a top-tier code LLM that not only achieves performance\ncomparable to leading models but also serves as an ``open cookbook'' for the\nresearch community. Unlike most prior efforts, we release not only model\nweights and inference code, but also the reproducible training data, complete\ndata processing pipeline, rigorous experimental ablation results, and detailed\ntraining protocols for open scientific research. Through this comprehensive\nrelease, we identify the key ingredients for building a top-tier code LLM: (1)\ncode optimized heuristic rules for data cleaning and methods for data\ndeduplication, (2) recall of text corpus related to code and (3) high-quality\nsynthetic data in both annealing and supervised fine-tuning stages. By offering\nthis level of openness, we aim to broaden access to all aspects of a top-tier\ncode LLM, with OpenCoder serving as both a powerful model and an open\nfoundation to accelerate research, and enable reproducible advancements in code\nAI.\n","authors":["Siming Huang","Tianhao Cheng","Jason Klein Liu","Jiaran Hao","Liuyihan Song","Yang Xu","J. Yang","J. H. Liu","Chenchen Zhang","Linzheng Chai","Ruifeng Yuan","Zhaoxiang Zhang","Jie Fu","Qian Liu","Ge Zhang","Zili Wang","Yuan Qi","Yinghui Xu","Wei Chu"],"pdf_url":"https://arxiv.org/pdf/2411.04905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15814v2","updated":"2024-11-07T17:33:37Z","published":"2024-07-22T17:26:12Z","title":"Perceptions of Linguistic Uncertainty by Language Models and Humans","summary":" _Uncertainty expressions_ such as \"probably\" or \"highly unlikely\" are\npervasive in human language. While prior work has established that there is\npopulation-level agreement in terms of how humans quantitatively interpret\nthese expressions, there has been little inquiry into the abilities of language\nmodels in the same context. In this paper, we investigate how language models\nmap linguistic expressions of uncertainty to numerical responses. Our approach\nassesses whether language models can employ theory of mind in this setting:\nunderstanding the uncertainty of another agent about a particular statement,\nindependently of the model's own certainty about that statement. We find that 7\nout of 10 models are able to map uncertainty expressions to probabilistic\nresponses in a human-like manner. However, we observe systematically different\nbehavior depending on whether a statement is actually true or false. 
This\nsensitivity indicates that language models are substantially more susceptible\nto bias based on their prior knowledge (as compared to humans). These findings\nraise important questions and have broad implications for human-AI and AI-AI\ncommunication.\n","authors":["Catarina G Belem","Markelle Kelly","Mark Steyvers","Sameer Singh","Padhraic Smyth"],"pdf_url":"https://arxiv.org/pdf/2407.15814v2.pdf","comment":"Accepted at EMNLP 2024 (Main)"},{"id":"http://arxiv.org/abs/2410.04981v2","updated":"2024-11-07T17:25:45Z","published":"2024-10-07T12:22:06Z","title":"On the Rigour of Scientific Writing: Criteria, Analysis, and Insights","summary":" Rigour is crucial for scientific research as it ensures the reproducibility\nand validity of results and findings. Despite its importance, little work\nexists on modelling rigour computationally, and there is a lack of analysis on\nwhether these criteria can effectively signal or measure the rigour of\nscientific papers in practice. In this paper, we introduce a bottom-up,\ndata-driven framework to automatically identify and define rigour criteria and\nassess their relevance in scientific writing. Our framework includes rigour\nkeyword extraction, detailed rigour definition generation, and salient criteria\nidentification. Furthermore, our framework is domain-agnostic and can be\ntailored to the evaluation of scientific rigour for different areas,\naccommodating the distinct salient criteria across fields. We conducted\ncomprehensive experiments based on datasets collected from two high impact\nvenues for Machine Learning and NLP (i.e., ICLR and ACL) to demonstrate the\neffectiveness of our framework in modelling rigour. In addition, we analyse\nlinguistic patterns of rigour, revealing that framing certainty is crucial for\nenhancing the perception of scientific rigour, while suggestion certainty and\nprobability uncertainty diminish it.\n","authors":["Joseph James","Chenghao Xiao","Yucheng Li","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2410.04981v2.pdf","comment":"Accepted Findings at EMNLP 2024"},{"id":"http://arxiv.org/abs/2411.04862v1","updated":"2024-11-07T16:53:09Z","published":"2024-11-07T16:53:09Z","title":"Sentiment Analysis of Spanish Political Party Tweets Using Pre-trained\n Language Models","summary":" Title: Sentiment Analysis of Spanish Political Party Communications on\nTwitter Using Pre-trained Language Models\n Authors: Chuqiao Song, Shunzhang Chen, Xinyi Cai, Hao Chen\n Comments: 21 pages, 6 figures\n Abstract: This study investigates sentiment patterns within Spanish political\nparty communications on Twitter by leveraging BETO and RoBERTuito, two\npre-trained language models optimized for Spanish text. Using a dataset of\ntweets from major Spanish political parties: PSOE, PP, Vox, Podemos, and\nCiudadanos, spanning 2019 to 2024, this research analyzes sentiment\ndistributions and explores the relationship between sentiment expression and\nparty ideology. The findings indicate that both models consistently identify a\npredominant Neutral sentiment across all parties, with significant variations\nin Negative and Positive sentiments that align with ideological distinctions.\nSpecifically, Vox exhibits higher levels of Negative sentiment, while PSOE\ndemonstrates relatively high Positive sentiment, supporting the hypothesis that\nemotional appeals in political messaging reflect ideological stances. 
This\nstudy underscores the potential of pre-trained language models for non-English\nsentiment analysis on social media, providing insights into sentiment dynamics\nthat shape public discourse within Spain's multi-party political system.\n Keywords: Spanish politics, sentiment analysis, pre-trained language models,\nTwitter, BETO, RoBERTuito, political ideology, multi-party system\n","authors":["Chuqiao Song","Shunzhang Chen","Xinyi Cai","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2411.04862v1.pdf","comment":"21 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.09269v2","updated":"2024-11-07T16:43:01Z","published":"2024-02-14T15:55:30Z","title":"Personalized Large Language Models","summary":" Large language models (LLMs) have significantly advanced Natural Language\nProcessing (NLP) tasks in recent years. However, their universal nature poses\nlimitations in scenarios requiring personalized responses, such as\nrecommendation systems and chatbots. This paper investigates methods to\npersonalize LLMs, comparing fine-tuning and zero-shot reasoning approaches on\nsubjective tasks. Results demonstrate that personalized fine-tuning improves\nmodel reasoning compared to non-personalized models. Experiments on datasets\nfor emotion recognition and hate speech detection show consistent performance\ngains with personalized methods across different LLM architectures. These\nfindings underscore the importance of personalization for enhancing LLM\ncapabilities in subjective text perception tasks.\n","authors":["Stanisław Woźniak","Bartłomiej Koptyra","Arkadiusz Janz","Przemysław Kazienko","Jan Kocoń"],"pdf_url":"https://arxiv.org/pdf/2402.09269v2.pdf","comment":"Accepted to SENTIRE 2024 (ICDM Workshops):\n https://sentic.net/sentire2024wozniak.pdf"},{"id":"http://arxiv.org/abs/2411.04847v1","updated":"2024-11-07T16:33:48Z","published":"2024-11-07T16:33:48Z","title":"Prompt-Guided Internal States for Hallucination Detection of Large\n Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities across\na variety of tasks in different domains. However, they sometimes generate\nresponses that are logically coherent but factually incorrect or misleading,\nwhich is known as LLM hallucinations. Data-driven supervised methods train\nhallucination detectors by leveraging the internal states of LLMs, but\ndetectors trained on specific domains often struggle to generalize well to\nother domains. In this paper, we aim to enhance the cross-domain performance of\nsupervised detectors with only in-domain data. We propose a novel framework,\nprompt-guided internal states for hallucination detection of LLMs, namely\nPRISM. By utilizing appropriate prompts to guide changes in the structure\nrelated to text truthfulness within the LLM's internal states, we make this\nstructure more salient and consistent across texts from different domains. We\nintegrated our framework with existing hallucination detection methods and\nconducted experiments on datasets from different domains. 
The experimental\nresults indicate that our framework significantly enhances the cross-domain\ngeneralization of existing hallucination detection methods.\n","authors":["Fujie Zhang","Peiqi Yu","Biao Yi","Baolei Zhang","Tong Li","Zheli Liu"],"pdf_url":"https://arxiv.org/pdf/2411.04847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04825v1","updated":"2024-11-07T16:06:00Z","published":"2024-11-07T16:06:00Z","title":"VTechAGP: An Academic-to-General-Audience Text Paraphrase Dataset and\n Benchmark Models","summary":" Existing text simplification or paraphrase datasets mainly focus on\nsentence-level text generation in a general domain. These datasets are\ntypically developed without using domain knowledge. In this paper, we release a\nnovel dataset, VTechAGP, which is the first academic-to-general-audience text\nparaphrase dataset consisting of 4,938 document-level these and dissertation\nacademic and general-audience abstract pairs from 8 colleges authored over 25\nyears. We also propose a novel dynamic soft prompt generative language model,\nDSPT5. For training, we leverage a contrastive-generative loss function to\nlearn the keyword vectors in the dynamic prompt. For inference, we adopt a\ncrowd-sampling decoding strategy at both semantic and structural levels to\nfurther select the best output candidate. We evaluate DSPT5 and various\nstate-of-the-art large language models (LLMs) from multiple perspectives.\nResults demonstrate that the SOTA LLMs does not provide satisfactory outcomes,\nwhile the lightweight DSPT5 can achieve competitive results. To the best of our\nknowledge, we are the first to build a benchmark dataset and solutions for\nacademic-to-general-audience text paraphrase dataset.\n","authors":["Ming Cheng","Jiaying Gong","Chenhan Yuan","William A. Ingram","Edward Fox","Hoda Eldardiry"],"pdf_url":"https://arxiv.org/pdf/2411.04825v1.pdf","comment":"21 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.04822v1","updated":"2024-11-07T15:59:54Z","published":"2024-11-07T15:59:54Z","title":"When Does Classical Chinese Help? Quantifying Cross-Lingual Transfer in\n Hanja and Kanbun","summary":" Historical and linguistic connections within the Sinosphere have led\nresearchers to use Classical Chinese resources for cross-lingual transfer when\nprocessing historical documents from Korea and Japan. In this paper, we\nquestion the assumption of cross-lingual transferability from Classical Chinese\nto Hanja and Kanbun, the ancient written languages of Korea and Japan,\nrespectively. Our experiments across machine translation, named entity\nrecognition, and punctuation restoration tasks show minimal impact of Classical\nChinese datasets on language model performance for ancient Korean documents\nwritten in Hanja, with performance differences within $\\pm{}0.0068$ F1-score\nfor sequence labeling tasks and up to $+0.84$ BLEU score for translation. These\nlimitations persist consistently across various model sizes, architectures, and\ndomain-specific datasets. Our analysis reveals that the benefits of Classical\nChinese resources diminish rapidly as local language data increases for Hanja,\nwhile showing substantial improvements only in extremely low-resource scenarios\nfor both Korean and Japanese historical documents. 
These mixed results\nemphasize the need for careful empirical validation rather than assuming\nbenefits from indiscriminate cross-lingual transfer.\n","authors":["Seyoung Song","Haneul Yoo","Jiho Jin","Kyunghyun Cho","Alice Oh"],"pdf_url":"https://arxiv.org/pdf/2411.04822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04813v1","updated":"2024-11-07T15:50:40Z","published":"2024-11-07T15:50:40Z","title":"LuxBank: The First Universal Dependency Treebank for Luxembourgish","summary":" The Universal Dependencies (UD) project has significantly expanded linguistic\ncoverage across 161 languages, yet Luxembourgish, a West Germanic language\nspoken by approximately 400,000 people, has remained absent until now. In this\npaper, we introduce LuxBank, the first UD Treebank for Luxembourgish,\naddressing the gap in syntactic annotation and analysis for this `low-research'\nlanguage. We establish formal guidelines for Luxembourgish language annotation,\nproviding the foundation for the first large-scale quantitative analysis of its\nsyntax. LuxBank serves not only as a resource for linguists and language\nlearners but also as a tool for developing spell checkers and grammar checkers,\norganising existing text archives and even training large language models. By\nincorporating Luxembourgish into the UD framework, we aim to enhance the\nunderstanding of syntactic variation within West Germanic languages and offer a\nmodel for documenting smaller, semi-standardised languages. This work positions\nLuxembourgish as a valuable resource in the broader linguistic and NLP\ncommunities, contributing to the study of languages with limited research and\nresources.\n","authors":["Alistair Plum","Caroline Döhmer","Emilia Milano","Anne-Marie Lutgen","Christoph Purschke"],"pdf_url":"https://arxiv.org/pdf/2411.04813v1.pdf","comment":"Accepted at 22nd Workshop on Treebanks and Linguistic Theories (TLT\n 2024)"},{"id":"http://arxiv.org/abs/2408.16163v2","updated":"2024-11-07T15:48:11Z","published":"2024-08-28T22:51:29Z","title":"FRACTURED-SORRY-Bench: Framework for Revealing Attacks in Conversational\n Turns Undermining Refusal Efficacy and Defenses over SORRY-Bench (Automated\n Multi-shot Jailbreaks)","summary":" This paper introduces FRACTURED-SORRY-Bench, a framework for evaluating the\nsafety of Large Language Models (LLMs) against multi-turn conversational\nattacks. Building upon the SORRY-Bench dataset, we propose a simple yet\neffective method for generating adversarial prompts by breaking down harmful\nqueries into seemingly innocuous sub-questions. Our approach achieves a maximum\nincrease of +46.22\\% in Attack Success Rates (ASRs) across GPT-4, GPT-4o,\nGPT-4o-mini, and GPT-3.5-Turbo models compared to baseline methods. We\ndemonstrate that this technique poses a challenge to current LLM safety\nmeasures and highlights the need for more robust defenses against subtle,\nmulti-turn attacks.\n","authors":["Aman Priyanshu","Supriti Vijay"],"pdf_url":"https://arxiv.org/pdf/2408.16163v2.pdf","comment":"4 pages, 2 tables"},{"id":"http://arxiv.org/abs/2302.12921v3","updated":"2024-11-07T15:44:43Z","published":"2023-02-24T22:38:54Z","title":"Pre-Finetuning for Few-Shot Emotional Speech Recognition","summary":" Speech models have long been known to overfit individual speakers for many\nclassification tasks. This leads to poor generalization in settings where the\nspeakers are out-of-domain or out-of-distribution, as is common in production\nenvironments. 
We view speaker adaptation as a few-shot learning problem and\npropose investigating transfer learning approaches inspired by recent success\nwith pre-trained models in natural language tasks. We propose pre-finetuning\nspeech models on difficult tasks to distill knowledge into few-shot downstream\nclassification objectives. We pre-finetune Wav2Vec2.0 on every permutation of\nfour multiclass emotional speech recognition corpora and evaluate our\npre-finetuned models through 33,600 few-shot fine-tuning trials on the\nEmotional Speech Dataset.\n","authors":["Maximillian Chen","Zhou Yu"],"pdf_url":"https://arxiv.org/pdf/2302.12921v3.pdf","comment":"Published at INTERSPEECH 2023. 5 pages, 4 figures. Code available at\n https://github.com/maxlchen/Speech-PreFinetuning"},{"id":"http://arxiv.org/abs/2403.00867v3","updated":"2024-11-07T15:41:38Z","published":"2024-03-01T03:29:54Z","title":"Gradient Cuff: Detecting Jailbreak Attacks on Large Language Models by\n Exploring Refusal Loss Landscapes","summary":" Large Language Models (LLMs) are becoming a prominent generative AI tool,\nwhere the user enters a query and the LLM generates an answer. To reduce harm\nand misuse, efforts have been made to align these LLMs to human values using\nadvanced training techniques such as Reinforcement Learning from Human Feedback\n(RLHF). However, recent studies have highlighted the vulnerability of LLMs to\nadversarial jailbreak attempts aiming at subverting the embedded safety\nguardrails. To address this challenge, this paper defines and investigates the\nRefusal Loss of LLMs and then proposes a method called Gradient Cuff to detect\njailbreak attempts. Gradient Cuff exploits the unique properties observed in\nthe refusal loss landscape, including functional values and its smoothness, to\ndesign an effective two-step detection strategy. Experimental results on two\naligned LLMs (LLaMA-2-7B-Chat and Vicuna-7B-V1.5) and six types of jailbreak\nattacks (GCG, AutoDAN, PAIR, TAP, Base64, and LRL) show that Gradient Cuff can\nsignificantly improve the LLM's rejection capability for malicious jailbreak\nqueries, while maintaining the model's performance for benign user queries by\nadjusting the detection threshold.\n","authors":["Xiaomeng Hu","Pin-Yu Chen","Tsung-Yi Ho"],"pdf_url":"https://arxiv.org/pdf/2403.00867v3.pdf","comment":"Accepted by NeurIPS 2024. Project page:\n https://huggingface.co/spaces/TrustSafeAI/GradientCuff-Jailbreak-Defense"},{"id":"http://arxiv.org/abs/2411.04799v1","updated":"2024-11-07T15:38:25Z","published":"2024-11-07T15:38:25Z","title":"Kwai-STaR: Transform LLMs into State-Transition Reasoners","summary":" Mathematical reasoning presents a significant challenge to the cognitive\ncapabilities of LLMs. Various methods have been proposed to enhance the\nmathematical ability of LLMs. However, few recognize the value of state\ntransition for LLM reasoning. In this work, we define mathematical\nproblem-solving as a process of transiting from an initial unsolved state to\nthe final resolved state, and propose Kwai-STaR framework, which transforms\nLLMs into State-Transition Reasoners to improve their intuitive reasoning\ncapabilities. Our approach comprises three main steps: (1) Define the state\nspace tailored to the mathematical reasoning. (2) Generate state-transition\ndata based on the state space. (3) Convert original LLMs into State-Transition\nReasoners via a curricular training strategy. 
Our experiments validate the\neffectiveness of Kwai-STaR in enhancing mathematical reasoning: After training\non the small-scale Kwai-STaR dataset, general LLMs, including Mistral-7B and\nLLaMA-3, achieve considerable performance gain on the GSM8K and GSM-Hard\ndataset. Additionally, the state transition-based design endows Kwai-STaR with\nremarkable training and inference efficiency. Further experiments are underway\nto establish the generality of Kwai-STaR.\n","authors":["Xingyu Lu","Yuhang Hu","Changyi Liu","Tianke Zhang","Zhenyu Yang","Zhixiang Ding","Shengsheng Qian","Meng Du","Ruiwen Kang","Kaiyu Tang","Fan Yang","Tingting Gao","Di Zhang","Hai-Tao Zheng","Bin Wen"],"pdf_url":"https://arxiv.org/pdf/2411.04799v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.04794v1","updated":"2024-11-07T15:36:05Z","published":"2024-11-07T15:36:05Z","title":"AlignXIE: Improving Multilingual Information Extraction by Cross-Lingual\n Alignment","summary":" Empirical evidence suggests that LLMs exhibit spontaneous cross-lingual\nalignment. Our findings suggest that although LLMs also demonstrate promising\ncross-lingual alignment in Information Extraction, there remains significant\nimbalance across languages, revealing an underlying deficiency in the IE\nalignment. To address this issue, we propose AlignXIE, a powerful code-based\nLLM that significantly enhances cross-lingual IE alignment through two\nstrategies. Firstly, AlignXIE formulates IE across different languages,\nespecially non-English ones, as code generation tasks, standardizing the\nrepresentation of various schemas using Python classes to ensure consistency of\nthe same ontology in different languages and align the schema. Secondly, it\nincorporates an IE cross-lingual alignment phase through a translated instance\nprediction task proposed in this paper to align the extraction process,\nutilizing ParallelNER, an IE bilingual parallel dataset with 257,190 samples,\ngenerated by our proposed LLM-based automatic pipeline for IE parallel data\nconstruction, with manual annotation to ensure quality. Ultimately, we obtain\nAlignXIE through multilingual IE instruction tuning. Although without training\nin 9 unseen languages, AlignXIE surpasses ChatGPT by $30.17\\%$ and SoTA by\n$20.03\\%$, thereby demonstrating superior cross-lingual IE capabilities.\nComprehensive evaluations on 63 IE benchmarks in Chinese and English under\nvarious settings, demonstrate that AlignXIE significantly enhances\ncross-lingual and multilingual IE through boosting the IE alignment.\n","authors":["Yuxin Zuo","Wenxuan Jiang","Wenxuan Liu","Zixuan Li","Long Bai","Hanbin Wang","Yutao Zeng","Xiaolong Jin","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.04794v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2411.04788v1","updated":"2024-11-07T15:28:20Z","published":"2024-11-07T15:28:20Z","title":"Enhancing Investment Analysis: Optimizing AI-Agent Collaboration in\n Financial Research","summary":" In recent years, the application of generative artificial intelligence\n(GenAI) in financial analysis and investment decision-making has gained\nsignificant attention. However, most existing approaches rely on single-agent\nsystems, which fail to fully utilize the collaborative potential of multiple AI\nagents. In this paper, we propose a novel multi-agent collaboration system\ndesigned to enhance decision-making in financial investment research. 
The\nsystem incorporates agent groups with both configurable group sizes and\ncollaboration structures to leverage the strengths of each agent group type. By\nutilizing a sub-optimal combination strategy, the system dynamically adapts to\nvarying market conditions and investment scenarios, optimizing performance\nacross different tasks. We focus on three sub-tasks: fundamentals, market\nsentiment, and risk analysis, by analyzing the 2023 SEC 10-K forms of 30\ncompanies listed on the Dow Jones Index. Our findings reveal significant\nperformance variations based on the configurations of AI agents for different\ntasks. The results demonstrate that our multi-agent collaboration system\noutperforms traditional single-agent models, offering improved accuracy,\nefficiency, and adaptability in complex financial environments. This study\nhighlights the potential of multi-agent systems in transforming financial\nanalysis and investment decision-making by integrating diverse analytical\nperspectives.\n","authors":["Xuewen Han","Neng Wang","Shangkun Che","Hongyang Yang","Kunpeng Zhang","Sean Xin Xu"],"pdf_url":"https://arxiv.org/pdf/2411.04788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17437v2","updated":"2024-11-07T15:00:00Z","published":"2024-08-30T17:41:30Z","title":"SYNTHEVAL: Hybrid Behavioral Testing of NLP Models with Synthetic\n CheckLists","summary":" Traditional benchmarking in NLP typically involves using static held-out test\nsets. However, this approach often results in an overestimation of performance\nand lacks the ability to offer comprehensive, interpretable, and dynamic\nassessments of NLP models. Recently, works like DynaBench (Kiela et al., 2021)\nand CheckList (Ribeiro et al., 2020) have addressed these limitations through\nbehavioral testing of NLP models with test types generated by a multistep\nhuman-annotated pipeline. Unfortunately, manually creating a variety of test\ntypes requires much human labor, often at prohibitive cost. In this work, we\npropose SYNTHEVAL, a hybrid behavioral testing framework that leverages large\nlanguage models (LLMs) to generate a wide range of test types for a\ncomprehensive evaluation of NLP models. SYNTHEVAL first generates sentences via\nLLMs using controlled generation, and then identifies challenging examples by\ncomparing the predictions made by LLMs with task-specific NLP models. In the\nlast stage, human experts investigate the challenging examples, manually design\ntemplates, and identify the types of failures the taskspecific models\nconsistently exhibit. We apply SYNTHEVAL to two classification tasks, sentiment\nanalysis and toxic language detection, and show that our framework is effective\nin identifying weaknesses of strong models on these tasks. 
We share our code in\nhttps://github.com/Loreley99/SynthEval_CheckList.\n","authors":["Raoyuan Zhao","Abdullatif Köksal","Yihong Liu","Leonie Weissweiler","Anna Korhonen","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2408.17437v2.pdf","comment":"EMNLP 2024 - Findings"},{"id":"http://arxiv.org/abs/2411.03883v2","updated":"2024-11-07T14:57:14Z","published":"2024-11-06T12:57:58Z","title":"MEG: Medical Knowledge-Augmented Large Language Models for Question\n Answering","summary":" Question answering is a natural language understanding task that involves\nreasoning over both explicit context and unstated, relevant domain knowledge.\nLarge language models (LLMs), which underpin most contemporary question\nanswering systems, struggle to induce how concepts relate in specialized\ndomains such as medicine. Existing medical LLMs are also costly to train. In\nthis work, we present MEG, a parameter-efficient approach for medical\nknowledge-augmented LLMs. MEG uses a lightweight mapping network to integrate\ngraph embeddings into the LLM, enabling it to leverage external knowledge in a\ncost-effective way. We evaluate our method on four popular medical\nmultiple-choice datasets and show that LLMs greatly benefit from the factual\ngrounding provided by knowledge graph embeddings. MEG attains an average of\n+10.2% accuracy over the Mistral-Instruct baseline, and +6.7% over specialized\nmodels like BioMistral. We also show results based on Llama-3. Finally, we show\nthat MEG's performance remains robust to the choice of graph encoder.\n","authors":["Laura Cabello","Carmen Martin-Turrero","Uchenna Akujuobi","Anders Søgaard","Carlos Bobed"],"pdf_url":"https://arxiv.org/pdf/2411.03883v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04756v1","updated":"2024-11-07T14:54:42Z","published":"2024-11-07T14:54:42Z","title":"A study of Vietnamese readability assessing through semantic and\n statistical features","summary":" Determining the difficulty of a text involves assessing various textual\nfeatures that may impact the reader's text comprehension, yet current research\nin Vietnamese has only focused on statistical features. This paper introduces a\nnew approach that integrates statistical and semantic approaches to assessing\ntext readability. Our research utilized three distinct datasets: the Vietnamese\nText Readability Dataset (ViRead), OneStopEnglish, and RACE, with the latter\ntwo translated into Vietnamese. Advanced semantic analysis methods were\nemployed for the semantic aspect using state-of-the-art language models such as\nPhoBERT, ViDeBERTa, and ViBERT. In addition, statistical methods were\nincorporated to extract syntactic and lexical features of the text. We\nconducted experiments using various machine learning models, including Support\nVector Machine (SVM), Random Forest, and Extra Trees and evaluated their\nperformance using accuracy and F1 score metrics. Our results indicate that a\njoint approach that combines semantic and statistical features significantly\nenhances the accuracy of readability classification compared to using each\nmethod in isolation. The current study emphasizes the importance of considering\nboth statistical and semantic aspects for a more accurate assessment of text\ndifficulty in Vietnamese. This contribution to the field provides insights into\nthe adaptability of advanced language models in the context of Vietnamese text\nreadability. 
It lays the groundwork for future research in this area.\n","authors":["Hung Tuan Le","Long Truong To","Manh Trong Nguyen","Quyen Nguyen","Trong-Hop Do"],"pdf_url":"https://arxiv.org/pdf/2411.04756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04752v1","updated":"2024-11-07T14:41:01Z","published":"2024-11-07T14:41:01Z","title":"RetrieveGPT: Merging Prompts and Mathematical Models for Enhanced\n Code-Mixed Information Retrieval","summary":" Code-mixing, the integration of lexical and grammatical elements from\nmultiple languages within a single sentence, is a widespread linguistic\nphenomenon, particularly prevalent in multilingual societies. In India, social\nmedia users frequently engage in code-mixed conversations using the Roman\nscript, especially among migrant communities who form online groups to share\nrelevant local information. This paper focuses on the challenges of extracting\nrelevant information from code-mixed conversations, specifically within Roman\ntransliterated Bengali mixed with English. This study presents a novel approach\nto address these challenges by developing a mechanism to automatically identify\nthe most relevant answers from code-mixed conversations. We have experimented\nwith a dataset comprising of queries and documents from Facebook, and Query\nRelevance files (QRels) to aid in this task. Our results demonstrate the\neffectiveness of our approach in extracting pertinent information from complex,\ncode-mixed digital conversations, contributing to the broader field of natural\nlanguage processing in multilingual and informal text environments. We use\nGPT-3.5 Turbo via prompting alongwith using the sequential nature of relevant\ndocuments to frame a mathematical model which helps to detect relevant\ndocuments corresponding to a query.\n","authors":["Aniket Deroy","Subhankar Maity"],"pdf_url":"https://arxiv.org/pdf/2411.04752v1.pdf","comment":"Accepted at FIRE 2024 (Track: Code-Mixed Information Retrieval from\n Social Media Data)"},{"id":"http://arxiv.org/abs/2310.09765v2","updated":"2024-11-07T13:56:58Z","published":"2023-10-15T07:49:56Z","title":"MILPaC: A Novel Benchmark for Evaluating Translation of Legal Text to\n Indian Languages","summary":" Most legal text in the Indian judiciary is written in complex English due to\nhistorical reasons. However, only a small fraction of the Indian population is\ncomfortable in reading English. Hence legal text needs to be made available in\nvarious Indian languages, possibly by translating the available legal text from\nEnglish. Though there has been a lot of research on translation to and between\nIndian languages, to our knowledge, there has not been much prior work on such\ntranslation in the legal domain. In this work, we construct the first\nhigh-quality legal parallel corpus containing aligned text units in English and\nnine Indian languages, that includes several low-resource languages. We also\nbenchmark the performance of a wide variety of Machine Translation (MT) systems\nover this corpus, including commercial MT systems, open-source MT systems and\nLarge Language Models. 
Through a comprehensive survey by Law practitioners, we\ncheck how satisfied they are with the translations by some of these MT systems,\nand how well automatic MT evaluation metrics agree with the opinions of Law\npractitioners.\n","authors":["Sayan Mahapatra","Debtanu Datta","Shubham Soni","Adrijit Goswami","Saptarshi Ghosh"],"pdf_url":"https://arxiv.org/pdf/2310.09765v2.pdf","comment":"To be published in ACM Transactions on Asian and Low-Resource\n Language Information Processing (TALLIP)"},{"id":"http://arxiv.org/abs/2411.04699v1","updated":"2024-11-07T13:33:34Z","published":"2024-11-07T13:33:34Z","title":"BhasaAnuvaad: A Speech Translation Dataset for 14 Indian Languages","summary":" Automatic Speech Translation (AST) datasets for Indian languages remain\ncritically scarce, with public resources covering fewer than 10 of the 22\nofficial languages. This scarcity has resulted in AST systems for Indian\nlanguages lagging far behind those available for high-resource languages like\nEnglish. In this paper, we first evaluate the performance of widely-used AST\nsystems on Indian languages, identifying notable performance gaps and\nchallenges. Our findings show that while these systems perform adequately on\nread speech, they struggle significantly with spontaneous speech, including\ndisfluencies like pauses and hesitations. Additionally, there is a striking\nabsence of systems capable of accurately translating colloquial and informal\nlanguage, a key aspect of everyday communication. To this end, we introduce\nBhasaAnuvaad, the largest publicly available dataset for AST involving 14\nscheduled Indian languages spanning over 44,400 hours and 17M text segments.\nBhasaAnuvaad contains data for English speech to Indic text, as well as Indic\nspeech to English text. This dataset comprises three key categories: (1)\nCurated datasets from existing resources, (2) Large-scale web mining, and (3)\nSynthetic data generation. By offering this diverse and expansive dataset, we\naim to bridge the resource gap and promote advancements in AST for low-resource\nIndian languages, especially in handling spontaneous and informal speech\npatterns.\n","authors":["Sparsh Jain","Ashwin Sankar","Devilal Choudhary","Dhairya Suman","Nikhil Narasimhan","Mohammed Safi Ur Rahman Khan","Anoop Kunchukuttan","Mitesh M Khapra","Raj Dabre"],"pdf_url":"https://arxiv.org/pdf/2411.04699v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2405.04304v5","updated":"2024-11-07T12:59:39Z","published":"2024-05-07T13:27:52Z","title":"Dynamic Speculation Lookahead Accelerates Speculative Decoding of Large\n Language Models","summary":" Speculative decoding is commonly used for reducing the inference latency of\nlarge language models. Its effectiveness depends highly on the speculation\nlookahead (SL)-the number of tokens generated by the draft model at each\niteration. In this work we show that the common practice of using the same SL\nfor all iterations (static SL) is suboptimal. We introduce DISCO (DynamIc\nSpeCulation lookahead Optimization), a novel method for dynamically selecting\nthe SL. 
Our experiments with four datasets show that DISCO reaches an average\nspeedup of 10% compared to the best static SL baseline, while generating the\nexact same text.\n","authors":["Jonathan Mamou","Oren Pereg","Daniel Korat","Moshe Berchansky","Nadav Timor","Moshe Wasserblat","Roy Schwartz"],"pdf_url":"https://arxiv.org/pdf/2405.04304v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02674v3","updated":"2024-11-07T12:38:41Z","published":"2024-11-04T23:21:12Z","title":"Wave Network: An Ultra-Small Language Model","summary":" We propose an innovative token representation and update method in a new\nultra-small language model: the Wave network. Specifically, we use a complex\nvector to represent each token, encoding both global and local semantics of the\ninput text. A complex vector consists of two components: a magnitude vector\nrepresenting the global semantics of the input text, and a phase vector\ncapturing the relationships between individual tokens and global semantics.\nExperiments on the AG News text classification task demonstrate that, when\ngenerating complex vectors from randomly initialized token embeddings, our\nsingle-layer Wave Network achieves 90.91% accuracy with wave interference and\n91.66% with wave modulation - outperforming a single Transformer layer using\nBERT pre-trained embeddings by 19.23% and 19.98%, respectively, and approaching\nthe accuracy of the pre-trained and fine-tuned BERT base model (94.64%).\nAdditionally, compared to BERT base, the Wave Network reduces video memory\nusage and training time by 77.34% and 85.62% during wave modulation. In\nsummary, we used a 2.4-million-parameter small language model to achieve\naccuracy comparable to a 100-million-parameter BERT model in text\nclassification.\n","authors":["Xin Zhang","Victor S. Sheng"],"pdf_url":"https://arxiv.org/pdf/2411.02674v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04649v1","updated":"2024-11-07T12:12:44Z","published":"2024-11-07T12:12:44Z","title":"DISCO: DISCovering Overfittings as Causal Rules for Text Classification\n Models","summary":" With the rapid advancement of neural language models, the deployment of\nover-parameterized models has surged, increasing the need for interpretable\nexplanations comprehensible to human inspectors. Existing post-hoc\ninterpretability methods, which often focus on unigram features of single input\ntextual instances, fail to capture the models' decision-making process fully.\nAdditionally, many methods do not differentiate between decisions based on\nspurious correlations and those based on a holistic understanding of the input.\nOur paper introduces DISCO, a novel method for discovering global, rule-based\nexplanations by identifying causal n-gram associations with model predictions.\nThis method employs a scalable sequence mining technique to extract relevant\ntext spans from training data, associate them with model predictions, and\nconduct causality checks to distill robust rules that elucidate model behavior.\nThese rules expose potential overfitting and provide insights into misleading\nfeature combinations. We validate DISCO through extensive testing,\ndemonstrating its superiority over existing methods in offering comprehensive\ninsights into complex model behaviors. Our approach successfully identifies all\nshortcuts manually introduced into the training data (100% detection rate on\nthe MultiRC dataset), resulting in an 18.8% regression in model performance --\na capability unmatched by any other method. 
Furthermore, DISCO supports\ninteractive explanations, enabling human inspectors to distinguish spurious\ncauses in the rule-based output. This alleviates the burden of abundant\ninstance-wise explanations and helps assess the model's risk when encountering\nout-of-distribution (OOD) data.\n","authors":["Zijian Zhang","Vinay Setty","Yumeng Wang","Avishek Anand"],"pdf_url":"https://arxiv.org/pdf/2411.04649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14125v3","updated":"2024-11-07T12:07:07Z","published":"2024-05-23T02:57:42Z","title":"ALI-Agent: Assessing LLMs' Alignment with Human Values via Agent-based\n Evaluation","summary":" Large Language Models (LLMs) can elicit unintended and even harmful content\nwhen misaligned with human values, posing severe risks to users and society. To\nmitigate these risks, current evaluation benchmarks predominantly employ\nexpert-designed contextual scenarios to assess how well LLMs align with human\nvalues. However, the labor-intensive nature of these benchmarks limits their\ntest scope, hindering their ability to generalize to the extensive variety of\nopen-world use cases and identify rare but crucial long-tail risks.\nAdditionally, these static tests fail to adapt to the rapid evolution of LLMs,\nmaking it hard to evaluate timely alignment issues. To address these\nchallenges, we propose ALI-Agent, an evaluation framework that leverages the\nautonomous abilities of LLM-powered agents to conduct in-depth and adaptive\nalignment assessments. ALI-Agent operates through two principal stages:\nEmulation and Refinement. During the Emulation stage, ALI-Agent automates the\ngeneration of realistic test scenarios. In the Refinement stage, it iteratively\nrefines the scenarios to probe long-tail risks. Specifically, ALI-Agent\nincorporates a memory module to guide test scenario generation, a tool-using\nmodule to reduce human labor in tasks such as evaluating feedback from target\nLLMs, and an action module to refine tests. Extensive experiments across three\naspects of human values--stereotypes, morality, and legality--demonstrate that\nALI-Agent, as a general evaluation framework, effectively identifies model\nmisalignment. Systematic analysis also validates that the generated test\nscenarios represent meaningful use cases, as well as integrate enhanced\nmeasures to probe long-tail risks. Our code is available at\nhttps://github.com/SophieZheng998/ALI-Agent.git\n","authors":["Jingnan Zheng","Han Wang","An Zhang","Tai D. Nguyen","Jun Sun","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2405.14125v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04637v1","updated":"2024-11-07T11:51:14Z","published":"2024-11-07T11:51:14Z","title":"Hands-On Tutorial: Labeling with LLM and Human-in-the-Loop","summary":" Training and deploying machine learning models relies on a large amount of\nhuman-annotated data. As human labeling becomes increasingly expensive and\ntime-consuming, recent research has developed multiple strategies to speed up\nannotation and reduce costs and human workload: generating synthetic training\ndata, active learning, and hybrid labeling. This tutorial is oriented toward\npractical applications: we will present the basics of each strategy, highlight\ntheir benefits and limitations, and discuss in detail real-life case studies.\nAdditionally, we will walk through best practices for managing human annotators\nand controlling the quality of the final dataset. 
The tutorial includes a\nhands-on workshop, where attendees will be guided in implementing a hybrid\nannotation setup. This tutorial is designed for NLP practitioners from both\nresearch and industry backgrounds who are involved in or interested in\noptimizing data labeling projects.\n","authors":["Ekaterina Artemova","Akim Tsvigun","Dominik Schlechtweg","Natalia Fedorova","Sergei Tilga","Boris Obmoroshev"],"pdf_url":"https://arxiv.org/pdf/2411.04637v1.pdf","comment":"To be presented at COLING 2025"},{"id":"http://arxiv.org/abs/2411.04604v1","updated":"2024-11-07T10:39:10Z","published":"2024-11-07T10:39:10Z","title":"FASSILA: A Corpus for Algerian Dialect Fake News Detection and Sentiment\n Analysis","summary":" In the context of low-resource languages, the Algerian dialect (AD) faces\nchallenges due to the absence of annotated corpora, hindering its effective\nprocessing, notably in Machine Learning (ML) applications reliant on corpora\nfor training and assessment. This study outlines the development process of a\nspecialized corpus for Fake News (FN) detection and sentiment analysis (SA) in\nAD called FASSILA. This corpus comprises 10,087 sentences, encompassing over\n19,497 unique words in AD, and addresses the significant lack of linguistic\nresources in the language and covers seven distinct domains. We propose an\nannotation scheme for FN detection and SA, detailing the data collection,\ncleaning, and labelling process. Remarkable Inter-Annotator Agreement indicates\nthat the annotation scheme produces consistent annotations of high quality.\nSubsequent classification experiments using BERT-based models and ML models are\npresented, demonstrate promising results and highlight avenues for further\nresearch. The dataset is made freely available on GitHub\n(https://github.com/amincoding/FASSILA) to facilitate future advancements in\nthe field.\n","authors":["Amin Abdedaiem","Abdelhalim Hafedh Dahou","Mohamed Amine Cheragui","Brigitte Mathiak"],"pdf_url":"https://arxiv.org/pdf/2411.04604v1.pdf","comment":"16 pages, 6 Figuers"},{"id":"http://arxiv.org/abs/2411.04602v1","updated":"2024-11-07T10:31:31Z","published":"2024-11-07T10:31:31Z","title":"Self-Calibrated Listwise Reranking with Large Language Models","summary":" Large language models (LLMs), with advanced linguistic capabilities, have\nbeen employed in reranking tasks through a sequence-to-sequence approach. In\nthis paradigm, multiple passages are reranked in a listwise manner and a\ntextual reranked permutation is generated. However, due to the limited context\nwindow of LLMs, this reranking paradigm requires a sliding window strategy to\niteratively handle larger candidate sets. This not only increases computational\ncosts but also restricts the LLM from fully capturing all the comparison\ninformation for all candidates. To address these challenges, we propose a novel\nself-calibrated listwise reranking method, which aims to leverage LLMs to\nproduce global relevance scores for ranking. To achieve it, we first propose\nthe relevance-aware listwise reranking framework, which incorporates explicit\nlist-view relevance scores to improve reranking efficiency and enable global\ncomparison across the entire candidate set. Second, to ensure the comparability\nof the computed scores, we propose self-calibrated training that uses\npoint-view relevance assessments generated internally by the LLM itself to\ncalibrate the list-view relevance assessments. 
Extensive experiments and\ncomprehensive analysis on the BEIR benchmark and TREC Deep Learning Tracks\ndemonstrate the effectiveness and efficiency of our proposed method.\n","authors":["Ruiyang Ren","Yuhao Wang","Kun Zhou","Wayne Xin Zhao","Wenjie Wang","Jing Liu","Ji-Rong Wen","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2411.04602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04588v1","updated":"2024-11-07T10:17:40Z","published":"2024-11-07T10:17:40Z","title":"Tibyan Corpus: Balanced and Comprehensive Error Coverage Corpus Using\n ChatGPT for Arabic Grammatical Error Correction","summary":" Natural language processing (NLP) utilizes text data augmentation to overcome\nsample size constraints. Increasing the sample size is a natural and widely\nused strategy for alleviating these challenges. In this study, we chose Arabic\nto increase the sample size and correct grammatical errors. Arabic is\nconsidered one of the languages with limited resources for grammatical error\ncorrection (GEC). Furthermore, QALB-14 and QALB-15 are the only datasets used\nin most Arabic grammatical error correction research, with approximately 20,500\nparallel examples, which is considered low compared with other languages.\nTherefore, this study aims to develop an Arabic corpus called \"Tibyan\" for\ngrammatical error correction using ChatGPT. ChatGPT is used as a data augmenter\ntool based on a pair of Arabic sentences containing grammatical errors matched\nwith a sentence free of errors extracted from Arabic books, called guide\nsentences. Multiple steps were involved in establishing our corpus, including\nthe collection and pre-processing of a pair of Arabic texts from various\nsources, such as books and open-access corpora. We then used ChatGPT to\ngenerate a parallel corpus based on the text collected previously, as a guide\nfor generating sentences with multiple types of errors. By engaging linguistic\nexperts to review and validate the automatically generated sentences, we\nensured that they were correct and error-free. The corpus was validated and\nrefined iteratively based on feedback provided by linguistic experts to improve\nits accuracy. Finally, we used the Arabic Error Type Annotation tool (ARETA) to\nanalyze the types of errors in the Tibyan corpus. Our corpus contained 49 of\nerrors, including seven types: orthography, morphology, syntax, semantics,\npunctuation, merge, and split. The Tibyan corpus contains approximately 600 K\ntokens.\n","authors":["Ahlam Alrehili","Areej Alhothali"],"pdf_url":"https://arxiv.org/pdf/2411.04588v1.pdf","comment":"17 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.04585v1","updated":"2024-11-07T10:11:38Z","published":"2024-11-07T10:11:38Z","title":"The State and Fate of Summarization Datasets","summary":" Automatic summarization has consistently attracted attention, due to its\nversatility and wide application in various downstream tasks. Despite its\npopularity, we find that annotation efforts have largely been disjointed, and\nhave lacked common terminology. Consequently, it is challenging to discover\nexisting resources or identify coherent research directions. To address this,\nwe survey a large body of work spanning 133 datasets in over 100 languages,\ncreating a novel ontology covering sample properties, collection methods and\ndistribution. 
With this ontology we make key observations, including the lack\nin accessible high-quality datasets for low-resource languages, and the field's\nover-reliance on the news domain and on automatically collected distant\nsupervision. Finally, we make available a web interface that allows users to\ninteract and explore our ontology and dataset collection, as well as a template\nfor a summarization data card, which can be used to streamline future research\ninto a more coherent body of work.\n","authors":["Noam Dahan","Gabriel Stanovsky"],"pdf_url":"https://arxiv.org/pdf/2411.04585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04573v1","updated":"2024-11-07T09:57:57Z","published":"2024-11-07T09:57:57Z","title":"Multistage Fine-tuning Strategies for Automatic Speech Recognition in\n Low-resource Languages","summary":" This paper presents a novel multistage fine-tuning strategy designed to\nenhance automatic speech recognition (ASR) performance in low-resource\nlanguages using OpenAI's Whisper model. In this approach we aim to build ASR\nmodel for languages with limited digital resources by sequentially adapting the\nmodel across linguistically similar languages. We experimented this on the\nMalasar language, a Dravidian language spoken by approximately ten thousand\npeople in the Western Ghats of South India. Malasar language faces critical\nchallenges for technological intervention due to its lack of a native script\nand absence of digital or spoken data resources. Working in collaboration with\nWycliffe India and Malasar community members, we created a spoken Malasar\ncorpus paired with transcription in Tamil script, a closely related major\nlanguage. In our approach to build ASR model for Malasar, we first build an\nintermediate Tamil ASR, leveraging higher data availability for Tamil annotated\nspeech. This intermediate model is subsequently fine-tuned on Malasar data,\nallowing for more effective ASR adaptation despite limited resources. The\nmultistage fine-tuning strategy demonstrated significant improvements over\ndirect fine-tuning on Malasar data alone, achieving a word error rate (WER) of\n51.9%, which is 4.5% absolute reduction when compared to the direct fine-tuning\nmethod. Further a WER reduction to 47.3% was achieved through punctuation\nremoval in post-processing, which addresses formatting inconsistencies that\nimpact evaluation. Our results underscore the effectiveness of sequential\nmultistage fine-tuning combined with targeted post-processing as a scalable\nstrategy for ASR system development in low-resource languages, especially where\nlinguistic similarities can be leveraged to bridge gaps in training data.\n","authors":["Leena G Pillai","Kavya Manohar","Basil K Raju","Elizabeth Sherly"],"pdf_url":"https://arxiv.org/pdf/2411.04573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12096v3","updated":"2024-11-07T09:29:32Z","published":"2024-04-18T11:29:23Z","title":"LongEmbed: Extending Embedding Models for Long Context Retrieval","summary":" Embedding models play a pivot role in modern NLP applications such as IR and\nRAG. While the context limit of LLMs has been pushed beyond 1 million tokens,\nembedding models are still confined to a narrow context window not exceeding 8k\ntokens, refrained from application scenarios requiring long inputs such as\nlegal contracts. This paper explores context window extension of existing\nembedding models, pushing the limit to 32k without requiring additional\ntraining. 
First, we examine the performance of current embedding models for\nlong context retrieval on our newly constructed LongEmbed benchmark. LongEmbed\ncomprises two synthetic tasks and four carefully chosen real-world tasks,\nfeaturing documents of varying length and dispersed target information.\nBenchmarking results underscore huge room for improvement in these models.\nBased on this, comprehensive experiments show that training-free context window\nextension strategies like position interpolation can effectively extend the\ncontext window of existing embedding models by several folds, regardless of\ntheir original context being 512 or beyond 4k. Furthermore, for models\nemploying absolute position encoding (APE), we show the possibility of further\nfine-tuning to harvest notable performance gains while strictly preserving\noriginal behavior for short inputs. For models using rotary position embedding\n(RoPE), significant enhancements are observed when employing RoPE-specific\nmethods, such as NTK and SelfExtend, indicating RoPE's superiority over APE for\ncontext window extension. To facilitate future research, we release E5-Base-4k\nand E5-RoPE-Base, along with the LongEmbed benchmark.\n","authors":["Dawei Zhu","Liang Wang","Nan Yang","Yifan Song","Wenhao Wu","Furu Wei","Sujian Li"],"pdf_url":"https://arxiv.org/pdf/2404.12096v3.pdf","comment":"EMNLP 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2411.04557v1","updated":"2024-11-07T09:28:38Z","published":"2024-11-07T09:28:38Z","title":"Pruning Literals for Highly Efficient Explainability at Word Level","summary":" Designing an explainable model becomes crucial now for Natural Language\nProcessing(NLP) since most of the state-of-the-art machine learning models\nprovide a limited explanation for the prediction. In the spectrum of an\nexplainable model, Tsetlin Machine(TM) is promising because of its capability\nof providing word-level explanation using proposition logic. However, concern\nrises over the elaborated combination of literals (propositional logic) in the\nclause that makes the model difficult for humans to comprehend, despite having\na transparent learning process. In this paper, we design a post-hoc pruning of\nclauses that eliminate the randomly placed literals in the clause thereby\nmaking the model more efficiently interpretable than the vanilla TM.\nExperiments on the publicly available YELP-HAT Dataset demonstrate that the\nproposed pruned TM's attention map aligns more with the human attention map\nthan the vanilla TM's attention map. In addition, the pairwise similarity\nmeasure also surpasses the attention map-based neural network models. In terms\nof accuracy, the proposed pruning method does not degrade the accuracy\nsignificantly but rather enhances the performance up to 4% to 9% in some test\ndata.\n","authors":["Rohan Kumar Yadav","Bimal Bhattarai","Abhik Jana","Lei Jiao","Seid Muhie Yimam"],"pdf_url":"https://arxiv.org/pdf/2411.04557v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.04539v1","updated":"2024-11-07T08:54:46Z","published":"2024-11-07T08:54:46Z","title":"Best Practices for Distilling Large Language Models into BERT for Web\n Search Ranking","summary":" Recent studies have highlighted the significant potential of Large Language\nModels (LLMs) as zero-shot relevance rankers. These methods predominantly\nutilize prompt learning to assess the relevance between queries and documents\nby generating a ranked list of potential documents. 
Despite their promise, the\nsubstantial costs associated with LLMs pose a significant challenge for their\ndirect implementation in commercial search systems. To overcome this barrier\nand fully exploit the capabilities of LLMs for text ranking, we explore\ntechniques to transfer the ranking expertise of LLMs to a more compact model\nsimilar to BERT, using a ranking loss to enable the deployment of less\nresource-intensive models. Specifically, we enhance the training of LLMs\nthrough Continued Pre-Training, taking the query as input and the clicked title\nand summary as output. We then proceed with supervised fine-tuning of the LLM\nusing a rank loss, assigning the final token as a representative of the entire\nsentence. Given the inherent characteristics of autoregressive language models,\nonly the final token can encapsulate all preceding tokens. Additionally,\nwe introduce a hybrid point-wise and margin MSE loss to transfer the ranking\nknowledge from LLMs to smaller models like BERT. This method creates a viable\nsolution for environments with strict resource constraints. Both offline and\nonline evaluations have confirmed the efficacy of our approach, and our model\nhas been successfully integrated into a commercial web search engine as of\nFebruary 2024.\n","authors":["Dezhi Ye","Junwei Hu","Jiabin Fan","Bowen Tian","Jie Liu","Haijin Liang","Jin Ma"],"pdf_url":"https://arxiv.org/pdf/2411.04539v1.pdf","comment":"Arxiv Version"},{"id":"http://arxiv.org/abs/2411.04535v1","updated":"2024-11-07T08:48:33Z","published":"2024-11-07T08:48:33Z","title":"Meta-Reasoning Improves Tool Use in Large Language Models","summary":" External tools help large language models (LLMs) succeed at tasks where they\nwould otherwise typically fail. In existing frameworks, LLMs learn tool use\neither by in-context demonstrations or via full model fine-tuning on annotated\ndata. As these approaches do not easily scale, a recent trend is to abandon\nthem in favor of lightweight, parameter-efficient tuning paradigms. These\nmethods allow quickly alternating between the frozen LLM and its specialised\nfine-tuned version, by switching on or off a handful of additional custom\nparameters. Hence, we postulate that the generalization ability of the frozen\nmodel can be leveraged to improve tool selection. We present Tool selECTion via\nmeta-reasONing (TECTON), a two-phase system that first reasons over a task\nusing a custom fine-tuned LM head and outputs candidate tools. Then, with the\ncustom head disabled, it meta-reasons (i.e., it reasons over the previous\nreasoning process) to make a final choice. We show that TECTON results in\nsubstantial gains - both in-distribution and out-of-distribution - on a range\nof math reasoning datasets.\n","authors":["Lisa Alazraki","Marek Rei"],"pdf_url":"https://arxiv.org/pdf/2411.04535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14974v2","updated":"2024-11-07T08:41:03Z","published":"2024-05-23T18:21:59Z","title":"LOVA3: Learning to Visual Question Answering, Asking and Assessment","summary":" Question answering, asking, and assessment are three innate human traits\ncrucial for understanding the world and acquiring knowledge. By enhancing these\ncapabilities, humans can more effectively utilize data, leading to better\ncomprehension and learning outcomes. Current Multimodal Large Language Models\n(MLLMs) primarily focus on question answering, often neglecting the full\npotential of questioning and assessment skills. 
Inspired by the human learning\nmechanism, we introduce LOVA3, an innovative framework named \"Learning tO\nVisual question Answering, Asking and Assessment,\" designed to equip MLLMs with\nthese additional capabilities. Our approach involves the creation of two\nsupplementary training tasks GenQA and EvalQA, aiming at fostering the skills\nof asking and assessing questions in the context of images. To develop the\nquestioning ability, we compile a comprehensive set of multimodal foundational\ntasks. For assessment, we introduce a new benchmark called EvalQABench,\ncomprising 64,000 training samples (split evenly between positive and negative\nsamples) and 5,000 validation and testing samples. We posit that enhancing\nMLLMs with the capabilities to answer, ask, and assess questions will enhance\ntheir multimodal comprehension, ultimately improving overall performance. To\nvalidate this hypothesis, we train MLLMs using the LOVA3 framework and evaluate\nthem on a range of multimodal datasets and benchmarks. Our results demonstrate\nconsistent performance gains, underscoring the critical role of these\nadditional tasks in fostering comprehensive intelligence in MLLMs. The code is\navailable at https://github.com/showlab/LOVA3.\n","authors":["Henry Hengyuan Zhao","Pan Zhou","Difei Gao","Zechen Bai","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2405.14974v2.pdf","comment":"Accepted by NeurIPS 2024. The code is available at\n https://github.com/showlab/LOVA3"},{"id":"http://arxiv.org/abs/2411.04530v1","updated":"2024-11-07T08:38:32Z","published":"2024-11-07T08:38:32Z","title":"Tomato, Tomahto, Tomate: Measuring the Role of Shared Semantics among\n Subwords in Multilingual Language Models","summary":" Human understanding of language is robust to different word choices as far as\nthey represent similar semantic concepts. To what extent does our human\nintuition transfer to language models, which represent all subwords as distinct\nembeddings? In this work, we take an initial step on measuring the role of\nshared semantics among subwords in the encoder-only multilingual language\nmodels (mLMs). To this end, we form \"semantic tokens\" by merging the\nsemantically similar subwords and their embeddings, and evaluate the updated\nmLMs on 5 heterogeneous multilingual downstream tasks. Results show that the\ngeneral shared semantics could get the models a long way in making the\npredictions on mLMs with different tokenizers and model sizes. Inspections on\nthe grouped subwords show that they exhibit a wide range of semantic\nsimilarities, including synonyms and translations across many languages and\nscripts. Lastly, we found the zero-shot results with semantic tokens are on par\nor even better than the original models on certain classification tasks,\nsuggesting that the shared subword-level semantics may serve as the anchors for\ncross-lingual transferring.\n","authors":["Xinyu Zhang","Jing Lu","Vinh Q. Tran","Tal Schuster","Donald Metzler","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2411.04530v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.17382v2","updated":"2024-11-07T08:34:27Z","published":"2024-05-27T17:38:33Z","title":"ReMoDetect: Reward Models Recognize Aligned LLM's Generations","summary":" The remarkable capabilities and easy accessibility of large language models\n(LLMs) have significantly increased societal risks (e.g., fake news\ngeneration), necessitating the development of LLM-generated text (LGT)\ndetection methods for safe usage. 
However, detecting LGTs is challenging due to\nthe vast number of LLMs, making it impractical to account for each LLM\nindividually; hence, it is crucial to identify the common characteristics\nshared by these models. In this paper, we draw attention to a common feature of\nrecent powerful LLMs, namely the alignment training, i.e., training LLMs to\ngenerate human-preferable texts. Our key finding is that as these aligned LLMs\nare trained to maximize the human preferences, they generate texts with higher\nestimated preferences even than human-written texts; thus, such texts are\neasily detected by using the reward model (i.e., an LLM trained to model human\npreference distribution). Based on this finding, we propose two training\nschemes to further improve the detection ability of the reward model, namely\n(i) continual preference fine-tuning to make the reward model prefer aligned\nLGTs even further and (ii) reward modeling of Human/LLM mixed texts (a\nrephrased texts from human-written texts using aligned LLMs), which serves as a\nmedian preference text corpus between LGTs and human-written texts to learn the\ndecision boundary better. We provide an extensive evaluation by considering six\ntext domains across twelve aligned LLMs, where our method demonstrates\nstate-of-the-art results. Code is available at\nhttps://github.com/hyunseoklee-ai/ReMoDetect.\n","authors":["Hyunseok Lee","Jihoon Tack","Jinwoo Shin"],"pdf_url":"https://arxiv.org/pdf/2405.17382v2.pdf","comment":"Published as a conference proceeding for NeurIPS 2024"},{"id":"http://arxiv.org/abs/2308.13149v2","updated":"2024-11-07T08:20:46Z","published":"2023-08-25T03:05:33Z","title":"SciEval: A Multi-Level Large Language Model Evaluation Benchmark for\n Scientific Research","summary":" Recently, there has been growing interest in using Large Language Models\n(LLMs) for scientific research. Numerous benchmarks have been proposed to\nevaluate the ability of LLMs for scientific research. However, current\nbenchmarks are mostly based on pre-collected objective questions. This design\nsuffers from data leakage problem and lacks the evaluation of subjective Q/A\nability. In this paper, we propose SciEval, a comprehensive and\nmulti-disciplinary evaluation benchmark to address these issues. Based on\nBloom's taxonomy, SciEval covers four dimensions to systematically evaluate\nscientific research ability. In particular, we design a \"dynamic\" subset based\non scientific principles to prevent evaluation from potential data leakage.\nBoth objective and subjective questions are included in SciEval. These\ncharacteristics make SciEval a more effective benchmark for scientific research\nability evaluation of LLMs. Comprehensive experiments on most advanced LLMs\nshow that, although GPT-4 achieves SOTA performance compared to other LLMs,\nthere is still substantial room for improvement, especially for dynamic\nquestions. The codes and data are publicly available on\nhttps://github.com/OpenDFM/SciEval.\n","authors":["Liangtai Sun","Yang Han","Zihan Zhao","Da Ma","Zhennan Shen","Baocai Chen","Lu Chen","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2308.13149v2.pdf","comment":"12 pages, 17 figures, 12 tables. 
Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2411.04496v1","updated":"2024-11-07T07:46:06Z","published":"2024-11-07T07:46:06Z","title":"Thanos: Enhancing Conversational Agents with Skill-of-Mind-Infused Large\n Language Model","summary":" To increase social bonding with interlocutors, humans naturally acquire the\nability to respond appropriately in a given situation by considering which\nconversational skill is most suitable for the response - a process we call\nskill-of-mind. For large language model (LLM)-based conversational agents,\nplanning appropriate conversational skills, as humans do, is challenging due to\nthe complexity of social dialogue, especially in interactive scenarios. To\naddress this, we propose a skill-of-mind-annotated conversation dataset, named\nMultifaceted Skill-of-Mind, which includes multi-turn and multifaceted\nconversational skills across various interactive scenarios (e.g., long-term,\ncounseling, task-oriented), grounded in diverse social contexts (e.g.,\ndemographics, persona, rules of thumb). This dataset consists of roughly 100K\nconversations. Using this dataset, we introduce a new family of\nskill-of-mind-infused LLMs, named Thanos, with model sizes of 1B, 3B, and 8B\nparameters. With extensive experiments, these models successfully demonstrate\nthe skill-of-mind process and exhibit strong generalizability in inferring\nmultifaceted skills across a variety of domains. Moreover, we show that Thanos\nsignificantly enhances the quality of responses generated by LLM-based\nconversational agents and promotes prosocial behavior in human evaluations.\n","authors":["Young-Jun Lee","Dokyong Lee","Junyoung Youn","Kyeongjin Oh","Ho-Jin Choi"],"pdf_url":"https://arxiv.org/pdf/2411.04496v1.pdf","comment":"Code: https://github.com/passing2961/Thanos"},{"id":"http://arxiv.org/abs/2410.14979v3","updated":"2024-11-07T07:25:04Z","published":"2024-10-19T05:01:56Z","title":"Do Large Language Models Truly Grasp Mathematics? An Empirical\n Exploration From A Psychological Perspective","summary":" Despite their proficiency in math tasks, the mechanisms underlying LLMs'\nmathematical reasoning abilities remain a subject of debate. Recent studies\nsuggest that chain-of-thought (CoT) prompts can bolster mathematical reasoning\nby encouraging LLMs to employ human-like logical reasoning (System 2), enabling\nthem to excel on the Cognitive Reflection Test (CRT). To assess whether LLMs\ngenuinely possess System 2-like logical reasoning, we introduced targeted\nmodifications to CRT problems. Our findings reveal that, despite the use of CoT\nprompts, mainstream LLMs, including the latest o1-preview model, continue to\nexhibit a significant error rate. Further analysis indicates that they\npredominantly rely on System 1-like intuitive reasoning and pattern matching\nderived from training data, rather than demonstrating mastery of mathematical\nthinking. 
This discovery challenges the prevailing notion that LLMs possess\ngenuine logical reasoning abilities and that CoT can enhance them.\nConsequently, this work may temper overly optimistic projections regarding\nLLMs' advancement toward artificial general intelligence.\n","authors":["Wei Xie","Shuoyoucheng Ma","Zhenhua Wang","Enze Wang","Kai Chen","Xiaobing Sun","Baosheng Wang"],"pdf_url":"https://arxiv.org/pdf/2410.14979v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04090v2","updated":"2024-11-07T07:12:45Z","published":"2024-11-06T18:08:57Z","title":"A Collaborative Content Moderation Framework for Toxicity Detection\n based on Conformalized Estimates of Annotation Disagreement","summary":" Content moderation typically combines the efforts of human moderators and\nmachine learning models. However, these systems often rely on data where\nsignificant disagreement occurs during moderation, reflecting the subjective\nnature of toxicity perception. Rather than dismissing this disagreement as\nnoise, we interpret it as a valuable signal that highlights the inherent\nambiguity of the content,an insight missed when only the majority label is\nconsidered. In this work, we introduce a novel content moderation framework\nthat emphasizes the importance of capturing annotation disagreement. Our\napproach uses multitask learning, where toxicity classification serves as the\nprimary task and annotation disagreement is addressed as an auxiliary task.\nAdditionally, we leverage uncertainty estimation techniques, specifically\nConformal Prediction, to account for both the ambiguity in comment annotations\nand the model's inherent uncertainty in predicting toxicity and\ndisagreement.The framework also allows moderators to adjust thresholds for\nannotation disagreement, offering flexibility in determining when ambiguity\nshould trigger a review. We demonstrate that our joint approach enhances model\nperformance, calibration, and uncertainty estimation, while offering greater\nparameter efficiency and improving the review process in comparison to\nsingle-task methods.\n","authors":["Guillermo Villate-Castillo","Javier Del Ser","Borja Sanz"],"pdf_url":"https://arxiv.org/pdf/2411.04090v2.pdf","comment":"35 pages, 1 figure"},{"id":"http://arxiv.org/abs/2406.11709v4","updated":"2024-11-07T07:00:14Z","published":"2024-06-17T16:28:21Z","title":"Instruct, Not Assist: LLM-based Multi-Turn Planning and Hierarchical\n Questioning for Socratic Code Debugging","summary":" Socratic questioning is an effective teaching strategy, encouraging critical\nthinking and problem-solving. The conversational capabilities of large language\nmodels (LLMs) show great potential for providing scalable, real-time student\nguidance. However, current LLMs often give away solutions directly, making them\nineffective instructors. We tackle this issue in the code debugging domain with\nTreeInstruct, an Instructor agent guided by a novel state space-based planning\nalgorithm. TreeInstruct asks probing questions to help students independently\nidentify and resolve errors. It estimates a student's conceptual and\nsyntactical knowledge to dynamically construct a question tree based on their\nresponses and current knowledge state, effectively addressing both independent\nand dependent mistakes concurrently in a multi-turn interaction setting. 
In\naddition to using an existing single-bug debugging benchmark, we construct a\nmore challenging multi-bug dataset of 150 coding problems, incorrect solutions,\nand bug fixes -- all carefully constructed and annotated by experts. Extensive\nevaluation shows TreeInstruct's state-of-the-art performance on both datasets,\nproving it to be a more effective instructor than baselines. Furthermore, a\nreal-world case study with five students of varying skill levels further\ndemonstrates TreeInstruct's ability to guide students to debug their code\nefficiently with minimal turns and highly Socratic questioning.\n","authors":["Priyanka Kargupta","Ishika Agarwal","Dilek Hakkani-Tur","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2406.11709v4.pdf","comment":"Code available at: https://github.com/agarwalishika/TreeInstruct\n Accepted at EMNLP'24 Findings"},{"id":"http://arxiv.org/abs/2411.04473v1","updated":"2024-11-07T06:51:24Z","published":"2024-11-07T06:51:24Z","title":"ML-Promise: A Multilingual Dataset for Corporate Promise Verification","summary":" Promises made by politicians, corporate leaders, and public figures have a\nsignificant impact on public perception, trust, and institutional reputation.\nHowever, the complexity and volume of such commitments, coupled with\ndifficulties in verifying their fulfillment, necessitate innovative methods for\nassessing their credibility. This paper introduces the concept of Promise\nVerification, a systematic approach involving steps such as promise\nidentification, evidence assessment, and the evaluation of timing for\nverification. We propose the first multilingual dataset, ML-Promise, which\nincludes English, French, Chinese, Japanese, and Korean, aimed at facilitating\nin-depth verification of promises, particularly in the context of\nEnvironmental, Social, and Governance (ESG) reports. Given the growing emphasis\non corporate environmental contributions, this dataset addresses the challenge\nof evaluating corporate promises, especially in light of practices like\ngreenwashing. Our findings also explore textual and image-based baselines, with\npromising results from retrieval-augmented generation (RAG) approaches. This\nwork aims to foster further discourse on the accountability of public\ncommitments across multiple languages and domains.\n","authors":["Yohei Seki","Hakusen Shu","Anaïs Lhuissier","Hanwool Lee","Juyeon Kang","Min-Yuh Day","Chung-Chi Chen"],"pdf_url":"https://arxiv.org/pdf/2411.04473v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2410.04070v5","updated":"2024-11-07T06:21:14Z","published":"2024-10-05T08:00:55Z","title":"PAD: Personalized Alignment of LLMs at Decoding-Time","summary":" Aligning with personalized preferences, which vary significantly across\ncultural, educational, and political differences, poses a significant challenge\ndue to the computational costs and data demands of traditional alignment\nmethods. In response, this paper presents Personalized Alignment at\nDecoding-time (PAD), a novel framework designed to align LLM outputs with\ndiverse personalized preferences during the inference phase, eliminating the\nneed for additional training. By introducing a unique personalized reward\nmodeling strategy, this framework decouples the text generation process from\npersonalized preferences, facilitating the generation of generalizable\ntoken-level personalized rewards. 
The PAD algorithm leverages these rewards to\nguide the decoding process, dynamically tailoring the base model's predictions\nto personalized preferences. Extensive experimental results demonstrate that\nPAD not only outperforms existing training-based alignment methods in terms of\naligning with diverse preferences but also shows significant generalizability\nto preferences unseen during training and scalability across different base\nmodels. This work advances the capability of LLMs to meet user needs in\nreal-time applications, presenting a substantial step forward in personalized\nLLM alignment.\n","authors":["Ruizhe Chen","Xiaotian Zhang","Meng Luo","Wenhao Chai","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2410.04070v5.pdf","comment":"This paper presents Personalized Alignment at Decoding-time (PAD), a\n novel framework designed to align LLM outputs with diverse personalized\n preferences during the inference phase"},{"id":"http://arxiv.org/abs/2411.02887v2","updated":"2024-11-07T05:46:42Z","published":"2024-11-05T07:59:22Z","title":"The Translation of Circumlocution in Arabic Short Stories into English","summary":" This study investigates the translation of circumlocution from Arabic to\nEnglish in a corpus of short stories by renowned Arabic authors. By analyzing\nthe source and target texts, the study aims to identify and categorize\ncircumlocution instances in Arabic and their corresponding renditions in\nEnglish. The study employs Nida's (1964) translation theory as a framework to\nassess the appropriateness of the translation strategies employed. It examines\nthe extent to which translators successfully rendered Arabic circumlocution\ninto English, identifying potential challenges and limitations in the\ntranslation process. The findings reveal significant similarities between\nArabic circumlocution categories and English metadiscourse categories,\nparticularly in terms of textual and interpersonal functions. However, the\nstudy also highlights instances where translators encountered difficulties in\naccurately conveying the nuances of circumlocution, often resorting to\nstrategies like addition, subtraction, and alteration.https://ntu.edu.iq/\n","authors":["Dalal Waadallah Shehab"],"pdf_url":"https://arxiv.org/pdf/2411.02887v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04448v1","updated":"2024-11-07T05:43:50Z","published":"2024-11-07T05:43:50Z","title":"Gradient Localization Improves Lifelong Pretraining of Language Models","summary":" Large Language Models (LLMs) trained on web-scale text corpora have been\nshown to capture world knowledge in their parameters. However, the mechanism by\nwhich language models store different types of knowledge is poorly understood.\nIn this work, we examine two types of knowledge relating to temporally\nsensitive entities and demonstrate that each type is localized to different\nsets of parameters within the LLMs. We hypothesize that the lack of\nconsideration of the locality of knowledge in existing continual learning\nmethods contributes to both: the failed uptake of new information, and\ncatastrophic forgetting of previously learned information. We observe that\nsequences containing references to updated and newly mentioned entities exhibit\nlarger gradient norms in a subset of layers. 
We demonstrate that targeting\nparameter updates to these relevant layers can improve the performance of\ncontinually pretraining on language containing temporal drift.\n","authors":["Jared Fernandez","Yonatan Bisk","Emma Strubell"],"pdf_url":"https://arxiv.org/pdf/2411.04448v1.pdf","comment":"EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2409.18412v2","updated":"2024-11-07T05:38:31Z","published":"2024-09-27T03:00:29Z","title":"SciDFM: A Large Language Model with Mixture-of-Experts for Science","summary":" Recently, there has been a significant upsurge of interest in leveraging\nlarge language models (LLMs) to assist scientific discovery. However, most LLMs\nonly focus on general science, while they lack domain-specific knowledge, such\nas chemical molecules and amino acid sequences. To bridge these gaps, we\nintroduce SciDFM, a mixture-of-experts LLM, which is trained from scratch and\nis able to conduct college-level scientific reasoning and understand molecules\nand amino acid sequences. We collect a large-scale training corpus containing\nnumerous scientific papers and books from different disciplines as well as data\nfrom domain-specific databases. We further fine-tune the pre-trained model on\nlots of instruction data to improve performances on downstream benchmarks. From\nexperiment results, we show that SciDFM achieves strong performance on general\nscientific benchmarks such as SciEval and SciQ, and it reaches a SOTA\nperformance on domain-specific benchmarks among models of similar size. We\nfurther analyze the expert layers and show that the results of expert selection\nvary with data from different disciplines. To benefit the broader research\ncommunity, we open-source SciDFM at\nhttps://huggingface.co/OpenDFM/SciDFM-MoE-A5.6B-v1.0.\n","authors":["Liangtai Sun","Danyu Luo","Da Ma","Zihan Zhao","Baocai Chen","Zhennan Shen","Su Zhu","Lu Chen","Xin Chen","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2409.18412v2.pdf","comment":"12 pages, 1 figure, 9 tables. Technical Report, accepted by NeurIPS\n 2024 Workshop FM4Science"},{"id":"http://arxiv.org/abs/2411.04443v1","updated":"2024-11-07T05:35:39Z","published":"2024-11-07T05:35:39Z","title":"ACCIO: Table Understanding Enhanced via Contrastive Learning with\n Aggregations","summary":" The attention to table understanding using recent natural language models has\nbeen growing. However, most related works tend to focus on learning the\nstructure of the table directly. Just as humans improve their understanding of\nsentences by comparing them, they can also enhance their understanding by\ncomparing tables. With this idea, in this paper, we introduce ACCIO, tAble\nunderstanding enhanCed via Contrastive learnIng with aggregatiOns, a novel\napproach to enhancing table understanding by contrasting original tables with\ntheir pivot summaries through contrastive learning. ACCIO trains an encoder to\nbring these table pairs closer together. Through validation via column type\nannotation, ACCIO achieves competitive performance with a macro F1 score of\n91.1 compared to state-of-the-art methods. This work represents the first\nattempt to utilize pairs of tables for table embedding, promising significant\nadvancements in table comprehension. 
Our code is available at\nhttps://github.com/whnhch/ACCIO/.\n","authors":["Whanhee Cho"],"pdf_url":"https://arxiv.org/pdf/2411.04443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18032v2","updated":"2024-11-07T05:10:20Z","published":"2024-10-23T17:02:59Z","title":"GraphTeam: Facilitating Large Language Model-based Graph Analysis via\n Multi-Agent Collaboration","summary":" Graphs are widely used for modeling relational data in real-world scenarios,\nsuch as social networks and urban computing. Existing LLM-based graph analysis\napproaches either integrate graph neural networks (GNNs) for specific machine\nlearning tasks, limiting their transferability, or rely solely on LLMs'\ninternal reasoning ability, resulting in suboptimal performance. To address\nthese limitations, we take advantage of recent advances in LLM-based agents,\nwhich have shown capabilities of utilizing external knowledge or tools for\nproblem solving. By simulating human problem-solving strategies such as analogy\nand collaboration, we propose a multi-agent system based on LLMs named\nGraphTeam, for graph analysis. GraphTeam consists of five LLM-based agents from\nthree modules, and the agents with different specialities can collaborate with\neach other to address complex problems. Specifically, (1) input-output\nnormalization module: the question agent extracts and refines four key\narguments from the original question, facilitating the problem understanding,\nand the answer agent organizes the results to meet the output requirement; (2)\nexternal knowledge retrieval module: we first build a knowledge base consisting\nof relevant documentation and experience information, and then the search agent\nretrieves the most relevant entries for each question. (3) problem-solving\nmodule: given the retrieved information from search agent, the coding agent\nuses established algorithms via programming to generate solutions, and in case\nthe coding agent does not work, the reasoning agent will directly compute the\nresults without programming. Extensive experiments on six graph analysis\nbenchmarks demonstrate that GraphTeam achieves state-of-the-art performance\nwith an average 25.85% improvement over the best baseline in terms of accuracy.\nThe code and data are available at https://github.com/BUPT-GAMMA/GraphTeam.\n","authors":["Xin Li","Qizhi Chu","Yubin Chen","Yang Liu","Yaoqi Liu","Zekai Yu","Weize Chen","Chen Qian","Chuan Shi","Cheng Yang"],"pdf_url":"https://arxiv.org/pdf/2410.18032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02449v3","updated":"2024-11-07T04:55:29Z","published":"2024-09-04T05:08:23Z","title":"What is lost in Normalization? Exploring Pitfalls in Multilingual ASR\n Model Evaluations","summary":" This paper explores the pitfalls in evaluating multilingual automatic speech\nrecognition (ASR) models, with a particular focus on Indic language scripts. We\ninvestigate the text normalization routine employed by leading ASR models,\nincluding OpenAI Whisper, Meta's MMS, Seamless, and Assembly AI's Conformer,\nand their unintended consequences on performance metrics. Our research reveals\nthat current text normalization practices, while aiming to standardize ASR\noutputs for fair comparison, by removing inconsistencies such as variations in\nspelling, punctuation, and special characters, are fundamentally flawed when\napplied to Indic scripts. 
Through empirical analysis using text similarity\nscores and in-depth linguistic examination, we demonstrate that these flaws\nlead to artificially improved performance metrics for Indic languages. We\nconclude by proposing a shift towards developing text normalization routines\nthat leverage native linguistic expertise, ensuring more robust and accurate\nevaluations of multilingual ASR models.\n","authors":["Kavya Manohar","Leena G Pillai","Elizabeth Sherly"],"pdf_url":"https://arxiv.org/pdf/2409.02449v3.pdf","comment":"Accepted to EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2411.04427v1","updated":"2024-11-07T04:38:58Z","published":"2024-11-07T04:38:58Z","title":"One fish, two fish, but not the whole sea: Alignment reduces language\n models' conceptual diversity","summary":" Researchers in social science and psychology have recently proposed using\nlarge language models (LLMs) as replacements for humans in behavioral research.\nIn addition to arguments about whether LLMs accurately capture population-level\npatterns, this has raised questions about whether LLMs capture human-like\nconceptual diversity. Separately, it is debated whether post-training alignment\n(RLHF or RLAIF) affects models' internal diversity. Inspired by human studies,\nwe use a new way of measuring the conceptual diversity of\nsynthetically-generated LLM \"populations\" by relating the internal variability\nof simulated individuals to the population-level variability. We use this\napproach to evaluate non-aligned and aligned LLMs on two domains with rich\nhuman behavioral data. While no model reaches human-like diversity, aligned\nmodels generally display less diversity than their instruction fine-tuned\ncounterparts. Our findings highlight potential trade-offs between increasing\nmodels' value alignment and decreasing the diversity of their conceptual\nrepresentations.\n","authors":["Sonia K. Murthy","Tomer Ullman","Jennifer Hu"],"pdf_url":"https://arxiv.org/pdf/2411.04427v1.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2305.15265v4","updated":"2024-11-07T04:38:33Z","published":"2023-05-24T15:52:08Z","title":"Winner-Take-All Column Row Sampling for Memory Efficient Adaptation of\n Language Model","summary":" With the rapid growth in model size, fine-tuning the large pre-trained\nlanguage model has become increasingly difficult due to its extensive memory\nusage. Previous works usually focus on reducing the number of trainable\nparameters in the network. While the model parameters do contribute to memory\nusage, the primary memory bottleneck during training arises from storing\nfeature maps, also known as activations, as they are crucial for gradient\ncalculation. Notably, neural networks are usually trained using stochastic\ngradient descent. We argue that in stochastic optimization, models can handle\nnoisy gradients as long as the gradient estimator is unbiased with reasonable\nvariance. Following this motivation, we propose a new family of unbiased\nestimators called WTA-CRS, for matrix production with reduced variance, which\nonly requires storing the sub-sampled activations for calculating the gradient.\nOur work provides both theoretical and experimental evidence that, in the\ncontext of tuning transformers, our proposed estimators exhibit lower variance\ncompared to existing ones. 
By replacing the linear operation with our\napproximated one in transformers, we can achieve up to 2.7$\\times$ peak memory\nreduction with almost no accuracy drop and enable up to $6.4\\times$ larger\nbatch size. Under the same hardware, WTA-CRS enables better downstream task\nperformance by applying larger models and/or faster training speed with larger\nbatch sizes.\n","authors":["Zirui Liu","Guanchu Wang","Shaochen Zhong","Zhaozhuo Xu","Daochen Zha","Ruixiang Tang","Zhimeng Jiang","Kaixiong Zhou","Vipin Chaudhary","Shuai Xu","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2305.15265v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04425v1","updated":"2024-11-07T04:38:29Z","published":"2024-11-07T04:38:29Z","title":"DELIFT: Data Efficient Language model Instruction Fine Tuning","summary":" Fine-tuning large language models (LLMs) is essential for enhancing their\nperformance on specific tasks but is often resource-intensive due to redundant\nor uninformative data. To address this inefficiency, we introduce DELIFT (Data\nEfficient Language model Instruction Fine-Tuning), a novel algorithm that\nsystematically optimizes data selection across the three key stages of\nfine-tuning: (1) instruction tuning, (2) task-specific fine-tuning (e.g.,\nreasoning, question-answering), and (3) continual fine-tuning (e.g.,\nincorporating new data versions). Unlike existing methods that focus on\nsingle-stage optimization or rely on computationally intensive gradient\ncalculations, DELIFT operates efficiently across all stages. Central to our\napproach is a pairwise utility metric that quantifies how beneficial a data\nsample is for improving the model's responses to other samples, effectively\nmeasuring the informational value relative to the model's current capabilities.\nBy leveraging different submodular functions applied to this metric, DELIFT\nselects diverse and optimal subsets that are useful across all stages of\nfine-tuning. Experiments across various tasks and model scales demonstrate that\nDELIFT can reduce the fine-tuning data size by up to 70% without compromising\nperformance, offering significant computational savings and outperforming\nexisting methods in both efficiency and efficacy.\n","authors":["Ishika Agarwal","Krishna Killamsetty","Lucian Popa","Marina Danilevksy"],"pdf_url":"https://arxiv.org/pdf/2411.04425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04424v1","updated":"2024-11-07T04:32:40Z","published":"2024-11-07T04:32:40Z","title":"Bayesian Calibration of Win Rate Estimation with LLM Evaluators","summary":" Recent advances in large language models (LLMs) show the potential of using\nLLMs as evaluators for assessing the quality of text generations from LLMs.\nHowever, applying LLM evaluators naively to compare or judge between different\nsystems can lead to unreliable results due to the intrinsic win rate estimation\nbias of LLM evaluators. In order to mitigate this problem, we propose two\ncalibration methods, Bayesian Win Rate Sampling (BWRS) and Bayesian\nDawid-Skene, both of which leverage Bayesian inference to more accurately infer\nthe true win rate of generative language models. We empirically validate our\nmethods on six datasets covering story generation, summarization, and\ninstruction following tasks. 
We show that both our methods are effective in\nimproving the accuracy of win rate estimation using LLMs as evaluators,\noffering a promising direction for reliable automatic text quality evaluation.\n","authors":["Yicheng Gao","Gonghan Xu","Zhe Wang","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2411.04424v1.pdf","comment":"Accepted by EMNLP 2024"},{"id":"http://arxiv.org/abs/2411.04421v1","updated":"2024-11-07T04:17:30Z","published":"2024-11-07T04:17:30Z","title":"Variational Low-Rank Adaptation Using IVON","summary":" We show that variational learning can significantly improve the accuracy and\ncalibration of Low-Rank Adaptation (LoRA) without a substantial increase in the\ncost. We replace AdamW by the Improved Variational Online Newton (IVON)\nalgorithm to finetune large language models. For Llama-2 with 7 billion\nparameters, IVON improves the accuracy over AdamW by 2.8% and expected\ncalibration error by 4.6%. The accuracy is also better than the other Bayesian\nalternatives, yet the cost is lower and the implementation is easier. Our work\nprovides additional evidence for the effectiveness of IVON for large language\nmodels. The code is available at\nhttps://github.com/team-approx-bayes/ivon-lora.\n","authors":["Bai Cong","Nico Daheim","Yuesong Shen","Daniel Cremers","Rio Yokota","Mohammad Emtiyaz Khan","Thomas Möllenhoff"],"pdf_url":"https://arxiv.org/pdf/2411.04421v1.pdf","comment":"Published at 38th Workshop on Fine-Tuning in Machine Learning\n (NeurIPS 2024). Code available at\n https://github.com/team-approx-bayes/ivon-lora"},{"id":"http://arxiv.org/abs/2406.18064v3","updated":"2024-11-07T04:03:04Z","published":"2024-06-26T04:49:41Z","title":"Evaluating Quality of Answers for Retrieval-Augmented Generation: A\n Strong LLM Is All You Need","summary":" We present a comprehensive study of answer quality evaluation in\nRetrieval-Augmented Generation (RAG) applications using vRAG-Eval, a novel\ngrading system that is designed to assess correctness, completeness, and\nhonesty. We further map the grading of quality aspects aforementioned into a\nbinary score, indicating an accept or reject decision, mirroring the intuitive\n\"thumbs-up\" or \"thumbs-down\" gesture commonly used in chat applications. This\napproach suits factual business contexts where a clear decision opinion is\nessential. Our assessment applies vRAG-Eval to two Large Language Models\n(LLMs), evaluating the quality of answers generated by a vanilla RAG\napplication. We compare these evaluations with human expert judgments and find\na substantial alignment between GPT-4's assessments and those of human experts,\nreaching 83% agreement on accept or reject decisions. This study highlights the\npotential of LLMs as reliable evaluators in closed-domain, closed-ended\nsettings, particularly when human evaluations require significant resources.\n","authors":["Yang Wang","Alberto Garcia Hernandez","Roman Kyslyi","Nicholas Kersting"],"pdf_url":"https://arxiv.org/pdf/2406.18064v3.pdf","comment":"13 pages, 8 figures, 12 tables"},{"id":"http://arxiv.org/abs/2411.04105v2","updated":"2024-11-07T03:50:19Z","published":"2024-11-06T18:35:32Z","title":"How Transformers Solve Propositional Logic Problems: A Mechanistic\n Analysis","summary":" Large language models (LLMs) have shown amazing performance on tasks that\nrequire planning and reasoning. Motivated by this, we investigate the internal\nmechanisms that underpin a network's ability to perform complex logical\nreasoning. 
We first construct a synthetic propositional logic problem that\nserves as a concrete test-bed for network training and evaluation. Crucially,\nthis problem demands nontrivial planning to solve, but we can train a small\ntransformer to achieve perfect accuracy. Building on our set-up, we then pursue\nan understanding of precisely how a three-layer transformer, trained from\nscratch, solves this problem. We are able to identify certain \"planning\" and\n\"reasoning\" circuits in the network that necessitate cooperation between the\nattention blocks to implement the desired logic. To expand our findings, we\nthen study a larger model, Mistral 7B. Using activation patching, we\ncharacterize internal components that are critical in solving our logic\nproblem. Overall, our work systemically uncovers novel aspects of small and\nlarge transformers, and continues the study of how they plan and reason.\n","authors":["Guan Zhe Hong","Nishanth Dikkala","Enming Luo","Cyrus Rashtchian","Xin Wang","Rina Panigrahy"],"pdf_url":"https://arxiv.org/pdf/2411.04105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07930v4","updated":"2024-11-07T03:37:51Z","published":"2024-08-15T04:57:55Z","title":"MAG-SQL: Multi-Agent Generative Approach with Soft Schema Linking and\n Iterative Sub-SQL Refinement for Text-to-SQL","summary":" Recent In-Context Learning based methods have achieved remarkable success in\nthe Text-to-SQL task. However, there is still a large gap between the performance\nof these models and human performance on datasets with complex database schema\nand difficult questions, such as BIRD. Besides, existing work has neglected to\nsupervise intermediate steps when solving questions iteratively with question\ndecomposition methods, and the schema linking methods used in these works are\nvery rudimentary. To address these issues, we propose MAG-SQL, a multi-agent\ngenerative approach with soft schema linking and iterative Sub-SQL refinement.\nIn our framework, an entity-based method with tables' summary is used to select\nthe columns in the database, and a novel targets-conditions decomposition method is\nintroduced to decompose those complex questions. Additionally, we build an\niterative generating module which includes a Sub-SQL Generator and Sub-SQL\nRefiner, introducing external oversight for each step of generation. Through a\nseries of ablation studies, the effectiveness of each agent in our framework\nhas been demonstrated. When evaluated on the BIRD benchmark with GPT-4, MAG-SQL\nachieves an execution accuracy of 61.08%, compared to the baseline accuracy of\n46.35% for vanilla GPT-4 and the baseline accuracy of 57.56% for MAC-SQL.\nBesides, our approach makes similar progress on Spider. The codes are available\nat https://github.com/LancelotXWX/MAG-SQL.\n","authors":["Wenxuan Xie","Gaochen Wu","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07930v4.pdf","comment":"22 pages, 14 figures"},{"id":"http://arxiv.org/abs/2407.15186v4","updated":"2024-11-07T03:26:58Z","published":"2024-07-21T14:48:23Z","title":"A Survey on Employing Large Language Models for Text-to-SQL Tasks","summary":" The increasing volume of data in relational databases and the expertise\nneeded for writing SQL queries pose challenges for users to access and analyze\ndata. Text-to-SQL (Text2SQL) solves the issues by utilizing natural language\nprocessing (NLP) techniques to convert natural language into SQL queries. With\nthe development of Large Language Models (LLMs), a range of LLM-based Text2SQL\nmethods have emerged. 
This survey provides a comprehensive review of LLMs in\nText2SQL tasks. We review benchmark datasets, prompt engineering methods,\nfine-tuning methods, and base models in LLM-based Text2SQL methods. We provide\ninsights in each part and discuss future directions in this field.\n","authors":["Liang Shi","Zhengju Tang","Nan Zhang","Xiaotong Zhang","Zhi Yang"],"pdf_url":"https://arxiv.org/pdf/2407.15186v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11484v8","updated":"2024-11-07T03:23:16Z","published":"2024-07-16T08:20:39Z","title":"The Oscars of AI Theater: A Survey on Role-Playing with Language Models","summary":" This survey explores the burgeoning field of role-playing with language\nmodels, focusing on their development from early persona-based models to\nadvanced character-driven simulations facilitated by Large Language Models\n(LLMs). Initially confined to simple persona consistency due to limited model\ncapabilities, role-playing tasks have now expanded to embrace complex character\nportrayals involving character consistency, behavioral alignment, and overall\nattractiveness. We provide a comprehensive taxonomy of the critical components\nin designing these systems, including data, models and alignment, agent\narchitecture and evaluation. This survey not only outlines the current\nmethodologies and challenges, such as managing dynamic personal profiles and\nachieving high-level persona consistency but also suggests avenues for future\nresearch in improving the depth and realism of role-playing applications. The\ngoal is to guide future research by offering a structured overview of current\nmethodologies and identifying potential areas for improvement. Related\nresources and papers are available at\nhttps://github.com/nuochenpku/Awesome-Role-Play-Papers.\n","authors":["Nuo Chen","Yan Wang","Yang Deng","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2407.11484v8.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2411.02603v3","updated":"2024-11-07T03:17:42Z","published":"2024-11-04T20:53:04Z","title":"FactTest: Factuality Testing in Large Language Models with Finite-Sample\n and Distribution-Free Guarantees","summary":" The propensity of Large Language Models (LLMs) to generate hallucinations and\nnon-factual content undermines their reliability in high-stakes domains, where\nrigorous control over Type I errors (the conditional probability of incorrectly\nclassifying hallucinations as truthful content) is essential. Despite its\nimportance, formal verification of LLM factuality with such guarantees remains\nlargely unexplored. In this paper, we introduce FactTest, a novel framework\nthat statistically assesses whether a LLM can confidently provide correct\nanswers to given questions with high-probability correctness guarantees. We\nformulate factuality testing as hypothesis testing problem to enforce an upper\nbound of Type I errors at user-specified significance levels. Notably, we prove\nthat our framework also ensures strong Type II error control under mild\nconditions and can be extended to maintain its effectiveness when covariate\nshifts exist. Our approach is distribution-free and works for any number of\nhuman-annotated samples. It is model-agnostic and applies to any black-box or\nwhite-box LM. 
Extensive experiments on question-answering (QA) and\nmultiple-choice benchmarks demonstrate that FactTest effectively detects\nhallucinations and improves the model's ability to abstain from answering\nunknown questions, leading to an over 40% accuracy improvement.\n","authors":["Fan Nie","Xiaotian Hou","Shuhang Lin","James Zou","Huaxiu Yao","Linjun Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.02603v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17767v2","updated":"2024-11-07T03:16:02Z","published":"2024-05-28T02:46:11Z","title":"Linguistic Collapse: Neural Collapse in (Large) Language Models","summary":" Neural collapse ($\\mathcal{NC}$) is a phenomenon observed in classification\ntasks where top-layer representations collapse into their class means, which\nbecome equinorm, equiangular and aligned with the classifiers. These behaviors\n-- associated with generalization and robustness -- would manifest under\nspecific conditions: models are trained towards zero loss, with noise-free\nlabels belonging to balanced classes, which do not outnumber the model's hidden\ndimension. Recent studies have explored $\\mathcal{NC}$ in the absence of one or\nmore of these conditions to extend and capitalize on the associated benefits of\nideal geometries. Language modeling presents a curious frontier, as\n\\textit{training by token prediction} constitutes a classification task where\nnone of the conditions exist: the vocabulary is imbalanced and exceeds the\nembedding dimension; different tokens might correspond to similar contextual\nembeddings; and large language models (LLMs) in particular are typically only\ntrained for a few epochs. This paper empirically investigates the impact of\nscaling the architectures and training of causal language models (CLMs) on\ntheir progression towards $\\mathcal{NC}$. We find that $\\mathcal{NC}$\nproperties that develop with scale (and regularization) are linked to\ngeneralization. Moreover, there is evidence of some relationship between\n$\\mathcal{NC}$ and generalization independent of scale. Our work thereby\nunderscores the generality of $\\mathcal{NC}$ as it extends to the novel and\nmore challenging setting of language modeling. Downstream, we seek to inspire\nfurther research on the phenomenon to deepen our understanding of LLMs -- and\nneural networks at large -- and improve existing architectures based on\n$\\mathcal{NC}$-related properties. Our code is hosted on GitHub at\nhttps://github.com/rhubarbwu/linguistic-collapse .\n","authors":["Robert Wu","Vardan Papyan"],"pdf_url":"https://arxiv.org/pdf/2405.17767v2.pdf","comment":"NeurIPS 2024; 36 pages; 30 figures"},{"id":"http://arxiv.org/abs/2403.18802v4","updated":"2024-11-07T03:14:38Z","published":"2024-03-27T17:48:55Z","title":"Long-form factuality in large language models","summary":" Large language models (LLMs) often generate content that contains factual\nerrors when responding to fact-seeking prompts on open-ended topics. To\nbenchmark a model's long-form factuality in open domains, we first use GPT-4 to\ngenerate LongFact, a prompt set comprising thousands of questions spanning 38\ntopics. We then propose that LLM agents can be used as automated evaluators for\nlong-form factuality through a method which we call Search-Augmented Factuality\nEvaluator (SAFE). 
SAFE utilizes an LLM to break down a long-form response into\na set of individual facts and to evaluate the accuracy of each fact using a\nmulti-step reasoning process comprising sending search queries to Google Search\nand determining whether a fact is supported by the search results. Furthermore,\nwe propose extending F1 score as an aggregated metric for long-form factuality.\nTo do so, we balance the percentage of supported facts in a response\n(precision) with the percentage of provided facts relative to a hyperparameter\nrepresenting a user's preferred response length (recall).\n Empirically, we demonstrate that LLM agents can outperform crowdsourced human\nannotators - on a set of ~16k individual facts, SAFE agrees with crowdsourced\nhuman annotators 72% of the time, and on a random subset of 100 disagreement\ncases, SAFE wins 76% of the time. At the same time, SAFE is more than 20 times\ncheaper than human annotators. We also benchmark thirteen language models on\nLongFact across four model families (Gemini, GPT, Claude, and PaLM-2), finding\nthat larger language models generally achieve better long-form factuality.\nLongFact, SAFE, and all experimental code are available at\nhttps://github.com/google-deepmind/long-form-factuality.\n","authors":["Jerry Wei","Chengrun Yang","Xinying Song","Yifeng Lu","Nathan Hu","Jie Huang","Dustin Tran","Daiyi Peng","Ruibo Liu","Da Huang","Cosmo Du","Quoc V. Le"],"pdf_url":"https://arxiv.org/pdf/2403.18802v4.pdf","comment":"NeurIPS 2024; 72 pages, 18 figures, 30 tables. Code at\n https://github.com/google-deepmind/long-form-factuality"},{"id":"http://arxiv.org/abs/2409.19487v3","updated":"2024-11-07T03:05:18Z","published":"2024-09-28T23:59:46Z","title":"HealthQ: Unveiling Questioning Capabilities of LLM Chains in Healthcare\n Conversations","summary":" In digital healthcare, large language models (LLMs) have primarily been\nutilized to enhance question-answering capabilities and improve patient\ninteractions. However, effective patient care necessitates LLM chains that can\nactively gather information by posing relevant questions. This paper presents\nHealthQ, a novel framework designed to evaluate the questioning capabilities of\nLLM healthcare chains. We implemented several LLM chains, including\nRetrieval-Augmented Generation (RAG), Chain of Thought (CoT), and reflective\nchains, and introduced an LLM judge to assess the relevance and informativeness\nof the generated questions. To validate HealthQ, we employed traditional\nNatural Language Processing (NLP) metrics such as Recall-Oriented Understudy\nfor Gisting Evaluation (ROUGE) and Named Entity Recognition (NER)-based set\ncomparison, and constructed two custom datasets from public medical note\ndatasets, ChatDoctor and MTS-Dialog. Our contributions are threefold: we\nprovide the first comprehensive study on the questioning capabilities of LLMs\nin healthcare conversations, develop a novel dataset generation pipeline, and\npropose a detailed evaluation methodology.\n","authors":["Ziyu Wang","Hao Li","Di Huang","Amir M. Rahmani"],"pdf_url":"https://arxiv.org/pdf/2409.19487v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03644v2","updated":"2024-11-07T02:44:34Z","published":"2024-11-06T03:48:41Z","title":"Deploying Multi-task Online Server with Large Language Model","summary":" In the industry, numerous tasks are deployed online. 
Traditional approaches\noften tackle each task separately by its own network, which leads to excessive\ncosts for developing and scaling models, especially in the context of large\nlanguage models. Although multi-task methods can save costs through parameter\nsharing, they often struggle to outperform single-task methods in real-world\napplications. To tackle these challenges, we present a three-stage multi-task\nlearning framework for large language models. It involves task filtering,\nfollowed by fine-tuning on high-resource tasks, and finally fine-tuning on all\ntasks. We conducted comprehensive experiments in single-task and multi-task\nsettings. Our approach, exemplified on different benchmarks, demonstrates that\nit is able to achieve performance comparable to the single-task method while\nreducing up to 90.9\\% of its overhead.\n","authors":["Yincen Qu","Chao Ma","Xiangying Dai","Hui Zhou","Yiting Wu","Hengyue Liu"],"pdf_url":"https://arxiv.org/pdf/2411.03644v2.pdf","comment":"Accepted by COLING 2025 Industry Track"},{"id":"http://arxiv.org/abs/2411.04368v1","updated":"2024-11-07T01:58:42Z","published":"2024-11-07T01:58:42Z","title":"Measuring short-form factuality in large language models","summary":" We present SimpleQA, a benchmark that evaluates the ability of language\nmodels to answer short, fact-seeking questions. We prioritized two properties\nin designing this eval. First, SimpleQA is challenging, as it is adversarially\ncollected against GPT-4 responses. Second, responses are easy to grade, because\nquestions are created such that there exists only a single, indisputable\nanswer. Each answer in SimpleQA is graded as either correct, incorrect, or not\nattempted. A model with ideal behavior would get as many questions correct as\npossible while not attempting the questions for which it is not confident it\nknows the correct answer. SimpleQA is a simple, targeted evaluation for whether\nmodels \"know what they know,\" and our hope is that this benchmark will remain\nrelevant for the next few generations of frontier models. SimpleQA can be found\nat https://github.com/openai/simple-evals.\n","authors":["Jason Wei","Nguyen Karina","Hyung Won Chung","Yunxin Joy Jiao","Spencer Papay","Amelia Glaese","John Schulman","William Fedus"],"pdf_url":"https://arxiv.org/pdf/2411.04368v1.pdf","comment":"Blog post: https://openai.com/index/introducing-simpleqa/"},{"id":"http://arxiv.org/abs/2411.04358v1","updated":"2024-11-07T01:31:48Z","published":"2024-11-07T01:31:48Z","title":"Robust and Efficient Fine-tuning of LLMs with Bayesian\n Reparameterization of Low-Rank Adaptation","summary":" Large Language Models (LLMs) are highly resource-intensive to fine-tune due\nto their enormous size. While low-rank adaptation is a prominent\nparameter-efficient fine-tuning approach, it suffers from sensitivity to\nhyperparameter choices, leading to instability in model performance on\nfine-tuning downstream tasks. This paper highlights the importance of effective\nparameterization in low-rank fine-tuning to reduce estimator variance and\nenhance the stability of final model outputs. We propose MonteCLoRA, an\nefficient fine-tuning technique, employing Monte Carlo estimation to learn an\nunbiased posterior estimation of low-rank parameters with low expected\nvariance, which stabilizes fine-tuned LLMs with only O(1) additional\nparameters. 
MonteCLoRA shows significant improvements in accuracy and\nrobustness, achieving up to 3.8% higher accuracy and 8.6% greater robustness\nthan existing efficient fine-tuning methods on natural language understanding\ntasks with pre-trained RoBERTa-base. Furthermore, in generative tasks with\npre-trained LLaMA-1-7B, MonteCLoRA demonstrates robust zero-shot performance\nwith 50% lower variance than the contemporary efficient fine-tuning methods.\nThe theoretical and empirical results presented in the paper underscore how\nparameterization and hyperpriors balance exploration-exploitation in the\nlow-rank parametric space, therefore leading to more optimal and robust\nparameter estimation during efficient fine-tuning.\n","authors":["Vaibhav Seth","Arinjay Pathak","Ayan Sengupta","Natraj Raman","Sriram Gopalakrishnan","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2411.04358v1.pdf","comment":"48 pages, 10 figures, 10 tables, Code:\n https://github.com/LCS2-IIITD/MonteCLoRA"},{"id":"http://arxiv.org/abs/2411.01222v3","updated":"2024-11-07T01:26:43Z","published":"2024-11-02T12:01:44Z","title":"$B^4$: A Black-Box Scrubbing Attack on LLM Watermarks","summary":" Watermarking has emerged as a prominent technique for LLM-generated content\ndetection by embedding imperceptible patterns. Despite supreme performance, its\nrobustness against adversarial attacks remains underexplored. Previous work\ntypically considers a grey-box attack setting, where the specific type of\nwatermark is already known. Some even necessitate knowledge about\nhyperparameters of the watermarking method. Such prerequisites are unattainable\nin real-world scenarios. Targeting a more realistic black-box threat model\nwith fewer assumptions, we here propose $B^4$, a black-box scrubbing attack on\nwatermarks. Specifically, we formulate the watermark scrubbing attack as a\nconstrained optimization problem by capturing its objectives with two\ndistributions, a Watermark Distribution and a Fidelity Distribution. This\noptimization problem can be approximately solved using two proxy distributions.\nExperimental results across 12 different settings demonstrate the superior\nperformance of $B^4$ compared with other baselines.\n","authors":["Baizhou Huang","Xiao Pu","Xiaojun Wan"],"pdf_url":"https://arxiv.org/pdf/2411.01222v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06567v3","updated":"2024-11-07T00:54:30Z","published":"2024-07-09T05:52:26Z","title":"FinCon: A Synthesized LLM Multi-Agent System with Conceptual Verbal\n Reinforcement for Enhanced Financial Decision Making","summary":" Large language models (LLMs) have demonstrated notable potential in\nconducting complex tasks and are increasingly utilized in various financial\napplications. However, high-quality sequential financial investment\ndecision-making remains challenging. These tasks require multiple interactions\nwith a volatile environment for every decision, demanding sufficient\nintelligence to maximize returns and manage risks. Although LLMs have been used\nto develop agent systems that surpass human teams and yield impressive\ninvestment returns, opportunities to enhance multi-sourced information\nsynthesis and optimize decision-making outcomes through timely experience\nrefinement remain unexplored. Here, we introduce the FinCon, an LLM-based\nmulti-agent framework with CONceptual verbal reinforcement tailored for diverse\nFINancial tasks. 
Inspired by effective real-world investment firm\norganizational structures, FinCon utilizes a manager-analyst communication\nhierarchy. This structure allows for synchronized cross-functional agent\ncollaboration towards unified goals through natural language interactions and\nequips each agent with greater memory capacity than humans. Additionally, a\nrisk-control component in FinCon enhances decision quality by episodically\ninitiating a self-critiquing mechanism to update systematic investment beliefs.\nThe conceptualized beliefs serve as verbal reinforcement for the future agent's\nbehavior and can be selectively propagated to the appropriate node that\nrequires knowledge updates. This feature significantly improves performance\nwhile reducing unnecessary peer-to-peer communication costs. Moreover, FinCon\ndemonstrates strong generalization capabilities in various financial tasks,\nincluding single stock trading and portfolio management.\n","authors":["Yangyang Yu","Zhiyuan Yao","Haohang Li","Zhiyang Deng","Yupeng Cao","Zhi Chen","Jordan W. Suchow","Rong Liu","Zhenyu Cui","Zhaozhuo Xu","Denghui Zhang","Koduvayur Subbalakshmi","Guojun Xiong","Yueru He","Jimin Huang","Dong Li","Qianqian Xie"],"pdf_url":"https://arxiv.org/pdf/2407.06567v3.pdf","comment":"LLM Applications, LLM Agents, Financial Technology, Quantitative\n Finance, Algorithmic Trading, Cognitive Science"},{"id":"http://arxiv.org/abs/2411.01030v3","updated":"2024-11-07T00:23:14Z","published":"2024-11-01T21:01:13Z","title":"Birdie: Advancing State Space Models with Reward-Driven Objectives and\n Curricula","summary":" Efficient state space models (SSMs), such as linear recurrent neural networks\nand linear attention variants, offer computational advantages over Transformers\nbut struggle with tasks requiring long-range in-context retrieval-like text\ncopying, associative recall, and question answering over long contexts.\nPrevious efforts to address these challenges have focused on architectural\nmodifications, often reintroducing computational inefficiencies. In this paper,\nwe propose a novel training procedure, Birdie, that significantly enhances the\nin-context retrieval capabilities of SSMs without altering their architecture.\nOur approach combines bidirectional input processing with dynamic mixtures of\nspecialized pre-training objectives, optimized via reinforcement learning. We\nintroduce a new bidirectional SSM architecture that seamlessly transitions from\nbidirectional context processing to causal generation. Experimental evaluations\ndemonstrate that Birdie markedly improves performance on retrieval-intensive\ntasks such as multi-number phone book lookup, long paragraph\nquestion-answering, and infilling. This narrows the performance gap with\nTransformers, while retaining computational efficiency. Our findings highlight\nthe importance of training procedures in leveraging the fixed-state capacity of\nSSMs, offering a new direction to advance their capabilities. All code and\npre-trained models are available at https://www.github.com/samblouir/birdie,\nwith support for JAX and PyTorch.\n","authors":["Sam Blouir","Jimmy T. H. 
Smith","Antonios Anastasopoulos","Amarda Shehu"],"pdf_url":"https://arxiv.org/pdf/2411.01030v3.pdf","comment":"Accepted to EMNLP 2024 (Main Conference)"},{"id":"http://arxiv.org/abs/2411.04330v1","updated":"2024-11-07T00:10:10Z","published":"2024-11-07T00:10:10Z","title":"Scaling Laws for Precision","summary":" Low precision training and inference affect both the quality and cost of\nlanguage models, but current scaling laws do not account for this. In this\nwork, we devise \"precision-aware\" scaling laws for both training and inference.\nWe propose that training in lower precision reduces the model's \"effective\nparameter count,\" allowing us to predict the additional loss incurred from\ntraining in low precision and post-train quantization. For inference, we find\nthat the degradation introduced by post-training quantization increases as\nmodels are trained on more data, eventually making additional pretraining data\nactively harmful. For training, our scaling laws allow us to predict the loss\nof a model with different parts in different precisions, and suggest that\ntraining larger models in lower precision may be compute optimal. We unify the\nscaling laws for post and pretraining quantization to arrive at a single\nfunctional form that predicts degradation from training and inference in varied\nprecisions. We fit on over 465 pretraining runs and validate our predictions on\nmodel sizes up to 1.7B parameters trained on up to 26B tokens.\n","authors":["Tanishq Kumar","Zachary Ankner","Benjamin F. Spector","Blake Bordelon","Niklas Muennighoff","Mansheej Paul","Cengiz Pehlevan","Christopher Ré","Aditi Raghunathan"],"pdf_url":"https://arxiv.org/pdf/2411.04330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04329v1","updated":"2024-11-07T00:09:54Z","published":"2024-11-07T00:09:54Z","title":"CodeTree: Agent-guided Tree Search for Code Generation with Large\n Language Models","summary":" Pre-trained on massive amounts of code and text data, large language models\n(LLMs) have demonstrated remarkable achievements in performing code generation\ntasks. With additional execution-based feedback, these models can act as agents\nwith capabilities to self-refine and improve generated code autonomously.\nHowever, on challenging coding tasks with extremely large search space, current\nagentic approaches still struggle with multi-stage planning, generating, and\ndebugging. To address this problem, we propose CodeTree, a framework for LLM\nagents to efficiently explore the search space in different stages of the code\ngeneration process. Specifically, we adopted a unified tree structure to\nexplicitly explore different coding strategies, generate corresponding coding\nsolutions, and subsequently refine the solutions. In each stage, critical\ndecision-making (ranking, termination, expanding) of the exploration process is\nguided by both the environmental execution-based feedback and\nLLM-agent-generated feedback. We comprehensively evaluated CodeTree on 7 code\ngeneration benchmarks and demonstrated the significant performance gains of\nCodeTree against strong baselines. Using GPT-4o as the base model, we\nconsistently achieved top results of 95.1 on HumanEval, 98.7 on MBPP, and 43.0\non CodeContests. 
On the challenging SWEBench benchmark, our approach led to\nsignificant performance gains.\n","authors":["Jierui Li","Hung Le","Yinbo Zhou","Caiming Xiong","Silvio Savarese","Doyen Sahoo"],"pdf_url":"https://arxiv.org/pdf/2411.04329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04328v1","updated":"2024-11-07T00:09:18Z","published":"2024-11-07T00:09:18Z","title":"Balancing Transparency and Accuracy: A Comparative Analysis of\n Rule-Based and Deep Learning Models in Political Bias Classification","summary":" The unchecked spread of digital information, combined with increasing\npolitical polarization and the tendency of individuals to isolate themselves\nfrom opposing political viewpoints, has driven researchers to develop systems\nfor automatically detecting political bias in media. This trend has been\nfurther fueled by discussions on social media. We explore methods for\ncategorizing bias in US news articles, comparing rule-based and deep learning\napproaches. The study highlights the sensitivity of modern self-learning\nsystems to unconstrained data ingestion, while reconsidering the strengths of\ntraditional rule-based systems. Applying both models to left-leaning (CNN) and\nright-leaning (FOX) news articles, we assess their effectiveness on data beyond\nthe original training and test sets. This analysis highlights each model's\naccuracy, offers a framework for exploring deep-learning explainability, and\nsheds light on political bias in US news media. We contrast the opaque\narchitecture of a deep learning model with the transparency of a linguistically\ninformed rule-based model, showing that the rule-based model performs\nconsistently across different data conditions and offers greater transparency,\nwhereas the deep learning model is dependent on the training set and struggles\nwith unseen data.\n","authors":["Manuel Nunez Martinez","Sonja Schmer-Galunder","Zoey Liu","Sangpil Youm","Chathuri Jayaweera","Bonnie J. Dorr"],"pdf_url":"https://arxiv.org/pdf/2411.04328v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.05007v1","updated":"2024-11-07T18:59:58Z","published":"2024-11-07T18:59:58Z","title":"SVDQuant: Absorbing Outliers by Low-Rank Components for 4-Bit Diffusion\n Models","summary":" Diffusion models have been proven highly effective at generating high-quality\nimages. However, as these models grow larger, they require significantly more\nmemory and suffer from higher latency, posing substantial challenges for\ndeployment. In this work, we aim to accelerate diffusion models by quantizing\ntheir weights and activations to 4 bits. At such an aggressive level, both\nweights and activations are highly sensitive, where conventional post-training\nquantization methods for large language models like smoothing become\ninsufficient. To overcome this limitation, we propose SVDQuant, a new 4-bit\nquantization paradigm. Different from smoothing which redistributes outliers\nbetween weights and activations, our approach absorbs these outliers using a\nlow-rank branch. We first consolidate the outliers by shifting them from\nactivations to weights, then employ a high-precision low-rank branch to take in\nthe weight outliers with Singular Value Decomposition (SVD). This process eases\nthe quantization on both sides. However, na\\\"{\\i}vely running the low-rank\nbranch independently incurs significant overhead due to extra data movement of\nactivations, negating the quantization speedup. 
To address this, we co-design\nan inference engine Nunchaku that fuses the kernels of the low-rank branch into\nthose of the low-bit branch to cut off redundant memory access. It can also\nseamlessly support off-the-shelf low-rank adapters (LoRAs) without the need for\nre-quantization. Extensive experiments on SDXL, PixArt-$\\Sigma$, and FLUX.1\nvalidate the effectiveness of SVDQuant in preserving image quality. We reduce\nthe memory usage for the 12B FLUX.1 models by 3.5$\\times$, achieving\n3.0$\\times$ speedup over the 4-bit weight-only quantized baseline on the 16GB\nlaptop 4090 GPU, paving the way for more interactive applications on PCs. Our\nquantization library and inference engine are open-sourced.\n","authors":["Muyang Li","Yujun Lin","Zhekai Zhang","Tianle Cai","Xiuyu Li","Junxian Guo","Enze Xie","Chenlin Meng","Jun-Yan Zhu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2411.05007v1.pdf","comment":"Quantization Library: https://github.com/mit-han-lab/deepcompressor\n Inference Engine: https://github.com/mit-han-lab/nunchaku Website:\n https://hanlab.mit.edu/projects/svdquant Demo: https://svdquant.mit.edu Blog:\n https://hanlab.mit.edu/blog/svdquant"},{"id":"http://arxiv.org/abs/2411.05006v1","updated":"2024-11-07T18:59:54Z","published":"2024-11-07T18:59:54Z","title":"ProEdit: Simple Progression is All You Need for High-Quality 3D Scene\n Editing","summary":" This paper proposes ProEdit - a simple yet effective framework for\nhigh-quality 3D scene editing guided by diffusion distillation in a novel\nprogressive manner. Inspired by the crucial observation that multi-view\ninconsistency in scene editing is rooted in the diffusion model's large\nfeasible output space (FOS), our framework controls the size of FOS and reduces\ninconsistency by decomposing the overall editing task into several subtasks,\nwhich are then executed progressively on the scene. Within this framework, we\ndesign a difficulty-aware subtask decomposition scheduler and an adaptive 3D\nGaussian splatting (3DGS) training strategy, ensuring high quality and\nefficiency in performing each subtask. Extensive evaluation shows that our\nProEdit achieves state-of-the-art results in various scenes and challenging\nediting tasks, all through a simple framework without any expensive or\nsophisticated add-ons like distillation losses, components, or training\nprocedures. Notably, ProEdit also provides a new way to control, preview, and\nselect the \"aggressivity\" of editing operation during the editing process.\n","authors":["Jun-Kun Chen","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.05006v1.pdf","comment":"NeurIPS 2024. Project Page: https://immortalco.github.io/ProEdit/"},{"id":"http://arxiv.org/abs/2411.05005v1","updated":"2024-11-07T18:59:53Z","published":"2024-11-07T18:59:53Z","title":"Diff-2-in-1: Bridging Generation and Dense Perception with Diffusion\n Models","summary":" Beyond high-fidelity image synthesis, diffusion models have recently\nexhibited promising results in dense visual perception tasks. However, most\nexisting work treats diffusion models as a standalone component for perception\ntasks, employing them either solely for off-the-shelf data augmentation or as\nmere feature extractors. In contrast to these isolated and thus sub-optimal\nefforts, we introduce a unified, versatile, diffusion-based framework,\nDiff-2-in-1, that can simultaneously handle both multi-modal data generation\nand dense visual perception, through a unique exploitation of the\ndiffusion-denoising process. 
Within this framework, we further enhance\ndiscriminative visual perception via multi-modal generation, by utilizing the\ndenoising network to create multi-modal data that mirror the distribution of\nthe original training set. Importantly, Diff-2-in-1 optimizes the utilization\nof the created diverse and faithful data by leveraging a novel self-improving\nlearning mechanism. Comprehensive experimental evaluations validate the\neffectiveness of our framework, showcasing consistent performance improvements\nacross various discriminative backbones and high-quality multi-modal data\ngeneration characterized by both realism and usefulness.\n","authors":["Shuhong Zheng","Zhipeng Bao","Ruoyu Zhao","Martial Hebert","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.05005v1.pdf","comment":"26 pages, 14 figures"},{"id":"http://arxiv.org/abs/2411.05003v1","updated":"2024-11-07T18:59:45Z","published":"2024-11-07T18:59:45Z","title":"ReCapture: Generative Video Camera Controls for User-Provided Videos\n using Masked Video Fine-Tuning","summary":" Recently, breakthroughs in video modeling have allowed for controllable\ncamera trajectories in generated videos. However, these methods cannot be\ndirectly applied to user-provided videos that are not generated by a video\nmodel. In this paper, we present ReCapture, a method for generating new videos\nwith novel camera trajectories from a single user-provided video. Our method\nallows us to re-generate the reference video, with all its existing scene\nmotion, from vastly different angles and with cinematic camera motion. Notably,\nusing our method we can also plausibly hallucinate parts of the scene that were\nnot observable in the reference video. Our method works by (1) generating a\nnoisy anchor video with a new camera trajectory using multiview diffusion\nmodels or depth-based point cloud rendering and then (2) regenerating the\nanchor video into a clean and temporally consistent reangled video using our\nproposed masked video fine-tuning technique.\n","authors":["David Junhao Zhang","Roni Paiss","Shiran Zada","Nikhil Karnad","David E. Jacobs","Yael Pritch","Inbar Mosseri","Mike Zheng Shou","Neal Wadhwa","Nataniel Ruiz"],"pdf_url":"https://arxiv.org/pdf/2411.05003v1.pdf","comment":"project page: https://generative-video-camera-controls.github.io/"},{"id":"http://arxiv.org/abs/2411.05001v1","updated":"2024-11-07T18:59:28Z","published":"2024-11-07T18:59:28Z","title":"Analyzing The Language of Visual Tokens","summary":" With the introduction of transformer-based models for vision and language\ntasks, such as LLaVA and Chameleon, there has been renewed interest in the\ndiscrete tokenized representation of images. These models often treat image\npatches as discrete tokens, analogous to words in natural language, learning\njoint alignments between visual and human languages. However, little is known\nabout the statistical behavior of these visual languages - whether they follow\nsimilar frequency distributions, grammatical structures, or topologies as\nnatural languages. In this paper, we take a natural-language-centric approach\nto analyzing discrete visual languages and uncover striking similarities and\nfundamental differences. We demonstrate that, although visual languages adhere\nto Zipfian distributions, higher token innovation drives greater entropy and\nlower compression, with tokens predominantly representing object parts,\nindicating intermediate granularity. 
We also show that visual languages lack\ncohesive grammatical structures, leading to higher perplexity and weaker\nhierarchical organization compared to natural languages. Finally, we\ndemonstrate that, while vision models align more closely with natural languages\nthan other models, this alignment remains significantly weaker than the\ncohesion found within natural languages. Through these experiments, we\ndemonstrate how understanding the statistical properties of discrete visual\nlanguages can inform the design of more effective computer vision models.\n","authors":["David M. Chan","Rodolfo Corona","Joonyong Park","Cheol Jun Cho","Yutong Bai","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2411.05001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04997v1","updated":"2024-11-07T18:59:16Z","published":"2024-11-07T18:59:16Z","title":"LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation","summary":" CLIP is one of the most important multimodal foundational models today. What\npowers CLIP's capabilities? The rich supervision signals provided by natural\nlanguage, the carrier of human knowledge, shape a powerful cross-modal\nrepresentation space. However, with the rapid advancements in large language\nmodels LLMs like GPT-4 and LLaMA, the boundaries of language comprehension and\ngeneration are continually being pushed. This raises an intriguing question:\ncan the capabilities of LLMs be harnessed to further improve multimodal\nrepresentation learning? The potential benefits of incorporating LLMs into CLIP\nare clear. LLMs' strong textual understanding can fundamentally improve CLIP's\nability to handle image captions, drastically enhancing its ability to process\nlong and complex texts, a well-known limitation of vanilla CLIP. Moreover, LLMs\nare trained on a vast corpus of text, possessing open-world knowledge. This\nallows them to expand on caption information during training, increasing the\nefficiency of the learning process. In this paper, we propose LLM2CLIP, a novel\napproach that embraces the power of LLMs to unlock CLIP's potential. By\nfine-tuning the LLM in the caption space with contrastive learning, we extract\nits textual capabilities into the output embeddings, significantly improving\nthe output layer's textual discriminability. We then design an efficient\ntraining process where the fine-tuned LLM acts as a powerful teacher for CLIP's\nvisual encoder. Thanks to the LLM's presence, we can now incorporate longer and\nmore complex captions without being restricted by vanilla CLIP's text encoder's\ncontext window and ability limitations. Our experiments demonstrate that this\napproach brings substantial improvements in cross-modal tasks.\n","authors":["Weiquan Huang","Aoqi Wu","Yifan Yang","Xufang Luo","Yuqing Yang","Liang Hu","Qi Dai","Xiyang Dai","Dongdong Chen","Chong Luo","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2411.04997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04998v1","updated":"2024-11-07T18:59:16Z","published":"2024-11-07T18:59:16Z","title":"HourVideo: 1-Hour Video-Language Understanding","summary":" We present HourVideo, a benchmark dataset for hour-long video-language\nunderstanding. Our dataset consists of a novel task suite comprising\nsummarization, perception (recall, tracking), visual reasoning (spatial,\ntemporal, predictive, causal, counterfactual), and navigation (room-to-room,\nobject retrieval) tasks. 
HourVideo includes 500 manually curated egocentric\nvideos from the Ego4D dataset, spanning durations of 20 to 120 minutes, and\nfeatures 12,976 high-quality, five-way multiple-choice questions. Benchmarking\nresults reveal that multimodal models, including GPT-4 and LLaVA-NeXT, achieve\nmarginal improvements over random chance. In stark contrast, human experts\nsignificantly outperform the state-of-the-art long-context multimodal model,\nGemini Pro 1.5 (85.0% vs. 37.3%), highlighting a substantial gap in multimodal\ncapabilities. Our benchmark, evaluation toolkit, prompts, and documentation are\navailable at https://hourvideo.stanford.edu\n","authors":["Keshigeyan Chandrasegaran","Agrim Gupta","Lea M. Hadzic","Taran Kota","Jimming He","Cristóbal Eyzaguirre","Zane Durante","Manling Li","Jiajun Wu","Li Fei-Fei"],"pdf_url":"https://arxiv.org/pdf/2411.04998v1.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track; 28 pages"},{"id":"http://arxiv.org/abs/2411.04995v1","updated":"2024-11-07T18:58:57Z","published":"2024-11-07T18:58:57Z","title":"LoFi: Scalable Local Image Reconstruction with Implicit Neural\n Representation","summary":" Neural fields or implicit neural representations (INRs) have attracted\nsignificant attention in machine learning and signal processing due to their\nefficient continuous representation of images and 3D volumes. In this work, we\nbuild on INRs and introduce a coordinate-based local processing framework for\nsolving imaging inverse problems, termed LoFi (Local Field). Unlike\nconventional methods for image reconstruction, LoFi processes local information\nat each coordinate \\textit{separately} by multi-layer perceptrons (MLPs),\nrecovering the object at that specific coordinate. Similar to INRs, LoFi can\nrecover images at any continuous coordinate, enabling image reconstruction at\nmultiple resolutions. With comparable or better performance than standard CNNs\nfor image reconstruction, LoFi achieves excellent generalization to\nout-of-distribution data and memory usage almost independent of image\nresolution. Remarkably, training on $1024 \\times 1024$ images requires just 3GB\nof memory -- over 20 times less than the memory typically needed by standard\nCNNs. Additionally, LoFi's local design allows it to train on extremely small\ndatasets with less than 10 samples, without overfitting or the need for\nregularization or early stopping. Finally, we use LoFi as a denoising prior in\na plug-and-play framework for solving general inverse problems to benefit from\nits continuous image representation and strong generalization. Although trained\non low-resolution images, LoFi can be used as a low-dimensional prior to solve\ninverse problems at any resolution. We validate our framework across a variety\nof imaging modalities, from low-dose computed tomography to radio\ninterferometric imaging.\n","authors":["AmirEhsan Khorashadizadeh","Tobías I. Liaudat","Tianlin Liu","Jason D. McEwen","Ivan Dokmanić"],"pdf_url":"https://arxiv.org/pdf/2411.04995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04989v1","updated":"2024-11-07T18:56:11Z","published":"2024-11-07T18:56:11Z","title":"SG-I2V: Self-Guided Trajectory Control in Image-to-Video Generation","summary":" Methods for image-to-video generation have achieved impressive,\nphoto-realistic quality. However, adjusting specific elements in generated\nvideos, such as object motion or camera movement, is often a tedious process of\ntrial and error, e.g., involving re-generating videos with different random\nseeds. 
Recent techniques address this issue by fine-tuning a pre-trained model\nto follow conditioning signals, such as bounding boxes or point trajectories.\nYet, this fine-tuning procedure can be computationally expensive, and it\nrequires datasets with annotated object motion, which can be difficult to\nprocure. In this work, we introduce SG-I2V, a framework for controllable\nimage-to-video generation that is self-guided$\\unicode{x2013}$offering\nzero-shot control by relying solely on the knowledge present in a pre-trained\nimage-to-video diffusion model without the need for fine-tuning or external\nknowledge. Our zero-shot method outperforms unsupervised baselines while being\ncompetitive with supervised models in terms of visual quality and motion\nfidelity.\n","authors":["Koichi Namekata","Sherwin Bahmani","Ziyi Wu","Yash Kant","Igor Gilitschenski","David B. Lindell"],"pdf_url":"https://arxiv.org/pdf/2411.04989v1.pdf","comment":"Project page: https://kmcode1.github.io/Projects/SG-I2V/"},{"id":"http://arxiv.org/abs/2411.04984v1","updated":"2024-11-07T18:55:08Z","published":"2024-11-07T18:55:08Z","title":"Planar Reflection-Aware Neural Radiance Fields","summary":" Neural Radiance Fields (NeRF) have demonstrated exceptional capabilities in\nreconstructing complex scenes with high fidelity. However, NeRF's view\ndependency can only handle low-frequency reflections. It falls short when\nhandling complex planar reflections, often interpreting them as erroneous scene\ngeometries and leading to duplicated and inaccurate scene representations. To\naddress this challenge, we introduce a reflection-aware NeRF that jointly\nmodels planar reflectors, such as windows, and explicitly casts reflected rays\nto capture the source of the high-frequency reflections. We query a single\nradiance field to render the primary color and the source of the reflection. We\npropose a sparse edge regularization to help utilize the true sources of\nreflections for rendering planar reflections rather than creating a duplicate\nalong the primary ray at the same depth. As a result, we obtain accurate scene\ngeometry. Rendering along the primary ray results in a clean, reflection-free\nview, while explicitly rendering along the reflected ray allows us to\nreconstruct highly detailed reflections. Our extensive quantitative and\nqualitative evaluations of real-world datasets demonstrate our method's\nenhanced performance in accurately handling reflections.\n","authors":["Chen Gao","Yipeng Wang","Changil Kim","Jia-Bin Huang","Johannes Kopf"],"pdf_url":"https://arxiv.org/pdf/2411.04984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04967v1","updated":"2024-11-07T18:43:17Z","published":"2024-11-07T18:43:17Z","title":"AsCAN: Asymmetric Convolution-Attention Networks for Efficient\n Recognition and Generation","summary":" Neural network architecture design requires making many crucial decisions.\nThe common desiderata is that similar decisions, with little modifications, can\nbe reused in a variety of tasks and applications. To satisfy that,\narchitectures must provide promising latency and performance trade-offs,\nsupport a variety of tasks, scale efficiently with respect to the amounts of\ndata and compute, leverage available data from other tasks, and efficiently\nsupport various hardware. To this end, we introduce AsCAN -- a hybrid\narchitecture, combining both convolutional and transformer blocks. 
We revisit\nthe key design principles of hybrid architectures and propose a simple and\neffective \\emph{asymmetric} architecture, where the distribution of\nconvolutional and transformer blocks is \\emph{asymmetric}, containing more\nconvolutional blocks in the earlier stages, followed by more transformer blocks\nin later stages. AsCAN supports a variety of tasks: recognition, segmentation,\nclass-conditional image generation, and features a superior trade-off between\nperformance and latency. We then scale the same architecture to solve a\nlarge-scale text-to-image task and show state-of-the-art performance compared\nto the most recent public and commercial models. Notably, even without any\ncomputation optimization for transformer blocks, our models still yield faster\ninference speed than existing works featuring efficient attention mechanisms,\nhighlighting the advantages and the value of our approach.\n","authors":["Anil Kag","Huseyin Coskun","Jierun Chen","Junli Cao","Willi Menapace","Aliaksandr Siarohin","Sergey Tulyakov","Jian Ren"],"pdf_url":"https://arxiv.org/pdf/2411.04967v1.pdf","comment":"NeurIPS 2024. Project Page:\n https://snap-research.github.io/snap_image/"},{"id":"http://arxiv.org/abs/2401.09980v2","updated":"2024-11-07T18:43:06Z","published":"2024-01-18T13:51:20Z","title":"A Comparative Analysis of U-Net-based models for Segmentation of Cardiac\n MRI","summary":" Medical imaging refers to the technologies and methods utilized to view the\nhuman body and its inside, in order to diagnose, monitor, or even treat medical\ndisorders. This paper aims to explore the application of deep learning\ntechniques in the semantic segmentation of Cardiac short-axis MRI (Magnetic\nResonance Imaging) images, aiming to enhance the diagnosis, monitoring, and\ntreatment of medical disorders related to the heart. The focus centers on\nimplementing various architectures that are derivatives of U-Net, to\neffectively isolate specific parts of the heart for comprehensive anatomical\nand functional analysis. Through a combination of images, graphs, and\nquantitative metrics, the efficacy of the models and their predictions are\nshowcased. Additionally, this paper addresses encountered challenges and\noutline strategies for future improvements. This abstract provides a concise\noverview of the efforts in utilizing deep learning for cardiac image\nsegmentation, emphasizing both the accomplishments and areas for further\nrefinement.\n","authors":["Ketan Suhaas Saichandran"],"pdf_url":"https://arxiv.org/pdf/2401.09980v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04963v1","updated":"2024-11-07T18:40:17Z","published":"2024-11-07T18:40:17Z","title":"VAIR: Visuo-Acoustic Implicit Representations for Low-Cost, Multi-Modal\n Transparent Surface Reconstruction in Indoor Scenes","summary":" Mobile robots operating indoors must be prepared to navigate challenging\nscenes that contain transparent surfaces. This paper proposes a novel method\nfor the fusion of acoustic and visual sensing modalities through implicit\nneural representations to enable dense reconstruction of transparent surfaces\nin indoor scenes. We propose a novel model that leverages generative latent\noptimization to learn an implicit representation of indoor scenes consisting of\ntransparent surfaces. We demonstrate that we can query the implicit\nrepresentation to enable volumetric rendering in image space or 3D geometry\nreconstruction (point clouds or mesh) with transparent surface prediction. 
We\nevaluate our method's effectiveness qualitatively and quantitatively on a new\ndataset collected using a custom, low-cost sensing platform featuring RGB-D\ncameras and ultrasonic sensors. Our method exhibits significant improvement\nover state-of-the-art for transparent surface reconstruction.\n","authors":["Advaith V. Sethuraman","Onur Bagoren","Harikrishnan Seetharaman","Dalton Richardson","Joseph Taylor","Katherine A. Skinner"],"pdf_url":"https://arxiv.org/pdf/2411.04963v1.pdf","comment":"https://umfieldrobotics.github.io/VAIR_site/"},{"id":"http://arxiv.org/abs/2411.04956v1","updated":"2024-11-07T18:32:00Z","published":"2024-11-07T18:32:00Z","title":"Uncovering Hidden Subspaces in Video Diffusion Models Using\n Re-Identification","summary":" Latent Video Diffusion Models can easily deceive casual observers and domain\nexperts alike thanks to the produced image quality and temporal consistency.\nBeyond entertainment, this creates opportunities around safe data sharing of\nfully synthetic datasets, which are crucial in healthcare, as well as other\ndomains relying on sensitive personal information. However, privacy concerns\nwith this approach have not fully been addressed yet, and models trained on\nsynthetic data for specific downstream tasks still perform worse than those\ntrained on real data. This discrepancy may be partly due to the sampling space\nbeing a subspace of the training videos, effectively reducing the training data\nsize for downstream models. Additionally, the reduced temporal consistency when\ngenerating long videos could be a contributing factor.\n In this paper, we first show that training privacy-preserving models in\nlatent space is computationally more efficient and generalize better.\nFurthermore, to investigate downstream degradation factors, we propose to use a\nre-identification model, previously employed as a privacy preservation filter.\nWe demonstrate that it is sufficient to train this model on the latent space of\nthe video generator. Subsequently, we use these models to evaluate the subspace\ncovered by synthetic video datasets and thus introduce a new way to measure the\nfaithfulness of generative machine learning models. We focus on a specific\napplication in healthcare echocardiography to illustrate the effectiveness of\nour novel methods. Our findings indicate that only up to 30.8% of the training\nvideos are learned in latent video diffusion models, which could explain the\nlack of performance when training downstream tasks on synthetic data.\n","authors":["Mischa Dombrowski","Hadrien Reynaud","Bernhard Kainz"],"pdf_url":"https://arxiv.org/pdf/2411.04956v1.pdf","comment":"8 pages, 5 tables, 6 figures"},{"id":"http://arxiv.org/abs/2411.04954v1","updated":"2024-11-07T18:31:08Z","published":"2024-11-07T18:31:08Z","title":"CAD-MLLM: Unifying Multimodality-Conditioned CAD Generation With MLLM","summary":" This paper aims to design a unified Computer-Aided Design (CAD) generation\nsystem that can easily generate CAD models based on the user's inputs in the\nform of textual description, images, point clouds, or even a combination of\nthem. Towards this goal, we introduce the CAD-MLLM, the first system capable of\ngenerating parametric CAD models conditioned on the multimodal input.\nSpecifically, within the CAD-MLLM framework, we leverage the command sequences\nof CAD models and then employ advanced large language models (LLMs) to align\nthe feature space across these diverse multi-modalities data and CAD models'\nvectorized representations. 
To facilitate the model training, we design a\ncomprehensive data construction and annotation pipeline that equips each CAD\nmodel with corresponding multimodal data. Our resulting dataset, named\nOmni-CAD, is the first multimodal CAD dataset that contains textual\ndescription, multi-view images, points, and command sequence for each CAD\nmodel. It contains approximately 450K instances and their CAD construction\nsequences. To thoroughly evaluate the quality of our generated CAD models, we\ngo beyond current evaluation metrics that focus on reconstruction quality by\nintroducing additional metrics that assess topology quality and surface\nenclosure extent. Extensive experimental results demonstrate that CAD-MLLM\nsignificantly outperforms existing conditional generative methods and remains\nhighly robust to noises and missing points. The project page and more\nvisualizations can be found at: https://cad-mllm.github.io/\n","authors":["Jingwei Xu","Chenyu Wang","Zibo Zhao","Wen Liu","Yi Ma","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2411.04954v1.pdf","comment":"Project page: https://cad-mllm.github.io/"},{"id":"http://arxiv.org/abs/2411.04952v1","updated":"2024-11-07T18:29:38Z","published":"2024-11-07T18:29:38Z","title":"M3DocRAG: Multi-modal Retrieval is What You Need for Multi-page\n Multi-document Understanding","summary":" Document visual question answering (DocVQA) pipelines that answer questions\nfrom documents have broad applications. Existing methods focus on handling\nsingle-page documents with multi-modal language models (MLMs), or rely on\ntext-based retrieval-augmented generation (RAG) that uses text extraction tools\nsuch as optical character recognition (OCR). However, there are difficulties in\napplying these methods in real-world scenarios: (a) questions often require\ninformation across different pages or documents, where MLMs cannot handle many\nlong documents; (b) documents often have important information in visual\nelements such as figures, but text extraction tools ignore them. We introduce\nM3DocRAG, a novel multi-modal RAG framework that flexibly accommodates various\ndocument contexts (closed-domain and open-domain), question hops (single-hop\nand multi-hop), and evidence modalities (text, chart, figure, etc.). M3DocRAG\nfinds relevant documents and answers questions using a multi-modal retriever\nand an MLM, so that it can efficiently handle single or many documents while\npreserving visual information. Since previous DocVQA datasets ask questions in\nthe context of a specific document, we also present M3DocVQA, a new benchmark\nfor evaluating open-domain DocVQA over 3,000+ PDF documents with 40,000+ pages.\nIn three benchmarks (M3DocVQA/MMLongBench-Doc/MP-DocVQA), empirical results\nshow that M3DocRAG with ColPali and Qwen2-VL 7B achieves superior performance\nthan many strong baselines, including state-of-the-art performance in\nMP-DocVQA. We provide comprehensive analyses of different indexing, MLMs, and\nretrieval models. 
Lastly, we qualitatively show that M3DocRAG can successfully\nhandle various scenarios, such as when relevant information exists across\nmultiple pages and when answer evidence only exists in images.\n","authors":["Jaemin Cho","Debanjan Mahata","Ozan Irsoy","Yujie He","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2411.04952v1.pdf","comment":"Project webpage: https://m3docrag.github.io"},{"id":"http://arxiv.org/abs/2401.08426v5","updated":"2024-11-07T18:22:41Z","published":"2024-01-16T15:11:29Z","title":"GD doesn't make the cut: Three ways that non-differentiability affects\n neural network training","summary":" This paper critically examines the fundamental distinctions between gradient\nmethods applied to non-differentiable functions (NGDMs) and classical gradient\ndescents (GDs) for differentiable functions, revealing significant gaps in\ncurrent deep learning optimization theory. We demonstrate that NGDMs exhibit\nmarkedly different convergence properties compared to GDs, strongly challenging\nthe applicability of extensive neural network convergence literature based on\n$L-smoothness$ to non-smooth neural networks. Our analysis reveals paradoxical\nbehavior of NDGM solutions for $L_{1}$-regularized problems, where increasing\nregularization counterintuitively leads to larger $L_{1}$ norms of optimal\nsolutions. This finding calls into question widely adopted $L_{1}$ penalization\ntechniques for network pruning. We further challenge the common assumption that\noptimization algorithms like RMSProp behave similarly in differentiable and\nnon-differentiable contexts. Expanding on the Edge of Stability phenomenon, we\ndemonstrate its occurrence in a broader class of functions, including Lipschitz\ncontinuous convex differentiable functions. This finding raises important\nquestions about its relevance and interpretation in non-convex,\nnon-differentiable neural networks, particularly those using ReLU activations.\nOur work identifies critical misunderstandings of NDGMs in influential\nliterature, stemming from an overreliance on strong smoothness assumptions.\nThese findings necessitate a reevaluation of optimization dynamics in deep\nlearning, emphasizing the crucial need for more nuanced theoretical foundations\nin analyzing these complex systems.\n","authors":["Siddharth Krishna Kumar"],"pdf_url":"https://arxiv.org/pdf/2401.08426v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04942v1","updated":"2024-11-07T18:20:28Z","published":"2024-11-07T18:20:28Z","title":"A Reinforcement Learning-Based Automatic Video Editing Method Using\n Pre-trained Vision-Language Model","summary":" In this era of videos, automatic video editing techniques attract more and\nmore attention from industry and academia since they can reduce workloads and\nlower the requirements for human editors. Existing automatic editing systems\nare mainly scene- or event-specific, e.g., soccer game broadcasting, yet the\nautomatic systems for general editing, e.g., movie or vlog editing which covers\nvarious scenes and events, were rarely studied before, and converting the\nevent-driven editing method to a general scene is nontrivial. In this paper, we\npropose a two-stage scheme for general editing. Firstly, unlike previous works\nthat extract scene-specific features, we leverage the pre-trained\nVision-Language Model (VLM) to extract the editing-relevant representations as\nediting context. 
Moreover, to close the gap between the professional-looking\nvideos and the automatic productions generated with simple guidelines, we\npropose a Reinforcement Learning (RL)-based editing framework to formulate the\nediting problem and train the virtual editor to make better sequential editing\ndecisions. Finally, we evaluate the proposed method on a more general editing\ntask with a real movie dataset. Experimental results demonstrate the\neffectiveness and benefits of the proposed context representation and the\nlearning ability of our RL-based editing framework.\n","authors":["Panwen Hu","Nan Xiao","Feifei Li","Yongquan Chen","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2411.04942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04933v1","updated":"2024-11-07T18:12:49Z","published":"2024-11-07T18:12:49Z","title":"SaSR-Net: Source-Aware Semantic Representation Network for Enhancing\n Audio-Visual Question Answering","summary":" Audio-Visual Question Answering (AVQA) is a challenging task that involves\nanswering questions based on both auditory and visual information in videos. A\nsignificant challenge is interpreting complex multi-modal scenes, which include\nboth visual objects and sound sources, and connecting them to the given\nquestion. In this paper, we introduce the Source-aware Semantic Representation\nNetwork (SaSR-Net), a novel model designed for AVQA. SaSR-Net utilizes\nsource-wise learnable tokens to efficiently capture and align audio-visual\nelements with the corresponding question. It streamlines the fusion of audio\nand visual information using spatial and temporal attention mechanisms to\nidentify answers in multi-modal scenes. Extensive experiments on the Music-AVQA\nand AVQA-Yang datasets show that SaSR-Net outperforms state-of-the-art AVQA\nmethods.\n","authors":["ianyu Yang","Yiyang Nan","Lisen Dai","Zhenwen Liang","Yapeng Tian","Xiangliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.04933v1.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2411.04928v1","updated":"2024-11-07T18:07:31Z","published":"2024-11-07T18:07:31Z","title":"DimensionX: Create Any 3D and 4D Scenes from a Single Image with\n Controllable Video Diffusion","summary":" In this paper, we introduce \\textbf{DimensionX}, a framework designed to\ngenerate photorealistic 3D and 4D scenes from just a single image with video\ndiffusion. Our approach begins with the insight that both the spatial structure\nof a 3D scene and the temporal evolution of a 4D scene can be effectively\nrepresented through sequences of video frames. While recent video diffusion\nmodels have shown remarkable success in producing vivid visuals, they face\nlimitations in directly recovering 3D/4D scenes due to limited spatial and\ntemporal controllability during generation. To overcome this, we propose\nST-Director, which decouples spatial and temporal factors in video diffusion by\nlearning dimension-aware LoRAs from dimension-variant data. This controllable\nvideo diffusion approach enables precise manipulation of spatial structure and\ntemporal dynamics, allowing us to reconstruct both 3D and 4D representations\nfrom sequential frames with the combination of spatial and temporal dimensions.\nAdditionally, to bridge the gap between generated videos and real-world scenes,\nwe introduce a trajectory-aware mechanism for 3D generation and an\nidentity-preserving denoising strategy for 4D generation. 
Extensive experiments\non various real-world and synthetic datasets demonstrate that DimensionX\nachieves superior results in controllable video generation, as well as in 3D\nand 4D scene generation, compared with previous methods.\n","authors":["Wenqiang Sun","Shuo Chen","Fangfu Liu","Zilong Chen","Yueqi Duan","Jun Zhang","Yikai Wang"],"pdf_url":"https://arxiv.org/pdf/2411.04928v1.pdf","comment":"Project Page: https://chenshuo20.github.io/DimensionX/"},{"id":"http://arxiv.org/abs/2411.04925v1","updated":"2024-11-07T18:00:33Z","published":"2024-11-07T18:00:33Z","title":"StoryAgent: Customized Storytelling Video Generation via Multi-Agent\n Collaboration","summary":" The advent of AI-Generated Content (AIGC) has spurred research into automated\nvideo generation to streamline conventional processes. However, automating\nstorytelling video production, particularly for customized narratives, remains\nchallenging due to the complexity of maintaining subject consistency across\nshots. While existing approaches like Mora and AesopAgent integrate multiple\nagents for Story-to-Video (S2V) generation, they fall short in preserving\nprotagonist consistency and supporting Customized Storytelling Video Generation\n(CSVG). To address these limitations, we propose StoryAgent, a multi-agent\nframework designed for CSVG. StoryAgent decomposes CSVG into distinct subtasks\nassigned to specialized agents, mirroring the professional production process.\nNotably, our framework includes agents for story design, storyboard generation,\nvideo creation, agent coordination, and result evaluation. Leveraging the\nstrengths of different models, StoryAgent enhances control over the generation\nprocess, significantly improving character consistency. Specifically, we\nintroduce a customized Image-to-Video (I2V) method, LoRA-BE, to enhance\nintra-shot temporal consistency, while a novel storyboard generation pipeline\nis proposed to maintain subject consistency across shots. Extensive experiments\ndemonstrate the effectiveness of our approach in synthesizing highly consistent\nstorytelling videos, outperforming state-of-the-art methods. Our contributions\ninclude the introduction of StoryAgent, a versatile framework for video\ngeneration tasks, and novel techniques for preserving protagonist consistency.\n","authors":["Panwen Hu","Jin Jiang","Jianqi Chen","Mingfei Han","Shengcai Liao","Xiaojun Chang","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2411.04925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04924v1","updated":"2024-11-07T17:59:31Z","published":"2024-11-07T17:59:31Z","title":"MVSplat360: Feed-Forward 360 Scene Synthesis from Sparse Views","summary":" We introduce MVSplat360, a feed-forward approach for 360{\\deg} novel view\nsynthesis (NVS) of diverse real-world scenes, using only sparse observations.\nThis setting is inherently ill-posed due to minimal overlap among input views\nand insufficient visual information provided, making it challenging for\nconventional methods to achieve high-quality results. Our MVSplat360 addresses\nthis by effectively combining geometry-aware 3D reconstruction with temporally\nconsistent video generation. Specifically, it refactors a feed-forward 3D\nGaussian Splatting (3DGS) model to render features directly into the latent\nspace of a pre-trained Stable Video Diffusion (SVD) model, where these features\nthen act as pose and visual cues to guide the denoising process and produce\nphotorealistic 3D-consistent views. 
Our model is end-to-end trainable and\nsupports rendering arbitrary views with as few as 5 sparse input views. To\nevaluate MVSplat360's performance, we introduce a new benchmark using the\nchallenging DL3DV-10K dataset, where MVSplat360 achieves superior visual\nquality compared to state-of-the-art methods on wide-sweeping or even 360{\\deg}\nNVS tasks. Experiments on the existing benchmark RealEstate10K also confirm the\neffectiveness of our model. The video results are available on our project\npage: https://donydchen.github.io/mvsplat360.\n","authors":["Yuedong Chen","Chuanxia Zheng","Haofei Xu","Bohan Zhuang","Andrea Vedaldi","Tat-Jen Cham","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2411.04924v1.pdf","comment":"NeurIPS 2024, Project page: https://donydchen.github.io/mvsplat360,\n Code: https://github.com/donydchen/mvsplat360"},{"id":"http://arxiv.org/abs/2411.04923v1","updated":"2024-11-07T17:59:27Z","published":"2024-11-07T17:59:27Z","title":"VideoGLaMM: A Large Multimodal Model for Pixel-Level Visual Grounding in\n Videos","summary":" Fine-grained alignment between videos and text is challenging due to complex\nspatial and temporal dynamics in videos. Existing video-based Large Multimodal\nModels (LMMs) handle basic conversations but struggle with precise pixel-level\ngrounding in videos. To address this, we introduce VideoGLaMM, a LMM designed\nfor fine-grained pixel-level grounding in videos based on user-provided textual\ninputs. Our design seamlessly connects three key components: a Large Language\nModel, a dual vision encoder that emphasizes both spatial and temporal details,\nand a spatio-temporal decoder for accurate mask generation. This connection is\nfacilitated via tunable V-L and L-V adapters that enable close Vision-Language\n(VL) alignment. The architecture is trained to synchronize both spatial and\ntemporal elements of video content with textual instructions. To enable\nfine-grained grounding, we curate a multimodal dataset featuring detailed\nvisually-grounded conversations using a semiautomatic annotation pipeline,\nresulting in a diverse set of 38k video-QA triplets along with 83k objects and\n671k masks. We evaluate VideoGLaMM on three challenging tasks: Grounded\nConversation Generation, Visual Grounding, and Referring Video Segmentation.\nExperimental results show that our model consistently outperforms existing\napproaches across all three tasks.\n","authors":["Shehan Munasinghe","Hanan Gani","Wenqi Zhu","Jiale Cao","Eric Xing","Fahad Shahbaz Khan","Salman Khan"],"pdf_url":"https://arxiv.org/pdf/2411.04923v1.pdf","comment":"Technical Report of VideoGLaMM"},{"id":"http://arxiv.org/abs/2411.04919v1","updated":"2024-11-07T17:56:16Z","published":"2024-11-07T17:56:16Z","title":"Stem-OB: Generalizable Visual Imitation Learning with Stem-Like\n Convergent Observation through Diffusion Inversion","summary":" Visual imitation learning methods demonstrate strong performance, yet they\nlack generalization when faced with visual input perturbations, including\nvariations in lighting and textures, impeding their real-world application. We\npropose Stem-OB that utilizes pretrained image diffusion models to suppress\nlow-level visual differences while maintaining high-level scene structures.\nThis image inversion process is akin to transforming the observation into a\nshared representation, from which other observations stem, with extraneous\ndetails removed. 
Stem-OB contrasts with data-augmentation approaches as it is\nrobust to various unspecified appearance changes without the need for\nadditional training. Our method is a simple yet highly effective plug-and-play\nsolution. Empirical results confirm the effectiveness of our approach in\nsimulated tasks and show an exceptionally significant improvement in real-world\napplications, with an average increase of 22.2% in success rates compared to\nthe best baseline. See https://hukz18.github.io/Stem-Ob/ for more info.\n","authors":["Kaizhe Hu","Zihang Rui","Yao He","Yuyao Liu","Pu Hua","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2411.04919v1.pdf","comment":"Arxiv preprint version"},{"id":"http://arxiv.org/abs/2411.04912v1","updated":"2024-11-07T17:51:28Z","published":"2024-11-07T17:51:28Z","title":"Robust Iris Centre Localisation for Assistive Eye-Gaze Tracking","summary":" In this research work, we address the problem of robust iris centre\nlocalisation in unconstrained conditions as a core component of our eye-gaze\ntracking platform. We investigate the application of U-Net variants for\nsegmentation-based and regression-based approaches to improve our iris centre\nlocalisation, which was previously based on Bayes' classification. The achieved\nresults are comparable to or better than the state-of-the-art, offering a\ndrastic improvement over those achieved by the Bayes' classifier, and without\nsacrificing the real-time performance of our eye-gaze tracking platform.\n","authors":["Nipun Sandamal Ranasekara Pathiranage","Stefania Cristina","Kenneth P. Camilleri"],"pdf_url":"https://arxiv.org/pdf/2411.04912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04892v1","updated":"2024-11-07T17:31:21Z","published":"2024-11-07T17:31:21Z","title":"In the Era of Prompt Learning with Vision-Language Models","summary":" Large-scale foundation models like CLIP have shown strong zero-shot\ngeneralization but struggle with domain shifts, limiting their adaptability. In\nour work, we introduce \\textsc{StyLIP}, a novel domain-agnostic prompt learning\nstrategy for Domain Generalization (DG). StyLIP disentangles visual style and\ncontent in CLIP`s vision encoder by using style projectors to learn\ndomain-specific prompt tokens and combining them with content features. Trained\ncontrastively, this approach enables seamless adaptation across domains,\noutperforming state-of-the-art methods on multiple DG benchmarks. Additionally,\nwe propose AD-CLIP for unsupervised domain adaptation (DA), leveraging CLIP`s\nfrozen vision backbone to learn domain-invariant prompts through image style\nand content features. By aligning domains in embedding space with entropy\nminimization, AD-CLIP effectively handles domain shifts, even when only target\ndomain samples are available. Lastly, we outline future work on class discovery\nusing prompt learning for semantic segmentation in remote sensing, focusing on\nidentifying novel or rare classes in unstructured environments. 
This paves the\nway for more adaptive and generalizable models in complex, real-world\nscenarios.\n","authors":["Ankit Jha"],"pdf_url":"https://arxiv.org/pdf/2411.04892v1.pdf","comment":"ICVGIP 2024, Young Faculty Symposium"},{"id":"http://arxiv.org/abs/2410.03728v2","updated":"2024-11-07T17:19:26Z","published":"2024-09-30T10:50:12Z","title":"Exploring QUIC Dynamics: A Large-Scale Dataset for Encrypted Traffic\n Analysis","summary":" QUIC, a new and increasingly used transport protocol, addresses and resolves\nthe limitations of TCP by offering improved security, performance, and features\nsuch as stream multiplexing and connection migration. These features, however,\nalso present challenges for network operators who need to monitor and analyze\nweb traffic. In this paper, we introduce VisQUIC, a labeled dataset comprising\nover 100,000 QUIC traces from more than 44,000 websites (URLs), collected over\na four-month period. These traces provide the foundation for generating more\nthan seven million images, with configurable parameters of window length, pixel\nresolution, normalization, and labels. These images enable an observer looking\nat the interactions between a client and a server to analyze and gain insights\nabout QUIC encrypted connections. To illustrate the dataset's potential, we\noffer a use-case example of an observer estimating the number of HTTP/3\nresponses/requests pairs in a given QUIC, which can reveal server behavior,\nclient--server interactions, and the load imposed by an observed connection. We\nformulate the problem as a discrete regression problem, train a machine\nlearning (ML) model for it, and then evaluate it using the proposed dataset on\nan example use case.\n","authors":["Barak Gahtan","Robert J. Shahla","Alex M. Bronstein","Reuven Cohen"],"pdf_url":"https://arxiv.org/pdf/2410.03728v2.pdf","comment":"The dataset and the supplementary material can be provided upon\n request"},{"id":"http://arxiv.org/abs/2407.16803v2","updated":"2024-11-07T17:10:15Z","published":"2024-07-23T19:06:44Z","title":"C3T: Cross-modal Transfer Through Time for Human Action Recognition","summary":" In order to unlock the potential of diverse sensors, we investigate a method\nto transfer knowledge between modalities using the structure of a unified\nmultimodal representation space for Human Action Recognition (HAR). We\nformalize and explore an understudied cross-modal transfer setting we term\nUnsupervised Modality Adaptation (UMA), where the modality used in testing is\nnot used in supervised training, i.e. zero labeled instances of the test\nmodality are available during training. We develop three methods to perform\nUMA: Student-Teacher (ST), Contrastive Alignment (CA), and Cross-modal Transfer\nThrough Time (C3T). Our extensive experiments on various camera+IMU datasets\ncompare these methods to each other in the UMA setting, and to their empirical\nupper bound in the supervised setting. The results indicate C3T is the most\nrobust and highest performing by at least a margin of 8%, and nears the\nsupervised setting performance even in the presence of temporal noise. 
This\nmethod introduces a novel mechanism for aligning signals across time-varying\nlatent vectors, extracted from the receptive field of temporal convolutions.\nOur findings suggest that C3T has significant potential for developing\ngeneralizable models for time-series sensor data, opening new avenues for\nmulti-modal learning in various applications.\n","authors":["Abhi Kamboj","Anh Duy Nguyen","Minh Do"],"pdf_url":"https://arxiv.org/pdf/2407.16803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02397v2","updated":"2024-11-07T17:06:32Z","published":"2024-11-04T18:59:44Z","title":"Adaptive Caching for Faster Video Generation with Diffusion Transformers","summary":" Generating temporally-consistent high-fidelity videos can be computationally\nexpensive, especially over longer temporal spans. More-recent Diffusion\nTransformers (DiTs) -- despite making significant headway in this context --\nhave only heightened such challenges as they rely on larger models and heavier\nattention mechanisms, resulting in slower inference speeds. In this paper, we\nintroduce a training-free method to accelerate video DiTs, termed Adaptive\nCaching (AdaCache), which is motivated by the fact that \"not all videos are\ncreated equal\": meaning, some videos require fewer denoising steps to attain a\nreasonable quality than others. Building on this, we not only cache\ncomputations through the diffusion process, but also devise a caching schedule\ntailored to each video generation, maximizing the quality-latency trade-off. We\nfurther introduce a Motion Regularization (MoReg) scheme to utilize video\ninformation within AdaCache, essentially controlling the compute allocation\nbased on motion content. Altogether, our plug-and-play contributions grant\nsignificant inference speedups (e.g. up to 4.7x on Open-Sora 720p - 2s video\ngeneration) without sacrificing the generation quality, across multiple video\nDiT baselines.\n","authors":["Kumara Kahatapitiya","Haozhe Liu","Sen He","Ding Liu","Menglin Jia","Chenyang Zhang","Michael S. Ryoo","Tian Xie"],"pdf_url":"https://arxiv.org/pdf/2411.02397v2.pdf","comment":"Project-page is available at https://adacache-dit.github.io"},{"id":"http://arxiv.org/abs/2411.04865v1","updated":"2024-11-07T16:58:18Z","published":"2024-11-07T16:58:18Z","title":"ZAHA: Introducing the Level of Facade Generalization and the Large-Scale\n Point Cloud Facade Semantic Segmentation Benchmark Dataset","summary":" Facade semantic segmentation is a long-standing challenge in photogrammetry\nand computer vision. Although the last decades have witnessed the influx of\nfacade segmentation methods, there is a lack of comprehensive facade classes\nand data covering the architectural variability. In ZAHA, we introduce Level of\nFacade Generalization (LoFG), novel hierarchical facade classes designed based\non international urban modeling standards, ensuring compatibility with\nreal-world challenging classes and uniform methods' comparison. Realizing the\nLoFG, we present to date the largest semantic 3D facade segmentation dataset,\nproviding 601 million annotated points at five and 15 classes of LoFG2 and\nLoFG3, respectively. Moreover, we analyze the performance of baseline semantic\nsegmentation methods on our introduced LoFG classes and data, complementing it\nwith a discussion on the unresolved challenges for facade segmentation. 
We\nfirmly believe that ZAHA shall facilitate further development of 3D facade\nsemantic segmentation methods, enabling robust segmentation indispensable in\ncreating urban digital twins.\n","authors":["Olaf Wysocki","Yue Tan","Thomas Froech","Yan Xia","Magdalena Wysocki","Ludwig Hoegner","Daniel Cremers","Christoph Holst"],"pdf_url":"https://arxiv.org/pdf/2411.04865v1.pdf","comment":"Accepted to WACV 2025 (IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV))"},{"id":"http://arxiv.org/abs/2411.04859v1","updated":"2024-11-07T16:49:25Z","published":"2024-11-07T16:49:25Z","title":"A multi-purpose automatic editing system based on lecture semantics for\n remote education","summary":" Remote teaching has become popular recently due to its convenience and\nsafety, especially under extreme circumstances like a pandemic. However, online\nstudents usually have a poor experience since the information acquired from the\nviews provided by the broadcast platforms is limited. One potential solution is\nto show more camera views simultaneously, but it is technically challenging and\ndistracting for the viewers. Therefore, an automatic multi-camera\ndirecting/editing system, which aims at selecting the most concerned view at\neach time instance to guide the attention of online students, is in urgent\ndemand. However, existing systems mostly make simple assumptions and focus on\ntracking the position of the speaker instead of the real lecture semantics, and\ntherefore have limited capacities to deliver optimal information flow. To this\nend, this paper proposes an automatic multi-purpose editing system based on the\nlecture semantics, which can both direct the multiple video streams for\nreal-time broadcasting and edit the optimal video offline for review purposes.\nOur system directs the views by semantically analyzing the class events while\nfollowing the professional directing rules, mimicking a human director to\ncapture the regions of interest from the viewpoint of the onsite students. We\nconduct both qualitative and quantitative analyses to verify the effectiveness\nof the proposed system and its components.\n","authors":["Panwen Hu","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2411.04859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04844v1","updated":"2024-11-07T16:32:29Z","published":"2024-11-07T16:32:29Z","title":"Differentiable Gaussian Representation for Incomplete CT Reconstruction","summary":" Incomplete Computed Tomography (CT) benefits patients by reducing radiation\nexposure. However, reconstructing high-fidelity images from limited views or\nangles remains challenging due to the ill-posed nature of the problem. Deep\nLearning Reconstruction (DLR) methods have shown promise in enhancing image\nquality, but the paradox between training data diversity and high\ngeneralization ability remains unsolved. In this paper, we propose a novel\nGaussian Representation for Incomplete CT Reconstruction (GRCT) without the\nusage of any neural networks or full-dose CT data. Specifically, we model the\n3D volume as a set of learnable Gaussians, which are optimized directly from\nthe incomplete sinogram. Our method can be applied to multiple views and angles\nwithout changing the architecture. Additionally, we propose a differentiable\nFast CT Reconstruction method for efficient clinical usage. Extensive\nexperiments on multiple datasets and settings demonstrate significant\nimprovements in reconstruction quality metrics and high efficiency. 
We plan to\nrelease our code as open-source.\n","authors":["Shaokai Wu","Yuxiang Lu","Wei Ji","Suizhi Huang","Fengyu Yang","Shalayiding Sirejiding","Qichen He","Jing Tong","Yanbiao Ji","Yue Ding","Hongtao Lu"],"pdf_url":"https://arxiv.org/pdf/2411.04844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04826v1","updated":"2024-11-07T16:07:00Z","published":"2024-11-07T16:07:00Z","title":"D$^3$epth: Self-Supervised Depth Estimation with Dynamic Mask in Dynamic\n Scenes","summary":" Depth estimation is a crucial technology in robotics. Recently,\nself-supervised depth estimation methods have demonstrated great potential as\nthey can efficiently leverage large amounts of unlabelled real-world data.\nHowever, most existing methods are designed under the assumption of static\nscenes, which hinders their adaptability in dynamic environments. To address\nthis issue, we present D$^3$epth, a novel method for self-supervised depth\nestimation in dynamic scenes. It tackles the challenge of dynamic objects from\ntwo key perspectives. First, within the self-supervised framework, we design a\nreprojection constraint to identify regions likely to contain dynamic objects,\nallowing the construction of a dynamic mask that mitigates their impact at the\nloss level. Second, for multi-frame depth estimation, we introduce a cost\nvolume auto-masking strategy that leverages adjacent frames to identify regions\nassociated with dynamic objects and generate corresponding masks. This provides\nguidance for subsequent processes. Furthermore, we propose a spectral entropy\nuncertainty module that incorporates spectral entropy to guide uncertainty\nestimation during depth fusion, effectively addressing issues arising from cost\nvolume computation in dynamic environments. Extensive experiments on KITTI and\nCityscapes datasets demonstrate that the proposed method consistently\noutperforms existing self-supervised monocular depth estimation baselines. Code\nis available at \\url{https://github.com/Csyunling/D3epth}.\n","authors":["Siyu Chen","Hong Liu","Wenhao Li","Ying Zhu","Guoquan Wang","Jianbing Wu"],"pdf_url":"https://arxiv.org/pdf/2411.04826v1.pdf","comment":"Open sourced"},{"id":"http://arxiv.org/abs/2411.04821v1","updated":"2024-11-07T15:58:17Z","published":"2024-11-07T15:58:17Z","title":"End-to-end Inception-Unet based Generative Adversarial Networks for Snow\n and Rain Removals","summary":" The superior performance introduced by deep learning approaches in removing\natmospheric particles such as snow and rain from a single image; favors their\nusage over classical ones. However, deep learning-based approaches still suffer\nfrom challenges related to the particle appearance characteristics such as\nsize, type, and transparency. Furthermore, due to the unique characteristics of\nrain and snow particles, single network based deep learning approaches struggle\nin handling both degradation scenarios simultaneously. In this paper, a global\nframework that consists of two Generative Adversarial Networks (GANs) is\nproposed where each handles the removal of each particle individually. The\narchitectures of both desnowing and deraining GANs introduce the integration of\na feature extraction phase with the classical U-net generator network which in\nturn enhances the removal performance in the presence of severe variations in\nsize and appearance. Furthermore, a realistic dataset that contains pairs of\nsnowy images next to their groundtruth images estimated using a low-rank\napproximation approach; is presented. 
The experiments show that the proposed\ndesnowing and deraining approaches achieve significant improvements in\ncomparison to the state-of-the-art approaches when tested on both synthetic and\nrealistic datasets.\n","authors":["Ibrahim Kajo","Mohamed Kas","Yassine Ruichek"],"pdf_url":"https://arxiv.org/pdf/2411.04821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03794v2","updated":"2024-11-07T15:56:00Z","published":"2024-07-04T09:57:44Z","title":"CardioSpectrum: Comprehensive Myocardium Motion Analysis with 3D Deep\n Learning and Geometric Insights","summary":" The ability to map left ventricle (LV) myocardial motion using computed\ntomography angiography (CTA) is essential to diagnosing cardiovascular\nconditions and guiding interventional procedures. Due to their inherent\nlocality, conventional neural networks typically have difficulty predicting\nsubtle tangential movements, which considerably lessens the level of precision\nat which myocardium three-dimensional (3D) mapping can be performed. Using 3D\noptical flow techniques and Functional Maps (FMs), we present a comprehensive\napproach to address this problem. FMs are known for their capacity to capture\nglobal geometric features, thus providing a fuller understanding of 3D\ngeometry. As an alternative to traditional segmentation-based priors, we employ\nsurface-based two-dimensional (2D) constraints derived from spectral\ncorrespondence methods. Our 3D deep learning architecture, based on the ARFlow\nmodel, is optimized to handle complex 3D motion analysis tasks. By\nincorporating FMs, we can capture the subtle tangential movements of the\nmyocardium surface precisely, hence significantly improving the accuracy of 3D\nmapping of the myocardium. The experimental results confirm the effectiveness\nof this method in enhancing myocardium motion analysis. This approach can\ncontribute to improving cardiovascular diagnosis and treatment. Our code and\nadditional resources are available at:\nhttps://shaharzuler.github.io/CardioSpectrumPage\n","authors":["Shahar Zuler","Shai Tejman-Yarden","Dan Raviv"],"pdf_url":"https://arxiv.org/pdf/2407.03794v2.pdf","comment":"This paper has been early accepted to MICCAI 2024, LNCS 15005,\n Springer, 2024"},{"id":"http://arxiv.org/abs/2411.04810v1","updated":"2024-11-07T15:47:07Z","published":"2024-11-07T15:47:07Z","title":"GANESH: Generalizable NeRF for Lensless Imaging","summary":" Lensless imaging offers a significant opportunity to develop ultra-compact\ncameras by removing the conventional bulky lens system. However, without a\nfocusing element, the sensor's output is no longer a direct image but a complex\nmultiplexed scene representation. Traditional methods have attempted to address\nthis challenge by employing learnable inversions and refinement models, but\nthese methods are primarily designed for 2D reconstruction and do not\ngeneralize well to 3D reconstruction. We introduce GANESH, a novel framework\ndesigned to enable simultaneous refinement and novel view synthesis from\nmulti-view lensless images. Unlike existing methods that require scene-specific\ntraining, our approach supports on-the-fly inference without retraining on each\nscene. Moreover, our framework allows us to tune our model to specific scenes,\nenhancing the rendering and refinement quality. 
To facilitate research in this\narea, we also present the first multi-view lensless dataset, LenslessScenes.\nExtensive experiments demonstrate that our method outperforms current\napproaches in reconstruction accuracy and refinement quality. Code and video\nresults are available at https://rakesh-123-cryp.github.io/Rakesh.github.io/\n","authors":["Rakesh Raj Madavan","Akshat Kaimal","Badhrinarayanan K V","Vinayak Gupta","Rohit Choudhary","Chandrakala Shanmuganathan","Kaushik Mitra"],"pdf_url":"https://arxiv.org/pdf/2411.04810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01031v2","updated":"2024-11-07T15:41:48Z","published":"2024-10-01T19:45:01Z","title":"Pediatric Wrist Fracture Detection Using Feature Context Excitation\n Modules in X-ray Images","summary":" Children often suffer wrist trauma in daily life, while they usually need\nradiologists to analyze and interpret X-ray images before surgical treatment by\nsurgeons. The development of deep learning has enabled neural networks to serve\nas computer-assisted diagnosis (CAD) tools to help doctors and experts in\nmedical image diagnostics. Since YOLOv8 model has obtained the satisfactory\nsuccess in object detection tasks, it has been applied to various fracture\ndetection. This work introduces four variants of Feature Contexts\nExcitation-YOLOv8 (FCE-YOLOv8) model, each incorporating a different FCE module\n(i.e., modules of Squeeze-and-Excitation (SE), Global Context (GC),\nGather-Excite (GE), and Gaussian Context Transformer (GCT)) to enhance the\nmodel performance. Experimental results on GRAZPEDWRI-DX dataset demonstrate\nthat our proposed YOLOv8+GC-M3 model improves the mAP@50 value from 65.78% to\n66.32%, outperforming the state-of-the-art (SOTA) model while reducing\ninference time. Furthermore, our proposed YOLOv8+SE-M3 model achieves the\nhighest mAP@50 value of 67.07%, exceeding the SOTA performance. The\nimplementation of this work is available at\nhttps://github.com/RuiyangJu/FCE-YOLOv8.\n","authors":["Rui-Yang Ju","Chun-Tse Chien","Enkaer Xieerke","Jen-Shiun Chiang"],"pdf_url":"https://arxiv.org/pdf/2410.01031v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.03163"},{"id":"http://arxiv.org/abs/2411.03225v2","updated":"2024-11-07T15:41:20Z","published":"2024-11-05T16:15:33Z","title":"Knowledge Graphs of Driving Scenes to Empower the Emerging Capabilities\n of Neurosymbolic AI","summary":" In the era of Generative AI, Neurosymbolic AI is emerging as a powerful\napproach for tasks spanning from perception to cognition. The use of\nNeurosymbolic AI has been shown to achieve enhanced capabilities, including\nimproved grounding, alignment, explainability, and reliability. However, due to\nits nascent stage, there is a lack of widely available real-world benchmark\ndatasets tailored to Neurosymbolic AI tasks. To address this gap and support\nthe evaluation of current and future methods, we introduce DSceneKG -- a suite\nof knowledge graphs of driving scenes built from real-world, high-quality\nscenes from multiple open autonomous driving datasets. In this article, we\ndetail the construction process of DSceneKG and highlight its application in\nseven different tasks. 
DSceneKG is publicly accessible at:\nhttps://github.com/ruwantw/DSceneKG\n","authors":["Ruwan Wickramarachchi","Cory Henson","Amit Sheth"],"pdf_url":"https://arxiv.org/pdf/2411.03225v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2310.13040v2","updated":"2024-11-07T15:40:09Z","published":"2023-10-19T17:59:12Z","title":"Interpreting CLIP: Insights on the Robustness to ImageNet Distribution\n Shifts","summary":" What distinguishes robust models from non-robust ones? While for ImageNet\ndistribution shifts it has been shown that such differences in robustness can\nbe traced back predominantly to differences in training data, so far it is not\nknown what that translates to in terms of what the model has learned. In this\nwork, we bridge this gap by probing the representation spaces of 16 robust\nzero-shot CLIP vision encoders with various backbones (ResNets and ViTs) and\npretraining sets (OpenAI, LAION-400M, LAION-2B, YFCC15M, CC12M and {DataComp}),\nand comparing them to the representation spaces of less robust models with\nidentical backbones, but different (pre)training sets or objectives (CLIP\npretraining on ImageNet-Captions, and supervised training or finetuning on\nImageNet).Through this analysis, we generate three novel insights. Firstly, we\ndetect the presence of outlier features in robust zero-shot CLIP vision\nencoders, which to the best of our knowledge is the first time these are\nobserved in non-language and non-transformer models. Secondly, we find the\nexistence of outlier features to be an indication of ImageNet shift robustness\nin models, since we only find them in robust models in our analysis. Lastly, we\nalso investigate the number of unique encoded concepts in the representation\nspace and find zero-shot CLIP models to encode a higher number of unique\nconcepts in their representation space. However, we do not find this to be an\nindicator of ImageNet shift robustness and hypothesize that it is rather\nrelated to the language supervision. Since the presence of outlier features can\nbe detected without access to any data from shifted datasets, we believe that\nthey could be a useful tool for practitioners to get a feeling for the\ndistribution shift robustness of a pretrained model during deployment.\n","authors":["Jonathan Crabbé","Pau Rodríguez","Vaishaal Shankar","Luca Zappella","Arno Blaas"],"pdf_url":"https://arxiv.org/pdf/2310.13040v2.pdf","comment":"Published in TMLR"},{"id":"http://arxiv.org/abs/2411.04796v1","updated":"2024-11-07T15:36:49Z","published":"2024-11-07T15:36:49Z","title":"MPVO: Motion-Prior based Visual Odometry for PointGoal Navigation","summary":" Visual odometry (VO) is essential for enabling accurate point-goal navigation\nof embodied agents in indoor environments where GPS and compass sensors are\nunreliable and inaccurate. However, traditional VO methods face challenges in\nwide-baseline scenarios, where fast robot motions and low frames per second\n(FPS) during inference hinder their performance, leading to drift and\ncatastrophic failures in point-goal navigation. Recent deep-learned VO methods\nshow robust performance but suffer from sample inefficiency during training;\nhence, they require huge datasets and compute resources. So, we propose a\nrobust and sample-efficient VO pipeline based on motion priors available while\nan agent is navigating an environment. 
It consists of a training-free\naction-prior based geometric VO module that estimates a coarse relative pose\nwhich is further consumed as a motion prior by a deep-learned VO model, which\nfinally produces a fine relative pose to be used by the navigation policy. This\nstrategy helps our pipeline achieve up to 2x sample efficiency during training\nand demonstrates superior accuracy and robustness in point-goal navigation\ntasks compared to state-of-the-art VO method(s). Realistic indoor environments\nof the Gibson dataset is used in the AI-Habitat simulator to evaluate the\nproposed approach using navigation metrics (like success/SPL) and pose metrics\n(like RPE/ATE). We hope this method further opens a direction of work where\nmotion priors from various sources can be utilized to improve VO estimates and\nachieve better results in embodied navigation tasks.\n","authors":["Sayan Paul","Ruddra dev Roychoudhury","Brojeshwar Bhowmick"],"pdf_url":"https://arxiv.org/pdf/2411.04796v1.pdf","comment":"Accepted in 50SFM Workshop of the 18th European Conference on\n Computer Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2410.16261v3","updated":"2024-11-07T15:35:52Z","published":"2024-10-21T17:58:20Z","title":"Mini-InternVL: A Flexible-Transfer Pocket Multimodal Model with 5%\n Parameters and 90% Performance","summary":" Multimodal large language models (MLLMs) have demonstrated impressive\nperformance in vision-language tasks across a broad spectrum of domains.\nHowever, the large model scale and associated high computational costs pose\nsignificant challenges for training and deploying MLLMs on consumer-grade GPUs\nor edge devices, thereby hindering their widespread application. In this work,\nwe introduce Mini-InternVL, a series of MLLMs with parameters ranging from 1B\nto 4B, which achieves 90% of the performance with only 5% of the parameters.\nThis significant improvement in efficiency and effectiveness makes our models\nmore accessible and applicable in various real-world scenarios. To further\npromote the adoption of our models, we develop a unified adaptation framework\nfor Mini-InternVL, which enables our models to transfer and outperform\nspecialized models in downstream tasks, including autonomous driving, medical\nimages, and remote sensing. We believe that our study can provide valuable\ninsights and resources to advance the development of efficient and effective\nMLLMs. Code is available at https://github.com/OpenGVLab/InternVL.\n","authors":["Zhangwei Gao","Zhe Chen","Erfei Cui","Yiming Ren","Weiyun Wang","Jinguo Zhu","Hao Tian","Shenglong Ye","Junjun He","Xizhou Zhu","Lewei Lu","Tong Lu","Yu Qiao","Jifeng Dai","Wenhai Wang"],"pdf_url":"https://arxiv.org/pdf/2410.16261v3.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2406.10395v3","updated":"2024-11-07T15:28:50Z","published":"2024-06-14T19:49:45Z","title":"BrainSegFounder: Towards 3D Foundation Models for Neuroimage\n Segmentation","summary":" The burgeoning field of brain health research increasingly leverages\nartificial intelligence (AI) to interpret and analyze neurological data. This\nstudy introduces a novel approach towards the creation of medical foundation\nmodels by integrating a large-scale multi-modal magnetic resonance imaging\n(MRI) dataset derived from 41,400 participants in its own. Our method involves\na novel two-stage pretraining approach using vision transformers. 
The first\nstage is dedicated to encoding anatomical structures in generally healthy\nbrains, identifying key features such as shapes and sizes of different brain\nregions. The second stage concentrates on spatial information, encompassing\naspects like location and the relative positioning of brain structures. We\nrigorously evaluate our model, BrainFounder, using the Brain Tumor Segmentation\n(BraTS) challenge and Anatomical Tracings of Lesions After Stroke v2.0 (ATLAS\nv2.0) datasets. BrainFounder demonstrates a significant performance gain,\nsurpassing the achievements of the previous winning solutions using fully\nsupervised learning. Our findings underscore the impact of scaling up both the\ncomplexity of the model and the volume of unlabeled training data derived from\ngenerally healthy brains, which enhances the accuracy and predictive\ncapabilities of the model in complex neuroimaging tasks with MRI. The\nimplications of this research provide transformative insights and practical\napplications in healthcare and make substantial steps towards the creation of\nfoundation models for Medical AI. Our pretrained models and training code can\nbe found at https://github.com/lab-smile/GatorBrain.\n","authors":["Joseph Cox","Peng Liu","Skylar E. Stolte","Yunchao Yang","Kang Liu","Kyle B. See","Huiwen Ju","Ruogu Fang"],"pdf_url":"https://arxiv.org/pdf/2406.10395v3.pdf","comment":"19 pages, 5 figures, to be published in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2411.04782v1","updated":"2024-11-07T15:22:32Z","published":"2024-11-07T15:22:32Z","title":"An Effective Pipeline for Whole-Slide Image Glomerulus Segmentation","summary":" Whole-slide images (WSI) glomerulus segmentation is essential for accurately\ndiagnosing kidney diseases. In this work, we propose a practical pipeline for\nglomerulus segmentation that effectively enhances both patch-level and\nWSI-level segmentation tasks. Our approach leverages stitching on overlapping\npatches, increasing the detection coverage, especially when glomeruli are\nlocated near patch image borders. In addition, we conduct comprehensive\nevaluations from different segmentation models across two large and diverse\ndatasets with over 30K glomerulus annotations. Experimental results demonstrate\nthat models using our pipeline outperform the previous state-of-the-art method,\nachieving superior results across both datasets and setting a new benchmark for\nglomerulus segmentation in WSIs. The code and pre-trained models are available\nat https://github.com/huuquan1994/wsi_glomerulus_seg.\n","authors":["Quan Huu Cap"],"pdf_url":"https://arxiv.org/pdf/2411.04782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03286v2","updated":"2024-11-07T15:07:59Z","published":"2024-11-05T17:35:41Z","title":"DiT4Edit: Diffusion Transformer for Image Editing","summary":" Despite recent advances in UNet-based image editing, methods for shape-aware\nobject editing in high-resolution images are still lacking. Compared to UNet,\nDiffusion Transformers (DiT) demonstrate superior capabilities to effectively\ncapture the long-range dependencies among patches, leading to higher-quality\nimage generation. In this paper, we propose DiT4Edit, the first Diffusion\nTransformer-based image editing framework. Specifically, DiT4Edit uses the\nDPM-Solver inversion algorithm to obtain the inverted latents, reducing the\nnumber of steps compared to the DDIM inversion algorithm commonly used in\nUNet-based frameworks. 
Additionally, we design unified attention control and\npatches merging, tailored for transformer computation streams. This integration\nallows our framework to generate higher-quality edited images faster. Our\ndesign leverages the advantages of DiT, enabling it to surpass UNet structures\nin image editing, especially in high-resolution and arbitrary-size images.\nExtensive experiments demonstrate the strong performance of DiT4Edit across\nvarious editing scenarios, highlighting the potential of Diffusion Transformers\nin supporting image editing.\n","authors":["Kunyu Feng","Yue Ma","Bingyuan Wang","Chenyang Qi","Haozhe Chen","Qifeng Chen","Zeyu Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03286v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08673v2","updated":"2024-11-07T14:49:28Z","published":"2024-10-11T09:59:21Z","title":"SpikeBottleNet: Spike-Driven Feature Compression Architecture for\n Edge-Cloud Co-Inference","summary":" Edge-cloud co-inference enables efficient deep neural network (DNN)\ndeployment by splitting the architecture between an edge device and cloud\nserver, crucial for resource-constraint edge devices. This approach requires\nbalancing on-device computations and communication costs, often achieved\nthrough compressed intermediate feature transmission. Conventional DNN\narchitectures require continuous data processing and floating point\nactivations, leading to considerable energy consumption and increased feature\nsizes, thus raising transmission costs. This challenge motivates exploring\nbinary, event-driven activations using spiking neural networks (SNNs), known\nfor their extreme energy efficiency. In this research, we propose\nSpikeBottleNet, a novel architecture for edge-cloud co-inference systems that\nintegrates a spiking neuron model to significantly reduce energy consumption on\nedge devices. A key innovation of our study is an intermediate feature\ncompression technique tailored for SNNs for efficient feature transmission.\nThis technique leverages a split computing approach to strategically place\nencoder-decoder bottleneck units within complex deep architectures like ResNet\nand MobileNet. Experimental results demonstrate that SpikeBottleNet achieves up\nto 256x bit compression in the final convolutional layer of ResNet, with\nminimal accuracy loss (0.16%). Additionally, our approach enhances edge device\nenergy efficiency by up to 144x compared to the baseline BottleNet, making it\nideal for resource-limited edge devices.\n","authors":["Maruf Hassan","Steven Davy"],"pdf_url":"https://arxiv.org/pdf/2410.08673v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04746v1","updated":"2024-11-07T14:29:02Z","published":"2024-11-07T14:29:02Z","title":"Taming Rectified Flow for Inversion and Editing","summary":" Rectified-flow-based diffusion transformers, such as FLUX and OpenSora, have\ndemonstrated exceptional performance in the field of image and video\ngeneration. Despite their robust generative capabilities, these models often\nsuffer from inaccurate inversion, which could further limit their effectiveness\nin downstream tasks such as image and video editing. 
To address this issue, we\npropose RF-Solver, a novel training-free sampler that enhances inversion\nprecision by reducing errors in the process of solving rectified flow ODEs.\nSpecifically, we derive the exact formulation of the rectified flow ODE and\nperform a high-order Taylor expansion to estimate its nonlinear components,\nsignificantly decreasing the approximation error at each timestep. Building\nupon RF-Solver, we further design RF-Edit, which comprises specialized\nsub-modules for image and video editing. By sharing self-attention layer\nfeatures during the editing process, RF-Edit effectively preserves the\nstructural information of the source image or video while achieving\nhigh-quality editing results. Our approach is compatible with any pre-trained\nrectified-flow-based models for image and video tasks, requiring no additional\ntraining or optimization. Extensive experiments on text-to-image generation,\nimage & video inversion, and image & video editing demonstrate the robust\nperformance and adaptability of our methods. Code is available at\nhttps://github.com/wangjiangshan0725/RF-Solver-Edit.\n","authors":["Jiangshan Wang","Junfu Pu","Zhongang Qi","Jiayi Guo","Yue Ma","Nisha Huang","Yuxin Chen","Xiu Li","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2411.04746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04732v1","updated":"2024-11-07T14:12:00Z","published":"2024-11-07T14:12:00Z","title":"Convolutional Differentiable Logic Gate Networks","summary":" With the increasing inference cost of machine learning models, there is a\ngrowing interest in models with fast and efficient inference. Recently, an\napproach for learning logic gate networks directly via a differentiable\nrelaxation was proposed. Logic gate networks are faster than conventional\nneural network approaches because their inference only requires logic gate\noperators such as NAND, OR, and XOR, which are the underlying building blocks\nof current hardware and can be efficiently executed. We build on this idea,\nextending it by deep logic gate tree convolutions, logical OR pooling, and\nresidual initializations. This allows scaling logic gate networks up by over\none order of magnitude and utilizing the paradigm of convolution. On CIFAR-10,\nwe achieve an accuracy of 86.29% using only 61 million logic gates, which\nimproves over the SOTA while being 29x smaller.\n","authors":["Felix Petersen","Hilde Kuehne","Christian Borgelt","Julian Welzel","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2411.04732v1.pdf","comment":"Published at NeurIPS 2024 (Oral)"},{"id":"http://arxiv.org/abs/2411.04724v1","updated":"2024-11-07T14:02:41Z","published":"2024-11-07T14:02:41Z","title":"Controlling Human Shape and Pose in Text-to-Image Diffusion Models via\n Domain Adaptation","summary":" We present a methodology for conditional control of human shape and pose in\npretrained text-to-image diffusion models using a 3D human parametric model\n(SMPL). Fine-tuning these diffusion models to adhere to new conditions requires\nlarge datasets and high-quality annotations, which can be more cost-effectively\nacquired through synthetic data generation rather than real-world data.\nHowever, the domain gap and low scene diversity of synthetic data can\ncompromise the pretrained model's visual fidelity. 
We propose a\ndomain-adaptation technique that maintains image quality by isolating\nsynthetically trained conditional information in the classifier-free guidance\nvector and composing it with another control network to adapt the generated\nimages to the input domain. To achieve SMPL control, we fine-tune a\nControlNet-based architecture on the synthetic SURREAL dataset of rendered\nhumans and apply our domain adaptation at generation time. Experiments\ndemonstrate that our model achieves greater shape and pose diversity than the\n2d pose-based ControlNet, while maintaining the visual fidelity and improving\nstability, proving its usefulness for downstream tasks such as human animation.\n","authors":["Benito Buchheim","Max Reimann","Jürgen Döllner"],"pdf_url":"https://arxiv.org/pdf/2411.04724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02340v5","updated":"2024-11-07T14:00:08Z","published":"2023-09-05T15:57:23Z","title":"Local Padding in Patch-Based GANs for Seamless Infinite-Sized Texture\n Synthesis","summary":" Texture models based on Generative Adversarial Networks (GANs) use\nzero-padding to implicitly encode positional information of the image features.\nHowever, when extending the spatial input to generate images at large sizes,\nzero-padding can often lead to degradation in image quality due to the\nincorrect positional information at the center of the image. Moreover,\nzero-padding can limit the diversity within the generated large images. In this\npaper, we propose a novel approach for generating stochastic texture images at\nlarge arbitrary sizes using GANs based on patch-by-patch generation. Instead of\nzero-padding, the model uses \\textit{local padding} in the generator that\nshares border features between the generated patches; providing positional\ncontext and ensuring consistency at the boundaries. The proposed models are\ntrainable on a single texture image and have a constant GPU scalability with\nrespect to the output image size, and hence can generate images of infinite\nsizes. We show in the experiments that our method has a significant advancement\nbeyond existing GANs-based texture models in terms of the quality and diversity\nof the generated textures. Furthermore, the implementation of local padding in\nthe state-of-the-art super-resolution models effectively eliminates tiling\nartifacts enabling large-scale super-resolution. Our code is available at\n\\url{https://github.com/ai4netzero/Infinite_Texture_GANs}.\n","authors":["Alhasan Abdellatif","Ahmed H. Elsheikh","Hannah P. Menke"],"pdf_url":"https://arxiv.org/pdf/2309.02340v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04717v1","updated":"2024-11-07T13:57:53Z","published":"2024-11-07T13:57:53Z","title":"Subspace-Constrained Quadratic Matrix Factorization: Algorithm and\n Applications","summary":" Matrix Factorization has emerged as a widely adopted framework for modeling\ndata exhibiting low-rank structures. To address challenges in manifold\nlearning, this paper presents a subspace-constrained quadratic matrix\nfactorization model. The model is designed to jointly learn key low-dimensional\nstructures, including the tangent space, the normal subspace, and the quadratic\nform that links the tangent space to a low-dimensional representation. We solve\nthe proposed factorization model using an alternating minimization method,\ninvolving an in-depth investigation of nonlinear regression and projection\nsubproblems. 
Theoretical properties of the quadratic projection problem and\nconvergence characteristics of the alternating strategy are also investigated.\nTo validate our approach, we conduct numerical experiments on synthetic and\nreal-world datasets. Results demonstrate that our model outperforms existing\nmethods, highlighting its robustness and efficacy in capturing core\nlow-dimensional structures.\n","authors":["Zheng Zhai","Xiaohui Li"],"pdf_url":"https://arxiv.org/pdf/2411.04717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04715v1","updated":"2024-11-07T13:56:13Z","published":"2024-11-07T13:56:13Z","title":"NeuroFly: A framework for whole-brain single neuron reconstruction","summary":" Neurons, with their elongated, tree-like dendritic and axonal structures,\nenable efficient signal integration and long-range communication across brain\nregions. By reconstructing individual neurons' morphology, we can gain valuable\ninsights into brain connectivity, revealing the structure basis of cognition,\nmovement, and perception. Despite the accumulation of extensive 3D microscopic\nimaging data, progress has been considerably hindered by the absence of\nautomated tools to streamline this process. Here we introduce NeuroFly, a\nvalidated framework for large-scale automatic single neuron reconstruction.\nThis framework breaks down the process into three distinct stages:\nsegmentation, connection, and proofreading. In the segmentation stage, we\nperform automatic segmentation followed by skeletonization to generate\nover-segmented neuronal fragments without branches. During the connection\nstage, we use a 3D image-based path following approach to extend each fragment\nand connect it with other fragments of the same neuron. Finally, human\nannotators are required only to proofread the few unresolved positions. The\nfirst two stages of our process are clearly defined computer vision problems,\nand we have trained robust baseline models to solve them. We validated\nNeuroFly's efficiency using in-house datasets that include a variety of\nchallenging scenarios, such as dense arborizations, weak axons, images with\ncontamination. We will release the datasets along with a suite of visualization\nand annotation tools for better reproducibility. Our goal is to foster\ncollaboration among researchers to address the neuron reconstruction challenge,\nultimately accelerating advancements in neuroscience research. The dataset and\ncode are available at https://github.com/beanli161514/neurofly\n","authors":["Rubin Zhao","Yang Liu","Shiqi Zhang","Zijian Yi","Yanyang Xiao","Fang Xu","Yi Yang","Pencheng Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.04715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04711v1","updated":"2024-11-07T13:53:13Z","published":"2024-11-07T13:53:13Z","title":"Progressive Multi-Level Alignments for Semi-Supervised Domain Adaptation\n SAR Target Recognition Using Simulated Data","summary":" Recently, an intriguing research trend for automatic target recognition (ATR)\nfrom synthetic aperture radar (SAR) imagery has arisen: using simulated data to\ntrain ATR models is a feasible solution to the issue of inadequate measured\ndata. To close the domain gap that exists between the real and simulated data,\nthe unsupervised domain adaptation (UDA) techniques are frequently exploited to\nconstruct ATR models. However, for UDA, the target domain lacks labeled data to\ndirect the model training, posing a great challenge to ATR performance. 
To\naddress the above problem, a semi-supervised domain adaptation (SSDA) framework\nhas been proposed adopting progressive multi-level alignments for simulated\ndata-aided SAR ATR. First, a progressive wavelet transform data augmentation\n(PWTDA) is presented by analyzing the discrepancies of wavelet decomposition\nsub-bands of two domain images, obtaining the domain-level alignment.\nSpecifically, the domain gap is narrowed by mixing the wavelet transform\nhigh-frequency sub-band components. Second, we develop an asymptotic\ninstance-prototype alignment (AIPA) strategy to push the source domain\ninstances close to the corresponding target prototypes, aiming to achieve\ncategory-level alignment. Moreover, the consistency alignment is implemented by\nexcavating the strong-weak augmentation consistency of both individual samples\nand the multi-sample relationship, enhancing the generalization capability of\nthe model. Extensive experiments on the Synthetic and Measured Paired Labeled\nExperiment (SAMPLE) dataset, indicate that our approach obtains recognition\naccuracies of 99.63% and 98.91% in two common experimental settings with only\none labeled sample per class of the target domain, outperforming the most\nadvanced SSDA techniques.\n","authors":["Xinzheng Zhang","Hui Zhu","Hongqian Zhuang"],"pdf_url":"https://arxiv.org/pdf/2411.04711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04707v1","updated":"2024-11-07T13:45:23Z","published":"2024-11-07T13:45:23Z","title":"From CNN to ConvRNN: Adapting Visualization Techniques for Time-Series\n Anomaly Detection","summary":" Nowadays, neural networks are commonly used to solve various problems.\nUnfortunately, despite their effectiveness, they are often perceived as black\nboxes capable of providing answers without explaining their decisions, which\nraises numerous ethical and legal concerns. Fortunately, the field of\nexplainability helps users understand these results. This aspect of machine\nlearning allows users to grasp the decision-making process of a model and\nverify the relevance of its outcomes. In this article, we focus on the learning\nprocess carried out by a ``time distributed`` convRNN, which performs anomaly\ndetection from video data.\n","authors":["Fabien Poirier"],"pdf_url":"https://arxiv.org/pdf/2411.04707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04706v1","updated":"2024-11-07T13:45:04Z","published":"2024-11-07T13:45:04Z","title":"ESC-MISR: Enhancing Spatial Correlations for Multi-Image\n Super-Resolution in Remote Sensing","summary":" Multi-Image Super-Resolution (MISR) is a crucial yet challenging research\ntask in the remote sensing community. In this paper, we address the challenging\ntask of Multi-Image Super-Resolution in Remote Sensing (MISR-RS), aiming to\ngenerate a High-Resolution (HR) image from multiple Low-Resolution (LR) images\nobtained by satellites. Recently, the weak temporal correlations among LR\nimages have attracted increasing attention in the MISR-RS task. However,\nexisting MISR methods treat the LR images as sequences with strong temporal\ncorrelations, overlooking spatial correlations and imposing temporal\ndependencies. 
To address this problem, we propose a novel end-to-end framework\nnamed Enhancing Spatial Correlations in MISR (ESC-MISR), which fully exploits\nthe spatial-temporal relations of multiple images for HR image reconstruction.\nSpecifically, we first introduce a novel fusion module named Multi-Image\nSpatial Transformer (MIST), which emphasizes parts with clearer global spatial\nfeatures and enhances the spatial correlations between LR images. Besides, we\nperform a random shuffle strategy for the sequential inputs of LR images to\nattenuate temporal dependencies and capture weak temporal correlations in the\ntraining stage. Compared with the state-of-the-art methods, our ESC-MISR\nachieves 0.70dB and 0.76dB cPSNR improvements on the two bands of the PROBA-V\ndataset respectively, demonstrating the superiority of our method.\n","authors":["Zhihui Zhang","Jinhui Pang","Jianan Li","Xiaoshuai Hao"],"pdf_url":"https://arxiv.org/pdf/2411.04706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17822v3","updated":"2024-11-07T13:34:24Z","published":"2024-03-26T16:00:31Z","title":"DN-Splatter: Depth and Normal Priors for Gaussian Splatting and Meshing","summary":" High-fidelity 3D reconstruction of common indoor scenes is crucial for VR and\nAR applications. 3D Gaussian splatting, a novel differentiable rendering\ntechnique, has achieved state-of-the-art novel view synthesis results with high\nrendering speeds and relatively low training times. However, its performance on\nscenes commonly seen in indoor datasets is poor due to the lack of geometric\nconstraints during optimization. In this work, we explore the use of readily\naccessible geometric cues to enhance Gaussian splatting optimization in\nchallenging, ill-posed, and textureless scenes. We extend 3D Gaussian splatting\nwith depth and normal cues to tackle challenging indoor datasets and showcase\ntechniques for efficient mesh extraction. Specifically, we regularize the\noptimization procedure with depth information, enforce local smoothness of\nnearby Gaussians, and use off-the-shelf monocular networks to achieve better\nalignment with the true scene geometry. We propose an adaptive depth loss based\non the gradient of color images, improving depth estimation and novel view\nsynthesis results over various baselines. Our simple yet effective\nregularization technique enables direct mesh extraction from the Gaussian\nrepresentation, yielding more physically accurate reconstructions of indoor\nscenes.\n","authors":["Matias Turkulainen","Xuqian Ren","Iaroslav Melekhov","Otto Seiskari","Esa Rahtu","Juho Kannala"],"pdf_url":"https://arxiv.org/pdf/2403.17822v3.pdf","comment":"To be published in 2025 IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV)"},{"id":"http://arxiv.org/abs/2411.04697v1","updated":"2024-11-07T13:31:07Z","published":"2024-11-07T13:31:07Z","title":"Dynamic Brightness Adaptation for Robust Multi-modal Image Fusion","summary":" Infrared and visible image fusion aim to integrate modality strengths for\nvisually enhanced, informative images. Visible imaging in real-world scenarios\nis susceptible to dynamic environmental brightness fluctuations, leading to\ntexture degradation. Existing fusion methods lack robustness against such\nbrightness perturbations, significantly compromising the visual fidelity of the\nfused imagery. 
To address this challenge, we propose the Brightness Adaptive\nmultimodal dynamic fusion framework (BA-Fusion), which achieves robust image\nfusion despite dynamic brightness fluctuations. Specifically, we introduce a\nBrightness Adaptive Gate (BAG) module, which is designed to dynamically select\nfeatures from brightness-related channels for normalization, while preserving\nbrightness-independent structural information within the source images.\nFurthermore, we propose a brightness consistency loss function to optimize the\nBAG module. The entire framework is tuned via alternating training strategies.\nExtensive experiments validate that our method surpasses state-of-the-art\nmethods in preserving multi-modal image information and visual fidelity, while\nexhibiting remarkable robustness across varying brightness levels. Our code is\navailable: https://github.com/SunYM2020/BA-Fusion.\n","authors":["Yiming Sun","Bing Cao","Pengfei Zhu","Qinghua Hu"],"pdf_url":"https://arxiv.org/pdf/2411.04697v1.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2411.04693v1","updated":"2024-11-07T13:26:20Z","published":"2024-11-07T13:26:20Z","title":"Reciprocal Point Learning Network with Large Electromagnetic Kernel for\n SAR Open-Set Recognition","summary":" The limitations of existing Synthetic Aperture Radar (SAR) Automatic Target\nRecognition (ATR) methods lie in their confinement by the closed-environment\nassumption, hindering their effective and robust handling of unknown target\ncategories in open environments. Open Set Recognition (OSR), a pivotal facet\nfor algorithmic practicality, intends to categorize known classes while\ndenoting unknown ones as \"unknown.\" The chief challenge in OSR involves\nconcurrently mitigating risks associated with generalizing features from a\nrestricted set of known classes to numerous unknown samples and the open space\nexposure to potential unknown data. To enhance open-set SAR classification, a\nmethod called scattering kernel with reciprocal learning network is proposed.\nInitially, a feature learning framework is constructed based on reciprocal\npoint learning (RPL), establishing a bounded space for potential unknown\nclasses. This approach indirectly introduces unknown information into a learner\nconfined to known classes, thereby acquiring more concise and discriminative\nrepresentations. Subsequently, considering the variability in the imaging of\ntargets at different angles and the discreteness of components in SAR images, a\nproposal is made to design convolutional kernels based on large-sized attribute\nscattering center models. This enhances the ability to extract intrinsic\nnon-linear features and specific scattering characteristics in SAR images,\nthereby improving the discriminative features of the model and mitigating the\nimpact of imaging variations on classification performance. Experiments on the\nMSTAR datasets substantiate the superior performance of the proposed approach\ncalled ASC-RPL over mainstream methods.\n","authors":["Xiayang Xiao","Zhuoxuan Li","Ruyi Zhang","Jiacheng Chen","Haipeng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.04693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04692v1","updated":"2024-11-07T13:25:52Z","published":"2024-11-07T13:25:52Z","title":"Personalized Federated Learning for Cross-view Geo-localization","summary":" In this paper we propose a methodology combining Federated Learning (FL) with\nCross-view Image Geo-localization (CVGL) techniques. 
We address the challenges\nof data privacy and heterogeneity in autonomous vehicle environments by\nproposing a personalized Federated Learning scenario that allows selective\nsharing of model parameters. Our method implements a coarse-to-fine approach,\nwhere clients share only the coarse feature extractors while keeping\nfine-grained features specific to local environments. We evaluate our approach\nagainst traditional centralized and single-client training schemes using the\nKITTI dataset combined with satellite imagery. Results demonstrate that our\nfederated CVGL method achieves performance close to centralized training while\nmaintaining data privacy. The proposed partial model sharing strategy shows\ncomparable or slightly better performance than classical FL, offering\nsignificant reduced communication overhead without sacrificing accuracy. Our\nwork contributes to more robust and privacy-preserving localization systems for\nautonomous vehicles operating in diverse environments\n","authors":["Christos Anagnostopoulos","Alexandros Gkillas","Nikos Piperigkos","Aris S. Lalos"],"pdf_url":"https://arxiv.org/pdf/2411.04692v1.pdf","comment":"6 pages, 2 figures, Preprint submitted to the IEEE 26th International\n Workshop on Multimedia Signal Processing (MMSP)"},{"id":"http://arxiv.org/abs/2411.04682v1","updated":"2024-11-07T13:13:23Z","published":"2024-11-07T13:13:23Z","title":"DNN-based 3D Cloud Retrieval for Variable Solar Illumination and\n Multiview Spaceborne Imaging","summary":" Climate studies often rely on remotely sensed images to retrieve\ntwo-dimensional maps of cloud properties. To advance volumetric analysis, we\nfocus on recovering the three-dimensional (3D) heterogeneous extinction\ncoefficient field of shallow clouds using multiview remote sensing data.\nClimate research requires large-scale worldwide statistics. To enable scalable\ndata processing, previous deep neural networks (DNNs) can infer at spaceborne\nremote sensing downlink rates. However, prior methods are limited to a fixed\nsolar illumination direction. In this work, we introduce the first scalable\nDNN-based system for 3D cloud retrieval that accommodates varying camera poses\nand solar directions. By integrating multiview cloud intensity images with\ncamera poses and solar direction data, we achieve greater flexibility in\nrecovery. Training of the DNN is performed by a novel two-stage scheme to\naddress the high number of degrees of freedom in this problem. Our approach\nshows substantial improvements over previous state-of-the-art, particularly in\nhandling variations in the sun's zenith angle.\n","authors":["Tamar Klein","Tom Aizenberg","Roi Ronen"],"pdf_url":"https://arxiv.org/pdf/2411.04682v1.pdf","comment":"4 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.10012v2","updated":"2024-11-07T13:11:02Z","published":"2024-03-15T04:35:25Z","title":"Representing Domain-Mixing Optical Degradation for Real-World\n Computational Aberration Correction via Vector Quantization","summary":" Relying on paired synthetic data, existing learning-based Computational\nAberration Correction (CAC) methods are confronted with the intricate and\nmultifaceted synthetic-to-real domain gap, which leads to suboptimal\nperformance in real-world applications. In this paper, in contrast to improving\nthe simulation pipeline, we deliver a novel insight into real-world CAC from\nthe perspective of Unsupervised Domain Adaptation (UDA). 
By incorporating\nreadily accessible unpaired real-world data into training, we formalize the\nDomain Adaptive CAC (DACAC) task, and then introduce a comprehensive Real-world\naberrated images (Realab) dataset to benchmark it. The setup task presents a\nformidable challenge due to the intricacy of understanding the target optical\ndegradation domain. To this intent, we propose a novel Quantized Domain-Mixing\nRepresentation (QDMR) framework as a potent solution to the issue. Centering\naround representing and quantizing the optical degradation which is consistent\nacross different images, QDMR adapts the CAC model to the target domain from\nthree key aspects: (1) reconstructing aberrated images of both domains by a\nVQGAN to learn a Domain-Mixing Codebook (DMC) characterizing the optical\ndegradation; (2) modulating the deep features in CAC model with DMC to transfer\nthe target domain knowledge; and (3) leveraging the trained VQGAN to generate\npseudo target aberrated images from the source ones for convincing target\ndomain supervision. Extensive experiments on both synthetic and real-world\nbenchmarks reveal that the models with QDMR consistently surpass the\ncompetitive methods in mitigating the synthetic-to-real gap, which produces\nvisually pleasant real-world CAC results with fewer artifacts. Codes and\ndatasets are made publicly available at https://github.com/zju-jiangqi/QDMR.\n","authors":["Qi Jiang","Zhonghua Yi","Shaohua Gao","Yao Gao","Xiaolong Qian","Hao Shi","Lei Sun","JinXing Niu","Kaiwei Wang","Kailun Yang","Jian Bai"],"pdf_url":"https://arxiv.org/pdf/2403.10012v2.pdf","comment":"Accepted to Optics & Laser Technology. Codes and datasets are made\n publicly available at https://github.com/zju-jiangqi/QDMR"},{"id":"http://arxiv.org/abs/2411.04679v1","updated":"2024-11-07T13:08:04Z","published":"2024-11-07T13:08:04Z","title":"CaPo: Cooperative Plan Optimization for Efficient Embodied Multi-Agent\n Cooperation","summary":" In this work, we address the cooperation problem among large language model\n(LLM) based embodied agents, where agents must cooperate to achieve a common\ngoal. Previous methods often execute actions extemporaneously and incoherently,\nwithout long-term strategic and cooperative planning, leading to redundant\nsteps, failures, and even serious repercussions in complex tasks like\nsearch-and-rescue missions where discussion and cooperative plan are crucial.\nTo solve this issue, we propose Cooperative Plan Optimization (CaPo) to enhance\nthe cooperation efficiency of LLM-based embodied agents. Inspired by human\ncooperation schemes, CaPo improves cooperation efficiency with two phases: 1)\nmeta-plan generation, and 2) progress-adaptive meta-plan and execution. In the\nfirst phase, all agents analyze the task, discuss, and cooperatively create a\nmeta-plan that decomposes the task into subtasks with detailed steps, ensuring\na long-term strategic and coherent plan for efficient coordination. In the\nsecond phase, agents execute tasks according to the meta-plan and dynamically\nadjust it based on their latest progress (e.g., discovering a target object)\nthrough multi-turn discussions. 
This progress-based adaptation eliminates\nredundant actions, improving the overall cooperation efficiency of agents.\nExperimental results on the ThreeDworld Multi-Agent Transport and Communicative\nWatch-And-Help tasks demonstrate that CaPo achieves much higher task completion\nrate and efficiency compared with state-of-the-arts.\n","authors":["Jie Liu","Pan Zhou","Yingjun Du","Ah-Hwee Tan","Cees G. M. Snoek","Jan-Jakob Sonke","Efstratios Gavves"],"pdf_url":"https://arxiv.org/pdf/2411.04679v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2411.04663v1","updated":"2024-11-07T12:48:39Z","published":"2024-11-07T12:48:39Z","title":"Explainable Search and Discovery of Visual Cultural Heritage Collections\n with Multimodal Large Language Models","summary":" Many cultural institutions have made large digitized visual collections\navailable online, often under permissible re-use licences. Creating interfaces\nfor exploring and searching these collections is difficult, particularly in the\nabsence of granular metadata. In this paper, we introduce a method for using\nstate-of-the-art multimodal large language models (LLMs) to enable an\nopen-ended, explainable search and discovery interface for visual collections.\nWe show how our approach can create novel clustering and recommendation systems\nthat avoid common pitfalls of methods based directly on visual embeddings. Of\nparticular interest is the ability to offer concrete textual explanations of\neach recommendation without the need to preselect the features of interest.\nTogether, these features can create a digital interface that is more open-ended\nand flexible while also being better suited to addressing privacy and ethical\nconcerns. Through a case study using a collection of documentary photographs,\nwe provide several metrics showing the efficacy and possibilities of our\napproach.\n","authors":["Taylor Arnold","Lauren Tilton"],"pdf_url":"https://arxiv.org/pdf/2411.04663v1.pdf","comment":"16 pages, CHR 2024: Computational Humanities Research Conference,\n December 4 - 6, 2024, Aarhus University, Denmark"},{"id":"http://arxiv.org/abs/2411.04659v1","updated":"2024-11-07T12:42:48Z","published":"2024-11-07T12:42:48Z","title":"Automated Image Color Mapping for a Historic Photographic Collection","summary":" In the 1970s, the United States Environmental Protection Agency sponsored\nDocumerica, a large-scale photography initiative to document environmental\nsubjects nation-wide. While over 15,000 digitized public-domain photographs\nfrom the collection are available online, most of the images were scanned from\ndamaged copies of the original prints. We present and evaluate a modified\nhistogram matching technique based on the underlying chemistry of the prints\nfor correcting the damaged images by using training data collected from a small\nset of undamaged prints. 
The entire set of color-adjusted Documerica images is\nmade available in an open repository.\n","authors":["Taylor Arnold","Lauren Tilton"],"pdf_url":"https://arxiv.org/pdf/2411.04659v1.pdf","comment":"11 pages, CHR 2024: Computational Humanities Research Conference,\n December 4 - 6, 2024, Aarhus University, Denmark"},{"id":"http://arxiv.org/abs/2411.04656v1","updated":"2024-11-07T12:34:25Z","published":"2024-11-07T12:34:25Z","title":"ICH-SCNet: Intracerebral Hemorrhage Segmentation and Prognosis\n Classification Network Using CLIP-guided SAM mechanism","summary":" Intracerebral hemorrhage (ICH) is the most fatal subtype of stroke and is\ncharacterized by a high incidence of disability. Accurate segmentation of the\nICH region and prognosis prediction are critically important for developing and\nrefining treatment plans for post-ICH patients. However, existing approaches\naddress these two tasks independently and predominantly focus on imaging data\nalone, thereby neglecting the intrinsic correlation between the tasks and\nmodalities. This paper introduces a multi-task network, ICH-SCNet, designed for\nboth ICH segmentation and prognosis classification. Specifically, we integrate\na SAM-CLIP cross-modal interaction mechanism that combines medical text and\nsegmentation auxiliary information with neuroimaging data to enhance\ncross-modal feature recognition. Additionally, we develop an effective feature\nfusion module and a multi-task loss function to improve performance further.\nExtensive experiments on an ICH dataset reveal that our approach surpasses\nother state-of-the-art methods. It excels in the overall performance of\nclassification tasks and outperforms competing models in all segmentation task\nmetrics.\n","authors":["Xinlei Yu","Ahmed Elazab","Ruiquan Ge","Hui Jin","Xinchen Jiang","Gangyong Jia","Qing Wu","Qinglei Shi","Changmiao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.04656v1.pdf","comment":"6 pages, 2 figures, 3 tables, published to BIBM 2024"},{"id":"http://arxiv.org/abs/2411.04646v1","updated":"2024-11-07T12:11:11Z","published":"2024-11-07T12:11:11Z","title":"DanceFusion: A Spatio-Temporal Skeleton Diffusion Transformer for\n Audio-Driven Dance Motion Reconstruction","summary":" This paper introduces DanceFusion, a novel framework for reconstructing and\ngenerating dance movements synchronized to music, utilizing a Spatio-Temporal\nSkeleton Diffusion Transformer. The framework adeptly handles incomplete and\nnoisy skeletal data common in short-form dance videos on social media platforms\nlike TikTok. DanceFusion incorporates a hierarchical Transformer-based\nVariational Autoencoder (VAE) integrated with a diffusion model, significantly\nenhancing motion realism and accuracy. Our approach introduces sophisticated\nmasking techniques and a unique iterative diffusion process that refines the\nmotion sequences, ensuring high fidelity in both motion generation and\nsynchronization with accompanying audio cues. Comprehensive evaluations\ndemonstrate that DanceFusion surpasses existing methods, providing\nstate-of-the-art performance in generating dynamic, realistic, and\nstylistically diverse dance motions. Potential applications of this framework\nextend to content creation, virtual reality, and interactive entertainment,\npromising substantial advancements in automated dance generation. 
Visit our\nproject page at https://th-mlab.github.io/DanceFusion/.\n","authors":["Li Zhao","Zhengmin Lu"],"pdf_url":"https://arxiv.org/pdf/2411.04646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04642v1","updated":"2024-11-07T11:54:01Z","published":"2024-11-07T11:54:01Z","title":"TAP-VL: Text Layout-Aware Pre-training for Enriched Vision-Language\n Models","summary":" Vision-Language (VL) models have garnered considerable research interest;\nhowever, they still face challenges in effectively handling text within images.\nTo address this limitation, researchers have developed two approaches. The\nfirst method involves utilizing external Optical Character Recognition (OCR)\ntools to extract textual information from images, which is then prepended to\nother textual inputs. The second strategy focuses on employing extremely\nhigh-resolution images to improve text recognition capabilities. In this paper,\nwe focus on enhancing the first strategy by introducing a novel method, named\nTAP-VL, which treats OCR information as a distinct modality and seamlessly\nintegrates it into any VL model. TAP-VL employs a lightweight transformer-based\nOCR module to receive OCR with layout information, compressing it into a short\nfixed-length sequence for input into the LLM. Initially, we conduct\nmodel-agnostic pretraining of the OCR module on unlabeled documents, followed\nby its integration into any VL architecture through brief fine-tuning.\nExtensive experiments demonstrate consistent performance improvements when\napplying TAP-VL to top-performing VL models, across scene-text and\ndocument-based VL benchmarks.\n","authors":["Jonathan Fhima","Elad Ben Avraham","Oren Nuriel","Yair Kittenplon","Roy Ganz","Aviad Aberdam","Ron Litman"],"pdf_url":"https://arxiv.org/pdf/2411.04642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04632v1","updated":"2024-11-07T11:35:31Z","published":"2024-11-07T11:35:31Z","title":"Improved Multi-Task Brain Tumour Segmentation with Synthetic Data\n Augmentation","summary":" This paper presents the winning solution of task 1 and the third-placed\nsolution of task 3 of the BraTS challenge. The use of automated tools in\nclinical practice has increased due to the development of more and more\nsophisticated and reliable algorithms. However, achieving clinical standards\nand developing tools for real-life scenarios is a major challenge. To this end,\nBraTS has organised tasks to find the most advanced solutions for specific\npurposes. In this paper, we propose the use of synthetic data to train\nstate-of-the-art frameworks in order to improve the segmentation of adult\ngliomas in a post-treatment scenario, and the segmentation of meningioma for\nradiotherapy planning. Our results suggest that the use of synthetic data leads\nto more robust algorithms, although the synthetic data generation pipeline is\nnot directly suited to the meningioma task. The code for these tasks is\navailable at https://github.com/ShadowTwin41/BraTS_2023_2024_solutions.\n","authors":["André Ferreira","Tiago Jesus","Behrus Puladi","Jens Kleesiek","Victor Alves","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2411.04632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04630v1","updated":"2024-11-07T11:29:55Z","published":"2024-11-07T11:29:55Z","title":"Brain Tumour Removing and Missing Modality Generation using 3D WDM","summary":" This paper presents the second-placed solution for task 8 and the\nparticipation solution for task 7 of BraTS 2024. 
The adoption of automated\nbrain analysis algorithms to support clinical practice is increasing. However,\nmany of these algorithms struggle with the presence of brain lesions or the\nabsence of certain MRI modalities. The alterations in the brain's morphology\nleads to high variability and thus poor performance of predictive models that\nwere trained only on healthy brains. The lack of information that is usually\nprovided by some of the missing MRI modalities also reduces the reliability of\nthe prediction models trained with all modalities. In order to improve the\nperformance of these models, we propose the use of conditional 3D wavelet\ndiffusion models. The wavelet transform enabled full-resolution image training\nand prediction on a GPU with 48 GB VRAM, without patching or downsampling,\npreserving all information for prediction. For the inpainting task of BraTS\n2024, the use of a large and variable number of healthy masks and the stability\nand efficiency of the 3D wavelet diffusion model resulted in 0.007, 22.61 and\n0.842 in the validation set and 0.07 , 22.8 and 0.91 in the testing set (MSE,\nPSNR and SSIM respectively). The code for these tasks is available at\nhttps://github.com/ShadowTwin41/BraTS_2023_2024_solutions.\n","authors":["André Ferreira","Gijs Luijten","Behrus Puladi","Jens Kleesiek","Victor Alves","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2411.04630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09377v2","updated":"2024-11-07T11:21:56Z","published":"2023-06-15T08:18:29Z","title":"Evaluating alignment between humans and neural network representations\n in image-based learning tasks","summary":" Humans represent scenes and objects in rich feature spaces, carrying\ninformation that allows us to generalise about category memberships and\nabstract functions with few examples. What determines whether a neural network\nmodel generalises like a human? We tested how well the representations of $86$\npretrained neural network models mapped to human learning trajectories across\ntwo tasks where humans had to learn continuous relationships and categories of\nnatural images. In these tasks, both human participants and neural networks\nsuccessfully identified the relevant stimulus features within a few trials,\ndemonstrating effective generalisation. We found that while training dataset\nsize was a core determinant of alignment with human choices, contrastive\ntraining with multi-modal data (text and imagery) was a common feature of\ncurrently publicly available models that predicted human generalisation.\nIntrinsic dimensionality of representations had different effects on alignment\nfor different model types. Lastly, we tested three sets of human-aligned\nrepresentations and found no consistent improvements in predictive accuracy\ncompared to the baselines. In conclusion, pretrained neural networks can serve\nto extract representations for cognitive models, as they appear to capture some\nfundamental aspects of cognition that are transferable across tasks. 
Both our\nparadigms and modelling approach offer a novel way to quantify alignment\nbetween neural networks and humans and extend cognitive science into more\nnaturalistic domains.\n","authors":["Can Demircan","Tankred Saanum","Leonardo Pettini","Marcel Binz","Blazej M Baczkowski","Christian F Doeller","Mona M Garvert","Eric Schulz"],"pdf_url":"https://arxiv.org/pdf/2306.09377v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04620v1","updated":"2024-11-07T11:09:29Z","published":"2024-11-07T11:09:29Z","title":"Multi-temporal crack segmentation in concrete structure using deep\n learning approaches","summary":" Cracks are among the earliest indicators of deterioration in concrete\nstructures. Early automatic detection of these cracks can significantly extend\nthe lifespan of critical infrastructures, such as bridges, buildings, and\ntunnels, while simultaneously reducing maintenance costs and facilitating\nefficient structural health monitoring. This study investigates whether\nleveraging multi-temporal data for crack segmentation can enhance segmentation\nquality. Therefore, we compare a Swin UNETR trained on multi-temporal data with\na U-Net trained on mono-temporal data to assess the effect of temporal\ninformation compared with conventional single-epoch approaches. To this end, a\nmulti-temporal dataset comprising 1356 images, each with 32 sequential crack\npropagation images, was created. After training the models, experiments were\nconducted to analyze their generalization ability, temporal consistency, and\nsegmentation quality. The multi-temporal approach consistently outperformed its\nmono-temporal counterpart, achieving an IoU of $82.72\\%$ and a F1-score of\n$90.54\\%$, representing a significant improvement over the mono-temporal\nmodel's IoU of $76.69\\%$ and F1-score of $86.18\\%$, despite requiring only half\nof the trainable parameters. The multi-temporal model also displayed a more\nconsistent segmentation quality, with reduced noise and fewer errors. These\nresults suggest that temporal information significantly enhances the\nperformance of segmentation models, offering a promising solution for improved\ncrack detection and the long-term monitoring of concrete structures, even with\nlimited sequential data.\n","authors":["Said Harb","Pedro Achanccaray","Mehdi Maboudi","Markus Gerke"],"pdf_url":"https://arxiv.org/pdf/2411.04620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02099v2","updated":"2024-11-07T10:53:14Z","published":"2024-11-04T14:08:26Z","title":"Differentially Private Integrated Decision Gradients (IDG-DP) for\n Radar-based Human Activity Recognition","summary":" Human motion analysis offers significant potential for healthcare monitoring\nand early detection of diseases. The advent of radar-based sensing systems has\ncaptured the spotlight for they are able to operate without physical contact\nand they can integrate with pre-existing Wi-Fi networks. They are also seen as\nless privacy-invasive compared to camera-based systems. However, recent\nresearch has shown high accuracy in recognizing subjects or gender from radar\ngait patterns, raising privacy concerns. This study addresses these issues by\ninvestigating privacy vulnerabilities in radar-based Human Activity Recognition\n(HAR) systems and proposing a novel method for privacy preservation using\nDifferential Privacy (DP) driven by attributions derived with Integrated\nDecision Gradient (IDG) algorithm. 
We investigate Black-box Membership\nInference Attack (MIA) Models in HAR settings across various levels of\nattacker-accessible information. We extensively evaluated the effectiveness of\nthe proposed IDG-DP method by designing a CNN-based HAR model and rigorously\nassessing its resilience against MIAs. Experimental results demonstrate the\npotential of IDG-DP in mitigating privacy attacks while maintaining utility\nacross all settings, particularly excelling against label-only and shadow model\nblack-box MIA attacks. This work represents a crucial step towards balancing\nthe need for effective radar-based HAR with robust privacy protection in\nhealthcare environments.\n","authors":["Idris Zakariyya","Linda Tran","Kaushik Bhargav Sivangi","Paul Henderson","Fani Deligianni"],"pdf_url":"https://arxiv.org/pdf/2411.02099v2.pdf","comment":"Accepted at WACV 2025. 12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.04612v1","updated":"2024-11-07T10:52:57Z","published":"2024-11-07T10:52:57Z","title":"Population estimation using 3D city modelling and Carto2S datasets -- A\n case study","summary":" With the launch of Carto2S series of satellites, high resolution images\n(0.6-1.0 meters) are acquired and available for use. High resolution Digital\nElevation Model (DEM) with better accuracies can be generated using C2S\nmulti-view and multi date datasets. DEMs are further used as an input to derive\nDigital terrain models (DTMs) and to extract accurate heights of the objects\n(building and tree) over the surface of the Earth. Extracted building heights\nare validated with ground control points and can be used for generation of city\nmodelling and resource estimation like population estimation, health planning,\nwater and transport resource estimations. In this study, an attempt is made to\nassess the population of a township using high-resolution Indian remote sensing\nsatellite datasets. We used Carto 2S multi-view data and generated a precise\nDEM and DTM over a city area. Using DEM and DTM datasets, accurate heights of\nthe buildings are extracted which are further validated with ground data.\nAccurate building heights and high resolution imagery are used for generating\naccurate virtual 3D city model and assessing the number of floor and carpet\narea of the houses/ flats/ apartments. Population estimation of the area is\nmade using derived information of no of houses/ flats/ apartments from the\nsatellite datasets. Further, information about number of hospital and schools\naround the residential area is extracted from open street maps (OSM).\nPopulation estimation using satellite data and derived information from OSM\ndatasets can prove to be very good tool for local administrator and decision\nmakers.\n","authors":["Jai G Singla"],"pdf_url":"https://arxiv.org/pdf/2411.04612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04610v1","updated":"2024-11-07T10:50:39Z","published":"2024-11-07T10:50:39Z","title":"Solar potential analysis over Indian cities using high-resolution\n satellite imagery and DEM","summary":" Most of the research work in the solar potential analysis is performed\nutilizing aerial imagery, LiDAR data, and satellite imagery. However, in the\nexisting studies using satellite data, parameters such as trees/ vegetation\nshadow, adjacent higher architectural structures, and eccentric roof structures\nin urban areas were not considered, and relatively coarser-resolution datasets\nwere used for analysis. 
In this work, we have implemented a novel approach to\nestimate rooftop solar potential using inputs of high-resolution satellite\nimagery (0.5 cm), a digital elevation model (1m), along with ground station\nradiation data. Solar radiation analysis is performed using the diffusion\nproportion and transmissivity ratio derived from the ground station data hosted\nby IMD. It was observed that due to seasonal variations, environmental effects\nand technical reasons such as solar panel structure etc., there can be a\nsignificant loss of electricity generation up to 50%. Based on the results, it\nis also understood that using 1m DEM and 50cm satellite imagery, more authentic\nresults are produced over the urban areas.\n","authors":["Jai Singla"],"pdf_url":"https://arxiv.org/pdf/2411.04610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04607v1","updated":"2024-11-07T10:46:01Z","published":"2024-11-07T10:46:01Z","title":"Cross- and Intra-image Prototypical Learning for Multi-label Disease\n Diagnosis and Interpretation","summary":" Recent advances in prototypical learning have shown remarkable potential to\nprovide useful decision interpretations associating activation maps and\npredictions with class-specific training prototypes. Such prototypical learning\nhas been well-studied for various single-label diseases, but for quite relevant\nand more challenging multi-label diagnosis, where multiple diseases are often\nconcurrent within an image, existing prototypical learning models struggle to\nobtain meaningful activation maps and effective class prototypes due to the\nentanglement of the multiple diseases. In this paper, we present a novel Cross-\nand Intra-image Prototypical Learning (CIPL) framework, for accurate\nmulti-label disease diagnosis and interpretation from medical images. CIPL\ntakes advantage of common cross-image semantics to disentangle the multiple\ndiseases when learning the prototypes, allowing a comprehensive understanding\nof complicated pathological lesions. Furthermore, we propose a new two-level\nalignment-based regularisation strategy that effectively leverages consistent\nintra-image information to enhance interpretation robustness and predictive\nperformance. Extensive experiments show that our CIPL attains the\nstate-of-the-art (SOTA) classification accuracy in two public multi-label\nbenchmarks of disease diagnosis: thoracic radiography and fundus images.\nQuantitative interpretability results show that CIPL also has superiority in\nweakly-supervised thoracic disease localisation over other leading saliency-\nand prototype-based explanation methods.\n","authors":["Chong Wang","Fengbei Liu","Yuanhong Chen","Helen Frazer","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2411.04607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04598v1","updated":"2024-11-07T10:28:49Z","published":"2024-11-07T10:28:49Z","title":"Social EgoMesh Estimation","summary":" Accurately estimating the 3D pose of the camera wearer in egocentric video\nsequences is crucial to modeling human behavior in virtual and augmented\nreality applications. The task presents unique challenges due to the limited\nvisibility of the user's body caused by the front-facing camera mounted on\ntheir head. Recent research has explored the utilization of the scene and\nego-motion, but it has overlooked humans' interactive nature. We propose a\nnovel framework for Social Egocentric Estimation of body MEshes (SEE-ME). 
Our\napproach is the first to estimate the wearer's mesh using only a latent\nprobabilistic diffusion model, which we condition on the scene and, for the\nfirst time, on the social wearer-interactee interactions. Our in-depth study\nsheds light on when social interaction matters most for ego-mesh estimation; it\nquantifies the impact of interpersonal distance and gaze direction. Overall,\nSEE-ME surpasses the current best technique, reducing the pose estimation error\n(MPJPE) by 53%. The code is available at https://github.com/L-Scofano/SEEME.\n","authors":["Luca Scofano","Alessio Sampieri","Edoardo De Matteis","Indro Spinelli","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2411.04598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04596v1","updated":"2024-11-07T10:28:11Z","published":"2024-11-07T10:28:11Z","title":"The Impact of Semi-Supervised Learning on Line Segment Detection","summary":" In this paper we present a method for line segment detection in images, based\non a semi-supervised framework. Leveraging the use of a consistency loss based\non differently augmented and perturbed unlabeled images with a small amount of\nlabeled data, we show comparable results to fully supervised methods. This\nopens up application scenarios where annotation is difficult or expensive, and\nfor domain specific adaptation of models. We are specifically interested in\nreal-time and online applications, and investigate small and efficient learning\nbackbones. Our method is to our knowledge the first to target line detection\nusing modern state-of-the-art methodologies for semi-supervised learning. We\ntest the method on both standard benchmarks and domain specific scenarios for\nforestry applications, showing the tractability of the proposed method.\n","authors":["Johanna Engman","Karl Åström","Magnus Oskarsson"],"pdf_url":"https://arxiv.org/pdf/2411.04596v1.pdf","comment":"9 pages, 6 figures, 7 tables"},{"id":"http://arxiv.org/abs/2411.04595v1","updated":"2024-11-07T10:26:38Z","published":"2024-11-07T10:26:38Z","title":"TexLiverNet: Leveraging Medical Knowledge and Spatial-Frequency\n Perception for Enhanced Liver Tumor Segmentation","summary":" Integrating textual data with imaging in liver tumor segmentation is\nessential for enhancing diagnostic accuracy. However, current multi-modal\nmedical datasets offer only general text annotations, lacking lesion-specific\ndetails critical for extracting nuanced features, especially for fine-grained\nsegmentation of tumor boundaries and small lesions. To address these\nlimitations, we developed datasets with lesion-specific text annotations for\nliver tumors and introduced the TexLiverNet model. TexLiverNet employs an\nagent-based cross-attention module that integrates text features efficiently\nwith visual features, significantly reducing computational costs. Additionally,\nenhanced spatial and adaptive frequency domain perception is proposed to\nprecisely delineate lesion boundaries, reduce background interference, and\nrecover fine details in small lesions. 
Comprehensive evaluations on public and\nprivate datasets demonstrate that TexLiverNet achieves superior performance\ncompared to current state-of-the-art methods.\n","authors":["Xiaoyan Jiang","Zhi Zhou","Hailing Wang","Guozhong Wang","Zhijun Fang"],"pdf_url":"https://arxiv.org/pdf/2411.04595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04594v1","updated":"2024-11-07T10:25:20Z","published":"2024-11-07T10:25:20Z","title":"Verification of Neural Networks against Convolutional Perturbations via\n Parameterised Kernels","summary":" We develop a method for the efficient verification of neural networks against\nconvolutional perturbations such as blurring or sharpening. To define input\nperturbations we use well-known camera shake, box blur and sharpen kernels. We\ndemonstrate that these kernels can be linearly parameterised in a way that\nallows for a variation of the perturbation strength while preserving desired\nkernel properties. To facilitate their use in neural network verification, we\ndevelop an efficient way of convolving a given input with these parameterised\nkernels. The result of this convolution can be used to encode the perturbation\nin a verification setting by prepending a linear layer to a given network. This\nleads to tight bounds and a high effectiveness in the resulting verification\nstep. We add further precision by employing input splitting as a branch and\nbound strategy. We demonstrate that we are able to verify robustness on a\nnumber of standard benchmarks where the baseline is unable to provide any\nsafety certificates. To the best of our knowledge, this is the first solution\nfor verifying robustness against specific convolutional perturbations such as\ncamera shake.\n","authors":["Benedikt Brückner","Alessio Lomuscio"],"pdf_url":"https://arxiv.org/pdf/2411.04594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04586v1","updated":"2024-11-07T10:15:25Z","published":"2024-11-07T10:15:25Z","title":"On the Inherent Robustness of One-Stage Object Detection against\n Out-of-Distribution Data","summary":" Robustness is a fundamental aspect for developing safe and trustworthy\nmodels, particularly when they are deployed in the open world. In this work we\nanalyze the inherent capability of one-stage object detectors to robustly\noperate in the presence of out-of-distribution (OoD) data. Specifically, we\npropose a novel detection algorithm for detecting unknown objects in image\ndata, which leverages the features extracted by the model from each sample.\nDifferently from other recent approaches in the literature, our proposal does\nnot require retraining the object detector, thereby allowing for the use of\npretrained models. Our proposed OoD detector exploits the application of\nsupervised dimensionality reduction techniques to mitigate the effects of the\ncurse of dimensionality on the features extracted by the model. Furthermore, it\nutilizes high-resolution feature maps to identify potential unknown objects in\nan unsupervised fashion. Our experiments analyze the Pareto trade-off between\nthe performance detecting known and unknown objects resulting from different\nalgorithmic configurations and inference confidence thresholds. We also compare\nthe performance of our proposed algorithm to that of logits-based post-hoc OoD\nmethods, as well as possible fusion strategies. 
Finally, we discuss on the\ncompetitiveness of all tested methods against state-of-the-art OoD approaches\nfor object detection models over the recently published Unknown Object\nDetection benchmark. The obtained results verify that the performance of\navant-garde post-hoc OoD detectors can be further improved when combined with\nour proposed algorithm.\n","authors":["Aitor Martinez-Seras","Javier Del Ser","Alain Andres","Pablo Garcia-Bringas"],"pdf_url":"https://arxiv.org/pdf/2411.04586v1.pdf","comment":"12 figures, 4 tables, under review"},{"id":"http://arxiv.org/abs/2411.04584v1","updated":"2024-11-07T10:11:37Z","published":"2024-11-07T10:11:37Z","title":"PASSION for Dermatology: Bridging the Diversity Gap with Pigmented Skin\n Images from Sub-Saharan Africa","summary":" Africa faces a huge shortage of dermatologists, with less than one per\nmillion people. This is in stark contrast to the high demand for dermatologic\ncare, with 80% of the paediatric population suffering from largely untreated\nskin conditions. The integration of AI into healthcare sparks significant hope\nfor treatment accessibility, especially through the development of AI-supported\nteledermatology. Current AI models are predominantly trained on white-skinned\npatients and do not generalize well enough to pigmented patients. The PASSION\nproject aims to address this issue by collecting images of skin diseases in\nSub-Saharan countries with the aim of open-sourcing this data. This dataset is\nthe first of its kind, consisting of 1,653 patients for a total of 4,901\nimages. The images are representative of telemedicine settings and encompass\nthe most common paediatric conditions: eczema, fungals, scabies, and impetigo.\nWe also provide a baseline machine learning model trained on the dataset and a\ndetailed performance analysis for the subpopulations represented in the\ndataset. The project website can be found at https://passionderm.github.io/.\n","authors":["Philippe Gottfrois","Fabian Gröger","Faly Herizo Andriambololoniaina","Ludovic Amruthalingam","Alvaro Gonzalez-Jimenez","Christophe Hsu","Agnes Kessy","Simone Lionetti","Daudi Mavura","Wingston Ng'ambi","Dingase Faith Ngongonda","Marc Pouly","Mendrika Fifaliana Rakotoarisaona","Fahafahantsoa Rapelanoro Rabenja","Ibrahima Traoré","Alexander A. Navarini"],"pdf_url":"https://arxiv.org/pdf/2411.04584v1.pdf","comment":"MICCAI 2024"},{"id":"http://arxiv.org/abs/2405.09032v4","updated":"2024-11-07T10:06:18Z","published":"2024-05-15T02:03:44Z","title":"ICAL: Implicit Character-Aided Learning for Enhanced Handwritten\n Mathematical Expression Recognition","summary":" Significant progress has been made in the field of handwritten mathematical\nexpression recognition, while existing encoder-decoder methods are usually\ndifficult to model global information in $LaTeX$. Therefore, this paper\nintroduces a novel approach, Implicit Character-Aided Learning (ICAL), to mine\nthe global expression information and enhance handwritten mathematical\nexpression recognition. Specifically, we propose the Implicit Character\nConstruction Module (ICCM) to predict implicit character sequences and use a\nFusion Module to merge the outputs of the ICCM and the decoder, thereby\nproducing corrected predictions. By modeling and utilizing implicit character\ninformation, ICAL achieves a more accurate and context-aware interpretation of\nhandwritten mathematical expressions. 
Experimental results demonstrate that\nICAL notably surpasses the state-of-the-art(SOTA) models, improving the\nexpression recognition rate (ExpRate) by 2.25\\%/1.81\\%/1.39\\% on the CROHME\n2014/2016/2019 datasets respectively, and achieves a remarkable 69.06\\% on the\nchallenging HME100k test set. We make our code available on the GitHub:\nhttps://github.com/qingzhenduyu/ICAL\n","authors":["Jianhua Zhu","Liangcai Gao","Wenqi Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.09032v4.pdf","comment":"ICDAR 2024 Oral Paper"},{"id":"http://arxiv.org/abs/2411.04571v1","updated":"2024-11-07T09:55:36Z","published":"2024-11-07T09:55:36Z","title":"DomainGallery: Few-shot Domain-driven Image Generation by\n Attribute-centric Finetuning","summary":" The recent progress in text-to-image models pretrained on large-scale\ndatasets has enabled us to generate various images as long as we provide a text\nprompt describing what we want. Nevertheless, the availability of these models\nis still limited when we expect to generate images that fall into a specific\ndomain either hard to describe or just unseen to the models. In this work, we\npropose DomainGallery, a few-shot domain-driven image generation method which\naims at finetuning pretrained Stable Diffusion on few-shot target datasets in\nan attribute-centric manner. Specifically, DomainGallery features prior\nattribute erasure, attribute disentanglement, regularization and enhancement.\nThese techniques are tailored to few-shot domain-driven generation in order to\nsolve key issues that previous works have failed to settle. Extensive\nexperiments are given to validate the superior performance of DomainGallery on\na variety of domain-driven generation scenarios. Codes are available at\nhttps://github.com/Ldhlwh/DomainGallery.\n","authors":["Yuxuan Duan","Yan Hong","Bo Zhang","Jun Lan","Huijia Zhu","Weiqiang Wang","Jianfu Zhang","Li Niu","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.04571v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2405.16591v2","updated":"2024-11-07T09:33:40Z","published":"2024-05-26T14:50:40Z","title":"CapS-Adapter: Caption-based MultiModal Adapter in Zero-Shot\n Classification","summary":" Recent advances in vision-language foundational models, such as CLIP, have\ndemonstrated significant strides in zero-shot classification. However, the\nextensive parameterization of models like CLIP necessitates a\nresource-intensive fine-tuning process. In response, TIP-Adapter and SuS-X have\nintroduced training-free methods aimed at bolstering the efficacy of downstream\ntasks. While these approaches incorporate support sets to maintain data\ndistribution consistency between knowledge cache and test sets, they often fall\nshort in terms of generalization on the test set, particularly when faced with\ntest data exhibiting substantial distributional variations. In this work, we\npresent CapS-Adapter, an innovative method that employs a caption-based support\nset, effectively harnessing both image and caption features to exceed existing\nstate-of-the-art techniques in training-free scenarios. CapS-Adapter adeptly\nconstructs support sets that closely mirror target distributions, utilizing\ninstance-level distribution features extracted from multimodal large models. By\nleveraging CLIP's single and cross-modal strengths, CapS-Adapter enhances\npredictive accuracy through the use of multimodal support sets. 
Our method\nachieves outstanding zero-shot classification results across 19 benchmark\ndatasets, improving accuracy by 2.19\\% over the previous leading method. Our\ncontributions are substantiated through extensive validation on multiple\nbenchmark datasets, demonstrating superior performance and robust\ngeneralization capabilities. Our code is made publicly available at\nhttps://github.com/WLuLi/CapS-Adapter.\n","authors":["Qijie Wang","Guandu Liu","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2405.16591v2.pdf","comment":"ACM Multimedia 2024 Poster"},{"id":"http://arxiv.org/abs/2406.01494v2","updated":"2024-11-07T09:23:34Z","published":"2024-06-03T16:21:29Z","title":"Robust Classification by Coupling Data Mollification with Label\n Smoothing","summary":" Introducing training-time augmentations is a key technique to enhance\ngeneralization and prepare deep neural networks against test-time corruptions.\nInspired by the success of generative diffusion models, we propose a novel\napproach of coupling data mollification, in the form of image noising and\nblurring, with label smoothing to align predicted label confidences with image\ndegradation. The method is simple to implement, introduces negligible\noverheads, and can be combined with existing augmentations. We demonstrate\nimproved robustness and uncertainty quantification on the corrupted image\nbenchmarks of the CIFAR and TinyImageNet datasets.\n","authors":["Markus Heinonen","Ba-Hien Tran","Michael Kampffmeyer","Maurizio Filippone"],"pdf_url":"https://arxiv.org/pdf/2406.01494v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2411.04533v1","updated":"2024-11-07T08:43:42Z","published":"2024-11-07T08:43:42Z","title":"Neural Fingerprints for Adversarial Attack Detection","summary":" Deep learning models for image classification have become standard tools in\nrecent years. A well known vulnerability of these models is their\nsusceptibility to adversarial examples. These are generated by slightly\naltering an image of a certain class in a way that is imperceptible to humans\nbut causes the model to classify it wrongly as another class. Many algorithms\nhave been proposed to address this problem, falling generally into one of two\ncategories: (i) building robust classifiers (ii) directly detecting attacked\nimages. Despite the good performance of these detectors, we argue that in a\nwhite-box setting, where the attacker knows the configuration and weights of\nthe network and the detector, they can overcome the detector by running many\nexamples on a local copy, and sending only those that were not detected to the\nactual model. This problem is common in security applications where even a very\ngood model is not sufficient to ensure safety. In this paper we propose to\novercome this inherent limitation of any static defence with randomization. To\ndo so, one must generate a very large family of detectors with consistent\nperformance, and select one or more of them randomly for each input. For the\nindividual detectors, we suggest the method of neural fingerprints. In the\ntraining phase, for each class we repeatedly sample a tiny random subset of\nneurons from certain layers of the network, and if their average is\nsufficiently different between clean and attacked images of the focal class\nthey are considered a fingerprint and added to the detector bank. During test\ntime, we sample fingerprints from the bank associated with the label predicted\nby the model, and detect attacks using a likelihood ratio test. 
We evaluate our\ndetectors on ImageNet with different attack methods and model architectures,\nand show near-perfect detection with low rates of false detection.\n","authors":["Haim Fisher","Moni Shahar","Yehezkel S. Resheff"],"pdf_url":"https://arxiv.org/pdf/2411.04533v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2405.14974v2","updated":"2024-11-07T08:41:03Z","published":"2024-05-23T18:21:59Z","title":"LOVA3: Learning to Visual Question Answering, Asking and Assessment","summary":" Question answering, asking, and assessment are three innate human traits\ncrucial for understanding the world and acquiring knowledge. By enhancing these\ncapabilities, humans can more effectively utilize data, leading to better\ncomprehension and learning outcomes. Current Multimodal Large Language Models\n(MLLMs) primarily focus on question answering, often neglecting the full\npotential of questioning and assessment skills. Inspired by the human learning\nmechanism, we introduce LOVA3, an innovative framework named \"Learning tO\nVisual question Answering, Asking and Assessment,\" designed to equip MLLMs with\nthese additional capabilities. Our approach involves the creation of two\nsupplementary training tasks GenQA and EvalQA, aiming at fostering the skills\nof asking and assessing questions in the context of images. To develop the\nquestioning ability, we compile a comprehensive set of multimodal foundational\ntasks. For assessment, we introduce a new benchmark called EvalQABench,\ncomprising 64,000 training samples (split evenly between positive and negative\nsamples) and 5,000 validation and testing samples. We posit that enhancing\nMLLMs with the capabilities to answer, ask, and assess questions will enhance\ntheir multimodal comprehension, ultimately improving overall performance. To\nvalidate this hypothesis, we train MLLMs using the LOVA3 framework and evaluate\nthem on a range of multimodal datasets and benchmarks. Our results demonstrate\nconsistent performance gains, underscoring the critical role of these\nadditional tasks in fostering comprehensive intelligence in MLLMs. The code is\navailable at https://github.com/showlab/LOVA3.\n","authors":["Henry Hengyuan Zhao","Pan Zhou","Difei Gao","Zechen Bai","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2405.14974v2.pdf","comment":"Accepted by NeurIPS 2024. The code is available at\n https://github.com/showlab/LOVA3"},{"id":"http://arxiv.org/abs/2409.18694v2","updated":"2024-11-07T08:27:16Z","published":"2024-09-27T12:28:47Z","title":"Learning from Pattern Completion: Self-supervised Controllable\n Generation","summary":" The human brain exhibits a strong ability to spontaneously associate\ndifferent visual attributes of the same or similar visual scene, such as\nassociating sketches and graffiti with real-world visual objects, usually\nwithout supervising information. In contrast, in the field of artificial\nintelligence, controllable generation methods like ControlNet heavily rely on\nannotated training datasets such as depth maps, semantic segmentation maps, and\nposes, which limits the method's scalability. Inspired by the neural mechanisms\nthat may contribute to the brain's associative power, specifically the cortical\nmodularization and hippocampal pattern completion, here we propose a\nself-supervised controllable generation (SCG) framework. 
Firstly, we introduce\nan equivariant constraint to promote inter-module independence and intra-module\ncorrelation in a modular autoencoder network, thereby achieving functional\nspecialization. Subsequently, based on these specialized modules, we employ a\nself-supervised pattern completion approach for controllable generation\ntraining. Experimental results demonstrate that the proposed modular\nautoencoder effectively achieves functional specialization, including the\nmodular processing of color, brightness, and edge detection, and exhibits\nbrain-like features including orientation selectivity, color antagonism, and\ncenter-surround receptive fields. Through self-supervised training, associative\ngeneration capabilities spontaneously emerge in SCG, demonstrating excellent\ngeneralization ability to various tasks such as associative generation on\npainting, sketches, and ancient graffiti. Compared to the previous\nrepresentative method ControlNet, our proposed approach not only demonstrates\nsuperior robustness in more challenging high-noise scenarios but also possesses\nmore promising scalability potential due to its self-supervised manner. Codes\nare released on GitHub and Gitee.\n","authors":["Zhiqiang Chen","Guofan Fan","Jinying Gao","Lei Ma","Bo Lei","Tiejun Huang","Shan Yu"],"pdf_url":"https://arxiv.org/pdf/2409.18694v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04519v1","updated":"2024-11-07T08:20:29Z","published":"2024-11-07T08:20:29Z","title":"l0-Regularized Sparse Coding-based Interpretable Network for Multi-Modal\n Image Fusion","summary":" Multi-modal image fusion (MMIF) enhances the information content of the fused\nimage by combining the unique as well as common features obtained from\ndifferent modality sensor images, improving visualization, object detection,\nand many more tasks. In this work, we introduce an interpretable network for\nthe MMIF task, named FNet, based on an l0-regularized multi-modal convolutional\nsparse coding (MCSC) model. Specifically, for solving the l0-regularized CSC\nproblem, we develop an algorithm unrolling-based l0-regularized sparse coding\n(LZSC) block. Given different modality source images, FNet first separates the\nunique and common features from them using the LZSC block and then combines\nthese features to generate the final fused image. Additionally, we\npropose an l0-regularized MCSC model for the inverse fusion process. Based on\nthis model, we introduce an interpretable inverse fusion network named IFNet,\nwhich is utilized during FNet's training. Extensive experiments show that FNet\nachieves high-quality fusion results across five different MMIF tasks.\nFurthermore, we show that FNet enhances downstream object detection in\nvisible-thermal image pairs. We have also visualized the intermediate results\nof FNet, which demonstrates the good interpretability of our network.\n","authors":["Gargi Panda","Soumitra Kundu","Saumik Bhattacharya","Aurobinda Routray"],"pdf_url":"https://arxiv.org/pdf/2411.04519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04517v1","updated":"2024-11-07T08:19:39Z","published":"2024-11-07T08:19:39Z","title":"Continuous Sign Language Recognition System using Deep Learning with\n MediaPipe Holistic","summary":" Sign languages are the language of hearing-impaired people, who use visual\ncues such as hand, facial, and body movements for communication. 
There are\ndifferent signs and gestures representing alphabets, words, and phrases.\nNowadays, approximately 300 sign languages are practiced worldwide, such as\nAmerican Sign Language (ASL), Chinese Sign Language (CSL), Indian Sign Language\n(ISL), and many more. Sign languages are dependent on the vocal language of a\nplace. Unlike vocal or spoken languages, there are no helping words in sign\nlanguage like is, am, are, was, were, will, be, etc. As only a limited\npopulation is well-versed in sign language, this lack of familiarity with sign\nlanguage hinders hearing-impaired people from communicating freely and easily\nwith everyone. This issue can be addressed by a sign language recognition (SLR)\nsystem which has the capability to translate sign language into vocal\nlanguage. In this paper, a continuous SLR system is proposed using a deep\nlearning model employing Long Short-Term Memory (LSTM), trained and tested on\nan ISL primary dataset. This dataset is created using MediaPipe Holistic\npipeline for tracking face, hand, and body movements and collecting landmarks.\nThe system recognizes the signs and gestures in real-time with 88.23% accuracy.\n","authors":["Sharvani Srivastava","Sudhakar Singh"," Pooja","Shiv Prakash"],"pdf_url":"https://arxiv.org/pdf/2411.04517v1.pdf","comment":"14 pages, 4 figures, Wireless Pers Commun"},{"id":"http://arxiv.org/abs/2411.04509v1","updated":"2024-11-07T08:02:58Z","published":"2024-11-07T08:02:58Z","title":"FedDP: Privacy-preserving method based on federated learning for\n histopathology image segmentation","summary":" Hematoxylin and Eosin (H&E) staining of whole slide images (WSIs) is\nconsidered the gold standard for pathologists and medical practitioners for\ntumor diagnosis, surgical planning, and post-operative assessment. With the\nrapid advancement of deep learning technologies, numerous models based on\nconvolutional neural networks and transformer architectures have been applied\nto the precise segmentation of WSIs. However, due to privacy\nregulations and the need to protect patient confidentiality, centralized\nstorage and processing of image data are impractical. Training a centralized\nmodel directly is challenging to implement in medical settings due to these\nprivacy concerns. This paper addresses the dispersed nature and privacy\nsensitivity of medical image data by employing a federated learning framework,\nallowing medical institutions to collaboratively learn while protecting patient\nprivacy. Additionally, to address the issue of original data reconstruction\nthrough gradient inversion during the federated learning training process,\ndifferential privacy introduces noise into the model updates, preventing\nattackers from inferring the contributions of individual samples, thereby\nprotecting the privacy of the training data. Experimental results show that the\nproposed method, FedDP, minimally impacts model accuracy while effectively\nsafeguarding the privacy of cancer pathology image data, with only a slight\ndecrease in Dice, Jaccard, and Acc indices by 0.55%, 0.63%, and 0.42%,\nrespectively. 
This approach facilitates cross-institutional collaboration and\nknowledge sharing while protecting sensitive data privacy, providing a viable\nsolution for further research and application in the medical field.\n","authors":["Liangrui Pan","Mao Huang","Lian Wang","Pinle Qin","Shaoliang Peng"],"pdf_url":"https://arxiv.org/pdf/2411.04509v1.pdf","comment":"Accepted in BIBM2024"},{"id":"http://arxiv.org/abs/2401.15613v7","updated":"2024-11-07T07:58:50Z","published":"2024-01-28T10:00:45Z","title":"An efficient dual-branch framework via implicit self-texture enhancement\n for arbitrary-scale histopathology image super-resolution","summary":" High-quality whole-slide scanning is expensive, complex, and time-consuming,\nthus limiting the acquisition and utilization of high-resolution histopathology\nimages in daily clinical work. Deep learning-based single-image\nsuper-resolution (SISR) techniques provide an effective way to solve this\nproblem. However, the existing SISR models applied to histopathology images can\nonly work at fixed integer scaling factors, decreasing their applicability.\nThough methods based on implicit neural representation (INR) have shown\npromising results in arbitrary-scale super-resolution (SR) of natural images,\napplying them directly to histopathology images is inadequate because they have\nunique fine-grained image textures different from natural images. Thus, we\npropose an Implicit Self-Texture Enhancement-based dual-branch framework (ISTE)\nfor arbitrary-scale SR of histopathology images to address this challenge. The\nproposed ISTE contains a feature aggregation branch and a texture learning\nbranch. We employ the feature aggregation branch to enhance the learning of the\nlocal details for SR images while utilizing the texture learning branch to\nenhance the learning of high-frequency texture details. Then, we design a\ntwo-stage texture enhancement strategy to fuse the features from the two\nbranches to obtain the SR images. Experiments on publicly available datasets,\nincluding TMA, HistoSR, and the TCGA lung cancer datasets, demonstrate that\nISTE outperforms existing fixed-scale and arbitrary-scale SR algorithms across\nvarious scaling factors. Additionally, extensive experiments have shown that\nthe histopathology images reconstructed by the proposed ISTE are applicable to\ndownstream pathology image analysis tasks.\n","authors":["Minghong Duan","Linhao Qu","Zhiwei Yang","Manning Wang","Chenxi Zhang","Zhijian Song"],"pdf_url":"https://arxiv.org/pdf/2401.15613v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07269v2","updated":"2024-11-07T07:52:59Z","published":"2024-10-09T04:07:38Z","title":"Deep Learning for Surgical Instrument Recognition and Segmentation in\n Robotic-Assisted Surgeries: A Systematic Review","summary":" Applying deep learning (DL) for annotating surgical instruments in\nrobot-assisted minimally invasive surgeries (MIS) represents a significant\nadvancement in surgical technology. This systematic review examines 48 studies\nthat apply advanced DL methods and architectures. These sophisticated DL models\nhave shown notable improvements in the precision and efficiency of detecting\nand segmenting surgical tools. The enhanced capabilities of these models\nsupport various clinical applications, including real-time intraoperative\nguidance, comprehensive postoperative evaluations, and objective assessments of\nsurgical skills. 
By accurately identifying and segmenting surgical instruments\nin video data, DL models provide detailed feedback to surgeons, thereby\nimproving surgical outcomes and reducing complication risks. Furthermore, the\napplication of DL in surgical education is transformative. The review\nunderscores the significant impact of DL on improving the accuracy of skill\nassessments and the overall quality of surgical training programs. However,\nimplementing DL in surgical tool detection and segmentation faces challenges,\nsuch as the need for large, accurately annotated datasets to train these models\neffectively. The manual annotation process is labor-intensive and\ntime-consuming, posing a significant bottleneck. Future research should focus\non automating the detection and segmentation process and enhancing the\nrobustness of DL models against environmental variations. Expanding the\napplication of DL models across various surgical specialties will be essential\nto fully realize this technology's potential. Integrating DL with other\nemerging technologies, such as augmented reality (AR), also offers promising\nopportunities to further enhance the precision and efficacy of surgical\nprocedures.\n","authors":["Fatimaelzahraa Ali Ahmed","Mahmoud Yousef","Mariam Ali Ahmed","Hasan Omar Ali","Anns Mahboob","Hazrat Ali","Zubair Shah","Omar Aboumarzouk","Abdulla Al Ansari","Shidin Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2410.07269v2.pdf","comment":"57 pages, 9 figures, Published in Artificial Intelligence Reviews\n journal "},{"id":"http://arxiv.org/abs/2411.04501v1","updated":"2024-11-07T07:50:58Z","published":"2024-11-07T07:50:58Z","title":"Pose2Trajectory: Using Transformers on Body Pose to Predict Tennis\n Player's Trajectory","summary":" Tracking the trajectory of tennis players can help camera operators in\nproduction. Predicting future movement enables cameras to automatically track\nand predict a player's future trajectory without human intervention. Predicting\nfuture human movement in the context of complex physical tasks is also\nintellectually satisfying. Swift advancements in sports analytics and the wide\navailability of videos for tennis have inspired us to propose a novel method\ncalled Pose2Trajectory, which predicts a tennis player's future trajectory as a\nsequence derived from their body joints' data and ball position. Demonstrating\nimpressive accuracy, our approach capitalizes on body joint information to\nprovide a comprehensive understanding of the human body's geometry and motion,\nthereby enhancing the prediction of the player's trajectory. We use\nencoder-decoder Transformer architecture trained on the joints and trajectory\ninformation of the players with ball positions. The predicted sequence can\nprovide information to help close-up cameras to keep tracking the tennis\nplayer, following centroid coordinates. We generate a high-quality dataset from\nmultiple videos to assist tennis player movement prediction using object\ndetection and human pose estimation methods. It contains bounding boxes and\njoint information for tennis players and ball positions in singles tennis\ngames. Our method shows promising results in predicting the tennis player's\nmovement trajectory with different sequence prediction lengths using the joints\nand trajectory information with the ball position.\n","authors":["Ali K. 
AlShami","Terrance Boult","Jugal Kalita"],"pdf_url":"https://arxiv.org/pdf/2411.04501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05797v2","updated":"2024-11-07T07:47:17Z","published":"2024-02-08T16:37:04Z","title":"TaE: Task-aware Expandable Representation for Long Tail Class\n Incremental Learning","summary":" Class-incremental learning is dedicated to the development of deep learning\nmodels that are capable of acquiring new knowledge while retaining previously\nlearned information. Most methods focus on balanced data distribution for each\ntask, overlooking real-world long-tailed distributions. Therefore, Long-Tailed\nClass-Incremental Learning has been introduced, which trains on data where head\nclasses have more samples than tail classes. Existing methods mainly focus on\npreserving representative samples from previous classes to combat catastrophic\nforgetting. Recently, dynamic network algorithms freeze old network structures\nand expand new ones, achieving significant performance gains. However, with the\nintroduction of the long-tail problem, merely extending Determined blocks can\nlead to miscalibrated predictions, while expanding the entire backbone results\nin an explosion of memory size. To address these issues, we introduce a novel\nTask-aware Expandable (TaE) framework, dynamically allocating and updating\ntask-specific trainable parameters to learn diverse representations from each\nincremental task while resisting forgetting through the majority of frozen\nmodel parameters. To further encourage the class-specific feature\nrepresentation, we develop a Centroid-Enhanced (CEd) method to guide the update\nof these task-aware parameters. This approach is designed to adaptively\nallocate feature space for every class by adjusting the distance between intra-\nand inter-class features, which can extend to all \"training from scratch\"\nalgorithms. Extensive experiments demonstrate that TaE achieves\nstate-of-the-art performance.\n","authors":["Linjie Li","Zhenyu Wu","Jiaming Liu","Yang Ji"],"pdf_url":"https://arxiv.org/pdf/2402.05797v2.pdf","comment":"Accepted to ACCV2024"},{"id":"http://arxiv.org/abs/2411.04493v1","updated":"2024-11-07T07:41:04Z","published":"2024-11-07T07:41:04Z","title":"Synergy-Guided Regional Supervision of Pseudo Labels for Semi-Supervised\n Medical Image Segmentation","summary":" Semi-supervised learning has received considerable attention for its\npotential to leverage abundant unlabeled data to enhance model robustness.\nPseudo labeling is a widely used strategy in semi-supervised learning. However,\nexisting methods often suffer from noise contamination, which can undermine\nmodel performance. To tackle this challenge, we introduce a novel\nSynergy-Guided Regional Supervision of Pseudo Labels (SGRS-Net) framework.\nBuilt upon the mean teacher network, we employ a Mix Augmentation module to\nenhance the unlabeled data. By evaluating the synergy before and after\naugmentation, we strategically partition the pseudo labels into distinct\nregions. Additionally, we introduce a Region Loss Evaluation module to assess\nthe loss across each delineated area. 
Extensive experiments conducted on the LA\ndataset have demonstrated superior performance over state-of-the-art\ntechniques, underscoring the efficiency and practicality of our framework.\n","authors":["Tao Wang","Xinlin Zhang","Yuanbin Chen","Yuanbo Zhou","Longxuan Zhao","Tao Tan","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2411.04493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03807v2","updated":"2024-11-07T07:32:33Z","published":"2024-11-06T10:07:46Z","title":"GS2Pose: Two-stage 6D Object Pose Estimation Guided by Gaussian\n Splatting","summary":" This paper proposes a new method for accurate and robust 6D pose estimation\nof novel objects, named GS2Pose. By introducing 3D Gaussian splatting, GS2Pose\ncan utilize the reconstruction results without requiring a high-quality CAD\nmodel, which means it only requires segmented RGBD images as input.\nSpecifically, GS2Pose employs a two-stage structure consisting of coarse\nestimation followed by refined estimation. In the coarse stage, a lightweight\nU-Net network with a polarization attention mechanism, called Pose-Net, is\ndesigned. By using the 3DGS model for supervised training, Pose-Net can\ngenerate NOCS images to compute a coarse pose. In the refinement stage, GS2Pose\nformulates a pose regression algorithm following the idea of reprojection or\nBundle Adjustment (BA), referred to as GS-Refiner. By leveraging Lie algebra to\nextend 3DGS, GS-Refiner obtains a pose-differentiable rendering pipeline that\nrefines the coarse pose by comparing the input images with the rendered images.\nGS-Refiner also selectively updates parameters in the 3DGS model to achieve\nenvironmental adaptation, thereby enhancing the algorithm's robustness and\nflexibility to illuminative variation, occlusion, and other challenging\ndisruptive factors. GS2Pose was evaluated through experiments conducted on the\nLineMod dataset, where it was compared with similar algorithms, yielding highly\ncompetitive results. The code for GS2Pose will soon be released on GitHub.\n","authors":["Jilan Mei","Junbo Li","Cai Meng"],"pdf_url":"https://arxiv.org/pdf/2411.03807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04480v1","updated":"2024-11-07T07:19:28Z","published":"2024-11-07T07:19:28Z","title":"CFPNet: Improving Lightweight ToF Depth Completion via Cross-zone\n Feature Propagation","summary":" Depth completion using lightweight time-of-flight (ToF) depth sensors is\nattractive due to their low cost. However, lightweight ToF sensors usually have\na limited field of view (FOV) compared with cameras. Thus, only pixels in the\nzone area of the image can be associated with depth signals. Previous methods\nfail to propagate depth features from the zone area to the outside-zone area\neffectively, thus suffering from degraded depth completion performance outside\nthe zone. To this end, this paper proposes the CFPNet to achieve cross-zone\nfeature propagation from the zone area to the outside-zone area with two novel\nmodules. The first is a direct-attention-based propagation module (DAPM), which\nenforces direct cross-zone feature acquisition. The second is a\nlarge-kernel-based propagation module (LKPM), which realizes cross-zone feature\npropagation by utilizing convolution layers with kernel sizes up to 31. CFPNet\nachieves state-of-the-art (SOTA) depth completion performance by combining\nthese two modules properly, as verified by extensive experimental results on\nthe ZJU-L5 dataset. 
The code will be made public.\n","authors":["Laiyan Ding","Hualie Jiang","Rui Xu","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2411.04480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04475v1","updated":"2024-11-07T07:03:40Z","published":"2024-11-07T07:03:40Z","title":"Deep Learning Models for UAV-Assisted Bridge Inspection: A YOLO\n Benchmark Analysis","summary":" Visual inspections of bridges are critical to ensure their safety and\nidentify potential failures early. This inspection process can be rapidly and\naccurately automated by using unmanned aerial vehicles (UAVs) integrated with\ndeep learning models. However, choosing an appropriate model that is\nlightweight enough to integrate into the UAV and fulfills the strict\nrequirements for inference time and accuracy is challenging. Therefore, our\nwork contributes to the advancement of this model selection process by\nconducting a benchmark of 23 models belonging to the four newest YOLO variants\n(YOLOv5, YOLOv6, YOLOv7, YOLOv8) on COCO-Bridge-2021+, a dataset for bridge\ndetails detection. Through comprehensive benchmarking, we identify YOLOv8n,\nYOLOv7tiny, YOLOv6m, and YOLOv6m6 as the models offering an optimal balance\nbetween accuracy and processing speed, with mAP@50 scores of 0.803, 0.837,\n0.853, and 0.872, and inference times of 5.3ms, 7.5ms, 14.06ms, and 39.33ms,\nrespectively. Our findings accelerate the model selection process for UAVs,\nenabling more efficient and reliable bridge inspections.\n","authors":["Trong-Nhan Phan","Hoang-Hai Nguyen","Thi-Thu-Hien Ha","Huy-Tan Thai","Kim-Hung Le"],"pdf_url":"https://arxiv.org/pdf/2411.04475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04918v4","updated":"2024-11-07T06:58:04Z","published":"2024-09-07T21:52:58Z","title":"Training-free Zero-shot Composed Image Retrieval via Weighted Modality\n Fusion and Similarity","summary":" Composed image retrieval (CIR), which formulates the query as a combination\nof a reference image and modified text, has emerged as a new form of image\nsearch due to its enhanced ability to capture user intent. However, training a\nCIR model in a supervised manner typically requires labor-intensive collection\nof (reference image, text modifier, target image) triplets. While existing\nzero-shot CIR (ZS-CIR) methods eliminate the need for training on specific\ndownstream datasets, they still require additional pretraining on large-scale\nimage datasets. In this paper, we introduce a training-free approach for\nZS-CIR. Our approach, Weighted Modality fusion and similarity for CIR\n(WeiMoCIR), operates under the assumption that image and text modalities can be\neffectively combined using a simple weighted average. This allows the query\nrepresentation to be constructed directly from the reference image and text\nmodifier. To further enhance retrieval performance, we employ multimodal large\nlanguage models (MLLMs) to generate image captions for the database images and\nincorporate these textual captions into the similarity computation by combining\nthem with image information using a weighted average. Our approach is simple,\neasy to implement, and its effectiveness is validated through experiments on\nthe FashionIQ and CIRR datasets. 
Code is available at\nhttps://github.com/whats2000/WeiMoCIR.\n","authors":["Ren-Di Wu","Yu-Yen Lin","Huei-Fang Yang"],"pdf_url":"https://arxiv.org/pdf/2409.04918v4.pdf","comment":"14 pages, 6 figures, International Conference on Technologies and\n Applications of Artificial Intelligence (TAAI) Camera Ready"},{"id":"http://arxiv.org/abs/2411.04469v1","updated":"2024-11-07T06:39:50Z","published":"2024-11-07T06:39:50Z","title":"FreeCap: Hybrid Calibration-Free Motion Capture in Open Environments","summary":" We propose a novel hybrid calibration-free method FreeCap to accurately\ncapture global multi-person motions in open environments. Our system combines a\nsingle LiDAR with expandable moving cameras, allowing for flexible and precise\nmotion estimation in a unified world coordinate. In particular, We introduce a\nlocal-to-global pose-aware cross-sensor human-matching module that predicts the\nalignment among each sensor, even in the absence of calibration. Additionally,\nour coarse-to-fine sensor-expandable pose optimizer further optimizes the 3D\nhuman key points and the alignments, it is also capable of incorporating\nadditional cameras to enhance accuracy. Extensive experiments on Human-M3 and\nFreeMotion datasets demonstrate that our method significantly outperforms\nstate-of-the-art single-modal methods, offering an expandable and efficient\nsolution for multi-person motion capture across various applications.\n","authors":["Aoru Xue","Yiming Ren","Zining Song","Mao Ye","Xinge Zhu","Yuexin Ma"],"pdf_url":"https://arxiv.org/pdf/2411.04469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01723v7","updated":"2024-11-07T06:25:25Z","published":"2023-11-03T05:41:25Z","title":"Towards Calibrated Robust Fine-Tuning of Vision-Language Models","summary":" Improving out-of-distribution (OOD) generalization during in-distribution\n(ID) adaptation is a primary goal of robust fine-tuning of zero-shot models\nbeyond naive fine-tuning. However, despite decent OOD generalization\nperformance from recent robust fine-tuning methods, confidence calibration for\nreliable model output has not been fully addressed. This work proposes a robust\nfine-tuning method that improves both OOD accuracy and confidence calibration\nsimultaneously in vision language models. Firstly, we show that both OOD\nclassification and OOD calibration errors have a shared upper bound consisting\nof two terms of ID data: 1) ID calibration error and 2) the smallest singular\nvalue of the ID input covariance matrix. Based on this insight, we design a\nnovel framework that conducts fine-tuning with a constrained multimodal\ncontrastive loss enforcing a larger smallest singular value, which is further\nguided by the self-distillation of a moving-averaged model to achieve\ncalibrated prediction as well. 
Starting from empirical evidence supporting our\ntheoretical statements, we provide extensive experimental results on ImageNet\ndistribution shift benchmarks that demonstrate the effectiveness of our theorem\nand its practical implementation.\n","authors":["Changdae Oh","Hyesu Lim","Mijoo Kim","Dongyoon Han","Sangdoo Yun","Jaegul Choo","Alexander Hauptmann","Zhi-Qi Cheng","Kyungwoo Song"],"pdf_url":"https://arxiv.org/pdf/2311.01723v7.pdf","comment":"NeurIPS 2024 (a short version was presented at the NeurIPS 2023\n Workshop on Distribution Shifts); Major modification of (v7): Fixing the\n x-axis of Figure 3 and Pearson correlation, accordingly"},{"id":"http://arxiv.org/abs/2411.04457v1","updated":"2024-11-07T06:04:57Z","published":"2024-11-07T06:04:57Z","title":"Efficient single image non-uniformity correction algorithm","summary":" This paper introduces a new way to correct the non-uniformity (NU) in\nuncooled infrared-type images. The main defect of these uncooled images is the\nlack of a column (resp. line) time-dependent cross-calibration, resulting in a\nstrong column (resp. line) and time dependent noise. This problem can be\nconsidered as a 1D flicker of the columns inside each frame. Thus, classic\nmovie deflickering algorithms can be adapted, to equalize the columns (resp.\nthe lines). The proposed method therefore applies to the series formed by the\ncolumns of an infrared image a movie deflickering algorithm. The obtained\nsingle image method works on static images, and therefore requires no\nregistration, no camera motion compensation, and no closed aperture sensor\nequalization. Thus, the method has only one camera dependent parameter, and is\nlandscape independent. This simple method will be compared to a state of the\nart total variation single image correction on raw real and simulated images.\nThe method is real time, requiring only two operations per pixel. It involves\nno test-pattern calibration and produces no \"ghost artifacts\".\n","authors":["Yohann Tendero","Jerome Gilles","Stephane Landeau","Jean-Michel Morel"],"pdf_url":"https://arxiv.org/pdf/2411.04457v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2411.03615"},{"id":"http://arxiv.org/abs/2411.04456v1","updated":"2024-11-07T06:04:43Z","published":"2024-11-07T06:04:43Z","title":"Properties of BV-G structures + textures decomposition models.\n Application to road detection in satellite images","summary":" In this paper we present some theoretical results about a structures-textures\nimage decomposition model which was proposed by the second author. We prove a\ntheorem which gives the behavior of this model in different cases. Finally, as\na consequence of the theorem we derive an algorithm for the detection of long\nand thin objects applied to a road networks detection application in aerial or\nsatellite images.\n","authors":["Jerome Gilles","Yves Meyer"],"pdf_url":"https://arxiv.org/pdf/2411.04456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18057v2","updated":"2024-11-07T04:49:24Z","published":"2024-09-26T17:00:02Z","title":"LightAvatar: Efficient Head Avatar as Dynamic Neural Light Field","summary":" Recent works have shown that neural radiance fields (NeRFs) on top of\nparametric models have reached SOTA quality to build photorealistic head\navatars from a monocular video. However, one major limitation of the NeRF-based\navatars is the slow rendering speed due to the dense point sampling of NeRF,\npreventing them from broader utility on resource-constrained devices. 
We\nintroduce LightAvatar, the first head avatar model based on neural light fields\n(NeLFs). LightAvatar renders an image from 3DMM parameters and a camera pose\nvia a single network forward pass, without using mesh or volume rendering. The\nproposed approach, while being conceptually appealing, poses a significant\nchallenge towards real-time efficiency and training stability. To resolve them,\nwe introduce dedicated network designs to obtain proper representations for the\nNeLF model and maintain a low FLOPs budget. Meanwhile, we tap into a\ndistillation-based training strategy that uses a pretrained avatar model as\nteacher to synthesize abundant pseudo data for training. A warping field\nnetwork is introduced to correct the fitting error in the real data so that the\nmodel can learn better. Extensive experiments suggest that our method can\nachieve new SOTA image quality quantitatively or qualitatively, while being\nsignificantly faster than the counterparts, reporting 174.1 FPS (512x512\nresolution) on a consumer-grade GPU (RTX3090) with no customized optimization.\n","authors":["Huan Wang","Feitong Tan","Ziqian Bai","Yinda Zhang","Shichen Liu","Qiangeng Xu","Menglei Chai","Anish Prabhu","Rohit Pandey","Sean Fanello","Zeng Huang","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2409.18057v2.pdf","comment":"ECCV'24 CADL Workshop. Code:\n https://github.com/MingSun-Tse/LightAvatar-TensorFlow. V2: Corrected speed\n benchmark with GaussianAvatar"},{"id":"http://arxiv.org/abs/2312.02548v3","updated":"2024-11-07T04:44:38Z","published":"2023-12-05T07:34:30Z","title":"GeNIe: Generative Hard Negative Images Through Diffusion","summary":" Data augmentation is crucial in training deep models, preventing them from\noverfitting to limited data. Recent advances in generative AI, e.g., diffusion\nmodels, have enabled more sophisticated augmentation techniques that produce\ndata resembling natural images. We introduce GeNIe a novel augmentation method\nwhich leverages a latent diffusion model conditioned on a text prompt to\ncombine two contrasting data points (an image from the source category and a\ntext prompt from the target category) to generate challenging augmentations. To\nachieve this, we adjust the noise level (equivalently, number of diffusion\niterations) to ensure the generated image retains low-level and background\nfeatures from the source image while representing the target category,\nresulting in a hard negative sample for the source category. We further\nautomate and enhance GeNIe by adaptively adjusting the noise level selection on\na per image basis (coined as GeNIe-Ada), leading to further performance\nimprovements. Our extensive experiments, in both few-shot and long-tail\ndistribution settings, demonstrate the effectiveness of our novel augmentation\nmethod and its superior performance over the prior art. Our code is available\nat: https://github.com/UCDvision/GeNIe\n","authors":["Soroush Abbasi Koohpayegani","Anuj Singh","K L Navaneet","Hamed Pirsiavash","Hadi Jamali-Rad"],"pdf_url":"https://arxiv.org/pdf/2312.02548v3.pdf","comment":"Our code is available https://github.com/UCDvision/GeNIe"},{"id":"http://arxiv.org/abs/2411.00172v2","updated":"2024-11-07T04:41:32Z","published":"2024-10-31T19:37:47Z","title":"SeafloorAI: A Large-scale Vision-Language Dataset for Seafloor\n Geological Survey","summary":" A major obstacle to the advancements of machine learning models in marine\nscience, particularly in sonar imagery analysis, is the scarcity of AI-ready\ndatasets. 
While there have been efforts to make AI-ready sonar image datasets\npublicly available, they suffer from limitations in terms of environment\nsetting and scale. To bridge this gap, we introduce SeafloorAI, the first\nextensive AI-ready dataset for seafloor mapping across 5 geological layers\nthat is curated in collaboration with marine scientists. We further extend the\ndataset to SeafloorGenAI by incorporating the language component in order to\nfacilitate the development of both vision- and language-capable machine\nlearning models for sonar imagery. The dataset consists of 62 geo-distributed\ndata surveys spanning 17,300 square kilometers, with 696K sonar images, 827K\nannotated segmentation masks, 696K detailed language descriptions and\napproximately 7M question-answer pairs. By making our data processing source\ncode publicly available, we aim to engage the marine science community to\nenrich the data pool and inspire the machine learning community to develop more\nrobust models. This collaborative approach will enhance the capabilities and\napplications of our datasets within both fields.\n","authors":["Kien X. Nguyen","Fengchun Qiao","Arthur Trembanis","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2411.00172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04420v1","updated":"2024-11-07T04:16:15Z","published":"2024-11-07T04:16:15Z","title":"BendVLM: Test-Time Debiasing of Vision-Language Embeddings","summary":" Vision-language model (VLM) embeddings have been shown to encode biases\npresent in their training data, such as societal biases that prescribe negative\ncharacteristics to members of various racial and gender identities. VLMs are\nbeing quickly adopted for a variety of tasks ranging from few-shot\nclassification to text-guided image generation, making debiasing VLM embeddings\ncrucial. Debiasing approaches that fine-tune the VLM often suffer from\ncatastrophic forgetting. On the other hand, fine-tuning-free methods typically\nutilize a \"one-size-fits-all\" approach that assumes that correlation with the\nspurious attribute can be explained using a single linear direction across all\npossible inputs. In this work, we propose Bend-VLM, a nonlinear,\nfine-tuning-free approach for VLM embedding debiasing that tailors the\ndebiasing operation to each unique input. This allows for a more flexible\ndebiasing approach. Additionally, we do not require knowledge of the set of\ninputs a priori to inference time, making our method more appropriate for\nonline, open-set tasks such as retrieval and text-guided image generation.\n","authors":["Walter Gerych","Haoran Zhang","Kimia Hamidieh","Eileen Pan","Maanas Sharma","Thomas Hartvigsen","Marzyeh Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2411.04420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01981v2","updated":"2024-11-07T04:10:10Z","published":"2024-11-04T11:09:47Z","title":"Typicalness-Aware Learning for Failure Detection","summary":" Deep neural networks (DNNs) often suffer from the overconfidence issue, where\nincorrect predictions are made with high confidence scores, hindering the\napplications in critical systems. In this paper, we propose a novel approach\ncalled Typicalness-Aware Learning (TAL) to address this issue and improve\nfailure detection performance. We observe that, with the cross-entropy loss,\nmodel predictions are optimized to align with the corresponding labels via\nincreasing logit magnitude or refining logit direction. 
However, regarding\natypical samples, the image content and their labels may exhibit disparities.\nThis discrepancy can lead to overfitting on atypical samples, ultimately\nresulting in the overconfidence issue that we aim to address. To tackle the\nproblem, we have devised a metric that quantifies the typicalness of each\nsample, enabling the dynamic adjustment of the logit magnitude during the\ntraining process. By allowing atypical samples to be adequately fitted while\npreserving reliable logit direction, the problem of overconfidence can be\nmitigated. TAL has been extensively evaluated on benchmark datasets, and the\nresults demonstrate its superiority over existing failure detection methods.\nSpecifically, TAL achieves a more than 5% improvement on CIFAR100 in terms of\nthe Area Under the Risk-Coverage Curve (AURC) compared to the state-of-the-art.\nCode is available at https://github.com/liuyijungoon/TAL.\n","authors":["Yijun Liu","Jiequan Cui","Zhuotao Tian","Senqiao Yang","Qingdong He","Xiaoling Wang","Jingyong Su"],"pdf_url":"https://arxiv.org/pdf/2411.01981v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.03348v2","updated":"2024-11-07T04:05:58Z","published":"2024-11-03T18:44:28Z","title":"Undermining Image and Text Classification Algorithms Using Adversarial\n Attacks","summary":" Machine learning models are prone to adversarial attacks, where inputs can be\nmanipulated in order to cause misclassifications. While previous research has\nfocused on techniques like Generative Adversarial Networks (GANs), there's\nlimited exploration of GANs and Synthetic Minority Oversampling Technique\n(SMOTE) in text and image classification models to perform adversarial attacks.\nOur study addresses this gap by training various machine learning models and\nusing GANs and SMOTE to generate additional data points aimed at attacking text\nclassification models. Furthermore, we extend our investigation to face\nrecognition models, training a Convolutional Neural Network (CNN) and subjecting\nit to adversarial attacks with fast gradient sign perturbations on key features\nidentified by GradCAM, a technique used to highlight key image characteristics\nCNNs use in classification. Our experiments reveal a significant vulnerability\nin classification models. Specifically, we observe a 20% decrease in accuracy\nfor the top-performing text classification models post-attack, along with a 30%\ndecrease in facial recognition accuracy. This highlights the susceptibility\nof these models to manipulation of input data. Adversarial attacks not only\ncompromise the security but also undermine the reliability of machine learning\nsystems. By showcasing the impact of adversarial attacks on both text\nclassification and face recognition models, our study underscores the urgent\nneed to develop robust defenses against such vulnerabilities.\n","authors":["Langalibalele Lunga","Suhas Sreehari"],"pdf_url":"https://arxiv.org/pdf/2411.03348v2.pdf","comment":"Accepted for presentation at Electronic Imaging Conference 2025"},{"id":"http://arxiv.org/abs/2411.04406v1","updated":"2024-11-07T03:55:23Z","published":"2024-11-07T03:55:23Z","title":"Image Understanding Makes for A Good Tokenizer for Image Generation","summary":" Modern image generation (IG) models have been shown to capture rich\nsemantics valuable for image understanding (IU) tasks. However, the potential\nof IU models to improve IG performance remains uncharted. 
We address this issue\nusing a token-based IG framework, which relies on effective tokenizers to\nproject images into token sequences. Currently, pixel reconstruction (e.g.,\nVQGAN) dominates the training objective for image tokenizers. In contrast, our\napproach adopts the feature reconstruction objective, where tokenizers are\ntrained by distilling knowledge from pretrained IU encoders. Comprehensive\ncomparisons indicate that tokenizers with strong IU capabilities achieve\nsuperior IG performance across a variety of metrics, datasets, tasks, and\nproposal networks. Notably, VQ-KD CLIP achieves $4.10$ FID on ImageNet-1k\n(IN-1k). Visualization suggests that the superiority of VQ-KD can be partly\nattributed to the rich semantics within the VQ-KD codebook. We further\nintroduce a straightforward pipeline to directly transform IU encoders into\ntokenizers, demonstrating exceptional effectiveness for IG tasks. These\ndiscoveries may energize further exploration into image tokenizer research and\ninspire the community to reassess the relationship between IU and IG. The code\nis released at https://github.com/magic-research/vector_quantization.\n","authors":["Luting Wang","Yang Zhao","Zijian Zhang","Jiashi Feng","Si Liu","Bingyi Kang"],"pdf_url":"https://arxiv.org/pdf/2411.04406v1.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2310.03739v5","updated":"2024-11-07T03:54:22Z","published":"2023-10-05T17:59:18Z","title":"Aligning Text-to-Image Diffusion Models with Reward Backpropagation","summary":" Text-to-image diffusion models have recently emerged at the forefront of\nimage generation, powered by very large-scale unsupervised or weakly supervised\ntext-to-image training datasets. Due to their unsupervised training,\ncontrolling their behavior in downstream tasks, such as maximizing\nhuman-perceived image quality, image-text alignment, or ethical image\ngeneration, is difficult. Recent works finetune diffusion models to downstream\nreward functions using vanilla reinforcement learning, notorious for the high\nvariance of the gradient estimators. In this paper, we propose AlignProp, a\nmethod that aligns diffusion models to downstream reward functions using\nend-to-end backpropagation of the reward gradient through the denoising\nprocess. While naive implementation of such backpropagation would require\nprohibitive memory resources for storing the partial derivatives of modern\ntext-to-image models, AlignProp finetunes low-rank adapter weight modules and\nuses gradient checkpointing, to render its memory usage viable. We test\nAlignProp in finetuning diffusion models to various objectives, such as\nimage-text semantic alignment, aesthetics, compressibility and controllability\nof the number of objects present, as well as their combinations. 
We show\nAlignProp achieves higher rewards in fewer training steps than alternatives,\nwhile being conceptually simpler, making it a straightforward choice for\noptimizing diffusion models for differentiable reward functions of interest.\nCode and Visualization results are available at https://align-prop.github.io/.\n","authors":["Mihir Prabhudesai","Anirudh Goyal","Deepak Pathak","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2310.03739v5.pdf","comment":"This paper is subsumed by a later paper of ours: arXiv:2407.08737"},{"id":"http://arxiv.org/abs/2411.04404v1","updated":"2024-11-07T03:48:35Z","published":"2024-11-07T03:48:35Z","title":"Enhancing Bronchoscopy Depth Estimation through Synthetic-to-Real Domain\n Adaptation","summary":" Monocular depth estimation has shown promise in general imaging tasks, aiding\nin localization and 3D reconstruction. While effective in various domains, its\napplication to bronchoscopic images is hindered by the lack of labeled data,\nchallenging the use of supervised learning methods. In this work, we propose a\ntransfer learning framework that leverages synthetic data with depth labels for\ntraining and adapts domain knowledge for accurate depth estimation in real\nbronchoscope data. Our network demonstrates improved depth prediction on real\nfootage using domain adaptation compared to training solely on synthetic data,\nvalidating our approach.\n","authors":["Qingyao Tian","Huai Liao","Xinyan Huang","Lujie Li","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2411.04404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04399v1","updated":"2024-11-07T03:28:24Z","published":"2024-11-07T03:28:24Z","title":"ProGraph: Temporally-alignable Probability Guided Graph Topological\n Modeling for 3D Human Reconstruction","summary":" Current 3D human motion reconstruction methods from monocular videos rely on\nfeatures within the current reconstruction window, leading to distortion and\ndeformations in the human structure under local occlusions or blurriness in\nvideo frames. To estimate realistic 3D human mesh sequences based on incomplete\nfeatures, we propose Temporally-alignable Probability Guided Graph Topological\nModeling for 3D Human Reconstruction (ProGraph). For missing parts recovery, we\nexploit the explicit topological-aware probability distribution across the\nentire motion sequence. To restore the complete human, Graph Topological\nModeling (GTM) learns the underlying topological structure, focusing on the\nrelationships inherent in the individual parts. Next, to generate blurred\nmotion parts, Temporal-alignable Probability Distribution (TPDist) utilizes the\nGTM to predict features based on distribution. This interactive mechanism\nfacilitates motion consistency, allowing the restoration of human parts.\nFurthermore, Hierarchical Human Loss (HHLoss) constrains the probability\ndistribution errors of inter-frame features during topological structure\nvariation. 
Our method achieves superior results to other SOTA methods in\naddressing occlusions and blurriness on 3DPW.\n","authors":["Hongsheng Wang","Zehui Feng","Tong Xiao","Genfan Yang","Shengyu Zhang","Fei Wu","Feng Lin"],"pdf_url":"https://arxiv.org/pdf/2411.04399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00132v2","updated":"2024-11-07T03:22:56Z","published":"2024-10-31T18:33:39Z","title":"Beyond Accuracy: Ensuring Correct Predictions With Correct Rationales","summary":" Large pretrained foundation models demonstrate exceptional performance and,\nin some high-stakes applications, even surpass human experts. However, most of\nthese models are currently evaluated primarily on prediction accuracy,\noverlooking the validity of the rationales behind their accurate predictions.\nFor the safe deployment of foundation models, there is a pressing need to\nensure double-correct predictions, i.e., correct prediction backed by correct\nrationales. To achieve this, we propose a two-phase scheme: First, we curate a\nnew dataset that offers structured rationales for visual recognition tasks.\nSecond, we propose a rationale-informed optimization method to guide the model\nin disentangling and localizing visual evidence for each rationale, without\nrequiring manual annotations. Extensive experiments and ablation studies\ndemonstrate that our model outperforms state-of-the-art models by up to 10.1%\nin prediction accuracy across a wide range of tasks. Furthermore, our method\nsignificantly improves the model's rationale correctness, improving\nlocalization by 7.5% and disentanglement by 36.5%. Our dataset, source code,\nand pretrained weights: https://github.com/deep-real/DCP\n","authors":["Tang Li","Mengmeng Ma","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2411.00132v2.pdf","comment":"In Proceedings of the 38th Conference on Neural Information\n Processing Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2410.23247v3","updated":"2024-11-07T02:52:47Z","published":"2024-10-30T17:30:35Z","title":"bit2bit: 1-bit quanta video reconstruction via self-supervised photon\n prediction","summary":" Quanta image sensors, such as SPAD arrays, are an emerging sensor technology,\nproducing 1-bit arrays representing photon detection events over exposures as\nshort as a few nanoseconds. In practice, raw data are post-processed using\nheavy spatiotemporal binning to create more useful and interpretable images at\nthe cost of degrading spatiotemporal resolution. In this work, we propose\nbit2bit, a new method for reconstructing high-quality image stacks at the\noriginal spatiotemporal resolution from sparse binary quanta image data.\nInspired by recent work on Poisson denoising, we developed an algorithm that\ncreates a dense image sequence from sparse binary photon data by predicting the\nphoton arrival location probability distribution. However, due to the binary\nnature of the data, we show that the assumption of a Poisson distribution is\ninadequate. Instead, we model the process with a Bernoulli lattice process from\nthe truncated Poisson. This leads to the proposal of a novel self-supervised\nsolution based on a masked loss function. We evaluate our method using both\nsimulated and real data. On simulated data from a conventional video, we\nachieve 34.35 mean PSNR with extremely photon-sparse binary input (<0.06\nphotons per pixel per frame). We also present a novel dataset containing a wide\nrange of real SPAD high-speed videos under various challenging imaging\nconditions. 
The scenes cover strong/weak ambient light, strong motion,\nultra-fast events, etc. The dataset will be made available to the community,\nand on it we demonstrate the promise of our approach. Both reconstruction quality\nand throughput substantially surpass the state-of-the-art methods (e.g., Quanta\nBurst Photography (QBP)). Our approach significantly enhances the visualization\nand usability of the data, enabling the application of existing analysis\ntechniques.\n","authors":["Yehe Liu","Alexander Krull","Hector Basevi","Ales Leonardis","Michael W. Jenkins"],"pdf_url":"https://arxiv.org/pdf/2410.23247v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.14789v3","updated":"2024-11-07T02:43:03Z","published":"2024-08-27T05:31:30Z","title":"Revisiting Surgical Instrument Segmentation Without Human Intervention:\n A Graph Partitioning View","summary":" Surgical instrument segmentation (SIS) on endoscopic images stands as a\nlong-standing and essential task in the context of computer-assisted\ninterventions for boosting minimally invasive surgery. Given the recent surge\nof deep learning methodologies and their data-hungry nature, training a neural\npredictive model based on massive expert-curated annotations has been\ndominant and has served as an off-the-shelf approach in the field, which could,\nhowever, impose a prohibitive burden on clinicians for preparing fine-grained\npixel-wise labels corresponding to the collected surgical video frames. In this\nwork, we propose an unsupervised method by reframing the video frame\nsegmentation as a graph partitioning problem and regarding image pixels as\ngraph nodes, which is significantly different from the previous efforts. A\nself-supervised pre-trained model is first leveraged as a feature extractor\nto capture high-level semantic features. Then, Laplacian matrices are computed\nfrom the features and are eigendecomposed for graph partitioning. On the \"deep\"\neigenvectors, a surgical video frame is meaningfully segmented into different\nmodules such as tools and tissues, providing distinguishable semantic\ninformation like locations, classes, and relations. The segmentation problem\ncan then be naturally tackled by applying clustering or thresholding on the\neigenvectors. Extensive experiments are conducted on various datasets (e.g.,\nEndoVis2017, EndoVis2018, UCL, etc.) for different clinical endpoints. Across\nall the challenging scenarios, our method demonstrates outstanding performance\nand robustness superior to unsupervised state-of-the-art (SOTA) methods. The\ncode is released at https://github.com/MingyuShengSMY/GraphClusteringSIS.git.\n","authors":["Mingyu Sheng","Jianan Fan","Dongnan Liu","Ron Kikinis","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2408.14789v3.pdf","comment":"Accepted by The 32nd ACM International Conference on Multimedia (ACM\n MM 2024) Workshop on Multimedia Computing for Health and Medicine (MCHM)"},{"id":"http://arxiv.org/abs/2411.03695v2","updated":"2024-11-07T02:40:12Z","published":"2024-11-06T06:33:55Z","title":"AMNCutter: Affinity-Attention-Guided Multi-View Normalized Cutter for\n Unsupervised Surgical Instrument Segmentation","summary":" Surgical instrument segmentation (SIS) is pivotal for robotic-assisted\nminimally invasive surgery, assisting surgeons by identifying surgical\ninstruments in endoscopic video frames. 
Recent unsupervised surgical instrument\nsegmentation (USIS) methods primarily rely on pseudo-labels derived from\nlow-level features such as color and optical flow, but these methods show\nlimited effectiveness and generalizability in complex and unseen endoscopic\nscenarios. In this work, we propose a label-free unsupervised model featuring a\nnovel module named Multi-View Normalized Cutter (m-NCutter). Different from\nprevious USIS works, our model is trained using a graph-cutting loss function\nthat leverages patch affinities for supervision, eliminating the need for\npseudo-labels. The framework adaptively determines which affinities from which\nlevels should be prioritized. Therefore, the low- and high-level features and\ntheir affinities are effectively integrated to train a label-free unsupervised\nmodel, showing superior effectiveness and generalization ability. We conduct\ncomprehensive experiments across multiple SIS datasets to validate our\napproach's state-of-the-art (SOTA) performance, robustness, and exceptional\npotential as a pre-trained model. Our code is released at\nhttps://github.com/MingyuShengSMY/AMNCutter.\n","authors":["Mingyu Sheng","Jianan Fan","Dongnan Liu","Ron Kikinis","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2411.03695v2.pdf","comment":"Accepted by the 2025 IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV 2025)"},{"id":"http://arxiv.org/abs/2406.02880v2","updated":"2024-11-07T02:26:49Z","published":"2024-06-05T02:54:46Z","title":"Controllable Talking Face Generation by Implicit Facial Keypoints\n Editing","summary":" Audio-driven talking face generation has garnered significant interest within\nthe domain of digital human research. Existing methods are encumbered by\nintricate model architectures whose components are tightly dependent on each other,\ncomplicating the process of re-editing image or video inputs. In this work, we\npresent ControlTalk, a talking face generation method to control face\nexpression deformation based on the driving audio, which can construct the head pose\nand facial expression including lip motion for both single-image and sequential\nvideo inputs in a unified manner. By utilizing a pre-trained video synthesis\nrenderer and proposing a lightweight adaptation, ControlTalk achieves precise\nand naturalistic lip synchronization while enabling quantitative control over\nmouth opening shape. Our experiments show that our method surpasses\nstate-of-the-art performance on widely used benchmarks, including HDTF and\nMEAD. The parameterized adaptation demonstrates remarkable generalization\ncapabilities, effectively handling expression deformation across same-ID and\ncross-ID scenarios, and extending its utility to out-of-domain portraits,\nregardless of language. Code is available at\nhttps://github.com/NetEase-Media/ControlTalk.\n","authors":["Dong Zhao","Jiaying Shi","Wenjun Li","Shudong Wang","Shenghui Xu","Zhaoming Pan"],"pdf_url":"https://arxiv.org/pdf/2406.02880v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18782v2","updated":"2024-11-07T01:31:00Z","published":"2024-05-29T05:42:25Z","title":"Principled Probabilistic Imaging using Diffusion Models as Plug-and-Play\n Priors","summary":" Diffusion models (DMs) have recently shown outstanding capabilities in\nmodeling complex image distributions, making them expressive image priors for\nsolving Bayesian inverse problems. 
However, most existing DM-based methods rely\non approximations in the generative process to be generic to different inverse\nproblems, leading to inaccurate sample distributions that deviate from the\ntarget posterior defined within the Bayesian framework. To harness the\ngenerative power of DMs while avoiding such approximations, we propose a Markov\nchain Monte Carlo algorithm that performs posterior sampling for general\ninverse problems by reducing it to sampling the posterior of a Gaussian\ndenoising problem. Crucially, we leverage a general DM formulation as a unified\ninterface that allows for rigorously solving the denoising problem with a range\nof state-of-the-art DMs. We demonstrate the effectiveness of the proposed\nmethod on six inverse problems (three linear and three nonlinear), including a\nreal-world black hole imaging problem. Experimental results indicate that our\nproposed method offers more accurate reconstructions and posterior estimation\ncompared to existing DM-based imaging inverse methods.\n","authors":["Zihui Wu","Yu Sun","Yifan Chen","Bingliang Zhang","Yisong Yue","Katherine L. Bouman"],"pdf_url":"https://arxiv.org/pdf/2405.18782v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.04357v1","updated":"2024-11-07T01:30:30Z","published":"2024-11-07T01:30:30Z","title":"MegaPortrait: Revisiting Diffusion Control for High-fidelity Portrait\n Generation","summary":" We propose MegaPortrait. It's an innovative system for creating personalized\nportrait images in computer vision. It has three modules: Identity Net, Shading\nNet, and Harmonization Net. Identity Net generates learned identity using a\ncustomized model fine-tuned with source images. Shading Net re-renders\nportraits using extracted representations. Harmonization Net fuses pasted faces\nand the reference image's body for coherent results. Our approach with\noff-the-shelf Controlnets is better than state-of-the-art AI portrait products\nin identity preservation and image fidelity. MegaPortrait has a simple but\neffective design and we compare it with other methods and products to show its\nsuperiority.\n","authors":["Han Yang","Sotiris Anagnostidis","Enis Simsar","Thomas Hofmann"],"pdf_url":"https://arxiv.org/pdf/2411.04357v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2406.05768v6","updated":"2024-11-07T01:29:26Z","published":"2024-06-09T12:55:50Z","title":"TLCM: Training-efficient Latent Consistency Model for Image Generation\n with 2-8 Steps","summary":" Distilling latent diffusion models (LDMs) into ones that are fast to sample\nfrom is attracting growing research interest. However, the majority of existing\nmethods face two critical challenges: (1) They hinge on long training using a\nhuge volume of real data. (2) They routinely lead to quality degradation for\ngeneration, especially in text-image alignment. This paper proposes a novel\ntraining-efficient Latent Consistency Model (TLCM) to overcome these\nchallenges. Our method first accelerates LDMs via data-free multistep latent\nconsistency distillation (MLCD), and then data-free latent consistency\ndistillation is proposed to efficiently guarantee the inter-segment consistency\nin MLCD. Furthermore, we introduce bags of techniques, e.g., distribution\nmatching, adversarial learning, and preference learning, to enhance TLCM's\nperformance at few-step inference without any real data. 
TLCM demonstrates a\nhigh level of flexibility by enabling adjustment of sampling steps within the\nrange of 2 to 8 while still producing competitive outputs compared to full-step\napproaches. Notably, TLCM retains its data-free merit by employing synthetic\ndata from the teacher for distillation. With just 70 training hours on an A100\nGPU, a 3-step TLCM distilled from SDXL achieves an impressive CLIP Score of\n33.68 and an Aesthetic Score of 5.97 on the MSCOCO-2017 5K benchmark,\nsurpassing various accelerated models and even outperforming the teacher model\nin human preference metrics. We also demonstrate the versatility of TLCMs in\napplications including image style transfer, controllable generation, and\nChinese-to-image generation.\n","authors":["Qingsong Xie","Zhenyi Liao","Zhijie Deng","Chen chen","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2406.05768v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06633v2","updated":"2024-11-07T01:14:29Z","published":"2024-07-09T07:59:34Z","title":"Variational Zero-shot Multispectral Pansharpening","summary":" Pansharpening aims to generate a high spatial resolution multispectral image\n(HRMS) by fusing a low spatial resolution multispectral image (LRMS) and a\npanchromatic image (PAN). The most challenging issue for this task is that only\nthe to-be-fused LRMS and PAN are available, and the existing deep\nlearning-based methods are unsuitable since they rely on many training pairs.\nTraditional variational optimization (VO) based methods are well-suited for\naddressing such a problem. They focus on carefully designing explicit fusion\nrules as well as regularizations for an optimization problem, which are based\non the researcher's discovery of the image relationships and image structures.\nUnlike previous VO-based methods, in this work, we explore such complex\nrelationships by a parameterized term rather than a manually designed one.\nSpecifically, we propose a zero-shot pansharpening method by introducing a\nneural network into the optimization objective. This network estimates a\nrepresentation component of HRMS, which mainly describes the relationship\nbetween HRMS and PAN. In this way, the network achieves a similar goal to the\nso-called deep image prior because it implicitly regulates the relationship\nbetween the HRMS and PAN images through its inherent structure. We directly\nminimize this optimization objective via network parameters and the expected\nHRMS image through iterative updating. Extensive experiments on various\nbenchmark datasets demonstrate that our proposed method can achieve better\nperformance compared with other state-of-the-art methods. The code is\navailable at https://github.com/xyrui/PSDip.\n","authors":["Xiangyu Rui","Xiangyong Cao","Yining Li","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2407.06633v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04351v1","updated":"2024-11-07T01:12:01Z","published":"2024-11-07T01:12:01Z","title":"LidaRefer: Outdoor 3D Visual Grounding for Autonomous Driving with\n Transformers","summary":" 3D visual grounding (VG) aims to locate relevant objects or regions within 3D\nscenes based on natural language descriptions. Although recent methods for\nindoor 3D VG have successfully adopted transformer-based architectures to capture\nglobal contextual information and enable fine-grained cross-modal fusion, they\nare unsuitable for outdoor environments due to differences in the distribution\nof point clouds between indoor and outdoor settings. 
Specifically, first,\nextensive LiDAR point clouds demand unacceptable computational and memory\nresources within transformers due to the high-dimensional visual features.\nSecond, dominant background points and empty spaces in sparse LiDAR point\nclouds complicate cross-modal fusion owing to their irrelevant visual\ninformation. To address these challenges, we propose LidaRefer, a\ntransformer-based 3D VG framework designed for large-scale outdoor scenes.\nMoreover, during training, we introduce a simple and effective localization\nmethod, which supervises the decoder's queries to localize not only a target\nobject but also ambiguous objects that might be confused with the target due to\nsimilar attributes exhibited in a scene or an incorrect understanding\nof the language description. This supervision enhances the model's ability to\ndistinguish ambiguous objects from a target by learning the differences in\ntheir spatial relationships and attributes. LidaRefer achieves state-of-the-art\nperformance on Talk2Car-3D, a 3D VG dataset for autonomous driving, with\nsignificant improvements under various evaluation settings.\n","authors":["Yeong-Seung Baek","Heung-Seon Oh"],"pdf_url":"https://arxiv.org/pdf/2411.04351v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.04348v1","updated":"2024-11-07T01:10:05Z","published":"2024-11-07T01:10:05Z","title":"UEVAVD: A Dataset for Developing UAV's Eye View Active Object Detection","summary":" Occlusion is a longstanding difficulty that challenges UAV-based object\ndetection. Many works address this problem by adapting the detection model.\nHowever, few of them exploit the fact that the UAV could fundamentally improve detection\nperformance by changing its viewpoint. Active Object Detection (AOD) offers an\neffective way to achieve this purpose. Through Deep Reinforcement Learning\n(DRL), AOD endows the UAV with the ability of autonomous path planning to\nsearch for the observation that is more conducive to target identification.\nUnfortunately, there exists no available dataset for developing the UAV AOD\nmethod. To fill this gap, we release a UAV's eye view active vision dataset\nnamed UEVAVD and hope it can facilitate research on the UAV AOD problem.\nAdditionally, we improve the existing DRL-based AOD method by incorporating the\ninductive bias when learning the state representation. First, due to the\npartial observability, we use the gated recurrent unit to extract state\nrepresentations from the observation sequence instead of the single-view\nobservation. Second, we pre-decompose the scene with the Segment Anything Model\n(SAM) and filter out the irrelevant information with the derived masks. With\nthese practices, the agent can learn an active viewing policy with better\ngeneralization capability. The effectiveness of our innovations is validated by\nthe experiments on the UEVAVD dataset. 
Our dataset will soon be available at\nhttps://github.com/Leo000ooo/UEVAVD_dataset.\n","authors":["Xinhua Jiang","Tianpeng Liu","Li Liu","Zhen Liu","Yongxiang Liu"],"pdf_url":"https://arxiv.org/pdf/2411.04348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21739v2","updated":"2024-11-07T00:37:50Z","published":"2024-10-29T04:54:45Z","title":"SS3DM: Benchmarking Street-View Surface Reconstruction with a Synthetic\n 3D Mesh Dataset","summary":" Reconstructing accurate 3D surfaces for street-view scenarios is crucial for\napplications such as digital entertainment and autonomous driving simulation.\nHowever, existing street-view datasets, including KITTI, Waymo, and nuScenes,\nonly offer noisy LiDAR points as ground-truth data for geometric evaluation of\nreconstructed surfaces. These geometric ground-truths often lack the necessary\nprecision to evaluate surface positions and do not provide data for assessing\nsurface normals. To overcome these challenges, we introduce the SS3DM dataset,\ncomprising precise \\textbf{S}ynthetic \\textbf{S}treet-view \\textbf{3D}\n\\textbf{M}esh models exported from the CARLA simulator. These mesh models\nfacilitate accurate position evaluation and include normal vectors for\nevaluating surface normal. To simulate the input data in realistic driving\nscenarios for 3D reconstruction, we virtually drive a vehicle equipped with six\nRGB cameras and five LiDAR sensors in diverse outdoor scenes. Leveraging this\ndataset, we establish a benchmark for state-of-the-art surface reconstruction\nmethods, providing a comprehensive evaluation of the associated challenges.\n For more information, visit our homepage at https://ss3dm.top.\n","authors":["Yubin Hu","Kairui Wen","Heng Zhou","Xiaoyang Guo","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2410.21739v2.pdf","comment":"NeurIPS 2024, Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2411.04335v1","updated":"2024-11-07T00:22:38Z","published":"2024-11-07T00:22:38Z","title":"GazeGen: Gaze-Driven User Interaction for Visual Content Generation","summary":" We present GazeGen, a user interaction system that generates visual content\n(images and videos) for locations indicated by the user's eye gaze. GazeGen\nallows intuitive manipulation of visual content by targeting regions of\ninterest with gaze. Using advanced techniques in object detection and\ngenerative AI, GazeGen performs gaze-controlled image adding/deleting,\nrepositioning, and surface material changes of image objects, and converts\nstatic images into videos. Central to GazeGen is the DFT Gaze (Distilled and\nFine-Tuned Gaze) agent, an ultra-lightweight model with only 281K parameters,\nperforming accurate real-time gaze predictions tailored to individual users'\neyes on small edge devices. GazeGen is the first system to combine visual\ncontent generation with real-time gaze estimation, made possible exclusively by\nDFT Gaze. This real-time gaze estimation enables various visual content\ngeneration tasks, all controlled by the user's gaze. The input for DFT Gaze is\nthe user's eye images, while the inputs for visual content generation are the\nuser's view and the predicted gaze point from DFT Gaze. To achieve efficient\ngaze predictions, we derive the small model from a large model (10x larger) via\nnovel knowledge distillation and personal adaptation techniques. We integrate\nknowledge distillation with a masked autoencoder, developing a compact yet\npowerful gaze estimation model. 
This model is further fine-tuned with Adapters,\nenabling highly accurate and personalized gaze predictions with minimal user\ninput. DFT Gaze ensures low-latency and precise gaze tracking, supporting a\nwide range of gaze-driven tasks. We validate the performance of DFT Gaze on AEA\nand OpenEDS2020 benchmarks, demonstrating low angular gaze error and low\nlatency on the edge device (Raspberry Pi 4). Furthermore, we describe\napplications of GazeGen, illustrating its versatility and effectiveness in\nvarious usage scenarios.\n","authors":["He-Yen Hsieh","Ziyun Li","Sai Qian Zhang","Wei-Te Mark Ting","Kao-Den Chang","Barbara De Salvo","Chiao Liu","H. T. Kung"],"pdf_url":"https://arxiv.org/pdf/2411.04335v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.04332v1","updated":"2024-11-07T00:14:39Z","published":"2024-11-07T00:14:39Z","title":"HandCraft: Anatomically Correct Restoration of Malformed Hands in\n Diffusion Generated Images","summary":" Generative text-to-image models, such as Stable Diffusion, have demonstrated\na remarkable ability to generate diverse, high-quality images. However, they\nare surprisingly inept when it comes to rendering human hands, which are often\nanatomically incorrect or reside in the \"uncanny valley\". In this paper, we\npropose a method HandCraft for restoring such malformed hands. This is achieved\nby automatically constructing masks and depth images for hands as conditioning\nsignals using a parametric model, allowing a diffusion-based image editor to\nfix the hand's anatomy and adjust its pose while seamlessly integrating the\nchanges into the original image, preserving pose, color, and style. Our\nplug-and-play hand restoration solution is compatible with existing pretrained\ndiffusion models, and the restoration process facilitates adoption by eschewing\nany fine-tuning or training requirements for the diffusion models. We also\ncontribute MalHand datasets that contain generated images with a wide variety\nof malformed hands in several styles for hand detector training and hand\nrestoration benchmarking, and demonstrate through qualitative and quantitative\nevaluation that HandCraft not only restores anatomical correctness but also\nmaintains the integrity of the overall image.\n","authors":["Zhenyue Qin","Yiqun Zhang","Yang Liu","Dylan Campbell"],"pdf_url":"https://arxiv.org/pdf/2411.04332v1.pdf","comment":"Accepted by WACV 2025"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2406.09215v3","updated":"2024-11-07T18:30:53Z","published":"2024-06-13T15:16:11Z","title":"On Softmax Direct Preference Optimization for Recommendation","summary":" Recommender systems aim to predict personalized rankings based on user\npreference data. With the rise of Language Models (LMs), LM-based recommenders\nhave been widely explored due to their extensive world knowledge and powerful\nreasoning abilities. Most of the LM-based recommenders convert historical\ninteractions into language prompts, pairing with a positive item as the target\nresponse and fine-tuning LM with a language modeling loss. However, the current\nobjective fails to fully leverage preference data and is not optimized for\npersonalized ranking tasks, which hinders the performance of LM-based\nrecommenders. 
Inspired by the current advancement of Direct Preference\nOptimization (DPO) in human preference alignment and the success of softmax\nloss in recommendations, we propose Softmax-DPO (S-DPO) to instill ranking\ninformation into the LM to help LM-based recommenders distinguish preferred\nitems from negatives, rather than solely focusing on positives. Specifically,\nwe incorporate multiple negatives in user preference data and devise an\nalternative version of DPO loss tailored for LM-based recommenders, which is\nextended from the traditional full-ranking Plackett-Luce (PL) model to partial\nrankings and connected to softmax sampling strategies. Theoretically, we bridge\nS-DPO with the softmax loss over negative sampling and find that it has an\ninherent benefit of mining hard negatives, which assures its exceptional\ncapabilities in recommendation tasks. Empirically, extensive experiments\nconducted on three real-world datasets demonstrate the superiority of S-DPO in\neffectively modeling user preference and further boosting recommendation performance\nwhile providing better rewards for preferred items. Our code is available at\nhttps://github.com/chenyuxin1999/S-DPO.\n","authors":["Yuxin Chen","Junfei Tan","An Zhang","Zhengyi Yang","Leheng Sheng","Enzhi Zhang","Xiang Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2406.09215v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.14894v2","updated":"2024-11-07T18:15:23Z","published":"2024-06-21T06:30:16Z","title":"Talking the Talk Does Not Entail Walking the Walk: On the Limits of\n Large Language Models in Lexical Entailment Recognition","summary":" Verbs form the backbone of language, providing structure and meaning to\nsentences. Yet, their intricate semantic nuances pose a longstanding challenge.\nUnderstanding verb relations through the concept of lexical entailment is\ncrucial for comprehending sentence meanings and grasping verb dynamics. This\nwork investigates the capabilities of eight Large Language Models in\nrecognizing lexical entailment relations among verbs through differently\ndevised prompting strategies and zero-/few-shot settings over verb pairs from\ntwo lexical databases, namely WordNet and HyperLex. Our findings reveal that\nthe models can tackle the lexical entailment recognition task with moderately\ngood performance, although at varying degrees of effectiveness and under\ndifferent conditions. Also, utilizing few-shot prompting can enhance the\nmodels' performance. However, perfectly solving the task remains an unmet\nchallenge for all examined LLMs, which highlights the need for further research\non this topic.\n","authors":["Candida M. Greco","Lucio La Cava","Andrea Tagarelli"],"pdf_url":"https://arxiv.org/pdf/2406.14894v2.pdf","comment":"Accepted for publication at The 2024 Conference on Empirical Methods\n in Natural Language Processing (EMNLP-2024) - Findings"},{"id":"http://arxiv.org/abs/2411.04798v1","updated":"2024-11-07T15:38:14Z","published":"2024-11-07T15:38:14Z","title":"Orbit: A Framework for Designing and Evaluating Multi-objective Rankers","summary":" Machine learning in production needs to balance multiple objectives: This is\nparticularly evident in ranking or recommendation models, where conflicting\nobjectives such as user engagement, satisfaction, diversity, and novelty must\nbe considered at the same time. However, designing multi-objective rankers is\ninherently a dynamic wicked problem -- there is no single optimal solution, and\nthe needs evolve over time. 
Effective design requires collaboration between\ncross-functional teams and careful analysis of a wide range of information. In\nthis work, we introduce Orbit, a conceptual framework for Objective-centric\nRanker Building and Iteration. The framework places objectives at the center of\nthe design process, to serve as boundary objects for communication and guide\npractitioners for design and evaluation. We implement Orbit as an interactive\nsystem, which enables stakeholders to interact with objective spaces directly\nand supports real-time exploration and evaluation of design trade-offs. We\nevaluate Orbit through a user study involving twelve industry practitioners,\nshowing that it supports efficient design space exploration, leads to more\ninformed decision-making, and enhances awareness of the inherent trade-offs of\nmultiple objectives. Orbit (1) opens up new opportunities of an\nobjective-centric design process for any multi-objective ML models, as well as\n(2) sheds light on future designs that push practitioners to go beyond a narrow\nmetric-centric or example-centric mindset.\n","authors":["Chenyang Yang","Tesi Xiao","Michael Shavlovsky","Christian Kästner","Tongshuang Wu"],"pdf_url":"https://arxiv.org/pdf/2411.04798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14331v2","updated":"2024-11-07T14:48:18Z","published":"2024-10-18T09:43:30Z","title":"ChartifyText: Automated Chart Generation from Data-Involved Texts via\n LLM","summary":" Text documents with numerical values involved are widely used in various\napplications such as scientific research, economy, public health and\njournalism. However, it is difficult for readers to quickly interpret such\ndata-involved texts and gain deep insights. To fill this research gap, this\nwork aims to automatically generate charts to accurately convey the underlying\ndata and ideas to readers, which is essentially a challenging task. The\nchallenges originate from text ambiguities, intrinsic sparsity and uncertainty\nof data in text documents, and subjective sentiment differences. Specifically,\nwe propose ChartifyText, a novel fully-automated approach that leverages Large\nLanguage Models (LLMs) to convert complex data-involved texts to expressive\ncharts. It consists of two major modules: tabular data inference and expressive\nchart generation. The tabular data inference module employs systematic prompt\nengineering to guide the LLM (e.g., GPT-4) to infer table data, where data\nranges, uncertainties, missing data values and corresponding subjective\nsentiments are explicitly considered. The expressive chart generation module\naugments standard charts with intuitive visual encodings and concise texts to\naccurately convey the underlying data and insights. We extensively evaluate the\neffectiveness of ChartifyText on real-world data-involved text documents\nthrough case studies, in-depth interviews with three visualization experts, and\na carefully-designed user study with 15 participants. 
The results demonstrate\nthe usefulness and effectiveness of ChartifyText in helping readers efficiently\nand effectively make sense of data-involved texts.\n","authors":["Songheng Zhang","Lei Wang","Toby Jia-Jun Li","Qiaomu Shen","Yixin Cao","Yong Wang"],"pdf_url":"https://arxiv.org/pdf/2410.14331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04677v1","updated":"2024-11-07T13:03:21Z","published":"2024-11-07T13:03:21Z","title":"Lightning IR: Straightforward Fine-tuning and Inference of\n Transformer-based Language Models for Information Retrieval","summary":" A wide range of transformer-based language models have been proposed for\ninformation retrieval tasks. However, fine-tuning and inference of these models\nis often complex and requires substantial engineering effort. This paper\nintroduces Lightning IR, a PyTorch Lightning-based framework for fine-tuning\nand inference of transformer-based language models for information retrieval.\nLightning IR provides a modular and extensible architecture that supports all\nstages of an information retrieval pipeline: from fine-tuning and indexing to\nsearching and re-ranking. It is designed to be straightforward to use,\nscalable, and reproducible. Lightning IR is available as open-source:\nhttps://github.com/webis-de/lightning-ir.\n","authors":["Ferdinand Schlatt","Maik Fröbe","Matthias Hagen"],"pdf_url":"https://arxiv.org/pdf/2411.04677v1.pdf","comment":"Accepted as a demo at WSDM'25"},{"id":"http://arxiv.org/abs/2410.05779v2","updated":"2024-11-07T10:44:59Z","published":"2024-10-08T08:00:12Z","title":"LightRAG: Simple and Fast Retrieval-Augmented Generation","summary":" Retrieval-Augmented Generation (RAG) systems enhance large language models\n(LLMs) by integrating external knowledge sources, enabling more accurate and\ncontextually relevant responses tailored to user needs. However, existing RAG\nsystems have significant limitations, including reliance on flat data\nrepresentations and inadequate contextual awareness, which can lead to\nfragmented answers that fail to capture complex inter-dependencies. To address\nthese challenges, we propose LightRAG, which incorporates graph structures into\ntext indexing and retrieval processes. This innovative framework employs a\ndual-level retrieval system that enhances comprehensive information retrieval\nfrom both low-level and high-level knowledge discovery. Additionally, the\nintegration of graph structures with vector representations facilitates\nefficient retrieval of related entities and their relationships, significantly\nimproving response times while maintaining contextual relevance. This\ncapability is further enhanced by an incremental update algorithm that ensures\nthe timely integration of new data, allowing the system to remain effective and\nresponsive in rapidly changing data environments. Extensive experimental\nvalidation demonstrates considerable improvements in retrieval accuracy and\nefficiency compared to existing approaches. 
We have made our LightRAG\nopen-source and available at the link: https://github.com/HKUDS/LightRAG.\n","authors":["Zirui Guo","Lianghao Xia","Yanhua Yu","Tu Ao","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2410.05779v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04602v1","updated":"2024-11-07T10:31:31Z","published":"2024-11-07T10:31:31Z","title":"Self-Calibrated Listwise Reranking with Large Language Models","summary":" Large language models (LLMs), with advanced linguistic capabilities, have\nbeen employed in reranking tasks through a sequence-to-sequence approach. In\nthis paradigm, multiple passages are reranked in a listwise manner and a\ntextual reranked permutation is generated. However, due to the limited context\nwindow of LLMs, this reranking paradigm requires a sliding window strategy to\niteratively handle larger candidate sets. This not only increases computational\ncosts but also restricts the LLM from fully capturing all the comparison\ninformation for all candidates. To address these challenges, we propose a novel\nself-calibrated listwise reranking method, which aims to leverage LLMs to\nproduce global relevance scores for ranking. To achieve it, we first propose\nthe relevance-aware listwise reranking framework, which incorporates explicit\nlist-view relevance scores to improve reranking efficiency and enable global\ncomparison across the entire candidate set. Second, to ensure the comparability\nof the computed scores, we propose self-calibrated training that uses\npoint-view relevance assessments generated internally by the LLM itself to\ncalibrate the list-view relevance assessments. Extensive experiments and\ncomprehensive analysis on the BEIR benchmark and TREC Deep Learning Tracks\ndemonstrate the effectiveness and efficiency of our proposed method.\n","authors":["Ruiyang Ren","Yuhao Wang","Kun Zhou","Wayne Xin Zhao","Wenjie Wang","Jing Liu","Ji-Rong Wen","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2411.04602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04539v1","updated":"2024-11-07T08:54:46Z","published":"2024-11-07T08:54:46Z","title":"Best Practices for Distilling Large Language Models into BERT for Web\n Search Ranking","summary":" Recent studies have highlighted the significant potential of Large Language\nModels (LLMs) as zero-shot relevance rankers. These methods predominantly\nutilize prompt learning to assess the relevance between queries and documents\nby generating a ranked list of potential documents. Despite their promise, the\nsubstantial costs associated with LLMs pose a significant challenge for their\ndirect implementation in commercial search systems. To overcome this barrier\nand fully exploit the capabilities of LLMs for text ranking, we explore\ntechniques to transfer the ranking expertise of LLMs to a more compact model\nsimilar to BERT, using a ranking loss to enable the deployment of less\nresource-intensive models. Specifically, we enhance the training of LLMs\nthrough Continued Pre-Training, taking the query as input and the clicked title\nand summary as output. We then proceed with supervised fine-tuning of the LLM\nusing a rank loss, assigning the final token as a representative of the entire\nsentence. Given the inherent characteristics of autoregressive language models,\nonly the final token can encapsulate all preceding tokens. Additionally,\nwe introduce a hybrid point-wise and margin MSE loss to transfer the ranking\nknowledge from LLMs to smaller models like BERT. 
This method creates a viable\nsolution for environments with strict resource constraints. Both offline and\nonline evaluations have confirmed the efficacy of our approach, and our model\nhas been successfully integrated into a commercial web search engine as of\nFebruary 2024.\n","authors":["Dezhi Ye","Junwei Hu","Jiabin Fan","Bowen Tian","Jie Liu","Haijin Liang","Jin Ma"],"pdf_url":"https://arxiv.org/pdf/2411.04539v1.pdf","comment":"Arxiv Version"},{"id":"http://arxiv.org/abs/2411.04403v1","updated":"2024-11-07T03:46:43Z","published":"2024-11-07T03:46:43Z","title":"Towards Competitive Search Relevance For Inference-Free Learned Sparse\n Retrievers","summary":" Learned sparse retrieval, which can efficiently perform retrieval through\nmature inverted-index engines, has garnered growing attention in recent years.\nParticularly, the inference-free sparse retrievers are attractive as they\neliminate online model inference in the retrieval phase, thereby avoiding huge\ncomputational costs, offering reasonable throughput and latency. However, even\nthe state-of-the-art (SOTA) inference-free sparse models lag far behind in\nterms of search relevance when compared to both sparse and dense siamese\nmodels. Towards competitive search relevance for inference-free sparse\nretrievers, we argue that they deserve dedicated training methods rather than\nthe same ones used for siamese encoders. In this paper, we propose two different\napproaches for performance improvement. First, we introduce the IDF-aware FLOPS\nloss, which introduces Inverted Document Frequency (IDF) to the sparsification\nof representations. We find that it mitigates the negative impact of the FLOPS\nregularization on search relevance, allowing the model to achieve a better\nbalance between accuracy and efficiency. Moreover, we propose a heterogeneous\nensemble knowledge distillation framework that combines siamese dense and\nsparse retrievers to generate supervisory signals during the pre-training\nphase. The ensemble framework of dense and sparse retrievers capitalizes on\ntheir respective strengths, providing a strong upper bound for knowledge\ndistillation. To reconcile the diverse feedback from heterogeneous supervisors, we\nnormalize and then aggregate the outputs of the teacher models to eliminate\nscore scale differences. On the BEIR benchmark, our model outperforms the existing\nSOTA inference-free sparse model by \\textbf{3.3 NDCG@10 score}. It exhibits\nsearch relevance comparable to siamese sparse retrievers and client-side\nlatency only \\textbf{1.1x that of BM25}.\n","authors":["Zhichao Geng","Dongyu Ru","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2411.04403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04366v1","updated":"2024-11-07T01:52:46Z","published":"2024-11-07T01:52:46Z","title":"The Concatenator: A Bayesian Approach To Real Time Concatenative\n Musaicing","summary":" We present ``The Concatenator,'' a real time system for audio-guided\nconcatenative synthesis. Similarly to Driedger et al.'s ``musaicing'' (or\n``audio mosaicing'') technique, we concatenate a set number of windows within a\ncorpus of audio to re-create the harmonic and percussive aspects of a target\naudio stream. Unlike Driedger's NMF-based technique, however, we instead use an\nexplicitly Bayesian point of view, where corpus window indices are hidden\nstates and the target audio stream is an observation. We use a particle filter\nto infer the best hidden corpus states in real-time. 
Our transition model\nincludes a tunable parameter to control the time-continuity of corpus grains,\nand our observation model allows users to prioritize how quickly windows change\nto match the target. Because the computational complexity of the system is\nindependent of the corpus size, our system scales to corpora that are hours\nlong, which is an important feature in the age of vast audio data collections.\nWithin The Concatenator module itself, composers can vary grain length, fit to\ntarget, and pitch shift in real time while reacting to the sounds they hear,\nenabling them to rapidly iterate ideas. To conclude our work, we evaluate our\nsystem with extensive quantitative tests of the effects of parameters, as well\nas a qualitative evaluation with artistic insights. Based on the quality of the\nresults, we believe the real-time capability unlocks new avenues for musical\nexpression and control, suitable for live performance and modular synthesis\nintegration, which furthermore represents an essential breakthrough in\nconcatenative synthesis technology.\n","authors":["Christopher Tralie","Ben Cantil"],"pdf_url":"https://arxiv.org/pdf/2411.04366v1.pdf","comment":"12 pages, 6 figures, Accepted for Publication in The International\n Society for Music Information Retrieval Proceedings, 2024"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2411.05007v1","updated":"2024-11-07T18:59:58Z","published":"2024-11-07T18:59:58Z","title":"SVDQuant: Absorbing Outliers by Low-Rank Components for 4-Bit Diffusion\n Models","summary":" Diffusion models have been proven highly effective at generating high-quality\nimages. However, as these models grow larger, they require significantly more\nmemory and suffer from higher latency, posing substantial challenges for\ndeployment. In this work, we aim to accelerate diffusion models by quantizing\ntheir weights and activations to 4 bits. At such an aggressive level, both\nweights and activations are highly sensitive, and conventional post-training\nquantization methods for large language models, such as smoothing, become\ninsufficient. To overcome this limitation, we propose SVDQuant, a new 4-bit\nquantization paradigm. Different from smoothing, which redistributes outliers\nbetween weights and activations, our approach absorbs these outliers using a\nlow-rank branch. We first consolidate the outliers by shifting them from\nactivations to weights, then employ a high-precision low-rank branch to take in\nthe weight outliers with Singular Value Decomposition (SVD). This process eases\nthe quantization on both sides. However, na\\"{\\i}vely running the low-rank\nbranch independently incurs significant overhead due to extra data movement of\nactivations, negating the quantization speedup. To address this, we co-design\nan inference engine Nunchaku that fuses the kernels of the low-rank branch into\nthose of the low-bit branch to cut off redundant memory access. It can also\nseamlessly support off-the-shelf low-rank adapters (LoRAs) without the need for\nre-quantization. Extensive experiments on SDXL, PixArt-$\\Sigma$, and FLUX.1\nvalidate the effectiveness of SVDQuant in preserving image quality. We reduce\nthe memory usage for the 12B FLUX.1 models by 3.5$\\times$, achieving\n3.0$\\times$ speedup over the 4-bit weight-only quantized baseline on the 16GB\nlaptop 4090 GPU, paving the way for more interactive applications on PCs. 
Our\nquantization library and inference engine are open-sourced.\n","authors":["Muyang Li","Yujun Lin","Zhekai Zhang","Tianle Cai","Xiuyu Li","Junxian Guo","Enze Xie","Chenlin Meng","Jun-Yan Zhu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2411.05007v1.pdf","comment":"Quantization Library: https://github.com/mit-han-lab/deepcompressor\n Inference Engine: https://github.com/mit-han-lab/nunchaku Website:\n https://hanlab.mit.edu/projects/svdquant Demo: https://svdquant.mit.edu Blog:\n https://hanlab.mit.edu/blog/svdquant"},{"id":"http://arxiv.org/abs/2411.05005v1","updated":"2024-11-07T18:59:53Z","published":"2024-11-07T18:59:53Z","title":"Diff-2-in-1: Bridging Generation and Dense Perception with Diffusion\n Models","summary":" Beyond high-fidelity image synthesis, diffusion models have recently\nexhibited promising results in dense visual perception tasks. However, most\nexisting work treats diffusion models as a standalone component for perception\ntasks, employing them either solely for off-the-shelf data augmentation or as\nmere feature extractors. In contrast to these isolated and thus sub-optimal\nefforts, we introduce a unified, versatile, diffusion-based framework,\nDiff-2-in-1, that can simultaneously handle both multi-modal data generation\nand dense visual perception, through a unique exploitation of the\ndiffusion-denoising process. Within this framework, we further enhance\ndiscriminative visual perception via multi-modal generation, by utilizing the\ndenoising network to create multi-modal data that mirror the distribution of\nthe original training set. Importantly, Diff-2-in-1 optimizes the utilization\nof the created diverse and faithful data by leveraging a novel self-improving\nlearning mechanism. Comprehensive experimental evaluations validate the\neffectiveness of our framework, showcasing consistent performance improvements\nacross various discriminative backbones and high-quality multi-modal data\ngeneration characterized by both realism and usefulness.\n","authors":["Shuhong Zheng","Zhipeng Bao","Ruoyu Zhao","Martial Hebert","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.05005v1.pdf","comment":"26 pages, 14 figures"},{"id":"http://arxiv.org/abs/2411.05003v1","updated":"2024-11-07T18:59:45Z","published":"2024-11-07T18:59:45Z","title":"ReCapture: Generative Video Camera Controls for User-Provided Videos\n using Masked Video Fine-Tuning","summary":" Recently, breakthroughs in video modeling have allowed for controllable\ncamera trajectories in generated videos. However, these methods cannot be\ndirectly applied to user-provided videos that are not generated by a video\nmodel. In this paper, we present ReCapture, a method for generating new videos\nwith novel camera trajectories from a single user-provided video. Our method\nallows us to re-generate the reference video, with all its existing scene\nmotion, from vastly different angles and with cinematic camera motion. Notably,\nusing our method we can also plausibly hallucinate parts of the scene that were\nnot observable in the reference video. Our method works by (1) generating a\nnoisy anchor video with a new camera trajectory using multiview diffusion\nmodels or depth-based point cloud rendering and then (2) regenerating the\nanchor video into a clean and temporally consistent reangled video using our\nproposed masked video fine-tuning technique.\n","authors":["David Junhao Zhang","Roni Paiss","Shiran Zada","Nikhil Karnad","David E. 
Jacobs","Yael Pritch","Inbar Mosseri","Mike Zheng Shou","Neal Wadhwa","Nataniel Ruiz"],"pdf_url":"https://arxiv.org/pdf/2411.05003v1.pdf","comment":"project page: https://generative-video-camera-controls.github.io/"},{"id":"http://arxiv.org/abs/2411.05001v1","updated":"2024-11-07T18:59:28Z","published":"2024-11-07T18:59:28Z","title":"Analyzing The Language of Visual Tokens","summary":" With the introduction of transformer-based models for vision and language\ntasks, such as LLaVA and Chameleon, there has been renewed interest in the\ndiscrete tokenized representation of images. These models often treat image\npatches as discrete tokens, analogous to words in natural language, learning\njoint alignments between visual and human languages. However, little is known\nabout the statistical behavior of these visual languages - whether they follow\nsimilar frequency distributions, grammatical structures, or topologies as\nnatural languages. In this paper, we take a natural-language-centric approach\nto analyzing discrete visual languages and uncover striking similarities and\nfundamental differences. We demonstrate that, although visual languages adhere\nto Zipfian distributions, higher token innovation drives greater entropy and\nlower compression, with tokens predominantly representing object parts,\nindicating intermediate granularity. We also show that visual languages lack\ncohesive grammatical structures, leading to higher perplexity and weaker\nhierarchical organization compared to natural languages. Finally, we\ndemonstrate that, while vision models align more closely with natural languages\nthan other models, this alignment remains significantly weaker than the\ncohesion found within natural languages. Through these experiments, we\ndemonstrate how understanding the statistical properties of discrete visual\nlanguages can inform the design of more effective computer vision models.\n","authors":["David M. Chan","Rodolfo Corona","Joonyong Park","Cheol Jun Cho","Yutong Bai","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2411.05001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04999v1","updated":"2024-11-07T18:59:27Z","published":"2024-11-07T18:59:27Z","title":"DynaMem: Online Dynamic Spatio-Semantic Memory for Open World Mobile\n Manipulation","summary":" Significant progress has been made in open-vocabulary mobile manipulation,\nwhere the goal is for a robot to perform tasks in any environment given a\nnatural language description. However, most current systems assume a static\nenvironment, which limits the system's applicability in real-world scenarios\nwhere environments frequently change due to human intervention or the robot's\nown actions. In this work, we present DynaMem, a new approach to open-world\nmobile manipulation that uses a dynamic spatio-semantic memory to represent a\nrobot's environment. DynaMem constructs a 3D data structure to maintain a\ndynamic memory of point clouds, and answers open-vocabulary object localization\nqueries using multimodal LLMs or open-vocabulary features generated by\nstate-of-the-art vision-language models. Powered by DynaMem, our robots can\nexplore novel environments, search for objects not found in memory, and\ncontinuously update the memory as objects move, appear, or disappear in the\nscene. 
We run extensive experiments on the Stretch SE3 robots in three real and\nnine offline scenes, and achieve an average pick-and-drop success rate of 70%\non non-stationary objects, which is more than a 2x improvement over\nstate-of-the-art static systems. Our code as well as our experiment and\ndeployment videos are open sourced and can be found on our project website:\nhttps://dynamem.github.io/\n","authors":["Peiqi Liu","Zhanqiu Guo","Mohit Warke","Soumith Chintala","Chris Paxton","Nur Muhammad Mahi Shafiullah","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2411.04999v1.pdf","comment":"Website: https://dynamem.github.io"},{"id":"http://arxiv.org/abs/2411.04998v1","updated":"2024-11-07T18:59:16Z","published":"2024-11-07T18:59:16Z","title":"HourVideo: 1-Hour Video-Language Understanding","summary":" We present HourVideo, a benchmark dataset for hour-long video-language\nunderstanding. Our dataset consists of a novel task suite comprising\nsummarization, perception (recall, tracking), visual reasoning (spatial,\ntemporal, predictive, causal, counterfactual), and navigation (room-to-room,\nobject retrieval) tasks. HourVideo includes 500 manually curated egocentric\nvideos from the Ego4D dataset, spanning durations of 20 to 120 minutes, and\nfeatures 12,976 high-quality, five-way multiple-choice questions. Benchmarking\nresults reveal that multimodal models, including GPT-4 and LLaVA-NeXT, achieve\nmarginal improvements over random chance. In stark contrast, human experts\nsignificantly outperform the state-of-the-art long-context multimodal model,\nGemini Pro 1.5 (85.0% vs. 37.3%), highlighting a substantial gap in multimodal\ncapabilities. Our benchmark, evaluation toolkit, prompts, and documentation are\navailable at https://hourvideo.stanford.edu\n","authors":["Keshigeyan Chandrasegaran","Agrim Gupta","Lea M. Hadzic","Taran Kota","Jimming He","Cristóbal Eyzaguirre","Zane Durante","Manling Li","Jiajun Wu","Li Fei-Fei"],"pdf_url":"https://arxiv.org/pdf/2411.04998v1.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track; 28 pages"},{"id":"http://arxiv.org/abs/2411.04995v1","updated":"2024-11-07T18:58:57Z","published":"2024-11-07T18:58:57Z","title":"LoFi: Scalable Local Image Reconstruction with Implicit Neural\n Representation","summary":" Neural fields or implicit neural representations (INRs) have attracted\nsignificant attention in machine learning and signal processing due to their\nefficient continuous representation of images and 3D volumes. In this work, we\nbuild on INRs and introduce a coordinate-based local processing framework for\nsolving imaging inverse problems, termed LoFi (Local Field). Unlike\nconventional methods for image reconstruction, LoFi processes local information\nat each coordinate \\textit{separately} by multi-layer perceptrons (MLPs),\nrecovering the object at that specific coordinate. Similar to INRs, LoFi can\nrecover images at any continuous coordinate, enabling image reconstruction at\nmultiple resolutions. With comparable or better performance than standard CNNs\nfor image reconstruction, LoFi achieves excellent generalization to\nout-of-distribution data and memory usage almost independent of image\nresolution. Remarkably, training on $1024 \\times 1024$ images requires just 3GB\nof memory -- over 20 times less than the memory typically needed by standard\nCNNs. Additionally, LoFi's local design allows it to train on extremely small\ndatasets with less than 10 samples, without overfitting or the need for\nregularization or early stopping. 
Finally, we use LoFi as a denoising prior in\na plug-and-play framework for solving general inverse problems to benefit from\nits continuous image representation and strong generalization. Although trained\non low-resolution images, LoFi can be used as a low-dimensional prior to solve\ninverse problems at any resolution. We validate our framework across a variety\nof imaging modalities, from low-dose computed tomography to radio\ninterferometric imaging.\n","authors":["AmirEhsan Khorashadizadeh","Tobías I. Liaudat","Tianlin Liu","Jason D. McEwen","Ivan Dokmanić"],"pdf_url":"https://arxiv.org/pdf/2411.04995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04992v1","updated":"2024-11-07T18:57:24Z","published":"2024-11-07T18:57:24Z","title":"Which bits went where? Past and future transfer entropy decomposition\n with the information bottleneck","summary":" Whether the system under study is a shoal of fish, a collection of neurons,\nor a set of interacting atmospheric and oceanic processes, transfer entropy\nmeasures the flow of information between time series and can detect possible\ncausal relationships. Much like mutual information, transfer entropy is\ngenerally reported as a single value summarizing an amount of shared variation,\nyet a more fine-grained accounting might illuminate much about the processes\nunder study. Here we propose to decompose transfer entropy and localize the\nbits of variation on both sides of information flow: that of the originating\nprocess's past and that of the receiving process's future. We employ the\ninformation bottleneck (IB) to compress the time series and identify the\ntransferred entropy. We apply our method to decompose the transfer entropy in\nseveral synthetic recurrent processes and an experimental mouse dataset of\nconcurrent behavioral and neural activity. Our approach highlights the nuanced\ndynamics within information flow, laying a foundation for future explorations\ninto the intricate interplay of temporal processes in complex systems.\n","authors":["Kieran A. Murphy","Zhuowen Yin","Dani S. Bassett"],"pdf_url":"https://arxiv.org/pdf/2411.04992v1.pdf","comment":"NeurIPS 2024 workshop \"Machine learning and the physical sciences\"\n Camera ready"},{"id":"http://arxiv.org/abs/2411.04990v1","updated":"2024-11-07T18:56:37Z","published":"2024-11-07T18:56:37Z","title":"Clustering in Causal Attention Masking","summary":" This work presents a modification of the self-attention dynamics proposed by\nGeshkovski et al. (arXiv:2312.10794) to better reflect the practically\nrelevant, causally masked attention used in transformer architectures for\ngenerative AI. This modification translates into an interacting particle system\nthat cannot be interpreted as a mean-field gradient flow. Despite this loss of\nstructure, we significantly strengthen the results of Geshkovski et al.\n(arXiv:2312.10794) in this context: While previous rigorous results focused on\ncases where all three matrices (Key, Query, and Value) were scaled identities,\nwe prove asymptotic convergence to a single cluster for arbitrary key-query\nmatrices and a value matrix equal to the identity. 
Additionally, we establish a\nconnection to the classical R\\'enyi parking problem from combinatorial geometry\nto make initial theoretical steps towards demonstrating the existence of\nmeta-stable states.\n","authors":["Nikita Karagodin","Yury Polyanskiy","Philippe Rigollet"],"pdf_url":"https://arxiv.org/pdf/2411.04990v1.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024), 22 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.04989v1","updated":"2024-11-07T18:56:11Z","published":"2024-11-07T18:56:11Z","title":"SG-I2V: Self-Guided Trajectory Control in Image-to-Video Generation","summary":" Methods for image-to-video generation have achieved impressive,\nphoto-realistic quality. However, adjusting specific elements in generated\nvideos, such as object motion or camera movement, is often a tedious process of\ntrial and error, e.g., involving re-generating videos with different random\nseeds. Recent techniques address this issue by fine-tuning a pre-trained model\nto follow conditioning signals, such as bounding boxes or point trajectories.\nYet, this fine-tuning procedure can be computationally expensive, and it\nrequires datasets with annotated object motion, which can be difficult to\nprocure. In this work, we introduce SG-I2V, a framework for controllable\nimage-to-video generation that is self-guided$\\unicode{x2013}$offering\nzero-shot control by relying solely on the knowledge present in a pre-trained\nimage-to-video diffusion model without the need for fine-tuning or external\nknowledge. Our zero-shot method outperforms unsupervised baselines while being\ncompetitive with supervised models in terms of visual quality and motion\nfidelity.\n","authors":["Koichi Namekata","Sherwin Bahmani","Ziyi Wu","Yash Kant","Igor Gilitschenski","David B. Lindell"],"pdf_url":"https://arxiv.org/pdf/2411.04989v1.pdf","comment":"Project page: https://kmcode1.github.io/Projects/SG-I2V/"},{"id":"http://arxiv.org/abs/2411.04987v1","updated":"2024-11-07T18:55:10Z","published":"2024-11-07T18:55:10Z","title":"Few-Shot Task Learning through Inverse Generative Modeling","summary":" Learning the intents of an agent, defined by its goals or motion style, is\noften extremely challenging from just a few examples. We refer to this problem\nas task concept learning and present our approach, Few-Shot Task Learning\nthrough Inverse Generative Modeling (FTL-IGM), which learns new task concepts\nby leveraging invertible neural generative models. The core idea is to pretrain\na generative model on a set of basic concepts and their demonstrations. Then,\ngiven a few demonstrations of a new concept (such as a new goal or a new\naction), our method learns the underlying concepts through backpropagation\nwithout updating the model weights, thanks to the invertibility of the\ngenerative model. We evaluate our method in five domains -- object\nrearrangement, goal-oriented navigation, motion caption of human actions,\nautonomous driving, and real-world table-top manipulation. 
Our experimental\nresults demonstrate that via the pretrained generative model, we successfully\nlearn novel concepts and generate agent plans or motion corresponding to these\nconcepts in (1) unseen environments and (2) in composition with training\nconcepts.\n","authors":["Aviv Netanyahu","Yilun Du","Antonia Bronars","Jyothish Pari","Joshua Tenenbaum","Tianmin Shu","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2411.04987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04976v1","updated":"2024-11-07T18:50:14Z","published":"2024-11-07T18:50:14Z","title":"Noisy Zero-Shot Coordination: Breaking The Common Knowledge Assumption\n In Zero-Shot Coordination Games","summary":" Zero-shot coordination (ZSC) is a popular setting for studying the ability of\nreinforcement learning (RL) agents to coordinate with novel partners. Prior ZSC\nformulations assume the $\\textit{problem setting}$ is common knowledge: each\nagent knows the underlying Dec-POMDP, knows others have this knowledge, and so\non ad infinitum. However, this assumption rarely holds in complex real-world\nsettings, which are often difficult to fully and correctly specify. Hence, in\nsettings where this common knowledge assumption is invalid, agents trained\nusing ZSC methods may not be able to coordinate well. To address this\nlimitation, we formulate the $\\textit{noisy zero-shot coordination}$ (NZSC)\nproblem. In NZSC, agents observe different noisy versions of the ground truth\nDec-POMDP, which are assumed to be distributed according to a fixed noise\nmodel. Only the distribution of ground truth Dec-POMDPs and the noise model are\ncommon knowledge. We show that a NZSC problem can be reduced to a ZSC problem\nby designing a meta-Dec-POMDP with an augmented state space consisting of all\nthe ground-truth Dec-POMDPs. For solving NZSC problems, we propose a simple and\nflexible meta-learning method called NZSC training, in which the agents are\ntrained across a distribution of coordination problems - which they only get to\nobserve noisy versions of. We show that with NZSC training, RL agents can be\ntrained to coordinate well with novel partners even when the (exact) problem\nsetting of the coordination is not common knowledge.\n","authors":["Usman Anwar","Ashish Pandian","Jia Wan","David Krueger","Jakob Foerster"],"pdf_url":"https://arxiv.org/pdf/2411.04976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04975v1","updated":"2024-11-07T18:49:33Z","published":"2024-11-07T18:49:33Z","title":"SuffixDecoding: A Model-Free Approach to Speeding Up Large Language\n Model Inference","summary":" We present SuffixDecoding, a novel model-free approach to accelerating large\nlanguage model (LLM) inference through speculative decoding. Unlike existing\nmethods that rely on draft models or specialized decoding heads, SuffixDecoding\nleverages suffix trees built from previously generated outputs to efficiently\npredict candidate token sequences. Our approach enables flexible\ntree-structured speculation without the overhead of maintaining and\norchestrating additional models. SuffixDecoding builds and dynamically updates\nsuffix trees to capture patterns in the generated text, using them to construct\nspeculation trees through a principled scoring mechanism based on empirical\ntoken frequencies. SuffixDecoding requires only CPU memory which is plentiful\nand underutilized on typical LLM serving nodes. 
We demonstrate that\nSuffixDecoding achieves competitive speedups compared to model-based approaches\nacross diverse workloads including open-domain chat, code generation, and\ntext-to-SQL tasks. For open-ended chat and code generation tasks,\nSuffixDecoding achieves up to $1.4\\times$ higher output throughput than\nSpecInfer and up to $1.1\\times$ lower time-per-token (TPOT) latency. For a\nproprietary multi-LLM text-to-SQL application, SuffixDecoding achieves up to\n$2.9\\times$ higher output throughput and $3\\times$ lower latency than\nspeculative decoding. Our evaluation shows that SuffixDecoding maintains high\nacceptance rates even with small reference corpora of 256 examples, while\ncontinuing to improve performance as more historical outputs are incorporated.\n","authors":["Gabriele Oliaro","Zhihao Jia","Daniel Campos","Aurick Qiao"],"pdf_url":"https://arxiv.org/pdf/2411.04975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04967v1","updated":"2024-11-07T18:43:17Z","published":"2024-11-07T18:43:17Z","title":"AsCAN: Asymmetric Convolution-Attention Networks for Efficient\n Recognition and Generation","summary":" Neural network architecture design requires making many crucial decisions.\nThe common desiderata is that similar decisions, with little modifications, can\nbe reused in a variety of tasks and applications. To satisfy that,\narchitectures must provide promising latency and performance trade-offs,\nsupport a variety of tasks, scale efficiently with respect to the amounts of\ndata and compute, leverage available data from other tasks, and efficiently\nsupport various hardware. To this end, we introduce AsCAN -- a hybrid\narchitecture, combining both convolutional and transformer blocks. We revisit\nthe key design principles of hybrid architectures and propose a simple and\neffective \\emph{asymmetric} architecture, where the distribution of\nconvolutional and transformer blocks is \\emph{asymmetric}, containing more\nconvolutional blocks in the earlier stages, followed by more transformer blocks\nin later stages. AsCAN supports a variety of tasks: recognition, segmentation,\nclass-conditional image generation, and features a superior trade-off between\nperformance and latency. We then scale the same architecture to solve a\nlarge-scale text-to-image task and show state-of-the-art performance compared\nto the most recent public and commercial models. Notably, even without any\ncomputation optimization for transformer blocks, our models still yield faster\ninference speed than existing works featuring efficient attention mechanisms,\nhighlighting the advantages and the value of our approach.\n","authors":["Anil Kag","Huseyin Coskun","Jierun Chen","Junli Cao","Willi Menapace","Aliaksandr Siarohin","Sergey Tulyakov","Jian Ren"],"pdf_url":"https://arxiv.org/pdf/2411.04967v1.pdf","comment":"NeurIPS 2024. Project Page:\n https://snap-research.github.io/snap_image/"},{"id":"http://arxiv.org/abs/2401.09980v2","updated":"2024-11-07T18:43:06Z","published":"2024-01-18T13:51:20Z","title":"A Comparative Analysis of U-Net-based models for Segmentation of Cardiac\n MRI","summary":" Medical imaging refers to the technologies and methods utilized to view the\nhuman body and its inside, in order to diagnose, monitor, or even treat medical\ndisorders. 
This paper aims to explore the application of deep learning\ntechniques in the semantic segmentation of Cardiac short-axis MRI (Magnetic\nResonance Imaging) images, aiming to enhance the diagnosis, monitoring, and\ntreatment of medical disorders related to the heart. The focus centers on\nimplementing various architectures that are derivatives of U-Net, to\neffectively isolate specific parts of the heart for comprehensive anatomical\nand functional analysis. Through a combination of images, graphs, and\nquantitative metrics, the efficacy of the models and their predictions are\nshowcased. Additionally, this paper addresses encountered challenges and\noutline strategies for future improvements. This abstract provides a concise\noverview of the efforts in utilizing deep learning for cardiac image\nsegmentation, emphasizing both the accomplishments and areas for further\nrefinement.\n","authors":["Ketan Suhaas Saichandran"],"pdf_url":"https://arxiv.org/pdf/2401.09980v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04965v1","updated":"2024-11-07T18:41:50Z","published":"2024-11-07T18:41:50Z","title":"BitNet a4.8: 4-bit Activations for 1-bit LLMs","summary":" Recent research on the 1-bit Large Language Models (LLMs), such as BitNet\nb1.58, presents a promising direction for reducing the inference cost of LLMs\nwhile maintaining their performance. In this work, we introduce BitNet a4.8,\nenabling 4-bit activations for 1-bit LLMs. BitNet a4.8 employs a hybrid\nquantization and sparsification strategy to mitigate the quantization errors\nintroduced by the outlier channels. Specifically, we utilize 4-bit activations\nfor inputs to the attention and feed-forward network layers, while sparsifying\nintermediate states followed with 8-bit quantization. Extensive experiments\ndemonstrate that BitNet a4.8 achieves performance comparable to BitNet b1.58\nwith equivalent training costs, while being faster in inference with enabling\n4-bit (INT4/FP4) kernels. Additionally, BitNet a4.8 activates only 55% of\nparameters and supports 3-bit KV cache, further enhancing the efficiency of\nlarge-scale LLM deployment and inference.\n","authors":["Hongyu Wang","Shuming Ma","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2411.04965v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2410.02472v3","updated":"2024-11-07T18:30:38Z","published":"2024-10-03T13:25:15Z","title":"Meta-Models: An Architecture for Decoding LLM Behaviors Through\n Interpreted Embeddings and Natural Language","summary":" As Large Language Models (LLMs) become increasingly integrated into our daily\nlives, the potential harms from deceptive behavior underlie the need for\nfaithfully interpreting their decision-making. While traditional probing\nmethods have shown some effectiveness, they remain best for narrowly scoped\ntasks while more comprehensive explanations are still necessary. To this end,\nwe investigate meta-models-an architecture using a \"meta-model\" that takes\nactivations from an \"input-model\" and answers natural language questions about\nthe input-model's behaviors. We evaluate the meta-model's ability to generalize\nby training them on selected task types and assessing their out-of-distribution\nperformance in deceptive scenarios. Our findings show that meta-models\ngeneralize well to out-of-distribution tasks and point towards opportunities\nfor future research in this area. 
Our code is available at\nhttps://github.com/acostarelli/meta-models-public .\n","authors":["Anthony Costarelli","Mat Allen","Severin Field"],"pdf_url":"https://arxiv.org/pdf/2410.02472v3.pdf","comment":"11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.04946v1","updated":"2024-11-07T18:23:30Z","published":"2024-11-07T18:23:30Z","title":"SPGD: Steepest Perturbed Gradient Descent Optimization","summary":" Optimization algorithms are pivotal in advancing various scientific and\nindustrial fields but often encounter obstacles such as trapping in local\nminima, saddle points, and plateaus (flat regions), which makes the convergence\nto reasonable or near-optimal solutions particularly challenging. This paper\npresents the Steepest Perturbed Gradient Descent (SPGD), a novel algorithm that\ninnovatively combines the principles of the gradient descent method with\nperiodic uniform perturbation sampling to effectively circumvent these\nimpediments and lead to better solutions whenever possible. SPGD is\ndistinctively designed to generate a set of candidate solutions and select the\none exhibiting the steepest loss difference relative to the current solution.\nIt enhances the traditional gradient descent approach by integrating a\nstrategic exploration mechanism that significantly increases the likelihood of\nescaping sub-optimal local minima and navigating complex optimization\nlandscapes effectively. Our approach not only retains the directed efficiency\nof gradient descent but also leverages the exploratory benefits of stochastic\nperturbations, thus enabling a more comprehensive search for global optima\nacross diverse problem spaces. We demonstrate the efficacy of SPGD in solving\nthe 3D component packing problem, an NP-hard challenge. Preliminary results\nshow a substantial improvement over four established methods, particularly on\nresponse surfaces with complex topographies and in multidimensional non-convex\ncontinuous optimization problems. Comparative analyses with established 2D\nbenchmark functions highlight SPGD's superior performance, showcasing its\nability to navigate complex optimization landscapes. These results emphasize\nSPGD's potential as a versatile tool for a wide range of optimization problems.\n","authors":["Amir M. Vahedi","Horea T. Ilies"],"pdf_url":"https://arxiv.org/pdf/2411.04946v1.pdf","comment":"28 pages, 26 figures, submitted to Journal of Mechanical Design"},{"id":"http://arxiv.org/abs/2401.08426v5","updated":"2024-11-07T18:22:41Z","published":"2024-01-16T15:11:29Z","title":"GD doesn't make the cut: Three ways that non-differentiability affects\n neural network training","summary":" This paper critically examines the fundamental distinctions between gradient\nmethods applied to non-differentiable functions (NGDMs) and classical gradient\ndescents (GDs) for differentiable functions, revealing significant gaps in\ncurrent deep learning optimization theory. We demonstrate that NGDMs exhibit\nmarkedly different convergence properties compared to GDs, strongly challenging\nthe applicability of extensive neural network convergence literature based on\n$L-smoothness$ to non-smooth neural networks. Our analysis reveals paradoxical\nbehavior of NDGM solutions for $L_{1}$-regularized problems, where increasing\nregularization counterintuitively leads to larger $L_{1}$ norms of optimal\nsolutions. This finding calls into question widely adopted $L_{1}$ penalization\ntechniques for network pruning. 
We further challenge the common assumption that\noptimization algorithms like RMSProp behave similarly in differentiable and\nnon-differentiable contexts. Expanding on the Edge of Stability phenomenon, we\ndemonstrate its occurrence in a broader class of functions, including Lipschitz\ncontinuous convex differentiable functions. This finding raises important\nquestions about its relevance and interpretation in non-convex,\nnon-differentiable neural networks, particularly those using ReLU activations.\nOur work identifies critical misunderstandings of NDGMs in influential\nliterature, stemming from an overreliance on strong smoothness assumptions.\nThese findings necessitate a reevaluation of optimization dynamics in deep\nlearning, emphasizing the crucial need for more nuanced theoretical foundations\nin analyzing these complex systems.\n","authors":["Siddharth Krishna Kumar"],"pdf_url":"https://arxiv.org/pdf/2401.08426v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04939v1","updated":"2024-11-07T18:15:38Z","published":"2024-11-07T18:15:38Z","title":"Pareto Set Identification With Posterior Sampling","summary":" The problem of identifying the best answer among a collection of items having\nreal-valued distribution is well-understood.\n Despite its practical relevance for many applications, fewer works have\nstudied its extension when multiple and potentially conflicting metrics are\navailable to assess an item's quality.\n Pareto set identification (PSI) aims to identify the set of answers whose\nmeans are not uniformly worse than another.\n This paper studies PSI in the transductive linear setting with potentially\ncorrelated objectives.\n Building on posterior sampling in both the stopping and the sampling rules,\nwe propose the PSIPS algorithm that deals simultaneously with structure and\ncorrelation without paying the computational cost of existing oracle-based\nalgorithms.\n Both from a frequentist and Bayesian perspective, PSIPS is asymptotically\noptimal.\n We demonstrate its good empirical performance in real-world and synthetic\ninstances.\n","authors":["Cyrille Kone","Marc Jourdan","Emilie Kaufmann"],"pdf_url":"https://arxiv.org/pdf/2411.04939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04936v1","updated":"2024-11-07T18:13:31Z","published":"2024-11-07T18:13:31Z","title":"Fed-LDR: Federated Local Data-infused Graph Creation with Node-centric\n Model Refinement","summary":" The rapid acceleration of global urbanization has introduced novel challenges\nin enhancing urban infrastructure and services. Spatio-temporal data,\nintegrating spatial and temporal dimensions, has emerged as a critical tool for\nunderstanding urban phenomena and promoting sustainability. In this context,\nFederated Learning (FL) has gained prominence as a distributed learning\nparadigm aligned with the privacy requirements of urban IoT environments.\nHowever, integrating traditional and deep learning models into the FL framework\nposes significant challenges, particularly in capturing complex spatio-temporal\ndependencies and adapting to diverse urban conditions. To address these\nchallenges, we propose the Federated Local Data-Infused Graph Creation with\nNode-centric Model Refinement (Fed-LDR) algorithm. Fed-LDR leverages FL and\nGraph Convolutional Networks (GCN) to enhance spatio-temporal data analysis in\nurban environments. 
The algorithm comprises two key modules: (1) the Local\nData-Infused Graph Creation (LDIGC) module, which dynamically reconfigures\nadjacency matrices to reflect evolving spatial relationships within urban\nenvironments, and (2) the Node-centric Model Refinement (NoMoR) module, which\ncustomizes model parameters for individual urban nodes to accommodate\nheterogeneity. Evaluations on the PeMSD4 and PeMSD8 datasets demonstrate\nFed-LDR's superior performance over six baseline methods. Fed-LDR achieved the\nlowest Mean Absolute Error (MAE) values of 20.15 and 17.30, and the lowest Root\nMean Square Error (RMSE) values of 32.30 and 27.15, respectively, while\nmaintaining a high correlation coefficient of 0.96 across both datasets.\nNotably, on the PeMSD4 dataset, Fed-LDR reduced MAE and RMSE by up to 81\\% and\n78\\%, respectively, compared to the best-performing baseline FedMedian.\n","authors":["Jiechao Gao","Yuangang Li","Syeda Faiza Ahmed"],"pdf_url":"https://arxiv.org/pdf/2411.04936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04915v1","updated":"2024-11-07T17:55:07Z","published":"2024-11-07T17:55:07Z","title":"Evaluating Robustness of Reinforcement Learning Algorithms for\n Autonomous Shipping","summary":" Recently, there has been growing interest in autonomous shipping due to its\npotential to improve maritime efficiency and safety. The use of advanced\ntechnologies, such as artificial intelligence, can address the current\nnavigational and operational challenges in autonomous shipping. In particular,\ninland waterway transport (IWT) presents a unique set of challenges, such as\ncrowded waterways and variable environmental conditions. In such dynamic\nsettings, the reliability and robustness of autonomous shipping solutions are\ncritical factors for ensuring safe operations. This paper examines the\nrobustness of benchmark deep reinforcement learning (RL) algorithms,\nimplemented for IWT within an autonomous shipping simulator, and their ability\nto generate effective motion planning policies. We demonstrate that a\nmodel-free approach can achieve an adequate policy in the simulator,\nsuccessfully navigating port environments never encountered during training. We\nfocus particularly on Soft-Actor Critic (SAC), which we show to be inherently\nmore robust to environmental disturbances compared to MuZero, a\nstate-of-the-art model-based RL algorithm. In this paper, we take a significant\nstep towards developing robust, applied RL frameworks that can be generalized\nto various vessel types and navigate complex port- and inland environments and\nscenarios.\n","authors":["Bavo Lesy","Ali Anwar","Siegfried Mercelis"],"pdf_url":"https://arxiv.org/pdf/2411.04915v1.pdf","comment":"5 pages, 4 figures. Will be presented at IEEE RAAI 2024"},{"id":"http://arxiv.org/abs/2411.04913v1","updated":"2024-11-07T17:51:55Z","published":"2024-11-07T17:51:55Z","title":"Structure Matters: Dynamic Policy Gradient","summary":" In this work, we study $\\gamma$-discounted infinite-horizon tabular Markov\ndecision processes (MDPs) and introduce a framework called dynamic policy\ngradient (DynPG). The framework directly integrates dynamic programming with\n(any) policy gradient method, explicitly leveraging the Markovian property of\nthe environment. DynPG dynamically adjusts the problem horizon during training,\ndecomposing the original infinite-horizon MDP into a sequence of contextual\nbandit problems. 
By iteratively solving these contextual bandits, DynPG\nconverges to the stationary optimal policy of the infinite-horizon MDP. To\ndemonstrate the power of DynPG, we establish its non-asymptotic global\nconvergence rate under the tabular softmax parametrization, focusing on the\ndependencies on salient but essential parameters of the MDP. By combining\nclassical arguments from dynamic programming with more recent convergence\narguments of policy gradient schemes, we prove that softmax DynPG scales\npolynomially in the effective horizon $(1-\\gamma)^{-1}$. Our findings contrast\nrecent exponential lower bound examples for vanilla policy gradient.\n","authors":["Sara Klein","Xiangyuan Zhang","Tamer Başar","Simon Weissmann","Leif Döring"],"pdf_url":"https://arxiv.org/pdf/2411.04913v1.pdf","comment":"46 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.04907v1","updated":"2024-11-07T17:48:37Z","published":"2024-11-07T17:48:37Z","title":"Enhancing Missing Data Imputation through Combined Bipartite Graph and\n Complete Directed Graph","summary":" In this paper, we aim to address a significant challenge in the field of\nmissing data imputation: identifying and leveraging the interdependencies among\nfeatures to enhance missing data imputation for tabular data. We introduce a\nnovel framework named the Bipartite and Complete Directed Graph Neural Network\n(BCGNN). Within BCGNN, observations and features are differentiated as two\ndistinct node types, and the values of observed features are converted into\nattributed edges linking them. The bipartite segment of our framework\ninductively learns embedding representations for nodes, efficiently utilizing\nthe comprehensive information encapsulated in the attributed edges. In\nparallel, the complete directed graph segment adeptly outlines and communicates\nthe complex interdependencies among features. When compared to contemporary\nleading imputation methodologies, BCGNN consistently outperforms them,\nachieving a noteworthy average reduction of 15% in mean absolute error for\nfeature imputation tasks under different missing mechanisms. Our extensive\nexperimental investigation confirms that an in-depth grasp of the\ninterdependence structure substantially enhances the model's feature embedding\nability. We also highlight the model's superior performance in label prediction\ntasks involving missing data, and its formidable ability to generalize to\nunseen data points.\n","authors":["Zhaoyang Zhang","Hongtu Zhu","Ziqi Chen","Yingjie Zhang","Hai Shu"],"pdf_url":"https://arxiv.org/pdf/2411.04907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06754v3","updated":"2024-11-07T17:46:23Z","published":"2024-09-10T16:05:02Z","title":"Scaling Law Hypothesis for Multimodal Model","summary":" We propose a scaling law hypothesis for multimodal models processing text,\naudio, images, and video within a shared token and embedding space. Our\nframework predicts model performance based on modality-specific compression and\ntokenization efficiency, extending established scaling laws from text-based\ndecoder models to mixed-modality systems. 
We explore whether leveraging more\ntraining data in multiple modalities can reduce the size of the multimodal\nmodel, enabling efficient deployment on resource-constrained devices.\n","authors":["Qingyun Sun","Zhen Guo"],"pdf_url":"https://arxiv.org/pdf/2409.06754v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04899v1","updated":"2024-11-07T17:41:07Z","published":"2024-11-07T17:41:07Z","title":"Sampling-guided Heterogeneous Graph Neural Network with Temporal\n Smoothing for Scalable Longitudinal Data Imputation","summary":" In this paper, we propose a novel framework, the Sampling-guided\nHeterogeneous Graph Neural Network (SHT-GNN), to effectively tackle the\nchallenge of missing data imputation in longitudinal studies. Unlike\ntraditional methods, which often require extensive preprocessing to handle\nirregular or inconsistent missing data, our approach accommodates arbitrary\nmissing data patterns while maintaining computational efficiency. SHT-GNN\nmodels both observations and covariates as distinct node types, connecting\nobservation nodes at successive time points through subject-specific\nlongitudinal subnetworks, while covariate-observation interactions are\nrepresented by attributed edges within bipartite graphs. By leveraging\nsubject-wise mini-batch sampling and a multi-layer temporal smoothing\nmechanism, SHT-GNN efficiently scales to large datasets, while effectively\nlearning node representations and imputing missing data. Extensive experiments\non both synthetic and real-world datasets, including the Alzheimer's Disease\nNeuroimaging Initiative (ADNI) dataset, demonstrate that SHT-GNN significantly\noutperforms existing imputation methods, even with high missing data rates. The\nempirical results highlight SHT-GNN's robust imputation capabilities and\nsuperior performance, particularly in the context of complex, large-scale\nlongitudinal data.\n","authors":["Zhaoyang Zhang","Ziqi Chen","Qiao Liu","Jinhan Xie","Hongtu Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.04899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15814v2","updated":"2024-11-07T17:33:37Z","published":"2024-07-22T17:26:12Z","title":"Perceptions of Linguistic Uncertainty by Language Models and Humans","summary":" _Uncertainty expressions_ such as \"probably\" or \"highly unlikely\" are\npervasive in human language. While prior work has established that there is\npopulation-level agreement in terms of how humans quantitatively interpret\nthese expressions, there has been little inquiry into the abilities of language\nmodels in the same context. In this paper, we investigate how language models\nmap linguistic expressions of uncertainty to numerical responses. Our approach\nassesses whether language models can employ theory of mind in this setting:\nunderstanding the uncertainty of another agent about a particular statement,\nindependently of the model's own certainty about that statement. We find that 7\nout of 10 models are able to map uncertainty expressions to probabilistic\nresponses in a human-like manner. However, we observe systematically different\nbehavior depending on whether a statement is actually true or false. This\nsensitivity indicates that language models are substantially more susceptible\nto bias based on their prior knowledge (as compared to humans). 
These findings\nraise important questions and have broad implications for human-AI and AI-AI\ncommunication.\n","authors":["Catarina G Belem","Markelle Kelly","Mark Steyvers","Sameer Singh","Padhraic Smyth"],"pdf_url":"https://arxiv.org/pdf/2407.15814v2.pdf","comment":"Accepted at EMNLP 2024 (Main)"},{"id":"http://arxiv.org/abs/2410.03728v2","updated":"2024-11-07T17:19:26Z","published":"2024-09-30T10:50:12Z","title":"Exploring QUIC Dynamics: A Large-Scale Dataset for Encrypted Traffic\n Analysis","summary":" QUIC, a new and increasingly used transport protocol, addresses and resolves\nthe limitations of TCP by offering improved security, performance, and features\nsuch as stream multiplexing and connection migration. These features, however,\nalso present challenges for network operators who need to monitor and analyze\nweb traffic. In this paper, we introduce VisQUIC, a labeled dataset comprising\nover 100,000 QUIC traces from more than 44,000 websites (URLs), collected over\na four-month period. These traces provide the foundation for generating more\nthan seven million images, with configurable parameters of window length, pixel\nresolution, normalization, and labels. These images enable an observer looking\nat the interactions between a client and a server to analyze and gain insights\nabout QUIC encrypted connections. To illustrate the dataset's potential, we\noffer a use-case example of an observer estimating the number of HTTP/3\nresponses/requests pairs in a given QUIC, which can reveal server behavior,\nclient--server interactions, and the load imposed by an observed connection. We\nformulate the problem as a discrete regression problem, train a machine\nlearning (ML) model for it, and then evaluate it using the proposed dataset on\nan example use case.\n","authors":["Barak Gahtan","Robert J. Shahla","Alex M. Bronstein","Reuven Cohen"],"pdf_url":"https://arxiv.org/pdf/2410.03728v2.pdf","comment":"The dataset and the supplementary material can be provided upon\n request"},{"id":"http://arxiv.org/abs/2310.09254v4","updated":"2024-11-07T17:14:38Z","published":"2023-10-13T17:12:04Z","title":"GENOT: Entropic (Gromov) Wasserstein Flow Matching with Applications to\n Single-Cell Genomics","summary":" Single-cell genomics has significantly advanced our understanding of cellular\nbehavior, catalyzing innovations in treatments and precision medicine. However,\nsingle-cell sequencing technologies are inherently destructive and can only\nmeasure a limited array of data modalities simultaneously. This limitation\nunderscores the need for new methods capable of realigning cells. Optimal\ntransport (OT) has emerged as a potent solution, but traditional discrete\nsolvers are hampered by scalability, privacy, and out-of-sample estimation\nissues. These challenges have spurred the development of neural network-based\nsolvers, known as neural OT solvers, that parameterize OT maps. Yet, these\nmodels often lack the flexibility needed for broader life science applications.\nTo address these deficiencies, our approach learns stochastic maps (i.e.\ntransport plans), allows for any cost function, relaxes mass conservation\nconstraints and integrates quadratic solvers to tackle the complex challenges\nposed by the (Fused) Gromov-Wasserstein problem. Utilizing flow matching as a\nbackbone, our method offers a flexible and effective framework. 
We demonstrate\nits versatility and robustness through applications in cell development\nstudies, cellular drug response modeling, and cross-modality cell translation,\nillustrating significant potential for enhancing therapeutic strategies.\n","authors":["Dominik Klein","Théo Uscidda","Fabian Theis","Marco Cuturi"],"pdf_url":"https://arxiv.org/pdf/2310.09254v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04876v1","updated":"2024-11-07T17:13:16Z","published":"2024-11-07T17:13:16Z","title":"Non-Euclidean Mixture Model for Social Network Embedding","summary":" It is largely agreed that social network links are formed due to either\nhomophily or social influence. Inspired by this, we aim at understanding the\ngeneration of links via providing a novel embedding-based graph formation\nmodel. Different from existing graph representation learning, where link\ngeneration probabilities are defined as a simple function of the corresponding\nnode embeddings, we model the link generation as a mixture model of the two\nfactors. In addition, we model the homophily factor in spherical space and the\ninfluence factor in hyperbolic space to accommodate the fact that (1) homophily\nresults in cycles and (2) influence results in hierarchies in networks. We also\ndesign a special projection to align these two spaces. We call this model\nNon-Euclidean Mixture Model, i.e., NMM. We further integrate NMM with our\nnon-Euclidean graph variational autoencoder (VAE) framework, NMM-GNN. NMM-GNN\nlearns embeddings through a unified framework which uses non-Euclidean GNN\nencoders, non-Euclidean Gaussian priors, a non-Euclidean decoder, and a novel\nspace unification loss component to unify distinct non-Euclidean geometric\nspaces. Experiments on public datasets show NMM-GNN significantly outperforms\nstate-of-the-art baselines on social network generation and classification\ntasks, demonstrating its ability to better explain how the social network is\nformed.\n","authors":["Roshni G. Iyer","Yewen Wang","Wei Wang","Yizhou Sun"],"pdf_url":"https://arxiv.org/pdf/2411.04876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16803v2","updated":"2024-11-07T17:10:15Z","published":"2024-07-23T19:06:44Z","title":"C3T: Cross-modal Transfer Through Time for Human Action Recognition","summary":" In order to unlock the potential of diverse sensors, we investigate a method\nto transfer knowledge between modalities using the structure of a unified\nmultimodal representation space for Human Action Recognition (HAR). We\nformalize and explore an understudied cross-modal transfer setting we term\nUnsupervised Modality Adaptation (UMA), where the modality used in testing is\nnot used in supervised training, i.e. zero labeled instances of the test\nmodality are available during training. We develop three methods to perform\nUMA: Student-Teacher (ST), Contrastive Alignment (CA), and Cross-modal Transfer\nThrough Time (C3T). Our extensive experiments on various camera+IMU datasets\ncompare these methods to each other in the UMA setting, and to their empirical\nupper bound in the supervised setting. The results indicate C3T is the most\nrobust and highest performing by at least a margin of 8%, and nears the\nsupervised setting performance even in the presence of temporal noise. 
This\nmethod introduces a novel mechanism for aligning signals across time-varying\nlatent vectors, extracted from the receptive field of temporal convolutions.\nOur findings suggest that C3T has significant potential for developing\ngeneralizable models for time-series sensor data, opening new avenues for\nmulti-modal learning in various applications.\n","authors":["Abhi Kamboj","Anh Duy Nguyen","Minh Do"],"pdf_url":"https://arxiv.org/pdf/2407.16803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04867v1","updated":"2024-11-07T16:59:32Z","published":"2024-11-07T16:59:32Z","title":"Think Smart, Act SMARL! Analyzing Probabilistic Logic Driven Safety in\n Multi-Agent Reinforcement Learning","summary":" An important challenge for enabling the deployment of reinforcement learning\n(RL) algorithms in the real world is safety. This has resulted in the recent\nresearch field of Safe RL, which aims to learn optimal policies that are safe.\nOne successful approach in that direction is probabilistic logic shields (PLS),\na model-based Safe RL technique that uses formal specifications based on\nprobabilistic logic programming, constraining an agent's policy to comply with\nthose specifications in a probabilistic sense. However, safety is inherently a\nmulti-agent concept, since real-world environments often involve multiple\nagents interacting simultaneously, leading to a complex system which is hard to\ncontrol. Moreover, safe multi-agent RL (Safe MARL) is still underexplored. In\norder to address this gap, in this paper we ($i$) introduce Shielded MARL\n(SMARL) by extending PLS to MARL -- in particular, we introduce Probabilistic\nLogic Temporal Difference Learning (PLTD) to enable shielded independent\nQ-learning (SIQL), and introduce shielded independent PPO (SIPPO) using\nprobabilistic logic policy gradients; ($ii$) show its positive effect and use\nas an equilibrium selection mechanism in various game-theoretic environments\nincluding two-player simultaneous games, extensive-form games, stochastic\ngames, and some grid-world extensions in terms of safety, cooperation, and\nalignment with normative behaviors; and ($iii$) look into the asymmetric case\nwhere only one agent is shielded, and show that the shielded agent has a\nsignificant influence on the unshielded one, providing further evidence of\nSMARL's ability to enhance safety and cooperation in diverse multi-agent\nenvironments.\n","authors":["Satchit Chatterji","Erman Acar"],"pdf_url":"https://arxiv.org/pdf/2411.04867v1.pdf","comment":"19 pages, 14 figures"},{"id":"http://arxiv.org/abs/2410.13835v2","updated":"2024-11-07T16:57:02Z","published":"2024-10-17T17:54:06Z","title":"Active-Dormant Attention Heads: Mechanistically Demystifying\n Extreme-Token Phenomena in LLMs","summary":" Practitioners have consistently observed three puzzling phenomena in\ntransformer-based large language models (LLMs): attention sinks, value-state\ndrains, and residual-state peaks, collectively referred to as extreme-token\nphenomena. These phenomena are characterized by certain so-called \"sink tokens\"\nreceiving disproportionately high attention weights, exhibiting significantly\nsmaller value states, and having much larger residual-state norms than those of\nother tokens. These extreme tokens give rise to various challenges in LLM\ninference, quantization, and interpretability.\n We elucidate the mechanisms behind extreme-token phenomena. 
First, we show\nthat these phenomena arise in very simple architectures -- transformers with\none to three layers -- trained on a toy model, the Bigram-Backcopy (BB) task.\nIn this setting, we identify an active-dormant mechanism, where attention heads\nbecome sinks for specific input domains while remaining non-sinks for others.\nOur theoretical analysis of the training dynamics reveals that these phenomena\nare driven by a mutual reinforcement mechanism. Building on these insights, we\npropose strategies to mitigate extreme-token phenomena during pretraining,\nincluding replacing softmax with ReLU and Adam with SGD. Next, we extend our\nanalysis to pretrained LLMs, including Llama and OLMo, showing that many\nattention heads exhibit a similar active-dormant mechanism as in the BB task,\nand that the mutual reinforcement mechanism also governs the emergence of\nextreme-token phenomena during LLM pretraining. Our results reveal that many of\nthe static and dynamic properties of extreme-token phenomena predicted by the\nBB task align with observations in pretrained LLMs.\n","authors":["Tianyu Guo","Druv Pai","Yu Bai","Jiantao Jiao","Michael I. Jordan","Song Mei"],"pdf_url":"https://arxiv.org/pdf/2410.13835v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04863v1","updated":"2024-11-07T16:54:54Z","published":"2024-11-07T16:54:54Z","title":"OneProt: Towards Multi-Modal Protein Foundation Models","summary":" Recent AI advances have enabled multi-modal systems to model and translate\ndiverse information spaces. Extending beyond text and vision, we introduce\nOneProt, a multi-modal AI for proteins that integrates structural, sequence,\nalignment, and binding site data. Using the ImageBind framework, OneProt aligns\nthe latent spaces of modality encoders along protein sequences. It demonstrates\nstrong performance in retrieval tasks and surpasses state-of-the-art methods in\nvarious downstream tasks, including metal ion binding classification,\ngene-ontology annotation, and enzyme function prediction. This work expands\nmulti-modal capabilities in protein models, paving the way for applications in\ndrug discovery, biocatalytic reaction planning, and protein engineering.\n","authors":["Klemens Flöge","Srisruthi Udayakumar","Johanna Sommer","Marie Piraud","Stefan Kesselheim","Vincent Fortuin","Stephan Günneman","Karel J van der Weg","Holger Gohlke","Alina Bazarova","Erinc Merdivan"],"pdf_url":"https://arxiv.org/pdf/2411.04863v1.pdf","comment":"28 pages, 15 figures, 7 tables"},{"id":"http://arxiv.org/abs/2311.12530v3","updated":"2024-11-07T16:52:39Z","published":"2023-11-21T11:21:53Z","title":"An efficient likelihood-free Bayesian inference method based on\n sequential neural posterior estimation","summary":" Sequential neural posterior estimation (SNPE) techniques have been recently\nproposed for dealing with simulation-based models with intractable likelihoods.\nUnlike approximate Bayesian computation, SNPE techniques learn the posterior\nfrom sequential simulation using neural network-based conditional density\nestimators by minimizing a specific loss function. The SNPE method proposed by\nLueckmann et al. (2017) used a calibration kernel to boost the sample weights\naround the observed data, resulting in a concentrated loss function. However,\nthe use of calibration kernels may increase the variances of both the empirical\nloss and its gradient, making the training inefficient. 
To improve the\nstability of SNPE, this paper proposes to use an adaptive calibration kernel\nand several variance reduction techniques. The proposed method greatly speeds\nup the process of training and provides a better approximation of the posterior\nthan the original SNPE method and some existing competitors as confirmed by\nnumerical experiments. We also manage to demonstrate the superiority of the\nproposed method for a high-dimensional model with real-world dataset.\n","authors":["Yifei Xiong","Xiliang Yang","Sanguo Zhang","Zhijian He"],"pdf_url":"https://arxiv.org/pdf/2311.12530v3.pdf","comment":"28 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.04855v1","updated":"2024-11-07T16:47:06Z","published":"2024-11-07T16:47:06Z","title":"Clinicians' Voice: Fundamental Considerations for XAI in Healthcare","summary":" Explainable AI (XAI) holds the promise of advancing the implementation and\nadoption of AI-based tools in practice, especially in high-stakes environments\nlike healthcare. However, most of the current research is disconnected from its\npractical applications and lacks input of end users. To address this, we\nconducted semi-structured interviews with clinicians to discuss their thoughts,\nhopes, and concerns. We find that clinicians generally think positively about\ndeveloping AI-based tools for clinical practice, but they have concerns about\nhow these will fit into their workflow and how it will impact clinician-patient\nrelations. We further identify education of clinicians on AI as a crucial\nfactor for the success of AI in healthcare and highlight aspects clinicians are\nlooking for in (X)AI-based tools. In contrast to other studies, we take on a\nholistic and exploratory perspective to identify general requirements, which is\nnecessary before moving on to testing specific (X)AI products for healthcare.\n","authors":["T. E. Röber","R. Goedhart","S. İ. Birbil"],"pdf_url":"https://arxiv.org/pdf/2411.04855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04852v1","updated":"2024-11-07T16:39:29Z","published":"2024-11-07T16:39:29Z","title":"Conformalized Credal Regions for Classification with Ambiguous Ground\n Truth","summary":" An open question in \\emph{Imprecise Probabilistic Machine Learning} is how to\nempirically derive a credal region (i.e., a closed and convex family of\nprobabilities on the output space) from the available data, without any prior\nknowledge or assumption. In classification problems, credal regions are a tool\nthat is able to provide provable guarantees under realistic assumptions by\ncharacterizing the uncertainty about the distribution of the labels. Building\non previous work, we show that credal regions can be directly constructed using\nconformal methods. This allows us to provide a novel extension of classical\nconformal prediction to problems with ambiguous ground truth, that is, when the\nexact labels for given inputs are not exactly known. The resulting construction\nenjoys desirable practical and theoretical properties: (i) conformal coverage\nguarantees, (ii) smaller prediction sets (compared to classical conformal\nprediction regions) and (iii) disentanglement of uncertainty sources\n(epistemic, aleatoric). 
We empirically verify our findings on both synthetic\nand real datasets.\n","authors":["Michele Caprio","David Stutz","Shuo Li","Arnaud Doucet"],"pdf_url":"https://arxiv.org/pdf/2411.04852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04845v1","updated":"2024-11-07T16:32:50Z","published":"2024-11-07T16:32:50Z","title":"Asymptotic regularity of a generalised stochastic Halpern scheme with\n applications","summary":" We provide abstract, general and highly uniform rates of asymptotic\nregularity for a generalized stochastic Halpern-style iteration, which\nincorporates a second mapping in the style of a Krasnoselskii-Mann iteration.\nThis iteration is general in two ways: First, it incorporates stochasticity in\na completely abstract way rather than fixing a sampling method; secondly, it\nincludes as special cases stochastic versions of various schemes from the\noptimization literature, including Halpern's iteration as well as a\nKrasnoselskii-Mann iteration with Tikhonov regularization terms in the sense of\nBo\\c{t}, Csetnek and Meier. For these particular cases, we in particular obtain\nlinear rates of asymptotic regularity, matching (or improving) the currently\nbest known rates for these iterations in stochastic optimization, and quadratic\nrates of asymptotic regularity are obtained in the context of inner product\nspaces for the general iteration. We utilize these rates to give bounds on the\noracle complexity of such iterations under suitable variance assumptions and\nbatching strategies, again presented in an abstract style. Finally, we sketch\nhow the schemes presented here can be instantiated in the context of\nreinforcement learning to yield novel methods for Q-learning.\n","authors":["Nicholas Pischke","Thomas Powell"],"pdf_url":"https://arxiv.org/pdf/2411.04845v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2411.04843v1","updated":"2024-11-07T16:31:31Z","published":"2024-11-07T16:31:31Z","title":"Learning in Budgeted Auctions with Spacing Objectives","summary":" In many repeated auction settings, participants care not only about how\nfrequently they win but also how their winnings are distributed over time. This\nproblem arises in various practical domains where avoiding congested demand is\ncrucial, such as online retail sales and compute services, as well as in\nadvertising campaigns that require sustained visibility over time. We introduce\na simple model of this phenomenon, modeling it as a budgeted auction where the\nvalue of a win is a concave function of the time since the last win. This\nimplies that for a given number of wins, even spacing over time is optimal. We\nalso extend our model and results to the case when not all wins result in\n\"conversions\" (realization of actual gains), and the probability of conversion\ndepends on a context. The goal is to maximize and evenly space conversions\nrather than just wins.\n We study the optimal policies for this setting in second-price auctions and\noffer learning algorithms for the bidders that achieve low regret against the\noptimal bidding policy in a Bayesian online setting. Our main result is a\ncomputationally efficient online learning algorithm that achieves $\\tilde\nO(\\sqrt T)$ regret. We achieve this by showing that an infinite-horizon Markov\ndecision process (MDP) with the budget constraint in expectation is essentially\nequivalent to our problem, even when limiting that MDP to a very small number\nof states. 
The algorithm achieves low regret by learning a bidding policy that\nchooses bids as a function of the context and the system's state, which will be\nthe time elapsed since the last win (or conversion). We show that\nstate-independent strategies incur linear regret even without uncertainty of\nconversions. We complement this by showing that there are state-independent\nstrategies that, while still having linear regret, achieve a $(1-\\frac 1 e)$\napproximation to the optimal reward.\n","authors":["Giannis Fikioris","Robert Kleinberg","Yoav Kolumbus","Raunak Kumar","Yishay Mansour","Éva Tardos"],"pdf_url":"https://arxiv.org/pdf/2411.04843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04838v1","updated":"2024-11-07T16:29:03Z","published":"2024-11-07T16:29:03Z","title":"Machine learning and optimization-based approaches to duality in\n statistical physics","summary":" The notion of duality -- that a given physical system can have two different\nmathematical descriptions -- is a key idea in modern theoretical physics.\nEstablishing a duality in lattice statistical mechanics models requires the\nconstruction of a dual Hamiltonian and a map from the original to the dual\nobservables. By using simple neural networks to parameterize these maps and\nintroducing a loss function that penalises the difference between correlation\nfunctions in original and dual models, we formulate the process of duality\ndiscovery as an optimization problem. We numerically solve this problem and\nshow that our framework can rediscover the celebrated Kramers-Wannier duality\nfor the 2d Ising model, reconstructing the known mapping of temperatures. We\nalso discuss an alternative approach which uses known features of the mapping\nof topological lines to reduce the problem to optimizing the couplings in a\ndual Hamiltonian, and explore next-to-nearest neighbour deformations of the 2d\nIsing duality. We discuss future directions and prospects for discovering new\ndualities within this framework.\n","authors":["Andrea E. V. Ferrari","Prateek Gupta","Nabil Iqbal"],"pdf_url":"https://arxiv.org/pdf/2411.04838v1.pdf","comment":"27 pages + appendices, lots of plots"},{"id":"http://arxiv.org/abs/2411.04832v1","updated":"2024-11-07T16:13:54Z","published":"2024-11-07T16:13:54Z","title":"Plasticity Loss in Deep Reinforcement Learning: A Survey","summary":" Akin to neuroplasticity in human brains, the plasticity of deep neural\nnetworks enables their quick adaption to new data. This makes plasticity\nparticularly crucial for deep Reinforcement Learning (RL) agents: Once\nplasticity is lost, an agent's performance will inevitably plateau because it\ncannot improve its policy to account for changes in the data distribution,\nwhich are a necessary consequence of its learning process. Thus, developing\nwell-performing and sample-efficient agents hinges on their ability to remain\nplastic during training. Furthermore, the loss of plasticity can be connected\nto many other issues plaguing deep RL, such as training instabilities, scaling\nfailures, overestimation bias, and insufficient exploration. With this survey,\nwe aim to provide an overview of the emerging research on plasticity loss for\nacademics and practitioners of deep reinforcement learning. First, we propose a\nunified definition of plasticity loss based on recent works, relate it to\ndefinitions from the literature, and discuss metrics for measuring plasticity\nloss. 
Then, we categorize and discuss numerous possible causes of plasticity\nloss before reviewing currently employed mitigation strategies. Our taxonomy is\nthe first systematic overview of the current state of the field. Lastly, we\ndiscuss prevalent issues within the literature, such as a necessity for broader\nevaluation, and provide recommendations for future research, like gaining a\nbetter understanding of an agent's neural activity and behavior.\n","authors":["Timo Klein","Lukas Miklautz","Kevin Sidak","Claudia Plant","Sebastian Tschiatschek"],"pdf_url":"https://arxiv.org/pdf/2411.04832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14837v2","updated":"2024-11-07T16:13:14Z","published":"2024-10-18T19:17:48Z","title":"Topological obstruction to the training of shallow ReLU neural networks","summary":" Studying the interplay between the geometry of the loss landscape and the\noptimization trajectories of simple neural networks is a fundamental step for\nunderstanding their behavior in more complex settings. This paper reveals the\npresence of topological obstruction in the loss landscape of shallow ReLU\nneural networks trained using gradient flow. We discuss how the homogeneous\nnature of the ReLU activation function constrains the training trajectories to\nlie on a product of quadric hypersurfaces whose shape depends on the particular\ninitialization of the network's parameters. When the neural network's output is\na single scalar, we prove that these quadrics can have multiple connected\ncomponents, limiting the set of reachable parameters during training. We\nanalytically compute the number of these components and discuss the possibility\nof mapping one to the other through neuron rescaling and permutation. In this\nsimple setting, we find that the non-connectedness results in a topological\nobstruction, which, depending on the initialization, can make the global\noptimum unreachable. We validate this result with numerical experiments.\n","authors":["Marco Nurisso","Pierrick Leroy","Francesco Vaccarino"],"pdf_url":"https://arxiv.org/pdf/2410.14837v2.pdf","comment":"23 pages, 5 figures, Conference on Neural Information Processing\n Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2409.00220v2","updated":"2024-11-07T16:09:14Z","published":"2024-08-30T19:25:28Z","title":"Learning Latent Space Dynamics with Model-Form Uncertainties: A\n Stochastic Reduced-Order Modeling Approach","summary":" This paper presents a probabilistic approach to represent and quantify\nmodel-form uncertainties in the reduced-order modeling of complex systems using\noperator inference techniques. Such uncertainties can arise in the selection of\nan appropriate state-space representation, in the projection step that\nunderlies many reduced-order modeling methods, or as a byproduct of\nconsiderations made during training, to name a few. Following previous works in\nthe literature, the proposed method captures these uncertainties by expanding\nthe approximation space through the randomization of the projection matrix.\nThis is achieved by combining Riemannian projection and retraction operators -\nacting on a subset of the Stiefel manifold - with an information-theoretic\nformulation. 
The efficacy of the approach is assessed on canonical problems in\nfluid mechanics by identifying and quantifying the impact of model-form\nuncertainties on the inferred operators.\n","authors":["Jin Yi Yong","Rudy Geelen","Johann Guilleminot"],"pdf_url":"https://arxiv.org/pdf/2409.00220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04826v1","updated":"2024-11-07T16:07:00Z","published":"2024-11-07T16:07:00Z","title":"D$^3$epth: Self-Supervised Depth Estimation with Dynamic Mask in Dynamic\n Scenes","summary":" Depth estimation is a crucial technology in robotics. Recently,\nself-supervised depth estimation methods have demonstrated great potential as\nthey can efficiently leverage large amounts of unlabelled real-world data.\nHowever, most existing methods are designed under the assumption of static\nscenes, which hinders their adaptability in dynamic environments. To address\nthis issue, we present D$^3$epth, a novel method for self-supervised depth\nestimation in dynamic scenes. It tackles the challenge of dynamic objects from\ntwo key perspectives. First, within the self-supervised framework, we design a\nreprojection constraint to identify regions likely to contain dynamic objects,\nallowing the construction of a dynamic mask that mitigates their impact at the\nloss level. Second, for multi-frame depth estimation, we introduce a cost\nvolume auto-masking strategy that leverages adjacent frames to identify regions\nassociated with dynamic objects and generate corresponding masks. This provides\nguidance for subsequent processes. Furthermore, we propose a spectral entropy\nuncertainty module that incorporates spectral entropy to guide uncertainty\nestimation during depth fusion, effectively addressing issues arising from cost\nvolume computation in dynamic environments. Extensive experiments on KITTI and\nCityscapes datasets demonstrate that the proposed method consistently\noutperforms existing self-supervised monocular depth estimation baselines. Code\nis available at \url{https://github.com/Csyunling/D3epth}.\n","authors":["Siyu Chen","Hong Liu","Wenhao Li","Ying Zhu","Guoquan Wang","Jianbing Wu"],"pdf_url":"https://arxiv.org/pdf/2411.04826v1.pdf","comment":"Open sourced"},{"id":"http://arxiv.org/abs/2411.04825v1","updated":"2024-11-07T16:06:00Z","published":"2024-11-07T16:06:00Z","title":"VTechAGP: An Academic-to-General-Audience Text Paraphrase Dataset and\n Benchmark Models","summary":" Existing text simplification or paraphrase datasets mainly focus on\nsentence-level text generation in a general domain. These datasets are\ntypically developed without using domain knowledge. In this paper, we release a\nnovel dataset, VTechAGP, which is the first academic-to-general-audience text\nparaphrase dataset consisting of 4,938 document-level thesis and dissertation\nacademic and general-audience abstract pairs from 8 colleges authored over 25\nyears. We also propose a novel dynamic soft prompt generative language model,\nDSPT5. For training, we leverage a contrastive-generative loss function to\nlearn the keyword vectors in the dynamic prompt. For inference, we adopt a\ncrowd-sampling decoding strategy at both semantic and structural levels to\nfurther select the best output candidate. We evaluate DSPT5 and various\nstate-of-the-art large language models (LLMs) from multiple perspectives.\nResults demonstrate that the SOTA LLMs do not provide satisfactory outcomes,\nwhile the lightweight DSPT5 can achieve competitive results. 
To the best of our\nknowledge, we are the first to build a benchmark dataset and solutions for\nacademic-to-general-audience text paraphrase dataset.\n","authors":["Ming Cheng","Jiaying Gong","Chenhan Yuan","William A. Ingram","Edward Fox","Hoda Eldardiry"],"pdf_url":"https://arxiv.org/pdf/2411.04825v1.pdf","comment":"21 pages, 3 figures"},{"id":"http://arxiv.org/abs/2110.14427v5","updated":"2024-11-07T15:59:59Z","published":"2021-10-27T13:38:25Z","title":"The ODE Method for Asymptotic Statistics in Stochastic Approximation and\n Reinforcement Learning","summary":" The paper concerns the $d$-dimensional stochastic approximation recursion, $$\n\\theta_{n+1}= \\theta_n + \\alpha_{n + 1} f(\\theta_n, \\Phi_{n+1}) $$ where $ \\{\n\\Phi_n \\}$ is a stochastic process on a general state space, satisfying a\nconditional Markov property that allows for parameter-dependent noise. The main\nresults are established under additional conditions on the mean flow and a\nversion of the Donsker-Varadhan Lyapunov drift condition known as (DV3):\n {(i)} An appropriate Lyapunov function is constructed that implies\nconvergence of the estimates in $L_4$.\n {(ii)} A functional central limit theorem (CLT) is established, as well as\nthe usual one-dimensional CLT for the normalized error. Moment bounds combined\nwith the CLT imply convergence of the normalized covariance $\\textsf{E} [ z_n\nz_n^T ]$ to the asymptotic covariance in the CLT, where $z_n{=:}\n(\\theta_n-\\theta^*)/\\sqrt{\\alpha_n}$.\n {(iii)} The CLT holds for the normalized version $z^{\\text{PR}}_n{=:}\n\\sqrt{n} [\\theta^{\\text{PR}}_n -\\theta^*]$, of the averaged parameters\n$\\theta^{\\text{PR}}_n {=:} n^{-1} \\sum_{k=1}^n\\theta_k$, subject to standard\nassumptions on the step-size. Moreover, the covariance in the CLT coincides\nwith the minimal covariance of Polyak and Ruppert.\n {(iv)} An example is given where $f$ and $\\bar{f}$ are linear in $\\theta$,\nand $\\Phi$ is a geometrically ergodic Markov chain but does not satisfy (DV3).\nWhile the algorithm is convergent, the second moment of $\\theta_n$ is unbounded\nand in fact diverges.\n {\\bf This arXiv version 3 represents a major extension of the results in\nprior versions.} The main results now allow for parameter-dependent noise, as\nis often the case in applications to reinforcement learning.\n","authors":["Vivek Borkar","Shuhang Chen","Adithya Devraj","Ioannis Kontoyiannis","Sean Meyn"],"pdf_url":"https://arxiv.org/pdf/2110.14427v5.pdf","comment":"2 figures"},{"id":"http://arxiv.org/abs/2311.01968v2","updated":"2024-11-07T15:52:24Z","published":"2023-11-03T15:10:05Z","title":"Latent Diffusion Model for Conditional Reservoir Facies Generation","summary":" Creating accurate and geologically realistic reservoir facies based on\nlimited measurements is crucial for field development and reservoir management,\nespecially in the oil and gas sector. Traditional two-point geostatistics,\nwhile foundational, often struggle to capture complex geological patterns.\nMulti-point statistics offers more flexibility, but comes with its own\nchallenges related to pattern configurations and storage limits. With the rise\nof Generative Adversarial Networks (GANs) and their success in various fields,\nthere has been a shift towards using them for facies generation. However,\nrecent advances in the computer vision domain have shown the superiority of\ndiffusion models over GANs. 
Motivated by this, a novel Latent Diffusion Model\nis proposed, which is specifically designed for conditional generation of\nreservoir facies. The proposed model produces high-fidelity facies realizations\nthat rigorously preserve conditioning data. It significantly outperforms a\nGAN-based alternative. Our implementation on GitHub:\n\\url{https://github.com/ML4ITS/Latent-Diffusion-Model-for-Conditional-Reservoir-Facies-Generation}.\n","authors":["Daesoo Lee","Oscar Ovanger","Jo Eidsvik","Erlend Aune","Jacob Skauvold","Ragnar Hauge"],"pdf_url":"https://arxiv.org/pdf/2311.01968v2.pdf","comment":"accepted in Computers & Geosciences"},{"id":"http://arxiv.org/abs/2411.04814v1","updated":"2024-11-07T15:50:42Z","published":"2024-11-07T15:50:42Z","title":"A Simple Packing Algorithm for Optimized Mapping of Artificial Neural\n Networks onto Non-Volatile Memory Cross-Bar Arrays","summary":" Neuromorphic computing with crossbar arrays has emerged as a promising\nalternative to improve computing efficiency for machine learning. Previous work\nhas focused on implementing crossbar arrays to perform basic mathematical\noperations. However, in this paper, we explore the impact of mapping the layers\nof an artificial neural network onto physical cross-bar arrays arranged in\ntiles across a chip. We have developed a simplified mapping algorithm to\ndetermine the number of physical tiles, with fixed optimal array dimensions,\nand to estimate the minimum area occupied by these tiles for a given design\nobjective. This simplified algorithm is compared with conventional binary\nlinear optimization, which solves the equivalent bin-packing problem. We have\nfound that the optimum solution is not necessarily related to the minimum\nnumber of tiles; rather, it is shown to be an interaction between tile array\ncapacity and the scaling properties of its peripheral circuits. Additionally,\nwe have discovered that square arrays are not always the best choice for\noptimal mapping, and that performance optimization comes at the cost of total\ntile area\n","authors":["W. Haensch"],"pdf_url":"https://arxiv.org/pdf/2411.04814v1.pdf","comment":"24 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.04812v1","updated":"2024-11-07T15:49:53Z","published":"2024-11-07T15:49:53Z","title":"Soft Hoeffding Tree: A Transparent and Differentiable Model on Data\n Streams","summary":" We propose soft Hoeffding trees (SoHoT) as a new differentiable and\ntransparent model for possibly infinite and changing data streams. Stream\nmining algorithms such as Hoeffding trees grow based on the incoming data\nstream, but they currently lack the adaptability of end-to-end deep learning\nsystems. End-to-end learning can be desirable if a feature representation is\nlearned by a neural network and used in a tree, or if the outputs of trees are\nfurther processed in a deep learning model or workflow. Different from\nHoeffding trees, soft trees can be integrated into such systems due to their\ndifferentiability, but are neither transparent nor explainable. Our novel model\ncombines the extensibility and transparency of Hoeffding trees with the\ndifferentiability of soft trees. We introduce a new gating function to regulate\nthe balance between univariate and multivariate splits in the tree. Experiments\nare performed on 20 data streams, comparing SoHoT to standard Hoeffding trees,\nHoeffding trees with limited complexity, and soft trees applying a sparse\nactivation function for sample routing. 
The results show that soft Hoeffding\ntrees outperform Hoeffding trees in estimating class probabilities and, at the\nsame time, maintain transparency compared to soft trees, with relatively small\nlosses in terms of AUROC and cross-entropy. We also demonstrate how to trade\noff transparency against performance using a hyperparameter, obtaining\nunivariate splits at one end of the spectrum and multivariate splits at the\nother.\n","authors":["Kirsten Köbschall","Lisa Hartung","Stefan Kramer"],"pdf_url":"https://arxiv.org/pdf/2411.04812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04811v1","updated":"2024-11-07T15:49:03Z","published":"2024-11-07T15:49:03Z","title":"Defending Deep Regression Models against Backdoor Attacks","summary":" Deep regression models are used in a wide variety of safety-critical\napplications, but are vulnerable to backdoor attacks. Although many defenses\nhave been proposed for classification models, they are ineffective as they do\nnot consider the uniqueness of regression models. First, the outputs of\nregression models are continuous values instead of discretized labels. Thus,\nthe potential infected target of a backdoored regression model has infinite\npossibilities, which makes it impossible to be determined by existing defenses.\nSecond, the backdoor behavior of backdoored deep regression models is triggered\nby the activation values of all the neurons in the feature space, which makes\nit difficult to be detected and mitigated using existing defenses. To resolve\nthese problems, we propose DRMGuard, the first defense to identify if a deep\nregression model in the image domain is backdoored or not. DRMGuard formulates\nthe optimization problem for reverse engineering based on the unique\noutput-space and feature-space characteristics of backdoored deep regression\nmodels. We conduct extensive evaluations on two regression tasks and four\ndatasets. The results show that DRMGuard can consistently defend against\nvarious backdoor attacks. We also generalize four state-of-the-art defenses\ndesigned for classifiers to regression models, and compare DRMGuard with them.\nThe results show that DRMGuard significantly outperforms all those defenses.\n","authors":["Lingyu Du","Yupei Liu","Jinyuan Jia","Guohao Lan"],"pdf_url":"https://arxiv.org/pdf/2411.04811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.12921v3","updated":"2024-11-07T15:44:43Z","published":"2023-02-24T22:38:54Z","title":"Pre-Finetuning for Few-Shot Emotional Speech Recognition","summary":" Speech models have long been known to overfit individual speakers for many\nclassification tasks. This leads to poor generalization in settings where the\nspeakers are out-of-domain or out-of-distribution, as is common in production\nenvironments. We view speaker adaptation as a few-shot learning problem and\npropose investigating transfer learning approaches inspired by recent success\nwith pre-trained models in natural language tasks. We propose pre-finetuning\nspeech models on difficult tasks to distill knowledge into few-shot downstream\nclassification objectives. We pre-finetune Wav2Vec2.0 on every permutation of\nfour multiclass emotional speech recognition corpora and evaluate our\npre-finetuned models through 33,600 few-shot fine-tuning trials on the\nEmotional Speech Dataset.\n","authors":["Maximillian Chen","Zhou Yu"],"pdf_url":"https://arxiv.org/pdf/2302.12921v3.pdf","comment":"Published at INTERSPEECH 2023. 5 pages, 4 figures. 
Code available at\n https://github.com/maxlchen/Speech-PreFinetuning"},{"id":"http://arxiv.org/abs/2403.00867v3","updated":"2024-11-07T15:41:38Z","published":"2024-03-01T03:29:54Z","title":"Gradient Cuff: Detecting Jailbreak Attacks on Large Language Models by\n Exploring Refusal Loss Landscapes","summary":" Large Language Models (LLMs) are becoming a prominent generative AI tool,\nwhere the user enters a query and the LLM generates an answer. To reduce harm\nand misuse, efforts have been made to align these LLMs to human values using\nadvanced training techniques such as Reinforcement Learning from Human Feedback\n(RLHF). However, recent studies have highlighted the vulnerability of LLMs to\nadversarial jailbreak attempts aiming at subverting the embedded safety\nguardrails. To address this challenge, this paper defines and investigates the\nRefusal Loss of LLMs and then proposes a method called Gradient Cuff to detect\njailbreak attempts. Gradient Cuff exploits the unique properties observed in\nthe refusal loss landscape, including functional values and its smoothness, to\ndesign an effective two-step detection strategy. Experimental results on two\naligned LLMs (LLaMA-2-7B-Chat and Vicuna-7B-V1.5) and six types of jailbreak\nattacks (GCG, AutoDAN, PAIR, TAP, Base64, and LRL) show that Gradient Cuff can\nsignificantly improve the LLM's rejection capability for malicious jailbreak\nqueries, while maintaining the model's performance for benign user queries by\nadjusting the detection threshold.\n","authors":["Xiaomeng Hu","Pin-Yu Chen","Tsung-Yi Ho"],"pdf_url":"https://arxiv.org/pdf/2403.00867v3.pdf","comment":"Accepted by NeurIPS 2024. Project page:\n https://huggingface.co/spaces/TrustSafeAI/GradientCuff-Jailbreak-Defense"},{"id":"http://arxiv.org/abs/2406.09014v5","updated":"2024-11-07T15:41:04Z","published":"2024-06-13T11:38:58Z","title":"Deep learning empowered sensor fusion boosts infant movement\n classification","summary":" To assess the integrity of the developing nervous system, the Prechtl general\nmovement assessment (GMA) is recognized for its clinical value in diagnosing\nneurological impairments in early infancy. GMA has been increasingly augmented\nthrough machine learning approaches intending to scale up its application,\ncircumvent costs in the training of human assessors and further standardize\nclassification of spontaneous motor patterns. Available deep learning tools,\nall of which are based on single sensor modalities, are however still\nconsiderably inferior to well-trained human assessors. These approaches\nare hardly comparable as all models are designed, trained and evaluated on\nproprietary/silo-data sets. With this study we propose a sensor fusion approach\nfor assessing fidgety movements (FMs). FMs were recorded from 51 typically\ndeveloping participants. We compared three different sensor modalities\n(pressure, inertial, and visual sensors). Various combinations and two sensor\nfusion approaches (late and early fusion) for infant movement classification\nwere tested to evaluate whether a multi-sensor system outperforms single\nmodality assessments. Convolutional neural network (CNN) architectures were\nused to classify movement patterns. The performance of the three-sensor fusion\n(classification accuracy of 94.5%) was significantly higher than that of any\nsingle modality evaluated. We show that the sensor fusion approach is a\npromising avenue for automated classification of infant motor patterns. 
The\ndevelopment of a robust sensor fusion system may significantly enhance AI-based\nearly recognition of neurofunctions, ultimately facilitating automated early\ndetection of neurodevelopmental conditions.\n","authors":["Tomas Kulvicius","Dajie Zhang","Luise Poustka","Sven Bölte","Lennart Jahn","Sarah Flügge","Marc Kraft","Markus Zweckstetter","Karin Nielsen-Saines","Florentin Wörgötter","Peter B Marschik"],"pdf_url":"https://arxiv.org/pdf/2406.09014v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14758v2","updated":"2024-11-07T15:40:25Z","published":"2024-05-23T16:29:29Z","title":"Axioms for AI Alignment from Human Feedback","summary":" In the context of reinforcement learning from human feedback (RLHF), the\nreward function is generally derived from maximum likelihood estimation of a\nrandom utility model based on pairwise comparisons made by humans. The problem\nof learning a reward function is one of preference aggregation that, we argue,\nlargely falls within the scope of social choice theory. From this perspective,\nwe can evaluate different aggregation methods via established axioms, examining\nwhether these methods meet or fail well-known standards. We demonstrate that\nboth the Bradley-Terry-Luce Model and its broad generalizations fail to meet\nbasic axioms. In response, we develop novel rules for learning reward functions\nwith strong axiomatic guarantees. A key innovation from the standpoint of\nsocial choice is that our problem has a linear structure, which greatly\nrestricts the space of feasible rules and leads to a new paradigm that we call\nlinear social choice.\n","authors":["Luise Ge","Daniel Halpern","Evi Micha","Ariel D. Procaccia","Itai Shapira","Yevgeniy Vorobeychik","Junlin Wu"],"pdf_url":"https://arxiv.org/pdf/2405.14758v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13040v2","updated":"2024-11-07T15:40:09Z","published":"2023-10-19T17:59:12Z","title":"Interpreting CLIP: Insights on the Robustness to ImageNet Distribution\n Shifts","summary":" What distinguishes robust models from non-robust ones? While for ImageNet\ndistribution shifts it has been shown that such differences in robustness can\nbe traced back predominantly to differences in training data, so far it is not\nknown what that translates to in terms of what the model has learned. In this\nwork, we bridge this gap by probing the representation spaces of 16 robust\nzero-shot CLIP vision encoders with various backbones (ResNets and ViTs) and\npretraining sets (OpenAI, LAION-400M, LAION-2B, YFCC15M, CC12M and DataComp),\nand comparing them to the representation spaces of less robust models with\nidentical backbones, but different (pre)training sets or objectives (CLIP\npretraining on ImageNet-Captions, and supervised training or finetuning on\nImageNet). Through this analysis, we generate three novel insights. Firstly, we\ndetect the presence of outlier features in robust zero-shot CLIP vision\nencoders, which to the best of our knowledge is the first time these are\nobserved in non-language and non-transformer models. Secondly, we find the\nexistence of outlier features to be an indication of ImageNet shift robustness\nin models, since we only find them in robust models in our analysis. Lastly, we\nalso investigate the number of unique encoded concepts in the representation\nspace and find zero-shot CLIP models to encode a higher number of unique\nconcepts in their representation space. 
However, we do not find this to be an\nindicator of ImageNet shift robustness and hypothesize that it is rather\nrelated to the language supervision. Since the presence of outlier features can\nbe detected without access to any data from shifted datasets, we believe that\nthey could be a useful tool for practitioners to get a feeling for the\ndistribution shift robustness of a pretrained model during deployment.\n","authors":["Jonathan Crabbé","Pau Rodríguez","Vaishaal Shankar","Luca Zappella","Arno Blaas"],"pdf_url":"https://arxiv.org/pdf/2310.13040v2.pdf","comment":"Published in TMLR"},{"id":"http://arxiv.org/abs/2411.04794v1","updated":"2024-11-07T15:36:05Z","published":"2024-11-07T15:36:05Z","title":"AlignXIE: Improving Multilingual Information Extraction by Cross-Lingual\n Alignment","summary":" Empirical evidence suggests that LLMs exhibit spontaneous cross-lingual\nalignment. Our findings suggest that although LLMs also demonstrate promising\ncross-lingual alignment in Information Extraction, there remains significant\nimbalance across languages, revealing an underlying deficiency in the IE\nalignment. To address this issue, we propose AlignXIE, a powerful code-based\nLLM that significantly enhances cross-lingual IE alignment through two\nstrategies. Firstly, AlignXIE formulates IE across different languages,\nespecially non-English ones, as code generation tasks, standardizing the\nrepresentation of various schemas using Python classes to ensure consistency of\nthe same ontology in different languages and align the schema. Secondly, it\nincorporates an IE cross-lingual alignment phase through a translated instance\nprediction task proposed in this paper to align the extraction process,\nutilizing ParallelNER, an IE bilingual parallel dataset with 257,190 samples,\ngenerated by our proposed LLM-based automatic pipeline for IE parallel data\nconstruction, with manual annotation to ensure quality. Ultimately, we obtain\nAlignXIE through multilingual IE instruction tuning. Although without training\nin 9 unseen languages, AlignXIE surpasses ChatGPT by $30.17\\%$ and SoTA by\n$20.03\\%$, thereby demonstrating superior cross-lingual IE capabilities.\nComprehensive evaluations on 63 IE benchmarks in Chinese and English under\nvarious settings, demonstrate that AlignXIE significantly enhances\ncross-lingual and multilingual IE through boosting the IE alignment.\n","authors":["Yuxin Zuo","Wenxuan Jiang","Wenxuan Liu","Zixuan Li","Long Bai","Hanbin Wang","Yutao Zeng","Xiaolong Jin","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.04794v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2411.04788v1","updated":"2024-11-07T15:28:20Z","published":"2024-11-07T15:28:20Z","title":"Enhancing Investment Analysis: Optimizing AI-Agent Collaboration in\n Financial Research","summary":" In recent years, the application of generative artificial intelligence\n(GenAI) in financial analysis and investment decision-making has gained\nsignificant attention. However, most existing approaches rely on single-agent\nsystems, which fail to fully utilize the collaborative potential of multiple AI\nagents. In this paper, we propose a novel multi-agent collaboration system\ndesigned to enhance decision-making in financial investment research. The\nsystem incorporates agent groups with both configurable group sizes and\ncollaboration structures to leverage the strengths of each agent group type. 
By\nutilizing a sub-optimal combination strategy, the system dynamically adapts to\nvarying market conditions and investment scenarios, optimizing performance\nacross different tasks. We focus on three sub-tasks: fundamentals, market\nsentiment, and risk analysis, by analyzing the 2023 SEC 10-K forms of 30\ncompanies listed on the Dow Jones Index. Our findings reveal significant\nperformance variations based on the configurations of AI agents for different\ntasks. The results demonstrate that our multi-agent collaboration system\noutperforms traditional single-agent models, offering improved accuracy,\nefficiency, and adaptability in complex financial environments. This study\nhighlights the potential of multi-agent systems in transforming financial\nanalysis and investment decision-making by integrating diverse analytical\nperspectives.\n","authors":["Xuewen Han","Neng Wang","Shangkun Che","Hongyang Yang","Kunpeng Zhang","Sean Xin Xu"],"pdf_url":"https://arxiv.org/pdf/2411.04788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04784v1","updated":"2024-11-07T15:26:38Z","published":"2024-11-07T15:26:38Z","title":"Navigating Trade-offs: Policy Summarization for Multi-Objective\n Reinforcement Learning","summary":" Multi-objective reinforcement learning (MORL) is used to solve problems\ninvolving multiple objectives. An MORL agent must make decisions based on the\ndiverse signals provided by distinct reward functions. Training an MORL agent\nyields a set of solutions (policies), each presenting distinct trade-offs among\nthe objectives (expected returns). MORL enhances explainability by enabling\nfine-grained comparisons of policies in the solution set based on their\ntrade-offs as opposed to having a single policy. However, the solution set is\ntypically large and multi-dimensional, where each policy (e.g., a neural\nnetwork) is represented by its objective values.\n We propose an approach for clustering the solution set generated by MORL. By\nconsidering both policy behavior and objective values, our clustering method\ncan reveal the relationship between policy behaviors and regions in the\nobjective space. This approach can enable decision makers (DMs) to identify\noverarching trends and insights in the solution set rather than examining each\npolicy individually. We tested our method in four multi-objective environments\nand found it outperformed traditional k-medoids clustering. Additionally, we\ninclude a case study that demonstrates its real-world application.\n","authors":["Zuzanna Osika","Jazmin Zatarain-Salazar","Frans A. Oliehoek","Pradeep K. Murukannaiah"],"pdf_url":"https://arxiv.org/pdf/2411.04784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16148v3","updated":"2024-11-07T15:23:59Z","published":"2024-06-23T16:04:26Z","title":"Towards Open Respiratory Acoustic Foundation Models: Pretraining and\n Benchmarking","summary":" Respiratory audio, such as coughing and breathing sounds, has predictive\npower for a wide range of healthcare applications, yet is currently\nunder-explored. The main problem for those applications arises from the\ndifficulty in collecting large labeled task-specific data for model\ndevelopment. Generalizable respiratory acoustic foundation models pretrained\nwith unlabeled data would offer appealing advantages and possibly unlock this\nimpasse. However, given the safety-critical nature of healthcare applications,\nit is pivotal to also ensure openness and replicability for any proposed\nfoundation model solution. 
To this end, we introduce OPERA, an OPEn Respiratory\nAcoustic foundation model pretraining and benchmarking system, as the first\napproach answering this need. We curate large-scale respiratory audio datasets\n(~136K samples, over 400 hours), pretrain three pioneering foundation models,\nand build a benchmark consisting of 19 downstream respiratory health tasks for\nevaluation. Our pretrained models demonstrate superior performance (against\nexisting acoustic models pretrained with general audio on 16 out of 19 tasks)\nand generalizability (to unseen datasets and new respiratory audio modalities).\nThis highlights the great promise of respiratory acoustic foundation models and\nencourages more studies using OPERA as an open resource to accelerate research\non respiratory audio for health. The system is accessible from\nhttps://github.com/evelyn0414/OPERA.\n","authors":["Yuwei Zhang","Tong Xia","Jing Han","Yu Wu","Georgios Rizos","Yang Liu","Mohammed Mosuily","Jagmohan Chauhan","Cecilia Mascolo"],"pdf_url":"https://arxiv.org/pdf/2406.16148v3.pdf","comment":"accepted by NeurIPS 2024 Track Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2411.04777v1","updated":"2024-11-07T15:16:36Z","published":"2024-11-07T15:16:36Z","title":"Learn to Solve Vehicle Routing Problems ASAP: A Neural Optimization\n Approach for Time-Constrained Vehicle Routing Problems with Finite Vehicle\n Fleet","summary":" Finding a feasible and prompt solution to the Vehicle Routing Problem (VRP)\nis a prerequisite for efficient freight transportation, seamless logistics, and\nsustainable mobility. Traditional optimization methods reach their limits when\nconfronted with the real-world complexity of VRPs, which involve numerous\nconstraints and objectives. Recently, the ability of generative Artificial\nIntelligence (AI) to solve combinatorial tasks, known as Neural Combinatorial\nOptimization (NCO), demonstrated promising results, offering new perspectives.\nIn this study, we propose an NCO approach to solve a time-constrained\ncapacitated VRP with a finite vehicle fleet size. The approach is based on an\nencoder-decoder architecture, formulated in line with the Policy Optimization\nwith Multiple Optima (POMO) protocol and trained via a Proximal Policy\nOptimization (PPO) algorithm. We successfully trained the policy with multiple\nobjectives (minimizing the total distance while maximizing vehicle utilization)\nand evaluated it on medium and large instances, benchmarking it against\nstate-of-the-art heuristics. The method is able to find adequate and\ncost-efficient solutions, showing both flexibility and robust generalization.\nFinally, we provide a critical analysis of the solution generated by NCO and\ndiscuss the challenges and opportunities of this new branch of intelligent\nlearning algorithms emerging in optimization science, focusing on freight\ntransportation.\n","authors":["Elija Deineko","Carina Kehrt"],"pdf_url":"https://arxiv.org/pdf/2411.04777v1.pdf","comment":"Affiliation: German Aerospace Center (DLR), Institute of Transport\n Research, Rudower Chaussee 7, 12489 Berlin Correspondence:\n Elija.deineko@dlr.de"},{"id":"http://arxiv.org/abs/2411.04775v1","updated":"2024-11-07T15:15:27Z","published":"2024-11-07T15:15:27Z","title":"Learning dynamical systems from data: Gradient-based dictionary\n optimization","summary":" The Koopman operator plays a crucial role in analyzing the global behavior of\ndynamical systems. 
Existing data-driven methods for approximating the Koopman\noperator or discovering the governing equations of the underlying system\ntypically require a fixed set of basis functions, also called dictionary. The\noptimal choice of basis functions is highly problem-dependent and often\nrequires domain knowledge. We present a novel gradient descent-based\noptimization framework for learning suitable and interpretable basis functions\nfrom data and show how it can be used in combination with EDMD, SINDy, and\nPDE-FIND. We illustrate the efficacy of the proposed approach with the aid of\nvarious benchmark problems such as the Ornstein-Uhlenbeck process, Chua's\ncircuit, a nonlinear heat equation, as well as protein-folding data.\n","authors":["Mohammad Tabish","Neil K. Chada","Stefan Klus"],"pdf_url":"https://arxiv.org/pdf/2411.04775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03955v8","updated":"2024-11-07T15:07:22Z","published":"2024-01-08T15:21:21Z","title":"Tiny Time Mixers (TTMs): Fast Pre-trained Models for Enhanced\n Zero/Few-Shot Forecasting of Multivariate Time Series","summary":" Large pre-trained models excel in zero/few-shot learning for language and\nvision tasks but face challenges in multivariate time series (TS) forecasting\ndue to diverse data characteristics. Consequently, recent research efforts have\nfocused on developing pre-trained TS forecasting models. These models, whether\nbuilt from scratch or adapted from large language models (LLMs), excel in\nzero/few-shot forecasting tasks. However, they are limited by slow performance,\nhigh computational demands, and neglect of cross-channel and exogenous\ncorrelations. To address this, we introduce Tiny Time Mixers (TTM), a compact\nmodel (starting from 1M parameters) with effective transfer learning\ncapabilities, trained exclusively on public TS datasets. TTM, based on the\nlight-weight TSMixer architecture, incorporates innovations like adaptive\npatching, diverse resolution sampling, and resolution prefix tuning to handle\npre-training on varied dataset resolutions with minimal model capacity.\nAdditionally, it employs multi-level modeling to capture channel correlations\nand infuse exogenous signals during fine-tuning. TTM outperforms existing\npopular benchmarks in zero/few-shot forecasting by (4-40%), while reducing\ncomputational requirements significantly. Moreover, TTMs are lightweight and\ncan be executed even on CPU-only machines, enhancing usability and fostering\nwider adoption in resource-constrained environments. The model weights for\nreproducibility and research use are available at\nhttps://huggingface.co/ibm/ttm-research-r2/, while enterprise-use weights under\nthe Apache license can be accessed as follows: the initial TTM-Q variant at\nhttps://huggingface.co/ibm-granite/granite-timeseries-ttm-r1, and the latest\nvariants (TTM-B, TTM-E, TTM-A) weights are available at\nhttps://huggingface.co/ibm-granite/granite-timeseries-ttm-r2.\n","authors":["Vijay Ekambaram","Arindam Jati","Pankaj Dayama","Sumanta Mukherjee","Nam H. Nguyen","Wesley M. 
Gifford","Chandra Reddy","Jayant Kalagnanam"],"pdf_url":"https://arxiv.org/pdf/2401.03955v8.pdf","comment":"Accepted at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2411.04761v1","updated":"2024-11-07T14:59:23Z","published":"2024-11-07T14:59:23Z","title":"Mining the Minoria: Unknown, Under-represented, and Under-performing\n Minority Groups","summary":" Due to a variety of reasons, such as privacy, data in the wild often misses\nthe grouping information required for identifying minorities. On the other\nhand, it is known that machine learning models are only as good as the data\nthey are trained on and, hence, may underperform for the under-represented\nminority groups. The missing grouping information presents a dilemma for\nresponsible data scientists who find themselves in an unknown-unknown\nsituation, where not only do they not have access to the grouping attributes\nbut also do not know what groups to consider.\n This paper is an attempt to address this dilemma. Specifically, we propose a\nminority mining problem, where we find vectors in the attribute space that\nreveal potential groups that are under-represented and under-performing.\nTechnically speaking, we propose a geometric transformation of data into a dual\nspace and use notions such as the arrangement of hyperplanes to design an\nefficient algorithm for the problem in lower dimensions. Generalizing our\nsolution to the higher dimensions is cursed by dimensionality. Therefore, we\npropose a solution based on smart exploration of the search space for such\ncases. We conduct comprehensive experiments using real-world and synthetic\ndatasets alongside the theoretical analysis. Our experiment results demonstrate\nthe effectiveness of our proposed solutions in mining the unknown,\nunder-represented, and under-performing minorities.\n","authors":["Mohsen Dehghankar","Abolfazl Asudeh"],"pdf_url":"https://arxiv.org/pdf/2411.04761v1.pdf","comment":"This paper is currently under review at VLDB 2025"},{"id":"http://arxiv.org/abs/2411.04760v1","updated":"2024-11-07T14:58:51Z","published":"2024-11-07T14:58:51Z","title":"Zero-Shot Temporal Resolution Domain Adaptation for Spiking Neural\n Networks","summary":" Spiking Neural Networks (SNNs) are biologically-inspired deep neural networks\nthat efficiently extract temporal information while offering promising gains in\nterms of energy efficiency and latency when deployed on neuromorphic devices.\nHowever, SNN model parameters are sensitive to temporal resolution, leading to\nsignificant performance drops when the temporal resolution of target data at\nthe edge is not the same as that of the pre-deployment source data used for\ntraining, especially when fine-tuning is not possible at the edge. To address\nthis challenge, we propose three novel domain adaptation methods for adapting\nneuron parameters to account for the change in time resolution without\nre-training on target time-resolution. The proposed methods are based on a\nmapping between neuron dynamics in SNNs and State Space Models (SSMs); and are\napplicable to general neuron models. We evaluate the proposed methods under\nspatio-temporal data tasks, namely the audio keyword spotting datasets SHD and\nMSWC as well as the image classification NMNIST dataset. Our methods provide an\nalternative to - and in the majority of cases significantly outperform - the\nexisting reference method that simply scales the time constant. 
Moreover, our\nresults show that high accuracy on high temporal resolution data can be\nobtained by time efficient training on lower temporal resolution data and model\nadaptation.\n","authors":["Sanja Karilanova","Maxime Fabre","Emre Neftci","Ayça Özçelikkale"],"pdf_url":"https://arxiv.org/pdf/2411.04760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03883v2","updated":"2024-11-07T14:57:14Z","published":"2024-11-06T12:57:58Z","title":"MEG: Medical Knowledge-Augmented Large Language Models for Question\n Answering","summary":" Question answering is a natural language understanding task that involves\nreasoning over both explicit context and unstated, relevant domain knowledge.\nLarge language models (LLMs), which underpin most contemporary question\nanswering systems, struggle to induce how concepts relate in specialized\ndomains such as medicine. Existing medical LLMs are also costly to train. In\nthis work, we present MEG, a parameter-efficient approach for medical\nknowledge-augmented LLMs. MEG uses a lightweight mapping network to integrate\ngraph embeddings into the LLM, enabling it to leverage external knowledge in a\ncost-effective way. We evaluate our method on four popular medical\nmultiple-choice datasets and show that LLMs greatly benefit from the factual\ngrounding provided by knowledge graph embeddings. MEG attains an average of\n+10.2% accuracy over the Mistral-Instruct baseline, and +6.7% over specialized\nmodels like BioMistral. We also show results based on Llama-3. Finally, we show\nthat MEG's performance remains robust to the choice of graph encoder.\n","authors":["Laura Cabello","Carmen Martin-Turrero","Uchenna Akujuobi","Anders Søgaard","Carlos Bobed"],"pdf_url":"https://arxiv.org/pdf/2411.03883v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01211v2","updated":"2024-11-07T14:51:44Z","published":"2024-11-02T11:04:45Z","title":"Spatial Transformers for Radio Map Estimation","summary":" Radio map estimation (RME) involves spatial interpolation of radio\nmeasurements to predict metrics such as the received signal strength at\nlocations where no measurements were collected. The most popular estimators\nnowadays project the measurement locations to a regular grid and complete the\nresulting measurement tensor with a convolutional deep neural network.\nUnfortunately, these approaches suffer from poor spatial resolution and require\na great number of parameters. The first contribution of this paper addresses\nthese limitations by means of an attention-based estimator named Spatial\nTransfOrmer for Radio Map estimation (STORM). This scheme not only outperforms\nthe existing estimators, but also exhibits lower computational complexity,\ntranslation equivariance, rotation equivariance, and full spatial resolution.\nThe second contribution is an extended transformer architecture that allows\nSTORM to perform active sensing, by which the next measurement location is\nselected based on the previous measurements. This is particularly useful for\nminimization of drive tests (MDT) in cellular networks, where operators request\nuser equipment to collect measurements. Finally, STORM is extensively validated\nby experiments with one ray-tracing and two real-measurement datasets.\n","authors":["Pham Q. 
Viet","Daniel Romero"],"pdf_url":"https://arxiv.org/pdf/2411.01211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13873v2","updated":"2024-11-07T14:50:06Z","published":"2024-10-02T12:54:21Z","title":"On the Robustness of Machine Learning Models in Predicting Thermodynamic\n Properties: a Case of Searching for New Quasicrystal Approximants","summary":" Although artificial intelligence-assisted modeling of disordered crystals\nis a widely used and well-tried method of new materials design, the issues of\nits robustness, reliability, and stability are still not resolved and not even\ndiscussed enough. To highlight it, in this work we composed a series of nested\ndatasets of intermetallic approximants of quasicrystals and trained various\nmachine learning models on them correspondingly. Our qualitative and, more\nimportantly, quantitative assessment of the difference in the predictions\nclearly shows that different reasonable changes in the training sample can lead\nto a completely different set of predicted potentially new materials. We\nalso showed the advantage of pre-training and proposed a simple yet effective\ntrick of sequential training to increase stability.\n","authors":["Fedor S. Avilov","Roman A. Eremin","Semen A. Budennyy","Innokentiy S. Humonen"],"pdf_url":"https://arxiv.org/pdf/2410.13873v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12970v3","updated":"2024-11-07T14:45:25Z","published":"2023-08-24T17:59:54Z","title":"NeuralClothSim: Neural Deformation Fields Meet the Thin Shell Theory","summary":" Despite existing 3D cloth simulators producing realistic results, they\npredominantly operate on discrete surface representations (e.g. points and\nmeshes) with a fixed spatial resolution, which often leads to large memory\nconsumption and resolution-dependent simulations. Moreover, back-propagating\ngradients through the existing solvers is difficult, and they cannot be easily\nintegrated into modern neural architectures. In response, this paper re-thinks\nphysically plausible cloth simulation: We propose NeuralClothSim, i.e., a new\nquasistatic cloth simulator using thin shells, in which surface deformation is\nencoded in neural network weights in the form of a neural field. Our\nmemory-efficient solver operates on a new continuous coordinate-based surface\nrepresentation called neural deformation fields (NDFs); it supervises NDF\nequilibria with the laws of the non-linear Kirchhoff-Love shell theory with a\nnon-linear anisotropic material model. NDFs are adaptive: They 1) allocate\ntheir capacity to the deformation details and 2) allow surface state queries at\narbitrary spatial resolutions without re-training. We show how to train\nNeuralClothSim while imposing hard boundary conditions and demonstrate multiple\napplications, such as material interpolation and simulation editing. The\nexperimental results highlight the effectiveness of our continuous neural\nformulation. See our project page: https://4dqv.mpi-inf.mpg.de/NeuralClothSim/.\n","authors":["Navami Kairanda","Marc Habermann","Christian Theobalt","Vladislav Golyanik"],"pdf_url":"https://arxiv.org/pdf/2308.12970v3.pdf","comment":"33 pages, 23 figures and 3 tables; project page:\n https://4dqv.mpi-inf.mpg.de/NeuralClothSim/"},{"id":"http://arxiv.org/abs/2409.18624v2","updated":"2024-11-07T14:36:04Z","published":"2024-09-27T10:50:49Z","title":"Unsupervised Cognition","summary":" Unsupervised learning methods have a soft inspiration in cognition models. 
To\nthis day, the most successful unsupervised learning methods revolve around\nclustering samples in a mathematical space. In this paper we propose a\nstate-of-the-art, primitive-based, unsupervised learning approach for\ndecision-making inspired by a novel cognition framework. This\nrepresentation-centric approach models the input space constructively as a\ndistributed hierarchical structure in an input-agnostic way. We compared our\napproach with both current state-of-the-art unsupervised learning\nclassification, and with current state-of-the-art cancer type classification.\nWe show how our proposal outperforms previous state-of-the-art. We also\nevaluate some cognition-like properties of our proposal where it not only\noutperforms the compared algorithms (even supervised learning ones), but it\nalso shows a different, more cognition-like, behaviour.\n","authors":["Alfredo Ibias","Hector Antona","Guillem Ramirez-Miranda","Enric Guinovart","Eduard Alarcon"],"pdf_url":"https://arxiv.org/pdf/2409.18624v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04747v1","updated":"2024-11-07T14:29:05Z","published":"2024-11-07T14:29:05Z","title":"Equivariant Graph Attention Networks with Structural Motifs for\n Predicting Cell Line-Specific Synergistic Drug Combinations","summary":" Cancer is the second leading cause of death, with chemotherapy as one of the\nprimary forms of treatment. As a result, researchers are turning to drug\ncombination therapy to decrease drug resistance and increase efficacy. Current\nmethods of drug combination screening, such as in vivo and in vitro, are\ninefficient due to stark time and monetary costs. In silico methods have become\nincreasingly important for screening drugs, but current methods are inaccurate\nand generalize poorly to unseen anticancer drugs. In this paper, I employ a\ngeometric deep-learning model utilizing a graph attention network that is\nequivariant to 3D rotations, translations, and reflections with structural\nmotifs. Additionally, the gene expression of cancer cell lines is utilized to\nclassify synergistic drug combinations specific to each cell line. I compared\nthe proposed geometric deep learning framework to current state-of-the-art\n(SOTA) methods, and the proposed model architecture achieved greater\nperformance on all 12 benchmark tasks performed on the DrugComb dataset.\nSpecifically, the proposed framework outperformed other SOTA methods by an\naccuracy difference greater than 28%. Based on these results, I believe that\nthe equivariant graph attention network's capability of learning geometric data\naccounts for the large performance improvements. The model's ability to\ngeneralize to foreign drugs is thought to be due to the structural motifs\nproviding a better representation of the molecule. Overall, I believe that the\nproposed equivariant geometric deep learning framework serves as an effective\ntool for virtually screening anticancer drug combinations for further\nvalidation in a wet lab environment. 
The code for this work is made available\nonline at: https://github.com/WeToTheMoon/EGAT_DrugSynergy.\n","authors":["Zachary Schwehr"],"pdf_url":"https://arxiv.org/pdf/2411.04747v1.pdf","comment":"8 pages, 1 figure, Presented at IEEE CIBCB"},{"id":"http://arxiv.org/abs/2411.04744v1","updated":"2024-11-07T14:27:49Z","published":"2024-11-07T14:27:49Z","title":"Respecting the limit:Bayesian optimization with a bound on the optimal\n value","summary":" In many real-world optimization problems, we have prior information about\nwhat objective function values are achievable. In this paper, we study the\nscenario that we have either exact knowledge of the minimum value or a,\npossibly inexact, lower bound on its value. We propose bound-aware Bayesian\noptimization (BABO), a Bayesian optimization method that uses a new surrogate\nmodel and acquisition function to utilize such prior information. We present\nSlogGP, a new surrogate model that incorporates bound information and adapts\nthe Expected Improvement (EI) acquisition function accordingly. Empirical\nresults on a variety of benchmarks demonstrate the benefit of taking prior\ninformation about the optimal value into account, and that the proposed\napproach significantly outperforms existing techniques. Furthermore, we notice\nthat even in the absence of prior information on the bound, the proposed SlogGP\nsurrogate model still performs better than the standard GP model in most cases,\nwhich we explain by its larger expressiveness.\n","authors":["Hanyang Wang","Juergen Branke","Matthias Poloczek"],"pdf_url":"https://arxiv.org/pdf/2411.04744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20194v5","updated":"2024-11-07T14:25:33Z","published":"2024-05-30T15:58:22Z","title":"Occam Gradient Descent","summary":" Deep learning neural network models must be large enough to adapt to their\nproblem domain, while small enough to avoid overfitting training data during\ngradient descent. To balance these competing demands, overprovisioned deep\nlearning models such as transformers are trained for a single epoch on large\ndata sets, and hence inefficient with both computing resources and training\ndata. In response to these inefficiencies, we exploit learning theory to derive\nOccam Gradient Descent, an algorithm that interleaves adaptive reduction of\nmodel size to minimize generalization error, with gradient descent on model\nweights to minimize fitting error. In contrast, traditional gradient descent\ngreedily minimizes fitting error without regard to generalization error. Our\nalgorithm simultaneously descends the space of weights and topological size of\nany neural network without modification. With respect to loss, compute and\nmodel size, our experiments show (a) on image classification benchmarks, linear\nand convolutional neural networks trained with Occam Gradient Descent\noutperform traditional gradient descent with or without post-train pruning; (b)\non a range of tabular data classification tasks, neural networks trained with\nOccam Gradient Descent outperform traditional gradient descent, as well as\nRandom Forests; (c) on natural language transformers, Occam Gradient Descent\noutperforms traditional gradient descent.\n","authors":["B. N. 
Kausik"],"pdf_url":"https://arxiv.org/pdf/2405.20194v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16872v2","updated":"2024-11-07T14:19:42Z","published":"2023-11-28T15:24:02Z","title":"A unified weighting framework for evaluating nearest neighbour\n classification","summary":" We present the first comprehensive and large-scale evaluation of classical\n(NN), fuzzy (FNN) and fuzzy rough (FRNN) nearest neighbour classification. We\nstandardise existing proposals for nearest neighbour weighting with kernel\nfunctions, applied to the distance values and/or ranks of the nearest\nneighbours of a test instance. In particular, we show that the theoretically\noptimal Samworth weights converge to a kernel. Kernel functions are closely\nrelated to fuzzy negation operators, and we propose a new kernel based on Yager\nnegation. We also consider various distance and scaling measures, which we show\ncan be related to each other. Through a systematic series of experiments on 85\nreal-life classification datasets, we find that NN, FNN and FRNN all perform\nbest with Boscovich distance, and that NN and FRNN perform best with a\ncombination of Samworth rank- and distance-weights and scaling by the mean\nabsolute deviation around the median ($r_1$), the standard deviation ($r_2$) or\nthe semi-interquartile range ($r_{\\infty}^*$), while FNN performs best with\nonly Samworth distance-weights and $r_1$- or $r_2$-scaling. However, NN\nachieves comparable performance with Yager-$\\frac{1}{2}$ distance-weights,\nwhich are simpler to implement than a combination of Samworth distance- and\nrank-weights. Finally, FRNN generally outperforms NN, which in turn performs\nsystematically better than FNN.\n","authors":["Oliver Urs Lenz","Henri Bollaert","Chris Cornelis"],"pdf_url":"https://arxiv.org/pdf/2311.16872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04732v1","updated":"2024-11-07T14:12:00Z","published":"2024-11-07T14:12:00Z","title":"Convolutional Differentiable Logic Gate Networks","summary":" With the increasing inference cost of machine learning models, there is a\ngrowing interest in models with fast and efficient inference. Recently, an\napproach for learning logic gate networks directly via a differentiable\nrelaxation was proposed. Logic gate networks are faster than conventional\nneural network approaches because their inference only requires logic gate\noperators such as NAND, OR, and XOR, which are the underlying building blocks\nof current hardware and can be efficiently executed. We build on this idea,\nextending it by deep logic gate tree convolutions, logical OR pooling, and\nresidual initializations. This allows scaling logic gate networks up by over\none order of magnitude and utilizing the paradigm of convolution. On CIFAR-10,\nwe achieve an accuracy of 86.29% using only 61 million logic gates, which\nimproves over the SOTA while being 29x smaller.\n","authors":["Felix Petersen","Hilde Kuehne","Christian Borgelt","Julian Welzel","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2411.04732v1.pdf","comment":"Published at NeurIPS 2024 (Oral)"},{"id":"http://arxiv.org/abs/2411.04728v1","updated":"2024-11-07T14:08:35Z","published":"2024-11-07T14:08:35Z","title":"Neuromorphic Wireless Split Computing with Multi-Level Spikes","summary":" Inspired by biological processes, neuromorphic computing utilizes spiking\nneural networks (SNNs) to perform inference tasks, offering significant\nefficiency gains for workloads involving sequential data. 
Recent advances in\nhardware and software have demonstrated that embedding a few bits of payload in\neach spike exchanged between the spiking neurons can further enhance inference\naccuracy. In a split computing architecture, where the SNN is divided across\ntwo separate devices, the device storing the first layers must share\ninformation about the spikes generated by the local output neurons with the\nother device. Consequently, the advantages of multi-level spikes must be\nbalanced against the challenges of transmitting additional bits between the two\ndevices.\n This paper addresses these challenges by investigating a wireless\nneuromorphic split computing architecture employing multi-level SNNs. For this\nsystem, we present the design of digital and analog modulation schemes\noptimized for an orthogonal frequency division multiplexing (OFDM) radio\ninterface. Simulation and experimental results using software-defined radios\nprovide insights into the performance gains of multi-level SNN models and the\noptimal payload size as a function of the quality of the connection between a\ntransmitter and receiver.\n","authors":["Dengyu Wu","Jiechen Chen","Bipin Rajendran","H. Vincent Poor","Osvaldo Simeone"],"pdf_url":"https://arxiv.org/pdf/2411.04728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16103v3","updated":"2024-11-07T14:00:45Z","published":"2024-10-21T15:31:06Z","title":"LDAdam: Adaptive Optimization from Low-Dimensional Gradient Statistics","summary":" We introduce LDAdam, a memory-efficient optimizer for training large models,\nthat performs adaptive optimization steps within lower dimensional subspaces,\nwhile consistently exploring the full parameter space during training. This\nstrategy keeps the optimizer's memory footprint to a fraction of the model\nsize. LDAdam relies on a new projection-aware update rule for the optimizer\nstates that allows for transitioning between subspaces, i.e., estimation of the\nstatistics of the projected gradients. To mitigate the errors due to low-rank\nprojection, LDAdam integrates a new generalized error feedback mechanism, which\nexplicitly accounts for both gradient and optimizer state compression. We prove\nthe convergence of LDAdam under standard assumptions, and show that LDAdam\nallows for accurate and efficient fine-tuning and pre-training of language\nmodels.\n","authors":["Thomas Robert","Mher Safaryan","Ionut-Vlad Modoranu","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2410.16103v3.pdf","comment":"36 pages"},{"id":"http://arxiv.org/abs/2411.04717v1","updated":"2024-11-07T13:57:53Z","published":"2024-11-07T13:57:53Z","title":"Subspace-Constrained Quadratic Matrix Factorization: Algorithm and\n Applications","summary":" Matrix Factorization has emerged as a widely adopted framework for modeling\ndata exhibiting low-rank structures. To address challenges in manifold\nlearning, this paper presents a subspace-constrained quadratic matrix\nfactorization model. The model is designed to jointly learn key low-dimensional\nstructures, including the tangent space, the normal subspace, and the quadratic\nform that links the tangent space to a low-dimensional representation. We solve\nthe proposed factorization model using an alternating minimization method,\ninvolving an in-depth investigation of nonlinear regression and projection\nsubproblems. 
Theoretical properties of the quadratic projection problem and\nconvergence characteristics of the alternating strategy are also investigated.\nTo validate our approach, we conduct numerical experiments on synthetic and\nreal-world datasets. Results demonstrate that our model outperforms existing\nmethods, highlighting its robustness and efficacy in capturing core\nlow-dimensional structures.\n","authors":["Zheng Zhai","Xiaohui Li"],"pdf_url":"https://arxiv.org/pdf/2411.04717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17113v3","updated":"2024-11-07T13:55:50Z","published":"2024-09-25T17:27:02Z","title":"Characterizing stable regions in the residual stream of LLMs","summary":" We identify stable regions in the residual stream of Transformers, where the\nmodel's output remains insensitive to small activation changes, but exhibits\nhigh sensitivity at region boundaries. These regions emerge during training and\nbecome more defined as training progresses or model size increases. The regions\nappear to be much larger than previously studied polytopes. Our analysis\nsuggests that these stable regions align with semantic distinctions, where\nsimilar prompts cluster within regions, and activations from the same region\nlead to similar next token predictions. This work provides a promising research\ndirection for understanding the complexity of neural networks, shedding light\non training dynamics, and advancing interpretability.\n","authors":["Jett Janiak","Jacek Karwowski","Chatrik Singh Mangat","Giorgi Giglemiani","Nora Petrova","Stefan Heimersheim"],"pdf_url":"https://arxiv.org/pdf/2409.17113v3.pdf","comment":"Published at Scientific Methods for Understanding Deep Learning\n (SciForDL) workshop at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.04708v1","updated":"2024-11-07T13:45:26Z","published":"2024-11-07T13:45:26Z","title":"Exploring Hierarchical Molecular Graph Representation in Multimodal LLMs","summary":" Following the milestones in large language models (LLMs) and multimodal\nmodels, we have seen a surge in applying LLMs to biochemical tasks. Leveraging\ngraph features and molecular text representations, LLMs can tackle various\ntasks, such as predicting chemical reaction outcomes and describing molecular\nproperties. However, most current work overlooks the multi-level nature of\ngraph features. The impact of different feature levels on LLMs and the\nimportance of each level remain unexplored, and it is possible that different\nchemistry tasks require different feature levels. In this work, we first\ninvestigate the effect of feature granularity by fusing GNN-generated feature\ntokens, discovering that even reducing all tokens to a single token does not\nsignificantly impact performance. We then explore the effect of various feature\nlevels on performance, finding that both the quality of LLM-generated molecules\nand performance on different tasks benefit from different feature levels. We\nconclude with two key insights: (1) current molecular Multimodal LLMs(MLLMs)\nlack a comprehensive understanding of graph features, and (2) static processing\nis not sufficient for hierarchical graph feature. 
Our code will be publicly\navailable soon.\n","authors":["Chengxin Hu","Hao Li"],"pdf_url":"https://arxiv.org/pdf/2411.04708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17299v2","updated":"2024-11-07T13:44:22Z","published":"2024-05-27T16:00:45Z","title":"Simplicity Bias of Two-Layer Networks beyond Linearly Separable Data","summary":" Simplicity bias, the propensity of deep models to over-rely on simple\nfeatures, has been identified as a potential reason for limited\nout-of-distribution generalization of neural networks (Shah et al., 2020).\nDespite the important implications, this phenomenon has been theoretically\nconfirmed and characterized only under strong dataset assumptions, such as\nlinear separability (Lyu et al., 2021). In this work, we characterize\nsimplicity bias for general datasets in the context of two-layer neural\nnetworks initialized with small weights and trained with gradient flow.\nSpecifically, we prove that in the early training phases, network features\ncluster around a few directions that do not depend on the size of the hidden\nlayer. Furthermore, for datasets with an XOR-like pattern, we precisely\nidentify the learned features and demonstrate that simplicity bias intensifies\nduring later training stages. These results indicate that features learned in\nthe middle stages of training may be more useful for OOD transfer. We support\nthis hypothesis with experiments on image data.\n","authors":["Nikita Tsoy","Nikola Konstantinov"],"pdf_url":"https://arxiv.org/pdf/2405.17299v2.pdf","comment":"ICML 2024, camera-ready version (expanded related work)"},{"id":"http://arxiv.org/abs/2411.04700v1","updated":"2024-11-07T13:34:37Z","published":"2024-11-07T13:34:37Z","title":"Field Assessment of Force Torque Sensors for Planetary Rover Navigation","summary":" Proprioceptive sensors on planetary rovers serve for state estimation and for\nunderstanding terrain and locomotion performance. While inertial measurement\nunits (IMUs) are widely used to this effect, force-torque sensors are less\nexplored for planetary navigation despite their potential to directly measure\ninteraction forces and provide insights into traction performance. This paper\npresents an evaluation of the performance and use cases of force-torque sensors\nbased on data collected from a six-wheeled rover during tests over varying\nterrains, speeds, and slopes. We discuss challenges, such as sensor signal\nreliability and terrain response accuracy, and identify opportunities regarding\nthe use of these sensors. The data is openly accessible and includes\nforce-torque measurements from each of the six-wheel assemblies as well as IMU\ndata from within the rover chassis. This paper aims to inform the design of\nfuture studies and rover upgrades, particularly in sensor integration and\ncontrol algorithms, to improve navigation capabilities.\n","authors":["Levin Gerdes","Carlos Pérez del Pulgar","Raúl Castilla Arquillo","Martin Azkarate"],"pdf_url":"https://arxiv.org/pdf/2411.04700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06672v2","updated":"2024-11-07T13:32:58Z","published":"2024-03-11T12:43:44Z","title":"Provable Mutual Benefits from Federated Learning in Privacy-Sensitive\n Domains","summary":" Cross-silo federated learning (FL) allows data owners to train accurate\nmachine learning models by benefiting from each others private datasets.\nUnfortunately, the model accuracy benefits of collaboration are often\nundermined by privacy defenses. 
Therefore, to incentivize client participation\nin privacy-sensitive domains, a FL protocol should strike a delicate balance\nbetween privacy guarantees and end-model accuracy. In this paper, we study the\nquestion of when and how a server could design a FL protocol provably\nbeneficial for all participants. First, we provide necessary and sufficient\nconditions for the existence of mutually beneficial protocols in the context of\nmean estimation and convex stochastic optimization. We also derive protocols\nthat maximize the total clients' utility, given symmetric privacy preferences.\nFinally, we design protocols maximizing end-model accuracy and demonstrate\ntheir benefits in synthetic experiments.\n","authors":["Nikita Tsoy","Anna Mihalkova","Teodora Todorova","Nikola Konstantinov"],"pdf_url":"https://arxiv.org/pdf/2403.06672v2.pdf","comment":"AISTATS 2024; Camera-ready version (updated references)"},{"id":"http://arxiv.org/abs/2411.04696v1","updated":"2024-11-07T13:29:32Z","published":"2024-11-07T13:29:32Z","title":"The Multiple Dimensions of Spuriousness in Machine Learning","summary":" Learning correlations from data forms the foundation of today's machine\nlearning (ML) and artificial intelligence (AI) research. While such an approach\nenables the automatic discovery of patterned relationships within big data\ncorpora, it is susceptible to failure modes when unintended correlations are\ncaptured. This vulnerability has expanded interest in interrogating\nspuriousness, often critiqued as an impediment to model performance, fairness,\nand robustness. In this article, we trace deviations from the conventional\ndefinition of statistical spuriousness-which denotes a non-causal observation\narising from either coincidence or confounding variables-to articulate how ML\nresearchers make sense of spuriousness in practice. Drawing on a broad survey\nof ML literature, we conceptualize the \"multiple dimensions of spuriousness,\"\nencompassing: relevance (\"Models should only use correlations that are relevant\nto the task.\"), generalizability (\"Models should only use correlations that\ngeneralize to unseen data\"), human-likeness (\"Models should only use\ncorrelations that a human would use to perform the same task\"), and harmfulness\n(\"Models should only use correlations that are not harmful\"). These dimensions\ndemonstrate that ML spuriousness goes beyond the causal/non-causal dichotomy\nand that the disparate interpretative paths researchers choose could\nmeaningfully influence the trajectory of ML development. By underscoring how a\nfundamental problem in ML is contingently negotiated in research contexts, we\ncontribute to ongoing debates about responsible practices in AI development.\n","authors":["Samuel J. Bell","Skyler Wang"],"pdf_url":"https://arxiv.org/pdf/2411.04696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04695v1","updated":"2024-11-07T13:27:37Z","published":"2024-11-07T13:27:37Z","title":"Is network fragmentation a useful complexity measure?","summary":" It has been observed that the input space of deep neural network classifiers\ncan exhibit `fragmentation', where the model function rapidly changes class as\nthe input space is traversed. The severity of this fragmentation tends to\nfollow the double descent curve, achieving a maximum at the interpolation\nregime. We study this phenomenon in the context of image classification and ask\nwhether fragmentation could be predictive of generalization performance. 
Using\na fragmentation-based complexity measure, we show this to be possible by\nachieving good performance on the PGDL (Predicting Generalization in Deep\nLearning) benchmark. In addition, we report on new observations related to\nfragmentation, namely (i) fragmentation is not limited to the input space but\noccurs in the hidden representations as well, (ii) fragmentation follows the\ntrends in the validation error throughout training, and (iii) fragmentation is\nnot a direct result of increased weight norms. Together, this indicates that\nfragmentation is a phenomenon worth investigating further when studying the\ngeneralization ability of deep neural networks.\n","authors":["Coenraad Mouton","Randle Rabe","Daniël G. Haasbroek","Marthinus W. Theunissen","Hermanus L. Potgieter","Marelie H. Davel"],"pdf_url":"https://arxiv.org/pdf/2411.04695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15598v4","updated":"2024-11-07T13:26:18Z","published":"2024-05-24T14:30:00Z","title":"MCDFN: Supply Chain Demand Forecasting via an Explainable Multi-Channel\n Data Fusion Network Model","summary":" Accurate demand forecasting is crucial for optimizing supply chain\nmanagement. Traditional methods often fail to capture complex patterns from\nseasonal variability and special events. Despite advancements in deep learning,\ninterpretable forecasting models remain a challenge. To address this, we\nintroduce the Multi-Channel Data Fusion Network (MCDFN), a hybrid architecture\nthat integrates Convolutional Neural Networks (CNN), Long Short-Term Memory\nnetworks (LSTM), and Gated Recurrent Units (GRU) to enhance predictive\nperformance by extracting spatial and temporal features from time series data.\nOur comparative benchmarking demonstrates that MCDFN outperforms seven other\ndeep-learning models, achieving superior metrics: MSE (23.5738), RMSE (4.8553),\nMAE (3.9991), and MAPE (20.1575%). Additionally, MCDFN's predictions were\nstatistically indistinguishable from actual values, confirmed by a paired\nt-test with a 5% p-value and a 10-fold cross-validated statistical paired\nt-test. We apply explainable AI techniques like ShapTime and Permutation\nFeature Importance to enhance interpretability. This research advances demand\nforecasting methodologies and offers practical guidelines for integrating MCDFN\ninto supply chain systems, highlighting future research directions for\nscalability and user-friendly deployment.\n","authors":["Md Abrar Jahin","Asef Shahriar","Md Al Amin"],"pdf_url":"https://arxiv.org/pdf/2405.15598v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16598v2","updated":"2024-11-07T13:12:55Z","published":"2024-05-26T15:18:22Z","title":"Regularized Projection Matrix Approximation with Applications to\n Community Detection","summary":" This paper introduces a regularized projection matrix approximation framework\ndesigned to recover cluster information from the affinity matrix. The model is\nformulated as a projection approximation problem, incorporating an entry-wise\npenalty function. We investigate three distinct penalty functions, each\nspecifically tailored to address bounded, positive, and sparse scenarios. To\nsolve this problem, we propose direct optimization on the Stiefel manifold,\nutilizing the Cayley transformation along with the Alternating Direction Method\nof Multipliers (ADMM) algorithm. 
Additionally, we provide a theoretical\nanalysis that establishes the convergence properties of ADMM, demonstrating\nthat the convergence point satisfies the KKT conditions of the original\nproblem. Numerical experiments conducted on both synthetic and real-world\ndatasets reveal that our regularized projection matrix approximation approach\nsignificantly outperforms state-of-the-art methods in clustering performance.\n","authors":["Zheng Zhai","Jialu Xu","Mingxin Wu","Xiaohui Li"],"pdf_url":"https://arxiv.org/pdf/2405.16598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04680v1","updated":"2024-11-07T13:08:06Z","published":"2024-11-07T13:08:06Z","title":"Differentially Private Continual Learning using Pre-Trained Models","summary":" This work explores the intersection of continual learning (CL) and\ndifferential privacy (DP). Crucially, continual learning models must retain\nknowledge across tasks, but this conflicts with the differential privacy\nrequirement of restricting individual samples to be memorised in the model. We\npropose using pre-trained models to address the trade-offs between privacy and\nperformance in a continual learning setting.More specifically, we present\nnecessary assumptions to enable privacy-preservation and propose combining\npre-trained models with parameter-free classifiers and parameter-efficient\nadapters that are learned under differential privacy. Our experiments\ndemonstrate their effectiveness and provide insights into balancing the\ncompeting demands of continual learning and privacy.\n","authors":["Marlon Tobaben","Marcus Klasson","Rui Li","Arno Solin","Antti Honkela"],"pdf_url":"https://arxiv.org/pdf/2411.04680v1.pdf","comment":"15 pages, 3 figures, Accepted at Scalable Continual Learning for\n Lifelong Foundation Models Workshop at 38th Conference on Neural Information\n Processing Systems (NeurIPS 2024)"},{"id":"http://arxiv.org/abs/2411.04672v1","updated":"2024-11-07T12:55:35Z","published":"2024-11-07T12:55:35Z","title":"Semantic-Aware Resource Management for C-V2X Platooning via Multi-Agent\n Reinforcement Learning","summary":" This paper presents a semantic-aware multi-modal resource allocation (SAMRA)\nfor multi-task using multi-agent reinforcement learning (MARL), termed\nSAMRAMARL, utilizing in platoon systems where cellular vehicle-to-everything\n(C-V2X) communication is employed. The proposed approach leverages the semantic\ninformation to optimize the allocation of communication resources. By\nintegrating a distributed multi-agent reinforcement learning (MARL) algorithm,\nSAMRAMARL enables autonomous decision-making for each vehicle, channel\nassignment optimization, power allocation, and semantic symbol length based on\nthe contextual importance of the transmitted information. This\nsemantic-awareness ensures that both vehicle-to-vehicle (V2V) and\nvehicle-to-infrastructure (V2I) communications prioritize data that is critical\nfor maintaining safe and efficient platoon operations. The framework also\nintroduces a tailored quality of experience (QoE) metric for semantic\ncommunication, aiming to maximize QoE in V2V links while improving the success\nrate of semantic information transmission (SRS). Extensive simulations has\ndemonstrated that SAMRAMARL outperforms existing methods, achieving significant\ngains in QoE and communication efficiency in C-V2X platooning scenarios.\n","authors":["Zhiyu Shao","Qiong Wu","Pingyi Fan","Kezhi Wang","Qiang Fan","Wen Chen","Khaled B. 
Letaief"],"pdf_url":"https://arxiv.org/pdf/2411.04672v1.pdf","comment":"This paper has been submitted to IEEE Journal. The source code has\n been released\n at:https://github.com/qiongwu86/Semantic-Aware-Resource-Management-for-C-V2X-Platooning-via-Multi-Agent-Reinforcement-Learning"},{"id":"http://arxiv.org/abs/2411.04669v1","updated":"2024-11-07T12:54:42Z","published":"2024-11-07T12:54:42Z","title":"EffiCANet: Efficient Time Series Forecasting with Convolutional\n Attention","summary":" The exponential growth of multivariate time series data from sensor networks\nin domains like industrial monitoring and smart cities requires efficient and\naccurate forecasting models. Current deep learning methods often fail to\nadequately capture long-range dependencies and complex inter-variable\nrelationships, especially under real-time processing constraints. These\nlimitations arise as many models are optimized for either short-term\nforecasting with limited receptive fields or long-term accuracy at the cost of\nefficiency. Additionally, dynamic and intricate interactions between variables\nin real-world data further complicate modeling efforts. To address these\nlimitations, we propose EffiCANet, an Efficient Convolutional Attention Network\ndesigned to enhance forecasting accuracy while maintaining computational\nefficiency. EffiCANet integrates three key components: (1) a Temporal\nLarge-kernel Decomposed Convolution (TLDC) module that captures long-term\ntemporal dependencies while reducing computational overhead; (2) an\nInter-Variable Group Convolution (IVGC) module that captures complex and\nevolving relationships among variables; and (3) a Global Temporal-Variable\nAttention (GTVA) mechanism that prioritizes critical temporal and\ninter-variable features. Extensive evaluations across nine benchmark datasets\nshow that EffiCANet achieves the maximum reduction of 10.02% in MAE over\nstate-of-the-art models, while cutting computational costs by 26.2% relative to\nconventional large-kernel convolution methods, thanks to its efficient\ndecomposition strategy.\n","authors":["Xinxing Zhou","Jiaqi Ye","Shubao Zhao","Ming Jin","Chengyi Yang","Yanlong Wen","Xiaojie Yuan"],"pdf_url":"https://arxiv.org/pdf/2411.04669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04662v1","updated":"2024-11-07T12:48:27Z","published":"2024-11-07T12:48:27Z","title":"Enhancing Trust in Clinically Significant Prostate Cancer Prediction\n with Multiple Magnetic Resonance Imaging Modalities","summary":" In the United States, prostate cancer is the second leading cause of deaths\nin males with a predicted 35,250 deaths in 2024. However, most diagnoses are\nnon-lethal and deemed clinically insignificant which means that the patient\nwill likely not be impacted by the cancer over their lifetime. As a result,\nnumerous research studies have explored the accuracy of predicting clinical\nsignificance of prostate cancer based on magnetic resonance imaging (MRI)\nmodalities and deep neural networks. Despite their high performance, these\nmodels are not trusted by most clinical scientists as they are trained solely\non a single modality whereas clinical scientists often use multiple magnetic\nresonance imaging modalities during their diagnosis. In this paper, we\ninvestigate combining multiple MRI modalities to train a deep learning model to\nenhance trust in the models for clinically significant prostate cancer\nprediction. 
The promising performance and proposed training pipeline showcase\nthe benefits of incorporating multiple MRI modalities for enhanced trust and\naccuracy.\n","authors":["Benjamin Ng","Chi-en Amy Tai","E. Zhixuan Zeng","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2411.04662v1.pdf","comment":"Findings paper presented at Machine Learning for Health (ML4H)\n symposium 2024, December 15-16, 2024, Vancouver, Canada, 6 pages"},{"id":"http://arxiv.org/abs/2411.04655v1","updated":"2024-11-07T12:32:24Z","published":"2024-11-07T12:32:24Z","title":"Centrality Graph Shift Operators for Graph Neural Networks","summary":" Graph Shift Operators (GSOs), such as the adjacency and graph Laplacian\nmatrices, play a fundamental role in graph theory and graph representation\nlearning. Traditional GSOs are typically constructed by normalizing the\nadjacency matrix by the degree matrix, a local centrality metric. In this work,\nwe instead propose and study Centrality GSOs (CGSOs), which normalize adjacency\nmatrices by global centrality metrics such as the PageRank, $k$-core or count\nof fixed length walks. We study spectral properties of the CGSOs, allowing us\nto get an understanding of their action on graph signals. We confirm this\nunderstanding by defining and running the spectral clustering algorithm based\non different CGSOs on several synthetic and real-world datasets. We furthermore\noutline how our CGSO can act as the message passing operator in any Graph\nNeural Network and in particular demonstrate strong performance of a variant of\nthe Graph Convolutional Network and Graph Attention Network using our CGSOs on\nseveral real-world benchmark datasets.\n","authors":["Yassine Abbahaddou","Fragkiskos D. Malliaros","Johannes F. Lutzeyer","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2411.04655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04653v1","updated":"2024-11-07T12:28:52Z","published":"2024-11-07T12:28:52Z","title":"IGDrivSim: A Benchmark for the Imitation Gap in Autonomous Driving","summary":" Developing autonomous vehicles that can navigate complex environments with\nhuman-level safety and efficiency is a central goal in self-driving research. A\ncommon approach to achieving this is imitation learning, where agents are\ntrained to mimic human expert demonstrations collected from real-world driving\nscenarios. However, discrepancies between human perception and the self-driving\ncar's sensors can introduce an \\textit{imitation gap}, leading to imitation\nlearning failures. In this work, we introduce \\textbf{IGDrivSim}, a benchmark\nbuilt on top of the Waymax simulator, designed to investigate the effects of\nthe imitation gap in learning autonomous driving policy from human expert\ndemonstrations. Our experiments show that this perception gap between human\nexperts and self-driving agents can hinder the learning of safe and effective\ndriving behaviors. We further show that combining imitation with reinforcement\nlearning, using a simple penalty reward for prohibited behaviors, effectively\nmitigates these failures. 
Our code is open-sourced at:\nhttps://github.com/clemgris/IGDrivSim.git.\n","authors":["Clémence Grislain","Risto Vuorio","Cong Lu","Shimon Whiteson"],"pdf_url":"https://arxiv.org/pdf/2411.04653v1.pdf","comment":"8 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2411.04649v1","updated":"2024-11-07T12:12:44Z","published":"2024-11-07T12:12:44Z","title":"DISCO: DISCovering Overfittings as Causal Rules for Text Classification\n Models","summary":" With the rapid advancement of neural language models, the deployment of\nover-parameterized models has surged, increasing the need for interpretable\nexplanations comprehensible to human inspectors. Existing post-hoc\ninterpretability methods, which often focus on unigram features of single input\ntextual instances, fail to capture the models' decision-making process fully.\nAdditionally, many methods do not differentiate between decisions based on\nspurious correlations and those based on a holistic understanding of the input.\nOur paper introduces DISCO, a novel method for discovering global, rule-based\nexplanations by identifying causal n-gram associations with model predictions.\nThis method employs a scalable sequence mining technique to extract relevant\ntext spans from training data, associate them with model predictions, and\nconduct causality checks to distill robust rules that elucidate model behavior.\nThese rules expose potential overfitting and provide insights into misleading\nfeature combinations. We validate DISCO through extensive testing,\ndemonstrating its superiority over existing methods in offering comprehensive\ninsights into complex model behaviors. Our approach successfully identifies all\nshortcuts manually introduced into the training data (100% detection rate on\nthe MultiRC dataset), resulting in an 18.8% regression in model performance --\na capability unmatched by any other method. Furthermore, DISCO supports\ninteractive explanations, enabling human inspectors to distinguish spurious\ncauses in the rule-based output. This alleviates the burden of abundant\ninstance-wise explanations and helps assess the model's risk when encountering\nout-of-distribution (OOD) data.\n","authors":["Zijian Zhang","Vinay Setty","Yumeng Wang","Avishek Anand"],"pdf_url":"https://arxiv.org/pdf/2411.04649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04644v1","updated":"2024-11-07T12:01:36Z","published":"2024-11-07T12:01:36Z","title":"wav2sleep: A Unified Multi-Modal Approach to Sleep Stage Classification\n from Physiological Signals","summary":" Accurate classification of sleep stages from less obtrusive sensor\nmeasurements such as the electrocardiogram (ECG) or photoplethysmogram (PPG)\ncould enable important applications in sleep medicine. Existing approaches to\nthis problem have typically used deep learning models designed and trained to\noperate on one or more specific input signals. However, the datasets used to\ndevelop these models often do not contain the same sets of input signals. Some\nsignals, particularly PPG, are much less prevalent than others, and this has\npreviously been addressed with techniques such as transfer learning.\nAdditionally, only training on one or more fixed modalities precludes\ncross-modal information transfer from other sources, which has proved valuable\nin other problem domains. To address this, we introduce wav2sleep, a unified\nmodel designed to operate on variable sets of input signals during training and\ninference. 
After jointly training on over 10,000 overnight recordings from six\npublicly available polysomnography datasets, including SHHS and MESA, wav2sleep\noutperforms existing sleep stage classification models across test-time input\ncombinations including ECG, PPG, and respiratory signals.\n","authors":["Jonathan F. Carter","Lionel Tarassenko"],"pdf_url":"https://arxiv.org/pdf/2411.04644v1.pdf","comment":"Accepted to Machine Learning for Health (ML4H) 2024"},{"id":"http://arxiv.org/abs/2411.04635v1","updated":"2024-11-07T11:46:48Z","published":"2024-11-07T11:46:48Z","title":"Cybercrime Prediction via Geographically Weighted Learning","summary":" Inspired by the success of Geographically Weighted Regression and its\naccounting for spatial variations, we propose GeogGNN -- A graph neural network\nmodel that accounts for geographical latitude and longitudinal points. Using a\nsynthetically generated dataset, we apply the algorithm for a 4-class\nclassification problem in cybersecurity with seemingly realistic geographic\ncoordinates centered in the Gulf Cooperation Council region. We demonstrate\nthat it has higher accuracy than standard neural networks and convolutional\nneural networks that treat the coordinates as features. Encouraged by the\nspeed-up in model accuracy by the GeogGNN model, we provide a general\nmathematical result that demonstrates that a geometrically weighted neural\nnetwork will, in principle, always display higher accuracy in the\nclassification of spatially dependent data by making use of spatial continuity\nand local averaging features.\n","authors":["Muhammad Al-Zafar Khan","Jamal Al-Karaki","Emad Mahafzah"],"pdf_url":"https://arxiv.org/pdf/2411.04635v1.pdf","comment":"17 pages, 8 figures, Submitted to the International Jordanian\n Cybersecurity Conference 2024 (IJCC24)"},{"id":"http://arxiv.org/abs/2411.04632v1","updated":"2024-11-07T11:35:31Z","published":"2024-11-07T11:35:31Z","title":"Improved Multi-Task Brain Tumour Segmentation with Synthetic Data\n Augmentation","summary":" This paper presents the winning solution of task 1 and the third-placed\nsolution of task 3 of the BraTS challenge. The use of automated tools in\nclinical practice has increased due to the development of more and more\nsophisticated and reliable algorithms. However, achieving clinical standards\nand developing tools for real-life scenarios is a major challenge. To this end,\nBraTS has organised tasks to find the most advanced solutions for specific\npurposes. In this paper, we propose the use of synthetic data to train\nstate-of-the-art frameworks in order to improve the segmentation of adult\ngliomas in a post-treatment scenario, and the segmentation of meningioma for\nradiotherapy planning. Our results suggest that the use of synthetic data leads\nto more robust algorithms, although the synthetic data generation pipeline is\nnot directly suited to the meningioma task. The code for these tasks is\navailable at https://github.com/ShadowTwin41/BraTS_2023_2024_solutions.\n","authors":["André Ferreira","Tiago Jesus","Behrus Puladi","Jens Kleesiek","Victor Alves","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2411.04632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04630v1","updated":"2024-11-07T11:29:55Z","published":"2024-11-07T11:29:55Z","title":"Brain Tumour Removing and Missing Modality Generation using 3D WDM","summary":" This paper presents the second-placed solution for task 8 and the\nparticipation solution for task 7 of BraTS 2024. 
The adoption of automated\nbrain analysis algorithms to support clinical practice is increasing. However,\nmany of these algorithms struggle with the presence of brain lesions or the\nabsence of certain MRI modalities. The alterations in the brain's morphology\nleads to high variability and thus poor performance of predictive models that\nwere trained only on healthy brains. The lack of information that is usually\nprovided by some of the missing MRI modalities also reduces the reliability of\nthe prediction models trained with all modalities. In order to improve the\nperformance of these models, we propose the use of conditional 3D wavelet\ndiffusion models. The wavelet transform enabled full-resolution image training\nand prediction on a GPU with 48 GB VRAM, without patching or downsampling,\npreserving all information for prediction. For the inpainting task of BraTS\n2024, the use of a large and variable number of healthy masks and the stability\nand efficiency of the 3D wavelet diffusion model resulted in 0.007, 22.61 and\n0.842 in the validation set and 0.07 , 22.8 and 0.91 in the testing set (MSE,\nPSNR and SSIM respectively). The code for these tasks is available at\nhttps://github.com/ShadowTwin41/BraTS_2023_2024_solutions.\n","authors":["André Ferreira","Gijs Luijten","Behrus Puladi","Jens Kleesiek","Victor Alves","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2411.04630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07712v3","updated":"2024-11-07T11:26:39Z","published":"2024-07-10T14:44:25Z","title":"Deep-Graph-Sprints: Accelerated Representation Learning in\n Continuous-Time Dynamic Graphs","summary":" Continuous-time dynamic graphs (CTDGs) are essential for modeling\ninterconnected, evolving systems. Traditional methods for extracting knowledge\nfrom these graphs often depend on feature engineering or deep learning. Feature\nengineering is limited by the manual and time-intensive nature of crafting\nfeatures, while deep learning approaches suffer from high inference latency,\nmaking them impractical for real-time applications. This paper introduces\nDeep-Graph-Sprints (DGS), a novel deep learning architecture designed for\nefficient representation learning on CTDGs with low-latency inference\nrequirements. We benchmark DGS against state-of-the-art (SOTA) feature\nengineering and graph neural network methods using five diverse datasets. The\nresults indicate that DGS achieves competitive performance while inference\nspeed improves between 4x and 12x compared to other deep learning approaches on\nour benchmark datasets. Our method effectively bridges the gap between deep\nrepresentation learning and low-latency application requirements for CTDGs.\n","authors":["Ahmad Naser Eddin","Jacopo Bono","David Aparício","Hugo Ferreira","Pedro Ribeiro","Pedro Bizarro"],"pdf_url":"https://arxiv.org/pdf/2407.07712v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04625v1","updated":"2024-11-07T11:22:46Z","published":"2024-11-07T11:22:46Z","title":"Sharp Analysis for KL-Regularized Contextual Bandits and RLHF","summary":" Reverse-Kullback-Leibler (KL) regularization has emerged to be a predominant\ntechnique used to enhance policy optimization in reinforcement learning (RL)\nand reinforcement learning from human feedback (RLHF), which forces the learned\npolicy to stay close to a reference policy. 
While the effectiveness and\nnecessity of KL-regularization have been empirically demonstrated in various\npractical scenarios, current theoretical analysis of KL-regularized RLHF still\nobtains the same $\\mathcal{O}(1 / \\epsilon^2)$ sample complexity as problems\nwithout KL-regularization. To understand the fundamental distinction between\npolicy learning objectives with KL-regularization and ones without\nKL-regularization, we are the first to theoretically demonstrate the power of\nKL-regularization by providing a sharp analysis for KL-regularized contextual\nbandits and RLHF, revealing an $\\mathcal{O}(1 / \\epsilon)$ sample complexity\nwhen $\\epsilon$ is sufficiently small.\n We further explore the role of data coverage in contextual bandits and RLHF.\nWhile the coverage assumption is commonly employed in offline RLHF to link the\nsamples from the reference policy to the optimal policy, often at the cost of a\nmultiplicative dependence on the coverage coefficient, its impact on the sample\ncomplexity of online RLHF remains unclear. Previous theoretical analyses of\nonline RLHF typically require explicit exploration and additional structural\nassumptions on the reward function class. In contrast, we show that with\nsufficient coverage from the reference policy, a simple two-stage mixed\nsampling strategy can achieve a sample complexity with only an additive\ndependence on the coverage coefficient. Our results provide a comprehensive\nunderstanding of the roles of KL-regularization and data coverage in RLHF,\nshedding light on the design of more efficient RLHF algorithms.\n","authors":["Heyang Zhao","Chenlu Ye","Quanquan Gu","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.04625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09377v2","updated":"2024-11-07T11:21:56Z","published":"2023-06-15T08:18:29Z","title":"Evaluating alignment between humans and neural network representations\n in image-based learning tasks","summary":" Humans represent scenes and objects in rich feature spaces, carrying\ninformation that allows us to generalise about category memberships and\nabstract functions with few examples. What determines whether a neural network\nmodel generalises like a human? We tested how well the representations of $86$\npretrained neural network models mapped to human learning trajectories across\ntwo tasks where humans had to learn continuous relationships and categories of\nnatural images. In these tasks, both human participants and neural networks\nsuccessfully identified the relevant stimulus features within a few trials,\ndemonstrating effective generalisation. We found that while training dataset\nsize was a core determinant of alignment with human choices, contrastive\ntraining with multi-modal data (text and imagery) was a common feature of\ncurrently publicly available models that predicted human generalisation.\nIntrinsic dimensionality of representations had different effects on alignment\nfor different model types. Lastly, we tested three sets of human-aligned\nrepresentations and found no consistent improvements in predictive accuracy\ncompared to the baselines. In conclusion, pretrained neural networks can serve\nto extract representations for cognitive models, as they appear to capture some\nfundamental aspects of cognition that are transferable across tasks. 
Both our\nparadigms and modelling approach offer a novel way to quantify alignment\nbetween neural networks and humans and extend cognitive science into more\nnaturalistic domains.\n","authors":["Can Demircan","Tankred Saanum","Leonardo Pettini","Marcel Binz","Blazej M Baczkowski","Christian F Doeller","Mona M Garvert","Eric Schulz"],"pdf_url":"https://arxiv.org/pdf/2306.09377v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02099v2","updated":"2024-11-07T10:53:14Z","published":"2024-11-04T14:08:26Z","title":"Differentially Private Integrated Decision Gradients (IDG-DP) for\n Radar-based Human Activity Recognition","summary":" Human motion analysis offers significant potential for healthcare monitoring\nand early detection of diseases. The advent of radar-based sensing systems has\ncaptured the spotlight for they are able to operate without physical contact\nand they can integrate with pre-existing Wi-Fi networks. They are also seen as\nless privacy-invasive compared to camera-based systems. However, recent\nresearch has shown high accuracy in recognizing subjects or gender from radar\ngait patterns, raising privacy concerns. This study addresses these issues by\ninvestigating privacy vulnerabilities in radar-based Human Activity Recognition\n(HAR) systems and proposing a novel method for privacy preservation using\nDifferential Privacy (DP) driven by attributions derived with Integrated\nDecision Gradient (IDG) algorithm. We investigate Black-box Membership\nInference Attack (MIA) Models in HAR settings across various levels of\nattacker-accessible information. We extensively evaluated the effectiveness of\nthe proposed IDG-DP method by designing a CNN-based HAR model and rigorously\nassessing its resilience against MIAs. Experimental results demonstrate the\npotential of IDG-DP in mitigating privacy attacks while maintaining utility\nacross all settings, particularly excelling against label-only and shadow model\nblack-box MIA attacks. This work represents a crucial step towards balancing\nthe need for effective radar-based HAR with robust privacy protection in\nhealthcare environments.\n","authors":["Idris Zakariyya","Linda Tran","Kaushik Bhargav Sivangi","Paul Henderson","Fani Deligianni"],"pdf_url":"https://arxiv.org/pdf/2411.02099v2.pdf","comment":"Accepted at WACV 2025. 12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2406.18624v3","updated":"2024-11-07T10:35:32Z","published":"2024-06-26T12:50:55Z","title":"Robust Low-Cost Drone Detection and Classification in Low SNR\n Environments","summary":" The proliferation of drones, or unmanned aerial vehicles (UAVs), has raised\nsignificant safety concerns due to their potential misuse in activities such as\nespionage, smuggling, and infrastructure disruption. This paper addresses the\ncritical need for effective drone detection and classification systems that\noperate independently of UAV cooperation. We evaluate various convolutional\nneural networks (CNNs) for their ability to detect and classify drones using\nspectrogram data derived from consecutive Fourier transforms of signal\ncomponents. The focus is on model robustness in low signal-to-noise ratio (SNR)\nenvironments, which is critical for real-world applications. A comprehensive\ndataset is provided to support future model development. In addition, we\ndemonstrate a low-cost drone detection system using a standard computer,\nsoftware-defined radio (SDR) and antenna, validated through real-world field\ntesting. 
On our development dataset, all models consistently achieved an\naverage balanced classification accuracy of >= 85% at SNR > -12dB. In the field\ntest, these models achieved an average balance accuracy of > 80%, depending on\ntransmitter distance and antenna direction. Our contributions include: a\npublicly available dataset for model development, a comparative analysis of CNN\nfor drone detection under low SNR conditions, and the deployment and field\nevaluation of a practical, low-cost detection system.\n","authors":["Stefan Glüge","Matthias Nyfeler","Ahmad Aghaebrahimian","Nicola Ramagnano","Christof Schüpbach"],"pdf_url":"https://arxiv.org/pdf/2406.18624v3.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.04596v1","updated":"2024-11-07T10:28:11Z","published":"2024-11-07T10:28:11Z","title":"The Impact of Semi-Supervised Learning on Line Segment Detection","summary":" In this paper we present a method for line segment detection in images, based\non a semi-supervised framework. Leveraging the use of a consistency loss based\non differently augmented and perturbed unlabeled images with a small amount of\nlabeled data, we show comparable results to fully supervised methods. This\nopens up application scenarios where annotation is difficult or expensive, and\nfor domain specific adaptation of models. We are specifically interested in\nreal-time and online applications, and investigate small and efficient learning\nbackbones. Our method is to our knowledge the first to target line detection\nusing modern state-of-the-art methodologies for semi-supervised learning. We\ntest the method on both standard benchmarks and domain specific scenarios for\nforestry applications, showing the tractability of the proposed method.\n","authors":["Johanna Engman","Karl Åström","Magnus Oskarsson"],"pdf_url":"https://arxiv.org/pdf/2411.04596v1.pdf","comment":"9 pages, 6 figures, 7 tables"},{"id":"http://arxiv.org/abs/2411.04594v1","updated":"2024-11-07T10:25:20Z","published":"2024-11-07T10:25:20Z","title":"Verification of Neural Networks against Convolutional Perturbations via\n Parameterised Kernels","summary":" We develop a method for the efficient verification of neural networks against\nconvolutional perturbations such as blurring or sharpening. To define input\nperturbations we use well-known camera shake, box blur and sharpen kernels. We\ndemonstrate that these kernels can be linearly parameterised in a way that\nallows for a variation of the perturbation strength while preserving desired\nkernel properties. To facilitate their use in neural network verification, we\ndevelop an efficient way of convolving a given input with these parameterised\nkernels. The result of this convolution can be used to encode the perturbation\nin a verification setting by prepending a linear layer to a given network. This\nleads to tight bounds and a high effectiveness in the resulting verification\nstep. We add further precision by employing input splitting as a branch and\nbound strategy. We demonstrate that we are able to verify robustness on a\nnumber of standard benchmarks where the baseline is unable to provide any\nsafety certificates. 
To the best of our knowledge, this is the first solution\nfor verifying robustness against specific convolutional perturbations such as\ncamera shake.\n","authors":["Benedikt Brückner","Alessio Lomuscio"],"pdf_url":"https://arxiv.org/pdf/2411.04594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04620v4","updated":"2024-11-07T10:16:23Z","published":"2024-02-07T07:07:02Z","title":"CataractBot: An LLM-Powered Expert-in-the-Loop Chatbot for Cataract\n Patients","summary":" The healthcare landscape is evolving, with patients seeking reliable\ninformation about their health conditions and available treatment options.\nDespite the abundance of information sources, the digital age overwhelms\nindividuals with excess, often inaccurate information. Patients primarily trust\nmedical professionals, highlighting the need for expert-endorsed health\ninformation. However, increased patient loads on experts has led to reduced\ncommunication time, impacting information sharing. To address this gap, we\ndeveloped CataractBot, an experts-in-the-loop chatbot powered by LLMs, in\ncollaboration with an eye hospital in India. CataractBot answers cataract\nsurgery related questions instantly by querying a curated knowledge base and\nprovides expert-verified responses asynchronously. It has multimodal and\nmultilingual capabilities. In an in-the-wild deployment study with 55\nparticipants, CataractBot proved valuable, providing anytime accessibility,\nsaving time, accommodating diverse literacy levels, alleviating power\ndifferences, and adding a privacy layer between patients and doctors. Users\nreported that their trust in the system was established through expert\nverification. Broadly, our results could inform future work on designing\nexpert-mediated LLM bots.\n","authors":["Pragnya Ramjee","Bhuvan Sachdeva","Satvik Golechha","Shreyas Kulkarni","Geeta Fulari","Kaushik Murali","Mohit Jain"],"pdf_url":"https://arxiv.org/pdf/2402.04620v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04586v1","updated":"2024-11-07T10:15:25Z","published":"2024-11-07T10:15:25Z","title":"On the Inherent Robustness of One-Stage Object Detection against\n Out-of-Distribution Data","summary":" Robustness is a fundamental aspect for developing safe and trustworthy\nmodels, particularly when they are deployed in the open world. In this work we\nanalyze the inherent capability of one-stage object detectors to robustly\noperate in the presence of out-of-distribution (OoD) data. Specifically, we\npropose a novel detection algorithm for detecting unknown objects in image\ndata, which leverages the features extracted by the model from each sample.\nDifferently from other recent approaches in the literature, our proposal does\nnot require retraining the object detector, thereby allowing for the use of\npretrained models. Our proposed OoD detector exploits the application of\nsupervised dimensionality reduction techniques to mitigate the effects of the\ncurse of dimensionality on the features extracted by the model. Furthermore, it\nutilizes high-resolution feature maps to identify potential unknown objects in\nan unsupervised fashion. Our experiments analyze the Pareto trade-off between\nthe performance detecting known and unknown objects resulting from different\nalgorithmic configurations and inference confidence thresholds. We also compare\nthe performance of our proposed algorithm to that of logits-based post-hoc OoD\nmethods, as well as possible fusion strategies. 
Finally, we discuss on the\ncompetitiveness of all tested methods against state-of-the-art OoD approaches\nfor object detection models over the recently published Unknown Object\nDetection benchmark. The obtained results verify that the performance of\navant-garde post-hoc OoD detectors can be further improved when combined with\nour proposed algorithm.\n","authors":["Aitor Martinez-Seras","Javier Del Ser","Alain Andres","Pablo Garcia-Bringas"],"pdf_url":"https://arxiv.org/pdf/2411.04586v1.pdf","comment":"12 figures, 4 tables, under review"},{"id":"http://arxiv.org/abs/2411.04580v1","updated":"2024-11-07T10:06:23Z","published":"2024-11-07T10:06:23Z","title":"Interpreting the Learned Model in MuZero Planning","summary":" MuZero has achieved superhuman performance in various games by using a\ndynamics network to predict environment dynamics for planning, without relying\non simulators. However, the latent states learned by the dynamics network make\nits planning process opaque. This paper aims to demystify MuZero's model by\ninterpreting the learned latent states. We incorporate observation\nreconstruction and state consistency into MuZero training and conduct an\nin-depth analysis to evaluate latent states across two board games: 9x9 Go and\nOuter-Open Gomoku, and three Atari games: Breakout, Ms. Pacman, and Pong. Our\nfindings reveal that while the dynamics network becomes less accurate over\nlonger simulations, MuZero still performs effectively by using planning to\ncorrect errors. Our experiments also show that the dynamics network learns\nbetter latent states in board games than in Atari games. These insights\ncontribute to a better understanding of MuZero and offer directions for future\nresearch to improve the playing performance, robustness, and interpretability\nof the MuZero algorithm.\n","authors":["Hung Guei","Yan-Ru Ju","Wei-Yu Chen","Ti-Rong Wu"],"pdf_url":"https://arxiv.org/pdf/2411.04580v1.pdf","comment":"Accepted by the 29th International Conference on Technologies and\n Applications of Artificial Intelligence (TAAI 2024)"},{"id":"http://arxiv.org/abs/2411.04579v1","updated":"2024-11-07T10:03:44Z","published":"2024-11-07T10:03:44Z","title":"Towards Robust Federated Analytics via Differentially Private\n Measurements of Statistical Heterogeneity","summary":" Statistical heterogeneity is a measure of how skewed the samples of a dataset\nare. It is a common problem in the study of differential privacy that the usage\nof a statistically heterogeneous dataset results in a significant loss of\naccuracy. In federated scenarios, statistical heterogeneity is more likely to\nhappen, and so the above problem is even more pressing. We explore the three\nmost promising ways to measure statistical heterogeneity and give formulae for\ntheir accuracy, while simultaneously incorporating differential privacy. We\nfind the optimum privacy parameters via an analytic mechanism, which\nincorporates root finding methods. We validate the main theorems and related\nhypotheses experimentally, and test the robustness of the analytic mechanism to\ndifferent heterogeneity levels. The analytic mechanism in a distributed setting\ndelivers superior accuracy to all combinations involving the classic mechanism\nand/or the centralized setting. 
All measures of statistical heterogeneity do\nnot lose significant accuracy when a heterogeneous sample is used.\n","authors":["Mary Scott","Graham Cormode","Carsten Maple"],"pdf_url":"https://arxiv.org/pdf/2411.04579v1.pdf","comment":"26 pages, 6 tables, 1 figure"},{"id":"http://arxiv.org/abs/2411.04570v1","updated":"2024-11-07T09:53:11Z","published":"2024-11-07T09:53:11Z","title":"Higher-Order GNNs Meet Efficiency: Sparse Sobolev Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have shown great promise in modeling\nrelationships between nodes in a graph, but capturing higher-order\nrelationships remains a challenge for large-scale networks. Previous studies\nhave primarily attempted to utilize the information from higher-order neighbors\nin the graph, involving the incorporation of powers of the shift operator, such\nas the graph Laplacian or adjacency matrix. This approach comes with a\ntrade-off in terms of increased computational and memory demands. Relying on\ngraph spectral theory, we make a fundamental observation: the regular and the\nHadamard power of the Laplacian matrix behave similarly in the spectrum. This\nobservation has significant implications for capturing higher-order information\nin GNNs for various tasks such as node classification and semi-supervised\nlearning. Consequently, we propose a novel graph convolutional operator based\non the sparse Sobolev norm of graph signals. Our approach, known as Sparse\nSobolev GNN (S2-GNN), employs Hadamard products between matrices to maintain\nthe sparsity level in graph representations. S2-GNN utilizes a cascade of\nfilters with increasing Hadamard powers to generate a diverse set of functions.\nWe theoretically analyze the stability of S2-GNN to show the robustness of the\nmodel against possible graph perturbations. We also conduct a comprehensive\nevaluation of S2-GNN across various graph mining, semi-supervised node\nclassification, and computer vision tasks. In particular use cases, our\nalgorithm demonstrates competitive performance compared to state-of-the-art\nGNNs in terms of performance and running time.\n","authors":["Jhony H. Giraldo","Aref Einizade","Andjela Todorovic","Jhon A. Castro-Correa","Mohsen Badiey","Thierry Bouwmans","Fragkiskos D. Malliaros"],"pdf_url":"https://arxiv.org/pdf/2411.04570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04569v1","updated":"2024-11-07T09:47:18Z","published":"2024-11-07T09:47:18Z","title":"Impact of Label Noise on Learning Complex Features","summary":" Neural networks trained with stochastic gradient descent exhibit an inductive\nbias towards simpler decision boundaries, typically converging to a narrow\nfamily of functions, and often fail to capture more complex features. This\nphenomenon raises concerns about the capacity of deep models to adequately\nlearn and represent real-world datasets. Traditional approaches such as\nexplicit regularization, data augmentation, architectural modifications, etc.,\nhave largely proven ineffective in encouraging the models to learn diverse\nfeatures. In this work, we investigate the impact of pre-training models with\nnoisy labels on the dynamics of SGD across various architectures and datasets.\nWe show that pretraining promotes learning complex functions and diverse\nfeatures in the presence of noise. 
Our experiments demonstrate that\npre-training with noisy labels encourages gradient descent to find alternate\nminima that do not solely depend upon simple features, rather learns more\ncomplex and broader set of features, without hurting performance.\n","authors":["Rahul Vashisht","P. Krishna Kumar","Harsha Vardhan Govind","Harish G. Ramaswamy"],"pdf_url":"https://arxiv.org/pdf/2411.04569v1.pdf","comment":"Accepted at Workshop on Scientific Methods for Understanding Deep\n Learning, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.04562v1","updated":"2024-11-07T09:35:22Z","published":"2024-11-07T09:35:22Z","title":"Constrained Latent Action Policies for Model-Based Offline Reinforcement\n Learning","summary":" In offline reinforcement learning, a policy is learned using a static dataset\nin the absence of costly feedback from the environment. In contrast to the\nonline setting, only using static datasets poses additional challenges, such as\npolicies generating out-of-distribution samples. Model-based offline\nreinforcement learning methods try to overcome these by learning a model of the\nunderlying dynamics of the environment and using it to guide policy search. It\nis beneficial but, with limited datasets, errors in the model and the issue of\nvalue overestimation among out-of-distribution states can worsen performance.\nCurrent model-based methods apply some notion of conservatism to the Bellman\nupdate, often implemented using uncertainty estimation derived from model\nensembles. In this paper, we propose Constrained Latent Action Policies (C-LAP)\nwhich learns a generative model of the joint distribution of observations and\nactions. We cast policy learning as a constrained objective to always stay\nwithin the support of the latent action distribution, and use the generative\ncapabilities of the model to impose an implicit constraint on the generated\nactions. Thereby eliminating the need to use additional uncertainty penalties\non the Bellman update and significantly decreasing the number of gradient steps\nrequired to learn a policy. We empirically evaluate C-LAP on the D4RL and\nV-D4RL benchmark, and show that C-LAP is competitive to state-of-the-art\nmethods, especially outperforming on datasets with visual observations.\n","authors":["Marvin Alles","Philip Becker-Ehmck","Patrick van der Smagt","Maximilian Karl"],"pdf_url":"https://arxiv.org/pdf/2411.04562v1.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024)"},{"id":"http://arxiv.org/abs/2404.12096v3","updated":"2024-11-07T09:29:32Z","published":"2024-04-18T11:29:23Z","title":"LongEmbed: Extending Embedding Models for Long Context Retrieval","summary":" Embedding models play a pivot role in modern NLP applications such as IR and\nRAG. While the context limit of LLMs has been pushed beyond 1 million tokens,\nembedding models are still confined to a narrow context window not exceeding 8k\ntokens, refrained from application scenarios requiring long inputs such as\nlegal contracts. This paper explores context window extension of existing\nembedding models, pushing the limit to 32k without requiring additional\ntraining. First, we examine the performance of current embedding models for\nlong context retrieval on our newly constructed LongEmbed benchmark. 
LongEmbed\ncomprises two synthetic tasks and four carefully chosen real-world tasks,\nfeaturing documents of varying length and dispersed target information.\nBenchmarking results underscore huge room for improvement in these models.\nBased on this, comprehensive experiments show that training-free context window\nextension strategies like position interpolation can effectively extend the\ncontext window of existing embedding models by several folds, regardless of\ntheir original context being 512 or beyond 4k. Furthermore, for models\nemploying absolute position encoding (APE), we show the possibility of further\nfine-tuning to harvest notable performance gains while strictly preserving\noriginal behavior for short inputs. For models using rotary position embedding\n(RoPE), significant enhancements are observed when employing RoPE-specific\nmethods, such as NTK and SelfExtend, indicating RoPE's superiority over APE for\ncontext window extension. To facilitate future research, we release E5-Base-4k\nand E5-RoPE-Base, along with the LongEmbed benchmark.\n","authors":["Dawei Zhu","Liang Wang","Nan Yang","Yifan Song","Wenhao Wu","Furu Wei","Sujian Li"],"pdf_url":"https://arxiv.org/pdf/2404.12096v3.pdf","comment":"EMNLP 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2411.04557v1","updated":"2024-11-07T09:28:38Z","published":"2024-11-07T09:28:38Z","title":"Pruning Literals for Highly Efficient Explainability at Word Level","summary":" Designing an explainable model becomes crucial now for Natural Language\nProcessing(NLP) since most of the state-of-the-art machine learning models\nprovide a limited explanation for the prediction. In the spectrum of an\nexplainable model, Tsetlin Machine(TM) is promising because of its capability\nof providing word-level explanation using proposition logic. However, concern\nrises over the elaborated combination of literals (propositional logic) in the\nclause that makes the model difficult for humans to comprehend, despite having\na transparent learning process. In this paper, we design a post-hoc pruning of\nclauses that eliminate the randomly placed literals in the clause thereby\nmaking the model more efficiently interpretable than the vanilla TM.\nExperiments on the publicly available YELP-HAT Dataset demonstrate that the\nproposed pruned TM's attention map aligns more with the human attention map\nthan the vanilla TM's attention map. In addition, the pairwise similarity\nmeasure also surpasses the attention map-based neural network models. In terms\nof accuracy, the proposed pruning method does not degrade the accuracy\nsignificantly but rather enhances the performance up to 4% to 9% in some test\ndata.\n","authors":["Rohan Kumar Yadav","Bimal Bhattarai","Abhik Jana","Lei Jiao","Seid Muhie Yimam"],"pdf_url":"https://arxiv.org/pdf/2411.04557v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.04556v1","updated":"2024-11-07T09:27:42Z","published":"2024-11-07T09:27:42Z","title":"Uncertainty Prediction Neural Network (UpNet): Embedding Artificial\n Neural Network in Bayesian Inversion Framework to Quantify the Uncertainty of\n Remote Sensing Retrieval","summary":" For the retrieval of large-scale vegetation biophysical parameters, the\ninversion of radiative transfer models (RTMs) is the most commonly used\napproach. In recent years, Artificial Neural Network (ANN)-based methods have\nbecome the mainstream for inverting RTMs due to their high accuracy and\ncomputational efficiency. 
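An illustrative sketch of training-free position interpolation, one of the context-extension strategies discussed in the LongEmbed entry above (arXiv:2404.12096): before computing rotary (RoPE) angles, positions are rescaled so that a longer sequence is squeezed into the range the model saw during training. The head dimension and base value below are common defaults assumed for illustration.

```python
import numpy as np

def rope_angles(positions: np.ndarray, dim: int, base: float = 10000.0) -> np.ndarray:
    inv_freq = 1.0 / (base ** (np.arange(0, dim, 2) / dim))
    return np.outer(positions, inv_freq)          # (seq_len, dim/2)

def interpolated_positions(seq_len: int, train_len: int) -> np.ndarray:
    positions = np.arange(seq_len, dtype=np.float64)
    if seq_len <= train_len:
        return positions
    return positions * (train_len / seq_len)      # squeeze into [0, train_len)

train_len, extended_len, dim = 512, 4096, 64
angles = rope_angles(interpolated_positions(extended_len, train_len), dim)
# All rotation angles stay within the range seen during training on 512-token inputs.
print(angles.shape, angles[:, 0].max() < train_len)
```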
It has been widely used in the retrieval of\nbiophysical variables (BV). However, due to the lack of the Bayesian inversion\ntheory interpretation, it faces challenges in quantifying the retrieval\nuncertainty, a crucial metric for product quality validation and downstream\napplications such as data assimilation or ecosystem carbon cycling modeling.\nThis study proved that the ANN trained with squared loss outputs the posterior\nmean, providing a rigorous foundation for its uncertainty quantification,\nregularization, and incorporation of prior information. A Bayesian theoretical\nframework was subsequently proposed for ANN-based methods. Using this\nframework, we derived a new algorithm called Uncertainty Prediction Neural\nNetwork (UpNet), which enables the simultaneous training of two ANNs to\nretrieve BV and provide retrieval uncertainty. To validate our method, we\ncompared UpNet with the standard Bayesian inference method, i.e., Markov Chain\nMonte Carlo (MCMC), in the inversion of a widely used RTM called ProSAIL for\nretrieving BVs and estimating uncertainty. The results demonstrated that the\nBVs retrieved and the uncertainties estimated by UpNet were highly consistent\nwith those from MCMC, achieving over a million-fold acceleration. These results\nindicated that UpNet has significant potential for fast retrieval and\nuncertainty quantification of BVs or other parameters with medium and\nhigh-resolution remote sensing data. Our Python implementation is available at:\nhttps://github.com/Dash-RSer/UpNet.\n","authors":["Dasheng Fan","Xihan Mu","Yongkang Lai","Donghui Xie","Guangjian Yan"],"pdf_url":"https://arxiv.org/pdf/2411.04556v1.pdf","comment":"24 pages, f figures"},{"id":"http://arxiv.org/abs/2411.02623v2","updated":"2024-11-07T09:25:28Z","published":"2024-11-04T21:31:04Z","title":"Learning to Assist Humans without Inferring Rewards","summary":" Assistive agents should make humans' lives easier. Classically, such\nassistance is studied through the lens of inverse reinforcement learning, where\nan assistive agent (e.g., a chatbot, a robot) infers a human's intention and\nthen selects actions to help the human reach that goal. This approach requires\ninferring intentions, which can be difficult in high-dimensional settings. We\nbuild upon prior work that studies assistance through the lens of empowerment:\nan assistive agent aims to maximize the influence of the human's actions such\nthat they exert a greater control over the environmental outcomes and can solve\ntasks in fewer steps. We lift the major limitation of prior work in this\narea--scalability to high-dimensional settings--with contrastive successor\nrepresentations. We formally prove that these representations estimate a\nsimilar notion of empowerment to that studied by prior work and provide a\nready-made mechanism for optimizing it. Empirically, our proposed method\noutperforms prior methods on synthetic benchmarks, and scales to Overcooked, a\ncooperative game setting. 
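One plausible reading of the UpNet idea in the entry above (arXiv:2411.04556): since a network trained with squared loss approximates the posterior mean E[y|x], a second network trained on y^2 approximates E[y^2|x], and combining them yields a posterior variance estimate. This is a hedged illustration, not necessarily the authors' exact formulation; the architecture and data are synthetic.

```python
import torch
from torch import nn

def fit(net, x, target, epochs=500, lr=1e-2):
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    for _ in range(epochs):
        opt.zero_grad()
        nn.functional.mse_loss(net(x), target).backward()
        opt.step()
    return net

torch.manual_seed(0)
x = torch.rand(2000, 1)
y = torch.sin(4 * x) + 0.1 * (1 + x) * torch.randn_like(x)   # heteroscedastic noise

make_net = lambda: nn.Sequential(nn.Linear(1, 64), nn.Tanh(), nn.Linear(64, 1))
mean_net = fit(make_net(), x, y)            # approximates E[y | x]
m2_net = fit(make_net(), x, y ** 2)         # approximates E[y^2 | x]

with torch.no_grad():
    mean = mean_net(x)
    var = torch.clamp(m2_net(x) - mean ** 2, min=0.0)  # posterior variance estimate
print(mean.shape, var.mean().item())
```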
Theoretically, our work connects ideas from\ninformation theory, neuroscience, and reinforcement learning, and charts a path\nfor representations to play a critical role in solving assistive problems.\n","authors":["Vivek Myers","Evan Ellis","Sergey Levine","Benjamin Eysenbach","Anca Dragan"],"pdf_url":"https://arxiv.org/pdf/2411.02623v2.pdf","comment":"Conference on Neural Information Processing Systems (NeurIPS), 2024"},{"id":"http://arxiv.org/abs/2411.04554v1","updated":"2024-11-07T09:24:26Z","published":"2024-11-07T09:24:26Z","title":"Peri-midFormer: Periodic Pyramid Transformer for Time Series Analysis","summary":" Time series analysis finds wide applications in fields such as weather\nforecasting, anomaly detection, and behavior recognition. Previous methods\nattempted to model temporal variations directly using 1D time series. However,\nthis has been quite challenging due to the discrete nature of data points in\ntime series and the complexity of periodic variation. In terms of periodicity,\ntaking weather and traffic data as an example, there are multi-periodic\nvariations such as yearly, monthly, weekly, and daily, etc. In order to break\nthrough the limitations of the previous methods, we decouple the implied\ncomplex periodic variations into inclusion and overlap relationships among\ndifferent level periodic components based on the observation of the\nmulti-periodicity therein and its inclusion relationships. This explicitly\nrepresents the naturally occurring pyramid-like properties in time series,\nwhere the top level is the original time series and lower levels consist of\nperiodic components with gradually shorter periods, which we call the periodic\npyramid. To further extract complex temporal variations, we introduce\nself-attention mechanism into the periodic pyramid, capturing complex periodic\nrelationships by computing attention between periodic components based on their\ninclusion, overlap, and adjacency relationships. Our proposed Peri-midFormer\ndemonstrates outstanding performance in five mainstream time series analysis\ntasks, including short- and long-term forecasting, imputation, classification,\nand anomaly detection.\n","authors":["Qiang Wu","Gechang Yao","Zhixi Feng","Shuyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2411.04554v1.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024)"},{"id":"http://arxiv.org/abs/2406.01494v2","updated":"2024-11-07T09:23:34Z","published":"2024-06-03T16:21:29Z","title":"Robust Classification by Coupling Data Mollification with Label\n Smoothing","summary":" Introducing training-time augmentations is a key technique to enhance\ngeneralization and prepare deep neural networks against test-time corruptions.\nInspired by the success of generative diffusion models, we propose a novel\napproach of coupling data mollification, in the form of image noising and\nblurring, with label smoothing to align predicted label confidences with image\ndegradation. The method is simple to implement, introduces negligible\noverheads, and can be combined with existing augmentations. 
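A minimal sketch of the training-time recipe described in the data-mollification entry above (arXiv:2406.01494): each image is degraded by a random amount, and the one-hot label is smoothed in proportion to that degradation so the target confidence matches the corruption. The exact noise schedule and the smoothing mapping used here are assumptions.

```python
import numpy as np

def mollify_and_smooth(image, label, num_classes, rng, max_smooth=0.9):
    t = rng.uniform(0.0, 1.0)                          # degradation level
    noisy = np.sqrt(1.0 - t) * image + np.sqrt(t) * rng.normal(size=image.shape)
    eps = max_smooth * t                               # more noise -> softer label
    target = np.full(num_classes, eps / num_classes)
    target[label] += 1.0 - eps
    return noisy, target

rng = np.random.default_rng(0)
img = rng.normal(size=(32, 32, 3))
x_aug, y_soft = mollify_and_smooth(img, label=2, num_classes=10, rng=rng)
print(x_aug.shape, y_soft.round(3))
```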
We demonstrate\nimproved robustness and uncertainty quantification on the corrupted image\nbenchmarks of the CIFAR and TinyImageNet datasets.\n","authors":["Markus Heinonen","Ba-Hien Tran","Michael Kampffmeyer","Maurizio Filippone"],"pdf_url":"https://arxiv.org/pdf/2406.01494v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.02545v2","updated":"2024-11-07T09:20:39Z","published":"2024-04-03T08:03:27Z","title":"Grid-Mapping Pseudo-Count Constraint for Offline Reinforcement Learning","summary":" Offline reinforcement learning learns from a static dataset without\ninteracting with environments, which ensures security and thus owns a good\napplication prospect. However, directly applying naive reinforcement learning\nalgorithm usually fails in an offline environment due to inaccurate Q value\napproximation caused by out-of-distribution (OOD) state-actions. It is an\neffective way to solve this problem by penalizing the Q-value of OOD\nstate-actions. Among the methods of punishing OOD state-actions, count-based\nmethods have achieved good results in discrete domains in a simple form.\nInspired by it, a novel pseudo-count method for continuous domains called\nGrid-Mapping Pseudo-Count method (GPC) is proposed by extending the count-based\nmethod from discrete to continuous domains. Firstly, the continuous state and\naction space are mapped to discrete space using Grid-Mapping, then the Q-values\nof OOD state-actions are constrained through pseudo-count. Secondly, the\ntheoretical proof is given to show that GPC can obtain appropriate uncertainty\nconstraints under fewer assumptions than other pseudo-count methods. Thirdly,\nGPC is combined with Soft Actor-Critic algorithm (SAC) to get a new algorithm\ncalled GPC-SAC. Lastly, experiments on D4RL datasets are given to show that\nGPC-SAC has better performance and less computational cost than other\nalgorithms that constrain the Q-value.\n","authors":["Yi Shen","Hanyan Huang"],"pdf_url":"https://arxiv.org/pdf/2404.02545v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04551v1","updated":"2024-11-07T09:18:39Z","published":"2024-11-07T09:18:39Z","title":"Measure-to-measure interpolation using Transformers","summary":" Transformers are deep neural network architectures that underpin the recent\nsuccesses of large language models. Unlike more classical architectures that\ncan be viewed as point-to-point maps, a Transformer acts as a\nmeasure-to-measure map implemented as specific interacting particle system on\nthe unit sphere: the input is the empirical measure of tokens in a prompt and\nits evolution is governed by the continuity equation. In fact, Transformers are\nnot limited to empirical measures and can in principle process any input\nmeasure. As the nature of data processed by Transformers is expanding rapidly,\nit is important to investigate their expressive power as maps from an arbitrary\nmeasure to another arbitrary measure. 
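A hedged sketch of the Grid-Mapping Pseudo-Count (GPC) idea from the entry above (arXiv:2404.02545): continuous state-action pairs are snapped to grid cells, visit counts are accumulated over the offline dataset, and rarely visited (likely out-of-distribution) cells receive a larger Q-value penalty. The bin width and the penalty form are illustrative assumptions.

```python
from collections import Counter
import numpy as np

class GridPseudoCount:
    def __init__(self, bin_width: float = 0.25):
        self.bin_width = bin_width
        self.counts = Counter()

    def _cell(self, state, action):
        sa = np.concatenate([state, action]) / self.bin_width
        return tuple(np.floor(sa).astype(int))

    def update(self, state, action):
        self.counts[self._cell(state, action)] += 1

    def penalty(self, state, action, beta: float = 1.0):
        n = self.counts[self._cell(state, action)]
        return beta / np.sqrt(n + 1.0)        # large for unseen cells

rng = np.random.default_rng(0)
gpc = GridPseudoCount()
for _ in range(10_000):                       # offline data concentrated near the origin
    gpc.update(rng.normal(scale=0.3, size=3), rng.normal(scale=0.3, size=1))

in_dist = gpc.penalty(np.zeros(3), np.zeros(1))
ood = gpc.penalty(np.full(3, 5.0), np.full(1, 5.0))
print(f"in-distribution penalty {in_dist:.3f} < OOD penalty {ood:.3f}")
```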
To that end, we provide an explicit\nchoice of parameters that allows a single Transformer to match $N$ arbitrary\ninput measures to $N$ arbitrary target measures, under the minimal assumption\nthat every pair of input-target measures can be matched by some transport map.\n","authors":["Borjan Geshkovski","Philippe Rigollet","Domènec Ruiz-Balet"],"pdf_url":"https://arxiv.org/pdf/2411.04551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04549v1","updated":"2024-11-07T09:17:50Z","published":"2024-11-07T09:17:50Z","title":"Vision Language Models are In-Context Value Learners","summary":" Predicting temporal progress from visual trajectories is important for\nintelligent robots that can learn, adapt, and improve. However, learning such\nprogress estimator, or temporal value function, across different tasks and\ndomains requires both a large amount of diverse data and methods which can\nscale and generalize. To address these challenges, we present Generative Value\nLearning (\\GVL), a universal value function estimator that leverages the world\nknowledge embedded in vision-language models (VLMs) to predict task progress.\nNaively asking a VLM to predict values for a video sequence performs poorly due\nto the strong temporal correlation between successive frames. Instead, GVL\nposes value estimation as a temporal ordering problem over shuffled video\nframes; this seemingly more challenging task encourages VLMs to more fully\nexploit their underlying semantic and temporal grounding capabilities to\ndifferentiate frames based on their perceived task progress, consequently\nproducing significantly better value predictions. Without any robot or task\nspecific training, GVL can in-context zero-shot and few-shot predict effective\nvalues for more than 300 distinct real-world tasks across diverse robot\nplatforms, including challenging bimanual manipulation tasks. Furthermore, we\ndemonstrate that GVL permits flexible multi-modal in-context learning via\nexamples from heterogeneous tasks and embodiments, such as human videos. The\ngenerality of GVL enables various downstream applications pertinent to\nvisuomotor policy learning, including dataset filtering, success detection, and\nadvantage-weighted regression -- all without any model training or finetuning.\n","authors":["Yecheng Jason Ma","Joey Hejna","Ayzaan Wahid","Chuyuan Fu","Dhruv Shah","Jacky Liang","Zhuo Xu","Sean Kirmani","Peng Xu","Danny Driess","Ted Xiao","Jonathan Tompson","Osbert Bastani","Dinesh Jayaraman","Wenhao Yu","Tingnan Zhang","Dorsa Sadigh","Fei Xia"],"pdf_url":"https://arxiv.org/pdf/2411.04549v1.pdf","comment":"Project website and demo:\n https://generative-value-learning.github.io/"},{"id":"http://arxiv.org/abs/2307.10869v3","updated":"2024-11-07T09:03:49Z","published":"2023-07-20T13:41:26Z","title":"Identifying Performance Issues in Cloud Service Systems Based on\n Relational-Temporal Features","summary":" Cloud systems are susceptible to performance issues, which may cause\nservice-level agreement violations and financial losses. In current practice,\ncrucial metrics are monitored periodically to provide insight into the\noperational status of components. Identifying performance issues is often\nformulated as an anomaly detection problem, which is tackled by analyzing each\nmetric independently. However, this approach overlooks the complex dependencies\nexisting among cloud components. 
Some graph neural network-based methods take\nboth temporal and relational information into account, however, the correlation\nviolations in the metrics that serve as indicators of underlying performance\nissues are difficult for them to identify. Furthermore, a large volume of\ncomponents in a cloud system results in a vast array of noisy metrics. This\ncomplexity renders it impractical for engineers to fully comprehend the\ncorrelations, making it challenging to identify performance issues accurately.\nTo address these limitations, we propose Identifying Performance Issues based\non Relational-Temporal Features (ISOLATE ), a learning-based approach that\nleverages both the relational and temporal features of metrics to identify\nperformance issues. In particular, it adopts a graph neural network with\nattention to characterizing the relations among metrics and extracts long-term\nand multi-scale temporal patterns using a GRU and a convolution network,\nrespectively. The learned graph attention weights can be further used to\nlocalize the correlation-violated metrics. Moreover, to relieve the impact of\nnoisy data, ISOLATE utilizes a positive unlabeled learning strategy that tags\npseudo-labels based on a small portion of confirmed negative examples.\nExtensive evaluation on both public and industrial datasets shows that ISOLATE\noutperforms all baseline models with 0.945 F1-score and 0.920 Hit rate@3.\n","authors":["Wenwei Gu","Jinyang Liu","Zhuangbin Chen","Jianping Zhang","Yuxin Su","Jiazhen Gu","Cong Feng","Zengyin Yang","Yongqiang Yang","Michael Lyu"],"pdf_url":"https://arxiv.org/pdf/2307.10869v3.pdf","comment":"Accepted in ACM Transactions on Software Engineering and Methodology\n (TOSEM)"},{"id":"http://arxiv.org/abs/2404.04940v2","updated":"2024-11-07T08:59:23Z","published":"2024-04-07T12:25:03Z","title":"Fuzzy K-Means Clustering without Cluster Centroids","summary":" Fuzzy K-Means clustering is a critical technique in unsupervised data\nanalysis. Unlike traditional hard clustering algorithms such as K-Means, it\nallows data points to belong to multiple clusters with varying degrees of\nmembership, determined through iterative optimization to establish optimal\ncluster centers and memberships, thereby achieving fuzzy partitioning of data.\nHowever, the performance of popular Fuzzy K-Means algorithms is sensitive to\nthe selection of initial cluster centroids and is also affected by noise when\nupdating mean cluster centroids. To address these challenges, this paper\nproposes a novel Fuzzy \\textit{K}-Means clustering algorithm that entirely\neliminates the reliance on cluster centroids, obtaining membership metrics\nsolely through distance matrix computation. This innovation enhances\nflexibility in distance measurement between sample points, thus improving the\nalgorithm's performance and robustness. The paper also establishes theoretical\nconnections between the proposed model and popular Fuzzy K-Means clustering\ntechniques. 
Experimental results on several real datasets demonstrate the\neffectiveness of the algorithm.\n","authors":["Yichen Bao","Han Lu","Quanxue Gao"],"pdf_url":"https://arxiv.org/pdf/2404.04940v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03701v4","updated":"2024-11-07T08:50:25Z","published":"2024-04-04T00:49:05Z","title":"Predictive Analytics of Varieties of Potatoes","summary":" We explore the application of machine learning algorithms specifically to\nenhance the selection process of Russet potato clones in breeding trials by\npredicting their suitability for advancement. This study addresses the\nchallenge of efficiently identifying high-yield, disease-resistant, and\nclimate-resilient potato varieties that meet processing industry standards.\nLeveraging manually collected data from trials in the state of Oregon, we\ninvestigate the potential of a wide variety of state-of-the-art binary\nclassification models. The dataset includes 1086 clones, with data on 38\nattributes recorded for each clone, focusing on yield, size, appearance, and\nfrying characteristics, with several control varieties planted consistently\nacross four Oregon regions from 2013-2021. We conduct a comprehensive analysis\nof the dataset that includes preprocessing, feature engineering, and imputation\nto address missing values. We focus on several key metrics such as accuracy,\nF1-score, and Matthews correlation coefficient (MCC) for model evaluation. The\ntop-performing models, namely a neural network classifier (Neural Net),\nhistogram-based gradient boosting classifier (HGBC), and a support vector\nmachine classifier (SVM), demonstrate consistent and significant results. To\nfurther validate our findings, we conduct a simulation study. By simulating\ndifferent data-generating scenarios, we assess model robustness and performance\nthrough true positive, true negative, false positive, and false negative\ndistributions, area under the receiver operating characteristic curve (AUC-ROC)\nand MCC. The simulation results highlight that non-linear models like SVM and\nHGBC consistently show higher AUC-ROC and MCC than logistic regression (LR),\nthus outperforming the traditional linear model across various distributions,\nand emphasizing the importance of model selection and tuning in agricultural\ntrials.\n","authors":["Fabiana Ferracina","Bala Krishnamoorthy","Mahantesh Halappanavar","Shengwei Hu","Vidyasagar Sathuvalli"],"pdf_url":"https://arxiv.org/pdf/2404.03701v4.pdf","comment":"Minor revision; to appear in Crop Sciences"},{"id":"http://arxiv.org/abs/2411.04534v1","updated":"2024-11-07T08:48:32Z","published":"2024-11-07T08:48:32Z","title":"Hypercube Policy Regularization Framework for Offline Reinforcement\n Learning","summary":" Offline reinforcement learning has received extensive attention from scholars\nbecause it avoids the interaction between the agent and the environment by\nlearning a policy through a static dataset. However, general reinforcement\nlearning methods cannot get satisfactory results in offline reinforcement\nlearning due to the out-of-distribution state actions that the dataset cannot\ncover during training. To solve this problem, the policy regularization method\nthat tries to directly clone policies used in static datasets has received\nnumerous studies due to its simplicity and effectiveness. However, policy\nconstraint methods make the agent choose the corresponding actions in the\nstatic dataset. 
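A small, self-contained illustration of the evaluation protocol described in the potato-breeding entry above (arXiv:2404.03701): compare a linear baseline against non-linear classifiers using accuracy, F1-score, and the Matthews correlation coefficient (MCC). The synthetic data below merely stand in for the 38 recorded attributes.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=1086, n_features=38, n_informative=10,
                           weights=[0.8, 0.2], random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

models = {
    "LR": LogisticRegression(max_iter=2000),
    "SVM": SVC(),
    "HGBC": HistGradientBoostingClassifier(random_state=0),
}
for name, model in models.items():
    pred = model.fit(X_tr, y_tr).predict(X_te)
    print(f"{name:5s} acc={accuracy_score(y_te, pred):.3f} "
          f"f1={f1_score(y_te, pred):.3f} mcc={matthews_corrcoef(y_te, pred):.3f}")
```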
This type of constraint is usually over-conservative, which\nresults in suboptimal policies, especially in low-quality static datasets. In\nthis paper, a hypercube policy regularization framework is proposed, this\nmethod alleviates the constraints of policy constraint methods by allowing the\nagent to explore the actions corresponding to similar states in the static\ndataset, which increases the effectiveness of algorithms in low-quality\ndatasets. It was also theoretically demonstrated that the hypercube policy\nregularization framework can effectively improve the performance of original\nalgorithms. In addition, the hypercube policy regularization framework is\ncombined with TD3-BC and Diffusion-QL for experiments on D4RL datasets which\nare called TD3-BC-C and Diffusion-QL-C. The experimental results of the score\ndemonstrate that TD3-BC-C and Diffusion-QL-C perform better than\nstate-of-the-art algorithms like IQL, CQL, TD3-BC and Diffusion-QL in most D4RL\nenvironments in approximate time.\n","authors":["Yi Shen","Hanyan Huang"],"pdf_url":"https://arxiv.org/pdf/2411.04534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04533v1","updated":"2024-11-07T08:43:42Z","published":"2024-11-07T08:43:42Z","title":"Neural Fingerprints for Adversarial Attack Detection","summary":" Deep learning models for image classification have become standard tools in\nrecent years. A well known vulnerability of these models is their\nsusceptibility to adversarial examples. These are generated by slightly\naltering an image of a certain class in a way that is imperceptible to humans\nbut causes the model to classify it wrongly as another class. Many algorithms\nhave been proposed to address this problem, falling generally into one of two\ncategories: (i) building robust classifiers (ii) directly detecting attacked\nimages. Despite the good performance of these detectors, we argue that in a\nwhite-box setting, where the attacker knows the configuration and weights of\nthe network and the detector, they can overcome the detector by running many\nexamples on a local copy, and sending only those that were not detected to the\nactual model. This problem is common in security applications where even a very\ngood model is not sufficient to ensure safety. In this paper we propose to\novercome this inherent limitation of any static defence with randomization. To\ndo so, one must generate a very large family of detectors with consistent\nperformance, and select one or more of them randomly for each input. For the\nindividual detectors, we suggest the method of neural fingerprints. In the\ntraining phase, for each class we repeatedly sample a tiny random subset of\nneurons from certain layers of the network, and if their average is\nsufficiently different between clean and attacked images of the focal class\nthey are considered a fingerprint and added to the detector bank. During test\ntime, we sample fingerprints from the bank associated with the label predicted\nby the model, and detect attacks using a likelihood ratio test. We evaluate our\ndetectors on ImageNet with different attack methods and model architectures,\nand show near-perfect detection with low rates of false detection.\n","authors":["Haim Fisher","Moni Shahar","Yehezkel S. 
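A hedged, synthetic-data sketch of the neural-fingerprint idea described in the entry above (arXiv:2411.04533): repeatedly sample tiny subsets of neurons, keep the subsets whose average activation separates clean from attacked examples of a class, and score new inputs with a likelihood-ratio test over the selected fingerprints. The thresholds and the Gaussian assumption are illustrative choices.

```python
import numpy as np

rng = np.random.default_rng(0)

def build_bank(clean, attacked, n_trials=200, subset_size=5, min_gap=1.0):
    bank = []
    for _ in range(n_trials):
        idx = rng.choice(clean.shape[1], size=subset_size, replace=False)
        c, a = clean[:, idx].mean(axis=1), attacked[:, idx].mean(axis=1)
        pooled_std = np.sqrt((c.var() + a.var()) / 2) + 1e-8
        if abs(c.mean() - a.mean()) / pooled_std > min_gap:     # separates the two sets
            bank.append((idx, c.mean(), a.mean(), pooled_std))
    return bank

def attack_score(activation, bank):
    """Sum of per-fingerprint Gaussian log-likelihood ratios (attacked vs clean)."""
    score = 0.0
    for idx, mu_c, mu_a, sd in bank:
        v = activation[idx].mean()
        score += ((v - mu_c) ** 2 - (v - mu_a) ** 2) / (2 * sd ** 2)
    return score          # > 0 favours "attacked"

# Synthetic activations for one class: the attack shifts a handful of neurons.
n_neurons = 100
clean = rng.normal(size=(500, n_neurons))
attacked = rng.normal(size=(500, n_neurons))
attacked[:, :10] += 1.5

bank = build_bank(clean, attacked)
print(len(bank), attack_score(clean[0], bank) > 0, attack_score(attacked[0], bank) > 0)
```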
Resheff"],"pdf_url":"https://arxiv.org/pdf/2411.04533v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2411.04532v1","updated":"2024-11-07T08:41:13Z","published":"2024-11-07T08:41:13Z","title":"Real-time stress detection on social network posts using big data\n technology","summary":" In the context of modern life, particularly in Industry 4.0 within the online\nspace, emotions and moods are frequently conveyed through social media posts.\nThe trend of sharing stories, thoughts, and feelings on these platforms\ngenerates a vast and promising data source for Big Data. This creates both a\nchallenge and an opportunity for research in applying technology to develop\nmore automated and accurate methods for detecting stress in social media users.\nIn this study, we developed a real-time system for stress detection in online\nposts, using the \"Dreaddit: A Reddit Dataset for Stress Analysis in Social\nMedia,\" which comprises 187,444 posts across five different Reddit domains.\nEach domain contains texts with both stressful and non-stressful content,\nshowcasing various expressions of stress. A labeled dataset of 3,553 lines was\ncreated for training. Apache Kafka, PySpark, and AirFlow were utilized to build\nand deploy the model. Logistic Regression yielded the best results for new\nstreaming data, achieving 69,39% for measuring accuracy and 68,97 for measuring\nF1-scores.\n","authors":["Hai-Yen Phan Nguyen","Phi-Lan Ly","Duc-Manh Le","Trong-Hop Do"],"pdf_url":"https://arxiv.org/pdf/2411.04532v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.17382v2","updated":"2024-11-07T08:34:27Z","published":"2024-05-27T17:38:33Z","title":"ReMoDetect: Reward Models Recognize Aligned LLM's Generations","summary":" The remarkable capabilities and easy accessibility of large language models\n(LLMs) have significantly increased societal risks (e.g., fake news\ngeneration), necessitating the development of LLM-generated text (LGT)\ndetection methods for safe usage. However, detecting LGTs is challenging due to\nthe vast number of LLMs, making it impractical to account for each LLM\nindividually; hence, it is crucial to identify the common characteristics\nshared by these models. In this paper, we draw attention to a common feature of\nrecent powerful LLMs, namely the alignment training, i.e., training LLMs to\ngenerate human-preferable texts. Our key finding is that as these aligned LLMs\nare trained to maximize the human preferences, they generate texts with higher\nestimated preferences even than human-written texts; thus, such texts are\neasily detected by using the reward model (i.e., an LLM trained to model human\npreference distribution). Based on this finding, we propose two training\nschemes to further improve the detection ability of the reward model, namely\n(i) continual preference fine-tuning to make the reward model prefer aligned\nLGTs even further and (ii) reward modeling of Human/LLM mixed texts (a\nrephrased texts from human-written texts using aligned LLMs), which serves as a\nmedian preference text corpus between LGTs and human-written texts to learn the\ndecision boundary better. We provide an extensive evaluation by considering six\ntext domains across twelve aligned LLMs, where our method demonstrates\nstate-of-the-art results. 
Code is available at\nhttps://github.com/hyunseoklee-ai/ReMoDetect.\n","authors":["Hyunseok Lee","Jihoon Tack","Jinwoo Shin"],"pdf_url":"https://arxiv.org/pdf/2405.17382v2.pdf","comment":"Published as a conference proceeding for NeurIPS 2024"},{"id":"http://arxiv.org/abs/2401.00828v4","updated":"2024-11-07T08:29:41Z","published":"2024-01-01T17:56:24Z","title":"Multi-Lattice Sampling of Quantum Field Theories via Neural\n Operator-based Flows","summary":" We consider the problem of sampling lattice field configurations on a lattice\nfrom the Boltzmann distribution corresponding to some action. Since such\ndensities arise as approximationw of an underlying functional density, we frame\nthe task as an instance of operator learning. We propose to approximate a\ntime-dependent neural operator whose time integral provides a mapping between\nthe functional distributions of the free and target theories. Once a particular\nlattice is chosen, the neural operator can be discretized to a\nfinite-dimensional, time-dependent vector field which in turn induces a\ncontinuous normalizing flow between finite dimensional distributions over the\nchosen lattice. This flow can then be trained to be a diffeormorphism between\nthe discretized free and target theories on the chosen lattice, and, by\nconstruction, can be evaluated on different discretizations of spacetime. We\nexperimentally validate the proposal on the 2-dimensional $\\phi^4$-theory to\nexplore to what extent such operator-based flow architectures generalize to\nlattice sizes they were not trained on, and show that pretraining on smaller\nlattices can lead to a speedup over training directly on the target lattice\nsize.\n","authors":["Bálint Máté","François Fleuret"],"pdf_url":"https://arxiv.org/pdf/2401.00828v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04517v1","updated":"2024-11-07T08:19:39Z","published":"2024-11-07T08:19:39Z","title":"Continuous Sign Language Recognition System using Deep Learning with\n MediaPipe Holistic","summary":" Sign languages are the language of hearing-impaired people who use visuals\nlike the hand, facial, and body movements for communication. There are\ndifferent signs and gestures representing alphabets, words, and phrases.\nNowadays approximately 300 sign languages are being practiced worldwide such as\nAmerican Sign Language (ASL), Chinese Sign Language (CSL), Indian Sign Language\n(ISL), and many more. Sign languages are dependent on the vocal language of a\nplace. Unlike vocal or spoken languages, there are no helping words in sign\nlanguage like is, am, are, was, were, will, be, etc. As only a limited\npopulation is well-versed in sign language, this lack of familiarity of sign\nlanguage hinders hearing-impaired people from communicating freely and easily\nwith everyone. This issue can be addressed by a sign language recognition (SLR)\nsystem which has the capability to translate the sign language into vocal\nlanguage. In this paper, a continuous SLR system is proposed using a deep\nlearning model employing Long Short-Term Memory (LSTM), trained and tested on\nan ISL primary dataset. 
This dataset is created using MediaPipe Holistic\npipeline for tracking face, hand, and body movements and collecting landmarks.\nThe system recognizes the signs and gestures in real-time with 88.23% accuracy.\n","authors":["Sharvani Srivastava","Sudhakar Singh"," Pooja","Shiv Prakash"],"pdf_url":"https://arxiv.org/pdf/2411.04517v1.pdf","comment":"14 pages, 4 figures, Wireless Pers Commun"},{"id":"http://arxiv.org/abs/2411.03450v2","updated":"2024-11-07T08:10:41Z","published":"2024-11-05T19:07:26Z","title":"Fourier Analysis of Variational Quantum Circuits for Supervised Learning","summary":" VQC can be understood through the lens of Fourier analysis. It is already\nwell-known that the function space represented by any circuit architecture can\nbe described through a truncated Fourier sum. We show that the spectrum\navailable to that truncated Fourier sum is not entirely determined by the\nencoding gates of the circuit, since the variational part of the circuit can\nconstrain certain coefficients to zero, effectively removing that frequency\nfrom the spectrum. To the best of our knowledge, we give the first description\nof the functional dependence of the Fourier coefficients on the variational\nparameters as trigonometric polynomials. This allows us to provide an algorithm\nwhich computes the exact spectrum of any given circuit and the corresponding\nFourier coefficients. Finally, we demonstrate that by comparing the Fourier\ntransform of the dataset to the available spectra, it is possible to predict\nwhich VQC out of a given list of choices will be able to best fit the data.\n","authors":["Marco Wiedmann","Maniraman Periyasamy","Daniel D. Scherer"],"pdf_url":"https://arxiv.org/pdf/2411.03450v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04512v1","updated":"2024-11-07T08:10:28Z","published":"2024-11-07T08:10:28Z","title":"Normalized Space Alignment: A Versatile Metric for Representation\n Analysis","summary":" We introduce a manifold analysis technique for neural network\nrepresentations. Normalized Space Alignment (NSA) compares pairwise distances\nbetween two point clouds derived from the same source and having the same size,\nwhile potentially possessing differing dimensionalities. NSA can act as both an\nanalytical tool and a differentiable loss function, providing a robust means of\ncomparing and aligning representations across different layers and models. It\nsatisfies the criteria necessary for both a similarity metric and a neural\nnetwork loss function. We showcase NSA's versatility by illustrating its\nutility as a representation space analysis metric, a structure-preserving loss\nfunction, and a robustness analysis tool. 
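A skeletal illustration of the sign-recognition setup described in the entry above (arXiv:2411.04517): an LSTM classifier over fixed-length sequences of landmark coordinates, such as those produced by a pipeline like MediaPipe Holistic. The feature size, sequence length, framework, and number of sign classes are placeholders; no real landmarks are generated here.

```python
import torch
from torch import nn

class SignLSTM(nn.Module):
    def __init__(self, n_landmark_features=1662, hidden=128, n_signs=10):
        super().__init__()
        self.lstm = nn.LSTM(n_landmark_features, hidden, batch_first=True)
        self.head = nn.Linear(hidden, n_signs)

    def forward(self, x):                 # x: (batch, frames, features)
        _, (h_n, _) = self.lstm(x)
        return self.head(h_n[-1])         # logits over sign classes

model = SignLSTM()
dummy_clip = torch.randn(4, 30, 1662)     # 4 clips, 30 frames of landmark vectors
print(model(dummy_clip).shape)            # torch.Size([4, 10])
```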
NSA is not only computationally\nefficient but it can also approximate the global structural discrepancy during\nmini-batching, facilitating its use in a wide variety of neural network\ntraining paradigms.\n","authors":["Danish Ebadulla","Aditya Gulati","Ambuj Singh"],"pdf_url":"https://arxiv.org/pdf/2411.04512v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2411.04511v1","updated":"2024-11-07T08:08:12Z","published":"2024-11-07T08:08:12Z","title":"Improve the Fitting Accuracy of Deep Learning for the Nonlinear\n Schrödinger Equation Using Linear Feature Decoupling Method","summary":" We utilize the Feature Decoupling Distributed (FDD) method to enhance the\ncapability of deep learning to fit the Nonlinear Schrodinger Equation (NLSE),\nsignificantly reducing the NLSE loss compared to non decoupling model.\n","authors":["Yunfan Zhang","Zekun Niu","Minghui Shi","Weisheng Hu","Lilin Yi"],"pdf_url":"https://arxiv.org/pdf/2411.04511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01635v4","updated":"2024-11-07T07:52:35Z","published":"2024-06-30T10:53:40Z","title":"Commute Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have shown remarkable success in learning from\ngraph-structured data. However, their application to directed graphs (digraphs)\npresents unique challenges, primarily due to the inherent asymmetry in node\nrelationships. Traditional GNNs are adept at capturing unidirectional relations\nbut fall short in encoding the mutual path dependencies between nodes, such as\nasymmetrical shortest paths typically found in digraphs. Recognizing this gap,\nwe introduce Commute Graph Neural Networks (CGNN), an approach that seamlessly\nintegrates node-wise commute time into the message passing scheme. The\ncornerstone of CGNN is an efficient method for computing commute time using a\nnewly formulated digraph Laplacian. Commute time is then integrated into the\nneighborhood aggregation process, with neighbor contributions weighted\naccording to their respective commute time to the central node in each layer.\nIt enables CGNN to directly capture the mutual, asymmetric relationships in\ndigraphs. Extensive experiments confirm the superior performance of CGNN.\n","authors":["Wei Zhuo","Guang Tan"],"pdf_url":"https://arxiv.org/pdf/2407.01635v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04493v1","updated":"2024-11-07T07:41:04Z","published":"2024-11-07T07:41:04Z","title":"Synergy-Guided Regional Supervision of Pseudo Labels for Semi-Supervised\n Medical Image Segmentation","summary":" Semi-supervised learning has received considerable attention for its\npotential to leverage abundant unlabeled data to enhance model robustness.\nPseudo labeling is a widely used strategy in semi supervised learning. However,\nexisting methods often suffer from noise contamination, which can undermine\nmodel performance. To tackle this challenge, we introduce a novel\nSynergy-Guided Regional Supervision of Pseudo Labels (SGRS-Net) framework.\nBuilt upon the mean teacher network, we employ a Mix Augmentation module to\nenhance the unlabeled data. By evaluating the synergy before and after\naugmentation, we strategically partition the pseudo labels into distinct\nregions. Additionally, we introduce a Region Loss Evaluation module to assess\nthe loss across each delineated area. 
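A hedged sketch of the kind of comparison the NSA entry above describes (arXiv:2411.04512): two representations of the same points, possibly with different dimensionalities, are compared through their pairwise-distance matrices. The specific normalization used here (dividing by the mean distance) is an assumption made so that the discrepancy is scale-invariant; it is also differentiable, so it can double as a loss.

```python
import torch

def normalized_distance_discrepancy(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """a, b: (n_points, dim_a) and (n_points, dim_b) views of the same points."""
    da, db = torch.cdist(a, a), torch.cdist(b, b)
    da = da / (da.mean() + 1e-8)            # remove overall scale
    db = db / (db.mean() + 1e-8)
    return ((da - db) ** 2).mean()

torch.manual_seed(0)
x = torch.randn(100, 32)
proj = x @ torch.randn(32, 8)               # lower-dimensional view of the same points
print(normalized_distance_discrepancy(x, proj).item())
print(normalized_distance_discrepancy(x, 3.0 * x).item())   # ~0: scale is ignored
```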
Extensive experiments conducted on the LA\ndataset have demonstrated superior performance over state-of-the-art\ntechniques, underscoring the efficiency and practicality of our framework.\n","authors":["Tao Wang","Xinlin Zhang","Yuanbin Chen","Yuanbo Zhou","Longxuan Zhao","Tao Tan","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2411.04493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04491v1","updated":"2024-11-07T07:37:34Z","published":"2024-11-07T07:37:34Z","title":"Series-to-Series Diffusion Bridge Model","summary":" Diffusion models have risen to prominence in time series forecasting,\nshowcasing their robust capability to model complex data distributions.\nHowever, their effectiveness in deterministic predictions is often constrained\nby instability arising from their inherent stochasticity. In this paper, we\nrevisit time series diffusion models and present a comprehensive framework that\nencompasses most existing diffusion-based methods. Building on this theoretical\nfoundation, we propose a novel diffusion-based time series forecasting model,\nthe Series-to-Series Diffusion Bridge Model ($\\mathrm{S^2DBM}$), which\nleverages the Brownian Bridge process to reduce randomness in reverse\nestimations and improves accuracy by incorporating informative priors and\nconditions derived from historical time series data. Experimental results\ndemonstrate that $\\mathrm{S^2DBM}$ delivers superior performance in\npoint-to-point forecasting and competes effectively with other diffusion-based\nmodels in probabilistic forecasting.\n","authors":["Hao Yang","Zhanbo Feng","Feng Zhou","Robert C Qiu","Zenan Ling"],"pdf_url":"https://arxiv.org/pdf/2411.04491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14979v3","updated":"2024-11-07T07:25:04Z","published":"2024-10-19T05:01:56Z","title":"Do Large Language Models Truly Grasp Mathematics? An Empirical\n Exploration From A Psychological Perspective","summary":" Despite their proficiency in math tasks, the mechanisms underlying LLMs'\nmathematical reasoning abilities remain a subject of debate. Recent studies\nsuggest that chain-of-thought (CoT) prompts can bolster mathematical reasoning\nby encouraging LLMs to employ human-like logical reasoning (System 2), enabling\nthem to excel on the Cognitive Reflection Test (CRT). To assess whether LLMs\ngenuinely possess System 2-like logical reasoning, we introduced targeted\nmodifications to CRT problems. Our findings reveal that, despite the use of CoT\nprompts, mainstream LLMs, including the latest o1-preview model, continue to\nexhibit a significant error rate. Further analysis indicates that they\npredominantly rely on System 1-like intuitive reasoning and pattern matching\nderived from training data, rather than demonstrating mastery of mathematical\nthinking. 
This discovery challenges the prevailing notion that LLMs possess\ngenuine logical reasoning abilities and that CoT can enhance them.\nConsequently, this work may temper overly optimistic projections regarding\nLLMs' advancement toward artificial general intelligence.\n","authors":["Wei Xie","Shuoyoucheng Ma","Zhenhua Wang","Enze Wang","Kai Chen","Xiaobing Sun","Baosheng Wang"],"pdf_url":"https://arxiv.org/pdf/2410.14979v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04476v1","updated":"2024-11-07T07:07:34Z","published":"2024-11-07T07:07:34Z","title":"LLM-R: A Framework for Domain-Adaptive Maintenance Scheme Generation\n Combining Hierarchical Agents and RAG","summary":" The increasing use of smart devices has emphasized the critical role of\nmaintenance in production activities. Interactive Electronic Technical Manuals\n(IETMs) are vital tools that support the maintenance of smart equipment.\nHowever, traditional IETMs face challenges such as transitioning from Graphical\nUser Interfaces (GUIs) to natural Language User Interfaces (LUIs) and managing\ncomplex logical relationships. Additionally, they must meet the current demands\nfor higher intelligence. This paper proposes a Maintenance Scheme Generation\nMethod based on Large Language Models (LLM-R). The proposed method includes\nseveral key innovations: We propose the Low Rank Adaptation-Knowledge Retention\n(LORA-KR) loss technology to proportionally adjust mixed maintenance data for\nfine-tuning the LLM. This method prevents knowledge conflicts caused by mixed\ndata, improving the model's adaptability and reasoning ability in specific\nmaintenance domains, Besides, Hierarchical Task-Based Agent and\nInstruction-level Retrieval-Augmented Generation (RAG) technologies are adopted\nto optimize the generation steps and mitigate the phenomenon of hallucination\ncaused by the model's Inability to access contextual information. This\nenhancement improves the model's flexibility and accuracy in handling known or\nunknown maintenance objects and maintenance scheme scenarios. To validate the\nproposed method's effectiveness in maintenance tasks, a maintenance scheme\ndataset was constructed using objects from different fields. The experimental\nresults show that the accuracy of the maintenance schemes generated by the\nproposed method reached 91.59%, indicating which improvement enhances the\nintelligence of maintenance schemes and introduces novel technical approaches\nfor equipment maintenance.\n","authors":["Laifa Tao","Qixuan Huang","Xianjun Wu","Weiwei Zhang","Yunlong Wu","Bin Li","Chen Lu","Xingshuo Hai"],"pdf_url":"https://arxiv.org/pdf/2411.04476v1.pdf","comment":"30 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.12452v2","updated":"2024-11-07T06:41:07Z","published":"2024-05-21T02:06:40Z","title":"Prompt-Based Spatio-Temporal Graph Transfer Learning","summary":" Spatio-temporal graph neural networks have proven efficacy in capturing\ncomplex dependencies for urban computing tasks such as forecasting and kriging.\nYet, their performance is constrained by the reliance on extensive data for\ntraining on a specific task, thereby limiting their adaptability to new urban\ndomains with varied task demands. Although transfer learning has been proposed\nto remedy this problem by leveraging knowledge across domains, the cross-task\ngeneralization still remains under-explored in spatio-temporal graph transfer\nlearning due to the lack of a unified framework. 
To bridge the gap, we propose\nSpatio-Temporal Graph Prompting (STGP), a prompt-based framework capable of\nadapting to multi-diverse tasks in a data-scarce domain. Specifically, we first\nunify different tasks into a single template and introduce a task-agnostic\nnetwork architecture that aligns with this template. This approach enables\ncapturing dependencies shared across tasks. Furthermore, we employ learnable\nprompts to achieve domain and task transfer in a two-stage prompting pipeline,\nfacilitating the prompts to effectively capture domain knowledge and\ntask-specific properties. Our extensive experiments demonstrate that STGP\noutperforms state-of-the-art baselines in three tasks-forecasting, kriging, and\nextrapolation-achieving an improvement of up to 10.7%.\n","authors":["Junfeng Hu","Xu Liu","Zhencheng Fan","Yifang Yin","Shili Xiang","Savitha Ramasamy","Roger Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2405.12452v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04466v1","updated":"2024-11-07T06:27:12Z","published":"2024-11-07T06:27:12Z","title":"Enabling Adaptive Agent Training in Open-Ended Simulators by Targeting\n Diversity","summary":" The wider application of end-to-end learning methods to embodied\ndecision-making domains remains bottlenecked by their reliance on a\nsuperabundance of training data representative of the target domain.\nMeta-reinforcement learning (meta-RL) approaches abandon the aim of zero-shot\ngeneralization--the goal of standard reinforcement learning (RL)--in favor of\nfew-shot adaptation, and thus hold promise for bridging larger generalization\ngaps. While learning this meta-level adaptive behavior still requires\nsubstantial data, efficient environment simulators approaching real-world\ncomplexity are growing in prevalence. Even so, hand-designing sufficiently\ndiverse and numerous simulated training tasks for these complex domains is\nprohibitively labor-intensive. Domain randomization (DR) and procedural\ngeneration (PG), offered as solutions to this problem, require simulators to\npossess carefully-defined parameters which directly translate to meaningful\ntask diversity--a similarly prohibitive assumption. In this work, we present\nDIVA, an evolutionary approach for generating diverse training tasks in such\ncomplex, open-ended simulators. Like unsupervised environment design (UED)\nmethods, DIVA can be applied to arbitrary parameterizations, but can\nadditionally incorporate realistically-available domain knowledge--thus\ninheriting the flexibility and generality of UED, and the supervised structure\nembedded in well-designed simulators exploited by DR and PG. Our empirical\nresults showcase DIVA's unique ability to overcome complex parameterizations\nand successfully train adaptive agent behavior, far outperforming competitive\nbaselines from prior literature. 
These findings highlight the potential of such\nsemi-supervised environment design (SSED) approaches, of which DIVA is the\nfirst humble constituent, to enable training in realistic simulated domains,\nand produce more robust and capable adaptive agents.\n","authors":["Robby Costales","Stefanos Nikolaidis"],"pdf_url":"https://arxiv.org/pdf/2411.04466v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.04459v1","updated":"2024-11-07T06:12:38Z","published":"2024-11-07T06:12:38Z","title":"GPT-Guided Monte Carlo Tree Search for Symbolic Regression in Financial\n Fraud Detection","summary":" With the increasing number of financial services available online, the rate\nof financial fraud has also been increasing. The traffic and transaction rates\non the internet have increased considerably, leading to a need for fast\ndecision-making. Financial institutions also have stringent regulations that\noften require transparency and explainability of the decision-making process.\nHowever, most state-of-the-art algorithms currently used in the industry are\nhighly parameterized black-box models that rely on complex computations to\ngenerate a score. These algorithms are inherently slow and lack the\nexplainability and speed of traditional rule-based learners. This work\nintroduces SR-MCTS (Symbolic Regression MCTS), which utilizes a foundational\nGPT model to guide the MCTS, significantly enhancing its convergence speed and\nthe quality of the generated expressions which are further extracted to rules.\nOur experiments show that SR-MCTS can detect fraud more efficiently than widely\nused methods in the industry while providing substantial insights into the\ndecision-making process.\n","authors":["Prashank Kadam"],"pdf_url":"https://arxiv.org/pdf/2411.04459v1.pdf","comment":"ACM International Conference on Information and Knowledge Management\n 2024 RAG - Enterprise"},{"id":"http://arxiv.org/abs/2411.04453v1","updated":"2024-11-07T06:01:12Z","published":"2024-11-07T06:01:12Z","title":"Comparing Fairness of Generative Mobility Models","summary":" This work examines the fairness of generative mobility models, addressing the\noften overlooked dimension of equity in model performance across geographic\nregions. Predictive models built on crowd flow data are instrumental in\nunderstanding urban structures and movement patterns; however, they risk\nembedding biases, particularly in spatiotemporal contexts where model\nperformance may reflect and reinforce existing inequities tied to geographic\ndistribution. We propose a novel framework for assessing fairness by measuring\nthe utility and equity of generated traces. Utility is assessed via the Common\nPart of Commuters (CPC), a similarity metric comparing generated and real\nmobility flows, while fairness is evaluated using demographic parity. By\nreformulating demographic parity to reflect the difference in CPC distribution\nbetween two groups, our analysis reveals disparities in how various models\nencode biases present in the underlying data. We utilized four models (Gravity,\nRadiation, Deep Gravity, and Non-linear Gravity) and our results indicate that\ntraditional gravity and radiation models produce fairer outcomes, although Deep\nGravity achieves higher CPC. This disparity underscores a trade-off between\nmodel accuracy and equity, with the feature-rich Deep Gravity model amplifying\npre-existing biases in community representations. 
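A worked illustration of the two quantities used in the mobility-fairness entry above (arXiv:2411.04453): the Common Part of Commuters (CPC) between generated and real flows, and a demographic-parity-style gap taken here as the difference in mean CPC between two groups of regions. The grouping and the flow matrices are synthetic placeholders.

```python
import numpy as np

def cpc(generated: np.ndarray, real: np.ndarray) -> float:
    """Common Part of Commuters between two non-negative flow matrices."""
    return 2.0 * np.minimum(generated, real).sum() / (generated.sum() + real.sum())

rng = np.random.default_rng(0)
real_flows = rng.poisson(20, size=(40, 10, 10))              # 40 regions, 10x10 OD flows
noise = np.array([2 if i < 20 else 8 for i in range(40)])    # the model fits group B worse
gen_flows = np.clip(real_flows + rng.normal(0, noise[:, None, None], (40, 10, 10)), 0, None)

cpc_per_region = np.array([cpc(g, r) for g, r in zip(gen_flows, real_flows)])
group_a, group_b = cpc_per_region[:20], cpc_per_region[20:]
print(f"utility (mean CPC): {cpc_per_region.mean():.3f}")
print(f"fairness gap |mean CPC_A - mean CPC_B|: {abs(group_a.mean() - group_b.mean()):.3f}")
```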
Our findings emphasize the\nimportance of integrating fairness metrics in mobility modeling to avoid\nperpetuating inequities.\n","authors":["Daniel Wang","Jack McFarland","Afra Mashhadi","Ekin Ugurel"],"pdf_url":"https://arxiv.org/pdf/2411.04453v1.pdf","comment":"2 pages, Accepted at the Network Mobility (NetMob) 2024 conference"},{"id":"http://arxiv.org/abs/2409.00588v2","updated":"2024-11-07T05:58:27Z","published":"2024-09-01T02:47:50Z","title":"Diffusion Policy Policy Optimization","summary":" We introduce Diffusion Policy Policy Optimization, DPPO, an algorithmic\nframework including best practices for fine-tuning diffusion-based policies\n(e.g. Diffusion Policy) in continuous control and robot learning tasks using\nthe policy gradient (PG) method from reinforcement learning (RL). PG methods\nare ubiquitous in training RL policies with other policy parameterizations;\nnevertheless, they had been conjectured to be less efficient for\ndiffusion-based policies. Surprisingly, we show that DPPO achieves the\nstrongest overall performance and efficiency for fine-tuning in common\nbenchmarks compared to other RL methods for diffusion-based policies and also\ncompared to PG fine-tuning of other policy parameterizations. Through\nexperimental investigation, we find that DPPO takes advantage of unique\nsynergies between RL fine-tuning and the diffusion parameterization, leading to\nstructured and on-manifold exploration, stable training, and strong policy\nrobustness. We further demonstrate the strengths of DPPO in a range of\nrealistic settings, including simulated robotic tasks with pixel observations,\nand via zero-shot deployment of simulation-trained policies on robot hardware\nin a long-horizon, multi-stage manipulation task. Website with code:\ndiffusion-ppo.github.io\n","authors":["Allen Z. Ren","Justin Lidard","Lars L. Ankile","Anthony Simeonov","Pulkit Agrawal","Anirudha Majumdar","Benjamin Burchfiel","Hongkai Dai","Max Simchowitz"],"pdf_url":"https://arxiv.org/pdf/2409.00588v2.pdf","comment":"Website: diffusion-ppo.github.io"},{"id":"http://arxiv.org/abs/2411.01442v2","updated":"2024-11-07T05:54:07Z","published":"2024-11-03T05:43:55Z","title":"Online Relational Inference for Evolving Multi-agent Interacting Systems","summary":" We introduce a novel framework, Online Relational Inference (ORI), designed\nto efficiently identify hidden interaction graphs in evolving multi-agent\ninteracting systems using streaming data. Unlike traditional offline methods\nthat rely on a fixed training set, ORI employs online backpropagation, updating\nthe model with each new data point, thereby allowing it to adapt to changing\nenvironments in real-time. A key innovation is the use of an adjacency matrix\nas a trainable parameter, optimized through a new adaptive learning rate\ntechnique called AdaRelation, which adjusts based on the historical sensitivity\nof the decoder to changes in the interaction graph. Additionally, a data\naugmentation method named Trajectory Mirror (TM) is introduced to improve\ngeneralization by exposing the model to varied trajectory patterns.\nExperimental results on both synthetic datasets and real-world data (CMU MoCap\nfor human motion) demonstrate that ORI significantly improves the accuracy and\nadaptability of relational inference in dynamic settings compared to existing\nmethods. 
This approach is model-agnostic, enabling seamless integration with\nvarious neural relational inference (NRI) architectures, and offers a robust\nsolution for real-time applications in complex, evolving systems.\n","authors":["Beomseok Kang","Priyabrata Saha","Sudarshan Sharma","Biswadeep Chakraborty","Saibal Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2411.01442v2.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.04434v1","updated":"2024-11-07T04:57:40Z","published":"2024-11-07T04:57:40Z","title":"Scaling Laws for Pre-training Agents and World Models","summary":" The performance of embodied agents has been shown to improve by increasing\nmodel parameters, dataset size, and compute. This has been demonstrated in\ndomains from robotics to video games, when generative learning objectives on\noffline datasets (pre-training) are used to model an agent's behavior\n(imitation learning) or their environment (world modeling). This paper\ncharacterizes the role of scale in these tasks more precisely. Going beyond the\nsimple intuition that `bigger is better', we show that the same types of power\nlaws found in language modeling (e.g. between loss and optimal model size),\nalso arise in world modeling and imitation learning. However, the coefficients\nof these laws are heavily influenced by the tokenizer, task \\& architecture --\nthis has important implications on the optimal sizing of models and data.\n","authors":["Tim Pearce","Tabish Rashid","Dave Bignell","Raluca Georgescu","Sam Devlin","Katja Hofmann"],"pdf_url":"https://arxiv.org/pdf/2411.04434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04430v1","updated":"2024-11-07T04:52:18Z","published":"2024-11-07T04:52:18Z","title":"Towards Unifying Interpretability and Control: Evaluation via\n Intervention","summary":" With the growing complexity and capability of large language models, a need\nto understand model reasoning has emerged, often motivated by an underlying\ngoal of controlling and aligning models. While numerous interpretability and\nsteering methods have been proposed as solutions, they are typically designed\neither for understanding or for control, seldom addressing both, with the\nconnection between interpretation and control more broadly remaining tenuous.\nAdditionally, the lack of standardized applications, motivations, and\nevaluation metrics makes it difficult to assess these methods' practical\nutility and efficacy. To address this, we propose intervention as a fundamental\ngoal of interpretability and introduce success criteria to evaluate how well\nmethods are able to control model behavior through interventions. We unify and\nextend four popular interpretability methods--sparse autoencoders, logit lens,\ntuned lens, and probing--into an abstract encoder-decoder framework. This\nframework maps intermediate latent representations to human-interpretable\nfeature spaces, enabling interventions on these interpretable features, which\ncan then be mapped back to latent representations to control model outputs. We\nintroduce two new evaluation metrics: intervention success rate and the\ncoherence-intervention tradeoff, designed to measure the accuracy of\nexplanations and their utility in controlling model behavior. 
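A quick illustration of the kind of relationship the scaling-laws entry above refers to (arXiv:2411.04434): fitting a saturating power law, loss(N) = a * N^(-alpha) + c, to (model size, loss) pairs. The data points are synthetic; with real training runs, the fitted coefficients are what would shift with tokenizer, task, and architecture.

```python
import numpy as np
from scipy.optimize import curve_fit

def power_law(n, a, alpha, c):
    return a * n ** (-alpha) + c

sizes = np.array([1e6, 3e6, 1e7, 3e7, 1e8, 3e8, 1e9])
rng = np.random.default_rng(0)
losses = power_law(sizes, a=12.0, alpha=0.28, c=1.8) * rng.normal(1.0, 0.01, len(sizes))

(a, alpha, c), _ = curve_fit(power_law, sizes, losses, p0=[10.0, 0.3, 1.0])
print(f"fitted: loss ~ {a:.2f} * N^(-{alpha:.3f}) + {c:.2f}")
```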
Our findings\nreveal that (1) although current methods allow for intervention, they are\ninconsistent across models and features, (2) lens-based methods outperform\nothers in achieving simple, concrete interventions, and (3) interventions often\ncompromise model performance and coherence, underperforming simpler\nalternatives, such as prompting, for steering model behavior and highlighting a\ncritical shortcoming of current interpretability approaches in real-world\napplications requiring control.\n","authors":["Usha Bhalla","Suraj Srinivas","Asma Ghandeharioun","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2411.04430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00172v2","updated":"2024-11-07T04:41:32Z","published":"2024-10-31T19:37:47Z","title":"SeafloorAI: A Large-scale Vision-Language Dataset for Seafloor\n Geological Survey","summary":" A major obstacle to the advancements of machine learning models in marine\nscience, particularly in sonar imagery analysis, is the scarcity of AI-ready\ndatasets. While there have been efforts to make AI-ready sonar image dataset\npublicly available, they suffer from limitations in terms of environment\nsetting and scale. To bridge this gap, we introduce SeafloorAI, the first\nextensive AI-ready datasets for seafloor mapping across 5 geological layers\nthat is curated in collaboration with marine scientists. We further extend the\ndataset to SeafloorGenAI by incorporating the language component in order to\nfacilitate the development of both vision- and language-capable machine\nlearning models for sonar imagery. The dataset consists of 62 geo-distributed\ndata surveys spanning 17,300 square kilometers, with 696K sonar images, 827K\nannotated segmentation masks, 696K detailed language descriptions and\napproximately 7M question-answer pairs. By making our data processing source\ncode publicly available, we aim to engage the marine science community to\nenrich the data pool and inspire the machine learning community to develop more\nrobust models. This collaborative approach will enhance the capabilities and\napplications of our datasets within both fields.\n","authors":["Kien X. Nguyen","Fengchun Qiao","Arthur Trembanis","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2411.00172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15265v4","updated":"2024-11-07T04:38:33Z","published":"2023-05-24T15:52:08Z","title":"Winner-Take-All Column Row Sampling for Memory Efficient Adaptation of\n Language Model","summary":" With the rapid growth in model size, fine-tuning the large pre-trained\nlanguage model has become increasingly difficult due to its extensive memory\nusage. Previous works usually focus on reducing the number of trainable\nparameters in the network. While the model parameters do contribute to memory\nusage, the primary memory bottleneck during training arises from storing\nfeature maps, also known as activations, as they are crucial for gradient\ncalculation. Notably, neural networks are usually trained using stochastic\ngradient descent. We argue that in stochastic optimization, models can handle\nnoisy gradients as long as the gradient estimator is unbiased with reasonable\nvariance. 
Following this motivation, we propose a new family of unbiased\nestimators called WTA-CRS, for matrix production with reduced variance, which\nonly requires storing the sub-sampled activations for calculating the gradient.\nOur work provides both theoretical and experimental evidence that, in the\ncontext of tuning transformers, our proposed estimators exhibit lower variance\ncompared to existing ones. By replacing the linear operation with our\napproximated one in transformers, we can achieve up to 2.7$\\times$ peak memory\nreduction with almost no accuracy drop and enables up to $6.4\\times$ larger\nbatch size. Under the same hardware, WTA-CRS enables better down-streaming task\nperformance by applying larger models and/or faster training speed with larger\nbatch sizes.\n","authors":["Zirui Liu","Guanchu Wang","Shaochen Zhong","Zhaozhuo Xu","Daochen Zha","Ruixiang Tang","Zhimeng Jiang","Kaixiong Zhou","Vipin Chaudhary","Shuai Xu","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2305.15265v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04422v1","updated":"2024-11-07T04:21:26Z","published":"2024-11-07T04:21:26Z","title":"Unsupervised Abnormal Stop Detection for Long Distance Coaches with\n Low-Frequency GPS","summary":" In our urban life, long distance coaches supply a convenient yet economic\napproach to the transportation of the public. One notable problem is to\ndiscover the abnormal stop of the coaches due to the important reason, i.e.,\nillegal pick up on the way which possibly endangers the safety of passengers.\nIt has become a pressing issue to detect the coach abnormal stop with\nlow-quality GPS. In this paper, we propose an unsupervised method that helps\ntransportation managers to efficiently discover the Abnormal Stop Detection\n(ASD) for long distance coaches. Concretely, our method converts the ASD\nproblem into an unsupervised clustering framework in which both the normal stop\nand the abnormal one are decomposed. Firstly, we propose a stop duration model\nfor the low frequency GPS based on the assumption that a coach changes speed\napproximately in a linear approach. Secondly, we strip the abnormal stops from\nthe normal stop points by the low rank assumption. The proposed method is\nconceptually simple yet efficient, by leveraging low rank assumption to handle\nnormal stop points, our approach enables domain experts to discover the ASD for\ncoaches, from a case study motivated by traffic managers. Datset and code are\npublicly available at: https://github.com/pangjunbiao/IPPs.\n","authors":["Jiaxin Deng","Junbiao Pang","Jiayu Xu","Haitao Yu"],"pdf_url":"https://arxiv.org/pdf/2411.04422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04421v1","updated":"2024-11-07T04:17:30Z","published":"2024-11-07T04:17:30Z","title":"Variational Low-Rank Adaptation Using IVON","summary":" We show that variational learning can significantly improve the accuracy and\ncalibration of Low-Rank Adaptation (LoRA) without a substantial increase in the\ncost. We replace AdamW by the Improved Variational Online Newton (IVON)\nalgorithm to finetune large language models. For Llama-2 with 7 billion\nparameters, IVON improves the accuracy over AdamW by 2.8% and expected\ncalibration error by 4.6%. The accuracy is also better than the other Bayesian\nalternatives, yet the cost is lower and the implementation is easier. Our work\nprovides additional evidence for the effectiveness of IVON for large language\nmodels. 
The code is available at\nhttps://github.com/team-approx-bayes/ivon-lora.\n","authors":["Bai Cong","Nico Daheim","Yuesong Shen","Daniel Cremers","Rio Yokota","Mohammad Emtiyaz Khan","Thomas Möllenhoff"],"pdf_url":"https://arxiv.org/pdf/2411.04421v1.pdf","comment":"Published at 38th Workshop on Fine-Tuning in Machine Learning\n (NeurIPS 2024). Code available at\n https://github.com/team-approx-bayes/ivon-lora"},{"id":"http://arxiv.org/abs/2411.04420v1","updated":"2024-11-07T04:16:15Z","published":"2024-11-07T04:16:15Z","title":"BendVLM: Test-Time Debiasing of Vision-Language Embeddings","summary":" Vision-language model (VLM) embeddings have been shown to encode biases\npresent in their training data, such as societal biases that prescribe negative\ncharacteristics to members of various racial and gender identities. VLMs are\nbeing quickly adopted for a variety of tasks ranging from few-shot\nclassification to text-guided image generation, making debiasing VLM embeddings\ncrucial. Debiasing approaches that fine-tune the VLM often suffer from\ncatastrophic forgetting. On the other hand, fine-tuning-free methods typically\nutilize a \"one-size-fits-all\" approach that assumes that correlation with the\nspurious attribute can be explained using a single linear direction across all\npossible inputs. In this work, we propose Bend-VLM, a nonlinear,\nfine-tuning-free approach for VLM embedding debiasing that tailors the\ndebiasing operation to each unique input. This allows for a more flexible\ndebiasing approach. Additionally, we do not require knowledge of the set of\ninputs a priori to inference time, making our method more appropriate for\nonline, open-set tasks such as retrieval and text guided image generation.\n","authors":["Walter Gerych","Haoran Zhang","Kimia Hamidieh","Eileen Pan","Maanas Sharma","Thomas Hartvigsen","Marzyeh Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2411.04420v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2411.04859v1","updated":"2024-11-07T16:49:25Z","published":"2024-11-07T16:49:25Z","title":"A multi-purpose automatic editing system based on lecture semantics for\n remote education","summary":" Remote teaching has become popular recently due to its convenience and\nsafety, especially under extreme circumstances like a pandemic. However, online\nstudents usually have a poor experience since the information acquired from the\nviews provided by the broadcast platforms is limited. One potential solution is\nto show more camera views simultaneously, but it is technically challenging and\ndistracting for the viewers. Therefore, an automatic multi-camera\ndirecting/editing system, which aims at selecting the most concerned view at\neach time instance to guide the attention of online students, is in urgent\ndemand. However, existing systems mostly make simple assumptions and focus on\ntracking the position of the speaker instead of the real lecture semantics, and\ntherefore have limited capacities to deliver optimal information flow. To this\nend, this paper proposes an automatic multi-purpose editing system based on the\nlecture semantics, which can both direct the multiple video streams for\nreal-time broadcasting and edit the optimal video offline for review purposes.\nOur system directs the views by semantically analyzing the class events while\nfollowing the professional directing rules, mimicking a human director to\ncapture the regions of interest from the viewpoint of the onsite students. 
We\nconduct both qualitative and quantitative analyses to verify the effectiveness\nof the proposed system and its components.\n","authors":["Panwen Hu","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2411.04859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04517v1","updated":"2024-11-07T08:19:39Z","published":"2024-11-07T08:19:39Z","title":"Continuous Sign Language Recognition System using Deep Learning with\n MediaPipe Holistic","summary":" Sign languages are the language of hearing-impaired people who use visuals\nlike the hand, facial, and body movements for communication. There are\ndifferent signs and gestures representing alphabets, words, and phrases.\nNowadays approximately 300 sign languages are being practiced worldwide such as\nAmerican Sign Language (ASL), Chinese Sign Language (CSL), Indian Sign Language\n(ISL), and many more. Sign languages are dependent on the vocal language of a\nplace. Unlike vocal or spoken languages, there are no helping words in sign\nlanguage like is, am, are, was, were, will, be, etc. As only a limited\npopulation is well-versed in sign language, this lack of familiarity of sign\nlanguage hinders hearing-impaired people from communicating freely and easily\nwith everyone. This issue can be addressed by a sign language recognition (SLR)\nsystem which has the capability to translate the sign language into vocal\nlanguage. In this paper, a continuous SLR system is proposed using a deep\nlearning model employing Long Short-Term Memory (LSTM), trained and tested on\nan ISL primary dataset. This dataset is created using MediaPipe Holistic\npipeline for tracking face, hand, and body movements and collecting landmarks.\nThe system recognizes the signs and gestures in real-time with 88.23% accuracy.\n","authors":["Sharvani Srivastava","Sudhakar Singh"," Pooja","Shiv Prakash"],"pdf_url":"https://arxiv.org/pdf/2411.04517v1.pdf","comment":"14 pages, 4 figures, Wireless Pers Commun"},{"id":"http://arxiv.org/abs/2411.02551v2","updated":"2024-11-07T07:18:51Z","published":"2024-11-04T19:34:13Z","title":"PIAST: A Multimodal Piano Dataset with Audio, Symbolic and Text","summary":" While piano music has become a significant area of study in Music Information\nRetrieval (MIR), there is a notable lack of datasets for piano solo music with\ntext labels. To address this gap, we present PIAST (PIano dataset with Audio,\nSymbolic, and Text), a piano music dataset. Utilizing a piano-specific taxonomy\nof semantic tags, we collected 9,673 tracks from YouTube and added human\nannotations for 2,023 tracks by music experts, resulting in two subsets:\nPIAST-YT and PIAST-AT. Both include audio, text, tag annotations, and\ntranscribed MIDI utilizing state-of-the-art piano transcription and beat\ntracking models. 
Among many possible tasks with the multi-modal dataset, we\nconduct music tagging and retrieval using both audio and MIDI data and report\nbaseline performances to demonstrate its potential as a valuable resource for\nMIR research.\n","authors":["Hayeon Bang","Eunjin Choi","Megan Finch","Seungheon Doh","Seolhee Lee","Gyeong-Hoon Lee","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2411.02551v2.pdf","comment":"Accepted for publication at the 3rd Workshop on NLP for Music and\n Audio (NLP4MusA 2024)"},{"id":"http://arxiv.org/abs/2411.04366v1","updated":"2024-11-07T01:52:46Z","published":"2024-11-07T01:52:46Z","title":"The Concatenator: A Bayesian Approach To Real Time Concatenative\n Musaicing","summary":" We present ``The Concatenator,'' a real time system for audio-guided\nconcatenative synthesis. Similarly to Driedger et al.'s ``musaicing'' (or\n``audio mosaicing'') technique, we concatenate a set number of windows within a\ncorpus of audio to re-create the harmonic and percussive aspects of a target\naudio stream. Unlike Driedger's NMF-based technique, however, we instead use an\nexplicitly Bayesian point of view, where corpus window indices are hidden\nstates and the target audio stream is an observation. We use a particle filter\nto infer the best hidden corpus states in real-time. Our transition model\nincludes a tunable parameter to control the time-continuity of corpus grains,\nand our observation model allows users to prioritize how quickly windows change\nto match the target. Because the computational complexity of the system is\nindependent of the corpus size, our system scales to corpora that are hours\nlong, which is an important feature in the age of vast audio data collections.\nWithin The Concatenator module itself, composers can vary grain length, fit to\ntarget, and pitch shift in real time while reacting to the sounds they hear,\nenabling them to rapidly iterate ideas. To conclude our work, we evaluate our\nsystem with extensive quantitative tests of the effects of parameters, as well\nas a qualitative evaluation with artistic insights. Based on the quality of the\nresults, we believe the real-time capability unlocks new avenues for musical\nexpression and control, suitable for live performance and modular synthesis\nintegration, which furthermore represents an essential breakthrough in\nconcatenative synthesis technology.\n","authors":["Christopher Tralie","Ben Cantil"],"pdf_url":"https://arxiv.org/pdf/2411.04366v1.pdf","comment":"12 pages, 6 figures, Accepted for Publication in The International\n Society for Music Information Retrieval Proceedings, 2024"}]},"2024-11-06T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2411.04316v1","updated":"2024-11-06T23:41:18Z","published":"2024-11-06T23:41:18Z","title":"A Multilingual Sentiment Lexicon for Low-Resource Language Translation\n using Large Languages Models and Explainable AI","summary":" South Africa and the Democratic Republic of Congo (DRC) present a complex\nlinguistic landscape with languages such as Zulu, Sepedi, Afrikaans, French,\nEnglish, and Tshiluba (Ciluba), which creates unique challenges for AI-driven\ntranslation and sentiment analysis systems due to a lack of accurately labeled\ndata. This study seeks to address these challenges by developing a multilingual\nlexicon designed for French and Tshiluba, now expanded to include translations\nin English, Afrikaans, Sepedi, and Zulu. 
The lexicon enhances cultural\nrelevance in sentiment classification by integrating language-specific\nsentiment scores. A comprehensive testing corpus is created to support\ntranslation and sentiment analysis tasks, with machine learning models such as\nRandom Forest, Support Vector Machine (SVM), Decision Trees, and Gaussian Naive\nBayes (GNB) trained to predict sentiment across low resource languages (LRLs).\nAmong them, the Random Forest model performed particularly well, capturing\nsentiment polarity and handling language-specific nuances effectively.\nFurthermore, Bidirectional Encoder Representations from Transformers (BERT), a\nLarge Language Model (LLM), is applied to predict context-based sentiment with\nhigh accuracy, achieving 99% accuracy and 98% precision, outperforming other\nmodels. The BERT predictions were clarified using Explainable AI (XAI),\nimproving transparency and fostering confidence in sentiment classification.\nOverall, findings demonstrate that the proposed lexicon and machine learning\nmodels significantly enhance translation and sentiment analysis for LRLs in\nSouth Africa and the DRC, laying a foundation for future AI models that support\nunderrepresented languages, with applications across education, governance, and\nbusiness in multilingual contexts.\n","authors":["Melusi Malinga","Isaac Lupanda","Mike Wa Nkongolo","Phil van Deventer"],"pdf_url":"https://arxiv.org/pdf/2411.04316v1.pdf","comment":"This work is part of a PhD proposal in Information Technology at the\n University of Pretoria, supervised by Dr. Mike Wa Nkongolo and co-supervised\n by Dr. Phil van Deventer, under the Low-Resource Language Processing Lab in\n the Department of Informatics"},{"id":"http://arxiv.org/abs/2411.02316v2","updated":"2024-11-06T23:27:24Z","published":"2024-11-04T17:40:39Z","title":"Evaluating Creative Short Story Generation in Humans and Large Language\n Models","summary":" Storytelling is a fundamental aspect of human communication, relying heavily\non creativity to produce narratives that are novel, appropriate, and\nsurprising. While large language models (LLMs) have recently demonstrated the\nability to generate high-quality stories, their creative capabilities remain\nunderexplored. Previous research has either focused on creativity tests\nrequiring short responses or primarily compared model performance in story\ngeneration to that of professional writers. However, the question of whether\nLLMs exhibit creativity in writing short stories on par with the average human\nremains unanswered. In this work, we conduct a systematic analysis of\ncreativity in short story generation across LLMs and everyday people. Using a\nfive-sentence creative story task, commonly employed in psychology to assess\nhuman creativity, we automatically evaluate model- and human-generated stories\nacross several dimensions of creativity, including novelty, surprise, and\ndiversity. 
Our findings reveal that while LLMs can generate stylistically\ncomplex stories, they tend to fall short in terms of creativity when compared\nto average human writers.\n","authors":["Mete Ismayilzada","Claire Stevenson","Lonneke van der Plas"],"pdf_url":"https://arxiv.org/pdf/2411.02316v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2411.04308v1","updated":"2024-11-06T23:16:25Z","published":"2024-11-06T23:16:25Z","title":"Improving Bilingual Capabilities of Language Models to Support Diverse\n Linguistic Practices in Education","summary":" Large language models (LLMs) offer promise in generating educational content,\nproviding instructor feedback, and reducing teacher workload on assessments.\nWhile prior studies have focused on studying LLM-powered learning analytics,\nlimited research has examined how effective LLMs are in a bilingual context. In\nthis paper, we study the effectiveness of multilingual large language models\n(MLLMs) across monolingual (English-only, Spanish-only) and bilingual\n(Spanglish) student writing. We present a learning analytics use case that\ndetails LLM performance in assessing acceptable and unacceptable explanations\nof Science and Social Science concepts. Our findings reveal a significant bias\nin the grading performance of pre-trained models for bilingual writing compared\nto English-only and Spanish-only writing. Following this, we fine-tune\nopen-source MLLMs including Llama 3.1 and Mistral NeMo using synthetic datasets\ngenerated in English, Spanish, and Spanglish. Our experiments indicate that the\nmodels perform significantly better for all three languages after fine-tuning\nwith bilingual data. This study highlights the potential of enhancing MLLM\neffectiveness to support authentic language practices amongst bilingual\nlearners. It also aims to illustrate the value of incorporating non-English\nlanguages into the design and implementation of language models in education.\n","authors":["Anand Syamkumar","Nora Tseng","Kaycie Barron","Shanglin Yang","Shamya Karumbaiah","Rheeya Uppal","Junjie Hu"],"pdf_url":"https://arxiv.org/pdf/2411.04308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04298v1","updated":"2024-11-06T22:46:13Z","published":"2024-11-06T22:46:13Z","title":"A Capabilities Approach to Studying Bias and Harm in Language\n Technologies","summary":" Mainstream Natural Language Processing (NLP) research has ignored the\nmajority of the world's languages. In moving from excluding the majority of the\nworld's languages to blindly adopting what we make for English, we first risk\nimporting the same harms we have at best mitigated and at least measured for\nEnglish. However, in evaluating and mitigating harms arising from adopting new\ntechnologies into such contexts, we often disregard (1) the actual community\nneeds of Language Technologies, and (2) biases and fairness issues within the\ncontext of the communities. In this extended abstract, we consider fairness,\nbias, and inclusion in Language Technologies through the lens of the\nCapabilities Approach. The Capabilities Approach centers on what people are\ncapable of achieving, given their intersectional social, political, and\neconomic contexts instead of what resources are (theoretically) available to\nthem. 
We detail the Capabilities Approach, its relationship to multilingual and\nmulticultural evaluation, and how the framework affords meaningful\ncollaboration with community members in defining and measuring the harms of\nLanguage Technologies.\n","authors":["Hellina Hailu Nigatu","Zeerak Talat"],"pdf_url":"https://arxiv.org/pdf/2411.04298v1.pdf","comment":"Accepted to the New Perspectives on Bias and Discrimination in\n Language Technology workshop"},{"id":"http://arxiv.org/abs/2411.00369v2","updated":"2024-11-06T22:41:31Z","published":"2024-11-01T05:14:03Z","title":"GRSQA -- Graph Reasoning-Structured Question Answering Dataset","summary":" Large Language Models (LLMs) have excelled in multi-hop question-answering\n(M-QA) due to their advanced reasoning abilities. However, the impact of the\ninherent reasoning structures on LLM M-QA performance remains unclear, largely\ndue to the absence of QA datasets that provide fine-grained reasoning\nstructures. To address this gap, we introduce the Graph Reasoning-Structured\nQuestion Answering Dataset (GRS-QA), which includes both semantic contexts and\nreasoning structures for QA pairs. Unlike existing M-QA datasets, where\ndifferent reasoning structures are entangled together, GRS-QA explicitly\ncaptures intricate reasoning pathways by constructing reasoning graphs, where\nnodes represent textual contexts and edges denote logical flows. These\nreasoning graphs of different structures enable a fine-grained evaluation of\nLLM reasoning capabilities across various reasoning structures. Our empirical\nanalysis reveals that LLMs perform differently when handling questions with\nvarying reasoning structures. This finding facilitates the exploration of\ntextual structures as compared with semantics.\n","authors":["Anish Pahilajani","Devasha Trivedi","Jincen Shuai","Khin S. Yone","Samyak Rajesh Jain","Namyong Park","Ryan A. Rossi","Nesreen K. Ahmed","Franck Dernoncourt","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2411.00369v2.pdf","comment":"15 pages, 24 figures, 10 tables"},{"id":"http://arxiv.org/abs/2406.11944v2","updated":"2024-11-06T22:37:30Z","published":"2024-06-17T17:49:00Z","title":"Transcoders Find Interpretable LLM Feature Circuits","summary":" A key goal in mechanistic interpretability is circuit analysis: finding\nsparse subgraphs of models corresponding to specific behaviors or capabilities.\nHowever, MLP sublayers make fine-grained circuit analysis on transformer-based\nlanguage models difficult. In particular, interpretable features -- such as\nthose found by sparse autoencoders (SAEs) -- are typically linear combinations\nof extremely many neurons, each with its own nonlinearity to account for.\nCircuit analysis in this setting thus either yields intractably large circuits\nor fails to disentangle local and global behavior. To address this we explore\ntranscoders, which seek to faithfully approximate a densely activating MLP\nlayer with a wider, sparsely-activating MLP layer. We introduce a novel method\nfor using transcoders to perform weights-based circuit analysis through MLP\nsublayers. The resulting circuits neatly factorize into input-dependent and\ninput-invariant terms. We then successfully train transcoders on language\nmodels with 120M, 410M, and 1.4B parameters, and find them to perform at least\non par with SAEs in terms of sparsity, faithfulness, and\nhuman-interpretability. 
Finally, we apply transcoders to reverse-engineer\nunknown circuits in the model, and we obtain novel insights regarding the\n\"greater-than circuit\" in GPT2-small. Our results suggest that transcoders can\nprove effective in decomposing model computations involving MLPs into\ninterpretable circuits. Code is available at\nhttps://github.com/jacobdunefsky/transcoder_circuits/.\n","authors":["Jacob Dunefsky","Philippe Chlenski","Neel Nanda"],"pdf_url":"https://arxiv.org/pdf/2406.11944v2.pdf","comment":"29 pages, 6 figures, 4 tables, 2 algorithms. NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.04291v1","updated":"2024-11-06T22:19:32Z","published":"2024-11-06T22:19:32Z","title":"Unfair Alignment: Examining Safety Alignment Across Vision Encoder\n Layers in Vision-Language Models","summary":" Vision-language models (VLMs) have improved significantly in multi-modal\ntasks, but their more complex architecture makes their safety alignment more\nchallenging than the alignment of large language models (LLMs). In this paper,\nwe reveal an unfair distribution of safety across the layers of VLM's vision\nencoder, with earlier and middle layers being disproportionately vulnerable to\nmalicious inputs compared to the more robust final layers. This 'cross-layer'\nvulnerability stems from the model's inability to generalize its safety\ntraining from the default architectural settings used during training to unseen\nor out-of-distribution scenarios, leaving certain layers exposed. We conduct a\ncomprehensive analysis by projecting activations from various intermediate\nlayers and demonstrate that these layers are more likely to generate harmful\noutputs when exposed to malicious inputs. Our experiments with LLaVA-1.5 and\nLlama 3.2 show discrepancies in attack success rates and toxicity scores across\nlayers, indicating that current safety alignment strategies focused on a single\ndefault layer are insufficient.\n","authors":["Saketh Bachu","Erfan Shayegani","Trishna Chakraborty","Rohit Lal","Arindam Dutta","Chengyu Song","Yue Dong","Nael Abu-Ghazaleh","Amit K. Roy-Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2411.04291v1.pdf","comment":"Preprint, Under Review"},{"id":"http://arxiv.org/abs/2406.15708v2","updated":"2024-11-06T22:07:17Z","published":"2024-06-22T02:07:10Z","title":"Teach Better or Show Smarter? On Instructions and Exemplars in Automatic\n Prompt Optimization","summary":" Large language models have demonstrated remarkable capabilities, but their\nperformance is heavily reliant on effective prompt engineering. Automatic\nprompt optimization (APO) methods are designed to automate this and can be\nbroadly categorized into those targeting instructions (instruction\noptimization, IO) vs. those targeting exemplars (exemplar optimization, EO).\nDespite their shared objective, these have evolved rather independently, with\nIO receiving more research attention recently. This paper seeks to bridge this\ngap by comprehensively comparing the performance of representative IO and EO\ntechniques both isolation and combination on a diverse set of challenging\ntasks. Our findings reveal that intelligently reusing model-generated\ninput-output pairs obtained from evaluating prompts on the validation set as\nexemplars, consistently improves performance on top of IO methods but is\ncurrently under-investigated. 
We also find that despite the recent focus on IO,\nhow we select exemplars can outweigh how we optimize instructions, with EO\nstrategies as simple as random search outperforming state-of-the-art IO methods\nwith seed instructions without any optimization. Moreover, we observe a synergy\nbetween EO and IO, with optimal combinations surpassing the individual\ncontributions. We conclude that studying exemplar optimization both as a\nstandalone method and its optimal combination with instruction optimization\nremain a crucial aspect of APO and deserve greater consideration in future\nresearch, even in the era of highly capable instruction-following models.\n","authors":["Xingchen Wan","Ruoxi Sun","Hootan Nakhost","Sercan O. Arik"],"pdf_url":"https://arxiv.org/pdf/2406.15708v2.pdf","comment":"Expanded version of the NeurIPS 2024 paper"},{"id":"http://arxiv.org/abs/2407.06004v3","updated":"2024-11-06T22:07:06Z","published":"2024-07-08T14:58:29Z","title":"Perceptions to Beliefs: Exploring Precursory Inferences for Theory of\n Mind in Large Language Models","summary":" While humans naturally develop theory of mind (ToM), the capability to\nunderstand other people's mental states and beliefs, state-of-the-art large\nlanguage models (LLMs) underperform on simple ToM benchmarks. We posit that we\ncan extend our understanding of LLMs' ToM abilities by evaluating key human ToM\nprecursors$-$perception inference and perception-to-belief inference$-$in LLMs.\nWe introduce two datasets, Percept-ToMi and Percept-FANToM, to evaluate these\nprecursory inferences for ToM in LLMs by annotating characters' perceptions on\nToMi and FANToM, respectively. Our evaluation of eight state-of-the-art LLMs\nreveals that the models generally perform well in perception inference while\nexhibiting limited capability in perception-to-belief inference (e.g., lack of\ninhibitory control). Based on these results, we present PercepToM, a novel ToM\nmethod leveraging LLMs' strong perception inference capability while\nsupplementing their limited perception-to-belief inference. Experimental\nresults demonstrate that PercepToM significantly enhances LLM's performance,\nespecially in false belief scenarios.\n","authors":["Chani Jung","Dongkwan Kim","Jiho Jin","Jiseon Kim","Yeon Seonwoo","Yejin Choi","Alice Oh","Hyunwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2407.06004v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07330v2","updated":"2024-11-06T22:03:18Z","published":"2024-07-10T02:58:37Z","title":"Interpretable Differential Diagnosis with Dual-Inference Large Language\n Models","summary":" Automatic differential diagnosis (DDx) is an essential medical task that\ngenerates a list of potential diseases as differentials based on patient\nsymptom descriptions. In practice, interpreting these differential diagnoses\nyields significant value but remains under-explored. Given the powerful\ncapabilities of large language models (LLMs), we investigated using LLMs for\ninterpretable DDx. Specifically, we curated the first DDx dataset with\nexpert-derived interpretation on 570 clinical notes. Besides, we proposed\nDual-Inf, a novel framework that enabled LLMs to conduct bidirectional\ninference (i.e., from symptoms to diagnoses and vice versa) for DDx\ninterpretation. Both human and automated evaluation validated its efficacy in\npredicting and elucidating differentials across four base LLMs. In addition,\nDual-Inf could reduce interpretation errors and hold promise for rare disease\nexplanations. 
To the best of our knowledge, it is the first work that\ncustomizes LLMs for DDx explanation and comprehensively evaluates their\ninterpretation performance. Overall, our study bridges a critical gap in DDx\ninterpretation and enhances clinical decision-making.\n","authors":["Shuang Zhou","Mingquan Lin","Sirui Ding","Jiashuo Wang","Genevieve B. Melton","James Zou","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.07330v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2411.04282v1","updated":"2024-11-06T22:02:30Z","published":"2024-11-06T22:02:30Z","title":"Language Models are Hidden Reasoners: Unlocking Latent Reasoning\n Capabilities via Self-Rewarding","summary":" Large language models (LLMs) have shown impressive capabilities, but still\nstruggle with complex reasoning tasks requiring multiple steps. While\nprompt-based methods like Chain-of-Thought (CoT) can improve LLM reasoning at\ninference time, optimizing reasoning capabilities during training remains\nchallenging. We introduce LaTent Reasoning Optimization (LaTRO), a principled\nframework that formulates reasoning as sampling from a latent distribution and\noptimizes it via variational approaches. LaTRO enables LLMs to concurrently\nimprove both their reasoning process and ability to evaluate reasoning quality,\nwithout requiring external feedback or reward models. We validate LaTRO through\nexperiments on GSM8K and ARC-Challenge datasets using multiple model\narchitectures. On GSM8K, LaTRO improves zero-shot accuracy by an average of\n12.5% over base models and 9.6% over supervised fine-tuning across\nPhi-3.5-mini, Mistral-7B, and Llama-3.1-8B. Our findings suggest that\npre-trained LLMs possess latent reasoning capabilities that can be unlocked and\nenhanced through our proposed optimization approach in a self-improvement\nmanner. The code of LaTRO is available at\n\\url{https://github.com/SalesforceAIResearch/LaTRO}.\n","authors":["Haolin Chen","Yihao Feng","Zuxin Liu","Weiran Yao","Akshara Prabhakar","Shelby Heinecke","Ricky Ho","Phil Mui","Silvio Savarese","Caiming Xiong","Huan Wang"],"pdf_url":"https://arxiv.org/pdf/2411.04282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15405v3","updated":"2024-11-06T21:18:59Z","published":"2023-05-24T17:59:05Z","title":"Textless Speech-to-Speech Translation With Limited Parallel Data","summary":" Existing speech-to-speech translation (S2ST) models fall into two camps: they\neither leverage text as an intermediate step or require hundreds of hours of\nparallel speech data. Both approaches are incompatible with textless languages\nor language pairs with limited parallel data. We present PFB, a framework for\ntraining textless S2ST models that require just dozens of hours of parallel\nspeech data. We first pretrain a model on large-scale monolingual speech data,\nfinetune it with a small amount of parallel speech data (20-60 hours), and\nlastly train with an unsupervised backtranslation objective. 
We train and\nevaluate our models for English-to-German, German-to-English and\nMarathi-to-English translation on three different domains (European Parliament,\nCommon Voice, and All India Radio) with single-speaker synthesized speech.\nEvaluated using the ASR-BLEU metric, our models achieve reasonable performance\non all three domains, with some being within 1-2 points of our higher-resourced\ntopline.\n","authors":["Anuj Diwan","Anirudh Srinivasan","David Harwath","Eunsol Choi"],"pdf_url":"https://arxiv.org/pdf/2305.15405v3.pdf","comment":"Accepted to EMNLP 2024 Findings"},{"id":"http://arxiv.org/abs/2407.12176v3","updated":"2024-11-06T20:38:48Z","published":"2024-07-16T21:03:14Z","title":"GPT-4V Cannot Generate Radiology Reports Yet","summary":" GPT-4V's purported strong multimodal abilities raise interests in using it to\nautomate radiology report writing, but there lacks thorough evaluations. In\nthis work, we perform a systematic evaluation of GPT-4V in generating radiology\nreports on two chest X-ray report datasets: MIMIC-CXR and IU X-Ray. We attempt\nto directly generate reports using GPT-4V through different prompting\nstrategies and find that it fails terribly in both lexical metrics and clinical\nefficacy metrics. To understand the low performance, we decompose the task into\ntwo steps: 1) the medical image reasoning step of predicting medical condition\nlabels from images; and 2) the report synthesis step of generating reports from\n(groundtruth) conditions. We show that GPT-4V's performance in image reasoning\nis consistently low across different prompts. In fact, the distributions of\nmodel-predicted labels remain constant regardless of which groundtruth\nconditions are present on the image, suggesting that the model is not\ninterpreting chest X-rays meaningfully. Even when given groundtruth conditions\nin report synthesis, its generated reports are less correct and less\nnatural-sounding than a finetuned LLaMA-2. Altogether, our findings cast doubt\non the viability of using GPT-4V in a radiology workflow.\n","authors":["Yuyang Jiang","Chacha Chen","Dang Nguyen","Benjamin M. Mervak","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2407.12176v3.pdf","comment":"24 pages, 3 figures, code:\n https://github.com/ChicagoHAI/cxr-eval-gpt-4v"},{"id":"http://arxiv.org/abs/2411.04223v1","updated":"2024-11-06T19:39:48Z","published":"2024-11-06T19:39:48Z","title":"Diversity Helps Jailbreak Large Language Models","summary":" We have uncovered a powerful jailbreak technique that leverages large\nlanguage models' ability to diverge from prior context, enabling them to bypass\nsafety constraints and generate harmful outputs. By simply instructing the LLM\nto deviate and obfuscate previous attacks, our method dramatically outperforms\nexisting approaches, achieving up to a 62% higher success rate in compromising\nnine leading chatbots, including GPT-4, Gemini, and Llama, while using only 13%\nof the queries. This revelation exposes a critical flaw in current LLM safety\ntraining, suggesting that existing methods may merely mask vulnerabilities\nrather than eliminate them. 
Our findings sound an urgent alarm for the need to\nrevolutionize testing methodologies to ensure robust and reliable LLM security.\n","authors":["Weiliang Zhao","Daniel Ben-Levi","Junfeng Yang","Chengzhi Mao"],"pdf_url":"https://arxiv.org/pdf/2411.04223v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2312.02119"},{"id":"http://arxiv.org/abs/2411.02537v2","updated":"2024-11-06T19:27:10Z","published":"2024-11-04T19:16:53Z","title":"INQUIRE: A Natural World Text-to-Image Retrieval Benchmark","summary":" We introduce INQUIRE, a text-to-image retrieval benchmark designed to\nchallenge multimodal vision-language models on expert-level queries. INQUIRE\nincludes iNaturalist 2024 (iNat24), a new dataset of five million natural world\nimages, along with 250 expert-level retrieval queries. These queries are paired\nwith all relevant images comprehensively labeled within iNat24, comprising\n33,000 total matches. Queries span categories such as species identification,\ncontext, behavior, and appearance, emphasizing tasks that require nuanced image\nunderstanding and domain expertise. Our benchmark evaluates two core retrieval\ntasks: (1) INQUIRE-Fullrank, a full dataset ranking task, and (2)\nINQUIRE-Rerank, a reranking task for refining top-100 retrievals. Detailed\nevaluation of a range of recent multimodal models demonstrates that INQUIRE\nposes a significant challenge, with the best models failing to achieve an\nmAP@50 above 50%. In addition, we show that reranking with more powerful\nmultimodal models can enhance retrieval performance, yet there remains a\nsignificant margin for improvement. By focusing on scientifically-motivated\necological challenges, INQUIRE aims to bridge the gap between AI capabilities\nand the needs of real-world scientific inquiry, encouraging the development of\nretrieval systems that can assist with accelerating ecological and biodiversity\nresearch. Our dataset and code are available at\nhttps://inquire-benchmark.github.io\n","authors":["Edward Vendrow","Omiros Pantazis","Alexander Shepard","Gabriel Brostow","Kate E. Jones","Oisin Mac Aodha","Sara Beery","Grant Van Horn"],"pdf_url":"https://arxiv.org/pdf/2411.02537v2.pdf","comment":"Published in NeurIPS 2024, Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2407.10964v2","updated":"2024-11-06T18:58:03Z","published":"2024-07-15T17:58:42Z","title":"No Train, all Gain: Self-Supervised Gradients Improve Deep Frozen\n Representations","summary":" This paper introduces FUNGI, Features from UNsupervised GradIents, a method\nto enhance the features of transformer encoders by leveraging self-supervised\ngradients. Our method is simple: given any pretrained model, we first compute\ngradients from various self-supervised objectives for each input. These\ngradients are projected to a lower dimension and then concatenated with the\nmodel's output embedding. The resulting features are evaluated on k-nearest\nneighbor classification over 11 datasets from vision, 5 from natural language\nprocessing, and 2 from audio. Across backbones spanning various sizes and\npretraining strategies, FUNGI features provide consistent performance\nimprovements over the embeddings. 
We also show that using FUNGI features can\nbenefit linear classification, clustering and image retrieval, and that they\nsignificantly improve the retrieval-based in-context scene understanding\nabilities of pretrained models, for example improving upon DINO by +17% for\nsemantic segmentation - without any training.\n","authors":["Walter Simoncini","Spyros Gidaris","Andrei Bursuc","Yuki M. Asano"],"pdf_url":"https://arxiv.org/pdf/2407.10964v2.pdf","comment":"NeurIPS 2024. Code available at\n https://github.com/WalterSimoncini/fungivision"},{"id":"http://arxiv.org/abs/2411.04118v1","updated":"2024-11-06T18:51:02Z","published":"2024-11-06T18:51:02Z","title":"Medical Adaptation of Large Language and Vision-Language Models: Are We\n Making Progress?","summary":" Several recent works seek to develop foundation models specifically for\nmedical applications, adapting general-purpose large language models (LLMs) and\nvision-language models (VLMs) via continued pretraining on publicly available\nbiomedical corpora. These works typically claim that such domain-adaptive\npretraining (DAPT) improves performance on downstream medical tasks, such as\nanswering medical licensing exam questions. In this paper, we compare seven\npublic \"medical\" LLMs and two VLMs against their corresponding base models,\narriving at a different conclusion: all medical VLMs and nearly all medical\nLLMs fail to consistently improve over their base models in the zero-/few-shot\nprompting regime for medical question-answering (QA) tasks. For instance,\nacross the tasks and model pairs we consider in the 3-shot setting, medical\nLLMs only outperform their base models in 12.1% of cases, reach a (statistical)\ntie in 49.8% of cases, and are significantly worse than their base models in\nthe remaining 38.2% of cases. Our conclusions are based on (i) comparing each\nmedical model head-to-head, directly against the corresponding base model; (ii)\noptimizing the prompts for each model separately; and (iii) accounting for\nstatistical uncertainty in comparisons. While these basic practices are not\nconsistently adopted in the literature, our ablations show that they\nsubstantially impact conclusions. Our findings suggest that state-of-the-art\ngeneral-domain models may already exhibit strong medical knowledge and\nreasoning capabilities, and offer recommendations to strengthen the conclusions\nof future studies.\n","authors":["Daniel P. Jeong","Saurabh Garg","Zachary C. Lipton","Michael Oberst"],"pdf_url":"https://arxiv.org/pdf/2411.04118v1.pdf","comment":"Accepted to EMNLP 2024 Main Conference as Long Paper (Oral)"},{"id":"http://arxiv.org/abs/2411.04109v1","updated":"2024-11-06T18:36:22Z","published":"2024-11-06T18:36:22Z","title":"Self-Consistency Preference Optimization","summary":" Self-alignment, whereby models learn to improve themselves without human\nannotation, is a rapidly growing research area. However, existing techniques\noften fail to improve complex reasoning tasks due to the difficulty of\nassigning correct rewards. An orthogonal approach that is known to improve\ncorrectness is self-consistency, a method applied at inference time based on\nmultiple sampling in order to find the most consistent answer. In this work, we\nextend the self-consistency concept to help train models. We thus introduce\nself-consistency preference optimization (ScPO), which iteratively trains\nconsistent answers to be preferred over inconsistent ones on unsupervised new\nproblems. 
We show ScPO leads to large improvements over conventional reward\nmodel training on reasoning tasks such as GSM8K and MATH, closing the gap with\nsupervised training with gold answers or preferences, and that combining ScPO\nwith standard supervised learning improves results even further. On ZebraLogic,\nScPO finetunes Llama-3 8B to be superior to Llama-3 70B, Gemma-2 27B, and\nClaude-3 Haiku.\n","authors":["Archiki Prasad","Weizhe Yuan","Richard Yuanzhe Pang","Jing Xu","Maryam Fazel-Zarandi","Mohit Bansal","Sainbayar Sukhbaatar","Jason Weston","Jane Yu"],"pdf_url":"https://arxiv.org/pdf/2411.04109v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.04093v1","updated":"2024-11-06T18:14:48Z","published":"2024-11-06T18:14:48Z","title":"Summarization of Opinionated Political Documents with Varied\n Perspectives","summary":" Global partisan hostility and polarization has increased, and this\npolarization is heightened around presidential elections. Models capable of\ngenerating accurate summaries of diverse perspectives can help reduce such\npolarization by exposing users to alternative perspectives. In this work, we\nintroduce a novel dataset and task for independently summarizing each political\nperspective in a set of passages from opinionated news articles. For this task,\nwe propose a framework for evaluating different dimensions of perspective\nsummary performance. We benchmark 10 models of varying sizes and architectures\nthrough both automatic and human evaluation. While recent models like GPT-4o\nperform well on this task, we find that all models struggle to generate\nsummaries faithful to the intended perspective. Our analysis of summaries\nfocuses on how extraction behavior depends on the features of the input\ndocuments.\n","authors":["Nicholas Deas","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2411.04093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11832v2","updated":"2024-11-06T18:07:03Z","published":"2024-08-06T15:49:58Z","title":"OpenFactCheck: A Unified Framework for Factuality Evaluation of LLMs","summary":" The increased use of large language models (LLMs) across a variety of\nreal-world applications calls for automatic tools to check the factual accuracy\nof their outputs, as LLMs often hallucinate. This is difficult as it requires\nassessing the factuality of free-form open-domain responses. While there has\nbeen a lot of research on this topic, different papers use different evaluation\nbenchmarks and measures, which makes them hard to compare and hampers future\nprogress. To mitigate these issues, we developed OpenFactCheck, a unified\nframework, with three modules: (i) RESPONSEEVAL, which allows users to easily\ncustomize an automatic fact-checking system and to assess the factuality of all\nclaims in an input document using that system, (ii) LLMEVAL, which assesses the\noverall factuality of an LLM, and (iii) CHECKEREVAL, a module to evaluate\nautomatic fact-checking systems. OpenFactCheck is open-sourced\n(https://github.com/mbzuai-nlp/openfactcheck) and publicly released as a Python\nlibrary (https://pypi.org/project/openfactcheck/) and also as a web service\n(http://app.openfactcheck.com). A video describing the system is available at\nhttps://youtu.be/-i9VKL0HleI.\n","authors":["Hasan Iqbal","Yuxia Wang","Minghan Wang","Georgi Georgiev","Jiahui Geng","Iryna Gurevych","Preslav Nakov"],"pdf_url":"https://arxiv.org/pdf/2408.11832v2.pdf","comment":"11 pages, 4 Figures, 3 Tables, Accepted at EMNLP 2024 System\n Demonstration. 
arXiv admin note: substantial text overlap with\n arXiv:2405.05583"},{"id":"http://arxiv.org/abs/2411.04075v1","updated":"2024-11-06T17:52:01Z","published":"2024-11-06T17:52:01Z","title":"M3SciQA: A Multi-Modal Multi-Document Scientific QA Benchmark for\n Evaluating Foundation Models","summary":" Existing benchmarks for evaluating foundation models mainly focus on\nsingle-document, text-only tasks. However, they often fail to fully capture the\ncomplexity of research workflows, which typically involve interpreting\nnon-textual data and gathering information across multiple documents. To\naddress this gap, we introduce M3SciQA, a multi-modal, multi-document\nscientific question answering benchmark designed for a more comprehensive\nevaluation of foundation models. M3SciQA consists of 1,452 expert-annotated\nquestions spanning 70 natural language processing paper clusters, where each\ncluster represents a primary paper along with all its cited documents,\nmirroring the workflow of comprehending a single paper by requiring multi-modal\nand multi-document data. With M3SciQA, we conduct a comprehensive evaluation of\n18 foundation models. Our results indicate that current foundation models still\nsignificantly underperform compared to human experts in multi-modal information\nretrieval and in reasoning across multiple scientific documents. Additionally,\nwe explore the implications of these findings for the future advancement of\napplying foundation models in multi-modal scientific literature analysis.\n","authors":["Chuhan Li","Ziyao Shangguan","Yilun Zhao","Deyuan Li","Yixin Liu","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2411.04075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17027v2","updated":"2024-11-06T17:20:42Z","published":"2024-09-25T15:30:24Z","title":"Counterfactual Token Generation in Large Language Models","summary":" \"Sure, I am happy to generate a story for you: Captain Lyra stood at the helm\nof her trusty ship, the Maelstrom's Fury, gazing out at the endless sea. [...]\nLyra's eyes welled up with tears as she realized the bitter truth - she had\nsacrificed everything for fleeting riches, and lost the love of her crew, her\nfamily, and herself.\" Although this story, generated by a large language model,\nis captivating, one may wonder -- how would the story have unfolded if the\nmodel had chosen \"Captain Maeve\" as the protagonist instead? We cannot know.\nState-of-the-art large language models are stateless -- they maintain no\ninternal memory or state. Given a prompt, they generate a sequence of tokens as\nan output using an autoregressive process. As a consequence, they cannot reason\nabout counterfactual alternatives to tokens they have generated in the past. In\nthis work, our goal is to enhance them with this functionality. To this end, we\ndevelop a causal model of token generation that builds upon the Gumbel-Max\nstructural causal model. Our model allows any large language model to perform\ncounterfactual token generation at almost no cost in comparison with vanilla\ntoken generation, it is embarrassingly simple to implement, and it does not\nrequire any fine-tuning nor prompt engineering. We implement our model on Llama\n3 8B-Instruct and Ministral-8B-Instruct and conduct a qualitative and a\nquantitative analysis of counterfactually generated text. 
We conclude with a\ndemonstrative application of counterfactual token generation for bias\ndetection, unveiling interesting insights about the model of the world\nconstructed by large language models.\n","authors":["Ivi Chatzi","Nina Corvelo Benz","Eleni Straitouri","Stratis Tsirtsis","Manuel Gomez-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2409.17027v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01483v3","updated":"2024-11-06T17:04:36Z","published":"2024-11-03T08:49:55Z","title":"Teaching Models to Improve on Tape","summary":" Large Language Models (LLMs) often struggle when prompted to generate content\nunder specific constraints. However, in such cases it is often easy to check\nwhether these constraints are satisfied or violated. Recent works have shown\nthat LLMs can benefit from such \"corrective feedback\". Here we claim that this\nskill of LLMs can be significantly enhanced via training. We introduce an RL\nframework for teaching models to use such rewards, by simulating interaction\nsessions, and rewarding the model according to its ability to satisfy the\nconstraints. We refer to our method as CORGI (Controlled Generation with RL for\nGuided Interaction), and evaluate it on a variety of controlled generation\ntasks using unlabeled training data. We find that CORGI consistently\noutperforms the baseline reinforcement learning method that does not\nincorporate conversational feedback. Furthermore, CORGI's interactive framework\nenables meta-learning, allowing the LLM to generalize better to guided\ninteraction in new tasks. Our results clearly show that conversational\noptimization, when combined with reinforcement learning, significantly improves\nthe effectiveness of LLMs in controlled generation contexts.\n","authors":["Liat Bezalel","Eyal Orgad","Amir Globerson"],"pdf_url":"https://arxiv.org/pdf/2411.01483v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14632v2","updated":"2024-11-06T16:54:48Z","published":"2024-10-18T17:32:22Z","title":"Diverging Preferences: When do Annotators Disagree and do Models Know?","summary":" We examine diverging preferences in human-labeled preference datasets. We\ndevelop a taxonomy of disagreement sources spanning 10 categories across four\nhigh-level classes -- task underspecification, response style, refusals, and\nannotation errors. We find that the majority of disagreements are in opposition\nwith standard reward modeling approaches, which are designed with the\nassumption that annotator disagreement is noise. We then explore how these\nfindings impact two areas of LLM development: reward modeling and evaluation.\nIn our experiments, we demonstrate how standard reward modeling methods, like\nthe Bradley-Terry model, fail to differentiate whether a given preference\njudgment is the result of unanimous agreement among annotators or the majority\nopinion among diverging user preferences. We also find that these tendencies\nare also echoed by popular LLM-as-Judge evaluation methods, which consistently\nidentify a winning response in cases of diverging preferences. These findings\nhighlight remaining challenges in LLM evaluations, which are greatly influenced\nby divisive features like response style, and in developing pluralistically\naligned LLMs. To address these issues, we develop methods for identifying\ndiverging preferences to mitigate their influence on evaluation and training.\n","authors":["Michael JQ Zhang","Zhilin Wang","Jena D. 
Hwang","Yi Dong","Olivier Delalleau","Yejin Choi","Eunsol Choi","Xiang Ren","Valentina Pyatkin"],"pdf_url":"https://arxiv.org/pdf/2410.14632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04032v1","updated":"2024-11-06T16:31:28Z","published":"2024-11-06T16:31:28Z","title":"Beemo: Benchmark of Expert-edited Machine-generated Outputs","summary":" The rapid proliferation of large language models (LLMs) has increased the\nvolume of machine-generated texts (MGTs) and blurred text authorship in various\ndomains. However, most existing MGT benchmarks include single-author texts\n(human-written and machine-generated). This conventional design fails to\ncapture more practical multi-author scenarios, where the user refines the LLM\nresponse for natural flow, coherence, and factual correctness. Our paper\nintroduces the Benchmark of Expert-edited Machine-generated Outputs (Beemo),\nwhich includes 6.5k texts written by humans, generated by ten\ninstruction-finetuned LLMs, and edited by experts for various use cases,\nranging from creative writing to summarization. Beemo additionally comprises\n13.1k machine-generated and LLM-edited texts, allowing for diverse MGT\ndetection evaluation across various edit types. We document Beemo's creation\nprotocol and present the results of benchmarking 33 configurations of MGT\ndetectors in different experimental setups. We find that expert-based editing\nevades MGT detection, while LLM-edited texts are unlikely to be recognized as\nhuman-written. Beemo and all materials are publicly available.\n","authors":["Ekaterina Artemova","Jason Lucas","Saranya Venkatraman","Jooyoung Lee","Sergei Tilga","Adaku Uchendu","Vladislav Mikhailov"],"pdf_url":"https://arxiv.org/pdf/2411.04032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04025v1","updated":"2024-11-06T16:20:37Z","published":"2024-11-06T16:20:37Z","title":"Prompt Engineering Using GPT for Word-Level Code-Mixed Language\n Identification in Low-Resource Dravidian Languages","summary":" Language Identification (LI) is crucial for various natural language\nprocessing tasks, serving as a foundational step in applications such as\nsentiment analysis, machine translation, and information retrieval. In\nmultilingual societies like India, particularly among the youth engaging on\nsocial media, text often exhibits code-mixing, blending local languages with\nEnglish at different linguistic levels. This phenomenon presents formidable\nchallenges for LI systems, especially when languages intermingle within single\nwords. Dravidian languages, prevalent in southern India, possess rich\nmorphological structures yet suffer from under-representation in digital\nplatforms, leading to the adoption of Roman or hybrid scripts for\ncommunication. This paper introduces a prompt based method for a shared task\naimed at addressing word-level LI challenges in Dravidian languages. In this\nwork, we leveraged GPT-3.5 Turbo to understand whether the large language\nmodels is able to correctly classify words into correct categories. Our\nfindings show that the Kannada model consistently outperformed the Tamil model\nacross most metrics, indicating a higher accuracy and reliability in\nidentifying and categorizing Kannada language instances. 
In contrast, the Tamil\nmodel showed moderate performance, particularly needing improvement in\nprecision and recall.\n","authors":["Aniket Deroy","Subhankar Maity"],"pdf_url":"https://arxiv.org/pdf/2411.04025v1.pdf","comment":"Accepted at FIRE 2024 (Track: Word-level Language Identification in\n Dravidian Languages)"},{"id":"http://arxiv.org/abs/2404.08262v3","updated":"2024-11-06T16:19:24Z","published":"2024-04-12T06:21:48Z","title":"Pretraining and Updates of Domain-Specific LLM: A Case Study in the\n Japanese Business Domain","summary":" The development of Large Language Models (LLMs) in various languages has been\nadvancing, but the combination of non-English languages with domain-specific\ncontexts remains underexplored. This paper presents our findings from training\nand evaluating a Japanese business domain-specific LLM designed to better\nunderstand business-related documents, such as the news on current affairs,\ntechnical reports, and patents. Additionally, LLMs in this domain require\nregular updates to incorporate the most recent knowledge. Therefore, we also\nreport our findings from the first experiments and evaluations involving\nupdates to this LLM using the latest article data, which is an important\nproblem setting that has not been addressed in previous research. From our\nexperiments on a newly created benchmark dataset for question answering in the\ntarget domain, we found that (1) our pretrained model improves QA accuracy\nwithout losing general knowledge, and (2) a proper mixture of the latest and\nolder texts in the training data for the update is necessary. Our pretrained\nmodel and business domain benchmark are publicly available to support further\nstudies.\n","authors":["Kosuke Takahashi","Takahiro Omi","Kosuke Arima","Tatsuya Ishigaki"],"pdf_url":"https://arxiv.org/pdf/2404.08262v3.pdf","comment":"Accepted at PACLIC 38"},{"id":"http://arxiv.org/abs/2410.07520v2","updated":"2024-11-06T16:17:21Z","published":"2024-10-10T01:21:48Z","title":"News Reporter: A Multi-lingual LLM Framework for Broadcast T.V News","summary":" Large Language Models (LLMs) have fast become an essential tools to many\nconversational chatbots due to their ability to provide coherent answers for\nvaried queries. Datasets used to train these LLMs are often a mix of generic\nand synthetic samples, thus lacking the verification needed to provide correct\nand verifiable answers for T.V. News.\n We collect and share a large collection of QA pairs extracted from\ntranscripts of news recordings from various news-channels across the United\nStates. Resultant QA pairs are then used to fine-tune an off-the-shelf LLM\nmodel. Our model surpasses base models of similar size on several open LLM\nbenchmarks. We further integrate and propose a RAG method to improve\ncontextualization of our answers and also point it to a verifiable news\nrecording.\n","authors":["Tarun Jain","Yufei Gao","Sridhar Vanga","Karan Singla"],"pdf_url":"https://arxiv.org/pdf/2410.07520v2.pdf","comment":"5 pages, under review at ICASSP 2025"},{"id":"http://arxiv.org/abs/2407.14916v2","updated":"2024-11-06T16:11:18Z","published":"2024-07-20T16:05:17Z","title":"Improving Context-Aware Preference Modeling for Language Models","summary":" While finetuning language models from pairwise preferences has proven\nremarkably effective, the underspecified nature of natural language presents\ncritical challenges. 
Direct preference feedback is uninterpretable, difficult\nto provide where multidimensional criteria may apply, and often inconsistent,\neither because it is based on incomplete instructions or provided by diverse\nprincipals. To address these challenges, we consider the two-step preference\nmodeling procedure that first resolves the under-specification by selecting a\ncontext, and then evaluates preference with respect to the chosen context. We\ndecompose reward modeling error according to these two steps, which suggests\nthat supervising context in addition to context-specific preference may be a\nviable approach to aligning models with diverse human preferences. For this to\nwork, the ability of models to evaluate context-specific preference is\ncritical. To this end, we contribute context-conditioned preference datasets\nand accompanying experiments that investigate the ability of language models to\nevaluate context-specific preference. We use our datasets to (1) show that\nexisting preference models benefit from, but fail to fully consider, added\ncontext, (2) finetune a context-aware reward model with context-specific\nperformance exceeding that of GPT-4 and Llama 3 70B on tested datasets, and (3)\ninvestigate the value of context-aware preference modeling.\n","authors":["Silviu Pitis","Ziang Xiao","Nicolas Le Roux","Alessandro Sordoni"],"pdf_url":"https://arxiv.org/pdf/2407.14916v2.pdf","comment":"NeurIPS 2024. 10 pages (29 with references and appendix)"},{"id":"http://arxiv.org/abs/2405.17537v3","updated":"2024-11-06T15:56:04Z","published":"2024-05-27T17:57:48Z","title":"CLIBD: Bridging Vision and Genomics for Biodiversity Monitoring at Scale","summary":" Measuring biodiversity is crucial for understanding ecosystem health. While\nprior works have developed machine learning models for taxonomic classification\nof photographic images and DNA separately, in this work, we introduce a\nmultimodal approach combining both, using CLIP-style contrastive learning to\nalign images, barcode DNA, and text-based representations of taxonomic labels\nin a unified embedding space. This allows for accurate classification of both\nknown and unknown insect species without task-specific fine-tuning, leveraging\ncontrastive learning for the first time to fuse DNA and image data. Our method\nsurpasses previous single-modality approaches in accuracy by over 8% on\nzero-shot learning tasks, showcasing its effectiveness in biodiversity studies.\n","authors":["ZeMing Gong","Austin T. Wang","Xiaoliang Huo","Joakim Bruslund Haurum","Scott C. Lowe","Graham W. Taylor","Angel X. Chang"],"pdf_url":"https://arxiv.org/pdf/2405.17537v3.pdf","comment":"25 pages with 11 figures"},{"id":"http://arxiv.org/abs/2410.16676v3","updated":"2024-11-06T15:49:30Z","published":"2024-10-22T04:18:19Z","title":"Improving Causal Reasoning in Large Language Models: A Survey","summary":" Causal reasoning (CR) is a crucial aspect of intelligence, essential for\nproblem-solving, decision-making, and understanding the world. While large\nlanguage models (LLMs) can generate rationales for their outputs, their ability\nto reliably perform causal reasoning remains uncertain, often falling short in\ntasks requiring a deep understanding of causality. 
In this survey, we provide a\ncomprehensive review of research aimed at enhancing LLMs for causal reasoning.\nWe categorize existing methods based on the role of LLMs: either as reasoning\nengines or as helpers providing knowledge or data to traditional CR methods,\nfollowed by a detailed discussion of the methodologies in each category. We\nthen evaluate the performance of LLMs on various causal reasoning tasks,\nproviding key findings and in-depth analysis. Finally, we provide insights from\ncurrent studies and highlight promising directions for future research. We aim\nfor this work to serve as a comprehensive resource, fostering further\nadvancements in causal reasoning with LLMs. Resources are available at\nhttps://github.com/chendl02/Awesome-LLM-causal-reasoning.\n","authors":["Longxuan Yu","Delin Chen","Siheng Xiong","Qingyang Wu","Qingzhen Liu","Dawei Li","Zhikai Chen","Xiaoze Liu","Liangming Pan"],"pdf_url":"https://arxiv.org/pdf/2410.16676v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03966v1","updated":"2024-11-06T15:03:47Z","published":"2024-11-06T15:03:47Z","title":"WorryWords: Norms of Anxiety Association for over 44k English Words","summary":" Anxiety, the anticipatory unease about a potential negative outcome, is a\ncommon and beneficial human emotion. However, there is still much that is not\nknown, such as how anxiety relates to our body and how it manifests in\nlanguage. This is especially pertinent given the increasing impact of\nanxiety-related disorders. In this work, we introduce WorryWords, the first\nlarge-scale repository of manually derived word--anxiety associations for over\n44,450 English words. We show that the anxiety associations are highly\nreliable. We use WorryWords to study the relationship between anxiety and other\nemotion constructs, as well as the rate at which children acquire anxiety words\nwith age. Finally, we show that using WorryWords alone, one can accurately\ntrack the change of anxiety in streams of text. The lexicon enables a wide\nvariety of anxiety-related research in psychology, NLP, public health, and\nsocial sciences. WorryWords (and its translations to over 100 languages) is\nfreely available. http://saifmohammad.com/worrywords.html\n","authors":["Saif M. Mohammad"],"pdf_url":"https://arxiv.org/pdf/2411.03966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03964v1","updated":"2024-11-06T14:54:19Z","published":"2024-11-06T14:54:19Z","title":"What Really is Commonsense Knowledge?","summary":" Commonsense datasets have been well developed in Natural Language Processing,\nmainly through crowdsource human annotation. However, there are debates on the\ngenuineness of commonsense reasoning benchmarks. In specific, a significant\nportion of instances in some commonsense benchmarks do not concern commonsense\nknowledge. That problem would undermine the measurement of the true commonsense\nreasoning ability of evaluated models. It is also suggested that the problem\noriginated from a blurry concept of commonsense knowledge, as distinguished\nfrom other types of knowledge. To demystify all of the above claims, in this\nstudy, we survey existing definitions of commonsense knowledge, ground into the\nthree frameworks for defining concepts, and consolidate them into a\nmulti-framework unified definition of commonsense knowledge (so-called\nconsolidated definition). We then use the consolidated definition for\nannotations and experiments on the CommonsenseQA and CommonsenseQA 2.0 datasets\nto examine the above claims. 
Our study shows that there exists a large portion\nof non-commonsense-knowledge instances in the two datasets, and a large\nperformance gap on these two subsets where Large Language Models (LLMs) perform\nworse on commonsense-knowledge instances.\n","authors":["Quyet V. Do","Junze Li","Tung-Duong Vuong","Zhaowei Wang","Yangqiu Song","Xiaojuan Ma"],"pdf_url":"https://arxiv.org/pdf/2411.03964v1.pdf","comment":"Code and data will be released together with the next version of the\n paper"},{"id":"http://arxiv.org/abs/2411.03962v1","updated":"2024-11-06T14:51:02Z","published":"2024-11-06T14:51:02Z","title":"How Does A Text Preprocessing Pipeline Affect Ontology Syntactic\n Matching?","summary":" The generic text preprocessing pipeline, comprising Tokenisation,\nNormalisation, Stop Words Removal, and Stemming/Lemmatisation, has been\nimplemented in many ontology matching (OM) systems. However, the lack of\nstandardisation in text preprocessing creates diversity in mapping results. In\nthis paper, we investigate the effect of the text preprocessing pipeline on OM\ntasks at syntactic levels. Our experiments on 8 Ontology Alignment Evaluation\nInitiative (OAEI) track repositories with 49 distinct alignments indicate: (1)\nTokenisation and Normalisation are currently more effective than Stop Words\nRemoval and Stemming/Lemmatisation; and (2) The selection of Lemmatisation and\nStemming is task-specific. We recommend standalone Lemmatisation or Stemming\nwith post-hoc corrections. We find that (3) Porter Stemmer and Snowball Stemmer\nperform better than Lancaster Stemmer; and that (4) Part-of-Speech (POS)\nTagging does not help Lemmatisation. To repair less effective Stop Words\nRemoval and Stemming/Lemmatisation used in OM tasks, we propose a novel\ncontext-based pipeline repair approach that significantly improves matching\ncorrectness and overall matching performance. We also discuss the use of text\npreprocessing pipeline in the new era of large language models (LLMs).\n","authors":["Zhangcheng Qiang","Kerry Taylor","Weiqing Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03962v1.pdf","comment":"13 pages, 26 figures, 4 tables"},{"id":"http://arxiv.org/abs/2406.10149v2","updated":"2024-11-06T14:50:40Z","published":"2024-06-14T16:00:29Z","title":"BABILong: Testing the Limits of LLMs with Long Context\n Reasoning-in-a-Haystack","summary":" In recent years, the input context sizes of large language models (LLMs) have\nincreased dramatically. However, existing evaluation methods have not kept\npace, failing to comprehensively assess the efficiency of models in handling\nlong contexts. To bridge this gap, we introduce the BABILong benchmark,\ndesigned to test language models' ability to reason across facts distributed in\nextremely long documents. BABILong includes a diverse set of 20 reasoning\ntasks, including fact chaining, simple induction, deduction, counting, and\nhandling lists/sets. These tasks are challenging on their own, and even more\ndemanding when the required facts are scattered across long natural text. Our\nevaluations show that popular LLMs effectively utilize only 10-20\\% of the\ncontext and their performance declines sharply with increased reasoning\ncomplexity. Among alternatives to in-context reasoning, Retrieval-Augmented\nGeneration methods achieve a modest 60\\% accuracy on single-fact question\nanswering, independent of context length. 
Among context extension methods, the\nhighest performance is demonstrated by recurrent memory transformers after\nfine-tuning, enabling the processing of lengths up to 50 million tokens. The\nBABILong benchmark is extendable to any length to support the evaluation of new\nupcoming models with increased capabilities, and we provide splits up to 10\nmillion token lengths.\n","authors":["Yuri Kuratov","Aydar Bulatov","Petr Anokhin","Ivan Rodkin","Dmitry Sorokin","Artyom Sorokin","Mikhail Burtsev"],"pdf_url":"https://arxiv.org/pdf/2406.10149v2.pdf","comment":"NeurIPS 2024 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2410.12656v2","updated":"2024-11-06T14:14:58Z","published":"2024-10-16T15:17:20Z","title":"Evaluating Morphological Compositional Generalization in Large Language\n Models","summary":" Large language models (LLMs) have demonstrated significant progress in\nvarious natural language generation and understanding tasks. However, their\nlinguistic generalization capabilities remain questionable, raising doubts\nabout whether these models learn language similarly to humans. While humans\nexhibit compositional generalization and linguistic creativity in language use,\nthe extent to which LLMs replicate these abilities, particularly in morphology,\nis under-explored. In this work, we systematically investigate the\nmorphological generalization abilities of LLMs through the lens of\ncompositionality. We define morphemes as compositional primitives and design a\nnovel suite of generative and discriminative tasks to assess morphological\nproductivity and systematicity. Focusing on agglutinative languages such as\nTurkish and Finnish, we evaluate several state-of-the-art instruction-finetuned\nmultilingual models, including GPT-4 and Gemini. Our analysis shows that LLMs\nstruggle with morphological compositional generalization particularly when\napplied to novel word roots, with performance declining sharply as\nmorphological complexity increases. While models can identify individual\nmorphological combinations better than chance, their performance lacks\nsystematicity, leading to significant accuracy gaps compared to humans.\n","authors":["Mete Ismayilzada","Defne Circi","Jonne Sälevä","Hale Sirin","Abdullatif Köksal","Bhuwan Dhingra","Antoine Bosselut","Lonneke van der Plas","Duygu Ataman"],"pdf_url":"https://arxiv.org/pdf/2410.12656v2.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2411.03934v1","updated":"2024-11-06T14:11:39Z","published":"2024-11-06T14:11:39Z","title":"Interactions Across Blocks in Post-Training Quantization of Large\n Language Models","summary":" Post-training quantization is widely employed to reduce the computational\ndemands of neural networks. Typically, individual substructures, such as layers\nor blocks of layers, are quantized with the objective of minimizing\nquantization errors in their pre-activations by fine-tuning the corresponding\nweights. Deriving this local objective from the global objective of minimizing\ntask loss involves two key simplifications: assuming substructures are mutually\nindependent and ignoring the knowledge of subsequent substructures as well as\nthe task loss. In this work, we assess the effects of these simplifications on\nweight-only quantization of large language models. We introduce two multi-block\nfine-tuning strategies and compare them against the baseline of fine-tuning\nsingle transformer blocks. The first captures correlations of weights across\nblocks by jointly optimizing multiple quantized blocks. 
The second incorporates\nknowledge of subsequent blocks by minimizing the error in downstream\npre-activations rather than focusing solely on the quantized block. Our\nfindings indicate that the effectiveness of these methods depends on the\nspecific network model, with no impact on some models but demonstrating\nsignificant benefits for others.\n","authors":["Khasmamad Shabanovi","Lukas Wiest","Vladimir Golkov","Daniel Cremers","Thomas Pfeil"],"pdf_url":"https://arxiv.org/pdf/2411.03934v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07001v4","updated":"2024-11-06T13:56:28Z","published":"2024-05-11T12:33:46Z","title":"ChartInsights: Evaluating Multimodal Large Language Models for Low-Level\n Chart Question Answering","summary":" Chart question answering (ChartQA) tasks play a critical role in interpreting\nand extracting insights from visualization charts. While recent advancements in\nmultimodal large language models (MLLMs) like GPT-4o have shown promise in\nhigh-level ChartQA tasks, such as chart captioning, their effectiveness in\nlow-level ChartQA tasks (e.g., identifying correlations) remains underexplored.\nIn this paper, we address this gap by evaluating MLLMs on low-level ChartQA\nusing a newly curated dataset, ChartInsights, which consists of 22,347 (chart,\ntask, query, answer) covering 10 data analysis tasks across 7 chart types. We\nsystematically evaluate 19 advanced MLLMs, including 12 open-source and 7\nclosed-source models. The average accuracy rate across these models is 39.8%,\nwith GPT-4o achieving the highest accuracy at 69.17%. To further explore the\nlimitations of MLLMs in low-level ChartQA, we conduct experiments that alter\nvisual elements of charts (e.g., changing color schemes, adding image noise) to\nassess their impact on the task effectiveness. Furthermore, we propose a new\ntextual prompt strategy, Chain-of-Charts, tailored for low-level ChartQA tasks,\nwhich boosts performance by 14.41%, achieving an accuracy of 83.58%. Finally,\nincorporating a visual prompt strategy that directs attention to relevant\nvisual elements further improves accuracy to 84.32%.\n","authors":["Yifan Wu","Lutao Yan","Leixian Shen","Yunhai Wang","Nan Tang","Yuyu Luo"],"pdf_url":"https://arxiv.org/pdf/2405.07001v4.pdf","comment":"EMNLP 2024 Conference Paper"},{"id":"http://arxiv.org/abs/2411.03923v1","updated":"2024-11-06T13:54:08Z","published":"2024-11-06T13:54:08Z","title":"Evaluation data contamination in LLMs: how do we measure it and (when)\n does it matter?","summary":" Hampering the interpretation of benchmark scores, evaluation data\ncontamination has become a growing concern in the evaluation of LLMs, and an\nactive area of research studies its effects. While evaluation data\ncontamination is easily understood intuitively, it is surprisingly difficult to\ndefine precisely which samples should be considered contaminated and,\nconsequently, how it impacts benchmark scores. We propose that these questions\nshould be addressed together and that contamination metrics can be assessed\nbased on whether models benefit from the examples they mark contaminated. We\npropose a novel analysis method called ConTAM, and show with a large scale\nsurvey of existing and novel n-gram based contamination metrics across 13\nbenchmarks and 7 models from 2 different families that ConTAM can be used to\nbetter understand evaluation data contamination and its effects. 
We find that\ncontamination may have a much larger effect than reported in recent LLM\nreleases and benefits models differently at different scales. We also find that\nconsidering only the longest contaminated substring provides a better signal\nthan considering a union of all contaminated substrings, and that doing model\nand benchmark specific threshold analysis greatly increases the specificity of\nthe results. Lastly, we investigate the impact of hyperparameter choices,\nfinding that, among other things, both using larger values of n and\ndisregarding matches that are infrequent in the pre-training data lead to many\nfalse negatives. With ConTAM, we provide a method to empirically ground\nevaluation data contamination metrics in downstream effects. With our\nexploration, we shed light on how evaluation data contamination can impact LLMs\nand provide insight into the considerations important when doing contamination\nanalysis. We end our paper by discussing these in more detail and providing\nconcrete suggestions for future work.\n","authors":["Aaditya K. Singh","Muhammed Yusuf Kocyigit","Andrew Poulton","David Esiobu","Maria Lomeli","Gergely Szilvasy","Dieuwke Hupkes"],"pdf_url":"https://arxiv.org/pdf/2411.03923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03920v1","updated":"2024-11-06T13:51:42Z","published":"2024-11-06T13:51:42Z","title":"RAGulator: Lightweight Out-of-Context Detectors for Grounded Text\n Generation","summary":" Real-time detection of out-of-context LLM outputs is crucial for enterprises\nlooking to safely adopt RAG applications. In this work, we train lightweight\nmodels to discriminate LLM-generated text that is semantically out-of-context\nfrom retrieved text documents. We preprocess a combination of summarisation and\nsemantic textual similarity datasets to construct training data using minimal\nresources. We find that DeBERTa is not only the best-performing model under\nthis pipeline, but it is also fast and does not require additional text\npreprocessing or feature engineering. While emerging work demonstrates that\ngenerative LLMs can also be fine-tuned and used in complex data pipelines to\nachieve state-of-the-art performance, we note that speed and resource limits\nare important considerations for on-premise deployment.\n","authors":["Ian Poey","Jiajun Liu","Qishuai Zhong","Adrien Chenailler"],"pdf_url":"https://arxiv.org/pdf/2411.03920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04158v1","updated":"2024-11-06T13:50:50Z","published":"2024-11-06T13:50:50Z","title":"Analyzing Multimodal Features of Spontaneous Voice Assistant Commands\n for Mild Cognitive Impairment Detection","summary":" Mild cognitive impairment (MCI) is a major public health concern due to its\nhigh risk of progressing to dementia. This study investigates the potential of\ndetecting MCI with spontaneous voice assistant (VA) commands from 35 older\nadults in a controlled setting. Specifically, a command-generation task is\ndesigned with pre-defined intents for participants to freely generate commands\nthat are more associated with cognitive ability than read commands. We develop\nMCI classification and regression models with audio, textual, intent, and\nmultimodal fusion features. We find the command-generation task outperforms the\ncommand-reading task with an average classification accuracy of 82%, achieved\nby leveraging multimodal fusion features. In addition, generated commands\ncorrelate more strongly with memory and attention subdomains than read\ncommands. 
Our results confirm the effectiveness of the command-generation task\nand imply the promise of using longitudinal in-home commands for MCI detection.\n","authors":["Nana Lin","Youxiang Zhu","Xiaohui Liang","John A. Batsis","Caroline Summerour"],"pdf_url":"https://arxiv.org/pdf/2411.04158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02937v2","updated":"2024-11-06T13:40:25Z","published":"2024-11-05T09:27:21Z","title":"Benchmarking Multimodal Retrieval Augmented Generation with Dynamic VQA\n Dataset and Self-adaptive Planning Agent","summary":" Multimodal Retrieval Augmented Generation (mRAG) plays an important role in\nmitigating the \"hallucination\" issue inherent in multimodal large language\nmodels (MLLMs). Although promising, existing heuristic mRAGs typically\npredefined fixed retrieval processes, which causes two issues: (1) Non-adaptive\nRetrieval Queries. (2) Overloaded Retrieval Queries. However, these flaws\ncannot be adequately reflected by current knowledge-seeking visual question\nanswering (VQA) datasets, since the most required knowledge can be readily\nobtained with a standard two-step retrieval. To bridge the dataset gap, we\nfirst construct Dyn-VQA dataset, consisting of three types of \"dynamic\"\nquestions, which require complex knowledge retrieval strategies variable in\nquery, tool, and time: (1) Questions with rapidly changing answers. (2)\nQuestions requiring multi-modal knowledge. (3) Multi-hop questions. Experiments\non Dyn-VQA reveal that existing heuristic mRAGs struggle to provide sufficient\nand precisely relevant knowledge for dynamic questions due to their rigid\nretrieval processes. Hence, we further propose the first self-adaptive planning\nagent for multimodal retrieval, OmniSearch. The underlying idea is to emulate\nthe human behavior in question solution which dynamically decomposes complex\nmultimodal questions into sub-question chains with retrieval action. Extensive\nexperiments prove the effectiveness of our OmniSearch, also provide direction\nfor advancing mRAG. The code and dataset will be open-sourced at\nhttps://github.com/Alibaba-NLP/OmniSearch.\n","authors":["Yangning Li","Yinghui Li","Xinyu Wang","Yong Jiang","Zhen Zhang","Xinran Zheng","Hui Wang","Hai-Tao Zheng","Pengjun Xie","Philip S. Yu","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.02937v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03906v1","updated":"2024-11-06T13:37:28Z","published":"2024-11-06T13:37:28Z","title":"Lexicalization Is All You Need: Examining the Impact of Lexical\n Knowledge in a Compositional QALD System","summary":" In this paper, we examine the impact of lexicalization on Question Answering\nover Linked Data (QALD). It is well known that one of the key challenges in\ninterpreting natural language questions with respect to SPARQL lies in bridging\nthe lexical gap, that is mapping the words in the query to the correct\nvocabulary elements. We argue in this paper that lexicalization, that is\nexplicit knowledge about the potential interpretations of a word with respect\nto the given vocabulary, significantly eases the task and increases the\nperformance of QA systems. Towards this goal, we present a compositional QA\nsystem that can leverage explicit lexical knowledge in a compositional manner\nto infer the meaning of a question in terms of a SPARQL query. 
We show that\nsuch a system, given lexical knowledge, has a performance well beyond current\nQA systems, achieving up to a $35.8\\%$ increase in the micro $F_1$ score\ncompared to the best QA system on QALD-9. This shows the importance and\npotential of including explicit lexical knowledge. In contrast, we show that\nLLMs have limited abilities to exploit lexical knowledge, with only marginal\nimprovements compared to a version without lexical knowledge. This shows that\nLLMs have no ability to compositionally interpret a question on the basis of\nthe meaning of its parts, a key feature of compositional approaches. Taken\ntogether, our work shows new avenues for QALD research, emphasizing the\nimportance of lexicalization and compositionality.\n","authors":["David Maria Schmidt","Mohammad Fazleh Elahi","Philipp Cimiano"],"pdf_url":"https://arxiv.org/pdf/2411.03906v1.pdf","comment":"24th International Conference on Knowledge Engineering and Knowledge\n Management (EKAW 2024), November 26-28, 2024, Amsterdam, The Netherlands"},{"id":"http://arxiv.org/abs/2411.03895v1","updated":"2024-11-06T13:13:33Z","published":"2024-11-06T13:13:33Z","title":"Computational Analysis of Gender Depiction in the Comedias of Calderón\n de la Barca","summary":" In theatre, playwrights use the portrayal of characters to explore culturally\nbased gender norms. In this paper, we develop quantitative methods to study\ngender depiction in the non-religious works (comedias) of Pedro Calder\\'on de\nla Barca, a prolific Spanish 17th century author. We gather insights from a\ncorpus of more than 100 plays by using a gender classifier and applying model\nexplainability (attribution) methods to determine which text features are most\ninfluential in the model's decision to classify speech as 'male' or 'female',\nindicating the most gendered elements of dialogue in Calder\\'on's comedias in a\nhuman accessible manner. We find that female and male characters are portrayed\ndifferently and can be identified by the gender prediction model at practically\nuseful accuracies (up to f=0.83). Analysis reveals semantic aspects of gender\nportrayal, and demonstrates that the model is even useful in providing a\nrelatively accurate scene-by-scene prediction of cross-dressing characters.\n","authors":["Allison Keith","Antonio Rojas Castro","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2411.03895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03888v1","updated":"2024-11-06T13:06:43Z","published":"2024-11-06T13:06:43Z","title":"Multi3Hate: Multimodal, Multilingual, and Multicultural Hate Speech\n Detection with Vision-Language Models","summary":" Warning: this paper contains content that may be offensive or upsetting\n Hate speech moderation on global platforms poses unique challenges due to the\nmultimodal and multilingual nature of content, along with the varying cultural\nperceptions. How well do current vision-language models (VLMs) navigate these\nnuances? To investigate this, we create the first multimodal and multilingual\nparallel hate speech dataset, annotated by a multicultural set of annotators,\ncalled Multi3Hate. It contains 300 parallel meme samples across 5 languages:\nEnglish, German, Spanish, Hindi, and Mandarin. We demonstrate that cultural\nbackground significantly affects multimodal hate speech annotation in our\ndataset. The average pairwise agreement among countries is just 74%,\nsignificantly lower than that of randomly selected annotator groups. 
Our\nqualitative analysis indicates that the lowest pairwise label agreement-only\n67% between the USA and India-can be attributed to cultural factors. We then\nconduct experiments with 5 large VLMs in a zero-shot setting, finding that\nthese models align more closely with annotations from the US than with those\nfrom other cultures, even when the memes and prompts are presented in the\ndominant language of the other culture. Code and dataset are available at\nhttps://github.com/MinhDucBui/Multi3Hate.\n","authors":["Minh Duc Bui","Katharina von der Wense","Anne Lauscher"],"pdf_url":"https://arxiv.org/pdf/2411.03888v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2410.20746v3","updated":"2024-11-06T13:05:51Z","published":"2024-10-28T05:25:50Z","title":"ElectionSim: Massive Population Election Simulation Powered by Large\n Language Model Driven Agents","summary":" The massive population election simulation aims to model the preferences of\nspecific groups in particular election scenarios. It has garnered significant\nattention for its potential to forecast real-world social trends. Traditional\nagent-based modeling (ABM) methods are constrained by their ability to\nincorporate complex individual background information and provide interactive\nprediction results. In this paper, we introduce ElectionSim, an innovative\nelection simulation framework based on large language models, designed to\nsupport accurate voter simulations and customized distributions, together with\nan interactive platform to dialogue with simulated voters. We present a\nmillion-level voter pool sampled from social media platforms to support\naccurate individual simulation. We also introduce PPE, a poll-based\npresidential election benchmark to assess the performance of our framework\nunder the U.S. presidential election scenario. Through extensive experiments\nand analyses, we demonstrate the effectiveness and robustness of our framework\nin U.S. presidential election simulations.\n","authors":["Xinnong Zhang","Jiayu Lin","Libo Sun","Weihong Qi","Yihang Yang","Yue Chen","Hanjia Lyu","Xinyi Mou","Siming Chen","Jiebo Luo","Xuanjing Huang","Shiping Tang","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2410.20746v3.pdf","comment":"42 pages, 14 figures"},{"id":"http://arxiv.org/abs/2411.03884v1","updated":"2024-11-06T13:00:34Z","published":"2024-11-06T13:00:34Z","title":"Polynomial Composition Activations: Unleashing the Dynamics of Large\n Language Models","summary":" Transformers have found extensive applications across various domains due to\nthe powerful fitting capabilities. This success can be partially attributed to\ntheir inherent nonlinearity. Thus, in addition to the ReLU function employed in\nthe original transformer architecture, researchers have explored alternative\nmodules such as GeLU and SwishGLU to enhance nonlinearity and thereby augment\nrepresentational capacity. In this paper, we propose a novel category of\npolynomial composition activations (PolyCom), designed to optimize the dynamics\nof transformers. Theoretically, we provide a comprehensive mathematical\nanalysis of PolyCom, highlighting its enhanced expressivity and efficacy\nrelative to other activation functions. Notably, we demonstrate that networks\nincorporating PolyCom achieve the $\\textbf{optimal approximation rate}$,\nindicating that PolyCom networks require minimal parameters to approximate\ngeneral smooth functions in Sobolev spaces. 
We conduct empirical experiments on\nthe pre-training configurations of large language models (LLMs), including both\ndense and sparse architectures. By substituting conventional activation\nfunctions with PolyCom, we enable LLMs to capture higher-order interactions\nwithin the data, thus improving performance metrics in terms of accuracy and\nconvergence rates. Extensive experimental results demonstrate the effectiveness\nof our method, showing substantial improvements over other activation\nfunctions. Code is available at https://github.com/BryceZhuo/PolyCom.\n","authors":["Zhijian Zhuo","Ya Wang","Yutao Zeng","Xiaoqing Li","Xun Zhou","Jinwen Ma"],"pdf_url":"https://arxiv.org/pdf/2411.03884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10499v3","updated":"2024-11-06T12:35:37Z","published":"2024-07-15T07:43:55Z","title":"CIBench: Evaluating Your LLMs with a Code Interpreter Plugin","summary":" While LLM-Based agents, which use external tools to solve complex problems,\nhave made significant progress, benchmarking their ability is challenging,\nthereby hindering a clear understanding of their limitations. In this paper, we\npropose an interactive evaluation framework, named CIBench, to comprehensively\nassess LLMs' ability to utilize code interpreters for data science tasks. Our\nevaluation framework includes an evaluation dataset and two evaluation modes.\nThe evaluation dataset is constructed using an LLM-human cooperative approach\nand simulates an authentic workflow by leveraging consecutive and interactive\nIPython sessions. The two evaluation modes assess LLMs' ability with and\nwithout human assistance. We conduct extensive experiments to analyze the\nability of 24 LLMs on CIBench and provide valuable insights for future LLMs in\ncode interpreter utilization.\n","authors":["Chuyu Zhang","Songyang Zhang","Yingfan Hu","Haowen Shen","Kuikun Liu","Zerun Ma","Fengzhe Zhou","Wenwei Zhang","Xuming He","Dahua Lin","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2407.10499v3.pdf","comment":"Under review. The first three authors contribute equally, and\n Songyang Zhang is the project leader"},{"id":"http://arxiv.org/abs/2411.03866v1","updated":"2024-11-06T12:22:04Z","published":"2024-11-06T12:22:04Z","title":"Performance evaluation of SLAM-ASR: The Good, the Bad, the Ugly, and the\n Way Forward","summary":" Recent research has demonstrated that training a linear connector between\nspeech foundation encoders and large language models (LLMs) enables this\narchitecture to achieve strong ASR capabilities. Despite the impressive\nresults, it remains unclear whether these simple approaches are robust enough\nacross different scenarios and speech conditions, such as domain shifts and\ndifferent speech perturbations. In this paper, we address these questions by\nconducting various ablation experiments using a recent and widely adopted\napproach called SLAM-ASR. We present novel empirical findings that offer\ninsights on how to effectively utilize the SLAM-ASR architecture across a wide\nrange of settings. Our main findings indicate that the SLAM-ASR exhibits poor\nperformance in cross-domain evaluation settings. Additionally, speech\nperturbations within in-domain data, such as changes in speed or the presence\nof additive noise, can significantly impact performance. 
Our findings offer\ncritical insights for fine-tuning and configuring robust LLM-based ASR models,\ntailored to different data characteristics and computational resources.\n","authors":["Shashi Kumar","Iuliia Thorbecke","Sergio Burdisso","Esaú Villatoro-Tello","Manjunath K E","Kadri Hacioğlu","Pradeep Rangappa","Petr Motlicek","Aravind Ganapathiraju","Andreas Stolcke"],"pdf_url":"https://arxiv.org/pdf/2411.03866v1.pdf","comment":"Submitted to ICASSP 2025 SALMA Workshop"},{"id":"http://arxiv.org/abs/2404.01204v3","updated":"2024-11-06T12:02:52Z","published":"2024-04-01T16:00:01Z","title":"The Fine Line: Navigating Large Language Model Pretraining with\n Down-streaming Capability Analysis","summary":" Uncovering early-stage metrics that reflect final model performance is one\ncore principle for large-scale pretraining. The existing scaling law\ndemonstrates the power-law correlation between pretraining loss and training\nflops, which serves as an important indicator of the current training state for\nlarge language models. However, this principle only focuses on the model's\ncompression properties on the training data, resulting in an inconsistency with\nthe ability improvements on the downstream tasks. Some follow-up works\nattempted to extend the scaling-law to more complex metrics (such as\nhyperparameters), but still lacked a comprehensive analysis of the dynamic\ndifferences among various capabilities during pretraining. To address the\naforementioned limitations, this paper undertakes a comprehensive comparison of\nmodel capabilities at various pretraining intermediate checkpoints. Through\nthis analysis, we confirm that specific downstream metrics exhibit similar\ntraining dynamics across models of different sizes, up to 67 billion\nparameters. In addition to our core findings, we've reproduced Amber and\nOpenLLaMA, releasing their intermediate checkpoints. This initiative offers\nvaluable resources to the research community and facilitates the verification\nand exploration of LLM pretraining by open-source researchers. Besides, we\nprovide empirical summaries, including performance comparisons of different\nmodels and capabilities, and tuition of key metrics for different training\nphases. Based on these findings, we provide a more user-friendly strategy for\nevaluating the optimization state, offering guidance for establishing a stable\npretraining process.\n","authors":["Chen Yang","Junzhuo Li","Xinyao Niu","Xinrun Du","Songyang Gao","Haoran Zhang","Zhaoliang Chen","Xingwei Qu","Ruibin Yuan","Yizhi Li","Jiaheng Liu","Stephen W. Huang","Shawn Yue","Ge Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.01204v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03855v1","updated":"2024-11-06T11:57:55Z","published":"2024-11-06T11:57:55Z","title":"MambaPEFT: Exploring Parameter-Efficient Fine-Tuning for Mamba","summary":" An ecosystem of Transformer-based models has been established by building\nlarge models with extensive data. Parameter-efficient fine-tuning (PEFT) is a\ncrucial technology for deploying these models to downstream tasks with minimal\ncost while achieving effective performance. Recently, Mamba, a State Space\nModel (SSM)-based model, has attracted attention as a potential alternative to\nTransformers. While many large-scale Mamba-based models have been proposed,\nefficiently adapting pre-trained Mamba-based models to downstream tasks remains\nunexplored. In this paper, we conduct an exploratory analysis of PEFT methods\nfor Mamba. 
We investigate the effectiveness of existing PEFT methods for\nTransformers when applied to Mamba. We also modify these methods to better\nalign with the Mamba architecture. Additionally, we propose new Mamba-specific\nPEFT methods that leverage the distinctive structure of Mamba. Our experiments\nindicate that PEFT performs more effectively for Mamba than Transformers.\nLastly, we demonstrate how to effectively combine multiple PEFT methods and\nprovide a framework that outperforms previous works. To ensure reproducibility,\nwe will release the code after publication.\n","authors":["Masakazu Yoshimura","Teruaki Hayashi","Yota Maeda"],"pdf_url":"https://arxiv.org/pdf/2411.03855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.19453v2","updated":"2024-11-06T11:49:10Z","published":"2024-10-25T10:28:59Z","title":"ShifCon: Enhancing Non-Dominant Language Capabilities with a Shift-based\n Contrastive Framework","summary":" Although fine-tuning Large Language Models (LLMs) with multilingual data can\nrapidly enhance the multilingual capabilities of LLMs, they still exhibit a\nperformance gap between the dominant language (e.g., English) and non-dominant\nones due to the imbalance of training data across languages. To further enhance\nthe performance of non-dominant languages, we propose ShifCon, a Shift-based\nContrastive framework that aligns the internal forward process of other\nlanguages toward that of the dominant one. Specifically, it shifts the\nrepresentations of non-dominant languages into the dominant language subspace,\nallowing them to access relatively rich information encoded in the model\nparameters. The enriched representations are then shifted back into their\noriginal language subspace before generation. Moreover, we introduce a subspace\ndistance metric to pinpoint the optimal layer area for shifting representations\nand employ multilingual contrastive learning to further enhance the alignment\nof representations within this area. Experiments demonstrate that our ShifCon\nframework significantly enhances the performance of non-dominant languages,\nparticularly for low-resource ones. Further analysis offers extra insights to\nverify the effectiveness of ShifCon and propel future research\n","authors":["Hengyuan Zhang","Chenming Shang","Sizhe Wang","Dongdong Zhang","Renliang Sun","Yiyao Yu","Yujiu Yang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2410.19453v2.pdf","comment":"23 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.02832v2","updated":"2024-11-06T11:19:42Z","published":"2024-11-05T06:11:17Z","title":"PersianRAG: A Retrieval-Augmented Generation System for Persian Language","summary":" Retrieval augmented generation (RAG) models, which integrate large-scale\npre-trained generative models with external retrieval mechanisms, have shown\nsignificant success in various natural language processing (NLP) tasks.\nHowever, applying RAG models in Persian language as a low-resource language,\nposes distinct challenges. These challenges primarily involve the\npreprocessing, embedding, retrieval, prompt construction, language modeling,\nand response evaluation of the system. In this paper, we address the challenges\ntowards implementing a real-world RAG system for Persian language called\nPersianRAG. We propose novel solutions to overcome these obstacles and evaluate\nour approach using several Persian benchmark datasets. 
Our experimental results\ndemonstrate the capability of the PersianRAG framework to enhance question\nanswering task in Persian.\n","authors":["Hossein Hosseini","Mohammad Sobhan Zare","Amir Hossein Mohammadi","Arefeh Kazemi","Zahra Zojaji","Mohammad Ali Nematbakhsh"],"pdf_url":"https://arxiv.org/pdf/2411.02832v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01192v2","updated":"2024-11-06T11:19:01Z","published":"2024-11-02T09:39:49Z","title":"Swan and ArabicMTEB: Dialect-Aware, Arabic-Centric, Cross-Lingual, and\n Cross-Cultural Embedding Models and Benchmarks","summary":" We introduce {\\bf Swan}, a family of embedding models centred around the\nArabic language, addressing both small-scale and large-scale use cases. Swan\nincludes two variants: Swan-Small, based on ARBERTv2, and Swan-Large, built on\nArMistral, a pretrained Arabic large language model. To evaluate these models,\nwe propose ArabicMTEB, a comprehensive benchmark suite that assesses\ncross-lingual, multi-dialectal, multi-domain, and multi-cultural Arabic text\nembedding performance, covering eight diverse tasks and spanning 94 datasets.\nSwan-Large achieves state-of-the-art results, outperforming\nMultilingual-E5-large in most Arabic tasks, while the Swan-Small consistently\nsurpasses Multilingual-E5-base. Our extensive evaluations demonstrate that Swan\nmodels are both dialectally and culturally aware, excelling across various\nArabic domains while offering significant monetary efficiency. This work\nsignificantly advances the field of Arabic language modelling and provides\nvaluable resources for future research and applications in Arabic natural\nlanguage processing. Our models and benchmark will be made publicly accessible\nfor research.\n","authors":["Gagan Bhatia","El Moatez Billah Nagoudi","Abdellah El Mekki","Fakhraddin Alwajih","Muhammad Abdul-Mageed"],"pdf_url":"https://arxiv.org/pdf/2411.01192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03823v1","updated":"2024-11-06T10:44:15Z","published":"2024-11-06T10:44:15Z","title":"Both Text and Images Leaked! A Systematic Analysis of Multimodal LLM\n Data Contamination","summary":" The rapid progression of multimodal large language models (MLLMs) has\ndemonstrated superior performance on various multimodal benchmarks. However,\nthe issue of data contamination during training creates challenges in\nperformance evaluation and comparison. While numerous methods exist for\ndetecting dataset contamination in large language models (LLMs), they are less\neffective for MLLMs due to their various modalities and multiple training\nphases. In this study, we introduce a multimodal data contamination detection\nframework, MM-Detect, designed for MLLMs. Our experimental results indicate\nthat MM-Detect is sensitive to varying degrees of contamination and can\nhighlight significant performance improvements due to leakage of the training\nset of multimodal benchmarks. 
Furthermore, We also explore the possibility of\ncontamination originating from the pre-training phase of LLMs used by MLLMs and\nthe fine-tuning phase of MLLMs, offering new insights into the stages at which\ncontamination may be introduced.\n","authors":["Dingjie Song","Sicheng Lai","Shunian Chen","Lichao Sun","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03817v1","updated":"2024-11-06T10:35:11Z","published":"2024-11-06T10:35:11Z","title":"From Novice to Expert: LLM Agent Policy Optimization via Step-wise\n Reinforcement Learning","summary":" The outstanding capabilities of large language models (LLMs) render them a\ncrucial component in various autonomous agent systems. While traditional\nmethods depend on the inherent knowledge of LLMs without fine-tuning, more\nrecent approaches have shifted toward the reinforcement learning strategy to\nfurther enhance agents' ability to solve complex interactive tasks with\nenvironments and tools. However, previous approaches are constrained by the\nsparse reward issue, where existing datasets solely provide a final scalar\nreward for each multi-step reasoning chain, potentially leading to\nineffectiveness and inefficiency in policy learning. In this paper, we\nintroduce StepAgent, which utilizes step-wise reward to optimize the agent's\nreinforcement learning process. Inheriting the spirit of novice-to-expert\ntheory, we first compare the actions of the expert and the agent to\nautomatically generate intermediate rewards for fine-grained optimization.\nAdditionally, we propose implicit-reward and inverse reinforcement learning\ntechniques to facilitate agent reflection and policy adjustment. Further\ntheoretical analysis demonstrates that the action distribution of the agent can\nconverge toward the expert action distribution over multiple training cycles.\nExperimental results across various datasets indicate that StepAgent\noutperforms existing baseline methods.\n","authors":["Zhirui Deng","Zhicheng Dou","Yutao Zhu","Ji-Rong Wen","Ruibin Xiong","Mang Wang","Weipeng Chen"],"pdf_url":"https://arxiv.org/pdf/2411.03817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03814v1","updated":"2024-11-06T10:32:09Z","published":"2024-11-06T10:32:09Z","title":"MRJ-Agent: An Effective Jailbreak Agent for Multi-Round Dialogue","summary":" Large Language Models (LLMs) demonstrate outstanding performance in their\nreservoir of knowledge and understanding capabilities, but they have also been\nshown to be prone to illegal or unethical reactions when subjected to jailbreak\nattacks. To ensure their responsible deployment in critical applications, it is\ncrucial to understand the safety capabilities and vulnerabilities of LLMs.\nPrevious works mainly focus on jailbreak in single-round dialogue, overlooking\nthe potential jailbreak risks in multi-round dialogues, which are a vital way\nhumans interact with and extract information from LLMs. Some studies have\nincreasingly concentrated on the risks associated with jailbreak in multi-round\ndialogues. These efforts typically involve the use of manually crafted\ntemplates or prompt engineering techniques. However, due to the inherent\ncomplexity of multi-round dialogues, their jailbreak performance is limited. To\nsolve this problem, we propose a novel multi-round dialogue jailbreaking agent,\nemphasizing the importance of stealthiness in identifying and mitigating\npotential threats to human values posed by LLMs. 
We propose a risk\ndecomposition strategy that distributes risks across multiple rounds of queries\nand utilizes psychological strategies to enhance attack strength. Extensive\nexperiments show that our proposed method surpasses other attack methods and\nachieves state-of-the-art attack success rate. We will make the corresponding\ncode and dataset available for future research. The code will be released soon.\n","authors":["Fengxiang Wang","Ranjie Duan","Peng Xiao","Xiaojun Jia","YueFeng Chen","Chongwen Wang","Jialing Tao","Hang Su","Jun Zhu","Hui Xue"],"pdf_url":"https://arxiv.org/pdf/2411.03814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04156v1","updated":"2024-11-06T10:28:46Z","published":"2024-11-06T10:28:46Z","title":"Crystal: Illuminating LLM Abilities on Language and Code","summary":" Large Language Models (LLMs) specializing in code generation (which are also\noften referred to as code LLMs), e.g., StarCoder and Code Llama, play\nincreasingly critical roles in various software development scenarios. It is\nalso crucial for code LLMs to possess both code generation and natural language\nabilities for many specific applications, such as code snippet retrieval using\nnatural language or code explanations. The intricate interaction between\nacquiring language and coding skills complicates the development of strong code\nLLMs. Furthermore, there is a lack of thorough prior studies on the LLM\npretraining strategy that mixes code and natural language. In this work, we\npropose a pretraining strategy to enhance the integration of natural language\nand coding capabilities within a single LLM. Specifically, it includes two\nphases of training with appropriately adjusted code/language ratios. The\nresulting model, Crystal, demonstrates remarkable capabilities in both domains.\nSpecifically, it has natural language and coding performance comparable to that\nof Llama 2 and Code Llama, respectively. Crystal exhibits better data\nefficiency, using 1.4 trillion tokens compared to the more than 2 trillion\ntokens used by Llama 2 and Code Llama. We verify our pretraining strategy by\nanalyzing the training process and observe consistent improvements in most\nbenchmarks. We also adopted a typical application adaptation phase with a\ncode-centric data mixture, only to find that it did not lead to enhanced\nperformance or training efficiency, underlining the importance of a carefully\ndesigned data recipe. To foster research within the community, we commit to\nopen-sourcing every detail of the pretraining, including our training datasets,\ncode, loggings and 136 checkpoints throughout the training.\n","authors":["Tianhua Tao","Junbo Li","Bowen Tan","Hongyi Wang","William Marshall","Bhargav M Kanakiya","Joel Hestness","Natalia Vassilieva","Zhiqiang Shen","Eric P. Xing","Zhengzhong Liu"],"pdf_url":"https://arxiv.org/pdf/2411.04156v1.pdf","comment":"Published as a conference paper at COLM 2024"},{"id":"http://arxiv.org/abs/2409.18680v3","updated":"2024-11-06T10:27:05Z","published":"2024-09-27T12:06:53Z","title":"Beyond Single-Audio: Advancing Multi-Audio Processing in Audio Large\n Language Models","summary":" Various audio-LLMs (ALLMs) have been explored recently for tackling different\naudio tasks simultaneously using a single, unified model. While existing\nevaluations of ALLMs primarily focus on single-audio tasks, real-world\napplications often involve processing multiple audio streams simultaneously. 
To\nbridge this gap, we propose the first multi-audio evaluation (MAE) benchmark\nthat consists of 20 datasets from 11 multi-audio tasks encompassing both speech\nand sound scenarios. Comprehensive experiments on MAE demonstrate that the\nexisting ALLMs, while being powerful in comprehending primary audio elements in\nindividual audio inputs, struggling to handle multi-audio scenarios. To this\nend, we propose a novel multi-audio-LLM (MALLM) to capture audio context among\nmultiple similar audios using discriminative learning on our proposed synthetic\ndata. The results demonstrate that the proposed MALLM outperforms all baselines\nand achieves high data efficiency using synthetic data without requiring human\nannotations. The proposed MALLM opens the door for ALLMs towards multi-audio\nprocessing era and brings us closer to replicating human auditory capabilities\nin machines.\n","authors":["Yiming Chen","Xianghu Yue","Xiaoxue Gao","Chen Zhang","Luis Fernando D'Haro","Robby T. Tan","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2409.18680v3.pdf","comment":"EMNLP24 Findings. Data available at\n https://github.com/MatthewCYM/MALLM"},{"id":"http://arxiv.org/abs/2411.03811v1","updated":"2024-11-06T10:14:58Z","published":"2024-11-06T10:14:58Z","title":"The natural stability of autonomous morphology","summary":" Autonomous morphology, such as inflection class systems and paradigmatic\ndistribution patterns, is widespread and diachronically resilient in natural\nlanguage. Why this should be so has remained unclear given that autonomous\nmorphology imposes learning costs, offers no clear benefit relative to its\nabsence and could easily be removed by the analogical forces which are\nconstantly reshaping it. Here we propose an explanation for the resilience of\nautonomous morphology, in terms of a diachronic dynamic of attraction and\nrepulsion between morphomic categories, which emerges spontaneously from a\nsimple paradigm cell filling process. Employing computational evolutionary\nmodels, our key innovation is to bring to light the role of `dissociative\nevidence', i.e., evidence for inflectional distinctiveness which a rational\nreasoner will have access to during analogical inference. Dissociative evidence\ncreates a repulsion dynamic which prevents morphomic classes from collapsing\ntogether entirely, i.e., undergoing complete levelling. As we probe alternative\nmodels, we reveal the limits of conditional entropy as a measure for\npredictability in systems that are undergoing change. Finally, we demonstrate\nthat autonomous morphology, far from being `unnatural' (e.g.\n\\citealt{Aronoff1994}), is rather the natural (emergent) consequence of a\nnatural (rational) process of inference applied to inflectional systems.\n","authors":["Erich Round","Louise Esher","Sacha Beniamine"],"pdf_url":"https://arxiv.org/pdf/2411.03811v1.pdf","comment":"Accepted for publication by the journal Morphology"},{"id":"http://arxiv.org/abs/2411.03806v1","updated":"2024-11-06T10:06:21Z","published":"2024-11-06T10:06:21Z","title":"Understanding the Effects of Human-written Paraphrases in LLM-generated\n Text Detection","summary":" Natural Language Generation has been rapidly developing with the advent of\nlarge language models (LLMs). While their usage has sparked significant\nattention from the general public, it is important for readers to be aware when\na piece of text is LLM-generated. 
This has brought about the need for building\nmodels that enable automated LLM-generated text detection, with the aim of\nmitigating potential negative outcomes of such content. Existing LLM-generated\ndetectors show competitive performances in telling apart LLM-generated and\nhuman-written text, but this performance is likely to deteriorate when\nparaphrased texts are considered. In this study, we devise a new data\ncollection strategy to collect Human & LLM Paraphrase Collection (HLPC), a\nfirst-of-its-kind dataset that incorporates human-written texts and\nparaphrases, as well as LLM-generated texts and paraphrases. With the aim of\nunderstanding the effects of human-written paraphrases on the performance of\nstate-of-the-art LLM-generated text detectors OpenAI RoBERTa and watermark\ndetectors, we perform classification experiments that incorporate human-written\nparaphrases, watermarked and non-watermarked LLM-generated documents from GPT\nand OPT, and LLM-generated paraphrases from DIPPER and BART. The results show\nthat the inclusion of human-written paraphrases has a significant impact of\nLLM-generated detector performance, promoting TPR@1%FPR with a possible\ntrade-off of AUROC and accuracy.\n","authors":["Hiu Ting Lau","Arkaitz Zubiaga"],"pdf_url":"https://arxiv.org/pdf/2411.03806v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2304.12507v3","updated":"2024-11-06T23:44:52Z","published":"2023-04-25T01:12:47Z","title":"Learning Task-Specific Strategies for Accelerated MRI","summary":" Compressed sensing magnetic resonance imaging (CS-MRI) seeks to recover\nvisual information from subsampled measurements for diagnostic tasks.\nTraditional CS-MRI methods often separately address measurement subsampling,\nimage reconstruction, and task prediction, resulting in a suboptimal end-to-end\nperformance. In this work, we propose TACKLE as a unified co-design framework\nfor jointly optimizing subsampling, reconstruction, and prediction strategies\nfor the performance on downstream tasks. The na\\\"ive approach of simply\nappending a task prediction module and training with a task-specific loss leads\nto suboptimal downstream performance. Instead, we develop a training procedure\nwhere a backbone architecture is first trained for a generic pre-training task\n(image reconstruction in our case), and then fine-tuned for different\ndownstream tasks with a prediction head. Experimental results on multiple\npublic MRI datasets show that TACKLE achieves an improved performance on\nvarious tasks over traditional CS-MRI methods. We also demonstrate that TACKLE\nis robust to distribution shifts by showing that it generalizes to a new\ndataset we experimentally collected using different acquisition setups from the\ntraining data. Without additional fine-tuning, TACKLE leads to both numerical\nand visual improvements compared to existing baselines. We have further\nimplemented a learned 4$\\times$-accelerated sequence on a Siemens 3T MRI Skyra\nscanner. Compared to the fully-sampling scan that takes 335 seconds, our\noptimized sequence only takes 84 seconds, achieving a four-fold time reduction\nas desired, while maintaining high performance.\n","authors":["Zihui Wu","Tianwei Yin","Yu Sun","Robert Frost","Andre van der Kouwe","Adrian V. Dalca","Katherine L. Bouman"],"pdf_url":"https://arxiv.org/pdf/2304.12507v3.pdf","comment":"Our code is available at https://github.com/zihuiwu/TACKLE. 
More\n information can be found at http://imaging.cms.caltech.edu/tackle/"},{"id":"http://arxiv.org/abs/2304.04901v2","updated":"2024-11-06T23:32:27Z","published":"2023-04-11T00:17:28Z","title":"Efficiently Collecting Training Dataset for 2D Object Detection by\n Online Visual Feedback","summary":" Training deep-learning-based vision systems require the manual annotation of\na significant number of images. Such manual annotation is highly time-consuming\nand labor-intensive. Although previous studies have attempted to eliminate the\neffort required for annotation, the effort required for image collection was\nretained. To address this, we propose a human-in-the-loop dataset collection\nmethod that uses a web application. To counterbalance the workload and\nperformance by encouraging the collection of multi-view object image datasets\nin an enjoyable manner, thereby amplifying motivation, we propose three types\nof online visual feedback features to track the progress of the collection\nstatus. Our experiments thoroughly investigated the impact of each feature on\ncollection performance and quality of operation. The results suggested the\nfeasibility of annotation and object detection.\n","authors":["Takuya Kiyokawa","Naoki Shirakura","Hiroki Katayama","Keita Tomochika","Jun Takamatsu"],"pdf_url":"https://arxiv.org/pdf/2304.04901v2.pdf","comment":"13 pages, 14 figures"},{"id":"http://arxiv.org/abs/2402.03478v2","updated":"2024-11-06T23:02:47Z","published":"2024-02-05T19:39:52Z","title":"Estimating Epistemic and Aleatoric Uncertainty with a Single Model","summary":" Estimating and disentangling epistemic uncertainty, uncertainty that is\nreducible with more training data, and aleatoric uncertainty, uncertainty that\nis inherent to the task at hand, is critically important when applying machine\nlearning to high-stakes applications such as medical imaging and weather\nforecasting. Conditional diffusion models' breakthrough ability to accurately\nand efficiently sample from the posterior distribution of a dataset now makes\nuncertainty estimation conceptually straightforward: One need only train and\nsample from a large ensemble of diffusion models. Unfortunately, training such\nan ensemble becomes computationally intractable as the complexity of the model\narchitecture grows. In this work we introduce a new approach to ensembling,\nhyper-diffusion models (HyperDM), which allows one to accurately estimate both\nepistemic and aleatoric uncertainty with a single model. Unlike existing\nsingle-model uncertainty methods like Monte-Carlo dropout and Bayesian neural\nnetworks, HyperDM offers prediction accuracy on par with, and in some cases\nsuperior to, multi-model ensembles. Furthermore, our proposed approach scales\nto modern network architectures such as Attention U-Net and yields more\naccurate uncertainty estimates compared to existing methods. We validate our\nmethod on two distinct real-world tasks: x-ray computed tomography\nreconstruction and weather temperature forecasting.\n","authors":["Matthew A. Chan","Maria J. Molina","Christopher A. Metzler"],"pdf_url":"https://arxiv.org/pdf/2402.03478v2.pdf","comment":"19 pages, 11 figures. 
To be published in Conference on Neural\n Information Processing Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2312.02985v2","updated":"2024-11-06T23:02:02Z","published":"2023-11-15T13:28:02Z","title":"FocalPose++: Focal Length and Object Pose Estimation via Render and\n Compare","summary":" We introduce FocalPose++, a neural render-and-compare method for jointly\nestimating the camera-object 6D pose and camera focal length given a single RGB\ninput image depicting a known object. The contributions of this work are\nthreefold. First, we derive a focal length update rule that extends an existing\nstate-of-the-art render-and-compare 6D pose estimator to address the joint\nestimation task. Second, we investigate several different loss functions for\njointly estimating the object pose and focal length. We find that a combination\nof direct focal length regression with a reprojection loss disentangling the\ncontribution of translation, rotation, and focal length leads to improved\nresults. Third, we explore the effect of different synthetic training data on\nthe performance of our method. Specifically, we investigate different\ndistributions used for sampling object's 6D pose and camera's focal length when\nrendering the synthetic images, and show that parametric distribution fitted on\nreal training data works the best. We show results on three challenging\nbenchmark datasets that depict known 3D models in uncontrolled settings. We\ndemonstrate that our focal length and 6D pose estimates have lower error than\nthe existing state-of-the-art methods.\n","authors":["Martin Cífka","Georgy Ponimatkin","Yann Labbé","Bryan Russell","Mathieu Aubry","Vladimir Petrik","Josef Sivic"],"pdf_url":"https://arxiv.org/pdf/2312.02985v2.pdf","comment":"25 pages, 22 figures. IEEE TPAMI, 2024. Extended version of the\n conference paper arXiv:2204.05145"},{"id":"http://arxiv.org/abs/2411.04291v1","updated":"2024-11-06T22:19:32Z","published":"2024-11-06T22:19:32Z","title":"Unfair Alignment: Examining Safety Alignment Across Vision Encoder\n Layers in Vision-Language Models","summary":" Vision-language models (VLMs) have improved significantly in multi-modal\ntasks, but their more complex architecture makes their safety alignment more\nchallenging than the alignment of large language models (LLMs). In this paper,\nwe reveal an unfair distribution of safety across the layers of VLM's vision\nencoder, with earlier and middle layers being disproportionately vulnerable to\nmalicious inputs compared to the more robust final layers. This 'cross-layer'\nvulnerability stems from the model's inability to generalize its safety\ntraining from the default architectural settings used during training to unseen\nor out-of-distribution scenarios, leaving certain layers exposed. We conduct a\ncomprehensive analysis by projecting activations from various intermediate\nlayers and demonstrate that these layers are more likely to generate harmful\noutputs when exposed to malicious inputs. Our experiments with LLaVA-1.5 and\nLlama 3.2 show discrepancies in attack success rates and toxicity scores across\nlayers, indicating that current safety alignment strategies focused on a single\ndefault layer are insufficient.\n","authors":["Saketh Bachu","Erfan Shayegani","Trishna Chakraborty","Rohit Lal","Arindam Dutta","Chengyu Song","Yue Dong","Nael Abu-Ghazaleh","Amit K. 
Roy-Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2411.04291v1.pdf","comment":"Preprint, Under Review"},{"id":"http://arxiv.org/abs/2401.03115v2","updated":"2024-11-06T22:09:09Z","published":"2024-01-06T03:03:28Z","title":"Transferable Learned Image Compression-Resistant Adversarial\n Perturbations","summary":" Adversarial attacks can readily disrupt the image classification system,\nrevealing the vulnerability of DNN-based recognition tasks. While existing\nadversarial perturbations are primarily applied to uncompressed images or\ncompressed images by the traditional image compression method, i.e., JPEG,\nlimited studies have investigated the robustness of models for image\nclassification in the context of DNN-based image compression. With the rapid\nevolution of advanced image compression, DNN-based learned image compression\nhas emerged as the promising approach for transmitting images in many\nsecurity-critical applications, such as cloud-based face recognition and\nautonomous driving, due to its superior performance over traditional\ncompression. Therefore, there is a pressing need to fully investigate the\nrobustness of a classification system post-processed by learned image\ncompression. To bridge this research gap, we explore the adversarial attack on\na new pipeline that targets image classification models that utilize learned\nimage compressors as pre-processing modules. Furthermore, to enhance the\ntransferability of perturbations across various quality levels and\narchitectures of learned image compression models, we introduce a saliency\nscore-based sampling method to enable the fast generation of transferable\nperturbation. Extensive experiments with popular attack methods demonstrate the\nenhanced transferability of our proposed method when attacking images that have\nbeen post-processed with different learned image compression models.\n","authors":["Yang Sui","Zhuohang Li","Ding Ding","Xiang Pan","Xiaozhong Xu","Shan Liu","Zhenzhong Chen"],"pdf_url":"https://arxiv.org/pdf/2401.03115v2.pdf","comment":"Accepted by BMVC 2024"},{"id":"http://arxiv.org/abs/2410.12692v2","updated":"2024-11-06T21:46:48Z","published":"2024-10-16T15:52:32Z","title":"Machine learning approach to brain tumor detection and classification","summary":" Brain tumor detection and classification are critical tasks in medical image\nanalysis, particularly in early-stage diagnosis, where accurate and timely\ndetection can significantly improve treatment outcomes. In this study, we apply\nvarious statistical and machine learning models to detect and classify brain\ntumors using brain MRI images. We explore a variety of statistical models\nincluding linear, logistic, and Bayesian regressions, and the machine learning\nmodels including decision tree, random forest, single-layer perceptron,\nmulti-layer perceptron, convolutional neural network (CNN), recurrent neural\nnetwork, and long short-term memory. Our findings show that CNN outperforms\nother models, achieving the best performance. Additionally, we confirm that the\nCNN model can also work for multi-class classification, distinguishing between\nfour categories of brain MRI images such as normal, glioma, meningioma, and\npituitary tumor images. 
This study demonstrates that machine learning\napproaches are suitable for brain tumor detection and classification,\nfacilitating real-world medical applications in assisting radiologists with\nearly and accurate diagnosis.\n","authors":["Alice Oh","Inyoung Noh","Jian Choo","Jihoo Lee","Justin Park","Kate Hwang","Sanghyeon Kim","Soo Min Oh"],"pdf_url":"https://arxiv.org/pdf/2410.12692v2.pdf","comment":"7 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2411.04269v1","updated":"2024-11-06T21:22:46Z","published":"2024-11-06T21:22:46Z","title":"Increasing the scalability of graph convolution for FPGA-implemented\n event-based vision","summary":" Event cameras are becoming increasingly popular as an alternative to\ntraditional frame-based vision sensors, especially in mobile robotics. Taking\nfull advantage of their high temporal resolution, high dynamic range, low power\nconsumption and sparsity of event data, which only reflects changes in the\nobserved scene, requires both an efficient algorithm and a specialised hardware\nplatform. A recent trend involves using Graph Convolutional Neural Networks\n(GCNNs) implemented on a heterogeneous SoC FPGA. In this paper we focus on\noptimising hardware modules for graph convolution to allow flexible selection\nof the FPGA resource (BlockRAM, DSP and LUT) for their implementation. We\npropose a ''two-step convolution'' approach that utilises additional BRAM\nbuffers in order to reduce up to 94% of LUT usage for multiplications. This\nmethod significantly improves the scalability of GCNNs, enabling the deployment\nof models with more layers, larger graphs sizes and their application for more\ndynamic scenarios.\n","authors":["Piotr Wzorek","Kamil Jeziorek","Tomasz Kryjak","Andrea Pinna"],"pdf_url":"https://arxiv.org/pdf/2411.04269v1.pdf","comment":"Accepted for the PhD forum during FPT 2024 (International Conference\n on Field Programmable Technology), 10-12 December 2024, Sydney, Australia"},{"id":"http://arxiv.org/abs/2411.04263v1","updated":"2024-11-06T21:16:02Z","published":"2024-11-06T21:16:02Z","title":"Object Recognition in Human Computer Interaction:- A Comparative\n Analysis","summary":" Human-computer interaction (HCI) has been a widely researched area for many\nyears, with continuous advancements in technology leading to the development of\nnew techniques that change the way we interact with computers. With the recent\nadvent of powerful computers, we recognize human actions and interact\naccordingly, thus revolutionizing the way we interact with computers. The\npurpose of this paper is to provide a comparative analysis of various\nalgorithms used for recognizing user faces and gestures in the context of\ncomputer vision and HCI. This study aims to explore and evaluate the\nperformance of different algorithms in terms of accuracy, robustness, and\nefficiency. 
This study aims to provide a comprehensive analysis of algorithms\nfor face and gesture recognition in the context of computer vision and HCI,\nwith the goal of improving the design and development of interactive systems\nthat are more intuitive, efficient, and user-friendly.\n","authors":["Kaushik Ranade","Tanmay Khule","Riddhi More"],"pdf_url":"https://arxiv.org/pdf/2411.04263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04255v1","updated":"2024-11-06T20:55:30Z","published":"2024-11-06T20:55:30Z","title":"Pose-Transformation and Radial Distance Clustering for Unsupervised\n Person Re-identification","summary":" Person re-identification (re-ID) aims to tackle the problem of matching\nidentities across non-overlapping cameras. Supervised approaches require\nidentity information that may be difficult to obtain and are inherently biased\ntowards the dataset they are trained on, making them unscalable across domains.\nTo overcome these challenges, we propose an unsupervised approach to the person\nre-ID setup. Having zero knowledge of true labels, our proposed method enhances\nthe discriminating ability of the learned features via a novel two-stage\ntraining strategy. The first stage involves training a deep network on an\nexpertly designed pose-transformed dataset obtained by generating multiple\nperturbations for each original image in the pose space. Next, the network\nlearns to map similar features closer in the feature space using the proposed\ndiscriminative clustering algorithm. We introduce a novel radial distance loss,\nthat attends to the fundamental aspects of feature learning - compact clusters\nwith low intra-cluster and high inter-cluster variation. Extensive experiments\non several large-scale re-ID datasets demonstrate the superiority of our method\ncompared to state-of-the-art approaches.\n","authors":["Siddharth Seth","Akash Sonth","Anirban Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2411.04255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04249v1","updated":"2024-11-06T20:42:13Z","published":"2024-11-06T20:42:13Z","title":"PocoLoco: A Point Cloud Diffusion Model of Human Shape in Loose Clothing","summary":" Modeling a human avatar that can plausibly deform to articulations is an\nactive area of research. We present PocoLoco -- the first template-free,\npoint-based, pose-conditioned generative model for 3D humans in loose clothing.\nWe motivate our work by noting that most methods require a parametric model of\nthe human body to ground pose-dependent deformations. Consequently, they are\nrestricted to modeling clothing that is topologically similar to the naked body\nand do not extend well to loose clothing. The few methods that attempt to model\nloose clothing typically require either canonicalization or a\nUV-parameterization and need to address the challenging problem of explicitly\nestimating correspondences for the deforming clothes. In this work, we\nformulate avatar clothing deformation as a conditional point-cloud generation\ntask within the denoising diffusion framework. Crucially, our framework\noperates directly on unordered point clouds, eliminating the need for a\nparametric model or a clothing template. This also enables a variety of\npractical applications, such as point-cloud completion and pose-based editing\n-- important features for virtual human animation. 
As current datasets for\nhuman avatars in loose clothing are far too small for training diffusion\nmodels, we release a dataset of two subjects performing various poses in loose\nclothing with a total of 75K point clouds. By contributing towards tackling the\nchallenging task of effectively modeling loose clothing and expanding the\navailable data for training these models, we aim to set the stage for further\ninnovation in digital humans. The source code is available at\nhttps://github.com/sidsunny/pocoloco .\n","authors":["Siddharth Seth","Rishabh Dabral","Diogo Luvizon","Marc Habermann","Ming-Hsuan Yang","Christian Theobalt","Adam Kortylewski"],"pdf_url":"https://arxiv.org/pdf/2411.04249v1.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2410.22233v2","updated":"2024-11-06T19:52:58Z","published":"2024-10-29T17:01:05Z","title":"ContextIQ: A Multimodal Expert-Based Video Retrieval System for\n Contextual Advertising","summary":" Contextual advertising serves ads that are aligned to the content that the\nuser is viewing. The rapid growth of video content on social platforms and\nstreaming services, along with privacy concerns, has increased the need for\ncontextual advertising. Placing the right ad in the right context creates a\nseamless and pleasant ad viewing experience, resulting in higher audience\nengagement and, ultimately, better ad monetization. From a technology\nstandpoint, effective contextual advertising requires a video retrieval system\ncapable of understanding complex video content at a very granular level.\nCurrent text-to-video retrieval models based on joint multimodal training\ndemand large datasets and computational resources, limiting their practicality\nand lacking the key functionalities required for ad ecosystem integration. We\nintroduce ContextIQ, a multimodal expert-based video retrieval system designed\nspecifically for contextual advertising. ContextIQ utilizes modality-specific\nexperts-video, audio, transcript (captions), and metadata such as objects,\nactions, emotion, etc.-to create semantically rich video representations. We\nshow that our system, without joint training, achieves better or comparable\nresults to state-of-the-art models and commercial solutions on multiple\ntext-to-video retrieval benchmarks. Our ablation studies highlight the benefits\nof leveraging multiple modalities for enhanced video retrieval accuracy instead\nof using a vision-language model alone. Furthermore, we show how video\nretrieval systems such as ContextIQ can be used for contextual advertising in\nan ad ecosystem while also addressing concerns related to brand safety and\nfiltering inappropriate content.\n","authors":["Ashutosh Chaubey","Anoubhav Agarwaal","Sartaki Sinha Roy","Aayush Agrawal","Susmita Ghose"],"pdf_url":"https://arxiv.org/pdf/2410.22233v2.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2411.04224v1","updated":"2024-11-06T19:44:36Z","published":"2024-11-06T19:44:36Z","title":"WiFlexFormer: Efficient WiFi-Based Person-Centric Sensing","summary":" We propose WiFlexFormer, a highly efficient Transformer-based architecture\ndesigned for WiFi Channel State Information (CSI)-based person-centric sensing.\nWe benchmark WiFlexFormer against state-of-the-art vision and specialized\narchitectures for processing radio frequency data and demonstrate that it\nachieves comparable Human Activity Recognition (HAR) performance while offering\na significantly lower parameter count and faster inference times. 
With an\ninference time of just 10 ms on an Nvidia Jetson Orin Nano, WiFlexFormer is\noptimized for real-time inference. Additionally, its low parameter count\ncontributes to improved cross-domain generalization, where it often outperforms\nlarger models. Our comprehensive evaluation shows that WiFlexFormer is a\npotential solution for efficient, scalable WiFi-based sensing applications. The\nPyTorch implementation of WiFlexFormer is publicly available at:\nhttps://github.com/StrohmayerJ/WiFlexFormer.\n","authors":["Julian Strohmayer","Matthias Wödlinger","Martin Kampel"],"pdf_url":"https://arxiv.org/pdf/2411.04224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02537v2","updated":"2024-11-06T19:27:10Z","published":"2024-11-04T19:16:53Z","title":"INQUIRE: A Natural World Text-to-Image Retrieval Benchmark","summary":" We introduce INQUIRE, a text-to-image retrieval benchmark designed to\nchallenge multimodal vision-language models on expert-level queries. INQUIRE\nincludes iNaturalist 2024 (iNat24), a new dataset of five million natural world\nimages, along with 250 expert-level retrieval queries. These queries are paired\nwith all relevant images comprehensively labeled within iNat24, comprising\n33,000 total matches. Queries span categories such as species identification,\ncontext, behavior, and appearance, emphasizing tasks that require nuanced image\nunderstanding and domain expertise. Our benchmark evaluates two core retrieval\ntasks: (1) INQUIRE-Fullrank, a full dataset ranking task, and (2)\nINQUIRE-Rerank, a reranking task for refining top-100 retrievals. Detailed\nevaluation of a range of recent multimodal models demonstrates that INQUIRE\nposes a significant challenge, with the best models failing to achieve an\nmAP@50 above 50%. In addition, we show that reranking with more powerful\nmultimodal models can enhance retrieval performance, yet there remains a\nsignificant margin for improvement. By focusing on scientifically-motivated\necological challenges, INQUIRE aims to bridge the gap between AI capabilities\nand the needs of real-world scientific inquiry, encouraging the development of\nretrieval systems that can assist with accelerating ecological and biodiversity\nresearch. Our dataset and code are available at\nhttps://inquire-benchmark.github.io\n","authors":["Edward Vendrow","Omiros Pantazis","Alexander Shepard","Gabriel Brostow","Kate E. Jones","Oisin Mac Aodha","Sara Beery","Grant Van Horn"],"pdf_url":"https://arxiv.org/pdf/2411.02537v2.pdf","comment":"Published in NeurIPS 2024, Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2411.04125v1","updated":"2024-11-06T18:59:41Z","published":"2024-11-06T18:59:41Z","title":"Community Forensics: Using Thousands of Generators to Train Fake Image\n Detectors","summary":" One of the key challenges of detecting AI-generated images is spotting images\nthat have been created by previously unseen generative models. We argue that\nthe limited diversity of the training data is a major obstacle to addressing\nthis problem, and we propose a new dataset that is significantly larger and\nmore diverse than prior work. As part of creating this dataset, we\nsystematically download thousands of text-to-image latent diffusion models and\nsample images from them. We also collect images from dozens of popular open\nsource and commercial models. The resulting dataset contains 2.7M images that\nhave been sampled from 4803 different models. 
These images collectively capture\na wide range of scene content, generator architectures, and image processing\nsettings. Using this dataset, we study the generalization abilities of fake\nimage detectors. Our experiments suggest that detection performance improves as\nthe number of models in the training set increases, even when these models have\nsimilar architectures. We also find that detection performance improves as the\ndiversity of the models increases, and that our trained detectors generalize\nbetter than those trained on other datasets.\n","authors":["Jeongsoo Park","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2411.04125v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2411.04168v1","updated":"2024-11-06T18:59:17Z","published":"2024-11-06T18:59:17Z","title":"DiMSUM: Diffusion Mamba -- A Scalable and Unified Spatial-Frequency\n Method for Image Generation","summary":" We introduce a novel state-space architecture for diffusion models,\neffectively harnessing spatial and frequency information to enhance the\ninductive bias towards local features in input images for image generation\ntasks. While state-space networks, including Mamba, a revolutionary advancement\nin recurrent neural networks, typically scan input sequences from left to\nright, they face difficulties in designing effective scanning strategies,\nespecially in the processing of image data. Our method demonstrates that\nintegrating wavelet transformation into Mamba enhances the local structure\nawareness of visual inputs and better captures long-range relations of\nfrequencies by disentangling them into wavelet subbands, representing both low-\nand high-frequency components. These wavelet-based outputs are then processed\nand seamlessly fused with the original Mamba outputs through a cross-attention\nfusion layer, combining both spatial and frequency information to optimize the\norder awareness of state-space models which is essential for the details and\noverall quality of image generation. Besides, we introduce a globally-shared\ntransformer to supercharge the performance of Mamba, harnessing its exceptional\npower to capture global relationships. Through extensive experiments on\nstandard benchmarks, our method demonstrates superior results compared to DiT\nand DIFFUSSM, achieving faster training convergence and delivering high-quality\noutputs. The codes and pretrained models are released at\nhttps://github.com/VinAIResearch/DiMSUM.git.\n","authors":["Hao Phung","Quan Dao","Trung Dao","Hoang Phan","Dimitris Metaxas","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2411.04168v1.pdf","comment":"Accepted to NeurIPS 2024. Project page:\n https://hao-pt.github.io/dimsum/"},{"id":"http://arxiv.org/abs/2407.10964v2","updated":"2024-11-06T18:58:03Z","published":"2024-07-15T17:58:42Z","title":"No Train, all Gain: Self-Supervised Gradients Improve Deep Frozen\n Representations","summary":" This paper introduces FUNGI, Features from UNsupervised GradIents, a method\nto enhance the features of transformer encoders by leveraging self-supervised\ngradients. Our method is simple: given any pretrained model, we first compute\ngradients from various self-supervised objectives for each input. These\ngradients are projected to a lower dimension and then concatenated with the\nmodel's output embedding. The resulting features are evaluated on k-nearest\nneighbor classification over 11 datasets from vision, 5 from natural language\nprocessing, and 2 from audio. 
Across backbones spanning various sizes and\npretraining strategies, FUNGI features provide consistent performance\nimprovements over the embeddings. We also show that using FUNGI features can\nbenefit linear classification, clustering and image retrieval, and that they\nsignificantly improve the retrieval-based in-context scene understanding\nabilities of pretrained models, for example improving upon DINO by +17% for\nsemantic segmentation - without any training.\n","authors":["Walter Simoncini","Spyros Gidaris","Andrei Bursuc","Yuki M. Asano"],"pdf_url":"https://arxiv.org/pdf/2407.10964v2.pdf","comment":"NeurIPS 2024. Code available at\n https://github.com/WalterSimoncini/fungivision"},{"id":"http://arxiv.org/abs/2411.04112v1","updated":"2024-11-06T18:44:09Z","published":"2024-11-06T18:44:09Z","title":"Fed-EC: Bandwidth-Efficient Clustering-Based Federated Learning For\n Autonomous Visual Robot Navigation","summary":" Centralized learning requires data to be aggregated at a central server,\nwhich poses significant challenges in terms of data privacy and bandwidth\nconsumption. Federated learning presents a compelling alternative, however,\nvanilla federated learning methods deployed in robotics aim to learn a single\nglobal model across robots that works ideally for all. But in practice one\nmodel may not be well suited for robots deployed in various environments. This\npaper proposes Federated-EmbedCluster (Fed-EC), a clustering-based federated\nlearning framework that is deployed with vision based autonomous robot\nnavigation in diverse outdoor environments. The framework addresses the key\nfederated learning challenge of deteriorating model performance of a single\nglobal model due to the presence of non-IID data across real-world robots.\nExtensive real-world experiments validate that Fed-EC reduces the communication\nsize by 23x for each robot while matching the performance of centralized\nlearning for goal-oriented navigation and outperforms local learning. Fed-EC\ncan transfer previously learnt models to new robots that join the cluster.\n","authors":["Shreya Gummadi","Mateus V. Gasparino","Deepak Vasisht","Girish Chowdhary"],"pdf_url":"https://arxiv.org/pdf/2411.04112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19863v4","updated":"2024-11-06T18:29:38Z","published":"2024-03-28T22:17:19Z","title":"DeNetDM: Debiasing by Network Depth Modulation","summary":" Neural networks trained on biased datasets tend to inadvertently learn\nspurious correlations, hindering generalization. We formally prove that (1)\nsamples that exhibit spurious correlations lie on a lower rank manifold\nrelative to the ones that do not; and (2) the depth of a network acts as an\nimplicit regularizer on the rank of the attribute subspace that is encoded in\nits representations. Leveraging these insights, we present DeNetDM, a novel\ndebiasing method that uses network depth modulation as a way of developing\nrobustness to spurious correlations. Using a training paradigm derived from\nProduct of Experts, we create both biased and debiased branches with deep and\nshallow architectures and then distill knowledge to produce the target debiased\nmodel. Our method requires no bias annotations or explicit data augmentation\nwhile performing on par with approaches that require either or both. We\ndemonstrate that DeNetDM outperforms existing debiasing techniques on both\nsynthetic and real-world datasets by 5\\%. 
The project page is available at\nhttps://vssilpa.github.io/denetdm/.\n","authors":["Silpa Vadakkeeveetil Sreelatha","Adarsh Kappiyath","Abhra Chaudhuri","Anjan Dutta"],"pdf_url":"https://arxiv.org/pdf/2403.19863v4.pdf","comment":"Camera-ready version : NeurIPS 2024, * indicates these authors\n contributed equally"},{"id":"http://arxiv.org/abs/2411.04097v1","updated":"2024-11-06T18:25:00Z","published":"2024-11-06T18:25:00Z","title":"RaVL: Discovering and Mitigating Spurious Correlations in Fine-Tuned\n Vision-Language Models","summary":" Fine-tuned vision-language models (VLMs) often capture spurious correlations\nbetween image features and textual attributes, resulting in degraded zero-shot\nperformance at test time. Existing approaches for addressing spurious\ncorrelations (i) primarily operate at the global image-level rather than\nintervening directly on fine-grained image features and (ii) are predominantly\ndesigned for unimodal settings. In this work, we present RaVL, which takes a\nfine-grained perspective on VLM robustness by discovering and mitigating\nspurious correlations using local image features rather than operating at the\nglobal image level. Given a fine-tuned VLM, RaVL first discovers spurious\ncorrelations by leveraging a region-level clustering approach to identify\nprecise image features contributing to zero-shot classification errors. Then,\nRaVL mitigates the identified spurious correlation with a novel region-aware\nloss function that enables the VLM to focus on relevant regions and ignore\nspurious relationships during fine-tuning. We evaluate RaVL on 654 VLMs with\nvarious model architectures, data domains, and learned spurious correlations.\nOur results show that RaVL accurately discovers (191% improvement over the\nclosest baseline) and mitigates (8.2% improvement on worst-group image\nclassification accuracy) spurious correlations. Qualitative evaluations on\ngeneral-domain and medical-domain VLMs confirm our findings.\n","authors":["Maya Varma","Jean-Benoit Delbrouck","Zhihong Chen","Akshay Chaudhari","Curtis Langlotz"],"pdf_url":"https://arxiv.org/pdf/2411.04097v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.16147v3","updated":"2024-11-06T18:08:23Z","published":"2024-09-23T00:11:30Z","title":"Gaussian Deja-vu: Creating Controllable 3D Gaussian Head-Avatars with\n Enhanced Generalization and Personalization Abilities","summary":" Recent advancements in 3D Gaussian Splatting (3DGS) have unlocked significant\npotential for modeling 3D head avatars, providing greater flexibility than\nmesh-based methods and more efficient rendering compared to NeRF-based\napproaches. Despite these advancements, the creation of controllable 3DGS-based\nhead avatars remains time-intensive, often requiring tens of minutes to hours.\nTo expedite this process, we here introduce the \"Gaussian Deja-vu\" framework,\nwhich first obtains a generalized model of the head avatar and then\npersonalizes the result. The generalized model is trained on large 2D\n(synthetic and real) image datasets. This model provides a well-initialized 3D\nGaussian head that is further refined using a monocular video to achieve the\npersonalized head avatar. For personalizing, we propose learnable\nexpression-aware rectification blendmaps to correct the initial 3D Gaussians,\nensuring rapid convergence without the reliance on neural networks. Experiments\ndemonstrate that the proposed method meets its objectives. 
It outperforms\nstate-of-the-art 3D Gaussian head avatars in terms of photorealistic quality as\nwell as reduces training time consumption to at least a quarter of the existing\nmethods, producing the avatar in minutes.\n","authors":["Peizhi Yan","Rabab Ward","Qiang Tang","Shan Du"],"pdf_url":"https://arxiv.org/pdf/2409.16147v3.pdf","comment":"11 pages, Accepted by WACV 2025 in Round 1"},{"id":"http://arxiv.org/abs/2411.04079v1","updated":"2024-11-06T17:57:43Z","published":"2024-11-06T17:57:43Z","title":"Textual Decomposition Then Sub-motion-space Scattering for\n Open-Vocabulary Motion Generation","summary":" Text-to-motion generation is a crucial task in computer vision, which\ngenerates the target 3D motion by the given text. The existing annotated\ndatasets are limited in scale, resulting in most existing methods overfitting\nto the small datasets and unable to generalize to the motions of the open\ndomain. Some methods attempt to solve the open-vocabulary motion generation\nproblem by aligning to the CLIP space or using the Pretrain-then-Finetuning\nparadigm. However, the current annotated dataset's limited scale only allows\nthem to achieve mapping from sub-text-space to sub-motion-space, instead of\nmapping between full-text-space and full-motion-space (full mapping), which is\nthe key to attaining open-vocabulary motion generation. To this end, this paper\nproposes to leverage the atomic motion (simple body part motions over a short\ntime period) as an intermediate representation, and leverage two orderly\ncoupled steps, i.e., Textual Decomposition and Sub-motion-space Scattering, to\naddress the full mapping problem. For Textual Decomposition, we design a\nfine-grained description conversion algorithm, and combine it with the\ngeneralization ability of a large language model to convert any given motion\ntext into atomic texts. Sub-motion-space Scattering learns the compositional\nprocess from atomic motions to the target motions, to make the learned\nsub-motion-space scattered to form the full-motion-space. For a given motion of\nthe open domain, it transforms the extrapolation into interpolation and thereby\nsignificantly improves generalization. Our network, $DSO$-Net, combines textual\n$d$ecomposition and sub-motion-space $s$cattering to solve the\n$o$pen-vocabulary motion generation. Extensive experiments demonstrate that our\nDSO-Net achieves significant improvements over the state-of-the-art methods on\nopen-vocabulary motion generation. Code is available at\nhttps://vankouf.github.io/DSONet/.\n","authors":["Ke Fan","Jiangning Zhang","Ran Yi","Jingyu Gong","Yabiao Wang","Yating Wang","Xin Tan","Chengjie Wang","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2411.04079v1.pdf","comment":"project page: https://vankouf.github.io/DSONet/"},{"id":"http://arxiv.org/abs/2411.04077v1","updated":"2024-11-06T17:55:37Z","published":"2024-11-06T17:55:37Z","title":"H-POPE: Hierarchical Polling-based Probing Evaluation of Hallucinations\n in Large Vision-Language Models","summary":" By leveraging both texts and images, large vision language models (LVLMs)\nhave shown significant progress in various multi-modal tasks. Nevertheless,\nthese models often suffer from hallucinations, e.g., they exhibit\ninconsistencies between the visual input and the textual output. To address\nthis, we propose H-POPE, a coarse-to-fine-grained benchmark that systematically\nassesses hallucination in object existence and attributes. 
Our evaluation shows\nthat models are prone to hallucinations on object existence, and even more so\non fine-grained attributes. We further investigate whether these models rely on\nvisual input to formulate the output texts.\n","authors":["Nhi Pham","Michael Schott"],"pdf_url":"https://arxiv.org/pdf/2411.04077v1.pdf","comment":"Poster at https://sites.google.com/berkeley.edu/bb-stat/home"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2410.22233v2","updated":"2024-11-06T19:52:58Z","published":"2024-10-29T17:01:05Z","title":"ContextIQ: A Multimodal Expert-Based Video Retrieval System for\n Contextual Advertising","summary":" Contextual advertising serves ads that are aligned to the content that the\nuser is viewing. The rapid growth of video content on social platforms and\nstreaming services, along with privacy concerns, has increased the need for\ncontextual advertising. Placing the right ad in the right context creates a\nseamless and pleasant ad viewing experience, resulting in higher audience\nengagement and, ultimately, better ad monetization. From a technology\nstandpoint, effective contextual advertising requires a video retrieval system\ncapable of understanding complex video content at a very granular level.\nCurrent text-to-video retrieval models based on joint multimodal training\ndemand large datasets and computational resources, limiting their practicality\nand lacking the key functionalities required for ad ecosystem integration. We\nintroduce ContextIQ, a multimodal expert-based video retrieval system designed\nspecifically for contextual advertising. ContextIQ utilizes modality-specific\nexperts-video, audio, transcript (captions), and metadata such as objects,\nactions, emotion, etc.-to create semantically rich video representations. We\nshow that our system, without joint training, achieves better or comparable\nresults to state-of-the-art models and commercial solutions on multiple\ntext-to-video retrieval benchmarks. Our ablation studies highlight the benefits\nof leveraging multiple modalities for enhanced video retrieval accuracy instead\nof using a vision-language model alone. Furthermore, we show how video\nretrieval systems such as ContextIQ can be used for contextual advertising in\nan ad ecosystem while also addressing concerns related to brand safety and\nfiltering inappropriate content.\n","authors":["Ashutosh Chaubey","Anoubhav Agarwaal","Sartaki Sinha Roy","Aayush Agrawal","Susmita Ghose"],"pdf_url":"https://arxiv.org/pdf/2410.22233v2.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2411.04228v1","updated":"2024-11-06T19:50:00Z","published":"2024-11-06T19:50:00Z","title":"dsld: A Socially Relevant Tool for Teaching Statistics","summary":" The growing power of data science can play a crucial role in addressing\nsocial discrimination, necessitating nuanced understanding and effective\nmitigation strategies of potential biases. Data Science Looks At Discrimination\n(dsld) is an R and Python package designed to provide users with a\ncomprehensive toolkit of statistical and graphical methods for assessing\npossible discrimination related to protected groups, such as race, gender, and\nage. Our software offers techniques for discrimination analysis by identifying\nand mitigating confounding variables, along with methods for reducing bias in\npredictive models.\n In educational settings, dsld offers instructors powerful tools to teach\nimportant statistical principles through motivating real world examples of\ndiscrimination analysis. 
The inclusion of an 80-page Quarto book further\nsupports users, from statistics educators to legal professionals, in\neffectively applying these analytical tools to real world scenarios.\n","authors":["Taha Abdullah","Arjun Ashok","Brandon Estrada","Norman Matloff","Aditya Mittal"],"pdf_url":"https://arxiv.org/pdf/2411.04228v1.pdf","comment":"To be submitted to the Journal of Statistics and Data Science\n Education"},{"id":"http://arxiv.org/abs/2411.02537v2","updated":"2024-11-06T19:27:10Z","published":"2024-11-04T19:16:53Z","title":"INQUIRE: A Natural World Text-to-Image Retrieval Benchmark","summary":" We introduce INQUIRE, a text-to-image retrieval benchmark designed to\nchallenge multimodal vision-language models on expert-level queries. INQUIRE\nincludes iNaturalist 2024 (iNat24), a new dataset of five million natural world\nimages, along with 250 expert-level retrieval queries. These queries are paired\nwith all relevant images comprehensively labeled within iNat24, comprising\n33,000 total matches. Queries span categories such as species identification,\ncontext, behavior, and appearance, emphasizing tasks that require nuanced image\nunderstanding and domain expertise. Our benchmark evaluates two core retrieval\ntasks: (1) INQUIRE-Fullrank, a full dataset ranking task, and (2)\nINQUIRE-Rerank, a reranking task for refining top-100 retrievals. Detailed\nevaluation of a range of recent multimodal models demonstrates that INQUIRE\nposes a significant challenge, with the best models failing to achieve an\nmAP@50 above 50%. In addition, we show that reranking with more powerful\nmultimodal models can enhance retrieval performance, yet there remains a\nsignificant margin for improvement. By focusing on scientifically-motivated\necological challenges, INQUIRE aims to bridge the gap between AI capabilities\nand the needs of real-world scientific inquiry, encouraging the development of\nretrieval systems that can assist with accelerating ecological and biodiversity\nresearch. Our dataset and code are available at\nhttps://inquire-benchmark.github.io\n","authors":["Edward Vendrow","Omiros Pantazis","Alexander Shepard","Gabriel Brostow","Kate E. Jones","Oisin Mac Aodha","Sara Beery","Grant Van Horn"],"pdf_url":"https://arxiv.org/pdf/2411.02537v2.pdf","comment":"Published in NeurIPS 2024, Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2411.04051v1","updated":"2024-11-06T16:57:55Z","published":"2024-11-06T16:57:55Z","title":"Reproducible Hybrid Time-Travel Retrieval in Evolving Corpora","summary":" There are settings in which reproducibility of ranked lists is desirable,\nsuch as when extracting a subset of an evolving document corpus for downstream\nresearch tasks or in domains such as patent retrieval or in medical systematic\nreviews, with high reproducibility expectations. However, as global term\nstatistics change when documents change or are added to a corpus, queries using\ntypical ranked retrieval models are not even reproducible for the parts of the\ndocument corpus that have not changed. Thus, Boolean retrieval frequently\nremains the mechanism of choice in such settings.\n We present a hybrid retrieval system combining Lucene for fast retrieval with\na column-store-based retrieval system maintaining a versioned and time-stamped\nindex. The latter component allows re-execution of previously posed queries\nresulting in the same ranked list and further allows for time-travel queries\nover evolving collection, as web archives, while maintaining the original\nranking. 
Thus, retrieval results in evolving document collections are fully\nreproducible even when document collections and thus term statistics change.\n","authors":["Moritz Staudinger","Florina Piroi","Andreas Rauber"],"pdf_url":"https://arxiv.org/pdf/2411.04051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03957v1","updated":"2024-11-06T14:42:39Z","published":"2024-11-06T14:42:39Z","title":"Fine-Grained Guidance for Retrievers: Leveraging LLMs' Feedback in\n Retrieval-Augmented Generation","summary":" Retrieval-Augmented Generation (RAG) has proven to be an effective method for\nmitigating hallucination issues inherent in large language models (LLMs).\nPrevious approaches typically train retrievers based on semantic similarity,\nlacking optimization for RAG. More recent works have proposed aligning\nretrievers with the preference signals of LLMs. However, these preference\nsignals are often difficult for dense retrievers, which typically have weaker\nlanguage capabilities, to understand and learn effectively. Drawing inspiration\nfrom pedagogical theories like Guided Discovery Learning, we propose a novel\nframework, FiGRet (Fine-grained Guidance for Retrievers), which leverages the\nlanguage capabilities of LLMs to construct examples from a more granular,\ninformation-centric perspective to guide the learning of retrievers.\nSpecifically, our method utilizes LLMs to construct easy-to-understand examples\nfrom samples where the retriever performs poorly, focusing on three learning\nobjectives highly relevant to the RAG scenario: relevance, comprehensiveness,\nand purity. These examples serve as scaffolding to ultimately align the\nretriever with the LLM's preferences. Furthermore, we employ a dual curriculum\nlearning strategy and leverage the reciprocal feedback between LLM and\nretriever to further enhance the performance of the RAG system. A series of\nexperiments demonstrate that our proposed framework enhances the performance of\nRAG systems equipped with different retrievers and is applicable to various\nLLMs.\n","authors":["Yuhang Liu","Xueyu Hu","Shengyu Zhang","Jingyuan Chen","Fan Wu","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2411.03957v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.03906v1","updated":"2024-11-06T13:37:28Z","published":"2024-11-06T13:37:28Z","title":"Lexicalization Is All You Need: Examining the Impact of Lexical\n Knowledge in a Compositional QALD System","summary":" In this paper, we examine the impact of lexicalization on Question Answering\nover Linked Data (QALD). It is well known that one of the key challenges in\ninterpreting natural language questions with respect to SPARQL lies in bridging\nthe lexical gap, that is mapping the words in the query to the correct\nvocabulary elements. We argue in this paper that lexicalization, that is\nexplicit knowledge about the potential interpretations of a word with respect\nto the given vocabulary, significantly eases the task and increases the\nperformance of QA systems. Towards this goal, we present a compositional QA\nsystem that can leverage explicit lexical knowledge in a compositional manner\nto infer the meaning of a question in terms of a SPARQL query. We show that\nsuch a system, given lexical knowledge, has a performance well beyond current\nQA systems, achieving up to a $35.8\\%$ increase in the micro $F_1$ score\ncompared to the best QA system on QALD-9. This shows the importance and\npotential of including explicit lexical knowledge. 
In contrast, we show that\nLLMs have limited abilities to exploit lexical knowledge, with only marginal\nimprovements compared to a version without lexical knowledge. This shows that\nLLMs have no ability to compositionally interpret a question on the basis of\nthe meaning of its parts, a key feature of compositional approaches. Taken\ntogether, our work shows new avenues for QALD research, emphasizing the\nimportance of lexicalization and compositionality.\n","authors":["David Maria Schmidt","Mohammad Fazleh Elahi","Philipp Cimiano"],"pdf_url":"https://arxiv.org/pdf/2411.03906v1.pdf","comment":"24th International Conference on Knowledge Engineering and Knowledge\n Management (EKAW 2024), November 26-28, 2024, Amsterdam, The Netherlands"},{"id":"http://arxiv.org/abs/2411.03881v1","updated":"2024-11-06T12:54:27Z","published":"2024-11-06T12:54:27Z","title":"Data Fusion of Synthetic Query Variants With Generative Large Language\n Models","summary":" Considering query variance in information retrieval (IR) experiments is\nbeneficial for retrieval effectiveness. Especially ranking ensembles based on\ndifferent topically related queries retrieve better results than rankings based\non a single query alone. Recently, generative instruction-tuned Large Language\nModels (LLMs) improved on a variety of different tasks in capturing human\nlanguage. To this end, this work explores the feasibility of using synthetic\nquery variants generated by instruction-tuned LLMs in data fusion experiments.\nMore specifically, we introduce a lightweight, unsupervised, and cost-efficient\napproach that exploits principled prompting and data fusion techniques. In our\nexperiments, LLMs produce more effective queries when provided with additional\ncontext information on the topic. Furthermore, our analysis based on four TREC\nnewswire benchmarks shows that data fusion based on synthetic query variants is\nsignificantly better than baselines with single queries and also outperforms\npseudo-relevance feedback methods. We publicly share the code and query\ndatasets with the community as resources for follow-up studies.\n","authors":["Timo Breuer"],"pdf_url":"https://arxiv.org/pdf/2411.03881v1.pdf","comment":"The definitive version of record was published in SIGIR-AP '24"},{"id":"http://arxiv.org/abs/2411.02832v2","updated":"2024-11-06T11:19:42Z","published":"2024-11-05T06:11:17Z","title":"PersianRAG: A Retrieval-Augmented Generation System for Persian Language","summary":" Retrieval augmented generation (RAG) models, which integrate large-scale\npre-trained generative models with external retrieval mechanisms, have shown\nsignificant success in various natural language processing (NLP) tasks.\nHowever, applying RAG models in Persian language as a low-resource language,\nposes distinct challenges. These challenges primarily involve the\npreprocessing, embedding, retrieval, prompt construction, language modeling,\nand response evaluation of the system. In this paper, we address the challenges\ntowards implementing a real-world RAG system for Persian language called\nPersianRAG. We propose novel solutions to overcome these obstacles and evaluate\nour approach using several Persian benchmark datasets. 
Our experimental results\ndemonstrate the capability of the PersianRAG framework to enhance question\nanswering task in Persian.\n","authors":["Hossein Hosseini","Mohammad Sobhan Zare","Amir Hossein Mohammadi","Arefeh Kazemi","Zahra Zojaji","Mohammad Ali Nematbakhsh"],"pdf_url":"https://arxiv.org/pdf/2411.02832v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03039v2","updated":"2024-11-06T09:28:25Z","published":"2024-11-05T12:22:51Z","title":"Self-Compositional Data Augmentation for Scientific Keyphrase Generation","summary":" State-of-the-art models for keyphrase generation require large amounts of\ntraining data to achieve good performance. However, obtaining keyphrase-labeled\ndocuments can be challenging and costly. To address this issue, we present a\nself-compositional data augmentation method. More specifically, we measure the\nrelatedness of training documents based on their shared keyphrases, and combine\nsimilar documents to generate synthetic samples. The advantage of our method\nlies in its ability to create additional training samples that keep domain\ncoherence, without relying on external data or resources. Our results on\nmultiple datasets spanning three different domains, demonstrate that our method\nconsistently improves keyphrase generation. A qualitative analysis of the\ngenerated keyphrases for the Computer Science domain confirms this improvement\ntowards their representativity property.\n","authors":["Mael Houbre","Florian Boudin","Beatrice Daille","Akiko Aizawa"],"pdf_url":"https://arxiv.org/pdf/2411.03039v2.pdf","comment":"Accepted to JCDL 2024. This is the author's version of the work. It\n is posted here for your personal use. Not for redistribution. The definitive\n version was published in the proceedings of the 2024 ACM/IEEE Joint\n Conference on Digital Libraries (JCDL 24)\n https://doi.org/10.1145/3677389.3702504"},{"id":"http://arxiv.org/abs/2411.03701v1","updated":"2024-11-06T06:56:22Z","published":"2024-11-06T06:56:22Z","title":"The Essence of the Essence from the Web:The Metasearch Engine","summary":" The exponential growth of information source on the web and in turn\ncontinuing technological progress of searching the information by using tools\nlike Search Engines gives rise to many problems for the user to know which tool\nis best for their query and which tool is not. At this time Metasearch Engine\ncomes into play by reducing the user burden by dispatching queries to multiple\nsearch engines in parallel and refining the results of these search engines to\ngive the best out of best by doing superior job on their side. These engines do\nnot own a database of Web pages rather they send search terms to the databases\nmaintained by the search engine companies, get back results from all the search\nengines queried and then compile the results to be presented to the user. 
In\nthis paper, we describe the working of a typical metasearch engine and then\npresent a comparative study of traditional search engines and metasearch\nengines on the basis of different parameters and show how metasearch engines\nare better than the other search engines.\n","authors":["Rajender Nath","Satinder Bal"],"pdf_url":"https://arxiv.org/pdf/2411.03701v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2401.11505v2","updated":"2024-11-06T04:11:14Z","published":"2024-01-21T14:30:20Z","title":"CheX-GPT: Harnessing Large Language Models for Enhanced Chest X-ray\n Report Labeling","summary":" Free-text radiology reports present a rich data source for various medical\ntasks, but effectively labeling these texts remains challenging. Traditional\nrule-based labeling methods fall short of capturing the nuances of diverse\nfree-text patterns. Moreover, models using expert-annotated data are limited by\ndata scarcity and pre-defined classes, impacting their performance, flexibility\nand scalability. To address these issues, our study offers three main\ncontributions: 1) We demonstrate the potential of GPT as an adept labeler using\ncarefully designed prompts. 2) Utilizing only the data labeled by GPT, we\ntrained a BERT-based labeler, CheX-GPT, which operates faster and more\nefficiently than its GPT counterpart. 3) To benchmark labeler performance, we\nintroduced a publicly available expert-annotated test set, MIMIC-500,\ncomprising 500 cases from the MIMIC validation set. Our findings demonstrate\nthat CheX-GPT not only excels in labeling accuracy over existing models, but\nalso showcases superior efficiency, flexibility, and scalability, supported by\nour introduction of the MIMIC-500 dataset for robust benchmarking. Code and\nmodels are available at https://github.com/Soombit-ai/CheXGPT.\n","authors":["Jawook Gu","Kihyun You","Han-Cheol Cho","Jiho Kim","Eun Kyoung Hong","Byungseok Roh"],"pdf_url":"https://arxiv.org/pdf/2401.11505v2.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.03624v1","updated":"2024-11-06T02:45:16Z","published":"2024-11-06T02:45:16Z","title":"SEGMN: A Structure-Enhanced Graph Matching Network for Graph Similarity\n Learning","summary":" Graph similarity computation (GSC) aims to quantify the similarity score\nbetween two graphs. Although recent GSC methods based on graph neural networks\n(GNNs) take advantage of intra-graph structures in message passing, few of them\nfully utilize the structures presented by edges to boost the representation of\ntheir connected nodes. Moreover, previous cross-graph node embedding matching\nlacks the perception of the overall structure of the graph pair, due to the\nfact that the node representations from GNNs are confined to the intra-graph\nstructure, causing the unreasonable similarity score. Intuitively, the\ncross-graph structure represented in the assignment graph is helpful to rectify\nthe inappropriate matching. Therefore, we propose a structure-enhanced graph\nmatching network (SEGMN). Equipped with a dual embedding learning module and a\nstructure perception matching module, SEGMN achieves structure enhancement in\nboth embedding learning and cross-graph matching. The dual embedding learning\nmodule incorporates adjacent edge representation into each node to achieve a\nstructure-enhanced representation. The structure perception matching module\nachieves cross-graph structure enhancement through assignment graph\nconvolution. 
The similarity score of each cross-graph node pair can be\nrectified by aggregating messages from structurally relevant node pairs.\nExperimental results on benchmark datasets demonstrate that SEGMN outperforms\nthe state-of-the-art GSC methods in the GED regression task, and the structure\nperception matching module is plug-and-play, which can further improve the\nperformance of the baselines by up to 25%.\n","authors":["Wenjun Wang","Jiacheng Lu","Kejia Chen","Zheng Liu","Shilong Sang"],"pdf_url":"https://arxiv.org/pdf/2411.03624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02937v2","updated":"2024-11-06T02:36:02Z","published":"2024-08-06T03:44:06Z","title":"A Real-Time Adaptive Multi-Stream GPU System for Online Approximate\n Nearest Neighborhood Search","summary":" In recent years, Approximate Nearest Neighbor Search (ANNS) has played a\npivotal role in modern search and recommendation systems, especially in\nemerging LLM applications like Retrieval-Augmented Generation. There is a\ngrowing exploration into harnessing the parallel computing capabilities of GPUs\nto meet the substantial demands of ANNS. However, existing systems primarily\nfocus on offline scenarios, overlooking the distinct requirements of online\napplications that necessitate real-time insertion of new vectors. This\nlimitation renders such systems inefficient for real-world scenarios. Moreover,\nprevious architectures struggled to effectively support real-time insertion due\nto their reliance on serial execution streams. In this paper, we introduce a\nnovel Real-Time Adaptive Multi-Stream GPU ANNS System (RTAMS-GANNS). Our\narchitecture achieves its objectives through three key advancements: 1) We\ninitially examined the real-time insertion mechanisms in existing GPU ANNS\nsystems and discovered their reliance on repetitive copying and memory\nallocation, which significantly hinders real-time effectiveness on GPUs. As a\nsolution, we introduce a dynamic vector insertion algorithm based on memory\nblocks, which includes in-place rearrangement. 2) To enable real-time vector\ninsertion in parallel, we introduce a multi-stream parallel execution mode,\nwhich differs from existing systems that operate serially within a single\nstream. Our system utilizes a dynamic resource pool, allowing multiple streams\nto execute concurrently without additional execution blocking. 3) Through\nextensive experiments and comparisons, our approach effectively handles varying\nQPS levels across different datasets, reducing latency by up to 40%-80%. The\nproposed system has also been deployed in real-world industrial search and\nrecommendation systems, serving hundreds of millions of users daily, and has\nachieved good results.\n","authors":["Yiping Sun","Yang Shi","Jiaolong Du"],"pdf_url":"https://arxiv.org/pdf/2408.02937v2.pdf","comment":"Accepted by CIKM'24, V2 fixes some typos"},{"id":"http://arxiv.org/abs/2408.09380v3","updated":"2024-11-06T02:26:07Z","published":"2024-08-18T06:41:46Z","title":"ELASTIC: Efficient Linear Attention for Sequential Interest Compression","summary":" State-of-the-art sequential recommendation models heavily rely on\ntransformer's attention mechanism. However, the quadratic computational and\nmemory complexities of self attention have limited its scalability for modeling\nusers' long range behaviour sequences. 
To address this problem, we propose\nELASTIC, an Efficient Linear Attention for SequenTial Interest Compression,\nrequiring only linear time complexity and decoupling model capacity from\ncomputational cost. Specifically, ELASTIC introduces a fixed length interest\nexperts with linear dispatcher attention mechanism which compresses the\nlong-term behaviour sequences to a significantly more compact representation\nwhich reduces up to 90% GPU memory usage with x2.7 inference speed up. The\nproposed linear dispatcher attention mechanism significantly reduces the\nquadratic complexity and makes the model feasible for adequately modeling\nextremely long sequences. Moreover, in order to retain the capacity for\nmodeling various user interests, ELASTIC initializes a vast learnable interest\nmemory bank and sparsely retrieves compressed user's interests from the memory\nwith a negligible computational overhead. The proposed interest memory\nretrieval technique significantly expands the cardinality of available interest\nspace while keeping the same computational cost, thereby striking a trade-off\nbetween recommendation accuracy and efficiency. To validate the effectiveness\nof our proposed ELASTIC, we conduct extensive experiments on various public\ndatasets and compare it with several strong sequential recommenders.\nExperimental results demonstrate that ELASTIC consistently outperforms\nbaselines by a significant margin and also highlight the computational\nefficiency of ELASTIC when modeling long sequences. We will make our\nimplementation code publicly available.\n","authors":["Jiaxin Deng","Shiyao Wang","Song Lu","Yinfeng Li","Xinchen Luo","Yuanjun Liu","Peixing Xu","Guorui Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.09380v3.pdf","comment":"We hereby withdraw this paper from arXiv due to incomplete\n experiments. Upon further review, we have determined that additional\n experimental work is necessary to fully validate our findings and conclusions"},{"id":"http://arxiv.org/abs/2411.03572v1","updated":"2024-11-06T00:23:55Z","published":"2024-11-06T00:23:55Z","title":"Advanced RAG Models with Graph Structures: Optimizing Complex Knowledge\n Reasoning and Text Generation","summary":" This study aims to optimize the existing retrieval-augmented generation model\n(RAG) by introducing a graph structure to improve the performance of the model\nin dealing with complex knowledge reasoning tasks. The traditional RAG model\nhas the problem of insufficient processing efficiency when facing complex graph\nstructure information (such as knowledge graphs, hierarchical relationships,\netc.), which affects the quality and consistency of the generated results. This\nstudy proposes a scheme to process graph structure data by combining graph\nneural network (GNN), so that the model can capture the complex relationship\nbetween entities, thereby improving the knowledge consistency and reasoning\nability of the generated text. The experiment used the Natural Questions (NQ)\ndataset and compared it with multiple existing generation models. The results\nshow that the graph-based RAG model proposed in this paper is superior to the\ntraditional generation model in terms of quality, knowledge consistency, and\nreasoning ability, especially when dealing with tasks that require\nmulti-dimensional reasoning. 
Through the combination of the enhancement of the\nretrieval module and the graph neural network, the model in this study can\nbetter handle complex knowledge background information and has broad potential\nvalue in multiple practical application scenarios.\n","authors":["Yuxin Dong","Shuo Wang","Hongye Zheng","Jiajing Chen","Zhenhong Zhang","Chihang Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03572v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2304.04901v2","updated":"2024-11-06T23:32:27Z","published":"2023-04-11T00:17:28Z","title":"Efficiently Collecting Training Dataset for 2D Object Detection by\n Online Visual Feedback","summary":" Training deep-learning-based vision systems require the manual annotation of\na significant number of images. Such manual annotation is highly time-consuming\nand labor-intensive. Although previous studies have attempted to eliminate the\neffort required for annotation, the effort required for image collection was\nretained. To address this, we propose a human-in-the-loop dataset collection\nmethod that uses a web application. To counterbalance the workload and\nperformance by encouraging the collection of multi-view object image datasets\nin an enjoyable manner, thereby amplifying motivation, we propose three types\nof online visual feedback features to track the progress of the collection\nstatus. Our experiments thoroughly investigated the impact of each feature on\ncollection performance and quality of operation. The results suggested the\nfeasibility of annotation and object detection.\n","authors":["Takuya Kiyokawa","Naoki Shirakura","Hiroki Katayama","Keita Tomochika","Jun Takamatsu"],"pdf_url":"https://arxiv.org/pdf/2304.04901v2.pdf","comment":"13 pages, 14 figures"},{"id":"http://arxiv.org/abs/2401.03115v2","updated":"2024-11-06T22:09:09Z","published":"2024-01-06T03:03:28Z","title":"Transferable Learned Image Compression-Resistant Adversarial\n Perturbations","summary":" Adversarial attacks can readily disrupt the image classification system,\nrevealing the vulnerability of DNN-based recognition tasks. While existing\nadversarial perturbations are primarily applied to uncompressed images or\ncompressed images by the traditional image compression method, i.e., JPEG,\nlimited studies have investigated the robustness of models for image\nclassification in the context of DNN-based image compression. With the rapid\nevolution of advanced image compression, DNN-based learned image compression\nhas emerged as the promising approach for transmitting images in many\nsecurity-critical applications, such as cloud-based face recognition and\nautonomous driving, due to its superior performance over traditional\ncompression. Therefore, there is a pressing need to fully investigate the\nrobustness of a classification system post-processed by learned image\ncompression. To bridge this research gap, we explore the adversarial attack on\na new pipeline that targets image classification models that utilize learned\nimage compressors as pre-processing modules. Furthermore, to enhance the\ntransferability of perturbations across various quality levels and\narchitectures of learned image compression models, we introduce a saliency\nscore-based sampling method to enable the fast generation of transferable\nperturbation. 
Extensive experiments with popular attack methods demonstrate the\nenhanced transferability of our proposed method when attacking images that have\nbeen post-processed with different learned image compression models.\n","authors":["Yang Sui","Zhuohang Li","Ding Ding","Xiang Pan","Xiaozhong Xu","Shan Liu","Zhenzhong Chen"],"pdf_url":"https://arxiv.org/pdf/2401.03115v2.pdf","comment":"Accepted by BMVC 2024"},{"id":"http://arxiv.org/abs/2407.14093v2","updated":"2024-11-06T16:45:17Z","published":"2024-07-19T07:57:48Z","title":"Routing Experts: Learning to Route Dynamic Experts in Multi-modal Large\n Language Models","summary":" Recently, mixture of experts (MoE) has become a popular paradigm for\nachieving the trade-off between modal capacity and efficiency of multi-modal\nlarge language models (MLLMs). Different from previous efforts, we are\ndedicated to exploring the dynamic expert path in an already exist MLLM and\nshow that a standard MLLM can be also a mixture of experts. To approach this\ntarget, we propose a novel dynamic expert scheme for MLLMs, termed Routing\nExperts (RoE), which can achieve example-dependent optimal path routing without\nobvious structure tweaks. Meanwhile, a new regularization of structure sparsity\nis also introduced to enforce MLLMs to learn more short-cut inference, ensuring\nthe efficiency. In addition, we also realize the first attempt of aligning the\ntraining and inference schemes of MLLMs in terms of network routing. To\nvalidate RoE, we apply it to a set of latest MLLMs, including LLaVA-1.5,\nLLaVA-HR and VILA, and conduct extensive experiments on a bunch of VL\nbenchmarks. The experiment results not only show the great advantages of our\nRoE in improving MLLMs' efficiency, but also yield obvious advantages than\nMoE-LLaVA in both performance and speed, e.g., an average performance gain of\n3.3% on 5 benchmarks while being faster.\n","authors":["Qiong Wu","Zhaoxi Ke","Yiyi Zhou","Gen Luo","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2407.14093v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03948v1","updated":"2024-11-06T14:29:49Z","published":"2024-11-06T14:29:49Z","title":"Long-Form Text-to-Music Generation with Adaptive Prompts: A Case of\n Study in Tabletop Role-Playing Games Soundtracks","summary":" This paper investigates the capabilities of text-to-audio music generation\nmodels in producing long-form music with prompts that change over time,\nfocusing on soundtrack generation for Tabletop Role-Playing Games (TRPGs). We\nintroduce Babel Bardo, a system that uses Large Language Models (LLMs) to\ntransform speech transcriptions into music descriptions for controlling a\ntext-to-music model. Four versions of Babel Bardo were compared in two TRPG\ncampaigns: a baseline using direct speech transcriptions, and three LLM-based\nversions with varying approaches to music description generation. Evaluations\nconsidered audio quality, story alignment, and transition smoothness. Results\nindicate that detailed music descriptions improve audio quality while\nmaintaining consistency across consecutive descriptions enhances story\nalignment and transition smoothness.\n","authors":["Felipe Marra","Lucas N. 
Ferreira"],"pdf_url":"https://arxiv.org/pdf/2411.03948v1.pdf","comment":"Paper accepted at the LAMIR 2024 workshop"},{"id":"http://arxiv.org/abs/2411.03921v1","updated":"2024-11-06T13:52:49Z","published":"2024-11-06T13:52:49Z","title":"Inter-Frame Coding for Dynamic Meshes via Coarse-to-Fine Anchor Mesh\n Generation","summary":" In the current Video-based Dynamic Mesh Coding (V-DMC) standard, inter-frame\ncoding is restricted to mesh frames with constant topology. Consequently,\ntemporal redundancy is not fully leveraged, resulting in suboptimal compression\nefficacy. To address this limitation, this paper introduces a novel\ncoarse-to-fine scheme to generate anchor meshes for frames with time-varying\ntopology. Initially, we generate a coarse anchor mesh using an octree-based\nnearest neighbor search. Motion estimation compensates for regions with\nsignificant motion changes during this process. However, the quality of the\ncoarse mesh is low due to its suboptimal vertices. To enhance details, the fine\nanchor mesh is further optimized using the Quadric Error Metrics (QEM)\nalgorithm to calculate more precise anchor points. The inter-frame anchor mesh\ngenerated herein retains the connectivity of the reference base mesh, while\nconcurrently preserving superior quality. Experimental results show that our\nmethod achieves 7.2% ~ 10.3% BD-rate gain compared to the existing V-DMC test\nmodel version 7.\n","authors":["He Huang","Lizhi Hou","Qi Yang","Yiling Xu"],"pdf_url":"https://arxiv.org/pdf/2411.03921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03823v1","updated":"2024-11-06T10:44:15Z","published":"2024-11-06T10:44:15Z","title":"Both Text and Images Leaked! A Systematic Analysis of Multimodal LLM\n Data Contamination","summary":" The rapid progression of multimodal large language models (MLLMs) has\ndemonstrated superior performance on various multimodal benchmarks. However,\nthe issue of data contamination during training creates challenges in\nperformance evaluation and comparison. While numerous methods exist for\ndetecting dataset contamination in large language models (LLMs), they are less\neffective for MLLMs due to their various modalities and multiple training\nphases. In this study, we introduce a multimodal data contamination detection\nframework, MM-Detect, designed for MLLMs. Our experimental results indicate\nthat MM-Detect is sensitive to varying degrees of contamination and can\nhighlight significant performance improvements due to leakage of the training\nset of multimodal benchmarks. Furthermore, We also explore the possibility of\ncontamination originating from the pre-training phase of LLMs used by MLLMs and\nthe fine-tuning phase of MLLMs, offering new insights into the stages at which\ncontamination may be introduced.\n","authors":["Dingjie Song","Sicheng Lai","Shunian Chen","Lichao Sun","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18680v3","updated":"2024-11-06T10:27:05Z","published":"2024-09-27T12:06:53Z","title":"Beyond Single-Audio: Advancing Multi-Audio Processing in Audio Large\n Language Models","summary":" Various audio-LLMs (ALLMs) have been explored recently for tackling different\naudio tasks simultaneously using a single, unified model. While existing\nevaluations of ALLMs primarily focus on single-audio tasks, real-world\napplications often involve processing multiple audio streams simultaneously. 
To\nbridge this gap, we propose the first multi-audio evaluation (MAE) benchmark\nthat consists of 20 datasets from 11 multi-audio tasks encompassing both speech\nand sound scenarios. Comprehensive experiments on MAE demonstrate that the\nexisting ALLMs, while being powerful in comprehending primary audio elements in\nindividual audio inputs, struggling to handle multi-audio scenarios. To this\nend, we propose a novel multi-audio-LLM (MALLM) to capture audio context among\nmultiple similar audios using discriminative learning on our proposed synthetic\ndata. The results demonstrate that the proposed MALLM outperforms all baselines\nand achieves high data efficiency using synthetic data without requiring human\nannotations. The proposed MALLM opens the door for ALLMs towards multi-audio\nprocessing era and brings us closer to replicating human auditory capabilities\nin machines.\n","authors":["Yiming Chen","Xianghu Yue","Xiaoxue Gao","Chen Zhang","Luis Fernando D'Haro","Robby T. Tan","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2409.18680v3.pdf","comment":"EMNLP24 Findings. Data available at\n https://github.com/MatthewCYM/MALLM"},{"id":"http://arxiv.org/abs/2411.03595v1","updated":"2024-11-06T01:14:42Z","published":"2024-11-06T01:14:42Z","title":"Investigating Conceptual Blending of a Diffusion Model for Improving\n Nonword-to-Image Generation","summary":" Text-to-image diffusion models sometimes depict blended concepts in the\ngenerated images. One promising use case of this effect would be the\nnonword-to-image generation task which attempts to generate images intuitively\nimaginable from a non-existing word (nonword). To realize nonword-to-image\ngeneration, an existing study focused on associating nonwords with\nsimilar-sounding words. Since each nonword can have multiple similar-sounding\nwords, generating images containing their blended concepts would increase\nintuitiveness, facilitating creative activities and promoting computational\npsycholinguistics. Nevertheless, no existing study has quantitatively evaluated\nthis effect in either diffusion models or the nonword-to-image generation\nparadigm. Therefore, this paper first analyzes the conceptual blending in a\npretrained diffusion model, Stable Diffusion. The analysis reveals that a high\npercentage of generated images depict blended concepts when inputting an\nembedding interpolating between the text embeddings of two text prompts\nreferring to different concepts. Next, this paper explores the best text\nembedding space conversion method of an existing nonword-to-image generation\nframework to ensure both the occurrence of conceptual blending and image\ngeneration quality. We compare the conventional direct prediction approach with\nthe proposed method that combines $k$-nearest neighbor search and linear\nregression. Evaluation reveals that the enhanced accuracy of the embedding\nspace conversion by the proposed method improves the image generation quality,\nwhile the emergence of conceptual blending could be attributed mainly to the\nspecific dimensions of the high-dimensional text embedding space.\n","authors":["Chihaya Matsuhira","Marc A. 
Kastner","Takahiro Komamizu","Takatsugu Hirayama","Ichiro Ide"],"pdf_url":"https://arxiv.org/pdf/2411.03595v1.pdf","comment":"Paper accepted at ACM MM 2024 (doi: 10.1145/3664647.3681202) with\n supplementary materials concatenated"},{"id":"http://arxiv.org/abs/2410.21169v3","updated":"2024-11-06T00:11:08Z","published":"2024-10-28T16:11:35Z","title":"Document Parsing Unveiled: Techniques, Challenges, and Prospects for\n Structured Information Extraction","summary":" Document parsing is essential for converting unstructured and semi-structured\ndocuments-such as contracts, academic papers, and invoices-into structured,\nmachine-readable data. Document parsing extract reliable structured data from\nunstructured inputs, providing huge convenience for numerous applications.\nEspecially with recent achievements in Large Language Models, document parsing\nplays an indispensable role in both knowledge base construction and training\ndata generation. This survey presents a comprehensive review of the current\nstate of document parsing, covering key methodologies, from modular pipeline\nsystems to end-to-end models driven by large vision-language models. Core\ncomponents such as layout detection, content extraction (including text,\ntables, and mathematical expressions), and multi-modal data integration are\nexamined in detail. Additionally, this paper discusses the challenges faced by\nmodular document parsing systems and vision-language models in handling complex\nlayouts, integrating multiple modules, and recognizing high-density text. It\nemphasizes the importance of developing larger and more diverse datasets and\noutlines future research directions.\n","authors":["Qintong Zhang","Victor Shea-Jay Huang","Bin Wang","Junyuan Zhang","Zhengren Wang","Hao Liang","Shawn Wang","Matthieu Lin","Conghui He","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.21169v3.pdf","comment":null}]},"2024-11-05T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2411.03484v1","updated":"2024-11-05T20:08:23Z","published":"2024-11-05T20:08:23Z","title":"Automated, LLM enabled extraction of synthesis details for reticular\n materials from scientific literature","summary":" Automated knowledge extraction from scientific literature can potentially\naccelerate materials discovery. We have investigated an approach for extracting\nsynthesis protocols for reticular materials from scientific literature using\nlarge language models (LLMs). To that end, we introduce a Knowledge Extraction\nPipeline (KEP) that automatizes LLM-assisted paragraph classification and\ninformation extraction. By applying prompt engineering with in-context learning\n(ICL) to a set of open-source LLMs, we demonstrate that LLMs can retrieve\nchemical information from PDF documents, without the need for fine-tuning or\ntraining and at a reduced risk of hallucination. By comparing the performance\nof five open-source families of LLMs in both paragraph classification and\ninformation extraction tasks, we observe excellent model performance even if\nonly few example paragraphs are included in the ICL prompts. The results show\nthe potential of the KEP approach for reducing human annotations and data\ncuration efforts in automated scientific knowledge extraction.\n","authors":["Viviane Torres da Silva","Alexandre Rademaker","Krystelle Lionti","Ronaldo Giro","Geisa Lima","Sandro Fiorini","Marcelo Archanjo","Breno W. 
Carvalho","Rodrigo Neumann","Anaximandro Souza","João Pedro Souza","Gabriela de Valnisio","Carmen Nilda Paz","Renato Cerqueira","Mathias Steiner"],"pdf_url":"https://arxiv.org/pdf/2411.03484v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2406.04331v2","updated":"2024-11-05T15:43:18Z","published":"2024-06-06T17:59:10Z","title":"PaCE: Parsimonious Concept Engineering for Large Language Models","summary":" Large Language Models (LLMs) are being used for a wide variety of tasks.\nWhile they are capable of generating human-like responses, they can also\nproduce undesirable output including potentially harmful information, racist or\nsexist language, and hallucinations. Alignment methods are designed to reduce\nsuch undesirable outputs via techniques such as fine-tuning, prompt\nengineering, and representation engineering. However, existing methods face\nseveral challenges: some require costly fine-tuning for every alignment task;\nsome do not adequately remove undesirable concepts, failing alignment; some\nremove benign concepts, lowering the linguistic capabilities of LLMs. To\naddress these issues, we propose Parsimonious Concept Engineering (PaCE), a\nnovel activation engineering framework for alignment. First, to sufficiently\nmodel the concepts, we construct a large-scale concept dictionary in the\nactivation space, in which each atom corresponds to a semantic concept. Given\nany alignment task, we instruct a concept partitioner to efficiently annotate\nthe concepts as benign or undesirable. Then, at inference time, we decompose\nthe LLM activations along the concept dictionary via sparse coding, to\naccurately represent the activations as linear combinations of benign and\nundesirable components. By removing the latter ones from the activations, we\nreorient the behavior of the LLM towards the alignment goal. We conduct\nexperiments on tasks such as response detoxification, faithfulness enhancement,\nand sentiment revising, and show that PaCE achieves state-of-the-art alignment\nperformance while maintaining linguistic capabilities.\n","authors":["Jinqi Luo","Tianjiao Ding","Kwan Ho Ryan Chan","Darshan Thaker","Aditya Chattopadhyay","Chris Callison-Burch","René Vidal"],"pdf_url":"https://arxiv.org/pdf/2406.04331v2.pdf","comment":"Accepted in NeurIPS 2024. GitHub repository at\n https://github.com/peterljq/Parsimonious-Concept-Engineering"},{"id":"http://arxiv.org/abs/2411.03143v1","updated":"2024-11-05T14:33:50Z","published":"2024-11-05T14:33:50Z","title":"Self-supervised Hierarchical Representation for Medication\n Recommendation","summary":" Medication recommender is to suggest appropriate medication combinations\nbased on a patient's health history, e.g., diagnoses and procedures. Existing\nworks represent different diagnoses/procedures well separated by one-hot\nencodings. However, they ignore the latent hierarchical structures of these\nmedical terms, undermining the generalization performance of the model. For\nexample, \"Respiratory Diseases\", \"Chronic Respiratory Diseases\" and \"Chronic\nBronchiti\" have a hierarchical relationship, progressing from general to\nspecific. To address this issue, we propose a novel hierarchical encoder named\nHIER to hierarchically represent diagnoses and procedures, which is based on\nstandard medical codes and compatible with any existing methods. Specifically,\nthe proposed method learns relation embedding with a self-supervised objective\nfor incorporating the neighbor hierarchical structure. 
Additionally, we develop\nthe position encoding to explicitly introduce global hierarchical position.\nExtensive experiments demonstrate significant and consistent improvements in\nrecommendation accuracy across four baselines and two real-world clinical\ndatasets.\n","authors":["Yuliang Liang","Yuting Liu","Yizhou Dang","Enneng Yang","Guibing Guo","Wei Cai","Jianzhe Zhao","Xingwei Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20598v2","updated":"2024-11-05T14:15:03Z","published":"2024-10-27T21:12:12Z","title":"R^3AG: First Workshop on Refined and Reliable Retrieval Augmented\n Generation","summary":" Retrieval-augmented generation (RAG) has gained wide attention as the key\ncomponent to improve generative models with external knowledge augmentation\nfrom information retrieval. It has shown great prominence in enhancing the\nfunctionality and performance of large language model (LLM)-based applications.\nHowever, with the comprehensive application of RAG, more and more problems and\nlimitations have been identified, thus urgently requiring further fundamental\nexploration to improve current RAG frameworks. This workshop aims to explore in\ndepth how to conduct refined and reliable RAG for downstream AI tasks.\n To this end, we propose to organize the first R3AG workshop at SIGIR-AP 2024\nto call for participants to re-examine and formulate the basic principles and\npractical implementation of refined and reliable RAG. The workshop serves as a\nplatform for both academia and industry researchers to conduct discussions,\nshare insights, and foster research to build the next generation of RAG\nsystems. Participants will engage in discussions and presentations focusing on\nfundamental challenges, cutting-edge research, and potential pathways to\nimprove RAG. At the end of the workshop, we aim to have a clearer understanding\nof how to improve the reliability and applicability of RAG with more robust\ninformation retrieval and language generation.\n","authors":["Zihan Wang","Xuri Ge","Joemon M. Jose","Haitao Yu","Weizhi Ma","Zhaochun Ren","Xin Xin"],"pdf_url":"https://arxiv.org/pdf/2410.20598v2.pdf","comment":"R^3AG workshop overview at SIGIR-AP 2024"},{"id":"http://arxiv.org/abs/2309.14984v2","updated":"2024-11-05T12:09:18Z","published":"2023-09-26T14:56:56Z","title":"Facilitating Interdisciplinary Knowledge Transfer with Research Paper\n Recommender Systems","summary":" In the extensive recommender systems literature, novelty and diversity have\nbeen identified as key properties of useful recommendations. However, these\nproperties have received limited attention in the specific sub-field of\nresearch paper recommender systems. In this work, we argue for the importance\nof offering novel and diverse research paper recommendations to scientists.\nThis approach aims to reduce siloed reading, break down filter bubbles, and\npromote interdisciplinary research. We propose a novel framework for evaluating\nthe novelty and diversity of research paper recommendations that leverages\nmethods from network analysis and natural language processing. Using this\nframework, we show that the choice of representational method within a larger\nresearch paper recommendation system can have a measurable impact on the nature\nof downstream recommendations, specifically on their novelty and diversity. 
We\nhighlight a novel paper embedding method, which we demonstrate offers more\ninnovative and diverse recommendations without sacrificing precision, compared\nto other state-of-the-art baselines.\n","authors":["Eoghan Cunningham","Derek Greene","Barry Smyth"],"pdf_url":"https://arxiv.org/pdf/2309.14984v2.pdf","comment":"Under Review at QSS"},{"id":"http://arxiv.org/abs/2404.00243v2","updated":"2024-11-05T11:46:14Z","published":"2024-03-30T04:39:18Z","title":"DSFNet: Learning Disentangled Scenario Factorization for Multi-Scenario\n Route Ranking","summary":" Multi-scenario route ranking (MSRR) is crucial in many industrial mapping\nsystems. However, the industrial community mainly adopts interactive interfaces\nto encourage users to select pre-defined scenarios, which may hinder the\ndownstream ranking performance. In addition, in the academic community, the\nmulti-scenario ranking works only come from other fields, and there are no\nworks specifically focusing on route data due to lacking a publicly available\nMSRR dataset. Moreover, all the existing multi-scenario works still fail to\naddress the three specific challenges of MSRR simultaneously, i.e. explosion of\nscenario number, high entanglement, and high-capacity demand. Different from\nthe prior, to address MSRR, our key idea is to factorize the complicated\nscenario in route ranking into several disentangled factor scenario patterns.\nAccordingly, we propose a novel method, Disentangled Scenario Factorization\nNetwork (DSFNet), which flexibly composes scenario-dependent parameters based\non a high-capacity multi-factor-scenario-branch structure. Then, a novel\nregularization is proposed to induce the disentanglement of factor scenarios.\nFurthermore, two extra novel techniques, i.e. scenario-aware batch\nnormalization and scenario-aware feature filtering, are developed to improve\nthe network awareness of scenario representation. Additionally, to facilitate\nMSRR research in the academic community, we propose MSDR, the first large-scale\npublicly available annotated industrial Multi-Scenario Driving Route dataset.\nComprehensive experimental results demonstrate the superiority of our DSFNet,\nwhich has been successfully deployed in AMap to serve the major online traffic.\n","authors":["Jiahao Yu","Yihai Duan","Longfei Xu","Chao Chen","Shuliang Liu","Kaikui Liu","Fan Yang","Xiangxiang Chu","Ning Guo"],"pdf_url":"https://arxiv.org/pdf/2404.00243v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02992v1","updated":"2024-11-05T10:53:25Z","published":"2024-11-05T10:53:25Z","title":"Efficient and Effective Adaptation of Multimodal Foundation Models in\n Sequential Recommendation","summary":" Multimodal foundation models (MFMs) have revolutionized sequential\nrecommender systems through advanced representation learning. While\nParameter-efficient Fine-tuning (PEFT) is commonly used to adapt these models,\nstudies often prioritize parameter efficiency, neglecting GPU memory and\ntraining speed. To address this, we introduced the IISAN framework,\nsignificantly enhancing efficiency. However, IISAN was limited to symmetrical\nMFMs and identical text and image encoders, preventing the use of\nstate-of-the-art Large Language Models. To overcome this, we developed\nIISAN-Versa, a versatile plug-and-play architecture compatible with both\nsymmetrical and asymmetrical MFMs. IISAN-Versa employs a Decoupled PEFT\nstructure and utilizes both intra- and inter-modal adaptation. 
It effectively\nhandles asymmetry through a simple yet effective combination of group\nlayer-dropping and dimension transformation alignment. Our research\ndemonstrates that IISAN-Versa effectively adapts large text encoders, and we\nfurther identify a scaling effect where larger encoders generally perform\nbetter. IISAN-Versa also demonstrates strong versatility in our defined\nmultimodal scenarios, which include raw titles and captions generated from\nimages and videos. Additionally, IISAN-Versa achieved state-of-the-art\nperformance on the Microlens public benchmark. We will release our code and\ndatasets to support future research.\n","authors":["Junchen Fu","Xuri Ge","Xin Xin","Alexandros Karatzoglou","Ioannis Arapakis","Kaiwen Zheng","Yongxin Ni","Joemon M. Jose"],"pdf_url":"https://arxiv.org/pdf/2411.02992v1.pdf","comment":"The extension of IISAN in SIGIR2024"},{"id":"http://arxiv.org/abs/2411.02959v1","updated":"2024-11-05T09:58:36Z","published":"2024-11-05T09:58:36Z","title":"HtmlRAG: HTML is Better Than Plain Text for Modeling Retrieved Knowledge\n in RAG Systems","summary":" Retrieval-Augmented Generation (RAG) has been shown to improve knowledge\ncapabilities and alleviate the hallucination problem of LLMs. The Web is a\nmajor source of external knowledge used in RAG systems, and many commercial\nsystems such as ChatGPT and Perplexity have used Web search engines as their\nmajor retrieval systems. Typically, such RAG systems retrieve search results,\ndownload HTML sources of the results, and then extract plain texts from the\nHTML sources. Plain text documents or chunks are fed into the LLMs to augment\nthe generation. However, much of the structural and semantic information\ninherent in HTML, such as headings and table structures, is lost during this\nplain-text-based RAG process. To alleviate this problem, we propose HtmlRAG,\nwhich uses HTML instead of plain text as the format of retrieved knowledge in\nRAG. We believe HTML is better than plain text in modeling knowledge in\nexternal documents, and most LLMs possess robust capacities to understand HTML.\nHowever, utilizing HTML presents new challenges. HTML contains additional\ncontent such as tags, JavaScript, and CSS specifications, which bring extra\ninput tokens and noise to the RAG system. To address this issue, we propose\nHTML cleaning, compression, and pruning strategies, to shorten the HTML while\nminimizing the loss of information. Specifically, we design a two-step\nblock-tree-based pruning method that prunes useless HTML blocks and keeps only\nthe relevant part of the HTML. Experiments on six QA datasets confirm the\nsuperiority of using HTML in RAG systems.\n","authors":["Jiejun Tan","Zhicheng Dou","Wen Wang","Mang Wang","Weipeng Chen","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2411.02959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01135v2","updated":"2024-11-05T08:51:44Z","published":"2024-11-02T04:44:27Z","title":"Music Foundation Model as Generic Booster for Music Downstream Tasks","summary":" We demonstrate the efficacy of using intermediate representations from a\nsingle foundation model to enhance various music downstream tasks. We introduce\nSoniDo, a music foundation model (MFM) designed to extract hierarchical\nfeatures from target music samples. By leveraging hierarchical intermediate\nfeatures, SoniDo constrains the information granularity, leading to improved\nperformance across various downstream tasks including both understanding and\ngenerative tasks. 
We specifically evaluated this approach on representative\ntasks such as music tagging, music transcription, music source separation, and\nmusic mixing. Our results reveal that the features extracted from foundation\nmodels provide valuable enhancements in training downstream task models. This\nhighlights the capability of using features extracted from music foundation\nmodels as a booster for downstream tasks. Our approach not only benefits\nexisting task-specific models but also supports music downstream tasks\nconstrained by data scarcity. This paves the way for more effective and\naccessible music processing solutions.\n","authors":["WeiHsiang Liao","Yuhta Takida","Yukara Ikemiya","Zhi Zhong","Chieh-Hsin Lai","Giorgio Fabbro","Kazuki Shimada","Keisuke Toyama","Kinwai Cheuk","Marco A. Martínez-Ramírez","Shusuke Takahashi","Stefan Uhlich","Taketo Akama","Woosung Choi","Yuichiro Koyama","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2411.01135v2.pdf","comment":"41 pages with 14 figures"},{"id":"http://arxiv.org/abs/2411.02864v1","updated":"2024-11-05T07:12:36Z","published":"2024-11-05T07:12:36Z","title":"Graph-DPEP: Decomposed Plug and Ensemble Play for Few-Shot Document\n Relation Extraction with Graph-of-Thoughts Reasoning","summary":" Large language models (LLMs) pre-trained on massive corpora have demonstrated\nimpressive few-shot learning capability on many NLP tasks. Recasting an NLP\ntask into a text-to-text generation task is a common practice so that\ngenerative LLMs can be prompted to resolve it. However, performing\ndocument-level relation extraction (DocRE) tasks with generative LLM models is\nstill challenging due to the structured output format of DocRE, which\ncomplicates the conversion to plain text. Limited information available in\nfew-shot samples and prompt instructions induce further difficulties and\nchallenges in relation extraction for mentioned entities in a document. In this\npaper, we represent the structured output as a graph-style triplet rather than\nnatural language expressions and leverage generative LLMs for the DocRE task.\nOur approach, the Graph-DPEP framework is grounded in the reasoning behind\ntriplet explanation thoughts presented in natural language. In this framework,\nwe first introduce a ``decomposed-plug\" method for performing the generation\nfrom LLMs over prompts with type-space decomposition to alleviate the burden of\ndistinguishing all relation types. Second, we employ a verifier for calibrating\nthe generation and identifying overlooked query entity pairs. Third, we develop\n\"ensemble-play\", reapplying generation on the entire type list by leveraging\nthe reasoning thoughts embedded in a sub-graph associated with the missing\nquery pair to address the missingness issue. Through extensive comparisons with\nexisting prompt techniques and alternative Language Models (LLMs), our\nframework demonstrates superior performance on publicly available benchmarks in\nexperiments.\n","authors":["Tao Zhang","Ning Yan","Masood Mortazavi","Hoang H. Nguyen","Zhongfen Deng","Philip S. 
Yu"],"pdf_url":"https://arxiv.org/pdf/2411.02864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03364v1","updated":"2024-11-05T06:54:38Z","published":"2024-11-05T06:54:38Z","title":"DM4Steal: Diffusion Model For Link Stealing Attack On Graph Neural\n Networks","summary":" Graph has become increasingly integral to the advancement of recommendation\nsystems, particularly with the fast development of graph neural network(GNN).\nBy exploring the virtue of rich node features and link information, GNN is\ndesigned to provide personalized and accurate suggestions. Meanwhile, the\nprivacy leakage of GNN in such contexts has also captured special attention.\nPrior work has revealed that a malicious user can utilize auxiliary knowledge\nto extract sensitive link data of the target graph, integral to recommendation\nsystems, via the decision made by the target GNN model. This poses a\nsignificant risk to the integrity and confidentiality of data used in\nrecommendation system. Though important, previous works on GNN's privacy\nleakage are still challenged in three aspects, i.e., limited stealing attack\nscenarios, sub-optimal attack performance, and adaptation against defense. To\naddress these issues, we propose a diffusion model based link stealing attack,\nnamed DM4Steal. It differs previous work from three critical aspects. (i)\nGenerality: aiming at six attack scenarios with limited auxiliary knowledge, we\npropose a novel training strategy for diffusion models so that DM4Steal is\ntransferable to diverse attack scenarios. (ii) Effectiveness: benefiting from\nthe retention of semantic structure in the diffusion model during the training\nprocess, DM4Steal is capable to learn the precise topology of the target graph\nthrough the GNN decision process. (iii) Adaptation: when GNN is defensive\n(e.g., DP, Dropout), DM4Steal relies on the stability that comes from sampling\nthe score model multiple times to keep performance degradation to a minimum,\nthus DM4Steal implements successful adaptive attack on defensive GNN.\n","authors":["Jinyin Chen","Haonan Ma","Haibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.03364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14900v3","updated":"2024-11-05T06:52:30Z","published":"2024-06-21T06:47:28Z","title":"Decoding Matters: Addressing Amplification Bias and Homogeneity Issue\n for LLM-based Recommendation","summary":" Adapting Large Language Models (LLMs) for recommendation requires careful\nconsideration of the decoding process, given the inherent differences between\ngenerating items and natural language. Existing approaches often directly apply\nLLMs' original decoding methods. However, we find these methods encounter\nsignificant challenges: 1) amplification bias -- where standard length\nnormalization inflates scores for items containing tokens with generation\nprobabilities close to 1 (termed ghost tokens), and 2) homogeneity issue --\ngenerating multiple similar or repetitive items for a user. To tackle these\nchallenges, we introduce a new decoding approach named Debiasing-Diversifying\nDecoding (D3). D3 disables length normalization for ghost tokens to alleviate\namplification bias, and it incorporates a text-free assistant model to\nencourage tokens less frequently generated by LLMs for counteracting\nrecommendation homogeneity. Extensive experiments on real-world datasets\ndemonstrate the method's effectiveness in enhancing accuracy and diversity. 
The\ncode is available at https://github.com/SAI990323/DecodingMatters.\n","authors":["Keqin Bao","Jizhi Zhang","Yang Zhang","Xinyue Huo","Chong Chen","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2406.14900v3.pdf","comment":"Accepted at EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2411.02851v1","updated":"2024-11-05T06:49:14Z","published":"2024-11-05T06:49:14Z","title":"Learning to Unify Audio, Visual and Text for Audio-Enhanced Multilingual\n Visual Answer Localization","summary":" The goal of Multilingual Visual Answer Localization (MVAL) is to locate a\nvideo segment that answers a given multilingual question. Existing methods\neither focus solely on visual modality or integrate visual and subtitle\nmodalities. However, these methods neglect the audio modality in videos,\nconsequently leading to incomplete input information and poor performance in\nthe MVAL task. In this paper, we propose a unified Audio-Visual-Textual Span\nLocalization (AVTSL) method that incorporates audio modality to augment both\nvisual and textual representations for the MVAL task. Specifically, we\nintegrate features from three modalities and develop three predictors, each\ntailored to the unique contributions of the fused modalities: an audio-visual\npredictor, a visual predictor, and a textual predictor. Each predictor\ngenerates predictions based on its respective modality. To maintain consistency\nacross the predicted results, we introduce an Audio-Visual-Textual Consistency\nmodule. This module utilizes a Dynamic Triangular Loss (DTL) function, allowing\neach modality's predictor to dynamically learn from the others. This\ncollaborative learning ensures that the model generates consistent and\ncomprehensive answers. Extensive experiments show that our proposed method\noutperforms several state-of-the-art (SOTA) methods, which demonstrates the\neffectiveness of the audio modality.\n","authors":["Zhibin Wen","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2411.02851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02850v1","updated":"2024-11-05T06:44:15Z","published":"2024-11-05T06:44:15Z","title":"WASHtsApp -- A RAG-powered WhatsApp Chatbot for supporting rural African\n clean water access, sanitation and hygiene","summary":" This paper introduces WASHtsApp, a WhatsApp-based chatbot designed to educate\nrural African communities on clean water access, sanitation, and hygiene (WASH)\nprinciples. WASHtsApp leverages a Retrieval-Augmented Generation (RAG) approach\nto address the limitations of previous approaches with limited reach or missing\ncontextualization. The paper details the development process, employing Design\nScience Research Methodology. The evaluation consisted of two phases: content\nvalidation by four WASH experts and community validation by potential users.\nContent validation confirmed WASHtsApp's ability to provide accurate and\nrelevant WASH-related information. Community validation indicated high user\nacceptance and perceived usefulness of the chatbot. The paper concludes by\ndiscussing the potential for further development, including incorporating local\nlanguages and user data analysis for targeted interventions. 
It also proposes\nfuture research cycles focused on wider deployment and leveraging user data for\neducational purposes.\n","authors":["Simon Kloker","Alex Cedric Luyima","Matthew Bazanya"],"pdf_url":"https://arxiv.org/pdf/2411.02850v1.pdf","comment":"Working Paper"},{"id":"http://arxiv.org/abs/2411.02831v1","updated":"2024-11-05T06:03:55Z","published":"2024-11-05T06:03:55Z","title":"Enhancing EmoBot: An In-Depth Analysis of User Satisfaction and Faults\n in an Emotion-Aware Chatbot","summary":" The research community has traditionally shown a keen interest in emotion\nmodeling, with a notable emphasis on the detection aspect. In contrast, the\nexploration of emotion generation has received less attention. This study delves\ninto an existing state-of-the-art emotional chatbot, EmoBot, designed for\ngenerating emotions in general-purpose conversations. This research involves a\ncomprehensive examination, including a survey to evaluate EmoBot's proficiency\nin key dimensions like usability, accuracy, and overall user satisfaction, with\na specific focus on fault tolerance. By closely examining the chatbot's\noperations, we identified some noteworthy shortcomings in the existing model.\nWe propose some solutions designed to address and overcome the identified\nissues.\n","authors":["Taseen Mubassira","Mehedi Hasan","A. B. M. Alim Al Iislam"],"pdf_url":"https://arxiv.org/pdf/2411.02831v1.pdf","comment":"3 pages, extended abstract"},{"id":"http://arxiv.org/abs/2411.02810v1","updated":"2024-11-05T04:57:55Z","published":"2024-11-05T04:57:55Z","title":"Leveraging Vision-Language Models for Manufacturing Feature Recognition\n in CAD Designs","summary":" Automatic feature recognition (AFR) is essential for transforming design\nknowledge into actionable manufacturing information. Traditional AFR methods,\nwhich rely on predefined geometric rules and large datasets, are often\ntime-consuming and lack generalizability across various manufacturing features.\nTo address these challenges, this study investigates vision-language models\n(VLMs) for automating the recognition of a wide range of manufacturing features\nin CAD designs without the need for extensive training datasets or predefined\nrules. Instead, prompt engineering techniques, such as multi-view query images,\nfew-shot learning, sequential reasoning, and chain-of-thought, are applied to\nenable recognition. The approach is evaluated on a newly developed CAD dataset\ncontaining designs of varying complexity relevant to machining, additive\nmanufacturing, sheet metal forming, molding, and casting. Five VLMs, including\nthree closed-source models (GPT-4o, Claude-3.5-Sonnet, and Claude-3.0-Opus) and\ntwo open-source models (LLava and MiniCPM), are evaluated on this dataset with\nground truth features labelled by experts. Key metrics include feature quantity\naccuracy, feature name matching accuracy, hallucination rate, and mean absolute\nerror (MAE). Results show that Claude-3.5-Sonnet achieves the highest feature\nquantity accuracy (74%) and name-matching accuracy (75%) with the lowest MAE\n(3.2), while GPT-4o records the lowest hallucination rate (8%). In contrast,\nopen-source models have higher hallucination rates (>30%) and lower accuracies\n(<40%). 
This study demonstrates the potential of VLMs to automate feature\nrecognition in CAD designs within diverse manufacturing scenarios.\n","authors":["Muhammad Tayyab Khan","Lequn Chen","Ye Han Ng","Wenhe Feng","Nicholas Yew Jin Tan","Seung Ki Moon"],"pdf_url":"https://arxiv.org/pdf/2411.02810v1.pdf","comment":"Paper has been submitted to The ASME Journal of Computing and\n Information Science in Engineering (JCISE)"},{"id":"http://arxiv.org/abs/2411.02791v1","updated":"2024-11-05T04:01:41Z","published":"2024-11-05T04:01:41Z","title":"Language Models and Cycle Consistency for Self-Reflective Machine\n Translation","summary":" This paper introduces a novel framework that leverages large language models\n(LLMs) for machine translation (MT). We start with one conjecture: an ideal\ntranslation should contain complete and accurate information for a strong\nenough LLM to recover the original sentence. We generate multiple translation\ncandidates from a source language A to a target language B, and subsequently\ntranslate these candidates back to the original language A. By evaluating the\ncycle consistency between the original and back-translated sentences using\nmetrics such as token-level precision and accuracy, we implicitly estimate the\ntranslation quality in language B, without knowing its ground-truth. This also\nhelps to evaluate the LLM translation capability, only with monolingual\ncorpora. For each source sentence, we identify the translation candidate with\noptimal cycle consistency with the original sentence as the final answer. Our\nexperiments demonstrate that larger LLMs, or the same LLM with more forward\npasses during inference, exhibit increased cycle consistency, aligning with the\nLLM model size scaling law and test-time computation scaling law. This work\nprovides methods to 1) implicitly evaluate the translation quality of a\nsentence in the target language, 2) evaluate the capability of an LLM for\nany-to-any-language translation, and 3) generate a better translation\nfor a specific LLM.\n","authors":["Jianqiao Wangni"],"pdf_url":"https://arxiv.org/pdf/2411.02791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02790v1","updated":"2024-11-05T03:55:25Z","published":"2024-11-05T03:55:25Z","title":"Memory Augmented Cross-encoders for Controllable Personalized Search","summary":" Personalized search represents a problem where retrieval models condition on\nhistorical user interaction data in order to improve retrieval results.\nHowever, personalization is commonly perceived as opaque and not amenable to\ncontrol by users. Further, personalization necessarily limits the space of\nitems that users are exposed to. Therefore, prior work notes a tension between\npersonalization and users' ability to discover novel items. While discovery\nof novel items in personalization setups may be resolved through search result\ndiversification, these approaches do little to allow user control over\npersonalization. Therefore, in this paper, we introduce an approach for\ncontrollable personalized search. Our model, CtrlCE, is a novel\ncross-encoder augmented with an editable memory constructed from users'\nhistorical items. Our proposed memory augmentation allows cross-encoder models\nto condition on large amounts of historical user data and supports interaction\nfrom users, permitting control over personalization. Further, controllable\npersonalization for search must account for queries which don't require\npersonalization, and in turn user control. 
For this, we introduce a calibrated\nmixing model which determines when personalization is necessary. This allows\nsystem designers using CtrlCE to only obtain user input for control when\nnecessary. In multiple datasets of personalized search, we show CtrlCE to\nresult in effective personalization as well as fulfill various key goals for\ncontrollable personalized search.\n","authors":["Sheshera Mysore","Garima Dhanania","Kishor Patil","Surya Kallumadi","Andrew McCallum","Hamed Zamani"],"pdf_url":"https://arxiv.org/pdf/2411.02790v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2410.09359v2","updated":"2024-11-05T03:45:24Z","published":"2024-10-12T04:00:55Z","title":"Green Recommender Systems: Optimizing Dataset Size for Energy-Efficient\n Algorithm Performance","summary":" As recommender systems become increasingly prevalent, the environmental\nimpact and energy efficiency of training large-scale models have come under\nscrutiny. This paper investigates the potential for energy-efficient algorithm\nperformance by optimizing dataset sizes through downsampling techniques in the\ncontext of Green Recommender Systems. We conducted experiments on the MovieLens\n100K, 1M, 10M, and Amazon Toys and Games datasets, analyzing the performance of\nvarious recommender algorithms across different portions of the dataset. Our\nresults indicate that while more training data generally leads to higher\nalgorithm performance, certain algorithms, such as FunkSVD and BiasedMF,\nparticularly with unbalanced and sparse datasets like Amazon Toys and Games,\nmaintain high-quality recommendations with up to a 50% reduction in training\ndata, achieving nDCG@10 scores within approximately 13% of full dataset\nperformance. These findings suggest that strategic dataset reduction can\ndecrease computational and environmental costs without substantially\ncompromising recommendation quality. This study advances sustainable and green\nrecommender systems by providing insights for reducing energy consumption while\nmaintaining effectiveness.\n","authors":["Ardalan Arabzadeh","Tobias Vente","Joeran Beel"],"pdf_url":"https://arxiv.org/pdf/2410.09359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09180v2","updated":"2024-11-05T03:34:10Z","published":"2023-11-15T18:19:58Z","title":"Pearl: Personalizing Large Language Model Writing Assistants with\n Generation-Calibrated Retrievers","summary":" Powerful large language models have facilitated the development of writing\nassistants that promise to significantly improve the quality and efficiency of\ncomposition and communication. However, a barrier to effective assistance is\nthe lack of personalization in LLM outputs to the author's communication style,\nspecialized knowledge, and values. In this paper, we address this challenge by\nproposing Pearl, an LLM writing assistant personalized with a retriever that is\ntrained to be generation-calibrated for personalization. Generation calibration\nensures that our retriever selects historic user-authored documents to augment\nan LLM prompt such that they are likely to help an LLM generation better adhere\nto a user's preferences. 
We\npropose two key novelties for training such a\nretriever: (1) A training data selection method that identifies user requests\nlikely to benefit from personalization and documents that provide that benefit;\nand (2) A scale-calibrating KL-divergence objective that ensures that our\nretriever scores remain proportional to the downstream generation quality from\nusing the document for personalized generation. In a series of holistic\nevaluations, we demonstrate the effectiveness of Pearl in generating long-form\ntexts on multiple social media datasets. Finally, we demonstrate how a\ngeneration-calibrated retriever can double as a performance predictor --\ndetecting low-quality retrieval, and improving potentially under-performing\noutputs via revision with LLMs.\n","authors":["Sheshera Mysore","Zhuoran Lu","Mengting Wan","Longqi Yang","Bahareh Sarrafzadeh","Steve Menezes","Tina Baghaee","Emmanuel Barajas Gonzalez","Jennifer Neville","Tara Safavi"],"pdf_url":"https://arxiv.org/pdf/2311.09180v2.pdf","comment":"Accepted to Workshop on Customizable NLP at EMNLP 2024"},{"id":"http://arxiv.org/abs/2408.09698v4","updated":"2024-11-05T03:32:31Z","published":"2024-08-19T04:44:32Z","title":"Harnessing Multimodal Large Language Models for Multimodal Sequential\n Recommendation","summary":" Recent advances in Large Language Models (LLMs) have demonstrated significant\npotential in the field of Recommendation Systems (RSs). Most existing studies\nhave focused on converting user behavior logs into textual prompts and\nleveraging techniques such as prompt tuning to enable LLMs for recommendation\ntasks. Meanwhile, research interest has recently grown in multimodal\nrecommendation systems that integrate data from images, text, and other sources\nusing modality fusion techniques. This introduces new challenges to the\nexisting LLM-based recommendation paradigm which relies solely on text modality\ninformation. Moreover, although Multimodal Large Language Models (MLLMs)\ncapable of processing multi-modal inputs have emerged, how to equip MLLMs with\nmulti-modal recommendation capabilities remains largely unexplored. To this\nend, in this paper, we propose the Multimodal Large Language Model-enhanced\nMultimodal Sequential Recommendation (MLLM-MSR) model. To capture the dynamic\nuser preferences, we design a two-stage user preference summarization method.\nSpecifically, we first utilize an MLLM-based item-summarizer to extract image\nfeatures given an item and convert the image into text. Then, we employ a\nrecurrent user preference summarization generation paradigm to capture the\ndynamic changes in user preferences based on an LLM-based user-summarizer.\nFinally, to enable the MLLM for the multi-modal recommendation task, we propose to\nfine-tune an MLLM-based recommender using Supervised Fine-Tuning (SFT)\ntechniques. 
Extensive evaluations across various datasets validate the\neffectiveness of MLLM-MSR, showcasing its superior ability to capture and adapt\nto the evolving dynamics of user preferences.\n","authors":["Yuyang Ye","Zhi Zheng","Yishan Shen","Tianshu Wang","Hengruo Zhang","Peijun Zhu","Runlong Yu","Kai Zhang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.09698v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00077v3","updated":"2024-11-05T02:25:16Z","published":"2024-06-22T15:32:53Z","title":"Differentially Private Graph Diffusion with Applications in Personalized\n PageRanks","summary":" Graph diffusion, which iteratively propagates real-valued substances among\nthe graph, is used in numerous graph/network-involved applications. However,\nreleasing diffusion vectors may reveal sensitive linking information in the\ndata such as transaction information in financial network data. However,\nprotecting the privacy of graph data is challenging due to its interconnected\nnature. This work proposes a novel graph diffusion framework with edge-level\ndifferential privacy guarantees by using noisy diffusion iterates. The\nalgorithm injects Laplace noise per diffusion iteration and adopts a\ndegree-based thresholding function to mitigate the high sensitivity induced by\nlow-degree nodes. Our privacy loss analysis is based on Privacy Amplification\nby Iteration (PABI), which to our best knowledge, is the first effort that\nanalyzes PABI with Laplace noise and provides relevant applications. We also\nintroduce a novel Infinity-Wasserstein distance tracking method, which tightens\nthe analysis of privacy leakage and makes PABI more applicable in practice. We\nevaluate this framework by applying it to Personalized Pagerank computation for\nranking tasks. Experiments on real-world network data demonstrate the\nsuperiority of our method under stringent privacy conditions.\n","authors":["Rongzhe Wei","Eli Chien","Pan Li"],"pdf_url":"https://arxiv.org/pdf/2407.00077v3.pdf","comment":"Appear in NeurIPS 2024. In this version, we provide a more rigorous\n analysis of graph distortion by establishing a tight bound, then update our\n corresponding experimental results, which are better than the previous\n version"},{"id":"http://arxiv.org/abs/2411.02695v1","updated":"2024-11-05T00:46:25Z","published":"2024-11-05T00:46:25Z","title":"JEL: Applying End-to-End Neural Entity Linking in JPMorgan Chase","summary":" Knowledge Graphs have emerged as a compelling abstraction for capturing key\nrelationship among the entities of interest to enterprises and for integrating\ndata from heterogeneous sources. JPMorgan Chase (JPMC) is leading this trend by\nleveraging knowledge graphs across the organization for multiple mission\ncritical applications such as risk assessment, fraud detection, investment\nadvice, etc. A core problem in leveraging a knowledge graph is to link mentions\n(e.g., company names) that are encountered in textual sources to entities in\nthe knowledge graph. Although several techniques exist for entity linking, they\nare tuned for entities that exist in Wikipedia, and fail to generalize for the\nentities that are of interest to an enterprise. In this paper, we propose a\nnovel end-to-end neural entity linking model (JEL) that uses minimal context\ninformation and a margin loss to generate entity embeddings, and a Wide & Deep\nLearning model to match character and semantic information respectively. 
We\nshow that JEL achieves state-of-the-art performance in linking mentions of\ncompany names in financial news with entities in our knowledge graph. We report\non our efforts to deploy this model in the company-wide system to generate\nalerts in response to financial news. The methodology used for JEL is directly\napplicable and usable by other enterprises that need entity linking solutions\nfor data that are unique to their respective situations.\n","authors":["Wanying Ding","Vinay K. Chaudhri","Naren Chittar","Krishna Konakanchi"],"pdf_url":"https://arxiv.org/pdf/2411.02695v1.pdf","comment":"8 pages, 4 figures, IAAI-21"},{"id":"http://arxiv.org/abs/2411.02692v1","updated":"2024-11-05T00:39:22Z","published":"2024-11-05T00:39:22Z","title":"JPEC: A Novel Graph Neural Network for Competitor Retrieval in Financial\n Knowledge Graphs","summary":" Knowledge graphs have gained popularity for their ability to organize and\nanalyze complex data effectively. When combined with graph embedding\ntechniques, such as graph neural networks (GNNs), knowledge graphs become a\npotent tool in providing valuable insights. This study explores the application\nof graph embedding in identifying competitors from a financial knowledge graph.\nExisting state-of-the-art (SOTA) models face challenges due to the unique\nattributes of our knowledge graph, including directed and undirected\nrelationships, attributed nodes, and minimal annotated competitor connections.\nTo address these challenges, we propose a novel graph embedding model,\nJPEC (JPMorgan Proximity Embedding for Competitor Detection), which utilizes a\ngraph neural network to learn from both first-order and second-order node\nproximity together with vital features for competitor retrieval. JPEC\noutperformed most existing models in extensive experiments, showcasing its\neffectiveness in competitor retrieval.\n","authors":["Wanying Ding","Manoj Cherukumalli","Santosh Chikoti","Vinay K. Chaudhri"],"pdf_url":"https://arxiv.org/pdf/2411.02692v1.pdf","comment":"5 pages, 4 figures, accepted by SIGIR'24"}],"Multimedia":[{"id":"http://arxiv.org/abs/2411.03109v1","updated":"2024-11-05T13:56:44Z","published":"2024-11-05T13:56:44Z","title":"pTSE-T: Presentation Target Speaker Extraction using Unaligned Text Cues","summary":" TSE aims to extract the clean speech of the target speaker in an audio\nmixture, thus eliminating irrelevant background noise and speech. While prior\nwork has explored various auxiliary cues including pre-recorded speech, visual\ninformation (e.g., lip motions and gestures), and spatial information, the\nacquisition and selection of such strong cues are infeasible in many practical\nscenarios. Unlike all existing work, in this paper, we condition the TSE\nalgorithm on semantic cues extracted from limited and unaligned text content,\nsuch as condensed points from a presentation slide. This method is particularly\nuseful in scenarios like meetings, poster sessions, or lecture presentations,\nwhere acquiring other cues in real-time is challenging. To this end, we design\ntwo different networks. Specifically, our proposed TPE fuses audio features\nwith content-based semantic cues to facilitate time-frequency mask generation\nto filter out extraneous noise, while another proposal, namely TSR, employs the\ncontrastive learning technique to associate blindly separated speech signals\nwith semantic cues. 
The experimental results show the efficacy in accurately\nidentifying the target speaker by utilizing semantic cues derived from limited\nand unaligned text, resulting in SI-SDRi of 12.16 dB, SDRi of 12.66 dB, PESQi\nof 0.830 and STOIi of 0.150, respectively. Dataset and source code will be\npublicly available. Project demo page: https://slideTSE.github.io/.\n","authors":["Ziyang Jiang","Xinquan Qian","Jiahe Lei","Zexu Pan","Wei Xue","Xu-cheng Yin"],"pdf_url":"https://arxiv.org/pdf/2411.03109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03085v1","updated":"2024-11-05T13:30:27Z","published":"2024-11-05T13:30:27Z","title":"Speech Separation with Pretrained Frontend to Minimize Domain Mismatch","summary":" Speech separation seeks to separate individual speech signals from a speech\nmixture. Typically, most separation models are trained on synthetic data due to\nthe unavailability of target reference in real-world cocktail party scenarios.\nAs a result, there exists a domain gap between real and synthetic data when\ndeploying speech separation models in real-world applications. In this paper,\nwe propose a self-supervised domain-invariant pretrained (DIP) frontend that is\nexposed to mixture data without the need for target reference speech. The DIP\nfrontend utilizes a Siamese network with two innovative pretext tasks, mixture\npredictive coding (MPC) and mixture invariant coding (MIC), to capture shared\ncontextual cues between real and synthetic unlabeled mixtures. Subsequently, we\nfreeze the DIP frontend as a feature extractor when training the downstream\nspeech separation models on synthetic data. By pretraining the DIP frontend\nwith the contextual cues, we expect that the speech separation skills learned\nfrom synthetic data can be effectively transferred to real data. To benefit\nfrom the DIP frontend, we introduce a novel separation pipeline to align the\nfeature resolution of the separation models. We evaluate the speech separation\nquality on standard benchmarks and real-world datasets. The results confirm the\nsuperiority of our DIP frontend over existing speech separation models. This\nstudy underscores the potential of large-scale pretraining to enhance the\nquality and intelligibility of speech separation in real-world applications.\n","authors":["Wupeng Wang","Zexu Pan","Xinke Li","Shuai Wang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2411.03085v1.pdf","comment":"IEEE/ACM Transactions on Audio, Speech, and Language Processing"},{"id":"http://arxiv.org/abs/2411.03034v1","updated":"2024-11-05T12:14:57Z","published":"2024-11-05T12:14:57Z","title":"HumanVLM: Foundation for Human-Scene Vision-Language Model","summary":" Human-scene vision-language tasks are increasingly prevalent in diverse\nsocial applications, yet recent advancements predominantly rely on models\nspecifically tailored to individual tasks. Emerging research indicates that\nlarge vision-language models (VLMs) can enhance performance across various\ndownstream vision-language understanding tasks. However, general-domain models\noften underperform in specialized fields. This study introduces a\ndomain-specific Large Vision-Language Model, Human-Scene Vision-Language Model\n(HumanVLM), designed to provide a foundation for human-scene Vision-Language\ntasks. 
Specifically, (1) we create a large-scale human-scene multimodal\nimage-text dataset (HumanCaption-10M) sourced from the Internet to facilitate\ndomain-specific alignment; (2) develop a captioning approach for human-centered\nimages, capturing human faces, bodies, and backgrounds, and construct a\nhigh-quality Human-Scene image-text dataset (HumanCaptionHQ, about 311k pairs)\nthat contains as much detailed information as possible about humans; (3) Using\nHumanCaption-10M and HumanCaptionHQ, we train a HumanVLM. In the experiments,\nwe then evaluate our HumanVLM across various downstream tasks, where it\ndemonstrates superior overall performance among multimodal models of comparable\nscale, particularly excelling in human-related tasks and significantly\noutperforming similar models, including Qwen2VL and ChatGPT-4o. HumanVLM,\nalongside the data introduced, will stimulate research in human-related\nfields.\n","authors":["Dawei Dai","Xu Long","Li Yutang","Zhang Yuanhui","Shuyin Xia"],"pdf_url":"https://arxiv.org/pdf/2411.03034v1.pdf","comment":"34 pages,11 figures"},{"id":"http://arxiv.org/abs/2411.03010v1","updated":"2024-11-05T11:18:43Z","published":"2024-11-05T11:18:43Z","title":"Learning-based Lossless Event Data Compression","summary":" Emerging event cameras acquire visual information by detecting time domain\nbrightness changes asynchronously at the pixel level and, unlike conventional\ncameras, are able to provide high temporal resolution, very high dynamic range,\nlow latency, and low power consumption. Considering the huge amount of data\ninvolved, efficient compression solutions are very much needed. In this\ncontext, this paper presents a novel deep-learning-based lossless event data\ncompression scheme based on octree partitioning and a learned hyperprior model.\nThe proposed method arranges the event stream as a 3D volume and employs an\noctree structure for adaptive partitioning. A deep neural network-based entropy\nmodel, using a hyperprior, is then applied. Experimental results demonstrate\nthat the proposed method outperforms traditional lossless data compression\ntechniques in terms of compression ratio and bits per event.\n","authors":["Ahmadreza Sezavar","Catarina Brites","Joao Ascenso"],"pdf_url":"https://arxiv.org/pdf/2411.03010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02905v3","updated":"2024-11-05T10:51:30Z","published":"2023-08-05T15:54:06Z","title":"FASTER: A Font-Agnostic Scene Text Editing and Rendering Framework","summary":" Scene Text Editing (STE) is a challenging research problem, that primarily\naims towards modifying existing texts in an image while preserving the\nbackground and the font style of the original text. Despite its utility in\nnumerous real-world applications, existing style-transfer-based approaches have\nshown sub-par editing performance due to (1) complex image backgrounds, (2)\ndiverse font attributes, and (3) varying word lengths within the text. To\naddress such limitations, in this paper, we propose a novel font-agnostic scene\ntext editing and rendering framework, named FASTER, for simultaneously\ngenerating text in arbitrary styles and locations while preserving a natural\nand realistic appearance and structure. A combined fusion of target mask\ngeneration and style transfer units, with a cascaded self-attention mechanism\nhas been proposed to focus on multi-level text region edits to handle varying\nword lengths. 
Extensive evaluation on a real-world database with further\nsubjective human evaluation study indicates the superiority of FASTER in both\nscene text editing and rendering tasks, in terms of model performance and\nefficiency. Our code will be released upon acceptance.\n","authors":["Alloy Das","Sanket Biswas","Prasun Roy","Subhankar Ghosh","Umapada Pal","Michael Blumenstein","Josep Lladós","Saumik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2308.02905v3.pdf","comment":"Accepted in WACV 2025"},{"id":"http://arxiv.org/abs/2411.02860v1","updated":"2024-11-05T07:09:14Z","published":"2024-11-05T07:09:14Z","title":"Continual Audio-Visual Sound Separation","summary":" In this paper, we introduce a novel continual audio-visual sound separation\ntask, aiming to continuously separate sound sources for new classes while\npreserving performance on previously learned classes, with the aid of visual\nguidance. This problem is crucial for practical visually guided auditory\nperception as it can significantly enhance the adaptability and robustness of\naudio-visual sound separation models, making them more applicable for\nreal-world scenarios where encountering new sound sources is commonplace. The\ntask is inherently challenging as our models must not only effectively utilize\ninformation from both modalities in current tasks but also preserve their\ncross-modal association in old tasks to mitigate catastrophic forgetting during\naudio-visual continual learning. To address these challenges, we propose a\nnovel approach named ContAV-Sep (\\textbf{Cont}inual\n\\textbf{A}udio-\\textbf{V}isual Sound \\textbf{Sep}aration). ContAV-Sep presents\na novel Cross-modal Similarity Distillation Constraint (CrossSDC) to uphold the\ncross-modal semantic similarity through incremental tasks and retain previously\nacquired knowledge of semantic similarity in old models, mitigating the risk of\ncatastrophic forgetting. The CrossSDC can seamlessly integrate into the\ntraining process of different audio-visual sound separation frameworks.\nExperiments demonstrate that ContAV-Sep can effectively mitigate catastrophic\nforgetting and achieve significantly better performance compared to other\ncontinual learning baselines for audio-visual sound separation. Code is\navailable at: \\url{https://github.com/weiguoPian/ContAV-Sep_NeurIPS2024}.\n","authors":["Weiguo Pian","Yiyang Nan","Shijian Deng","Shentong Mo","Yunhui Guo","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2411.02860v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.02851v1","updated":"2024-11-05T06:49:14Z","published":"2024-11-05T06:49:14Z","title":"Learning to Unify Audio, Visual and Text for Audio-Enhanced Multilingual\n Visual Answer Localization","summary":" The goal of Multilingual Visual Answer Localization (MVAL) is to locate a\nvideo segment that answers a given multilingual question. Existing methods\neither focus solely on visual modality or integrate visual and subtitle\nmodalities. However, these methods neglect the audio modality in videos,\nconsequently leading to incomplete input information and poor performance in\nthe MVAL task. In this paper, we propose a unified Audio-Visual-Textual Span\nLocalization (AVTSL) method that incorporates audio modality to augment both\nvisual and textual representations for the MVAL task. 
Specifically, we\nintegrate features from three modalities and develop three predictors, each\ntailored to the unique contributions of the fused modalities: an audio-visual\npredictor, a visual predictor, and a textual predictor. Each predictor\ngenerates predictions based on its respective modality. To maintain consistency\nacross the predicted results, we introduce an Audio-Visual-Textual Consistency\nmodule. This module utilizes a Dynamic Triangular Loss (DTL) function, allowing\neach modality's predictor to dynamically learn from the others. This\ncollaborative learning ensures that the model generates consistent and\ncomprehensive answers. Extensive experiments show that our proposed method\noutperforms several state-of-the-art (SOTA) methods, which demonstrates the\neffectiveness of the audio modality.\n","authors":["Zhibin Wen","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2411.02851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04828v3","updated":"2024-11-05T02:32:06Z","published":"2024-09-07T13:41:37Z","title":"POINTS: Improving Your Vision-language Model with Affordable Strategies","summary":" In recent years, vision-language models have made significant strides,\nexcelling in tasks like optical character recognition and geometric\nproblem-solving. However, several critical issues remain: 1) Proprietary models\noften lack transparency about their architectures, while open-source models\nneed more detailed ablations of their training strategies. 2) Pre-training data\nin open-source works is under-explored, with datasets added empirically, making\nthe process cumbersome. 3) Fine-tuning often focuses on adding datasets,\nleading to diminishing returns. To address these issues, we propose the\nfollowing contributions: 1) We trained a robust baseline model using the latest\nadvancements in vision-language models, introducing effective improvements and\nconducting comprehensive ablation and validation for each technique. 2)\nInspired by recent work on large language models, we filtered pre-training data\nusing perplexity, selecting the lowest perplexity data for training. This\napproach allowed us to train on a curated 1M dataset, achieving competitive\nperformance. 3) During visual instruction tuning, we used model soup on\ndifferent datasets when adding more datasets yielded marginal improvements.\nThese innovations resulted in a 9B parameter model that performs competitively\nwith state-of-the-art models. Our strategies are efficient and lightweight,\nmaking them easily adoptable by the community.\n","authors":["Yuan Liu","Zhongyin Zhao","Ziyuan Zhuang","Le Tian","Xiao Zhou","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.04828v3.pdf","comment":"v2"}]},"2024-11-04T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2411.02607v1","updated":"2024-11-04T21:01:03Z","published":"2024-11-04T21:01:03Z","title":"Towards Context-Aware Adaptation in Extended Reality: A Design Space for\n XR Interfaces and an Adaptive Placement Strategy","summary":" By converting the entire 3D space around the user into a screen, Extended\nReality (XR) can ameliorate traditional displays' space limitations and\nfacilitate the consumption of multiple pieces of information at a time.\nHowever, if designed inappropriately, these XR interfaces can overwhelm the\nuser and complicate information access. In this work, we explored the design\ndimensions that can be adapted to enable suitable presentation and interaction\nwithin an XR interface. 
To investigate a specific use case of context-aware\nadaptations within our proposed design space, we concentrated on the spatial\nlayout of the XR content and investigated non-adaptive and adaptive placement\nstrategies. In this paper, we (1) present a comprehensive design space for XR\ninterfaces, (2) propose Environment-referenced, an adaptive placement strategy\nthat uses a relevant intermediary from the environment within a Hybrid Frame of\nReference (FoR) for each XR object, and (3) evaluate the effectiveness of this\nadaptive placement strategy and a non-adaptive Body-Fixed placement strategy in\nfour contextual scenarios varying in terms of social setting and user mobility\nin the environment. The performance of these placement strategies from our\nwithin-subjects user study emphasized the importance of intermediaries'\nrelevance to the user's focus. These findings underscore the importance of\ncontext-aware interfaces, indicating that the appropriate use of an adaptive\ncontent placement strategy in a context can significantly improve task\nefficiency, accuracy, and usability.\n","authors":["Shakiba Davari","Doug A. Bowman"],"pdf_url":"https://arxiv.org/pdf/2411.02607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02571v1","updated":"2024-11-04T20:06:34Z","published":"2024-11-04T20:06:34Z","title":"MM-Embed: Universal Multimodal Retrieval with Multimodal LLMs","summary":" State-of-the-art retrieval models typically address a straightforward search\nscenario, where retrieval tasks are fixed (e.g., finding a passage to answer a\nspecific question) and only a single modality is supported for both queries and\nretrieved results. This paper introduces techniques for advancing information\nretrieval with multimodal large language models (MLLMs), enabling a broader\nsearch scenario, termed universal multimodal retrieval, where multiple\nmodalities and diverse retrieval tasks are accommodated. To this end, we first\nstudy fine-tuning an MLLM as a bi-encoder retriever on 10 datasets with 16\nretrieval tasks. Our empirical results show that the fine-tuned MLLM retriever\nis capable of understanding challenging queries, composed of both text and\nimage, but underperforms a smaller CLIP retriever in cross-modal retrieval\ntasks due to modality bias from MLLMs. To address the issue, we propose\nmodality-aware hard negative mining to mitigate the modality bias exhibited by\nMLLM retrievers. Second, we propose to continually fine-tune the universal\nmultimodal retriever to enhance its text retrieval capability while maintaining\nmultimodal retrieval capability. As a result, our model, MM-Embed, achieves\nstate-of-the-art performance on the multimodal retrieval benchmark M-BEIR,\nwhich spans multiple domains and tasks, while also surpassing the\nstate-of-the-art text retrieval model, NV-Embed-v1, on MTEB retrieval\nbenchmark. Finally, we explore to prompt the off-the-shelf MLLMs as the\nzero-shot rerankers to refine the ranking of the candidates from the multimodal\nretriever. We find that through prompt-and-reranking, MLLMs can further improve\nmultimodal retrieval when the user queries (e.g., text-image composed queries)\nare more complex and challenging to understand. 
These findings also pave the\nway to advance universal multimodal retrieval in the future.\n","authors":["Sheng-Chieh Lin","Chankyu Lee","Mohammad Shoeybi","Jimmy Lin","Bryan Catanzaro","Wei Ping"],"pdf_url":"https://arxiv.org/pdf/2411.02571v1.pdf","comment":"We release the model weights at:\n https://huggingface.co/nvidia/MM-Embed"},{"id":"http://arxiv.org/abs/2411.02284v1","updated":"2024-11-04T17:11:14Z","published":"2024-11-04T17:11:14Z","title":"Training on the Test Model: Contamination in Ranking Distillation","summary":" Neural approaches to ranking based on pre-trained language models are highly\neffective in ad-hoc search. However, the computational expense of these models\ncan limit their application. As such, a process known as knowledge distillation\nis frequently applied to allow a smaller, efficient model to learn from an\neffective but expensive model. A key example of this is the distillation of\nexpensive API-based commercial Large Language Models into smaller\nproduction-ready models. However, due to the opacity of training data and\nprocesses of most commercial models, one cannot ensure that a chosen test\ncollection has not been observed previously, creating the potential for\ninadvertent data contamination. We, therefore, investigate the effect of a\ncontaminated teacher model in a distillation setting. We evaluate several\ndistillation techniques to assess the degree to which contamination occurs\nduring distillation. By simulating a ``worst-case'' setting where the degree of\ncontamination is known, we find that contamination occurs even when the test\ndata represents a small fraction of the teacher's training samples. We,\ntherefore, encourage caution when training using black-box teacher models where\ndata provenance is ambiguous.\n","authors":["Vishakha Suresh Kalal","Andrew Parry","Sean MacAvaney"],"pdf_url":"https://arxiv.org/pdf/2411.02284v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2411.02041v1","updated":"2024-11-04T12:43:12Z","published":"2024-11-04T12:43:12Z","title":"Enhancing ID-based Recommendation with Large Language Models","summary":" Large Language Models (LLMs) have recently garnered significant attention in\nvarious domains, including recommendation systems. Recent research leverages\nthe capabilities of LLMs to improve the performance and user modeling aspects\nof recommender systems. These studies primarily focus on utilizing LLMs to\ninterpret textual data in recommendation tasks. However, it's worth noting that\nin ID-based recommendations, textual data is absent, and only ID data is\navailable. The untapped potential of LLMs for ID data within the ID-based\nrecommendation paradigm remains relatively unexplored. To this end, we\nintroduce a pioneering approach called \"LLM for ID-based Recommendation\"\n(LLM4IDRec). This innovative approach integrates the capabilities of LLMs while\nexclusively relying on ID data, thus diverging from the previous reliance on\ntextual data. The basic idea of LLM4IDRec is that by employing LLM to augment\nID data, if augmented ID data can improve recommendation performance, it\ndemonstrates the ability of LLM to interpret ID data effectively, exploring an\ninnovative way for the integration of LLM in ID-based recommendation. We\nevaluate the effectiveness of our LLM4IDRec approach using three widely-used\ndatasets. 
Our results demonstrate a notable improvement in recommendation\nperformance, with our approach consistently outperforming existing methods in\nID-based recommendation by solely augmenting input data.\n","authors":["Lei Chen","Chen Gao","Xiaoyi Du","Hengliang Luo","Depeng Jin","Yong Li","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2411.02041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01843v1","updated":"2024-11-04T06:31:52Z","published":"2024-11-04T06:31:52Z","title":"Dissertation: On the Theoretical Foundation of Model Comparison and\n Evaluation for Recommender System","summary":" Recommender systems have become increasingly important with the rise of the\nweb as a medium for electronic and business transactions. One of the key\ndrivers of this technology is the ease with which users can provide feedback\nabout their likes and dislikes through simple clicks of a mouse. This feedback\nis commonly collected in the form of ratings, but can also be inferred from a\nuser's browsing and purchasing history. Recommender systems utilize users'\nhistorical data to infer customer interests and provide personalized\nrecommendations. The basic principle of recommendations is that significant\ndependencies exist between user- and item-centric activity, which can be\nlearned in a data-driven manner to make accurate predictions. Collaborative\nfiltering is one family of recommendation algorithms that uses ratings from\nmultiple users to predict missing ratings or uses binary click information to\npredict potential clicks. However, recommender systems can be more complex and\nincorporate auxiliary data such as content-based attributes, user interactions,\nand contextual information.\n","authors":["Dong Li"],"pdf_url":"https://arxiv.org/pdf/2411.01843v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2312.08517"},{"id":"http://arxiv.org/abs/2411.01785v1","updated":"2024-11-04T04:16:11Z","published":"2024-11-04T04:16:11Z","title":"Transferable Sequential Recommendation via Vector Quantized Meta\n Learning","summary":" While sequential recommendation achieves significant progress on capturing\nuser-item transition patterns, transferring such large-scale recommender\nsystems remains challenging due to the disjoint user and item groups across\ndomains. In this paper, we propose a vector quantized meta learning for\ntransferable sequential recommenders (MetaRec). Without requiring additional\nmodalities or shared information across domains, our approach leverages\nuser-item interactions from multiple source domains to improve the target\ndomain performance. To solve the input heterogeneity issue, we adopt vector\nquantization that maps item embeddings from heterogeneous input spaces to a\nshared feature space. Moreover, our meta transfer paradigm exploits limited\ntarget data to guide the transfer of source domain knowledge to the target\ndomain (i.e., learn to transfer). In addition, MetaRec adaptively transfers\nfrom multiple source tasks by rescaling meta gradients based on the\nsource-target domain similarity, enabling selective learning to improve\nrecommendation performance. 
To validate the effectiveness of our approach, we\nperform extensive experiments on benchmark datasets, where MetaRec consistently\noutperforms baseline methods by a considerable margin.\n","authors":["Zhenrui Yue","Huimin Zeng","Yang Zhang","Julian McAuley","Dong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.01785v1.pdf","comment":"Accepted to BigData 2024"},{"id":"http://arxiv.org/abs/2403.00801v2","updated":"2024-11-04T03:07:30Z","published":"2024-02-23T18:45:35Z","title":"Self-Retrieval: End-to-End Information Retrieval with One Large Language\n Model","summary":" The rise of large language models (LLMs) has significantly transformed both\nthe construction and application of information retrieval (IR) systems.\nHowever, current interactions between IR systems and LLMs remain limited, with\nLLMs merely serving as part of components within IR systems, and IR systems\nbeing constructed independently of LLMs. This separated architecture restricts\nknowledge sharing and deep collaboration between them. In this paper, we\nintroduce Self-Retrieval, a novel end-to-end LLM-driven information retrieval\narchitecture. Self-Retrieval unifies all essential IR functions within a single\nLLM, leveraging the inherent capabilities of LLMs throughout the IR process.\nSpecifically, Self-Retrieval internalizes the retrieval corpus through\nself-supervised learning, transforms the retrieval process into sequential\npassage generation, and performs relevance assessment for reranking.\nExperimental results demonstrate that Self-Retrieval not only outperforms\nexisting retrieval approaches by a significant margin, but also substantially\nenhances the performance of LLM-driven downstream applications like\nretrieval-augmented generation.\n","authors":["Qiaoyu Tang","Jiawei Chen","Zhuoqun Li","Bowen Yu","Yaojie Lu","Cheng Fu","Haiyang Yu","Hongyu Lin","Fei Huang","Ben He","Xianpei Han","Le Sun","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2403.00801v2.pdf","comment":"NeurIPS 2024 Camera-ready Version. Code:\n https://github.com/icip-cas/SelfRetrieval"}],"Multimedia":[{"id":"http://arxiv.org/abs/2411.02607v1","updated":"2024-11-04T21:01:03Z","published":"2024-11-04T21:01:03Z","title":"Towards Context-Aware Adaptation in Extended Reality: A Design Space for\n XR Interfaces and an Adaptive Placement Strategy","summary":" By converting the entire 3D space around the user into a screen, Extended\nReality (XR) can ameliorate traditional displays' space limitations and\nfacilitate the consumption of multiple pieces of information at a time.\nHowever, if designed inappropriately, these XR interfaces can overwhelm the\nuser and complicate information access. In this work, we explored the design\ndimensions that can be adapted to enable suitable presentation and interaction\nwithin an XR interface. To investigate a specific use case of context-aware\nadaptations within our proposed design space, we concentrated on the spatial\nlayout of the XR content and investigated non-adaptive and adaptive placement\nstrategies. In this paper, we (1) present a comprehensive design space for XR\ninterfaces, (2) propose Environment-referenced, an adaptive placement strategy\nthat uses a relevant intermediary from the environment within a Hybrid Frame of\nReference (FoR) for each XR object, and (3) evaluate the effectiveness of this\nadaptive placement strategy and a non-adaptive Body-Fixed placement strategy in\nfour contextual scenarios varying in terms of social setting and user mobility\nin the environment. 
The performance of these placement strategies from our\nwithin-subjects user study emphasized the importance of intermediaries'\nrelevance to the user's focus. These findings underscore the importance of\ncontext-aware interfaces, indicating that the appropriate use of an adaptive\ncontent placement strategy in a context can significantly improve task\nefficiency, accuracy, and usability.\n","authors":["Shakiba Davari","Doug A. Bowman"],"pdf_url":"https://arxiv.org/pdf/2411.02607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02334v1","updated":"2024-11-04T17:58:54Z","published":"2024-11-04T17:58:54Z","title":"Diffusion-based Generative Multicasting with Intent-aware Semantic\n Decomposition","summary":" Generative diffusion models (GDMs) have recently shown great success in\nsynthesizing multimedia signals with high perceptual quality enabling highly\nefficient semantic communications in future wireless networks. In this paper,\nwe develop an intent-aware generative semantic multicasting framework utilizing\npre-trained diffusion models. In the proposed framework, the transmitter\ndecomposes the source signal to multiple semantic classes based on the\nmulti-user intent, i.e. each user is assumed to be interested in details of\nonly a subset of the semantic classes. The transmitter then sends to each user\nonly its intended classes, and multicasts a highly compressed semantic map to\nall users over shared wireless resources that allows them to locally synthesize\nthe other classes, i.e. non-intended classes, utilizing pre-trained diffusion\nmodels. The signal retrieved at each user is thereby partially reconstructed\nand partially synthesized utilizing the received semantic map. This improves\nutilization of the wireless resources, with better preserving privacy of the\nnon-intended classes. We design a communication/computation-aware scheme for\nper-class adaptation of the communication parameters, such as the transmission\npower and compression rate to minimize the total latency of retrieving signals\nat multiple receivers, tailored to the prevailing channel conditions as well as\nthe users reconstruction/synthesis distortion/perception requirements. The\nsimulation results demonstrate significantly reduced per-user latency compared\nwith non-generative and intent-unaware multicasting benchmarks while\nmaintaining high perceptual quality of the signals retrieved at the users.\n","authors":["Xinkai Liu","Mahdi Boloursaz Mashhadi","Li Qiao","Yi Ma","Rahim Tafazolli","Mehdi Bennis"],"pdf_url":"https://arxiv.org/pdf/2411.02334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02236v1","updated":"2024-11-04T16:30:14Z","published":"2024-11-04T16:30:14Z","title":"3D Audio-Visual Segmentation","summary":" Recognizing the sounding objects in scenes is a longstanding objective in\nembodied AI, with diverse applications in robotics and AR/VR/MR. To that end,\nAudio-Visual Segmentation (AVS), taking as condition an audio signal to\nidentify the masks of the target sounding objects in an input image with\nsynchronous camera and microphone sensors, has been recently advanced. However,\nthis paradigm is still insufficient for real-world operation, as the mapping\nfrom 2D images to 3D scenes is missing. To address this fundamental limitation,\nwe introduce a novel research problem, 3D Audio-Visual Segmentation, extending\nthe existing AVS to the 3D output space. 
This problem poses more challenges due\nto variations in camera extrinsics, audio scattering, occlusions, and diverse\nacoustics across sounding object categories. To facilitate this research, we\ncreate the very first simulation based benchmark, 3DAVS-S34-O7, providing\nphotorealistic 3D scene environments with grounded spatial audio under\nsingle-instance and multi-instance settings, across 34 scenes and 7 object\ncategories. This is made possible by re-purposing the Habitat simulator to\ngenerate comprehensive annotations of sounding object locations and\ncorresponding 3D masks. Subsequently, we propose a new approach, EchoSegnet,\ncharacterized by integrating the ready-to-use knowledge from pretrained 2D\naudio-visual foundation models synergistically with 3D visual scene\nrepresentation through spatial audio-aware mask alignment and refinement.\nExtensive experiments demonstrate that EchoSegnet can effectively segment\nsounding objects in 3D space on our new benchmark, representing a significant\nadvancement in the field of embodied AI. Project page:\nhttps://surrey-uplab.github.io/research/3d-audio-visual-segmentation/\n","authors":["Artem Sokolov","Swapnil Bhosale","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.02236v1.pdf","comment":"Accepted at the NeurIPS 2024 Workshop on Audio Imagination"},{"id":"http://arxiv.org/abs/2407.05645v3","updated":"2024-11-04T09:14:03Z","published":"2024-07-08T06:14:37Z","title":"OneDiff: A Generalist Model for Image Difference Captioning","summary":" In computer vision, Image Difference Captioning (IDC) is crucial for\naccurately describing variations between closely related images. Traditional\nIDC methods often rely on specialist models, which restrict their applicability\nacross varied contexts. This paper introduces the OneDiff model, a novel\ngeneralist approach that utilizes a robust vision-language model architecture,\nintegrating a siamese image encoder with a Visual Delta Module. This innovative\nconfiguration allows for the precise detection and articulation of fine-grained\ndifferences between image pairs. OneDiff is trained through a dual-phase\nstrategy, encompassing Coupled Sample Training and multi-task learning across a\ndiverse array of data types, supported by our newly developed DiffCap Dataset.\nThis dataset merges real-world and synthetic data, enhancing the training\nprocess and bolstering the model's robustness. Extensive testing on diverse IDC\nbenchmarks, such as Spot-the-Diff, Image-Editing-Request, and Birds-to-Words,\nshows that OneDiff consistently outperforms existing state-of-the-art models in\naccuracy and adaptability, achieving improvements of up to 97% CIDEr points in\naverage. By setting a new benchmark in IDC, OneDiff paves the way for more\nversatile and effective applications in detecting and describing visual\ndifferences. The code, models, and data will be made publicly available.\n","authors":["Erdong Hu","Longteng Guo","Tongtian Yue","Zijia Zhao","Shuning Xue","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2407.05645v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01805v1","updated":"2024-11-04T05:17:44Z","published":"2024-11-04T05:17:44Z","title":"MoMu-Diffusion: On Learning Long-Term Motion-Music Synchronization and\n Correspondence","summary":" Motion-to-music and music-to-motion have been studied separately, each\nattracting substantial research interest within their respective domains. 
The\ninteraction between human motion and music is a reflection of advanced human\nintelligence, and establishing a unified relationship between them is\nparticularly important. However, to date, there has been no work that considers\nthem jointly to explore the modality alignment within. To bridge this gap, we\npropose a novel framework, termed MoMu-Diffusion, for long-term and synchronous\nmotion-music generation. Firstly, to mitigate the huge computational costs\nraised by long sequences, we propose a novel Bidirectional Contrastive Rhythmic\nVariational Auto-Encoder (BiCoR-VAE) that extracts the modality-aligned latent\nrepresentations for both motion and music inputs. Subsequently, leveraging the\naligned latent spaces, we introduce a multi-modal Transformer-based diffusion\nmodel and a cross-guidance sampling strategy to enable various generation\ntasks, including cross-modal, multi-modal, and variable-length generation.\nExtensive experiments demonstrate that MoMu-Diffusion surpasses recent\nstate-of-the-art methods both qualitatively and quantitatively, and can\nsynthesize realistic, diverse, long-term, and beat-matched music or motion\nsequences. The generated samples and codes are available at\nhttps://momu-diffusion.github.io/\n","authors":["Fuming You","Minghui Fang","Li Tang","Rongjie Huang","Yongqi Wang","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.01805v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2305.15748v2","updated":"2024-11-04T00:48:12Z","published":"2023-05-25T05:55:53Z","title":"ReactFace: Online Multiple Appropriate Facial Reaction Generation in\n Dyadic Interactions","summary":" In dyadic interaction, predicting the listener's facial reactions is\nchallenging as different reactions could be appropriate in response to the same\nspeaker's behaviour. Previous approaches predominantly treated this task as an\ninterpolation or fitting problem, emphasizing deterministic outcomes but\nignoring the diversity and uncertainty of human facial reactions. Furthermore,\nthese methods often failed to model short-range and long-range dependencies\nwithin the interaction context, leading to issues in the synchrony and\nappropriateness of the generated facial reactions. To address these\nlimitations, this paper reformulates the task as an extrapolation or prediction\nproblem, and proposes an novel framework (called ReactFace) to generate\nmultiple different but appropriate facial reactions from a speaker behaviour\nrather than merely replicating the corresponding listener facial behaviours.\nOur ReactFace generates multiple different but appropriate photo-realistic\nhuman facial reactions by: (i) learning an appropriate facial reaction\ndistribution representing multiple different but appropriate facial reactions;\nand (ii) synchronizing the generated facial reactions with the speaker verbal\nand non-verbal behaviours at each time stamp, resulting in realistic 2D facial\nreaction sequences. Experimental results demonstrate the effectiveness of our\napproach in generating multiple diverse, synchronized, and appropriate facial\nreactions from each speaker's behaviour. The quality of the generated facial\nreactions is intimately tied to the speaker's speech and facial expressions,\nachieved through our novel speaker-listener interaction modules. 
Our code is\nmade publicly available at \\url{https://github.com/lingjivoo/ReactFace}.\n","authors":["Cheng Luo","Siyang Song","Weicheng Xie","Micol Spitale","Zongyuan Ge","Linlin Shen","Hatice Gunes"],"pdf_url":"https://arxiv.org/pdf/2305.15748v2.pdf","comment":"Accepted to IEEE Transactions on Visualization and Computer Graphics\n (TVCG), 18 pages, 10 figures"}]},"2024-11-03T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2410.23300v2","updated":"2024-11-03T22:11:35Z","published":"2024-10-15T21:54:13Z","title":"Understanding and Scaling Collaborative Filtering Optimization from the\n Perspective of Matrix Rank","summary":" Collaborative Filtering (CF) methods dominate real-world recommender systems\ngiven their ability to learn high-quality, sparse ID-embedding tables that\neffectively capture user preferences. These tables scale linearly with the\nnumber of users and items, and are trained to ensure high similarity between\nembeddings of interacted user-item pairs, while maintaining low similarity for\nnon-interacted pairs. Despite their high performance, encouraging dispersion\nfor non-interacted pairs necessitates expensive regularization (e.g., negative\nsampling), hurting runtime and scalability. Existing research tends to address\nthese challenges by simplifying the learning process, either by reducing model\ncomplexity or sampling data, trading performance for runtime. In this work, we\nmove beyond model-level modifications and study the properties of the embedding\ntables under different learning strategies. Through theoretical analysis, we\nfind that the singular values of the embedding tables are intrinsically linked\nto different CF loss functions. These findings are empirically validated on\nreal-world datasets, demonstrating the practical benefits of higher stable\nrank, a continuous version of matrix rank which encodes the distribution of\nsingular values. Based on these insights, we propose an efficient warm-start\nstrategy that regularizes the stable rank of the user and item embeddings. We\nshow that stable rank regularization during early training phases can promote\nhigher-quality embeddings, resulting in training speed improvements of up to\n66%. Additionally, stable rank regularization can act as a proxy for negative\nsampling, allowing for performance gains of up to 21% over loss functions with\nsmall negative sampling ratios. Overall, our analysis unifies current CF\nmethods under a new perspective, their optimization of stable rank, motivating\na flexible regularization method.\n","authors":["Donald Loveland","Xinyi Wu","Tong Zhao","Danai Koutra","Neil Shah","Mingxuan Ju"],"pdf_url":"https://arxiv.org/pdf/2410.23300v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01690v1","updated":"2024-11-03T21:32:07Z","published":"2024-11-03T21:32:07Z","title":"Co-clustering for Federated Recommender System","summary":" As data privacy and security attract increasing attention, Federated\nRecommender System (FRS) offers a solution that strikes a balance between\nproviding high-quality recommendations and preserving user privacy. However,\nthe presence of statistical heterogeneity in FRS, commonly observed due to\npersonalized decision-making patterns, can pose challenges. To address this\nissue and maximize the benefit of collaborative filtering (CF) in FRS, it is\nintuitive to consider clustering clients (users) as well as items into\ndifferent groups and learning group-specific models. 
Existing methods either\nresort to client clustering via user representations, risking privacy leakage,\nor employ classical clustering strategies on item embeddings or gradients,\nwhich we found are plagued by the curse of dimensionality. In this paper, we\ndelve into the inefficiencies of the K-Means method in client grouping,\nattributing failures to the high dimensionality and data sparsity\noccurring in FRS, and propose CoFedRec, a novel Co-clustering Federated\nRecommendation mechanism, to address client heterogeneity and enhance\ncollaborative filtering within the federated framework. Specifically, the\nserver initially formulates an item membership from the client-provided item\nnetworks. Subsequently, clients are grouped regarding a specific item category\npicked from the item membership during each communication round, resulting in\nan intelligently aggregated group model. Meanwhile, to comprehensively capture\nthe global inter-relationships among items, we incorporate an additional\nsupervised contrastive learning term based on the server-side generated item\nmembership into the local training phase for each client. Extensive experiments\non four datasets are provided, which verify the effectiveness of the proposed\nCoFedRec.\n","authors":["Xinrui He","Shuo Liu","Jackey Keung","Jingrui He"],"pdf_url":"https://arxiv.org/pdf/2411.01690v1.pdf","comment":"WWW '24: Proceedings of the ACM Web Conference 2024"},{"id":"http://arxiv.org/abs/2411.02454v1","updated":"2024-11-03T20:36:44Z","published":"2024-11-03T20:36:44Z","title":"Graph-based Confidence Calibration for Large Language Models","summary":" One important approach to improving the reliability of large language models\n(LLMs) is to provide accurate confidence estimations regarding the correctness\nof their answers. However, developing a well-calibrated confidence estimation\nmodel is challenging, as mistakes made by LLMs can be difficult to detect. We\npropose a novel method combining the LLM's self-consistency with labeled data\nand training an auxiliary model to estimate the correctness of its responses to\nquestions. This auxiliary model predicts the correctness of responses based\nsolely on their consistent information. To set up the learning problem, we use\na weighted graph to represent the consistency among the LLM's multiple\nresponses to a question. Correctness labels are assigned to these responses\nbased on their similarity to the correct answer. We then train a graph neural\nnetwork to estimate the probability of correct responses. Experiments\ndemonstrate that the proposed approach substantially outperforms several of the\nmost recent methods in confidence calibration across multiple widely adopted\nbenchmark datasets. 
Furthermore, the proposed approach significantly improves\nthe generalization capability of confidence calibration on out-of-domain (OOD)\ndata.\n","authors":["Yukun Li","Sijia Wang","Lifu Huang","Li-Ping Liu"],"pdf_url":"https://arxiv.org/pdf/2411.02454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01611v1","updated":"2024-11-03T15:37:37Z","published":"2024-11-03T15:37:37Z","title":"Stochastic Communication Avoidance for Recommendation Systems","summary":" One of the major bottlenecks for efficient deployment of neural network based\nrecommendation systems is the memory footprint of their embedding tables.\nAlthough many neural network based recommendation systems could benefit from\nthe faster on-chip memory access and increased computational power of hardware\naccelerators, the large embedding tables in these models often cannot fit on\nthe constrained memory of accelerators. Despite the pervasiveness of these\nmodels, prior methods in memory optimization and parallelism fail to address\nthe memory and communication costs of large embedding tables on accelerators.\nAs a result, the majority of models are trained on CPUs, while current\nimplementations of accelerators are hindered by issues such as bottlenecks in\ninter-device communication and main memory lookups. In this paper, we propose a\ntheoretical framework that analyses the communication costs of arbitrary\ndistributed systems that use lookup tables. We use this framework to propose\nalgorithms that maximize throughput subject to memory, computation, and\ncommunication constraints. Furthermore, we demonstrate that our method achieves\nstrong theoretical performance across dataset distributions and memory\nconstraints, applicable to a wide range of use cases from mobile federated\nlearning to warehouse-scale computation. We implement our framework and\nalgorithms in PyTorch and achieve up to 6x increases in training throughput on\nGPU systems over baselines, on the Criteo Terabytes dataset.\n","authors":["Lutfi Eren Erdogan","Vijay Anand Raghava Kanakagiri","Kurt Keutzer","Zhen Dong"],"pdf_url":"https://arxiv.org/pdf/2411.01611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01561v1","updated":"2024-11-03T13:23:07Z","published":"2024-11-03T13:23:07Z","title":"Multimodal Graph Neural Network for Recommendation with Dynamic\n De-redundancy and Modality-Guided Feature De-noisy","summary":" Graph neural networks (GNNs) have become crucial in multimodal recommendation\ntasks because of their powerful ability to capture complex relationships\nbetween neighboring nodes. However, increasing the number of propagation layers\nin GNNs can lead to feature redundancy, which may negatively impact the overall\nrecommendation performance. In addition, the existing recommendation task\nmethod directly maps the preprocessed multimodal features to the\nlow-dimensional space, which will bring the noise unrelated to user preference,\nthus affecting the representation ability of the model. To tackle the\naforementioned challenges, we propose Multimodal Graph Neural Network for\nRecommendation (MGNM) with Dynamic De-redundancy and Modality-Guided Feature\nDe-noisy, which is divided into local and global interaction. Initially, in the\nlocal interaction process,we integrate a dynamic de-redundancy (DDR) loss\nfunction which is achieved by utilizing the product of the feature coefficient\nmatrix and the feature matrix as a penalization factor. 
It reduces the feature\nredundancy effects of multimodal and behavioral features caused by the stacking\nof multiple GNN layers. Subsequently, in the global interaction process, we\ndeveloped modality-guided global feature purifiers for each modality to\nalleviate the impact of modality noise. It is a two-fold guiding mechanism\neliminating modality features that are irrelevant to user preferences and\ncaptures complex relationships within the modality. Experimental results\ndemonstrate that MGNM achieves superior performance on multimodal information\ndenoising and removal of redundant information compared to the state-of-the-art\nmethods.\n","authors":["Feng Mo","Lin Xiao","Qiya Song","Xieping Gao","Eryao Liang"],"pdf_url":"https://arxiv.org/pdf/2411.01561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01540v1","updated":"2024-11-03T12:10:20Z","published":"2024-11-03T12:10:20Z","title":"Efficient and Robust Regularized Federated Recommendation","summary":" Recommender systems play a pivotal role across practical scenarios,\nshowcasing remarkable capabilities in user preference modeling. However, the\ncentralized learning paradigm predominantly used raises serious privacy\nconcerns. The federated recommender system (FedRS) addresses this by updating\nmodels on clients, while a central server orchestrates training without\naccessing private data. Existing FedRS approaches, however, face unresolved\nchallenges, including non-convex optimization, vulnerability, potential privacy\nleakage risk, and communication inefficiency. This paper addresses these\nchallenges by reformulating the federated recommendation problem as a convex\noptimization issue, ensuring convergence to the global optimum. Based on this,\nwe devise a novel method, RFRec, to tackle this optimization problem\nefficiently. In addition, we propose RFRecF, a highly efficient version that\nincorporates non-uniform stochastic gradient descent to improve communication\nefficiency. In user preference modeling, both methods learn local and global\nmodels, collaboratively learning users' common and personalized interests under\nthe federated learning setting. Moreover, both methods significantly enhance\ncommunication efficiency, robustness, and privacy protection, with theoretical\nsupport. Comprehensive evaluations on four benchmark datasets demonstrate RFRec\nand RFRecF's superior performance compared to diverse baselines.\n","authors":["Langming Liu","Wanyu Wang","Xiangyu Zhao","Zijian Zhang","Chunxu Zhang","Shanru Lin","Yiqi Wang","Lixin Zou","Zitao Liu","Xuetao Wei","Hongzhi Yin","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2411.01540v1.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2411.01537v1","updated":"2024-11-03T11:56:00Z","published":"2024-11-03T11:56:00Z","title":"LinRec: Linear Attention Mechanism for Long-term Sequential Recommender\n Systems","summary":" Transformer models have achieved remarkable success in sequential recommender\nsystems (SRSs). However, computing the attention matrix in traditional\ndot-product attention mechanisms results in a quadratic complexity with\nsequence lengths, leading to high computational costs for long-term sequential\nrecommendation. Motivated by the above observation, we propose a novel\nL2-Normalized Linear Attention for the Transformer-based Sequential Recommender\nSystems (LinRec), which theoretically improves efficiency while preserving the\nlearning capabilities of the traditional dot-product attention. 
Specifically,\nby thoroughly examining the equivalence conditions of efficient attention\nmechanisms, we show that LinRec possesses linear complexity while preserving\nthe property of attention mechanisms. In addition, we reveal its latent\nefficiency properties by interpreting the proposed LinRec mechanism through a\nstatistical lens. Extensive experiments are conducted based on two public\nbenchmark datasets, demonstrating that the combination of LinRec and\nTransformer models achieves comparable or even superior performance than\nstate-of-the-art Transformer-based SRS models while significantly improving\ntime and memory efficiency.\n","authors":["Langming Liu","Xiangyu Zhao","Chi Zhang","Jingtong Gao","Wanyu Wang","Wenqi Fan","Yiqi Wang","Ming He","Zitao Liu","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2411.01537v1.pdf","comment":"SIGIR 2023"},{"id":"http://arxiv.org/abs/2411.02451v1","updated":"2024-11-03T10:06:14Z","published":"2024-11-03T10:06:14Z","title":"High-performance automated abstract screening with large language model\n ensembles","summary":" Large language models (LLMs) excel in tasks requiring processing and\ninterpretation of input text. Abstract screening is a labour-intensive\ncomponent of systematic review involving repetitive application of inclusion\nand exclusion criteria on a large volume of studies identified by a literature\nsearch. Here, LLMs (GPT-3.5 Turbo, GPT-4 Turbo, GPT-4o, Llama 3 70B, Gemini 1.5\nPro, and Claude Sonnet 3.5) were trialled on systematic reviews in a full issue\nof the Cochrane Library to evaluate their accuracy in zero-shot binary\nclassification for abstract screening. Trials over a subset of 800 records\nidentified optimal prompting strategies and demonstrated superior performance\nof LLMs to human researchers in terms of sensitivity (LLMmax = 1.000, humanmax\n= 0.775), precision (LLMmax = 0.927, humanmax = 0.911), and balanced accuracy\n(LLMmax = 0.904, humanmax = 0.865). The best performing LLM-prompt combinations\nwere trialled across every replicated search result (n = 119,691), and\nexhibited consistent sensitivity (range 0.756-1.000) but diminished precision\n(range 0.004-0.096). 66 LLM-human and LLM-LLM ensembles exhibited perfect\nsensitivity with a maximal precision of 0.458, with less observed performance\ndrop in larger trials. Significant variation in performance was observed\nbetween reviews, highlighting the importance of domain-specific validation\nbefore deployment. LLMs may reduce the human labour cost of systematic review\nwith maintained or improved accuracy and sensitivity. Systematic review is the\nfoundation of evidence-based medicine, and LLMs can contribute to increasing\nthe efficiency and quality of this mode of research.\n","authors":["Rohan Sanghera","Arun James Thirunavukarasu","Marc El Khoury","Jessica O'Logbon","Yuqing Chen","Archie Watt","Mustafa Mahmood","Hamid Butt","George Nishimura","Andrew Soltan"],"pdf_url":"https://arxiv.org/pdf/2411.02451v1.pdf","comment":"RS and AJT are joint-first authors"},{"id":"http://arxiv.org/abs/2410.18634v2","updated":"2024-11-03T08:14:34Z","published":"2024-10-24T10:47:30Z","title":"Little Giants: Synthesizing High-Quality Embedding Data at Scale","summary":" Synthetic data generation has become an increasingly popular way of training\nmodels without the need for large, manually labeled datasets. For tasks like\ntext embedding, synthetic data offers diverse and scalable training examples,\nsignificantly reducing the cost of human annotation. 
However, most current\napproaches rely heavily on proprietary models like GPT-4, which are expensive\nand inefficient for generating large-scale embedding data. In this paper, we\nintroduce SPEED, a framework that aligns open-source small models (8B) to\nefficiently generate large-scale synthetic embedding data. Through supervised\nfine-tuning, preference optimization, and self-improvement, SPEED enables small\nopen-source models to produce high-quality data. Remarkably, SPEED uses only\nless than 1/10 of the GPT API calls, outperforming the state-of-the-art\nembedding model E5_mistral when both are trained solely on their synthetic\ndata. Using this efficient generator, we conduct a comprehensive study on how\nvarious factors within the alignment pipeline impact data quality and reveal\nthe scaling law for synthetic embedding data.\n","authors":["Haonan Chen","Liang Wang","Nan Yang","Yutao Zhu","Ziliang Zhao","Furu Wei","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2410.18634v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01457v1","updated":"2024-11-03T06:47:45Z","published":"2024-11-03T06:47:45Z","title":"Facet-Aware Multi-Head Mixture-of-Experts Model for Sequential\n Recommendation","summary":" Sequential recommendation (SR) systems excel at capturing users' dynamic\npreferences by leveraging their interaction histories. Most existing SR systems\nassign a single embedding vector to each item to represent its features, and\nvarious types of models are adopted to combine these item embeddings into a\nsequence representation vector to capture the user intent. However, we argue\nthat this representation alone is insufficient to capture an item's\nmulti-faceted nature (e.g., movie genres, starring actors). Besides, users\noften exhibit complex and varied preferences within these facets (e.g., liking\nboth action and musical films in the facet of genre), which are challenging to\nfully represent. To address the issues above, we propose a novel structure\ncalled Facet-Aware Multi-Head Mixture-of-Experts Model for Sequential\nRecommendation (FAME). We leverage sub-embeddings from each head in the last\nmulti-head attention layer to predict the next item separately. This approach\ncaptures the potential multi-faceted nature of items without increasing model\ncomplexity. A gating mechanism integrates recommendations from each head and\ndynamically determines their importance. Furthermore, we introduce a\nMixture-of-Experts (MoE) network in each attention head to disentangle various\nuser preferences within each facet. Each expert within the MoE focuses on a\nspecific preference. A learnable router network is adopted to compute the\nimportance weight for each expert and aggregate them. We conduct extensive\nexperiments on four public sequential recommendation datasets and the results\ndemonstrate the effectiveness of our method over existing baseline models.\n","authors":["Mingrui Liu","Sixiao Zhang","Cheng Long"],"pdf_url":"https://arxiv.org/pdf/2411.01457v1.pdf","comment":"This paper has been accepted by WSDM'25. The final camera-ready\n version will be available soon"}],"Multimedia":[{"id":"http://arxiv.org/abs/2403.14468v4","updated":"2024-11-03T21:16:54Z","published":"2024-03-21T15:15:00Z","title":"AnyV2V: A Tuning-Free Framework For Any Video-to-Video Editing Tasks","summary":" In the dynamic field of digital content creation using generative models,\nstate-of-the-art video editing models still do not offer the level of quality\nand control that users desire. 
Previous works on video editing either extended\nfrom image-based generative models in a zero-shot manner or necessitated\nextensive fine-tuning, which can hinder the production of fluid video edits.\nFurthermore, these methods frequently rely on textual input as the editing\nguidance, leading to ambiguities and limiting the types of edits they can\nperform. Recognizing these challenges, we introduce AnyV2V, a novel tuning-free\nparadigm designed to simplify video editing into two primary steps: (1)\nemploying an off-the-shelf image editing model to modify the first frame, (2)\nutilizing an existing image-to-video generation model to generate the edited\nvideo through temporal feature injection. AnyV2V can leverage any existing\nimage editing tools to support an extensive array of video editing tasks,\nincluding prompt-based editing, reference-based style transfer, subject-driven\nediting, and identity manipulation, which were unattainable by previous\nmethods. AnyV2V can also support any video length. Our evaluation shows that\nAnyV2V achieved CLIP-scores comparable to other baseline methods. Furthermore,\nAnyV2V significantly outperformed these baselines in human evaluations,\ndemonstrating notable improvements in visual consistency with the source video\nwhile producing high-quality edits across all editing tasks.\n","authors":["Max Ku","Cong Wei","Weiming Ren","Harry Yang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2403.14468v4.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR 2024)\n (11/2024)"},{"id":"http://arxiv.org/abs/2406.13743v3","updated":"2024-11-03T20:22:32Z","published":"2024-06-19T18:00:07Z","title":"GenAI-Bench: Evaluating and Improving Compositional Text-to-Visual\n Generation","summary":" While text-to-visual models now produce photo-realistic images and videos,\nthey struggle with compositional text prompts involving attributes,\nrelationships, and higher-order reasoning such as logic and comparison. In this\nwork, we conduct an extensive human study on GenAI-Bench to evaluate the\nperformance of leading image and video generation models in various aspects of\ncompositional text-to-visual generation. We also compare automated evaluation\nmetrics against our collected human ratings and find that VQAScore -- a metric\nmeasuring the likelihood that a VQA model views an image as accurately\ndepicting the prompt -- significantly outperforms previous metrics such as\nCLIPScore. In addition, VQAScore can improve generation in a black-box manner\n(without finetuning) via simply ranking a few (3 to 9) candidate images.\nRanking by VQAScore is 2x to 3x more effective than other scoring methods like\nPickScore, HPSv2, and ImageReward at improving human alignment ratings for\nDALL-E 3 and Stable Diffusion, especially on compositional prompts that require\nadvanced visio-linguistic reasoning. We release a new GenAI-Rank benchmark with\nover 40,000 human ratings to evaluate scoring metrics on ranking images\ngenerated from the same prompt. Lastly, we discuss promising areas for\nimprovement in VQAScore, such as addressing fine-grained visual details. 
We\nwill release all human ratings (over 80,000) to facilitate scientific\nbenchmarking of both generative models and automated metrics.\n","authors":["Baiqi Li","Zhiqiu Lin","Deepak Pathak","Jiayao Li","Yixin Fei","Kewen Wu","Tiffany Ling","Xide Xia","Pengchuan Zhang","Graham Neubig","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2406.13743v3.pdf","comment":"We open-source our dataset, model, and code at:\n https://linzhiqiu.github.io/papers/genai_bench ; Project page:\n https://linzhiqiu.github.io/papers/genai_bench ; GenAI-Bench was first\n introduced in arxiv:2404.01291. This article extends it with an additional\n GenAI-Rank benchmark"},{"id":"http://arxiv.org/abs/2411.01561v1","updated":"2024-11-03T13:23:07Z","published":"2024-11-03T13:23:07Z","title":"Multimodal Graph Neural Network for Recommendation with Dynamic\n De-redundancy and Modality-Guided Feature De-noisy","summary":" Graph neural networks (GNNs) have become crucial in multimodal recommendation\ntasks because of their powerful ability to capture complex relationships\nbetween neighboring nodes. However, increasing the number of propagation layers\nin GNNs can lead to feature redundancy, which may negatively impact the overall\nrecommendation performance. In addition, the existing recommendation task\nmethod directly maps the preprocessed multimodal features to the\nlow-dimensional space, which will bring the noise unrelated to user preference,\nthus affecting the representation ability of the model. To tackle the\naforementioned challenges, we propose Multimodal Graph Neural Network for\nRecommendation (MGNM) with Dynamic De-redundancy and Modality-Guided Feature\nDe-noisy, which is divided into local and global interaction. Initially, in the\nlocal interaction process,we integrate a dynamic de-redundancy (DDR) loss\nfunction which is achieved by utilizing the product of the feature coefficient\nmatrix and the feature matrix as a penalization factor. It reduces the feature\nredundancy effects of multimodal and behavioral features caused by the stacking\nof multiple GNN layers. Subsequently, in the global interaction process, we\ndeveloped modality-guided global feature purifiers for each modality to\nalleviate the impact of modality noise. It is a two-fold guiding mechanism\neliminating modality features that are irrelevant to user preferences and\ncaptures complex relationships within the modality. Experimental results\ndemonstrate that MGNM achieves superior performance on multimodal information\ndenoising and removal of redundant information compared to the state-of-the-art\nmethods.\n","authors":["Feng Mo","Lin Xiao","Qiya Song","Xieping Gao","Eryao Liang"],"pdf_url":"https://arxiv.org/pdf/2411.01561v1.pdf","comment":null}]},"2024-11-02T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2411.01376v1","updated":"2024-11-02T22:59:36Z","published":"2024-11-02T22:59:36Z","title":"Multi-Channel Hypergraph Contrastive Learning for Matrix Completion","summary":" Rating is a typical user explicit feedback that visually reflects how much a\nuser likes a related item. The (rating) matrix completion is essentially a\nrating prediction process, which is also a significant problem in recommender\nsystems. Recently, graph neural networks (GNNs) have been widely used in matrix\ncompletion, which captures users' preferences over items by formulating a\nrating matrix as a bipartite graph. 
However, existing methods are susceptible\ndue to data sparsity and long-tail distribution in real-world scenarios.\nMoreover, the messaging mechanism of GNNs makes it difficult to capture\nhigh-order correlations and constraints between nodes, which are essentially\nuseful in recommendation tasks. To tackle these challenges, we propose a\nMulti-Channel Hypergraph Contrastive Learning framework for matrix completion,\nnamed MHCL. Specifically, MHCL adaptively learns hypergraph structures to\ncapture high-order correlations between nodes and jointly captures local and\nglobal collaborative relationships through attention-based cross-view\naggregation. Additionally, to consider the magnitude and order information of\nratings, we treat different rating subgraphs as different channels, encourage\nalignment between adjacent ratings, and further achieve the mutual enhancement\nbetween different ratings through multi-channel cross-rating contrastive\nlearning. Extensive experiments on five public datasets demonstrate that the\nproposed method significantly outperforms the current state-of-the-art\napproaches.\n","authors":["Xiang Li","Changsheng Shui","Yanwei Yu","Chao Huang","Zhongying Zhao","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2411.01376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01368v1","updated":"2024-11-02T21:53:20Z","published":"2024-11-02T21:53:20Z","title":"Combining Financial Data and News Articles for Stock Price Movement\n Prediction Using Large Language Models","summary":" Predicting financial markets and stock price movements requires analyzing a\ncompany's performance, historic price movements, industry-specific events\nalongside the influence of human factors such as social media and press\ncoverage. We assume that financial reports (such as income statements, balance\nsheets, and cash flow statements), historical price data, and recent news\narticles can collectively represent aforementioned factors. We combine\nfinancial data in tabular format with textual news articles and employ\npre-trained Large Language Models (LLMs) to predict market movements. Recent\nresearch in LLMs has demonstrated that they are able to perform both tabular\nand text classification tasks, making them our primary model to classify the\nmulti-modal data. We utilize retrieval augmentation techniques to retrieve and\nattach relevant chunks of news articles to financial metrics related to a\ncompany and prompt the LLMs in zero, two, and four-shot settings. Our dataset\ncontains news articles collected from different sources, historic stock price,\nand financial report data for 20 companies with the highest trading volume\nacross different industries in the stock market. We utilized recently released\nlanguage models for our LLM-based classifier, including GPT- 3 and 4, and\nLLaMA- 2 and 3 models. We introduce an LLM-based classifier capable of\nperforming classification tasks using combination of tabular (structured) and\ntextual (unstructured) data. 
By using this model, we predicted the movement of\na given stock's price in our dataset with a weighted F1-score of 58.5% and\n59.1% and Matthews Correlation Coefficient of 0.175 for both 3-month and\n6-month periods.\n","authors":["Ali Elahi","Fatemeh Taghvaei"],"pdf_url":"https://arxiv.org/pdf/2411.01368v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.01354v1","updated":"2024-11-02T20:05:31Z","published":"2024-11-02T20:05:31Z","title":"Online and Offline Evaluations of Collaborative Filtering and Content\n Based Recommender Systems","summary":" Recommender systems are widely used AI applications designed to help users\nefficiently discover relevant items. The effectiveness of such systems is tied\nto the satisfaction of both users and providers. However, user satisfaction is\ncomplex and cannot be easily framed mathematically using information retrieval\nand accuracy metrics. While many studies evaluate accuracy through offline\ntests, a growing number of researchers argue that online evaluation methods\nsuch as A/B testing are better suited for this purpose. We have employed a\nvariety of algorithms on different types of datasets divergent in size and\nsubject, producing recommendations in various platforms, including media\nstreaming services, digital publishing websites, e-commerce systems, and news\nbroadcasting networks. Notably, our target websites and datasets are in Persian\n(Farsi) language.\n This study provides a comparative analysis of a large-scale recommender\nsystem that has been operating for the past year across about 70 websites in\nIran, processing roughly 300 requests per second collectively. The system\nemploys user-based and item-based recommendations using content-based,\ncollaborative filtering, trend-based methods, and hybrid approaches. Through\nboth offline and online evaluations, we aim to identify where these algorithms\nperform most efficiently and determine the best method for our specific needs,\nconsidering the dataset and system scale. Our methods of evaluation include\nmanual evaluation, offline tests including accuracy and ranking metrics like\nhit-rate@k and nDCG, and online tests consisting of click-through rate (CTR).\nAdditionally we analyzed and proposed methods to address cold-start and\npopularity bias.\n","authors":["Ali Elahi","Armin Zirak"],"pdf_url":"https://arxiv.org/pdf/2411.01354v1.pdf","comment":"9 pages, 9 figures"},{"id":"http://arxiv.org/abs/2303.07865v6","updated":"2024-11-02T16:56:36Z","published":"2023-03-14T12:56:47Z","title":"Predicting the Geolocation of Tweets Using transformer models on\n Customized Data","summary":" This research is aimed to solve the tweet/user geolocation prediction task\nand provide a flexible methodology for the geotagging of textual big data. The\nsuggested approach implements neural networks for natural language processing\n(NLP) to estimate the location as coordinate pairs (longitude, latitude) and\ntwo-dimensional Gaussian Mixture Models (GMMs). The scope of proposed models\nhas been finetuned on a Twitter dataset using pretrained Bidirectional Encoder\nRepresentations from Transformers (BERT) as base models. Performance metrics\nshow a median error of fewer than 30 km on a worldwide-level, and fewer than 15\nkm on the US-level datasets for the models trained and evaluated on text\nfeatures of tweets' content and metadata context. Our source code and data are\navailable at https://github.com/K4TEL/geo-twitter.git\n","authors":["Kateryna Lutsai","Christoph H. 
Lampert"],"pdf_url":"https://arxiv.org/pdf/2303.07865v6.pdf","comment":"31 pages, 5 tables, 9 figures"},{"id":"http://arxiv.org/abs/2411.01304v1","updated":"2024-11-02T16:39:45Z","published":"2024-11-02T16:39:45Z","title":"Towards a Knowledge Graph for Teaching Knowledge Graphs","summary":" This poster paper describes the ongoing research project for the creation of\na use-case-driven Knowledge Graph resource tailored to the needs of teaching\neducation in Knowledge Graphs (KGs). We gather resources related to KG courses\nfrom lectures offered by the Semantic Web community, with the help of the COST\nAction Distributed Knowledge Graphs and the interest group on KGs at The Alan\nTuring Institute. Our goal is to create a resource-focused KG with multiple\ninterconnected semantic layers that interlink topics, courses, and materials\nwith each lecturer. Our approach formulates a domain KG in teaching and relates\nit with multiple Personal KGs created for the lecturers.\n","authors":["Eleni Ilkou","Ernesto Jiménez-Ruiz"],"pdf_url":"https://arxiv.org/pdf/2411.01304v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.22844v2","updated":"2024-11-02T15:23:36Z","published":"2024-10-30T09:23:14Z","title":"Understanding and Improving Adversarial Collaborative Filtering for\n Robust Recommendation","summary":" Adversarial Collaborative Filtering (ACF), which typically applies\nadversarial perturbations at user and item embeddings through adversarial\ntraining, is widely recognized as an effective strategy for enhancing the\nrobustness of Collaborative Filtering (CF) recommender systems against\npoisoning attacks. Besides, numerous studies have empirically shown that ACF\ncan also improve recommendation performance compared to traditional CF. Despite\nthese empirical successes, the theoretical understanding of ACF's effectiveness\nin terms of both performance and robustness remains unclear. To bridge this\ngap, in this paper, we first theoretically show that ACF can achieve a lower\nrecommendation error compared to traditional CF with the same training epochs\nin both clean and poisoned data contexts. Furthermore, by establishing bounds\nfor reductions in recommendation error during ACF's optimization process, we\nfind that applying personalized magnitudes of perturbation for different users\nbased on their embedding scales can further improve ACF's effectiveness.\nBuilding on these theoretical understandings, we propose Personalized Magnitude\nAdversarial Collaborative Filtering (PamaCF). Extensive experiments demonstrate\nthat PamaCF effectively defends against various types of poisoning attacks\nwhile significantly enhancing recommendation performance.\n","authors":["Kaike Zhang","Qi Cao","Yunfan Wu","Fei Sun","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.22844v2.pdf","comment":"To appear in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.02442v1","updated":"2024-11-02T14:36:03Z","published":"2024-11-02T14:36:03Z","title":"TODO: Enhancing LLM Alignment with Ternary Preferences","summary":" Aligning large language models (LLMs) with human intent is critical for\nenhancing their performance across a variety of tasks. Standard alignment\ntechniques, such as Direct Preference Optimization (DPO), often rely on the\nbinary Bradley-Terry (BT) model, which can struggle to capture the complexities\nof human preferences -- particularly in the presence of noisy or inconsistent\nlabels and frequent ties. 
To address these limitations, we introduce the\nTie-rank Oriented Bradley-Terry model (TOBT), an extension of the BT model that\nexplicitly incorporates ties, enabling more nuanced preference representation.\nBuilding on this, we propose Tie-rank Oriented Direct Preference Optimization\n(TODO), a novel alignment algorithm that leverages TOBT's ternary ranking\nsystem to improve preference alignment. In evaluations on Mistral-7B and Llama\n3-8B models, TODO consistently outperforms DPO in modeling preferences across\nboth in-distribution and out-of-distribution datasets. Additional assessments\nusing MT Bench and benchmarks such as Piqa, ARC-c, and MMLU further demonstrate\nTODO's superior alignment performance. Notably, TODO also shows strong results\nin binary preference alignment, highlighting its versatility and potential for\nbroader integration into LLM alignment. The implementation details can be found\nin https://github.com/XXares/TODO.\n","authors":["Yuxiang Guo","Lu Yin","Bo Jiang","Jiaqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.02442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01182v1","updated":"2024-11-02T08:50:11Z","published":"2024-11-02T08:50:11Z","title":"Graph Cross-Correlated Network for Recommendation","summary":" Collaborative filtering (CF) models have demonstrated remarkable performance\nin recommender systems, which represent users and items as embedding vectors.\nRecently, due to the powerful modeling capability of graph neural networks for\nuser-item interaction graphs, graph-based CF models have gained increasing\nattention. They encode each user/item and its subgraph into a single super\nvector by combining graph embeddings after each graph convolution. However,\neach hop of the neighbor in the user-item subgraphs carries a specific semantic\nmeaning. Encoding all subgraph information into single vectors and inferring\nuser-item relations with dot products can weaken the semantic information\nbetween user and item subgraphs, thus leaving untapped potential. Exploiting\nthis untapped potential provides insight into improving performance for\nexisting recommendation models. To this end, we propose the Graph\nCross-correlated Network for Recommendation (GCR), which serves as a general\nrecommendation paradigm that explicitly considers correlations between\nuser/item subgraphs. GCR first introduces the Plain Graph Representation (PGR)\nto extract information directly from each hop of neighbors into corresponding\nPGR vectors. Then, GCR develops Cross-Correlated Aggregation (CCA) to construct\npossible cross-correlated terms between PGR vectors of user/item subgraphs.\nFinally, GCR comprehensively incorporates the cross-correlated terms for\nrecommendations. Experimental results show that GCR outperforms\nstate-of-the-art models on both interaction prediction and click-through rate\nprediction tasks.\n","authors":["Hao Chen","Yuanchen Bei","Wenbing Huang","Shengyuan Chen","Feiran Huang","Xiao Huang"],"pdf_url":"https://arxiv.org/pdf/2411.01182v1.pdf","comment":"14 pages, accepted by TKDE"},{"id":"http://arxiv.org/abs/2411.01178v1","updated":"2024-11-02T08:36:16Z","published":"2024-11-02T08:36:16Z","title":"LLM4PR: Improving Post-Ranking in Search Engine with Large Language\n Models","summary":" Alongside the rapid development of Large Language Models (LLMs), there has\nbeen a notable increase in efforts to integrate LLM techniques in information\nretrieval (IR) and search engines (SE). 
Recently, an additional post-ranking\nstage is suggested in SE to enhance user satisfaction in practical\napplications. Nevertheless, research dedicated to enhancing the post-ranking\nstage through LLMs remains largely unexplored. In this study, we introduce a\nnovel paradigm named Large Language Models for Post-Ranking in search engine\n(LLM4PR), which leverages the capabilities of LLMs to accomplish the\npost-ranking task in SE. Concretely, a Query-Instructed Adapter (QIA) module is\ndesigned to derive the user/item representation vectors by incorporating their\nheterogeneous features. A feature adaptation step is further introduced to\nalign the semantics of user/item representations with the LLM. Finally, the\nLLM4PR integrates a learning to post-rank step, leveraging both a main task and\nan auxiliary task to fine-tune the model to adapt the post-ranking task.\nExperiment studies demonstrate that the proposed framework leads to significant\nimprovements and exhibits state-of-the-art performance compared with other\nalternatives.\n","authors":["Yang Yan","Yihao Wang","Chi Zhang","Wenyuan Hou","Kang Pan","Xingkai Ren","Zelun Wu","Zhixin Zhai","Enyun Yu","Wenwu Ou","Yang Song"],"pdf_url":"https://arxiv.org/pdf/2411.01178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17769v2","updated":"2024-11-02T08:06:32Z","published":"2024-04-27T03:37:12Z","title":"Two-stage Conformal Risk Control with Application to Ranked Retrieval","summary":" Many practical machine learning systems, such as ranking and recommendation\nsystems, consist of two concatenated stages: retrieval and ranking. These\nsystems present significant challenges in accurately assessing and managing the\nuncertainty inherent in their predictions. To address these challenges, we\nextend the recently developed framework of conformal risk control, originally\ndesigned for single-stage problems, to accommodate the more complex two-stage\nsetup. We first demonstrate that a straightforward application of conformal\nrisk control, treating each stage independently, may fail to maintain risk at\ntheir pre-specified levels. Therefore, we propose an integrated approach that\nconsiders both stages simultaneously, devising algorithms to control the risk\nof each stage by jointly identifying thresholds for both stages. Our algorithm\nfurther optimizes for a weighted combination of prediction set sizes across all\nfeasible thresholds, resulting in more effective prediction sets. Finally, we\napply the proposed method to the critical task of two-stage ranked retrieval.\nWe validate the efficacy of our method through extensive experiments on two\nlarge-scale public datasets, MSLR-WEB and MS MARCO, commonly used for ranked\nretrieval tasks.\n","authors":["Yunpeng Xu","Mufang Ying","Wenge Guo","Zhi Wei"],"pdf_url":"https://arxiv.org/pdf/2404.17769v2.pdf","comment":"13 pages, 3 figures; 5 supplementary pages, 3 supplementary figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.00562v2","updated":"2024-11-02T22:31:05Z","published":"2024-08-31T23:22:30Z","title":"Comparative Analysis of Modality Fusion Approaches for Audio-Visual\n Person Identification and Verification","summary":" Multimodal learning involves integrating information from various modalities\nto enhance learning and comprehension. We compare three modality fusion\nstrategies in person identification and verification by processing two\nmodalities: voice and face. 
In this paper, a one-dimensional convolutional\nneural network is employed for x-vector extraction from voice, while the\npre-trained VGGFace2 network and transfer learning are utilized for face\nmodality. In addition, gammatonegram is used as speech representation in\nengagement with the Darknet19 pre-trained network. The proposed systems are\nevaluated using the K-fold cross-validation technique on the 118 speakers of\nthe test set of the VoxCeleb2 dataset. The comparative evaluations are done for\nsingle-modality and three proposed multimodal strategies in equal situations.\nResults demonstrate that the feature fusion strategy of gammatonegram and\nfacial features achieves the highest performance, with an accuracy of 98.37% in\nthe person identification task. However, concatenating facial features with the\nx-vector reaches 0.62% for EER in verification tasks.\n","authors":["Aref Farhadipour","Masoumeh Chapariniya","Teodora Vukovic","Volker Dellwo"],"pdf_url":"https://arxiv.org/pdf/2409.00562v2.pdf","comment":"This paper was accepted at the ICNLSP2024 conference"},{"id":"http://arxiv.org/abs/2410.22023v3","updated":"2024-11-02T12:52:23Z","published":"2024-10-29T13:13:30Z","title":"Multi-modal Speech Emotion Recognition via Feature Distribution\n Adaptation Network","summary":" In this paper, we propose a novel deep inductive transfer learning framework,\nnamed feature distribution adaptation network, to tackle the challenging\nmulti-modal speech emotion recognition problem. Our method aims to use deep\ntransfer learning strategies to align visual and audio feature distributions to\nobtain consistent representation of emotion, thereby improving the performance\nof speech emotion recognition. In our model, the pre-trained ResNet-34 is\nutilized for feature extraction for facial expression images and acoustic Mel\nspectrograms, respectively. Then, the cross-attention mechanism is introduced\nto model the intrinsic similarity relationships of multi-modal features.\nFinally, the multi-modal feature distribution adaptation is performed\nefficiently with feed-forward network, which is extended using the local\nmaximum mean discrepancy loss. Experiments are carried out on two benchmark\ndatasets, and the results demonstrate that our model can achieve excellent\nperformance compared with existing ones.\n","authors":["Shaokai Li","Yixuan Ji","Peng Song","Haoqin Sun","Wenming Zheng"],"pdf_url":"https://arxiv.org/pdf/2410.22023v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18709v3","updated":"2024-11-02T11:09:37Z","published":"2023-10-28T13:37:52Z","title":"Audio-Visual Instance Segmentation","summary":" In this paper, we propose a new multi-modal task, termed audio-visual\ninstance segmentation (AVIS), which aims to simultaneously identify, segment\nand track individual sounding object instances in audible videos. To facilitate\nthis research, we introduce a high-quality benchmark named AVISeg, containing\nover 90K instance masks from 26 semantic categories in 926 long videos.\nAdditionally, we propose a strong baseline model for this task. Our model first\nlocalizes sound source within each frame, and condenses object-specific\ncontexts into concise tokens. Then it builds long-range audio-visual\ndependencies between these tokens using window-based attention, and tracks\nsounding objects among the entire video sequences. Extensive experiments reveal\nthat our method performs best on AVISeg, surpassing the existing methods from\nrelated tasks. 
We further conduct the evaluation on several multi-modal large\nmodels; however, they exhibits subpar performance on instance-level sound\nsource localization and temporal perception. We expect that AVIS will inspire\nthe community towards a more comprehensive multi-modal understanding. The\ndataset and code will soon be released on https://github.com/ruohaoguo/avis.\n","authors":["Ruohao Guo","Xianghua Ying","Yaru Chen","Dantong Niu","Guangyao Li","Liao Qu","Yanyu Qi","Jinxing Zhou","Bowei Xing","Wenzhen Yue","Ji Shi","Qixun Wang","Peiliang Zhang","Buwen Liang"],"pdf_url":"https://arxiv.org/pdf/2310.18709v3.pdf","comment":"Project page: https://github.com/ruohaoguo/avis"},{"id":"http://arxiv.org/abs/2406.11161v2","updated":"2024-11-02T02:30:50Z","published":"2024-06-17T03:01:22Z","title":"Emotion-LLaMA: Multimodal Emotion Recognition and Reasoning with\n Instruction Tuning","summary":" Accurate emotion perception is crucial for various applications, including\nhuman-computer interaction, education, and counseling. However, traditional\nsingle-modality approaches often fail to capture the complexity of real-world\nemotional expressions, which are inherently multimodal. Moreover, existing\nMultimodal Large Language Models (MLLMs) face challenges in integrating audio\nand recognizing subtle facial micro-expressions. To address this, we introduce\nthe MERR dataset, containing 28,618 coarse-grained and 4,487 fine-grained\nannotated samples across diverse emotional categories. This dataset enables\nmodels to learn from varied scenarios and generalize to real-world\napplications. Furthermore, we propose Emotion-LLaMA, a model that seamlessly\nintegrates audio, visual, and textual inputs through emotion-specific encoders.\nBy aligning features into a shared space and employing a modified LLaMA model\nwith instruction tuning, Emotion-LLaMA significantly enhances both emotional\nrecognition and reasoning capabilities. Extensive evaluations show\nEmotion-LLaMA outperforms other MLLMs, achieving top scores in Clue Overlap\n(7.83) and Label Overlap (6.25) on EMER, an F1 score of 0.9036 on MER2023-SEMI\nchallenge, and the highest UAR (45.59) and WAR (59.37) in zero-shot evaluations\non DFEW dataset.\n","authors":["Zebang Cheng","Zhi-Qi Cheng","Jun-Yan He","Jingdong Sun","Kai Wang","Yuxiang Lin","Zheng Lian","Xiaojiang Peng","Alexander Hauptmann"],"pdf_url":"https://arxiv.org/pdf/2406.11161v2.pdf","comment":"Accepted at NeurIPS 2024. 49 pages, 13 figures, Project:\n https://github.com/ZebangCheng/Emotion-LLaMA, Demo:\n https://huggingface.co/spaces/ZebangCheng/Emotion-LLaMA"}]},"2024-11-01T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2410.13047v2","updated":"2024-11-01T23:39:14Z","published":"2024-10-16T21:17:18Z","title":"LLM Confidence Evaluation Measures in Zero-Shot CSS Classification","summary":" Assessing classification confidence is critical for leveraging large language\nmodels (LLMs) in automated labeling tasks, especially in the sensitive domains\npresented by Computational Social Science (CSS) tasks. In this paper, we make\nthree key contributions: (1) we propose an uncertainty quantification (UQ)\nperformance measure tailored for data annotation tasks, (2) we compare, for the\nfirst time, five different UQ strategies across three distinct LLMs and CSS\ndata annotation tasks, (3) we introduce a novel UQ aggregation strategy that\neffectively identifies low-confidence LLM annotations and disproportionately\nuncovers data incorrectly labeled by the LLMs. 
Our results demonstrate that our\nproposed UQ aggregation strategy improves upon existing methods andcan be used\nto significantly improve human-in-the-loop data annotation processes.\n","authors":["David Farr","Iain Cruickshank","Nico Manzonelli","Nicholas Clark","Kate Starbird","Jevin West"],"pdf_url":"https://arxiv.org/pdf/2410.13047v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01039v1","updated":"2024-11-01T21:14:04Z","published":"2024-11-01T21:14:04Z","title":"Enhancing Question Answering Precision with Optimized Vector Retrieval\n and Instructions","summary":" Question-answering (QA) is an important application of Information Retrieval\n(IR) and language models, and the latest trend is toward pre-trained large\nneural networks with embedding parameters. Augmenting QA performances with\nthese LLMs requires intensive computational resources for fine-tuning. We\npropose an innovative approach to improve QA task performances by integrating\noptimized vector retrievals and instruction methodologies. Based on retrieval\naugmentation, the process involves document embedding, vector retrieval, and\ncontext construction for optimal QA results. We experiment with different\ncombinations of text segmentation techniques and similarity functions, and\nanalyze their impacts on QA performances. Results show that the model with a\nsmall chunk size of 100 without any overlap of the chunks achieves the best\nresult and outperforms the models based on semantic segmentation using\nsentences. We discuss related QA examples and offer insight into how model\nperformances are improved within the two-stage framework.\n","authors":["Lixiao Yang","Mengyang Xu","Weimao Ke"],"pdf_url":"https://arxiv.org/pdf/2411.01039v1.pdf","comment":"6 pages, 4 tables"},{"id":"http://arxiv.org/abs/2411.00744v1","updated":"2024-11-01T17:11:16Z","published":"2024-11-01T17:11:16Z","title":"CORAG: A Cost-Constrained Retrieval Optimization System for\n Retrieval-Augmented Generation","summary":" Large Language Models (LLMs) have demonstrated remarkable generation\ncapabilities but often struggle to access up-to-date information, which can\nlead to hallucinations. Retrieval-Augmented Generation (RAG) addresses this\nissue by incorporating knowledge from external databases, enabling more\naccurate and relevant responses. Due to the context window constraints of LLMs,\nit is impractical to input the entire external database context directly into\nthe model. Instead, only the most relevant information, referred to as chunks,\nis selectively retrieved. However, current RAG research faces three key\nchallenges. First, existing solutions often select each chunk independently,\noverlooking potential correlations among them. Second, in practice the utility\nof chunks is non-monotonic, meaning that adding more chunks can decrease\noverall utility. Traditional methods emphasize maximizing the number of\nincluded chunks, which can inadvertently compromise performance. Third, each\ntype of user query possesses unique characteristics that require tailored\nhandling, an aspect that current approaches do not fully consider. To overcome\nthese challenges, we propose a cost constrained retrieval optimization system\nCORAG for retrieval-augmented generation. 
We employ a Monte Carlo Tree Search\n(MCTS) based policy framework to find optimal chunk combinations sequentially,\nallowing for a comprehensive consideration of correlations among chunks.\nAdditionally, rather than viewing budget exhaustion as a termination condition,\nwe integrate budget constraints into the optimization of chunk combinations,\neffectively addressing the non-monotonicity of chunk utility.\n","authors":["Ziting Wang","Haitao Yuan","Wei Dong","Gao Cong","Feifei Li"],"pdf_url":"https://arxiv.org/pdf/2411.00744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00702v1","updated":"2024-11-01T16:05:59Z","published":"2024-11-01T16:05:59Z","title":"A graph-based approach to extracting narrative signals from public\n discourse","summary":" Narratives are key interpretative devices by which humans make sense of\npolitical reality. As the significance of narratives for understanding current\nsocietal issues such as polarization and misinformation becomes increasingly\nevident, there is a growing demand for methods that support their empirical\nanalysis. To this end, we propose a graph-based formalism and machine-guided\nmethod for extracting, representing, and analyzing selected narrative signals\nfrom digital textual corpora, based on Abstract Meaning Representation (AMR).\nThe formalism and method introduced here specifically cater to the study of\npolitical narratives that figure in texts from digital media such as archived\npolitical speeches, social media posts, political manifestos and transcripts of\nparliamentary debates. We conceptualize these political narratives as a type of\nontological narratives: stories by which actors position themselves as\npolitical beings, and which are akin to political worldviews in which actors\npresent their normative vision of the world, or aspects thereof. We approach\nthe study of such political narratives as a problem of information retrieval:\nstarting from a textual corpus, we first extract a graph-like representation of\nthe meaning of each sentence in the corpus using AMR. Drawing on transferable\nconcepts from narratology, we then apply a set of heuristics to filter these\ngraphs for representations of 1) actors, 2) the events in which these actors\nfigure, and 3) traces of the perspectivization of these events. We approach\nthese references to actors, events, and instances of perspectivization as core\nnarrative signals that initiate a further analysis by alluding to larger\npolitical narratives. By means of a case study of State of the European Union\naddresses, we demonstrate how the formalism can be used to inductively surface\nsignals of political narratives from public discourse.\n","authors":["Armin Pournaki","Tom Willaert"],"pdf_url":"https://arxiv.org/pdf/2411.00702v1.pdf","comment":"23 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.00677v1","updated":"2024-11-01T15:36:52Z","published":"2024-11-01T15:36:52Z","title":"Making Sense of Metadata Mess: Alignment & Risk Assessment for Diatom\n Data Use Case","summary":" Biologists study Diatoms, a fundamental algae, to assess the health of\naquatic systems. Diatom specimens have traditionally been preserved on analog\nslides, where a single slide can contain thousands of these microscopic\norganisms. Digitization of these collections presents both metadata challenges\nand opportunities. This paper reports on metadata research aimed at providing\naccess to a digital portion of the Academy of Natural Sciences' Diatom\nHerbarium, Drexel University. 
We report results of a 3-part study covering 1) a\nreview of relevant metadata standards and a microscopy metadata framework\nshared by Hammer et al., 2) a baseline metadata alignment mapping current\ndiatom metadata properties to standard metadata types, and 3) a metadata risk\nanalysis associated with the course of standard data curation practices. This\nresearch is part of an effort involving the transfer of these digital slides to\nan new system, DataFed, to support global accessible. The final section of this\npaper includes a conclusion and discusses next steps.\n","authors":["Kio Polson","Marina Potapova","Uttam Meena","Chad Peiper","Joshua Brown","Joshua Agar","Jane Greenberg"],"pdf_url":"https://arxiv.org/pdf/2411.00677v1.pdf","comment":"13 pages, 2 figures, 1 table, to be published in MTSR 2024 conference\n proceedings"},{"id":"http://arxiv.org/abs/2411.00676v1","updated":"2024-11-01T15:35:56Z","published":"2024-11-01T15:35:56Z","title":"Enhancing Semantic Interoperability Across Materials Science With\n HIVE4MAT","summary":" HIVE4MAT is a linked data interactive application for navigating ontologies\nof value to materials science. HIVE enables automatic indexing of textual\nresources with standardized terminology. This article presents the motivation\nunderlying HIVE4MAT, explains the system architecture, reports on two\nevaluations, and discusses future plans.\n","authors":["Jane Greenberg","Kio Polson","Scott McClellan","Xintong Zhao","Alex Kalinowski","Yuan An"],"pdf_url":"https://arxiv.org/pdf/2411.00676v1.pdf","comment":"11 pages, 1 figures, 3 tables, to be published in SeMatS 2024\n workshop proceedings"},{"id":"http://arxiv.org/abs/2407.10691v2","updated":"2024-11-01T14:08:31Z","published":"2024-07-15T13:04:09Z","title":"$\\texttt{MixGR}$: Enhancing Retriever Generalization for Scientific\n Domain through Complementary Granularity","summary":" Recent studies show the growing significance of document retrieval in the\ngeneration of LLMs, i.e., RAG, within the scientific domain by bridging their\nknowledge gap. However, dense retrievers often struggle with domain-specific\nretrieval and complex query-document relationships, particularly when query\nsegments correspond to various parts of a document. To alleviate such prevalent\nchallenges, this paper introduces $\\texttt{MixGR}$, which improves dense\nretrievers' awareness of query-document matching across various levels of\ngranularity in queries and documents using a zero-shot approach.\n$\\texttt{MixGR}$ fuses various metrics based on these granularities to a united\nscore that reflects a comprehensive query-document similarity. Our experiments\ndemonstrate that $\\texttt{MixGR}$ outperforms previous document retrieval by\n24.7%, 9.8%, and 6.9% on nDCG@5 with unsupervised, supervised, and LLM-based\nretrievers, respectively, averaged on queries containing multiple subqueries\nfrom five scientific retrieval datasets. Moreover, the efficacy of two\ndownstream scientific question-answering tasks highlights the advantage of\n$\\texttt{MixGR}$ to boost the application of LLMs in the scientific domain. 
The\ncode and experimental datasets are available.\n","authors":["Fengyu Cai","Xinran Zhao","Tong Chen","Sihao Chen","Hongming Zhang","Iryna Gurevych","Heinz Koeppl"],"pdf_url":"https://arxiv.org/pdf/2407.10691v2.pdf","comment":"EMNLP 2024 Main Conference"},{"id":"http://arxiv.org/abs/2411.00556v1","updated":"2024-11-01T13:09:30Z","published":"2024-11-01T13:09:30Z","title":"LLM-KT: A Versatile Framework for Knowledge Transfer from Large Language\n Models to Collaborative Filtering","summary":" We present LLM-KT, a flexible framework designed to enhance collaborative\nfiltering (CF) models by seamlessly integrating LLM (Large Language\nModel)-generated features. Unlike existing methods that rely on passing\nLLM-generated features as direct inputs, our framework injects these features\ninto an intermediate layer of any CF model, allowing the model to reconstruct\nand leverage the embeddings internally. This model-agnostic approach works with\na wide range of CF models without requiring architectural changes, making it\nadaptable to various recommendation scenarios. Our framework is built for easy\nintegration and modification, providing researchers and developers with a\npowerful tool for extending CF model capabilities through efficient knowledge\ntransfer. We demonstrate its effectiveness through experiments on the MovieLens\nand Amazon datasets, where it consistently improves baseline CF models.\nExperimental studies showed that LLM-KT is competitive with the\nstate-of-the-art methods in context-aware settings but can be applied to a\nbroader range of CF models than current approaches.\n","authors":["Nikita Severin","Aleksei Ziablitsev","Yulia Savelyeva","Valeriy Tashchilin","Ivan Bulychev","Mikhail Yushkov","Artem Kushneruk","Amaliya Zaryvnykh","Dmitrii Kiselev","Andrey Savchenko","Ilya Makarov"],"pdf_url":"https://arxiv.org/pdf/2411.00556v1.pdf","comment":"accepted at ICDM 2024 (demo track)"},{"id":"http://arxiv.org/abs/2411.00469v1","updated":"2024-11-01T09:34:36Z","published":"2024-11-01T09:34:36Z","title":"MIRFLEX: Music Information Retrieval Feature Library for Extraction","summary":" This paper introduces an extendable modular system that compiles a range of\nmusic feature extraction models to aid music information retrieval research.\nThe features include musical elements like key, downbeats, and genre, as well\nas audio characteristics like instrument recognition, vocals/instrumental\nclassification, and vocals gender detection. The integrated models are\nstate-of-the-art or latest open-source. The features can be extracted as latent\nor post-processed labels, enabling integration into music applications such as\ngenerative music, recommendation, and playlist generation. The modular design\nallows easy integration of newly developed systems, making it a good\nbenchmarking and comparison tool. This versatile toolkit supports the research\ncommunity in developing innovative solutions by providing concrete musical\nfeatures.\n","authors":["Anuradha Chopra","Abhinaba Roy","Dorien Herremans"],"pdf_url":"https://arxiv.org/pdf/2411.00469v1.pdf","comment":"2 pages, 4 tables, submitted to Extended Abstracts for the\n Late-Breaking Demo Session of the 25th Int. 
Society for Music Information\n Retrieval Conf., San Francisco, United States, 2024"},{"id":"http://arxiv.org/abs/2411.00451v1","updated":"2024-11-01T08:57:29Z","published":"2024-11-01T08:57:29Z","title":"Improving Few-Shot Cross-Domain Named Entity Recognition by Instruction\n Tuning a Word-Embedding based Retrieval Augmented Large Language Model","summary":" Few-Shot Cross-Domain NER is the process of leveraging knowledge from\ndata-rich source domains to perform entity recognition on data scarce target\ndomains. Most previous state-of-the-art (SOTA) approaches use pre-trained\nlanguage models (PLMs) for cross-domain NER. However, these models are often\ndomain specific. To successfully use these models for new target domains, we\nneed to modify either the model architecture or perform model finetuning using\ndata from the new domains. Both of these result in the creation of entirely new\nNER models for each target domain which is infeasible for practical scenarios.\nRecently,several works have attempted to use LLMs to solve Few-Shot\nCross-Domain NER. However, most of these are either too expensive for practical\npurposes or struggle to follow LLM prompt instructions. In this paper, we\npropose IF-WRANER (Instruction Finetuned Word-embedding based Retrieval\nAugmented large language model for Named Entity Recognition), a retrieval\naugmented LLM, finetuned for the NER task. By virtue of the regularization\ntechniques used during LLM finetuning and the adoption of word-level embedding\nover sentence-level embedding during the retrieval of in-prompt examples,\nIF-WRANER is able to outperform previous SOTA Few-Shot Cross-Domain NER\napproaches. We have demonstrated the effectiveness of our model by benchmarking\nits performance on the open source CrossNER dataset, on which it shows more\nthan 2% F1 score improvement over the previous SOTA model. We have deployed the\nmodel for multiple customer care domains of an enterprise. Accurate entity\nprediction through IF-WRANER helps direct customers to automated workflows for\nthe domains, thereby reducing escalations to human agents by almost 15% and\nleading to millions of dollars in yearly savings for the company.\n","authors":["Subhadip Nandi","Neeraj Agrawal"],"pdf_url":"https://arxiv.org/pdf/2411.00451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00395v1","updated":"2024-11-01T06:49:39Z","published":"2024-11-01T06:49:39Z","title":"DivNet: Diversity-Aware Self-Correcting Sequential Recommendation\n Networks","summary":" As the last stage of a typical \\textit{recommendation system},\n\\textit{collective recommendation} aims to give the final touches to the\nrecommended items and their layout so as to optimize overall objectives such as\ndiversity and whole-page relevance. In practice, however, the interaction\ndynamics among the recommended items, their visual appearances and meta-data\nsuch as specifications are often too complex to be captured by experts'\nheuristics or simple models. To address this issue, we propose a\n\\textit{\\underline{div}ersity-aware self-correcting sequential recommendation\n\\underline{net}works} (\\textit{DivNet}) that is able to estimate utility by\ncapturing the complex interactions among sequential items and diversify\nrecommendations simultaneously. 
Experiments on both offline and online settings\ndemonstrate that \\textit{DivNet} can achieve better results compared to\nbaselines with or without collective recommendations.\n","authors":["Shuai Xiao","Zaifan Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.00395v1.pdf","comment":"Published at CIKM"},{"id":"http://arxiv.org/abs/2408.10159v3","updated":"2024-11-01T03:47:59Z","published":"2024-08-19T17:09:32Z","title":"Customizing Language Models with Instance-wise LoRA for Sequential\n Recommendation","summary":" Sequential recommendation systems predict the next interaction item based on\nusers' past interactions, aligning recommendations with individual preferences.\nLeveraging the strengths of Large Language Models (LLMs) in knowledge\ncomprehension and reasoning, recent approaches are eager to apply LLMs to\nsequential recommendation. A common paradigm is converting user behavior\nsequences into instruction data, and fine-tuning the LLM with\nparameter-efficient fine-tuning (PEFT) methods like Low-Rank Adaption (LoRA).\nHowever, the uniform application of LoRA across diverse user behaviors is\ninsufficient to capture individual variability, resulting in negative transfer\nbetween disparate sequences. To address these challenges, we propose\nInstance-wise LoRA (iLoRA). We innovatively treat the sequential recommendation\ntask as a form of multi-task learning, integrating LoRA with the Mixture of\nExperts (MoE) framework. This approach encourages different experts to capture\nvarious aspects of user behavior. Additionally, we introduce a sequence\nrepresentation guided gate function that generates customized expert\nparticipation weights for each user sequence, which allows dynamic parameter\nadjustment for instance-wise recommendations. In sequential recommendation,\niLoRA achieves an average relative improvement of 11.4\\% over basic LoRA in the\nhit ratio metric, with less than a 1\\% relative increase in trainable\nparameters. Extensive experiments on three benchmark datasets demonstrate the\neffectiveness of iLoRA, highlighting its superior performance compared to\nexisting methods in mitigating negative transfer and improving recommendation\naccuracy. Our data and code are available at\nhttps://github.com/AkaliKong/iLoRA.\n","authors":["Xiaoyu Kong","Jiancan Wu","An Zhang","Leheng Sheng","Hui Lin","Xiang Wang","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2408.10159v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00341v1","updated":"2024-11-01T03:43:50Z","published":"2024-11-01T03:43:50Z","title":"A Survey on Bundle Recommendation: Methods, Applications, and Challenges","summary":" In recent years, bundle recommendation systems have gained significant\nattention in both academia and industry due to their ability to enhance user\nexperience and increase sales by recommending a set of items as a bundle rather\nthan individual items. This survey provides a comprehensive review on bundle\nrecommendation, beginning by a taxonomy for exploring product bundling. We\nclassify it into two categories based on bundling strategy from various\napplication domains, i.e., discriminative and generative bundle recommendation.\nThen we formulate the corresponding tasks of the two categories and\nsystematically review their methods: 1) representation learning from bundle and\nitem levels and interaction modeling for discriminative bundle recommendation;\n2) representation learning from item level and bundle generation for generative\nbundle recommendation. 
Subsequently, we survey the resources of bundle\nrecommendation, including datasets and evaluation metrics, and conduct\nreproducibility experiments on mainstream models. Lastly, we discuss the main\nchallenges and highlight the promising future directions in the field of bundle\nrecommendation, aiming to serve as a useful resource for researchers and\npractitioners. Our code and datasets are publicly available at\nhttps://github.com/WUT-IDEA/bundle-recommendation-survey.\n","authors":["Meng Sun","Lin Li","Ming Li","Xiaohui Tao","Dong Zhang","Peipei Wang","Jimmy Xiangji Huang"],"pdf_url":"https://arxiv.org/pdf/2411.00341v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20646v2","updated":"2024-11-01T03:12:44Z","published":"2024-05-31T07:24:42Z","title":"LLM-ESR: Large Language Models Enhancement for Long-tailed Sequential\n Recommendation","summary":" Sequential recommender systems (SRS) aim to predict users' subsequent choices\nbased on their historical interactions and have found applications in diverse\nfields such as e-commerce and social media. However, in real-world systems,\nmost users interact with only a handful of items, while the majority of items\nare seldom consumed. These two issues, known as the long-tail user and\nlong-tail item challenges, often pose difficulties for existing SRS. These\nchallenges can adversely affect user experience and seller benefits, making\nthem crucial to address. Though a few works have addressed the challenges, they\nstill struggle with the seesaw or noisy issues due to the intrinsic scarcity of\ninteractions. The advancements in large language models (LLMs) present a\npromising solution to these problems from a semantic perspective. As one of the\npioneers in this field, we propose the Large Language Models Enhancement\nframework for Sequential Recommendation (LLM-ESR). This framework utilizes\nsemantic embeddings derived from LLMs to enhance SRS without adding extra\ninference load from LLMs. To address the long-tail item challenge, we design a\ndual-view modeling framework that combines semantics from LLMs and\ncollaborative signals from conventional SRS. For the long-tail user challenge,\nwe propose a retrieval augmented self-distillation method to enhance user\npreference representation using more informative interactions from similar\nusers. To verify the effectiveness and versatility of our proposed enhancement\nframework, we conduct extensive experiments on three real-world datasets using\nthree popular SRS models. The results show that our method surpasses existing\nbaselines consistently, and benefits long-tail users and items especially. The\nimplementation code is available at\nhttps://github.com/Applied-Machine-Learning-Lab/LLM-ESR.\n","authors":["Qidong Liu","Xian Wu","Yejing Wang","Zijian Zhang","Feng Tian","Yefeng Zheng","Xiangyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.20646v2.pdf","comment":"accepted by NeurIPS'24 (Spotlight)"},{"id":"http://arxiv.org/abs/2411.00331v1","updated":"2024-11-01T03:09:28Z","published":"2024-11-01T03:09:28Z","title":"Beyond Utility: Evaluating LLM as Recommender","summary":" With the rapid development of Large Language Models (LLMs), recent studies\nemployed LLMs as recommenders to provide personalized information services for\ndistinct users. Despite efforts to improve the accuracy of LLM-based\nrecommendation models, relatively little attention is paid to beyond-utility\ndimensions. 
Moreover, there are unique evaluation aspects of LLM-based\nrecommendation models, which have been largely ignored. To bridge this gap, we\nexplore four new evaluation dimensions and propose a multidimensional\nevaluation framework. The new evaluation dimensions include: 1) history length\nsensitivity, 2) candidate position bias, 3) generation-involved performance,\nand 4) hallucinations. All four dimensions have the potential to impact\nperformance, but are largely unnecessary for consideration in traditional\nsystems. Using this multidimensional evaluation framework, along with\ntraditional aspects, we evaluate the performance of seven LLM-based\nrecommenders, with three prompting strategies, comparing them with six\ntraditional models on both ranking and re-ranking tasks on four datasets. We\nfind that LLMs excel at handling tasks with prior knowledge and shorter input\nhistories in the ranking setting, and perform better in the re-ranking setting,\nbeating traditional models across multiple dimensions. However, LLMs exhibit\nsubstantial candidate position bias issues, and some models hallucinate\nnon-existent items much more often than others. We intend our evaluation\nframework and observations to benefit future research on the use of LLMs as\nrecommenders. The code and data are available at\nhttps://github.com/JiangDeccc/EvaLLMasRecommender.\n","authors":["Chumeng Jiang","Jiayin Wang","Weizhi Ma","Charles L. A. Clarke","Shuai Wang","Chuhan Wu","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.00331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15235v3","updated":"2024-11-01T02:00:49Z","published":"2024-02-23T09:57:20Z","title":"MACRec: a Multi-Agent Collaboration Framework for Recommendation","summary":" LLM-based agents have gained considerable attention for their decision-making\nskills and ability to handle complex tasks. Recognizing the current gap in\nleveraging agent capabilities for multi-agent collaboration in recommendation\nsystems, we introduce MACRec, a novel framework designed to enhance\nrecommendation systems through multi-agent collaboration. Unlike existing work\non using agents for user/item simulation, we aim to deploy multi-agents to\ntackle recommendation tasks directly. In our framework, recommendation tasks\nare addressed through the collaborative efforts of various specialized agents,\nincluding Manager, User/Item Analyst, Reflector, Searcher, and Task\nInterpreter, with different working flows. Furthermore, we provide application\nexamples of how developers can easily use MACRec on various recommendation\ntasks, including rating prediction, sequential recommendation, conversational\nrecommendation, and explanation generation of recommendation results. The\nframework and demonstration video are publicly available at\nhttps://github.com/wzf2000/MACRec.\n","authors":["Zhefan Wang","Yuanqing Yu","Wendi Zheng","Weizhi Ma","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.15235v3.pdf","comment":"Accepted by SIGIR2024"},{"id":"http://arxiv.org/abs/2410.23683v2","updated":"2024-11-01T01:21:04Z","published":"2024-10-31T07:19:22Z","title":"Unveiling User Satisfaction and Creator Productivity Trade-Offs in\n Recommendation Platforms","summary":" On User-Generated Content (UGC) platforms, recommendation algorithms\nsignificantly impact creators' motivation to produce content as they compete\nfor algorithmically allocated user traffic. 
This phenomenon subtly shapes the\nvolume and diversity of the content pool, which is crucial for the platform's\nsustainability. In this work, we demonstrate, both theoretically and\nempirically, that a purely relevance-driven policy with low exploration\nstrength boosts short-term user satisfaction but undermines the long-term\nrichness of the content pool. In contrast, a more aggressive exploration policy\nmay slightly compromise user satisfaction but promote higher content creation\nvolume. Our findings reveal a fundamental trade-off between immediate user\nsatisfaction and overall content production on UGC platforms. Building on this\nfinding, we propose an efficient optimization method to identify the optimal\nexploration strength, balancing user and creator engagement. Our model can\nserve as a pre-deployment audit tool for recommendation algorithms on UGC\nplatforms, helping to align their immediate objectives with sustainable,\nlong-term goals.\n","authors":["Fan Yao","Yiming Liao","Jingzhou Liu","Shaoliang Nie","Qifan Wang","Haifeng Xu","Hongning Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23683v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00275v1","updated":"2024-11-01T00:13:46Z","published":"2024-11-01T00:13:46Z","title":"Improving Musical Instrument Classification with Advanced Machine\n Learning Techniques","summary":" Musical instrument classification, a key area in Music Information Retrieval,\nhas gained considerable interest due to its applications in education, digital\nmusic production, and consumer media. Recent advances in machine learning,\nspecifically deep learning, have enhanced the capability to identify and\nclassify musical instruments from audio signals. This study applies various\nmachine learning methods, including Naive Bayes, Support Vector Machines,\nRandom Forests, Boosting techniques like AdaBoost and XGBoost, as well as deep\nlearning models such as Convolutional Neural Networks and Artificial Neural\nNetworks. The effectiveness of these methods is evaluated on the NSynth\ndataset, a large repository of annotated musical sounds. By comparing these\napproaches, the analysis aims to showcase the advantages and limitations of\neach method, providing guidance for developing more accurate and efficient\nclassification systems. Additionally, hybrid model testing and discussion are\nincluded. This research aims to support further studies in instrument\nclassification by proposing new approaches and future research directions.\n","authors":["Joanikij Chulev"],"pdf_url":"https://arxiv.org/pdf/2411.00275v1.pdf","comment":"43 pages, 35 figures, 14 tables"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.20012v2","updated":"2024-11-01T08:40:28Z","published":"2024-09-30T07:14:31Z","title":"Towards Robust Multimodal Sentiment Analysis with Incomplete Data","summary":" The field of Multimodal Sentiment Analysis (MSA) has recently witnessed an\nemerging direction seeking to tackle the issue of data incompleteness.\nRecognizing that the language modality typically contains dense sentiment\ninformation, we consider it as the dominant modality and present an innovative\nLanguage-dominated Noise-resistant Learning Network (LNLN) to achieve robust\nMSA. The proposed LNLN features a dominant modality correction (DMC) module and\ndominant modality based multimodal learning (DMML) module, which enhances the\nmodel's robustness across various noise scenarios by ensuring the quality of\ndominant modality representations. 
Aside from the methodical design, we perform\ncomprehensive experiments under random data missing scenarios, utilizing\ndiverse and meaningful settings on several popular datasets (\\textit{e.g.,}\nMOSI, MOSEI, and SIMS), providing additional uniformity, transparency, and\nfairness compared to existing evaluations in the literature. Empirically, LNLN\nconsistently outperforms existing baselines, demonstrating superior performance\nacross these challenging and extensive evaluation metrics.\n","authors":["Haoyu Zhang","Wenbin Wang","Tianshu Yu"],"pdf_url":"https://arxiv.org/pdf/2409.20012v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.00304v1","updated":"2024-11-01T01:51:31Z","published":"2024-11-01T01:51:31Z","title":"Unified Generative and Discriminative Training for Multi-modal Large\n Language Models","summary":" In recent times, Vision-Language Models (VLMs) have been trained under two\npredominant paradigms. Generative training has enabled Multimodal Large\nLanguage Models (MLLMs) to tackle various complex tasks, yet issues such as\nhallucinations and weak object discrimination persist. Discriminative training,\nexemplified by models like CLIP, excels in zero-shot image-text classification\nand retrieval, yet struggles with complex scenarios requiring fine-grained\nsemantic differentiation. This paper addresses these challenges by proposing a\nunified approach that integrates the strengths of both paradigms. Considering\ninterleaved image-text sequences as the general format of input samples, we\nintroduce a structure-induced training strategy that imposes semantic\nrelationships between input samples and the MLLM's hidden state. This approach\nenhances the MLLM's ability to capture global semantics and distinguish\nfine-grained semantics. By leveraging dynamic sequence alignment within the\nDynamic Time Warping framework and integrating a novel kernel for fine-grained\nsemantic differentiation, our method effectively balances generative and\ndiscriminative tasks. Extensive experiments demonstrate the effectiveness of\nour approach, achieving state-of-the-art results in multiple generative tasks,\nespecially those requiring cognitive and discrimination abilities.\nAdditionally, our method surpasses discriminative benchmarks in interleaved and\nfine-grained retrieval tasks. By employing a retrieval-augmented generation\nstrategy, our approach further enhances performance in some generative tasks\nwithin one model, offering a promising direction for future research in\nvision-language modeling.\n","authors":["Wei Chow","Juncheng Li","Qifan Yu","Kaihang Pan","Hao Fei","Zhiqi Ge","Shuai Yang","Siliang Tang","Hanwang Zhang","Qianru Sun"],"pdf_url":"https://arxiv.org/pdf/2411.00304v1.pdf","comment":null}]},"2024-10-31T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2411.00262v1","updated":"2024-10-31T23:41:09Z","published":"2024-10-31T23:41:09Z","title":"Content Aware Analysis of Scholarly Networks: A Case Study on CORD19\n Dataset","summary":" This paper investigates the relationships among key elements of scientific\nresearch network, namely articles, researchers, and journals. We introduce a\nnovel approach to use semantic information through the HITS algorithm based\npropagation of topic information in the network. The topic information is\nderived by using the Named Entity Recognition and Entity Linkage. 
In our case,\nMedCAT is used to extract the topics from the CORD19 Dataset, which is a corpus\nof academic articles about COVID-19 and coronavirus scientific network. Our\napproach focuses on the COVID-19 domain, utilizing the CORD-19 dataset to\ndemonstrate the efficacy of integrating topic-related information within the\ncitation framework. Through the application of a hybrid HITS algorithm, we show\nthat incorporating topic data significantly influences article rankings,\nrevealing deeper insights into the structure of the academic community.\n","authors":["Mehmet Emre Akbulut","Yusuf Erdem Nacar"],"pdf_url":"https://arxiv.org/pdf/2411.00262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12036v2","updated":"2024-10-31T23:08:03Z","published":"2024-08-21T23:42:06Z","title":"Reasoning and Tools for Human-Level Forecasting","summary":" Language models (LMs) trained on web-scale datasets are largely successful\ndue to their ability to memorize large amounts of training data, even if only\npresent in a few examples. These capabilities are often desirable in evaluation\non tasks such as question answering but raise questions about whether these\nmodels can exhibit genuine reasoning or succeed only at mimicking patterns from\nthe training data. This distinction is particularly salient in forecasting\ntasks, where the answer is not present in the training data, and the model must\nreason to make logical deductions. We present Reasoning and Tools for\nForecasting (RTF), a framework of reasoning-and-acting (ReAct) agents that can\ndynamically retrieve updated information and run numerical simulation with\nequipped tools. We evaluate our model with questions from competitive\nforecasting platforms and demonstrate that our method is competitive with and\ncan outperform human predictions. This suggests that LMs, with the right tools,\ncan indeed think and adapt like humans, offering valuable insights for\nreal-world decision-making.\n","authors":["Elvis Hsieh","Preston Fu","Jonathan Chen"],"pdf_url":"https://arxiv.org/pdf/2408.12036v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14094v2","updated":"2024-10-31T21:26:01Z","published":"2024-07-19T07:58:26Z","title":"User-Creator Feature Polarization in Recommender Systems with Dual\n Influence","summary":" Recommender systems serve the dual purpose of presenting relevant content to\nusers and helping content creators reach their target audience. The dual nature\nof these systems naturally influences both users and creators: users'\npreferences are affected by the items they are recommended, while creators may\nbe incentivized to alter their content to attract more users. We define a\nmodel, called user-creator feature dynamics, to capture the dual influence of\nrecommender systems. We prove that a recommender system with dual influence is\nguaranteed to polarize, causing diversity loss in the system. We then\ninvestigate, both theoretically and empirically, approaches for mitigating\npolarization and promoting diversity in recommender systems. 
Unexpectedly, we\nfind that common diversity-promoting approaches do not work in the presence of\ndual influence, while relevancy-optimizing methods like top-$k$ truncation can\nprevent polarization and improve diversity of the system.\n","authors":["Tao Lin","Kun Jin","Andrew Estornell","Xiaoying Zhang","Yiling Chen","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2407.14094v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2407.08571v2","updated":"2024-10-31T20:30:51Z","published":"2024-07-11T14:59:17Z","title":"Multi-Group Proportional Representation in Retrieval","summary":" Image search and retrieval tasks can perpetuate harmful stereotypes, erase\ncultural identities, and amplify social disparities. Current approaches to\nmitigate these representational harms balance the number of retrieved items\nacross population groups defined by a small number of (often binary)\nattributes. However, most existing methods overlook intersectional groups\ndetermined by combinations of group attributes, such as gender, race, and\nethnicity. We introduce Multi-Group Proportional Representation (MPR), a novel\nmetric that measures representation across intersectional groups. We develop\npractical methods for estimating MPR, provide theoretical guarantees, and\npropose optimization algorithms to ensure MPR in retrieval. We demonstrate that\nexisting methods optimizing for equal and proportional representation metrics\nmay fail to promote MPR. Crucially, our work shows that optimizing MPR yields\nmore proportional representation across multiple intersectional groups\nspecified by a rich function class, often with minimal compromise in retrieval\naccuracy.\n","authors":["Alex Oesterling","Claudio Mayrink Verdun","Carol Xuan Long","Alexander Glynn","Lucas Monteiro Paes","Sajani Vithana","Martina Cardone","Flavio P. Calmon"],"pdf_url":"https://arxiv.org/pdf/2407.08571v2.pdf","comment":"48 pages, 33 figures. Accepted as poster at NeurIPS 2024. Code can be\n found at\n https://github.com/alex-oesterling/multigroup-proportional-representation"},{"id":"http://arxiv.org/abs/2411.00188v1","updated":"2024-10-31T20:15:14Z","published":"2024-10-31T20:15:14Z","title":"Building Multi-Agent Copilot towards Autonomous Agricultural Data\n Management and Analysis","summary":" Current agricultural data management and analysis paradigms are to a large\nextent traditional, in which data collecting, curating, integration, loading,\nstoring, sharing and analyzing still involve too much human effort and\nknow-how. The experts, researchers and the farm operators need to understand\nthe data and the whole process of the data management pipeline to make full use of\nthe data. The essential problem of the traditional paradigm is the lack of a\nlayer of orchestrational intelligence which can understand, organize and\ncoordinate the data processing utilities to maximize data management and\nanalysis outcome. The emerging reasoning and tool-mastering abilities of large\nlanguage models (LLMs) make them a potentially good fit for this role, which\nfacilitates a shift from the traditional user-driven paradigm to an AI-driven paradigm.\nIn this paper, we propose and explore the idea of an LLM-based copilot for\nautonomous agricultural data management and analysis. 
Based on our previously\ndeveloped platform of Agricultural Data Management and Analytics (ADMA), we\nbuild a proof-of-concept multi-agent system called ADMA Copilot, which can\nunderstand the user's intent, make plans for the data processing pipeline and\naccomplish tasks automatically, in which three agents (an LLM-based\ncontroller, an input formatter and an output formatter) collaborate.\nDifferent from existing LLM-based solutions, by defining a meta-program graph,\nour work decouples control flow and data flow to enhance the predictability of\nthe behaviour of the agents. Experiments demonstrate the intelligence,\nautonomy, efficacy, efficiency, extensibility, flexibility and privacy of our\nsystem. A comparison is also made between our system and existing ones to show its\nsuperiority and potential.\n","authors":["Yu Pan","Jianxin Sun","Hongfeng Yu","Joe Luck","Geng Bai","Nipuna Chamara","Yufeng Ge","Tala Awada"],"pdf_url":"https://arxiv.org/pdf/2411.00188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19302v3","updated":"2024-10-31T19:37:07Z","published":"2024-03-28T10:40:22Z","title":"Generating Multi-Aspect Queries for Conversational Search","summary":" Conversational information seeking (CIS) systems aim to model the user's\ninformation need within the conversational context and retrieve the relevant\ninformation. One major approach to modeling the conversational context aims to\nrewrite the user utterance in the conversation to represent the information\nneed independently. Recent work has shown the benefit of expanding the\nrewritten utterance with relevant terms. In this work, we hypothesize that\nbreaking down the information of an utterance into multi-aspect rewritten\nqueries can lead to more effective retrieval performance. This is more evident\nin more complex utterances that require gathering evidence from various\ninformation sources, where a single query rewrite or query representation\ncannot capture the complexity of the utterance. To test this hypothesis, we\nconduct extensive experiments on five widely used CIS datasets where we\nleverage LLMs to generate multi-aspect queries to represent the information\nneed for each utterance in multiple query rewrites. We show that, for most of\nthe utterances, the same retrieval model would perform better with more than\none rewritten query by 85% in terms of nDCG@3. We further propose a\nmulti-aspect query generation and retrieval framework, called MQ4CS. Our\nextensive experiments show that MQ4CS outperforms the state-of-the-art query\nrewriting methods. We make our code and our new dataset of generated\nmulti-aspect queries publicly available.\n","authors":["Zahra Abbasiantaeb","Simon Lupart","Mohammad Aliannejadi"],"pdf_url":"https://arxiv.org/pdf/2403.19302v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00163v1","updated":"2024-10-31T19:11:26Z","published":"2024-10-31T19:11:26Z","title":"PSL: Rethinking and Improving Softmax Loss from Pairwise Perspective for\n Recommendation","summary":" Softmax Loss (SL) is widely applied in recommender systems (RS) and has\ndemonstrated effectiveness. This work analyzes SL from a pairwise perspective,\nrevealing two significant limitations: 1) the relationship between SL and\nconventional ranking metrics like DCG is not sufficiently tight; 2) SL is\nhighly sensitive to false negative instances. Our analysis indicates that these\nlimitations are primarily due to the use of the exponential function. 
To\naddress these issues, this work extends SL to a new family of loss functions,\ntermed Pairwise Softmax Loss (PSL), which replaces the exponential function in\nSL with other appropriate activation functions. While the revision is minimal,\nwe highlight three merits of PSL: 1) it serves as a tighter surrogate for DCG\nwith suitable activation functions; 2) it better balances data contributions;\nand 3) it acts as a specific BPR loss enhanced by Distributionally Robust\nOptimization (DRO). We further validate the effectiveness and robustness of PSL\nthrough empirical experiments. The code is available at\nhttps://github.com/Tiny-Snow/IR-Benchmark.\n","authors":["Weiqin Yang","Jiawei Chen","Xin Xin","Sheng Zhou","Binbin Hu","Yan Feng","Chun Chen","Can Wang"],"pdf_url":"https://arxiv.org/pdf/2411.00163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13959v2","updated":"2024-10-31T18:38:37Z","published":"2024-10-17T18:34:43Z","title":"FinQAPT: Empowering Financial Decisions with End-to-End LLM-driven\n Question Answering Pipeline","summary":" Financial decision-making hinges on the analysis of relevant information\nembedded in the enormous volume of documents in the financial domain. To\naddress this challenge, we developed FinQAPT, an end-to-end pipeline that\nstreamlines the identification of relevant financial reports based on a query,\nextracts pertinent context, and leverages Large Language Models (LLMs) to\nperform downstream tasks. To evaluate the pipeline, we experimented with\nvarious techniques to optimize the performance of each module using the FinQA\ndataset. We introduced a novel clustering-based negative sampling technique to\nenhance context extraction and a novel prompting method called Dynamic N-shot\nPrompting to boost the numerical question-answering capabilities of LLMs. At\nthe module level, we achieved state-of-the-art accuracy on FinQA, attaining an\naccuracy of 80.6%. However, at the pipeline level, we observed decreased\nperformance due to challenges in extracting relevant context from financial\nreports. We conducted a detailed error analysis of each module and the\nend-to-end pipeline, pinpointing specific challenges that must be addressed to\ndevelop a robust solution for handling complex financial tasks.\n","authors":["Kuldeep Singh","Simerjot Kaur","Charese Smiley"],"pdf_url":"https://arxiv.org/pdf/2410.13959v2.pdf","comment":"Accepted in ICAIF 2024, 8 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2411.00137v1","updated":"2024-10-31T18:35:03Z","published":"2024-10-31T18:35:03Z","title":"Cost-Aware Query Policies in Active Learning for Efficient Autonomous\n Robotic Exploration","summary":" In missions constrained by finite resources, efficient data collection is\ncritical. Informative path planning, driven by automated decision-making,\noptimizes exploration by reducing the costs associated with accurate\ncharacterization of a target in an environment. Previous implementations of\nactive learning did not consider the action cost for regression problems or\nonly considered the action cost for classification problems. This paper\nanalyzes an AL algorithm for Gaussian Process regression while incorporating\naction cost. The algorithm's performance is compared on various regression\nproblems to include terrain mapping on diverse simulated surfaces along metrics\nof root mean square error, samples and distance until convergence, and model\nvariance upon convergence. 
The cost-dependent acquisition policy doesn't\norganically optimize information gain over distance. Instead, the traditional\nuncertainty metric with a distance constraint best minimizes root-mean-square\nerror over trajectory distance. This study's impact is to provide insight into\nincorporating action cost with AL methods to optimize exploration under\nrealistic mission constraints.\n","authors":["Sapphira Akins","Hans Mertens","Frances Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.00137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24200v1","updated":"2024-10-31T17:55:36Z","published":"2024-10-31T17:55:36Z","title":"Length-Induced Embedding Collapse in Transformer-based Models","summary":" Text embeddings enable various applications, but their performance\ndeteriorates on longer texts. In this paper, we find that the performance\ndegradation is due to a phenomenon called Length Collapse, where longer text\nembeddings collapse into a narrow space. This collapse results in a\ndistributional inconsistency between embeddings of different text lengths,\nultimately hurting the performance of downstream tasks. Theoretically, by\nconsidering that the self-attention mechanism inherently functions as a low-pass\nfilter, we prove that long sequences increase the attenuation rate of the\nlow-pass filter effect of the self-attention mechanism. With layers going\ndeeper, excessive low-pass filtering causes the token signals to retain only\ntheir Direct-Current (DC) component, which means the input token feature maps\nwill collapse into a narrow space, especially in long texts. Based on the above\nanalysis, we propose to mitigate the undesirable length collapse limitation by\nintroducing a temperature in softmax(), which achieves a higher low-filter\nattenuation rate. The tuning-free method, called TempScale, can be plugged into\nmultiple transformer-based embedding models. Empirically, we demonstrate that\nTempScale can improve existing embedding models, especially on long text\ninputs, bringing up to 0.53% performance gains on 40 datasets from Massive Text\nEmbedding Benchmark (MTEB) and 0.82% performance gains on 4 datasets from\nLongEmbed, which specifically focuses on long context retrieval.\n","authors":["Yuqi Zhou","Sunhao Dai","Zhanshuo Cao","Xiao Zhang","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2410.24200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19073v2","updated":"2024-10-31T15:58:23Z","published":"2024-05-29T13:31:12Z","title":"An engine not a camera: Measuring performative power of online search","summary":" The power of digital platforms is at the center of major ongoing policy and\nregulatory efforts. To advance existing debates, we designed and executed an\nexperiment to measure the performative power of online search providers.\nInstantiated in our setting, performative power quantifies the ability of a\nsearch engine to steer web traffic by rearranging results. To operationalize\nthis definition we developed a browser extension that performs unassuming\nrandomized experiments in the background. These randomized experiments emulate\nupdates to the search algorithm and identify the causal effect of different\ncontent arrangements on clicks. Analyzing tens of thousands of clicks, we\ndiscuss what our robust quantitative findings say about the power of online\nsearch engines, using the Google Shopping antitrust investigation as a case\nstudy. 
More broadly, we envision our work to serve as a blueprint for how the\nrecent definition of performative power can help integrate quantitative\ninsights from online experiments with future investigations into the economic\npower of digital platforms.\n","authors":["Celestine Mendler-Dünner","Gabriele Carovano","Moritz Hardt"],"pdf_url":"https://arxiv.org/pdf/2405.19073v2.pdf","comment":"to appear at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23879v1","updated":"2024-10-31T12:40:38Z","published":"2024-10-31T12:40:38Z","title":"Investigating Bias in Political Search Query Suggestions by Relative\n Comparison with LLMs","summary":" Search query suggestions affect users' interactions with search engines,\nwhich then influences the information they encounter. Thus, bias in search\nquery suggestions can lead to exposure to biased search results and can impact\nopinion formation. This is especially critical in the political domain.\nDetecting and quantifying bias in web search engines is difficult due to its\ntopic dependency, complexity, and subjectivity. The lack of context and\nphrasality of query suggestions emphasizes this problem. In a multi-step\napproach, we combine the benefits of large language models, pairwise\ncomparison, and Elo-based scoring to identify and quantify bias in English\nsearch query suggestions. We apply our approach to the U.S. political news\ndomain and compare bias in Google and Bing.\n","authors":["Fabian Haak","Björn Engelmann","Christin Katharina Kreutz","Philipp Schaer"],"pdf_url":"https://arxiv.org/pdf/2410.23879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23851v1","updated":"2024-10-31T12:01:51Z","published":"2024-10-31T12:01:51Z","title":"Leveraging Large Language Models for Medical Information Extraction and\n Query Generation","summary":" This paper introduces a system that integrates large language models (LLMs)\ninto the clinical trial retrieval process, enhancing the effectiveness of\nmatching patients with eligible trials while maintaining information privacy\nand allowing expert oversight. We evaluate six LLMs for query generation,\nfocusing on open-source and relatively small models that require minimal\ncomputational resources. Our evaluation includes two closed-source and four\nopen-source models, with one specifically trained in the medical field and five\ngeneral-purpose models. We compare the retrieval effectiveness achieved by\nLLM-generated queries against those created by medical experts and\nstate-of-the-art methods from the literature. Our findings indicate that the\nevaluated models reach retrieval effectiveness on par with or greater than\nexpert-created queries. The LLMs consistently outperform standard baselines and\nother approaches in the literature. The best performing LLMs exhibit fast\nresponse times, ranging from 1.7 to 8 seconds, and generate a manageable number\nof query terms (15-63 on average), making them suitable for practical\nimplementation. 
Our overall findings suggest that leveraging small, open-source\nLLMs for clinical trial retrieval can balance performance, computational\nefficiency, and real-world applicability in medical settings.\n","authors":["Georgios Peikos","Pranav Kasela","Gabriella Pasi"],"pdf_url":"https://arxiv.org/pdf/2410.23851v1.pdf","comment":"Accepted in WI-IAT '24"},{"id":"http://arxiv.org/abs/2410.23842v1","updated":"2024-10-31T11:49:16Z","published":"2024-10-31T11:49:16Z","title":"Auditing Google's Search Algorithm: Measuring News Diversity Across\n Brazil, the UK, and the US","summary":" This study examines the influence of Google's search algorithm on news\ndiversity by analyzing search results in Brazil, the UK, and the US. It\nexplores how Google's system preferentially favors a limited number of news\noutlets. Utilizing algorithm auditing techniques, the research measures source\nconcentration with the Herfindahl-Hirschman Index (HHI) and Gini coefficient,\nrevealing significant concentration trends. The study underscores the\nimportance of conducting horizontal analyses across multiple search queries, as\nfocusing solely on individual results pages may obscure these patterns. Factors\nsuch as popularity, political bias, and recency were evaluated for their impact\non news rankings. Findings indicate a slight leftward bias in search outcomes\nand a preference for popular, often national outlets. This bias, combined with\na tendency to prioritize recent content, suggests that Google's algorithm may\nreinforce existing media inequalities. By analyzing the largest dataset to date\n-- 221,863 search results -- this research provides comprehensive, longitudinal\ninsights into how algorithms shape public access to diverse news sources.\n","authors":["Raphael Hernandes","Giulio Corsi"],"pdf_url":"https://arxiv.org/pdf/2410.23842v1.pdf","comment":"21 pages, 3 figures, 7 tables"},{"id":"http://arxiv.org/abs/2410.23841v1","updated":"2024-10-31T11:47:21Z","published":"2024-10-31T11:47:21Z","title":"Beyond Content Relevance: Evaluating Instruction Following in Retrieval\n Models","summary":" Instruction-following capabilities in large language models (LLMs) have\nsignificantly progressed, enabling more complex user interactions through\ndetailed prompts. However, retrieval systems have not matched these advances;\nmost of them still rely on traditional lexical and semantic matching\ntechniques that fail to fully capture user intent. Recent efforts have\nintroduced instruction-aware retrieval models, but these primarily focus on\nintrinsic content relevance, which neglects the importance of customized\npreferences for broader document-level attributes. This study evaluates the\ninstruction-following capabilities of various retrieval models beyond content\nrelevance, including LLM-based dense retrieval and reranking models. We develop\nInfoSearch, a novel retrieval evaluation benchmark spanning six document-level\nattributes: Audience, Keyword, Format, Language, Length, and Source, and\nintroduce novel metrics -- Strict Instruction Compliance Ratio (SICR) and\nWeighted Instruction Sensitivity Evaluation (WISE) to accurately assess the\nmodels' responsiveness to instructions. Our findings reveal that while\nreranking models generally surpass retrieval models in instruction following,\nthey still face challenges in handling certain attributes. 
Moreover, although\ninstruction fine-tuning and increased model size lead to better performance,\nmost models fall short of achieving comprehensive instruction compliance as\nassessed by our benchmark.\n","authors":["Jianqun Zhou","Yuanlei Zheng","Wei Chen","Qianqian Zheng","Zeyuan Shang","Wei Zhang","Rui Meng","Xiaoyu Shen"],"pdf_url":"https://arxiv.org/pdf/2410.23841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21488v2","updated":"2024-10-31T11:45:00Z","published":"2024-07-31T09:52:53Z","title":"Breaking the Hourglass Phenomenon of Residual Quantization: Enhancing\n the Upper Bound of Generative Retrieval","summary":" Generative retrieval (GR) has emerged as a transformative paradigm in search\nand recommender systems, leveraging numeric-based identifier representations to\nenhance efficiency and generalization. Notably, methods like TIGER employing\nResidual Quantization-based Semantic Identifiers (RQ-SID), have shown\nsignificant promise in e-commerce scenarios by effectively managing item IDs.\nHowever, a critical issue termed the \"\\textbf{Hourglass}\" phenomenon, occurs in\nRQ-SID, where intermediate codebook tokens become overly concentrated,\nhindering the full utilization of generative retrieval methods. This paper\nanalyses and addresses this problem by identifying data sparsity and\nlong-tailed distribution as the primary causes. Through comprehensive\nexperiments and detailed ablation studies, we analyze the impact of these\nfactors on codebook utilization and data distribution. Our findings reveal that\nthe \"Hourglass\" phenomenon substantially impacts the performance of RQ-SID in\ngenerative retrieval. We propose effective solutions to mitigate this issue,\nthereby significantly enhancing the effectiveness of generative retrieval in\nreal-world E-commerce applications.\n","authors":["Zhirui Kuai","Zuxu Chen","Huimu Wang","Mingming Li","Dadong Miao","Binbin Wang","Xusong Chen","Li Kuang","Yuxing Han","Jiaxing Wang","Guoyu Tang","Lin Liu","Songlin Wang","Jingwei Zhuo"],"pdf_url":"https://arxiv.org/pdf/2407.21488v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15187v2","updated":"2024-10-31T10:10:28Z","published":"2024-06-21T14:29:39Z","title":"UDA: A Benchmark Suite for Retrieval Augmented Generation in Real-world\n Document Analysis","summary":" The use of Retrieval-Augmented Generation (RAG) has improved Large Language\nModels (LLMs) in collaborating with external data, yet significant challenges\nexist in real-world scenarios. In areas such as academic literature and finance\nquestion answering, data are often found in raw text and tables in HTML or PDF\nformats, which can be lengthy and highly unstructured. In this paper, we\nintroduce a benchmark suite, namely Unstructured Document Analysis (UDA), that\ninvolves 2,965 real-world documents and 29,590 expert-annotated Q&A pairs. We\nrevisit popular LLM- and RAG-based solutions for document analysis and evaluate\nthe design choices and answer qualities across multiple document domains and\ndiverse query types. Our evaluation yields interesting findings and highlights\nthe importance of data parsing and retrieval. We hope our benchmark can shed\nlight and better serve real-world document analysis applications. 
The benchmark\nsuite and code can be found at https://github.com/qinchuanhui/UDA-Benchmark.\n","authors":["Yulong Hui","Yao Lu","Huanchen Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.15187v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23757v1","updated":"2024-10-31T09:24:22Z","published":"2024-10-31T09:24:22Z","title":"Identify Then Recommend: Towards Unsupervised Group Recommendation","summary":" Group Recommendation (GR), which aims to recommend items to groups of users,\nhas become a promising and practical direction for recommendation systems. This\npaper points out two issues of the state-of-the-art GR models. (1) The\npre-defined and fixed number of user groups is inadequate for real-time\nindustrial recommendation systems, where the group distribution can shift\ndynamically. (2) The training schema of existing GR methods is supervised,\nnecessitating expensive user-group and group-item labels, leading to\nsignificant annotation costs. To this end, we present a novel unsupervised\ngroup recommendation framework named \\underline{I}dentify \\underline{T}hen\n\\underline{R}ecommend (\\underline{ITR}), where it first identifies the user\ngroups in an unsupervised manner even without the pre-defined number of groups,\nand then two pre-text tasks are designed to conduct self-supervised group\nrecommendation. Concretely, at the group identification stage, we first\nestimate the adaptive density of each user point, where areas with higher\ndensities are more likely to be recognized as group centers. Then, a heuristic\nmerge-and-split strategy is designed to discover the user groups and decision\nboundaries. Subsequently, at the self-supervised learning stage, the\npull-and-repulsion pre-text task is proposed to optimize the user-group\ndistribution. Besides, the pseudo group recommendation pre-text task is\ndesigned to assist the recommendations. Extensive experiments demonstrate the\nsuperiority and effectiveness of ITR on both user recommendation (e.g., 22.22\\%\nNDCG@5 $\\uparrow$) and group recommendation (e.g., 22.95\\% NDCG@5 $\\uparrow$).\nFurthermore, we deploy ITR on the industrial recommender and achieve promising\nresults.\n","authors":["Yue Liu","Shihao Zhu","Tianyuan Yang","Jian Ma","Wenliang Zhong"],"pdf_url":"https://arxiv.org/pdf/2410.23757v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2401.05975v4","updated":"2024-10-31T09:14:56Z","published":"2024-01-11T15:22:55Z","title":"End-to-end Learnable Clustering for Intent Learning in Recommendation","summary":" Intent learning, which aims to learn users' intents for user understanding\nand item recommendation, has become a hot research spot in recent years.\nHowever, existing methods suffer from complex and cumbersome alternating\noptimization, limiting performance and scalability. To this end, we propose a\nnovel intent learning method termed \\underline{ELCRec}, by unifying behavior\nrepresentation learning into an \\underline{E}nd-to-end \\underline{L}earnable\n\\underline{C}lustering framework, for effective and efficient\n\\underline{Rec}ommendation. Concretely, we encode user behavior sequences and\ninitialize the cluster centers (latent intents) as learnable neurons. Then, we\ndesign a novel learnable clustering module to separate different cluster\ncenters, thus decoupling users' complex intents. Meanwhile, it guides the\nnetwork to learn intents from behaviors by forcing behavior embeddings close to\ncluster centers. 
This allows simultaneous optimization of recommendation and\nclustering via mini-batch data. Moreover, we propose intent-assisted\ncontrastive learning by using cluster centers as self-supervision signals,\nfurther enhancing mutual promotion. Both experimental results and theoretical\nanalyses demonstrate the superiority of ELCRec from six perspectives. Compared\nto the runner-up, ELCRec improves NDCG@5 by 8.9\\% and reduces computational\ncosts by 22.5\\% on the Beauty dataset. Furthermore, due to the scalability and\nuniversal applicability, we deploy this method on the industrial recommendation\nsystem with 130 million page views and achieve promising results. The codes are\navailable on GitHub (https://github.com/yueliu1999/ELCRec). A collection\n(papers, codes, datasets) of deep group recommendation/intent learning methods\nis available on GitHub\n(https://github.com/yueliu1999/Awesome-Deep-Group-Recommendation).\n","authors":["Yue Liu","Shihao Zhu","Jun Xia","Yingwei Ma","Jian Ma","Xinwang Liu","Shengju Yu","Kejun Zhang","Wenliang Zhong"],"pdf_url":"https://arxiv.org/pdf/2401.05975v4.pdf","comment":"37 pages"},{"id":"http://arxiv.org/abs/2410.23736v1","updated":"2024-10-31T08:49:05Z","published":"2024-10-31T08:49:05Z","title":"MoTaDual: Modality-Task Dual Alignment for Enhanced Zero-shot Composed\n Image Retrieval","summary":" Composed Image Retrieval (CIR) is a challenging vision-language task,\nutilizing bi-modal (image+text) queries to retrieve target images. Despite the\nimpressive performance of supervised CIR, the dependence on costly,\nmanually-labeled triplets limits its scalability and zero-shot capability. To\naddress this issue, zero-shot composed image retrieval (ZS-CIR) is presented\nalong with projection-based approaches. However, such methods face two major\nproblems, i.e., task discrepancy between pre-training (image $\\leftrightarrow$\ntext) and inference (image+text $\\rightarrow$ image), and modality discrepancy.\nThe latter pertains to approaches based on text-only projection training due to\nthe necessity of feature extraction from the reference image during inference.\nIn this paper, we propose a two-stage framework to tackle both discrepancies.\nFirst, to ensure efficiency and scalability, a textual inversion network is\npre-trained on large-scale caption datasets. Subsequently, we put forward\nModality-Task Dual Alignment (MoTaDual) as the second stage, where\nlarge-language models (LLMs) generate triplet data for fine-tuning, and\nadditionally, prompt learning is introduced in a multi-modal context to\neffectively alleviate both modality and task discrepancies. The experimental\nresults show that our MoTaDual achieves the state-of-the-art performance across\nfour widely used ZS-CIR benchmarks, while maintaining low training time and\ncomputational cost. The code will be released soon.\n","authors":["Haiwen Li","Fei Su","Zhicheng Zhao"],"pdf_url":"https://arxiv.org/pdf/2410.23736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00072v5","updated":"2024-10-31T08:35:42Z","published":"2024-06-21T08:52:11Z","title":"Pistis-RAG: Enhancing Retrieval-Augmented Generation with Human Feedback","summary":" RAG systems face limitations when semantic relevance alone does not guarantee\nimproved generation quality. This issue becomes particularly evident due to the\nsensitivity of large language models (LLMs) to the ordering of few-shot\nprompts, which can affect model performance. 
To address this challenge,\naligning LLM outputs with human preferences using structured feedback, such as\noptions to copy, regenerate, or dislike, offers a promising method for\nimprovement. This feedback is applied to the entire list of inputs rather than\ngiving specific ratings for individual documents, making it a Listwide Labels\nLearning-to-Rank task.\n To address this task, we propose Pistis-RAG, a new RAG framework designed\nwith a content-centric approach to better align LLMs with human preferences.\nPistis-RAG effectively utilizes human feedback, enhancing content ranking and\ngeneration quality. To validate our framework, we use public datasets to\nsimulate human feedback, allowing us to evaluate and refine our method\neffectively. Experimental results indicate that Pistis-RAG improves alignment\nwith human preferences relative to the baseline RAG system, showing a 6.06%\nincrease in MMLU (English) and a 7.08% increase in C-EVAL (Chinese) accuracy\nmetrics. These results highlight Pistis-RAG's effectiveness in overcoming the\nlimitations associated with traditional RAG approaches.\n","authors":["Yu Bai","Yukai Miao","Li Chen","Dawei Wang","Dan Li","Yanyu Ren","Hongtao Xie","Ce Yang","Xuhui Cai"],"pdf_url":"https://arxiv.org/pdf/2407.00072v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23715v1","updated":"2024-10-31T08:03:13Z","published":"2024-10-31T08:03:13Z","title":"Towards Cross-Modal Text-Molecule Retrieval with Better Modality\n Alignment","summary":" Cross-modal text-molecule retrieval model aims to learn a shared feature\nspace of the text and molecule modalities for accurate similarity calculation,\nwhich facilitates the rapid screening of molecules with specific properties and\nactivities in drug design. However, previous works have two main defects.\nFirst, they are inadequate in capturing modality-shared features considering\nthe significant gap between text sequences and molecule graphs. Second, they\nmainly rely on contrastive learning and adversarial training for cross-modality\nalignment, both of which mainly focus on the first-order similarity, ignoring\nthe second-order similarity that can capture more structural information in the\nembedding space. To address these issues, we propose a novel cross-modal\ntext-molecule retrieval model with two-fold improvements. Specifically, on the\ntop of two modality-specific encoders, we stack a memory bank based feature\nprojector that contain learnable memory vectors to extract modality-shared\nfeatures better. More importantly, during the model training, we calculate four\nkinds of similarity distributions (text-to-text, text-to-molecule,\nmolecule-to-molecule, and molecule-to-text similarity distributions) for each\ninstance, and then minimize the distance between these similarity distributions\n(namely second-order similarity losses) to enhance cross-modal alignment.\nExperimental results and analysis strongly demonstrate the effectiveness of our\nmodel. 
Particularly, our model achieves SOTA performance, outperforming the\npreviously-reported best result by 6.4%.\n","authors":["Jia Song","Wanru Zhuang","Yujie Lin","Liang Zhang","Chunyan Li","Jinsong Su","Song He","Xiaochen Bo"],"pdf_url":"https://arxiv.org/pdf/2410.23715v1.pdf","comment":"BIBM 2024 regular paper"},{"id":"http://arxiv.org/abs/2408.11611v3","updated":"2024-10-31T07:09:38Z","published":"2024-08-21T13:39:21Z","title":"DTN: Deep Multiple Task-specific Feature Interactions Network for\n Multi-Task Recommendation","summary":" Neural-based multi-task learning (MTL) has been successfully applied to many\nrecommendation applications. However, these MTL models (e.g., MMoE, PLE) did\nnot consider feature interaction during the optimization, which is crucial for\ncapturing complex high-order features and has been widely used in ranking\nmodels for real-world recommender systems. Moreover, through feature importance\nanalysis across various tasks in MTL, we have observed an interesting\ndivergence phenomenon that the same feature can have significantly different\nimportance across different tasks in MTL. To address these issues, we propose\nDeep Multiple Task-specific Feature Interactions Network (DTN) with a novel\nmodel structure design. DTN introduces multiple diversified task-specific\nfeature interaction methods and task-sensitive network in MTL networks,\nenabling the model to learn task-specific diversified feature interaction\nrepresentations, which improves the efficiency of joint representation learning\nin a general setup. We applied DTN to our company's real-world E-commerce\nrecommendation dataset, which consisted of over 6.3 billion samples; the\nresults demonstrated that DTN significantly outperformed state-of-the-art MTL\nmodels. Moreover, during online evaluation of DTN in a large-scale E-commerce\nrecommender system, we observed a 3.28% increase in clicks, a 3.10% increase in orders\nand a 2.70% increase in GMV (Gross Merchandise Value) compared to the\nstate-of-the-art MTL models. Finally, extensive offline experiments conducted\non public benchmark datasets demonstrate that DTN can be applied to various\nscenarios beyond recommendations, enhancing the performance of ranking models.\n","authors":["Yaowen Bi","Yuteng Lian","Jie Cui","Jun Liu","Peijian Wang","Guanghui Li","Xuejun Chen","Jinglin Zhao","Hao Wen","Jing Zhang","Zhaoqi Zhang","Wenzhuo Song","Yang Sun","Weiwei Zhang","Mingchen Cai","Jian Dong","Guanxing Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11611v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11821v3","updated":"2024-10-31T05:19:58Z","published":"2024-02-19T04:29:45Z","title":"Microstructures and Accuracy of Graph Recall by Large Language Models","summary":" Graph data is crucial for many applications, and much of it exists in the\nrelations described in textual format. As a result, being able to accurately\nrecall and encode a graph described in earlier text is a basic yet pivotal\nability that LLMs need to demonstrate if they are to perform reasoning tasks\nthat involve graph-structured information. Human performance at graph recall\nhas been studied by cognitive scientists for decades, and has been found to\noften exhibit certain structural patterns of bias that align with human\nhandling of social relationships. To date, however, we know little about how\nLLMs behave in analogous graph recall tasks: do their recalled graphs also\nexhibit certain biased patterns, and if so, how do they compare with humans and\naffect other graph reasoning tasks? 
In this work, we perform the first\nsystematical study of graph recall by LLMs, investigating the accuracy and\nbiased microstructures (local structural patterns) in their recall. We find\nthat LLMs not only underperform often in graph recall, but also tend to favor\nmore triangles and alternating 2-paths. Moreover, we find that more advanced\nLLMs have a striking dependence on the domain that a real-world graph comes\nfrom -- by yielding the best recall accuracy when the graph is narrated in a\nlanguage style consistent with its original domain.\n","authors":["Yanbang Wang","Hejie Cui","Jon Kleinberg"],"pdf_url":"https://arxiv.org/pdf/2402.11821v3.pdf","comment":"Accepted at NeurIPS 2024; Code available at:\n https://github.com/Abel0828/llm-graph-recall"},{"id":"http://arxiv.org/abs/2307.02147v4","updated":"2024-10-31T02:54:38Z","published":"2023-07-05T09:42:51Z","title":"Recommendation Unlearning via Influence Function","summary":" Recommendation unlearning is an emerging task to serve users for erasing\nunusable data (e.g., some historical behaviors) from a well-trained recommender\nmodel. Existing methods process unlearning requests by fully or partially\nretraining the model after removing the unusable data. However, these methods\nare impractical due to the high computation cost of full retraining and the\nhighly possible performance damage of partial training. In this light, a\ndesired recommendation unlearning method should obtain a similar model as full\nretraining in a more efficient manner, i.e., achieving complete, efficient and\nharmless unlearning.\n In this work, we propose a new Influence Function-based Recommendation\nUnlearning (IFRU) framework, which efficiently updates the model without\nretraining by estimating the influence of the unusable data on the model via\nthe influence function. In the light that recent recommender models use\nhistorical data for both the constructions of the optimization loss and the\ncomputational graph (e.g., neighborhood aggregation), IFRU jointly estimates\nthe direct influence of unusable data on optimization loss and the spillover\ninfluence on the computational graph to pursue complete unlearning.\nFurthermore, we propose an importance-based pruning algorithm to reduce the\ncost of the influence function. IFRU is harmless and applicable to mainstream\ndifferentiable models. Extensive experiments demonstrate that IFRU achieves\nmore than 250 times acceleration compared to retraining-based methods with\nrecommendation performance comparable to full retraining. Codes are avaiable at\nhttps://github.com/baiyimeng/IFRU.\n","authors":["Yang Zhang","Zhiyu Hu","Yimeng Bai","Jiancan Wu","Qifan Wang","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2307.02147v4.pdf","comment":"Accepted by ACM TORS"}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.23883v1","updated":"2024-10-31T12:45:54Z","published":"2024-10-31T12:45:54Z","title":"'No' Matters: Out-of-Distribution Detection in Multimodality Long\n Dialogue","summary":" Out-of-distribution (OOD) detection in multimodal contexts is essential for\nidentifying deviations in combined inputs from different modalities,\nparticularly in applications like open-domain dialogue systems or real-life\ndialogue interactions. This paper aims to improve the user experience that\ninvolves multi-round long dialogues by efficiently detecting OOD dialogues and\nimages. 
We introduce a novel scoring framework named Dialogue Image Aligning\nand Enhancing Framework (DIAEF) that integrates the visual language models with\nthe novel proposed scores that detect OOD in two key scenarios (1) mismatches\nbetween the dialogue and image input pair and (2) input pairs with previously\nunseen labels. Our experimental results, derived from various benchmarks,\ndemonstrate that integrating image and multi-round dialogue OOD detection is\nmore effective with previously unseen labels than using either modality\nindependently. In the presence of mismatched pairs, our proposed score\neffectively identifies these mismatches and demonstrates strong robustness in\nlong dialogues. This approach enhances domain-aware, adaptive conversational\nagents and establishes baselines for future studies.\n","authors":["Rena Gao","Xuetong Wu","Siwen Luo","Caren Han","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2410.23883v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2410.23861v1","updated":"2024-10-31T12:11:17Z","published":"2024-10-31T12:11:17Z","title":"Audio Is the Achilles' Heel: Red Teaming Audio Large Multimodal Models","summary":" Large Multimodal Models (LMMs) have demonstrated the ability to interact with\nhumans under real-world conditions by combining Large Language Models (LLMs)\nand modality encoders to align multimodal information (visual and auditory)\nwith text. However, such models raise new safety challenges of whether models\nthat are safety-aligned on text also exhibit consistent safeguards for\nmultimodal inputs. Despite recent safety-alignment research on vision LMMs, the\nsafety of audio LMMs remains under-explored. In this work, we comprehensively\nred team the safety of five advanced audio LMMs under three settings: (i)\nharmful questions in both audio and text formats, (ii) harmful questions in\ntext format accompanied by distracting non-speech audio, and (iii)\nspeech-specific jailbreaks. Our results under these settings demonstrate that\nopen-source audio LMMs suffer an average attack success rate of 69.14% on\nharmful audio questions, and exhibit safety vulnerabilities when distracted\nwith non-speech audio noise. Our speech-specific jailbreaks on Gemini-1.5-Pro\nachieve an attack success rate of 70.67% on the harmful query benchmark. We\nprovide insights on what could cause these reported safety-misalignments.\nWarning: this paper contains offensive examples.\n","authors":["Hao Yang","Lizhen Qu","Ehsan Shareghi","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2410.23861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23663v1","updated":"2024-10-31T06:26:00Z","published":"2024-10-31T06:26:00Z","title":"DIP: Diffusion Learning of Inconsistency Pattern for General DeepFake\n Detection","summary":" With the advancement of deepfake generation techniques, the importance of\ndeepfake detection in protecting multimedia content integrity has become\nincreasingly obvious. Recently, temporal inconsistency clues have been explored\nto improve the generalizability of deepfake video detection. According to our\nobservation, the temporal artifacts of forged videos in terms of motion\ninformation usually exhibits quite distinct inconsistency patterns along\nhorizontal and vertical directions, which could be leveraged to improve the\ngeneralizability of detectors. 
In this paper, a transformer-based framework for\nDiffusion Learning of Inconsistency Pattern (DIP) is proposed, which exploits\ndirectional inconsistencies for deepfake video detection. Specifically, DIP\nbegins with a spatiotemporal encoder to represent spatiotemporal information. A\ndirectional inconsistency decoder is adopted accordingly, where direction-aware\nattention and inconsistency diffusion are incorporated to explore potential\ninconsistency patterns and jointly learn the inherent relationships. In\naddition, the SpatioTemporal Invariant Loss (STI Loss) is introduced to\ncontrast spatiotemporally augmented sample pairs and prevent the model from\noverfitting nonessential forgery artifacts. Extensive experiments on several\npublic datasets demonstrate that our method could effectively identify\ndirectional forgery clues and achieve state-of-the-art performance.\n","authors":["Fan Nie","Jiangqun Ni","Jian Zhang","Bin Zhang","Weizhe Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.23663v1.pdf","comment":"13 pages, accepted with IEEE Trans. on Multimedia"},{"id":"http://arxiv.org/abs/2410.23230v2","updated":"2024-10-31T04:20:22Z","published":"2024-10-30T17:18:53Z","title":"Aligning Audio-Visual Joint Representations with an Agentic Workflow","summary":" Visual content and accompanied audio signals naturally formulate a joint\nrepresentation to improve audio-visual (AV) related applications. While studies\ndevelop various AV representation learning frameworks, the importance of AV\ndata alignment is usually undermined for achieving high-quality representation.\nWe observe that an audio signal may contain background noise interference.\nAlso, non-synchronization may appear between audio and video streams. These\nnon-strict data alignment limits representation quality and downgrade\napplication performance. In this paper, we propose to improve AV joint\nrepresentations from a data-centric perspective by aligning audio signals to\nvisual data. Our alignment is conducted in an agentic workflow controlled by an\nLLM-based assistant named AVAgent. For each input AV data pair, our AVAgent\nuses a multi-modal LLM to convert audio and visual data into language\ndescriptions separately (i.e., tool use). Then, AVAgent reasons whether this\npaired data is aligned well and plans to edit the audio signal if needed (i.e.,\nplanning). The audio editing is executed by predefined actions that filter\nnoise or augment data. Moreover, we use a VLM to evaluate how modified audio\nsignals match the visual content and provide feedback to AVAgent (i.e.,\nreflection). The tool use, planning, and reflection steps operate cyclically to\nbecome an agentic workflow where audio signals are gradually aligned to visual\ncontent. To this end, existing methods can directly leverage the aligned AV\ndata via our agentic workflow to improve AV joint representations. 
The\nexperimental results comprehensively demonstrate the state-of-the-art\nperformance of the proposed approach against previous baselines in diverse\ndownstream tasks.\n","authors":["Shentong Mo","Yibing Song"],"pdf_url":"https://arxiv.org/pdf/2410.23230v2.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + 
--summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + 
padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..c0226016 --- /dev/null +++ b/index.html @@ -0,0 +1,24173 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 89 + +
+
+
+ + ☆ Analyzing The Language of Visual Tokens + + +
+ With the introduction of transformer-based models for vision and language +tasks, such as LLaVA and Chameleon, there has been renewed interest in the +discrete tokenized representation of images. These models often treat image +patches as discrete tokens, analogous to words in natural language, learning +joint alignments between visual and human languages. However, little is known +about the statistical behavior of these visual languages - whether they follow +similar frequency distributions, grammatical structures, or topologies as +natural languages. In this paper, we take a natural-language-centric approach +to analyzing discrete visual languages and uncover striking similarities and +fundamental differences. We demonstrate that, although visual languages adhere +to Zipfian distributions, higher token innovation drives greater entropy and +lower compression, with tokens predominantly representing object parts, +indicating intermediate granularity. We also show that visual languages lack +cohesive grammatical structures, leading to higher perplexity and weaker +hierarchical organization compared to natural languages. Finally, we +demonstrate that, while vision models align more closely with natural languages +than other models, this alignment remains significantly weaker than the +cohesion found within natural languages. Through these experiments, we +demonstrate how understanding the statistical properties of discrete visual +languages can inform the design of more effective computer vision models. + +
+
+
+
+
+ + ☆ Needle Threading: Can LLMs Follow Threads through Near-Million-Scale + Haystacks? + + +
+ As the context limits of Large Language Models (LLMs) increase, the range of +possible applications and downstream functions broadens. In many real-world +tasks, decisions depend on details scattered across collections of often +disparate documents containing mostly irrelevant information. Long-context LLMs +appear well-suited to this form of complex information retrieval and reasoning, +which has traditionally proven costly and time-consuming. However, although the +development of longer context models has seen rapid gains in recent years, our +understanding of how effectively LLMs use their context has not kept pace. To +address this, we conduct a set of retrieval experiments designed to evaluate +the capabilities of 17 leading LLMs, such as their ability to follow threads of +information through the context window. Strikingly, we find that many models +are remarkably threadsafe: capable of simultaneously following multiple threads +without significant loss in performance. Still, for many models, we find the +effective context limit is significantly shorter than the supported context +length, with accuracy decreasing as the context window grows. Our study also +highlights the important point that token counts from different tokenizers +should not be directly compared -- they often correspond to substantially +different numbers of written characters. We release our code and long-context +experimental data. + +
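The closing point about tokenizer-dependent token counts is easy to check directly. Below is a minimal illustration (not from the paper), assuming the Hugging Face `transformers` library is installed; the two tokenizer checkpoints are arbitrary examples, not the 17 models evaluated.

```python
# Minimal illustration of why raw token counts are not comparable across
# tokenizers: the same text maps to different numbers of tokens.
from transformers import AutoTokenizer

text = "Long-context evaluation requires counting characters, not just tokens."

for name in ["gpt2", "bert-base-uncased"]:  # example tokenizers only
    tok = AutoTokenizer.from_pretrained(name)
    ids = tok(text)["input_ids"]
    print(f"{name}: {len(ids)} tokens for {len(text)} characters")
```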
+
+
+
+
+ + ☆ LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation + + +
+ CLIP is one of the most important multimodal foundational models today. What +powers CLIP's capabilities? The rich supervision signals provided by natural +language, the carrier of human knowledge, shape a powerful cross-modal +representation space. However, with the rapid advancements in large language +models LLMs like GPT-4 and LLaMA, the boundaries of language comprehension and +generation are continually being pushed. This raises an intriguing question: +can the capabilities of LLMs be harnessed to further improve multimodal +representation learning? The potential benefits of incorporating LLMs into CLIP +are clear. LLMs' strong textual understanding can fundamentally improve CLIP's +ability to handle image captions, drastically enhancing its ability to process +long and complex texts, a well-known limitation of vanilla CLIP. Moreover, LLMs +are trained on a vast corpus of text, possessing open-world knowledge. This +allows them to expand on caption information during training, increasing the +efficiency of the learning process. In this paper, we propose LLM2CLIP, a novel +approach that embraces the power of LLMs to unlock CLIP's potential. By +fine-tuning the LLM in the caption space with contrastive learning, we extract +its textual capabilities into the output embeddings, significantly improving +the output layer's textual discriminability. We then design an efficient +training process where the fine-tuned LLM acts as a powerful teacher for CLIP's +visual encoder. Thanks to the LLM's presence, we can now incorporate longer and +more complex captions without being restricted by vanilla CLIP's text encoder's +context window and ability limitations. Our experiments demonstrate that this +approach brings substantial improvements in cross-modal tasks. + +
+
+
+
+
+ + ☆ Mixture-of-Transformers: A Sparse and Scalable Architecture for + Multi-Modal Foundation Models + + +
+ The development of large language models (LLMs) has expanded to multi-modal +systems capable of processing text, images, and speech within a unified +framework. Training these models demands significantly larger datasets and +computational resources compared to text-only LLMs. To address the scaling +challenges, we introduce Mixture-of-Transformers (MoT), a sparse multi-modal +transformer architecture that significantly reduces pretraining computational +costs. MoT decouples non-embedding parameters of the model by modality -- +including feed-forward networks, attention matrices, and layer normalization -- +enabling modality-specific processing with global self-attention over the full +input sequence. We evaluate MoT across multiple settings and model scales. In +the Chameleon 7B setting (autoregressive text-and-image generation), MoT +matches the dense baseline's performance using only 55.8\% of the FLOPs. When +extended to include speech, MoT reaches speech performance comparable to the +dense baseline with only 37.2\% of the FLOPs. In the Transfusion setting, where +text and image are trained with different objectives, a 7B MoT model matches +the image modality performance of the dense baseline with one third of the +FLOPs, and a 760M MoT model outperforms a 1.4B dense baseline across key image +generation metrics. System profiling further highlights MoT's practical +benefits, achieving dense baseline image quality in 47.2\% of the wall-clock +time and text quality in 75.6\% of the wall-clock time (measured on AWS +p4de.24xlarge instances with NVIDIA A100 GPUs). + +
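A minimal sketch of the architectural idea described above, assuming PyTorch: self-attention is shared across the whole multi-modal sequence, while each token's feed-forward path and layer norm are selected by its modality id. The class name, dimensions, and three-modality split are illustrative placeholders, not the paper's configuration.

```python
# Hedged sketch (not the authors' code) of a Mixture-of-Transformers-style block:
# attention is global over the full sequence, FFNs and norms are per-modality.
import torch
import torch.nn as nn

class MoTBlock(nn.Module):
    def __init__(self, d_model=256, n_heads=4, n_modalities=3):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.norm_attn = nn.LayerNorm(d_model)
        # One FFN and one LayerNorm per modality (text / image / speech, say).
        self.ffns = nn.ModuleList(
            nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(),
                          nn.Linear(4 * d_model, d_model))
            for _ in range(n_modalities))
        self.norms = nn.ModuleList(nn.LayerNorm(d_model) for _ in range(n_modalities))

    def forward(self, x, modality_ids):
        # Global self-attention over the whole multi-modal sequence.
        h, _ = self.attn(x, x, x)
        x = self.norm_attn(x + h)
        out = torch.zeros_like(x)
        # Route each token through the FFN and norm of its own modality.
        for m, (ffn, norm) in enumerate(zip(self.ffns, self.norms)):
            mask = modality_ids == m
            if mask.any():
                out[mask] = norm(x[mask] + ffn(x[mask]))
        return out

x = torch.randn(2, 10, 256)
modality_ids = torch.randint(0, 3, (2, 10))
print(MoTBlock()(x, modality_ids).shape)  # torch.Size([2, 10, 256])
```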
+
+
+
+
+ + ☆ The Semantic Hub Hypothesis: Language Models Share Semantic + Representations Across Languages and Modalities + + +
+ Modern language models can process inputs across diverse languages and +modalities. We hypothesize that models acquire this capability through learning +a shared representation space across heterogeneous data types (e.g., different +languages and modalities), which places semantically similar inputs near one +another, even if they are from different modalities/languages. We term this the +semantic hub hypothesis, following the hub-and-spoke model from neuroscience +(Patterson et al., 2007) which posits that semantic knowledge in the human +brain is organized through a transmodal semantic "hub" which integrates +information from various modality-specific "spokes" regions. We first show that +model representations for semantically equivalent inputs in different languages +are similar in the intermediate layers, and that this space can be interpreted +using the model's dominant pretraining language via the logit lens. This +tendency extends to other data types, including arithmetic expressions, code, +and visual/audio inputs. Interventions in the shared representation space in +one data type also predictably affect model outputs in other data types, +suggesting that this shared representations space is not simply a vestigial +byproduct of large-scale training on broad data, but something that is actively +utilized by the model during input processing. + +
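The "logit lens" reading of intermediate layers mentioned above can be sketched in a few lines. The following is a minimal, hedged example using GPT-2 as a small stand-in model (the paper studies other models and modalities); it decodes each layer's hidden state through the final layer norm and unembedding matrix.

```python
# Hedged sketch of the logit lens: project intermediate hidden states into
# vocabulary space and inspect the top prediction per layer.
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

inputs = tok("The capital of France is", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs, output_hidden_states=True)

for layer, h in enumerate(out.hidden_states):            # (n_layers + 1) states
    logits = model.lm_head(model.transformer.ln_f(h))     # decode via unembedding
    top = logits[0, -1].argmax().item()                   # prediction at last position
    print(f"layer {layer:2d}: {tok.decode([top])!r}")
```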
+
+
+
+
+ + ☆ SuffixDecoding: A Model-Free Approach to Speeding Up Large Language + Model Inference + + +
+ We present SuffixDecoding, a novel model-free approach to accelerating large +language model (LLM) inference through speculative decoding. Unlike existing +methods that rely on draft models or specialized decoding heads, SuffixDecoding +leverages suffix trees built from previously generated outputs to efficiently +predict candidate token sequences. Our approach enables flexible +tree-structured speculation without the overhead of maintaining and +orchestrating additional models. SuffixDecoding builds and dynamically updates +suffix trees to capture patterns in the generated text, using them to construct +speculation trees through a principled scoring mechanism based on empirical +token frequencies. SuffixDecoding requires only CPU memory which is plentiful +and underutilized on typical LLM serving nodes. We demonstrate that +SuffixDecoding achieves competitive speedups compared to model-based approaches +across diverse workloads including open-domain chat, code generation, and +text-to-SQL tasks. For open-ended chat and code generation tasks, +SuffixDecoding achieves up to $1.4\times$ higher output throughput than +SpecInfer and up to $1.1\times$ lower time-per-token (TPOT) latency. For a +proprietary multi-LLM text-to-SQL application, SuffixDecoding achieves up to +$2.9\times$ higher output throughput and $3\times$ lower latency than +speculative decoding. Our evaluation shows that SuffixDecoding maintains high +acceptance rates even with small reference corpora of 256 examples, while +continuing to improve performance as more historical outputs are incorporated. + +
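A minimal sketch of model-free speculation in the spirit described above: index previously generated token sequences and, given the current suffix, greedily propose the most frequent continuation as draft tokens for the target model to verify. The real system uses suffix trees and a principled scoring over speculation trees; the flat n-gram index and class name here are simplifications for illustration.

```python
# Hedged toy version of suffix-based speculation over previous outputs.
from collections import Counter, defaultdict

class SuffixSpeculator:
    def __init__(self, context_len=3):
        self.context_len = context_len
        self.next_tokens = defaultdict(Counter)

    def add_output(self, tokens):
        # Record, for every context of length k, which token followed it.
        k = self.context_len
        for i in range(len(tokens) - k):
            self.next_tokens[tuple(tokens[i:i + k])][tokens[i + k]] += 1

    def speculate(self, prefix, max_draft=4):
        # Greedily extend the current suffix with the most frequent continuation.
        draft, ctx = [], list(prefix[-self.context_len:])
        for _ in range(max_draft):
            counts = self.next_tokens.get(tuple(ctx))
            if not counts:
                break
            tok = counts.most_common(1)[0][0]
            draft.append(tok)
            ctx = ctx[1:] + [tok]
        return draft  # candidate tokens to be verified by the target model

spec = SuffixSpeculator()
spec.add_output("SELECT name FROM users WHERE id = 1".split())
print(spec.speculate("SELECT name FROM".split()))  # ['users', 'WHERE', 'id', '=']
```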
+
+
+
+
+ + ☆ BitNet a4.8: 4-bit Activations for 1-bit LLMs + + +
+ Recent research on the 1-bit Large Language Models (LLMs), such as BitNet +b1.58, presents a promising direction for reducing the inference cost of LLMs +while maintaining their performance. In this work, we introduce BitNet a4.8, +enabling 4-bit activations for 1-bit LLMs. BitNet a4.8 employs a hybrid +quantization and sparsification strategy to mitigate the quantization errors +introduced by the outlier channels. Specifically, we utilize 4-bit activations +for inputs to the attention and feed-forward network layers, while sparsifying +intermediate states followed with 8-bit quantization. Extensive experiments +demonstrate that BitNet a4.8 achieves performance comparable to BitNet b1.58 +with equivalent training costs, while being faster in inference with enabling +4-bit (INT4/FP4) kernels. Additionally, BitNet a4.8 activates only 55% of +parameters and supports 3-bit KV cache, further enhancing the efficiency of +large-scale LLM deployment and inference. + +
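A rough numpy sketch of the two ingredients named above: absmax low-bit quantization, and sparsification of intermediate states followed by 8-bit quantization. The per-tensor scaling, 55% keep ratio, and function names are illustrative assumptions; the paper's actual per-layer choices and INT4/FP4 kernels are more involved.

```python
# Hedged sketch of hybrid quantization + sparsification (not the paper's kernels).
import numpy as np

def absmax_quant(x, bits):
    qmax = 2 ** (bits - 1) - 1             # e.g. 7 for INT4, 127 for INT8
    scale = np.abs(x).max() / qmax + 1e-12
    q = np.clip(np.round(x / scale), -qmax - 1, qmax)
    return q.astype(np.int8), scale

def sparsify_then_quant8(x, keep_ratio=0.55):
    k = max(1, int(keep_ratio * x.size))
    thresh = np.sort(np.abs(x).ravel())[-k]
    mask = np.abs(x) >= thresh             # keep ~55% largest-magnitude activations
    q, scale = absmax_quant(x * mask, bits=8)
    return q, scale, mask

x = np.random.randn(4, 16).astype(np.float32)
q4, s4 = absmax_quant(x, bits=4)           # 4-bit activations for attention/FFN inputs
q8, s8, mask = sparsify_then_quant8(x)     # sparsified 8-bit intermediate states
print(q4.min(), q4.max(), round(float(mask.mean()), 2))
```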
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Position Paper On Diagnostic Uncertainty Estimation from Large Language + Models: Next-Word Probability Is Not Pre-test Probability NeurIPS 2024 + + +
+ Large language models (LLMs) are being explored for diagnostic decision +support, yet their ability to estimate pre-test probabilities, vital for +clinical decision-making, remains limited. This study evaluates two LLMs, +Mistral-7B and Llama3-70B, using structured electronic health record data on +three diagnosis tasks. We examined three current methods of extracting LLM +probability estimations and revealed their limitations. We aim to highlight the +need for improved techniques in LLM confidence estimation. + +
+
+ comment: Accepted to GenAI4Health Workshop at NeurIPS 2024 +
+
+
+
+
+ + ☆ M3DocRAG: Multi-modal Retrieval is What You Need for Multi-page + Multi-document Understanding + + +
+ Document visual question answering (DocVQA) pipelines that answer questions +from documents have broad applications. Existing methods focus on handling +single-page documents with multi-modal language models (MLMs), or rely on +text-based retrieval-augmented generation (RAG) that uses text extraction tools +such as optical character recognition (OCR). However, there are difficulties in +applying these methods in real-world scenarios: (a) questions often require +information across different pages or documents, where MLMs cannot handle many +long documents; (b) documents often have important information in visual +elements such as figures, but text extraction tools ignore them. We introduce +M3DocRAG, a novel multi-modal RAG framework that flexibly accommodates various +document contexts (closed-domain and open-domain), question hops (single-hop +and multi-hop), and evidence modalities (text, chart, figure, etc.). M3DocRAG +finds relevant documents and answers questions using a multi-modal retriever +and an MLM, so that it can efficiently handle single or many documents while +preserving visual information. Since previous DocVQA datasets ask questions in +the context of a specific document, we also present M3DocVQA, a new benchmark +for evaluating open-domain DocVQA over 3,000+ PDF documents with 40,000+ pages. +In three benchmarks (M3DocVQA/MMLongBench-Doc/MP-DocVQA), empirical results +show that M3DocRAG with ColPali and Qwen2-VL 7B achieves superior performance +than many strong baselines, including state-of-the-art performance in +MP-DocVQA. We provide comprehensive analyses of different indexing, MLMs, and +retrieval models. Lastly, we qualitatively show that M3DocRAG can successfully +handle various scenarios, such as when relevant information exists across +multiple pages and when answer evidence only exists in images. + +
+
+ comment: Project webpage: https://m3docrag.github.io +
+
+
+
+
+ + ☆ Estimating the Influence of Sequentially Correlated Literary Properties + in Textual Classification: A Data-Centric Hypothesis-Testing Approach + + +
+ Stylometry aims to distinguish authors by analyzing literary traits assumed +to reflect semi-conscious choices distinct from elements like genre or theme. +However, these components often overlap, complicating text classification based +solely on feature distributions. While some literary properties, such as +thematic content, are likely to manifest as correlations between adjacent text +units, others, like authorial style, may be independent thereof. We introduce a +hypothesis-testing approach to evaluate the influence of sequentially +correlated literary properties on text classification, aiming to determine when +these correlations drive classification. Using a multivariate binary +distribution, our method models sequential correlations between text units as a +stochastic process, assessing the likelihood of clustering across varying +adjacency scales. This enables us to examine whether classification is +dominated by sequentially correlated properties or remains independent. In +experiments on a diverse English prose corpus, our analysis integrates +traditional and neural embeddings within supervised and unsupervised +frameworks. Results demonstrate that our approach effectively identifies when +textual classification is not primarily influenced by sequentially correlated +literary properties, particularly in cases where texts differ in authorial +style or genre rather than by a single author within a similar genre. + +
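One way to make the adjacency idea concrete is a permutation test on how often neighbouring text units receive the same label. The sketch below is a simplified stand-in for the paper's multivariate binary model, with a toy label sequence; it is not the authors' exact statistic.

```python
# Hedged sketch: do adjacent text units share labels more often than chance?
import numpy as np

def adjacency_agreement(labels):
    labels = np.asarray(labels)
    return float(np.mean(labels[:-1] == labels[1:]))

def permutation_pvalue(labels, n_perm=10000, seed=0):
    rng = np.random.default_rng(seed)
    observed = adjacency_agreement(labels)
    null = [adjacency_agreement(rng.permutation(labels)) for _ in range(n_perm)]
    return observed, float(np.mean(np.asarray(null) >= observed))

# Toy example: long runs of identical labels along a document suggest a
# sequentially correlated property is driving the classification.
labels = [0] * 20 + [1] * 20 + [0] * 10
obs, p = permutation_pvalue(labels)
print(f"adjacent agreement = {obs:.2f}, permutation p-value = {p:.4f}")
```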
+
+
+
+
+ + ☆ GPTKB: Building Very Large Knowledge Bases from Language Models + + +
+ General-domain knowledge bases (KB), in particular the "big three" -- +Wikidata, Yago and DBpedia -- are the backbone of many intelligent +applications. While these three have seen steady development, comprehensive KB +construction at large has seen few fresh attempts. In this work, we propose to +build a large general-domain KB entirely from a large language model (LLM). We +demonstrate the feasibility of large-scale KB construction from LLMs, while +highlighting specific challenges arising around entity recognition, entity and +property canonicalization, and taxonomy construction. As a prototype, we use +GPT-4o-mini to construct GPTKB, which contains 105 million triples for more +than 2.9 million entities, at a cost 100x less than previous KBC projects. Our +work is a landmark for two fields: For NLP, for the first time, it provides +\textit{constructive} insights into the knowledge (or beliefs) of LLMs. For the +Semantic Web, it shows novel ways forward for the long-standing challenge of +general-domain KB construction. GPTKB is accessible at https://gptkb.org. + +
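The construction process can be pictured as a breadth-first crawl over entities, where an LLM is asked for triples about each entity and newly mentioned entities are queued. In the sketch below, `llm_extract_triples` is a hypothetical stub standing in for the GPT-4o-mini prompting, canonicalization, and taxonomy steps the paper describes.

```python
# Hedged sketch of iterative, LLM-driven KB construction.
from collections import deque

def llm_extract_triples(entity):
    # Placeholder for an LLM call returning (subject, predicate, object) triples.
    toy = {"Ada Lovelace": [("Ada Lovelace", "collaboratedWith", "Charles Babbage")],
           "Charles Babbage": [("Charles Babbage", "designed", "Analytical Engine")]}
    return toy.get(entity, [])

def build_kb(seed_entities, max_entities=1000):
    kb, seen, frontier = [], set(seed_entities), deque(seed_entities)
    while frontier and len(seen) < max_entities:
        entity = frontier.popleft()
        for s, p, o in llm_extract_triples(entity):
            kb.append((s, p, o))
            if o not in seen:              # newly discovered entity: crawl it later
                seen.add(o)
                frontier.append(o)
    return kb

print(build_kb(["Ada Lovelace"]))
```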
+
+ comment: 11 pages, 4 tables +
+
+
+
+
+ + ☆ GASE: Generatively Augmented Sentence Encoding + + +
+ We propose an approach to enhance sentence embeddings by applying generative +text models for data augmentation at inference time. Unlike conventional data +augmentation that utilises synthetic training data, our approach does not +require access to model parameters or the computational resources typically +required for fine-tuning state-of-the-art models. Generatively Augmented +Sentence Encoding uses diverse linguistic synthetic variants of input texts +generated by paraphrasing, summarising, or extracting keywords, followed by +pooling the original and synthetic embeddings. Experimental results on the +Massive Text Embedding Benchmark for Semantic Textual Similarity (STS) +demonstrate performance improvements across a range of embedding models using +different generative models for augmentation. We find that generative +augmentation leads to larger performance improvements for embedding models with +lower baseline performance. These findings suggest that integrating generative +augmentation at inference time adds semantic diversity and can enhance the +robustness and generalizability of sentence embeddings for embedding models. +Our results show that the degree to which generative augmentation can improve +STS performance depends not only on the embedding model but also on the +dataset. From a broader perspective, the approach allows trading training for +inference compute. + +
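A minimal sketch of the inference-time recipe: generate synthetic variants of the input, embed the original together with the variants, and pool. Both `generate_variants` and `embed` below are hypothetical stubs, not the generative or embedding models used in the paper.

```python
# Hedged sketch of generatively augmented sentence encoding via mean pooling.
import numpy as np

def generate_variants(text):
    # Placeholder for prompting a generative model; fakes three variants.
    return [f"Paraphrase: {text}", f"Summary: {text[:40]}", f"Keywords: {text.split()[0]}"]

def embed(texts, dim=8, seed=0):
    # Placeholder for a sentence-embedding model; deterministic fake vectors.
    rng = np.random.default_rng(seed)
    return np.stack([rng.normal(size=dim) + len(t) * 0.01 for t in texts])

def gase_embedding(text):
    candidates = [text] + generate_variants(text)
    vecs = embed(candidates)
    return vecs.mean(axis=0)               # pooled original + synthetic embeddings

print(gase_embedding("Generative augmentation adds semantic diversity at inference time.").shape)
```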
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ☆ OpenCoder: The Open Cookbook for Top-Tier Code Large Language Models + + +
+ Large language models (LLMs) for code have become indispensable in various +domains, including code generation, reasoning tasks and agent systems. While +open-access code LLMs are increasingly approaching the performance levels of +proprietary models, high-quality code LLMs suitable for rigorous scientific +investigation, particularly those with reproducible data processing pipelines +and transparent training protocols, remain limited. The scarcity is due to +various challenges, including resource constraints, ethical considerations, and +the competitive advantages of keeping models advanced. To address the gap, we +introduce OpenCoder, a top-tier code LLM that not only achieves performance +comparable to leading models but also serves as an ``open cookbook'' for the +research community. Unlike most prior efforts, we release not only model +weights and inference code, but also the reproducible training data, complete +data processing pipeline, rigorous experimental ablation results, and detailed +training protocols for open scientific research. Through this comprehensive +release, we identify the key ingredients for building a top-tier code LLM: (1) +code-optimized heuristic rules for data cleaning and methods for data +deduplication, (2) recall of text corpora related to code and (3) high-quality +synthetic data in both annealing and supervised fine-tuning stages. By offering +this level of openness, we aim to broaden access to all aspects of a top-tier +code LLM, with OpenCoder serving as both a powerful model and an open +foundation to accelerate research, and enable reproducible advancements in code +AI. +
+
+
+
+
+ + ☆ Sentiment Analysis of Spanish Political Party Tweets Using Pre-trained + Language Models + + +
+ Title: Sentiment Analysis of Spanish Political Party Communications on +Twitter Using Pre-trained Language Models + Authors: Chuqiao Song, Shunzhang Chen, Xinyi Cai, Hao Chen + Comments: 21 pages, 6 figures + Abstract: This study investigates sentiment patterns within Spanish political +party communications on Twitter by leveraging BETO and RoBERTuito, two +pre-trained language models optimized for Spanish text. Using a dataset of +tweets from major Spanish political parties: PSOE, PP, Vox, Podemos, and +Ciudadanos, spanning 2019 to 2024, this research analyzes sentiment +distributions and explores the relationship between sentiment expression and +party ideology. The findings indicate that both models consistently identify a +predominant Neutral sentiment across all parties, with significant variations +in Negative and Positive sentiments that align with ideological distinctions. +Specifically, Vox exhibits higher levels of Negative sentiment, while PSOE +demonstrates relatively high Positive sentiment, supporting the hypothesis that +emotional appeals in political messaging reflect ideological stances. This +study underscores the potential of pre-trained language models for non-English +sentiment analysis on social media, providing insights into sentiment dynamics +that shape public discourse within Spain's multi-party political system. + Keywords: Spanish politics, sentiment analysis, pre-trained language models, +Twitter, BETO, RoBERTuito, political ideology, multi-party system + +
+
+ comment: 21 pages, 6 figures +
+
+
+
+
+ + ☆ Prompt-Guided Internal States for Hallucination Detection of Large + Language Models + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities across +a variety of tasks in different domains. However, they sometimes generate +responses that are logically coherent but factually incorrect or misleading, +which is known as LLM hallucinations. Data-driven supervised methods train +hallucination detectors by leveraging the internal states of LLMs, but +detectors trained on specific domains often struggle to generalize well to +other domains. In this paper, we aim to enhance the cross-domain performance of +supervised detectors with only in-domain data. We propose a novel framework, +prompt-guided internal states for hallucination detection of LLMs, namely +PRISM. By utilizing appropriate prompts to guide changes in the structure +related to text truthfulness within the LLM's internal states, we make this +structure more salient and consistent across texts from different domains. We +integrated our framework with existing hallucination detection methods and +conducted experiments on datasets from different domains. The experimental +results indicate that our framework significantly enhances the cross-domain +generalization of existing hallucination detection methods. + +
+
+
+
+
+ + ☆ VTechAGP: An Academic-to-General-Audience Text Paraphrase Dataset and + Benchmark Models + + +
+ Existing text simplification or paraphrase datasets mainly focus on +sentence-level text generation in a general domain. These datasets are +typically developed without using domain knowledge. In this paper, we release a +novel dataset, VTechAGP, which is the first academic-to-general-audience text +paraphrase dataset consisting of 4,938 document-level thesis and dissertation +academic and general-audience abstract pairs from 8 colleges authored over 25 +years. We also propose a novel dynamic soft prompt generative language model, +DSPT5. For training, we leverage a contrastive-generative loss function to +learn the keyword vectors in the dynamic prompt. For inference, we adopt a +crowd-sampling decoding strategy at both semantic and structural levels to +further select the best output candidate. We evaluate DSPT5 and various +state-of-the-art large language models (LLMs) from multiple perspectives. +Results demonstrate that the SOTA LLMs do not provide satisfactory outcomes, +while the lightweight DSPT5 can achieve competitive results. To the best of our +knowledge, we are the first to build a benchmark dataset and solutions for +academic-to-general-audience text paraphrasing. +
+
+ comment: 21 pages, 3 figures +
+
+
+
+
+ + ☆ When Does Classical Chinese Help? Quantifying Cross-Lingual Transfer in + Hanja and Kanbun + + +
+ Historical and linguistic connections within the Sinosphere have led +researchers to use Classical Chinese resources for cross-lingual transfer when +processing historical documents from Korea and Japan. In this paper, we +question the assumption of cross-lingual transferability from Classical Chinese +to Hanja and Kanbun, the ancient written languages of Korea and Japan, +respectively. Our experiments across machine translation, named entity +recognition, and punctuation restoration tasks show minimal impact of Classical +Chinese datasets on language model performance for ancient Korean documents +written in Hanja, with performance differences within $\pm{}0.0068$ F1-score +for sequence labeling tasks and up to $+0.84$ BLEU score for translation. These +limitations persist consistently across various model sizes, architectures, and +domain-specific datasets. Our analysis reveals that the benefits of Classical +Chinese resources diminish rapidly as local language data increases for Hanja, +while showing substantial improvements only in extremely low-resource scenarios +for both Korean and Japanese historical documents. These mixed results +emphasize the need for careful empirical validation rather than assuming +benefits from indiscriminate cross-lingual transfer. + +
+
+
+
+
+ + ☆ LuxBank: The First Universal Dependency Treebank for Luxembourgish + + +
+ The Universal Dependencies (UD) project has significantly expanded linguistic +coverage across 161 languages, yet Luxembourgish, a West Germanic language +spoken by approximately 400,000 people, has remained absent until now. In this +paper, we introduce LuxBank, the first UD Treebank for Luxembourgish, +addressing the gap in syntactic annotation and analysis for this `low-research' +language. We establish formal guidelines for Luxembourgish language annotation, +providing the foundation for the first large-scale quantitative analysis of its +syntax. LuxBank serves not only as a resource for linguists and language +learners but also as a tool for developing spell checkers and grammar checkers, +organising existing text archives and even training large language models. By +incorporating Luxembourgish into the UD framework, we aim to enhance the +understanding of syntactic variation within West Germanic languages and offer a +model for documenting smaller, semi-standardised languages. This work positions +Luxembourgish as a valuable resource in the broader linguistic and NLP +communities, contributing to the study of languages with limited research and +resources. + +
+
+ comment: Accepted at 22nd Workshop on Treebanks and Linguistic Theories (TLT + 2024) +
+
+
+
+
+ + ☆ Kwai-STaR: Transform LLMs into State-Transition Reasoners + + +
+ Mathematical reasoning presents a significant challenge to the cognitive +capabilities of LLMs. Various methods have been proposed to enhance the +mathematical ability of LLMs. However, few recognize the value of state +transition for LLM reasoning. In this work, we define mathematical +problem-solving as a process of transiting from an initial unsolved state to +the final resolved state, and propose Kwai-STaR framework, which transforms +LLMs into State-Transition Reasoners to improve their intuitive reasoning +capabilities. Our approach comprises three main steps: (1) Define the state +space tailored to the mathematical reasoning. (2) Generate state-transition +data based on the state space. (3) Convert original LLMs into State-Transition +Reasoners via a curricular training strategy. Our experiments validate the +effectiveness of Kwai-STaR in enhancing mathematical reasoning: After training +on the small-scale Kwai-STaR dataset, general LLMs, including Mistral-7B and +LLaMA-3, achieve considerable performance gain on the GSM8K and GSM-Hard +dataset. Additionally, the state transition-based design endows Kwai-STaR with +remarkable training and inference efficiency. Further experiments are underway +to establish the generality of Kwai-STaR. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ☆ AlignXIE: Improving Multilingual Information Extraction by Cross-Lingual + Alignment + + +
+ Empirical evidence suggests that LLMs exhibit spontaneous cross-lingual +alignment. Our findings suggest that although LLMs also demonstrate promising +cross-lingual alignment in Information Extraction, there remains significant +imbalance across languages, revealing an underlying deficiency in the IE +alignment. To address this issue, we propose AlignXIE, a powerful code-based +LLM that significantly enhances cross-lingual IE alignment through two +strategies. Firstly, AlignXIE formulates IE across different languages, +especially non-English ones, as code generation tasks, standardizing the +representation of various schemas using Python classes to ensure consistency of +the same ontology in different languages and align the schema. Secondly, it +incorporates an IE cross-lingual alignment phase through a translated instance +prediction task proposed in this paper to align the extraction process, +utilizing ParallelNER, an IE bilingual parallel dataset with 257,190 samples, +generated by our proposed LLM-based automatic pipeline for IE parallel data +construction, with manual annotation to ensure quality. Ultimately, we obtain +AlignXIE through multilingual IE instruction tuning. Although without training +in 9 unseen languages, AlignXIE surpasses ChatGPT by $30.17\%$ and SoTA by +$20.03\%$, thereby demonstrating superior cross-lingual IE capabilities. +Comprehensive evaluations on 63 IE benchmarks in Chinese and English under +various settings, demonstrate that AlignXIE significantly enhances +cross-lingual and multilingual IE through boosting the IE alignment. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Enhancing Investment Analysis: Optimizing AI-Agent Collaboration in + Financial Research + + +
+ In recent years, the application of generative artificial intelligence +(GenAI) in financial analysis and investment decision-making has gained +significant attention. However, most existing approaches rely on single-agent +systems, which fail to fully utilize the collaborative potential of multiple AI +agents. In this paper, we propose a novel multi-agent collaboration system +designed to enhance decision-making in financial investment research. The +system incorporates agent groups with both configurable group sizes and +collaboration structures to leverage the strengths of each agent group type. By +utilizing a sub-optimal combination strategy, the system dynamically adapts to +varying market conditions and investment scenarios, optimizing performance +across different tasks. We focus on three sub-tasks: fundamentals, market +sentiment, and risk analysis, by analyzing the 2023 SEC 10-K forms of 30 +companies listed on the Dow Jones Index. Our findings reveal significant +performance variations based on the configurations of AI agents for different +tasks. The results demonstrate that our multi-agent collaboration system +outperforms traditional single-agent models, offering improved accuracy, +efficiency, and adaptability in complex financial environments. This study +highlights the potential of multi-agent systems in transforming financial +analysis and investment decision-making by integrating diverse analytical +perspectives. + +
+
+
+
+
+ + ☆ A study of Vietnamese readability assessing through semantic and + statistical features + + +
+ Determining the difficulty of a text involves assessing various textual +features that may impact the reader's text comprehension, yet current research +in Vietnamese has only focused on statistical features. This paper introduces a +new approach that integrates statistical and semantic approaches to assessing +text readability. Our research utilized three distinct datasets: the Vietnamese +Text Readability Dataset (ViRead), OneStopEnglish, and RACE, with the latter +two translated into Vietnamese. Advanced semantic analysis methods were +employed for the semantic aspect using state-of-the-art language models such as +PhoBERT, ViDeBERTa, and ViBERT. In addition, statistical methods were +incorporated to extract syntactic and lexical features of the text. We +conducted experiments using various machine learning models, including Support +Vector Machine (SVM), Random Forest, and Extra Trees and evaluated their +performance using accuracy and F1 score metrics. Our results indicate that a +joint approach that combines semantic and statistical features significantly +enhances the accuracy of readability classification compared to using each +method in isolation. The current study emphasizes the importance of considering +both statistical and semantic aspects for a more accurate assessment of text +difficulty in Vietnamese. This contribution to the field provides insights into +the adaptability of advanced language models in the context of Vietnamese text +readability. It lays the groundwork for future research in this area. + +
+
+
+
+
+ + ☆ RetrieveGPT: Merging Prompts and Mathematical Models for Enhanced + Code-Mixed Information Retrieval + + +
+ Code-mixing, the integration of lexical and grammatical elements from +multiple languages within a single sentence, is a widespread linguistic +phenomenon, particularly prevalent in multilingual societies. In India, social +media users frequently engage in code-mixed conversations using the Roman +script, especially among migrant communities who form online groups to share +relevant local information. This paper focuses on the challenges of extracting +relevant information from code-mixed conversations, specifically within Roman +transliterated Bengali mixed with English. This study presents a novel approach +to address these challenges by developing a mechanism to automatically identify +the most relevant answers from code-mixed conversations. We have experimented +with a dataset comprising queries and documents from Facebook, and Query +Relevance files (QRels) to aid in this task. Our results demonstrate the +effectiveness of our approach in extracting pertinent information from complex, +code-mixed digital conversations, contributing to the broader field of natural +language processing in multilingual and informal text environments. We use +GPT-3.5 Turbo via prompting, along with the sequential nature of relevant +documents, to frame a mathematical model that helps detect relevant +documents corresponding to a query. +
+
+ comment: Accepted at FIRE 2024 (Track: Code-Mixed Information Retrieval from + Social Media Data) +
+
+
+
+
+ + ☆ BhasaAnuvaad: A Speech Translation Dataset for 14 Indian Languages + + +
+ Automatic Speech Translation (AST) datasets for Indian languages remain +critically scarce, with public resources covering fewer than 10 of the 22 +official languages. This scarcity has resulted in AST systems for Indian +languages lagging far behind those available for high-resource languages like +English. In this paper, we first evaluate the performance of widely-used AST +systems on Indian languages, identifying notable performance gaps and +challenges. Our findings show that while these systems perform adequately on +read speech, they struggle significantly with spontaneous speech, including +disfluencies like pauses and hesitations. Additionally, there is a striking +absence of systems capable of accurately translating colloquial and informal +language, a key aspect of everyday communication. To this end, we introduce +BhasaAnuvaad, the largest publicly available dataset for AST involving 14 +scheduled Indian languages spanning over 44,400 hours and 17M text segments. +BhasaAnuvaad contains data for English speech to Indic text, as well as Indic +speech to English text. This dataset comprises three key categories: (1) +Curated datasets from existing resources, (2) Large-scale web mining, and (3) +Synthetic data generation. By offering this diverse and expansive dataset, we +aim to bridge the resource gap and promote advancements in AST for low-resource +Indian languages, especially in handling spontaneous and informal speech +patterns. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ DISCO: DISCovering Overfittings as Causal Rules for Text Classification + Models + + +
+ With the rapid advancement of neural language models, the deployment of +over-parameterized models has surged, increasing the need for interpretable +explanations comprehensible to human inspectors. Existing post-hoc +interpretability methods, which often focus on unigram features of single input +textual instances, fail to capture the models' decision-making process fully. +Additionally, many methods do not differentiate between decisions based on +spurious correlations and those based on a holistic understanding of the input. +Our paper introduces DISCO, a novel method for discovering global, rule-based +explanations by identifying causal n-gram associations with model predictions. +This method employs a scalable sequence mining technique to extract relevant +text spans from training data, associate them with model predictions, and +conduct causality checks to distill robust rules that elucidate model behavior. +These rules expose potential overfitting and provide insights into misleading +feature combinations. We validate DISCO through extensive testing, +demonstrating its superiority over existing methods in offering comprehensive +insights into complex model behaviors. Our approach successfully identifies all +shortcuts manually introduced into the training data (100% detection rate on +the MultiRC dataset), resulting in an 18.8% regression in model performance -- +a capability unmatched by any other method. Furthermore, DISCO supports +interactive explanations, enabling human inspectors to distinguish spurious +causes in the rule-based output. This alleviates the burden of abundant +instance-wise explanations and helps assess the model's risk when encountering +out-of-distribution (OOD) data. + +
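The rule-discovery step can be approximated by counting how often each training n-gram co-occurs with each model prediction and keeping high-support, high-precision spans as candidate shortcut rules. The sketch below conveys only this association step; the paper adds scalable sequence mining and explicit causality checks.

```python
# Hedged sketch of mining n-gram -> prediction association rules from training data.
from collections import Counter, defaultdict

def mine_rules(texts, predictions, n=2, min_support=2, min_precision=0.9):
    ngram_label = defaultdict(Counter)
    for text, label in zip(texts, predictions):
        toks = text.lower().split()
        for i in range(len(toks) - n + 1):
            ngram_label[tuple(toks[i:i + n])][label] += 1
    rules = []
    for ngram, counts in ngram_label.items():
        label, hits = counts.most_common(1)[0]
        support = sum(counts.values())
        if support >= min_support and hits / support >= min_precision:
            rules.append((" ".join(ngram), label, hits / support, support))
    return rules

texts = ["great plot and acting", "great plot but dull", "boring plot and acting"]
preds = ["pos", "pos", "neg"]
print(mine_rules(texts, preds))  # e.g. ('great plot', 'pos', 1.0, 2)
```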
+
+
+
+
+ + ☆ Hands-On Tutorial: Labeling with LLM and Human-in-the-Loop COLING 2025 + + +
+ Training and deploying machine learning models relies on a large amount of +human-annotated data. As human labeling becomes increasingly expensive and +time-consuming, recent research has developed multiple strategies to speed up +annotation and reduce costs and human workload: generating synthetic training +data, active learning, and hybrid labeling. This tutorial is oriented toward +practical applications: we will present the basics of each strategy, highlight +their benefits and limitations, and discuss in detail real-life case studies. +Additionally, we will walk through best practices for managing human annotators +and controlling the quality of the final dataset. The tutorial includes a +hands-on workshop, where attendees will be guided in implementing a hybrid +annotation setup. This tutorial is designed for NLP practitioners from both +research and industry backgrounds who are involved in or interested in +optimizing data labeling projects. + +
+
+ comment: To be presented at COLING 2025 +
+
+
+
+
+ + ☆ FASSILA: A Corpus for Algerian Dialect Fake News Detection and Sentiment + Analysis + + +
+ In the context of low-resource languages, the Algerian dialect (AD) faces
+challenges due to the absence of annotated corpora, hindering its effective
+processing, notably in Machine Learning (ML) applications reliant on corpora
+for training and assessment. This study outlines the development process of a
+specialized corpus for Fake News (FN) detection and sentiment analysis (SA) in
+AD called FASSILA. This corpus comprises 10,087 sentences encompassing over
+19,497 unique words in AD, covers seven distinct domains, and addresses the
+significant lack of linguistic resources in the language. We propose an
+annotation scheme for FN detection and SA, detailing the data collection,
+cleaning, and labelling process. Remarkable Inter-Annotator Agreement indicates
+that the annotation scheme produces consistent annotations of high quality.
+Subsequent classification experiments using BERT-based models and ML models
+demonstrate promising results and highlight avenues for further research. The
+dataset is made freely available on GitHub
+(https://github.com/amincoding/FASSILA) to facilitate future advancements in
+the field.
+
+
+ comment: 16 pages, 6 Figures
+
+
+
+
+ + ☆ Self-Calibrated Listwise Reranking with Large Language Models + + +
+ Large language models (LLMs), with advanced linguistic capabilities, have +been employed in reranking tasks through a sequence-to-sequence approach. In +this paradigm, multiple passages are reranked in a listwise manner and a +textual reranked permutation is generated. However, due to the limited context +window of LLMs, this reranking paradigm requires a sliding window strategy to +iteratively handle larger candidate sets. This not only increases computational +costs but also restricts the LLM from fully capturing all the comparison +information for all candidates. To address these challenges, we propose a novel +self-calibrated listwise reranking method, which aims to leverage LLMs to +produce global relevance scores for ranking. To achieve it, we first propose +the relevance-aware listwise reranking framework, which incorporates explicit +list-view relevance scores to improve reranking efficiency and enable global +comparison across the entire candidate set. Second, to ensure the comparability +of the computed scores, we propose self-calibrated training that uses +point-view relevance assessments generated internally by the LLM itself to +calibrate the list-view relevance assessments. Extensive experiments and +comprehensive analysis on the BEIR benchmark and TREC Deep Learning Tracks +demonstrate the effectiveness and efficiency of our proposed method. + +
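+
+ A minimal PyTorch sketch of the calibration idea above: list-view relevance
+ scores for a query's candidates are pulled toward the LLM's own point-view
+ scores with a KL term. The toy tensors and the specific KL formulation are
+ illustrative assumptions, not the paper's exact training objective.
+
+     import torch
+     import torch.nn.functional as F
+
+     def calibration_loss(list_scores: torch.Tensor,
+                          point_scores: torch.Tensor) -> torch.Tensor:
+         """KL term pulling the list-view score distribution over a query's
+         candidates toward the point-view distribution from the same LLM."""
+         list_logp = F.log_softmax(list_scores, dim=-1)
+         point_p = F.softmax(point_scores, dim=-1)
+         return F.kl_div(list_logp, point_p, reduction="batchmean")
+
+     # toy example: four candidate passages for one query
+     list_scores = torch.tensor([[2.1, 0.3, -1.0, 0.8]], requires_grad=True)
+     point_scores = torch.tensor([[1.5, 0.1, -0.5, 0.9]])
+     calibration_loss(list_scores, point_scores).backward()
+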
+
+
+
+
+ + ☆ Tibyan Corpus: Balanced and Comprehensive Error Coverage Corpus Using + ChatGPT for Arabic Grammatical Error Correction + + +
+ Natural language processing (NLP) commonly relies on text data augmentation
+to overcome sample size constraints, and increasing the sample size is a
+natural and widely used strategy for alleviating them. In this study, we focus
+on Arabic, which is considered one of the languages with limited resources for
+grammatical error correction (GEC). Furthermore, QALB-14 and QALB-15 are the
+only datasets used in most Arabic GEC research; with approximately 20,500
+parallel examples, they are small compared with resources available for other
+languages. Therefore, this study aims to develop an Arabic corpus called
+"Tibyan" for grammatical error correction using ChatGPT. ChatGPT is used as a
+data augmentation tool: Arabic sentences containing grammatical errors are
+matched with error-free sentences extracted from Arabic books, called guide
+sentences. Multiple steps were involved in establishing our corpus, including
+the collection and pre-processing of pairs of Arabic texts from various
+sources, such as books and open-access corpora. We then used ChatGPT to
+generate a parallel corpus based on the collected text, using the guide
+sentences to produce sentences with multiple types of errors. Linguistic
+experts reviewed and validated the automatically generated sentences, and the
+corpus was refined iteratively based on their feedback to improve its
+accuracy. Finally, we used the Arabic Error Type Annotation tool (ARETA) to
+analyze the types of errors in the Tibyan corpus, which covers 49 error types
+grouped into seven classes: orthography, morphology, syntax, semantics,
+punctuation, merge, and split. The Tibyan corpus contains approximately 600K
+tokens.
+
+
+ comment: 17 pages, 11 figures +
+
+
+
+
+ + ☆ The State and Fate of Summarization Datasets + + +
+ Automatic summarization has consistently attracted attention due to its
+versatility and wide application in various downstream tasks. Despite its
+popularity, we find that annotation efforts have largely been disjointed and
+have lacked common terminology. Consequently, it is challenging to discover
+existing resources or identify coherent research directions. To address this,
+we survey a large body of work spanning 133 datasets in over 100 languages,
+creating a novel ontology covering sample properties, collection methods and
+distribution. With this ontology we make key observations, including the lack
+of accessible, high-quality datasets for low-resource languages, and the
+field's over-reliance on the news domain and on automatically collected
+distant supervision. Finally, we make available a web interface that allows
+users to interact with and explore our ontology and dataset collection, as
+well as a template for a summarization data card, which can be used to
+streamline future research into a more coherent body of work.
+
+
+
+
+
+ + ☆ Multistage Fine-tuning Strategies for Automatic Speech Recognition in + Low-resource Languages + + +
+ This paper presents a novel multistage fine-tuning strategy designed to
+enhance automatic speech recognition (ASR) performance in low-resource
+languages using OpenAI's Whisper model. In this approach, we aim to build ASR
+models for languages with limited digital resources by sequentially adapting
+the model across linguistically similar languages. We evaluate this approach
+on Malasar, a Dravidian language spoken by approximately ten thousand people
+in the Western Ghats of South India. The Malasar language faces critical
+challenges for technological intervention due to its lack of a native script
+and the absence of digital or spoken data resources. Working in collaboration
+with Wycliffe India and Malasar community members, we created a spoken Malasar
+corpus paired with transcriptions in Tamil script, a closely related major
+language. To build an ASR model for Malasar, we first build an intermediate
+Tamil ASR model, leveraging the higher availability of annotated Tamil speech.
+This intermediate model is subsequently fine-tuned on Malasar data, allowing
+for more effective ASR adaptation despite limited resources. The multistage
+fine-tuning strategy demonstrated significant improvements over direct
+fine-tuning on Malasar data alone, achieving a word error rate (WER) of 51.9%,
+a 4.5% absolute reduction compared to the direct fine-tuning method. A further
+WER reduction to 47.3% was achieved through punctuation removal in
+post-processing, which addresses formatting inconsistencies that impact
+evaluation. Our results underscore the effectiveness of sequential multistage
+fine-tuning combined with targeted post-processing as a scalable strategy for
+ASR system development in low-resource languages, especially where linguistic
+similarities can be leveraged to bridge gaps in training data.
+
+
+
+
+
+ + ☆ Pruning Literals for Highly Efficient Explainability at Word Level + + +
+ Designing explainable models has become crucial for Natural Language
+Processing (NLP), since most state-of-the-art machine learning models provide
+only limited explanations for their predictions. Among explainable models, the
+Tsetlin Machine (TM) is promising because of its capability to provide
+word-level explanations using propositional logic. However, concerns arise
+over the elaborate combinations of literals (propositional logic) in its
+clauses, which make the model difficult for humans to comprehend despite its
+transparent learning process. In this paper, we design a post-hoc pruning of
+clauses that eliminates randomly placed literals, thereby making the model
+more efficiently interpretable than the vanilla TM. Experiments on the
+publicly available YELP-HAT dataset demonstrate that the pruned TM's attention
+map aligns more closely with the human attention map than the vanilla TM's. In
+addition, in pairwise similarity the pruned TM also surpasses attention
+map-based neural network models. In terms of accuracy, the proposed pruning
+method does not degrade accuracy significantly and even improves performance
+by 4% to 9% on some test data.
+
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Best Practices for Distilling Large Language Models into BERT for Web + Search Ranking + + +
+ Recent studies have highlighted the significant potential of Large Language +Models (LLMs) as zero-shot relevance rankers. These methods predominantly +utilize prompt learning to assess the relevance between queries and documents +by generating a ranked list of potential documents. Despite their promise, the +substantial costs associated with LLMs pose a significant challenge for their +direct implementation in commercial search systems. To overcome this barrier +and fully exploit the capabilities of LLMs for text ranking, we explore +techniques to transfer the ranking expertise of LLMs to a more compact model +similar to BERT, using a ranking loss to enable the deployment of less +resource-intensive models. Specifically, we enhance the training of LLMs +through Continued Pre-Training, taking the query as input and the clicked title +and summary as output. We then proceed with supervised fine-tuning of the LLM +using a rank loss, assigning the final token as a representative of the entire +sentence. Given the inherent characteristics of autoregressive language models, +only the final token can encapsulate all preceding tokens. Additionally, +we introduce a hybrid point-wise and margin MSE loss to transfer the ranking +knowledge from LLMs to smaller models like BERT. This method creates a viable +solution for environments with strict resource constraints. Both offline and +online evaluations have confirmed the efficacy of our approach, and our model +has been successfully integrated into a commercial web search engine as of +February 2024. + +
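+
+ A minimal PyTorch sketch of a hybrid point-wise plus margin-MSE distillation
+ loss of the kind named above; the mixing weight alpha and the toy scores are
+ assumptions, not the paper's settings.
+
+     import torch
+     import torch.nn.functional as F
+
+     def hybrid_distill_loss(student_pos, student_neg,
+                             teacher_pos, teacher_neg, alpha=0.5):
+         """Point-wise MSE on absolute scores plus margin-MSE on the
+         positive-minus-negative score gap; alpha mixes the two terms."""
+         pointwise = (F.mse_loss(student_pos, teacher_pos)
+                      + F.mse_loss(student_neg, teacher_neg))
+         margin = F.mse_loss(student_pos - student_neg,
+                             teacher_pos - teacher_neg)
+         return alpha * pointwise + (1.0 - alpha) * margin
+
+     # toy scores for a batch of two (query, positive, negative) triples
+     s_pos = torch.tensor([0.8, 0.4], requires_grad=True)
+     s_neg = torch.tensor([0.2, 0.5], requires_grad=True)
+     t_pos, t_neg = torch.tensor([0.9, 0.7]), torch.tensor([0.1, 0.2])
+     hybrid_distill_loss(s_pos, s_neg, t_pos, t_neg).backward()
+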
+
+ comment: Arxiv Version +
+
+
+
+
+ + ☆ Meta-Reasoning Improves Tool Use in Large Language Models + + +
+ External tools help large language models (LLMs) succeed at tasks where they +would otherwise typically fail. In existing frameworks, LLMs learn tool use +either by in-context demonstrations or via full model fine-tuning on annotated +data. As these approaches do not easily scale, a recent trend is to abandon +them in favor of lightweight, parameter-efficient tuning paradigms. These +methods allow quickly alternating between the frozen LLM and its specialised +fine-tuned version, by switching on or off a handful of additional custom +parameters. Hence, we postulate that the generalization ability of the frozen +model can be leveraged to improve tool selection. We present Tool selECTion via +meta-reasONing (TECTON), a two-phase system that first reasons over a task +using a custom fine-tuned LM head and outputs candidate tools. Then, with the +custom head disabled, it meta-reasons (i.e., it reasons over the previous +reasoning process) to make a final choice. We show that TECTON results in +substantial gains - both in-distribution and out-of-distribution - on a range +of math reasoning datasets. + +
+
+
+
+
+ + ☆ Tomato, Tomahto, Tomate: Measuring the Role of Shared Semantics among + Subwords in Multilingual Language Models + + +
+ Human understanding of language is robust to different word choices as long
+as they represent similar semantic concepts. To what extent does this
+intuition transfer to language models, which represent all subwords as
+distinct embeddings? In this work, we take an initial step toward measuring
+the role of shared semantics among subwords in encoder-only multilingual
+language models (mLMs). To this end, we form "semantic tokens" by merging
+semantically similar subwords and their embeddings, and evaluate the updated
+mLMs on 5 heterogeneous multilingual downstream tasks. Results show that
+general shared semantics go a long way in supporting predictions across mLMs
+with different tokenizers and model sizes. Inspection of the grouped subwords
+shows that they exhibit a wide range of semantic similarities, including
+synonyms and translations across many languages and scripts. Lastly, we find
+that zero-shot results with semantic tokens are on par with or even better
+than those of the original models on certain classification tasks, suggesting
+that shared subword-level semantics may serve as anchors for cross-lingual
+transfer.
+
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ Thanos: Enhancing Conversational Agents with Skill-of-Mind-Infused Large + Language Model + + +
+ To increase social bonding with interlocutors, humans naturally acquire the +ability to respond appropriately in a given situation by considering which +conversational skill is most suitable for the response - a process we call +skill-of-mind. For large language model (LLM)-based conversational agents, +planning appropriate conversational skills, as humans do, is challenging due to +the complexity of social dialogue, especially in interactive scenarios. To +address this, we propose a skill-of-mind-annotated conversation dataset, named +Multifaceted Skill-of-Mind, which includes multi-turn and multifaceted +conversational skills across various interactive scenarios (e.g., long-term, +counseling, task-oriented), grounded in diverse social contexts (e.g., +demographics, persona, rules of thumb). This dataset consists of roughly 100K +conversations. Using this dataset, we introduce a new family of +skill-of-mind-infused LLMs, named Thanos, with model sizes of 1B, 3B, and 8B +parameters. With extensive experiments, these models successfully demonstrate +the skill-of-mind process and exhibit strong generalizability in inferring +multifaceted skills across a variety of domains. Moreover, we show that Thanos +significantly enhances the quality of responses generated by LLM-based +conversational agents and promotes prosocial behavior in human evaluations. + +
+
+ comment: Code: https://github.com/passing2961/Thanos +
+
+
+
+
+ + ☆ ML-Promise: A Multilingual Dataset for Corporate Promise Verification + + +
+ Promises made by politicians, corporate leaders, and public figures have a +significant impact on public perception, trust, and institutional reputation. +However, the complexity and volume of such commitments, coupled with +difficulties in verifying their fulfillment, necessitate innovative methods for +assessing their credibility. This paper introduces the concept of Promise +Verification, a systematic approach involving steps such as promise +identification, evidence assessment, and the evaluation of timing for +verification. We propose the first multilingual dataset, ML-Promise, which +includes English, French, Chinese, Japanese, and Korean, aimed at facilitating +in-depth verification of promises, particularly in the context of +Environmental, Social, and Governance (ESG) reports. Given the growing emphasis +on corporate environmental contributions, this dataset addresses the challenge +of evaluating corporate promises, especially in light of practices like +greenwashing. Our findings also explore textual and image-based baselines, with +promising results from retrieval-augmented generation (RAG) approaches. This +work aims to foster further discourse on the accountability of public +commitments across multiple languages and domains. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ Gradient Localization Improves Lifelong Pretraining of Language Models EMNLP + + +
+ Large Language Models (LLMs) trained on web-scale text corpora have been
+shown to capture world knowledge in their parameters. However, the mechanism
+by which language models store different types of knowledge is poorly
+understood. In this work, we examine two types of knowledge relating to
+temporally sensitive entities and demonstrate that each type is localized to
+different sets of parameters within the LLMs. We hypothesize that the lack of
+consideration of the locality of knowledge in existing continual learning
+methods contributes to both the failed uptake of new information and the
+catastrophic forgetting of previously learned information. We observe that
+sequences containing references to updated and newly mentioned entities
+exhibit larger gradient norms in a subset of layers. We demonstrate that
+targeting parameter updates to these relevant layers can improve the
+performance of continual pretraining on language containing temporal drift.
+
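+
+ A minimal PyTorch sketch of the mechanical part of this idea: rank parameter
+ tensors by their gradient norm on one batch and keep only the top-k trainable
+ for the next update, freezing the rest. The toy model, the value of k, and
+ the per-tensor (rather than per-layer) grouping are illustrative assumptions.
+
+     import torch
+     import torch.nn as nn
+
+     def keep_salient_tensors(model: nn.Module, loss: torch.Tensor, k: int = 2):
+         """Rank parameter tensors by gradient norm and keep only the top-k
+         trainable for the subsequent update, freezing all others."""
+         loss.backward()
+         norms = {name: p.grad.norm().item()
+                  for name, p in model.named_parameters() if p.grad is not None}
+         keep = set(sorted(norms, key=norms.get, reverse=True)[:k])
+         for name, p in model.named_parameters():
+             p.requires_grad_(name in keep)
+         return keep
+
+     # toy model and batch standing in for sequences with temporal drift
+     model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
+     x, y = torch.randn(4, 8), torch.randint(0, 2, (4,))
+     print(keep_salient_tensors(model, nn.functional.cross_entropy(model(x), y)))
+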
+
+ comment: EMNLP Findings 2024 +
+
+
+
+
+ + ☆ ACCIO: Table Understanding Enhanced via Contrastive Learning with + Aggregations + + +
+ The attention to table understanding using recent natural language models has +been growing. However, most related works tend to focus on learning the +structure of the table directly. Just as humans improve their understanding of +sentences by comparing them, they can also enhance their understanding by +comparing tables. With this idea, in this paper, we introduce ACCIO, tAble +understanding enhanCed via Contrastive learnIng with aggregatiOns, a novel +approach to enhancing table understanding by contrasting original tables with +their pivot summaries through contrastive learning. ACCIO trains an encoder to +bring these table pairs closer together. Through validation via column type +annotation, ACCIO achieves competitive performance with a macro F1 score of +91.1 compared to state-of-the-art methods. This work represents the first +attempt to utilize pairs of tables for table embedding, promising significant +advancements in table comprehension. Our code is available at +https://github.com/whnhch/ACCIO/. + +
+
+
+
+
+ + ☆ One fish, two fish, but not the whole sea: Alignment reduces language + models' conceptual diversity + + +
+ Researchers in social science and psychology have recently proposed using +large language models (LLMs) as replacements for humans in behavioral research. +In addition to arguments about whether LLMs accurately capture population-level +patterns, this has raised questions about whether LLMs capture human-like +conceptual diversity. Separately, it is debated whether post-training alignment +(RLHF or RLAIF) affects models' internal diversity. Inspired by human studies, +we use a new way of measuring the conceptual diversity of +synthetically-generated LLM "populations" by relating the internal variability +of simulated individuals to the population-level variability. We use this +approach to evaluate non-aligned and aligned LLMs on two domains with rich +human behavioral data. While no model reaches human-like diversity, aligned +models generally display less diversity than their instruction fine-tuned +counterparts. Our findings highlight potential trade-offs between increasing +models' value alignment and decreasing the diversity of their conceptual +representations. + +
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ☆ DELIFT: Data Efficient Language model Instruction Fine Tuning + + +
+ Fine-tuning large language models (LLMs) is essential for enhancing their +performance on specific tasks but is often resource-intensive due to redundant +or uninformative data. To address this inefficiency, we introduce DELIFT (Data +Efficient Language model Instruction Fine-Tuning), a novel algorithm that +systematically optimizes data selection across the three key stages of +fine-tuning: (1) instruction tuning, (2) task-specific fine-tuning (e.g., +reasoning, question-answering), and (3) continual fine-tuning (e.g., +incorporating new data versions). Unlike existing methods that focus on +single-stage optimization or rely on computationally intensive gradient +calculations, DELIFT operates efficiently across all stages. Central to our +approach is a pairwise utility metric that quantifies how beneficial a data +sample is for improving the model's responses to other samples, effectively +measuring the informational value relative to the model's current capabilities. +By leveraging different submodular functions applied to this metric, DELIFT +selects diverse and optimal subsets that are useful across all stages of +fine-tuning. Experiments across various tasks and model scales demonstrate that +DELIFT can reduce the fine-tuning data size by up to 70% without compromising +performance, offering significant computational savings and outperforming +existing methods in both efficiency and efficacy. + +
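+
+ A minimal Python sketch of greedy selection under one submodular function
+ (facility location) applied to a pairwise utility matrix, in the spirit of
+ the description above; DELIFT's actual utility metric and its choice of
+ submodular functions are not reproduced here.
+
+     import numpy as np
+
+     def greedy_facility_location(utility: np.ndarray, budget: int) -> list:
+         """Greedy maximization of f(S) = sum_j max_{i in S} utility[i, j],
+         a standard submodular objective for picking an informative subset."""
+         n = utility.shape[0]
+         selected, cover = [], np.zeros(n)
+         for _ in range(budget):
+             gains = np.maximum(utility, cover).sum(axis=1) - cover.sum()
+             gains[selected] = -np.inf          # never pick a sample twice
+             best = int(np.argmax(gains))
+             selected.append(best)
+             cover = np.maximum(cover, utility[best])
+         return selected
+
+     # toy pairwise utility matrix over five candidate training samples
+     U = np.random.default_rng(0).random((5, 5))
+     print(greedy_facility_location(U, budget=2))
+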
+
+
+
+
+ + ☆ Bayesian Calibration of Win Rate Estimation with LLM Evaluators EMNLP 2024 + + +
+ Recent advances in large language models (LLMs) show the potential of using +LLMs as evaluators for assessing the quality of text generations from LLMs. +However, applying LLM evaluators naively to compare or judge between different +systems can lead to unreliable results due to the intrinsic win rate estimation +bias of LLM evaluators. In order to mitigate this problem, we propose two +calibration methods, Bayesian Win Rate Sampling (BWRS) and Bayesian +Dawid-Skene, both of which leverage Bayesian inference to more accurately infer +the true win rate of generative language models. We empirically validate our +methods on six datasets covering story generation, summarization, and +instruction following tasks. We show that both our methods are effective in +improving the accuracy of win rate estimation using LLMs as evaluators, +offering a promising direction for reliable automatic text quality evaluation. + +
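+
+ A minimal Python sketch of Bayesian win-rate estimation with an imperfect LLM
+ judge: a grid posterior over the true win rate given assumed true/false
+ positive rates for the judge. This is a simplified stand-in, not the paper's
+ BWRS or Bayesian Dawid-Skene procedures.
+
+     import numpy as np
+     from scipy.stats import binom
+
+     def win_rate_posterior(k, n, tpr, fpr, grid=1001):
+         """Uniform-prior grid posterior over the true win rate theta when the
+         judge reports a win with probability tpr*theta + fpr*(1 - theta)."""
+         theta = np.linspace(0.0, 1.0, grid)
+         p_report_win = tpr * theta + fpr * (1.0 - theta)
+         post = binom.pmf(k, n, p_report_win)
+         return theta, post / post.sum()
+
+     # 70 reported wins out of 100 judged pairs, with an assumed judge profile
+     theta, post = win_rate_posterior(k=70, n=100, tpr=0.9, fpr=0.2)
+     print("posterior mean win rate:", float((theta * post).sum()))
+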
+
+ comment: Accepted by EMNLP 2024 +
+
+
+
+
+ + ☆ Variational Low-Rank Adaptation Using IVON NeurIPS 2024 + + +
+ We show that variational learning can significantly improve the accuracy and +calibration of Low-Rank Adaptation (LoRA) without a substantial increase in the +cost. We replace AdamW by the Improved Variational Online Newton (IVON) +algorithm to finetune large language models. For Llama-2 with 7 billion +parameters, IVON improves the accuracy over AdamW by 2.8% and expected +calibration error by 4.6%. The accuracy is also better than the other Bayesian +alternatives, yet the cost is lower and the implementation is easier. Our work +provides additional evidence for the effectiveness of IVON for large language +models. The code is available at +https://github.com/team-approx-bayes/ivon-lora. + +
+
+ comment: Published at 38th Workshop on Fine-Tuning in Machine Learning + (NeurIPS 2024). Code available at + https://github.com/team-approx-bayes/ivon-lora +
+
+
+
+
+ + ☆ Measuring short-form factuality in large language models + + +
+ We present SimpleQA, a benchmark that evaluates the ability of language +models to answer short, fact-seeking questions. We prioritized two properties +in designing this eval. First, SimpleQA is challenging, as it is adversarially +collected against GPT-4 responses. Second, responses are easy to grade, because +questions are created such that there exists only a single, indisputable +answer. Each answer in SimpleQA is graded as either correct, incorrect, or not +attempted. A model with ideal behavior would get as many questions correct as +possible while not attempting the questions for which it is not confident it +knows the correct answer. SimpleQA is a simple, targeted evaluation for whether +models "know what they know," and our hope is that this benchmark will remain +relevant for the next few generations of frontier models. SimpleQA can be found +at https://github.com/openai/simple-evals. + +
+
+ comment: Blog post: https://openai.com/index/introducing-simpleqa/ +
+
+
+
+
+ + ☆ Robust and Efficient Fine-tuning of LLMs with Bayesian + Reparameterization of Low-Rank Adaptation + + +
+ Large Language Models (LLMs) are highly resource-intensive to fine-tune due +to their enormous size. While low-rank adaptation is a prominent +parameter-efficient fine-tuning approach, it suffers from sensitivity to +hyperparameter choices, leading to instability in model performance on +fine-tuning downstream tasks. This paper highlights the importance of effective +parameterization in low-rank fine-tuning to reduce estimator variance and +enhance the stability of final model outputs. We propose MonteCLoRA, an +efficient fine-tuning technique, employing Monte Carlo estimation to learn an +unbiased posterior estimation of low-rank parameters with low expected +variance, which stabilizes fine-tuned LLMs with only O(1) additional +parameters. MonteCLoRA shows significant improvements in accuracy and +robustness, achieving up to 3.8% higher accuracy and 8.6% greater robustness +than existing efficient fine-tuning methods on natural language understanding +tasks with pre-trained RoBERTa-base. Furthermore, in generative tasks with +pre-trained LLaMA-1-7B, MonteCLoRA demonstrates robust zero-shot performance +with 50% lower variance than the contemporary efficient fine-tuning methods. +The theoretical and empirical results presented in the paper underscore how +parameterization and hyperpriors balance exploration-exploitation in the +low-rank parametric space, therefore leading to more optimal and robust +parameter estimation during efficient fine-tuning. + +
+
+ comment: 48 pages, 10 figures, 10 tables, Code: + https://github.com/LCS2-IIITD/MonteCLoRA +
+
+
+
+
+ + ☆ Scaling Laws for Precision + + +
+ Low precision training and inference affect both the quality and cost of +language models, but current scaling laws do not account for this. In this +work, we devise "precision-aware" scaling laws for both training and inference. +We propose that training in lower precision reduces the model's "effective +parameter count," allowing us to predict the additional loss incurred from +training in low precision and post-train quantization. For inference, we find +that the degradation introduced by post-training quantization increases as +models are trained on more data, eventually making additional pretraining data +actively harmful. For training, our scaling laws allow us to predict the loss +of a model with different parts in different precisions, and suggest that +training larger models in lower precision may be compute optimal. We unify the +scaling laws for post and pretraining quantization to arrive at a single +functional form that predicts degradation from training and inference in varied +precisions. We fit on over 465 pretraining runs and validate our predictions on +model sizes up to 1.7B parameters trained on up to 26B tokens. + +
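+
+ One plausible way to write a precision-aware scaling law of the kind sketched
+ above, with an "effective parameter count" that shrinks at low training
+ precision; the functional form of the precision factor and every constant
+ below are placeholders, not the paper's fitted values.
+
+     import math
+
+     def precision_aware_loss(N, D, bits, A=400.0, B=410.0, E=1.7,
+                              alpha=0.34, beta=0.28, gamma=0.5):
+         """Chinchilla-style loss with N_eff = N * (1 - exp(-gamma * bits)),
+         so fewer training bits behave like fewer effective parameters."""
+         n_eff = N * (1.0 - math.exp(-gamma * bits))
+         return A * n_eff ** (-alpha) + B * D ** (-beta) + E
+
+     # a 1B-parameter model on 20B tokens, trained at 16-bit vs. 4-bit precision
+     print(precision_aware_loss(1e9, 2e10, bits=16))
+     print(precision_aware_loss(1e9, 2e10, bits=4))
+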
+
+
+
+
+ + ☆ CodeTree: Agent-guided Tree Search for Code Generation with Large + Language Models + + +
+ Pre-trained on massive amounts of code and text data, large language models +(LLMs) have demonstrated remarkable achievements in performing code generation +tasks. With additional execution-based feedback, these models can act as agents +with capabilities to self-refine and improve generated code autonomously. +However, on challenging coding tasks with extremely large search space, current +agentic approaches still struggle with multi-stage planning, generating, and +debugging. To address this problem, we propose CodeTree, a framework for LLM +agents to efficiently explore the search space in different stages of the code +generation process. Specifically, we adopted a unified tree structure to +explicitly explore different coding strategies, generate corresponding coding +solutions, and subsequently refine the solutions. In each stage, critical +decision-making (ranking, termination, expanding) of the exploration process is +guided by both the environmental execution-based feedback and +LLM-agent-generated feedback. We comprehensively evaluated CodeTree on 7 code +generation benchmarks and demonstrated the significant performance gains of +CodeTree against strong baselines. Using GPT-4o as the base model, we +consistently achieved top results of 95.1 on HumanEval, 98.7 on MBPP, and 43.0 +on CodeContests. On the challenging SWEBench benchmark, our approach led to +significant performance gains. + +
+
+
+
+
+ + ☆ Balancing Transparency and Accuracy: A Comparative Analysis of + Rule-Based and Deep Learning Models in Political Bias Classification + + +
+ The unchecked spread of digital information, combined with increasing
+political polarization and the tendency of individuals to isolate themselves
+from opposing political viewpoints, has driven researchers to develop systems
+for automatically detecting political bias in media. This trend has been
+further fueled by discussions on social media. We explore methods for
+categorizing bias in US news articles, comparing rule-based and deep learning
+approaches. The study highlights the sensitivity of modern self-learning
+systems to unconstrained data ingestion, while reconsidering the strengths of
+traditional rule-based systems. Applying both models to left-leaning (CNN) and
+right-leaning (FOX) news articles, we assess their effectiveness on data
+beyond the original training and test sets. This analysis highlights each
+model's accuracy, offers a framework for exploring deep-learning
+explainability, and sheds light on political bias in US news media. We
+contrast the opaque architecture of a deep learning model with the
+transparency of a linguistically informed rule-based model, showing that the
+rule-based model performs consistently across different data conditions and
+offers greater transparency, whereas the deep learning model is dependent on
+the training set and struggles with unseen data.
+
+
+
+
+
+ + ♻ ☆ MediQ: Question-Asking LLMs and a Benchmark for Reliable Interactive + Clinical Reasoning + + +
+ Users typically engage with LLMs interactively, yet most existing benchmarks +evaluate them in a static, single-turn format, posing reliability concerns in +interactive scenarios. We identify a key obstacle towards reliability: LLMs are +trained to answer any question, even with incomplete context or insufficient +knowledge. In this paper, we propose to change the static paradigm to an +interactive one, develop systems that proactively ask questions to gather more +information and respond reliably, and introduce an benchmark - MediQ - to +evaluate question-asking ability in LLMs. MediQ simulates clinical interactions +consisting of a Patient System and an adaptive Expert System; with potentially +incomplete initial information, the Expert refrains from making diagnostic +decisions when unconfident, and instead elicits missing details via follow-up +questions. We provide a pipeline to convert single-turn medical benchmarks into +an interactive format. Our results show that directly prompting +state-of-the-art LLMs to ask questions degrades performance, indicating that +adapting LLMs to proactive information-seeking settings is nontrivial. We +experiment with abstention strategies to better estimate model confidence and +decide when to ask questions, improving diagnostic accuracy by 22.3%; however, +performance still lags compared to an (unrealistic in practice) upper bound +with complete information upfront. Further analyses show improved interactive +performance with filtering irrelevant contexts and reformatting conversations. +Overall, we introduce a novel problem towards LLM reliability, an interactive +MediQ benchmark and a novel question-asking system, and highlight directions to +extend LLMs' information-seeking abilities in critical domains. + +
+
+ comment: 29 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Talking the Talk Does Not Entail Walking the Walk: On the Limits of + Large Language Models in Lexical Entailment Recognition EMNLP-2024 + + +
+ Verbs form the backbone of language, providing structure and meaning to
+sentences. Yet, their intricate semantic nuances pose a longstanding
+challenge. Understanding verb relations through the concept of lexical
+entailment is crucial for comprehending sentence meanings and grasping verb
+dynamics. This work investigates the capabilities of eight Large Language
+Models in recognizing lexical entailment relations among verbs through
+differently devised prompting strategies and zero-/few-shot settings over verb
+pairs from two lexical databases, namely WordNet and HyperLex. Our findings
+unveil that the models can tackle the lexical entailment recognition task with
+moderately good performance, although with varying degrees of effectiveness
+and under different conditions. Also, utilizing few-shot prompting can enhance
+the models' performance. However, perfectly solving the task remains an unmet
+challenge for all examined LLMs, which calls for further research on this
+topic.
+
+
+ comment: Accepted for publication at The 2024 Conference on Empirical Methods + in Natural Language Processing (EMNLP-2024) - Findings +
+
+
+
+
+ + ♻ ☆ TinyStyler: Efficient Few-Shot Text Style Transfer with Authorship + Embeddings + + +
+ The goal of text style transfer is to transform the style of texts while +preserving their original meaning, often with only a few examples of the target +style. Existing style transfer methods generally rely on the few-shot +capabilities of large language models or on complex controllable text +generation approaches that are inefficient and underperform on fluency metrics. +We introduce TinyStyler, a lightweight but effective approach, which leverages +a small language model (800M params) and pre-trained authorship embeddings to +perform efficient, few-shot text style transfer. We evaluate on the challenging +task of authorship style transfer and find TinyStyler outperforms strong +approaches such as GPT-4. We also evaluate TinyStyler's ability to perform text +attribute style transfer (formal $\leftrightarrow$ informal) with automatic and +human evaluations and find that the approach outperforms recent controllable +text generation methods. Our model has been made publicly available at +https://huggingface.co/tinystyler/tinystyler . + +
+
+
+
+
+ + ♻ ☆ Perceptions of Linguistic Uncertainty by Language Models and Humans EMNLP 2024 + + +
+ _Uncertainty expressions_ such as "probably" or "highly unlikely" are +pervasive in human language. While prior work has established that there is +population-level agreement in terms of how humans quantitatively interpret +these expressions, there has been little inquiry into the abilities of language +models in the same context. In this paper, we investigate how language models +map linguistic expressions of uncertainty to numerical responses. Our approach +assesses whether language models can employ theory of mind in this setting: +understanding the uncertainty of another agent about a particular statement, +independently of the model's own certainty about that statement. We find that 7 +out of 10 models are able to map uncertainty expressions to probabilistic +responses in a human-like manner. However, we observe systematically different +behavior depending on whether a statement is actually true or false. This +sensitivity indicates that language models are substantially more susceptible +to bias based on their prior knowledge (as compared to humans). These findings +raise important questions and have broad implications for human-AI and AI-AI +communication. + +
+
+ comment: Accepted at EMNLP 2024 (Main) +
+
+
+
+
+ + ♻ ☆ On the Rigour of Scientific Writing: Criteria, Analysis, and Insights EMNLP 2024 + + +
+ Rigour is crucial for scientific research as it ensures the reproducibility +and validity of results and findings. Despite its importance, little work +exists on modelling rigour computationally, and there is a lack of analysis on +whether these criteria can effectively signal or measure the rigour of +scientific papers in practice. In this paper, we introduce a bottom-up, +data-driven framework to automatically identify and define rigour criteria and +assess their relevance in scientific writing. Our framework includes rigour +keyword extraction, detailed rigour definition generation, and salient criteria +identification. Furthermore, our framework is domain-agnostic and can be +tailored to the evaluation of scientific rigour for different areas, +accommodating the distinct salient criteria across fields. We conducted +comprehensive experiments based on datasets collected from two high impact +venues for Machine Learning and NLP (i.e., ICLR and ACL) to demonstrate the +effectiveness of our framework in modelling rigour. In addition, we analyse +linguistic patterns of rigour, revealing that framing certainty is crucial for +enhancing the perception of scientific rigour, while suggestion certainty and +probability uncertainty diminish it. + +
+
+ comment: Accepted Findings at EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Personalized Large Language Models ICDM + + +
+ Large language models (LLMs) have significantly advanced Natural Language +Processing (NLP) tasks in recent years. However, their universal nature poses +limitations in scenarios requiring personalized responses, such as +recommendation systems and chatbots. This paper investigates methods to +personalize LLMs, comparing fine-tuning and zero-shot reasoning approaches on +subjective tasks. Results demonstrate that personalized fine-tuning improves +model reasoning compared to non-personalized models. Experiments on datasets +for emotion recognition and hate speech detection show consistent performance +gains with personalized methods across different LLM architectures. These +findings underscore the importance of personalization for enhancing LLM +capabilities in subjective text perception tasks. + +
+
+ comment: Accepted to SENTIRE 2024 (ICDM Workshops): + https://sentic.net/sentire2024wozniak.pdf +
+
+
+
+
+ + ♻ ☆ FRACTURED-SORRY-Bench: Framework for Revealing Attacks in Conversational + Turns Undermining Refusal Efficacy and Defenses over SORRY-Bench (Automated + Multi-shot Jailbreaks) + + +
+ This paper introduces FRACTURED-SORRY-Bench, a framework for evaluating the +safety of Large Language Models (LLMs) against multi-turn conversational +attacks. Building upon the SORRY-Bench dataset, we propose a simple yet +effective method for generating adversarial prompts by breaking down harmful +queries into seemingly innocuous sub-questions. Our approach achieves a maximum +increase of +46.22\% in Attack Success Rates (ASRs) across GPT-4, GPT-4o, +GPT-4o-mini, and GPT-3.5-Turbo models compared to baseline methods. We +demonstrate that this technique poses a challenge to current LLM safety +measures and highlights the need for more robust defenses against subtle, +multi-turn attacks. + +
+
+ comment: 4 pages, 2 tables +
+
+
+
+
+ + ♻ ☆ Pre-Finetuning for Few-Shot Emotional Speech Recognition INTERSPEECH 2023 + + +
+ Speech models have long been known to overfit individual speakers for many +classification tasks. This leads to poor generalization in settings where the +speakers are out-of-domain or out-of-distribution, as is common in production +environments. We view speaker adaptation as a few-shot learning problem and +propose investigating transfer learning approaches inspired by recent success +with pre-trained models in natural language tasks. We propose pre-finetuning +speech models on difficult tasks to distill knowledge into few-shot downstream +classification objectives. We pre-finetune Wav2Vec2.0 on every permutation of +four multiclass emotional speech recognition corpora and evaluate our +pre-finetuned models through 33,600 few-shot fine-tuning trials on the +Emotional Speech Dataset. + +
+
+ comment: Published at INTERSPEECH 2023. 5 pages, 4 figures. Code available at + https://github.com/maxlchen/Speech-PreFinetuning +
+
+
+
+
+ + ♻ ☆ Gradient Cuff: Detecting Jailbreak Attacks on Large Language Models by + Exploring Refusal Loss Landscapes NeurIPS 2024 + + +
+ Large Language Models (LLMs) are becoming a prominent generative AI tool, +where the user enters a query and the LLM generates an answer. To reduce harm +and misuse, efforts have been made to align these LLMs to human values using +advanced training techniques such as Reinforcement Learning from Human Feedback +(RLHF). However, recent studies have highlighted the vulnerability of LLMs to +adversarial jailbreak attempts aiming at subverting the embedded safety +guardrails. To address this challenge, this paper defines and investigates the +Refusal Loss of LLMs and then proposes a method called Gradient Cuff to detect +jailbreak attempts. Gradient Cuff exploits the unique properties observed in +the refusal loss landscape, including functional values and its smoothness, to +design an effective two-step detection strategy. Experimental results on two +aligned LLMs (LLaMA-2-7B-Chat and Vicuna-7B-V1.5) and six types of jailbreak +attacks (GCG, AutoDAN, PAIR, TAP, Base64, and LRL) show that Gradient Cuff can +significantly improve the LLM's rejection capability for malicious jailbreak +queries, while maintaining the model's performance for benign user queries by +adjusting the detection threshold. + +
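+
+ A schematic Python sketch in the spirit of the two-step detection described
+ above: step one thresholds the refusal-loss value, step two thresholds a
+ zeroth-order estimate of its gradient norm. The refusal-probability function,
+ thresholds, and perturbation scheme are stand-ins, not the paper's
+ implementation.
+
+     import numpy as np
+
+     def gradient_cuff_detect(emb, refusal_prob, f_thresh=0.5, g_thresh=1.0,
+                              mu=0.02, n_samples=8, seed=0):
+         """Flag a query if its refusal loss (1 - refusal probability) is low,
+         or if the estimated gradient norm of the refusal loss is large."""
+         f0 = 1.0 - refusal_prob(emb)
+         if f0 < f_thresh:
+             return True
+         rng = np.random.default_rng(seed)
+         grads = []
+         for _ in range(n_samples):
+             u = rng.normal(size=emb.shape)
+             u /= np.linalg.norm(u)
+             fu = 1.0 - refusal_prob(emb + mu * u)
+             grads.append((fu - f0) / mu * u)     # directional estimate
+         return float(np.linalg.norm(np.mean(grads, axis=0))) > g_thresh
+
+     # stub judge: refusal probability decays away from a "safe" embedding
+     stub_prob = lambda e: float(np.exp(-np.linalg.norm(e)))
+     print(gradient_cuff_detect(np.full(16, 0.3), stub_prob))
+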
+
+ comment: Accepted by NeurIPS 2024. Project page: + https://huggingface.co/spaces/TrustSafeAI/GradientCuff-Jailbreak-Defense +
+
+
+
+
+ + ♻ ☆ SYNTHEVAL: Hybrid Behavioral Testing of NLP Models with Synthetic + CheckLists EMNLP 2024 + + +
+ Traditional benchmarking in NLP typically involves using static held-out
+test sets. However, this approach often results in an overestimation of
+performance and lacks the ability to offer comprehensive, interpretable, and
+dynamic assessments of NLP models. Recently, works like DynaBench (Kiela et
+al., 2021) and CheckList (Ribeiro et al., 2020) have addressed these
+limitations through behavioral testing of NLP models with test types generated
+by a multistep human-annotated pipeline. Unfortunately, manually creating a
+variety of test types requires much human labor, often at prohibitive cost. In
+this work, we propose SYNTHEVAL, a hybrid behavioral testing framework that
+leverages large language models (LLMs) to generate a wide range of test types
+for a comprehensive evaluation of NLP models. SYNTHEVAL first generates
+sentences via LLMs using controlled generation, and then identifies
+challenging examples by comparing the predictions made by LLMs with
+task-specific NLP models. In the last stage, human experts investigate the
+challenging examples, manually design templates, and identify the types of
+failures the task-specific models consistently exhibit. We apply SYNTHEVAL to
+two classification tasks, sentiment analysis and toxic language detection, and
+show that our framework is effective in identifying weaknesses of strong
+models on these tasks. We share our code at
+https://github.com/Loreley99/SynthEval_CheckList.
+
+
+ comment: EMNLP 2024 - Findings +
+
+
+
+
+ + ♻ ☆ MEG: Medical Knowledge-Augmented Large Language Models for Question + Answering + + +
+ Question answering is a natural language understanding task that involves +reasoning over both explicit context and unstated, relevant domain knowledge. +Large language models (LLMs), which underpin most contemporary question +answering systems, struggle to induce how concepts relate in specialized +domains such as medicine. Existing medical LLMs are also costly to train. In +this work, we present MEG, a parameter-efficient approach for medical +knowledge-augmented LLMs. MEG uses a lightweight mapping network to integrate +graph embeddings into the LLM, enabling it to leverage external knowledge in a +cost-effective way. We evaluate our method on four popular medical +multiple-choice datasets and show that LLMs greatly benefit from the factual +grounding provided by knowledge graph embeddings. MEG attains an average of ++10.2% accuracy over the Mistral-Instruct baseline, and +6.7% over specialized +models like BioMistral. We also show results based on Llama-3. Finally, we show +that MEG's performance remains robust to the choice of graph encoder. + +
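+
+ A minimal PyTorch sketch of a lightweight mapping network that projects
+ knowledge-graph embeddings into an LLM's token-embedding space, in the spirit
+ of the description above; the dimensions and the two-layer MLP are
+ illustrative assumptions.
+
+     import torch
+     import torch.nn as nn
+
+     class GraphToLLMMapper(nn.Module):
+         """Projects knowledge-graph node embeddings into the LLM's
+         token-embedding space so they can be prepended as soft prompts."""
+         def __init__(self, kg_dim=256, llm_dim=4096, hidden=1024):
+             super().__init__()
+             self.net = nn.Sequential(nn.Linear(kg_dim, hidden), nn.GELU(),
+                                      nn.Linear(hidden, llm_dim))
+
+         def forward(self, kg_emb):        # (batch, n_nodes, kg_dim)
+             return self.net(kg_emb)       # (batch, n_nodes, llm_dim)
+
+     soft_prompts = GraphToLLMMapper()(torch.randn(2, 8, 256))
+     print(soft_prompts.shape)             # torch.Size([2, 8, 4096])
+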
+
+
+
+
+ + ♻ ☆ MILPaC: A Novel Benchmark for Evaluating Translation of Legal Text to + Indian Languages + + +
+ Most legal text in the Indian judiciary is written in complex English due to +historical reasons. However, only a small fraction of the Indian population is +comfortable in reading English. Hence legal text needs to be made available in +various Indian languages, possibly by translating the available legal text from +English. Though there has been a lot of research on translation to and between +Indian languages, to our knowledge, there has not been much prior work on such +translation in the legal domain. In this work, we construct the first +high-quality legal parallel corpus containing aligned text units in English and +nine Indian languages, that includes several low-resource languages. We also +benchmark the performance of a wide variety of Machine Translation (MT) systems +over this corpus, including commercial MT systems, open-source MT systems and +Large Language Models. Through a comprehensive survey by Law practitioners, we +check how satisfied they are with the translations by some of these MT systems, +and how well automatic MT evaluation metrics agree with the opinions of Law +practitioners. + +
+
+ comment: To be published in ACM Transactions on Asian and Low-Resource + Language Information Processing (TALLIP) +
+
+
+
+
+ + ♻ ☆ Dynamic Speculation Lookahead Accelerates Speculative Decoding of Large + Language Models + + +
+ Speculative decoding is commonly used for reducing the inference latency of
+large language models. Its effectiveness depends highly on the speculation
+lookahead (SL), the number of tokens generated by the draft model at each
+iteration. In this work, we show that the common practice of using the same SL
+for all iterations (static SL) is suboptimal. We introduce DISCO (DynamIc
+SpeCulation lookahead Optimization), a novel method for dynamically selecting
+the SL. Our experiments with four datasets show that DISCO reaches an average
+speedup of 10% compared to the best static SL baseline, while generating the
+exact same text.
+
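+
+ A toy Python sketch of speculative decoding with a dynamically adjusted
+ speculation lookahead: SL grows after full acceptance and shrinks otherwise.
+ The draft and verification functions are stubs, and the grow/shrink heuristic
+ is only illustrative; it is not DISCO's selection method.
+
+     import random
+
+     def draft_tokens(prefix, k):            # stub draft model
+         return [f"tok{len(prefix) + i}" for i in range(k)]
+
+     def accepted_count(prefix, proposal):   # stub target-model verification
+         return random.randint(0, len(proposal))
+
+     def speculative_decode(prefix, max_new=16, sl=5, sl_min=1, sl_max=10):
+         """Each iteration drafts `sl` tokens; SL is adapted per iteration."""
+         out = list(prefix)
+         while len(out) - len(prefix) < max_new:
+             proposal = draft_tokens(out, sl)
+             n_ok = accepted_count(out, proposal)
+             # on rejection the target model still emits one corrected token
+             out.extend(proposal[:n_ok] or [f"tok{len(out)}"])
+             sl = min(sl + 1, sl_max) if n_ok == len(proposal) else max(sl - 1, sl_min)
+         return out
+
+     print(len(speculative_decode(["<s>"])))
+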
+
+
+
+
+ + ♻ ☆ Wave Network: An Ultra-Small Language Model + + +
+ We propose an innovative token representation and update method in a new +ultra-small language model: the Wave network. Specifically, we use a complex +vector to represent each token, encoding both global and local semantics of the +input text. A complex vector consists of two components: a magnitude vector +representing the global semantics of the input text, and a phase vector +capturing the relationships between individual tokens and global semantics. +Experiments on the AG News text classification task demonstrate that, when +generating complex vectors from randomly initialized token embeddings, our +single-layer Wave Network achieves 90.91% accuracy with wave interference and +91.66% with wave modulation - outperforming a single Transformer layer using +BERT pre-trained embeddings by 19.23% and 19.98%, respectively, and approaching +the accuracy of the pre-trained and fine-tuned BERT base model (94.64%). +Additionally, compared to BERT base, the Wave Network reduces video memory +usage and training time by 77.34% and 85.62% during wave modulation. In +summary, we used a 2.4-million-parameter small language model to achieve +accuracy comparable to a 100-million-parameter BERT model in text +classification. + +
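+
+ A minimal numpy sketch of one natural reading of the complex token
+ representation above: magnitude carries global semantics, phase carries the
+ token-to-global relationship, and two tokens are combined by complex addition
+ ("interference") or element-wise complex multiplication ("modulation"). The
+ Wave Network's exact operations may differ.
+
+     import numpy as np
+
+     def to_complex(magnitude, phase):
+         """Complex token vector: magnitude for global semantics, phase for
+         the token's relationship to those global semantics."""
+         return magnitude * np.exp(1j * phase)
+
+     rng = np.random.default_rng(0)
+     dim = 8
+     g = np.abs(rng.normal(size=dim))            # shared global magnitude
+     t1 = to_complex(g, rng.uniform(0, 2 * np.pi, dim))
+     t2 = to_complex(g, rng.uniform(0, 2 * np.pi, dim))
+
+     interference = t1 + t2                      # complex addition
+     modulation = t1 * t2                        # element-wise multiplication
+     print(np.abs(interference)[:3], np.abs(modulation)[:3])
+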
+
+
+
+
+ + ♻ ☆ ALI-Agent: Assessing LLMs' Alignment with Human Values via Agent-based + Evaluation + + +
+ Large Language Models (LLMs) can elicit unintended and even harmful content +when misaligned with human values, posing severe risks to users and society. To +mitigate these risks, current evaluation benchmarks predominantly employ +expert-designed contextual scenarios to assess how well LLMs align with human +values. However, the labor-intensive nature of these benchmarks limits their +test scope, hindering their ability to generalize to the extensive variety of +open-world use cases and identify rare but crucial long-tail risks. +Additionally, these static tests fail to adapt to the rapid evolution of LLMs, +making it hard to evaluate timely alignment issues. To address these +challenges, we propose ALI-Agent, an evaluation framework that leverages the +autonomous abilities of LLM-powered agents to conduct in-depth and adaptive +alignment assessments. ALI-Agent operates through two principal stages: +Emulation and Refinement. During the Emulation stage, ALI-Agent automates the +generation of realistic test scenarios. In the Refinement stage, it iteratively +refines the scenarios to probe long-tail risks. Specifically, ALI-Agent +incorporates a memory module to guide test scenario generation, a tool-using +module to reduce human labor in tasks such as evaluating feedback from target +LLMs, and an action module to refine tests. Extensive experiments across three +aspects of human values--stereotypes, morality, and legality--demonstrate that +ALI-Agent, as a general evaluation framework, effectively identifies model +misalignment. Systematic analysis also validates that the generated test +scenarios represent meaningful use cases, as well as integrate enhanced +measures to probe long-tail risks. Our code is available at +https://github.com/SophieZheng998/ALI-Agent.git + +
+
+
+
+
+ + ♻ ☆ LongEmbed: Extending Embedding Models for Long Context Retrieval EMNLP 2024 + + +
+ Embedding models play a pivotal role in modern NLP applications such as IR
+and RAG. While the context limit of LLMs has been pushed beyond 1 million
+tokens, embedding models are still confined to a narrow context window not
+exceeding 8k tokens, which keeps them out of application scenarios requiring
+long inputs, such as legal contracts. This paper explores context window
+extension of existing embedding models, pushing the limit to 32k without
+requiring additional training. First, we examine the performance of current
+embedding models for long context retrieval on our newly constructed LongEmbed
+benchmark. LongEmbed comprises two synthetic tasks and four carefully chosen
+real-world tasks, featuring documents of varying length and dispersed target
+information. Benchmarking results underscore huge room for improvement in
+these models. Based on this, comprehensive experiments show that training-free
+context window extension strategies like position interpolation can
+effectively extend the context window of existing embedding models
+severalfold, whether their original context is 512 or beyond 4k. Furthermore,
+for models employing absolute position encoding (APE), we show the possibility
+of further fine-tuning to harvest notable performance gains while strictly
+preserving original behavior for short inputs. For models using rotary
+position embedding (RoPE), significant enhancements are observed when
+employing RoPE-specific methods, such as NTK and SelfExtend, indicating RoPE's
+superiority over APE for context window extension. To facilitate future
+research, we release E5-Base-4k and E5-RoPE-Base, along with the LongEmbed
+benchmark.
+
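+
+ A minimal PyTorch sketch of plain position interpolation for a model with
+ absolute position embeddings: the trained position table is linearly
+ stretched so longer inputs reuse the trained range. The table sizes are
+ illustrative, and the RoPE-specific strategies mentioned above (NTK,
+ SelfExtend) are not shown.
+
+     import torch
+     import torch.nn.functional as F
+
+     def interpolate_positions(pos_table: torch.Tensor, target_len: int):
+         """Linearly stretch a trained (trained_len, dim) absolute-position
+         table to target_len positions."""
+         stretched = F.interpolate(pos_table.T.unsqueeze(0), size=target_len,
+                                   mode="linear", align_corners=True)
+         return stretched.squeeze(0).T           # (target_len, dim)
+
+     trained = torch.randn(512, 768)             # e.g. a 512-position table
+     print(interpolate_positions(trained, 4096).shape)  # (4096, 768)
+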
+
+ comment: EMNLP 2024 Camera Ready +
+
+
+
+
+ + ♻ ☆ LOVA3: Learning to Visual Question Answering, Asking and Assessment NeurIPS 2024 + + +
+ Question answering, asking, and assessment are three innate human traits +crucial for understanding the world and acquiring knowledge. By enhancing these +capabilities, humans can more effectively utilize data, leading to better +comprehension and learning outcomes. Current Multimodal Large Language Models +(MLLMs) primarily focus on question answering, often neglecting the full +potential of questioning and assessment skills. Inspired by the human learning +mechanism, we introduce LOVA3, an innovative framework named "Learning tO +Visual question Answering, Asking and Assessment," designed to equip MLLMs with +these additional capabilities. Our approach involves the creation of two +supplementary training tasks GenQA and EvalQA, aiming at fostering the skills +of asking and assessing questions in the context of images. To develop the +questioning ability, we compile a comprehensive set of multimodal foundational +tasks. For assessment, we introduce a new benchmark called EvalQABench, +comprising 64,000 training samples (split evenly between positive and negative +samples) and 5,000 validation and testing samples. We posit that enhancing +MLLMs with the capabilities to answer, ask, and assess questions will enhance +their multimodal comprehension, ultimately improving overall performance. To +validate this hypothesis, we train MLLMs using the LOVA3 framework and evaluate +them on a range of multimodal datasets and benchmarks. Our results demonstrate +consistent performance gains, underscoring the critical role of these +additional tasks in fostering comprehensive intelligence in MLLMs. The code is +available at https://github.com/showlab/LOVA3. + +
+
+ comment: Accepted by NeurIPS 2024. The code is available at + https://github.com/showlab/LOVA3 +
+
+
+
+
+ + ♻ ☆ ReMoDetect: Reward Models Recognize Aligned LLM's Generations NeurIPS 2024 + + +
+ The remarkable capabilities and easy accessibility of large language models
+(LLMs) have significantly increased societal risks (e.g., fake news
+generation), necessitating the development of LLM-generated text (LGT)
+detection methods for safe usage. However, detecting LGTs is challenging due
+to the vast number of LLMs, making it impractical to account for each LLM
+individually; hence, it is crucial to identify the common characteristics
+shared by these models. In this paper, we draw attention to a common feature
+of recent powerful LLMs, namely the alignment training, i.e., training LLMs to
+generate human-preferable texts. Our key finding is that as these aligned LLMs
+are trained to maximize human preferences, they generate texts with even
+higher estimated preferences than human-written texts; thus, such texts are
+easily detected by using a reward model (i.e., an LLM trained to model the
+human preference distribution). Based on this finding, we propose two training
+schemes to further improve the detection ability of the reward model, namely
+(i) continual preference fine-tuning to make the reward model prefer aligned
+LGTs even further and (ii) reward modeling of Human/LLM mixed texts (texts
+rephrased from human-written texts using aligned LLMs), which serve as a
+median-preference text corpus between LGTs and human-written texts to learn
+the decision boundary better. We provide an extensive evaluation by
+considering six text domains across twelve aligned LLMs, where our method
+demonstrates state-of-the-art results. Code is available at
+https://github.com/hyunseoklee-ai/ReMoDetect.
+
+
+ comment: Published as a conference proceeding for NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ SciEval: A Multi-Level Large Language Model Evaluation Benchmark for + Scientific Research AAAI 2024 + + +
+ Recently, there has been growing interest in using Large Language Models +(LLMs) for scientific research. Numerous benchmarks have been proposed to +evaluate the ability of LLMs for scientific research. However, current +benchmarks are mostly based on pre-collected objective questions. This design +suffers from data leakage problem and lacks the evaluation of subjective Q/A +ability. In this paper, we propose SciEval, a comprehensive and +multi-disciplinary evaluation benchmark to address these issues. Based on +Bloom's taxonomy, SciEval covers four dimensions to systematically evaluate +scientific research ability. In particular, we design a "dynamic" subset based +on scientific principles to prevent evaluation from potential data leakage. +Both objective and subjective questions are included in SciEval. These +characteristics make SciEval a more effective benchmark for scientific research +ability evaluation of LLMs. Comprehensive experiments on most advanced LLMs +show that, although GPT-4 achieves SOTA performance compared to other LLMs, +there is still substantial room for improvement, especially for dynamic +questions. The codes and data are publicly available on +https://github.com/OpenDFM/SciEval. + +
+
+ comment: 12 pages, 17 figures, 12 tables. Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Do Large Language Models Truly Grasp Mathematics? An Empirical + Exploration From A Psychological Perspective + + +
+ Despite their proficiency in math tasks, the mechanisms underlying LLMs' +mathematical reasoning abilities remain a subject of debate. Recent studies +suggest that chain-of-thought (CoT) prompts can bolster mathematical reasoning +by encouraging LLMs to employ human-like logical reasoning (System 2), enabling +them to excel on the Cognitive Reflection Test (CRT). To assess whether LLMs +genuinely possess System 2-like logical reasoning, we introduced targeted +modifications to CRT problems. Our findings reveal that, despite the use of CoT +prompts, mainstream LLMs, including the latest o1-preview model, continue to +exhibit a significant error rate. Further analysis indicates that they +predominantly rely on System 1-like intuitive reasoning and pattern matching +derived from training data, rather than demonstrating mastery of mathematical +thinking. This discovery challenges the prevailing notion that LLMs possess +genuine logical reasoning abilities and that CoT can enhance them. +Consequently, this work may temper overly optimistic projections regarding +LLMs' advancement toward artificial general intelligence. + +
+
+
+
+
+ + ♻ ☆ A Collaborative Content Moderation Framework for Toxicity Detection + based on Conformalized Estimates of Annotation Disagreement + + +
+ Content moderation typically combines the efforts of human moderators and
+machine learning models. However, these systems often rely on data where
+significant disagreement occurs during moderation, reflecting the subjective
+nature of toxicity perception. Rather than dismissing this disagreement as
+noise, we interpret it as a valuable signal that highlights the inherent
+ambiguity of the content, an insight missed when only the majority label is
+considered. In this work, we introduce a novel content moderation framework
+that emphasizes the importance of capturing annotation disagreement. Our
+approach uses multitask learning, where toxicity classification serves as the
+primary task and annotation disagreement is addressed as an auxiliary task.
+Additionally, we leverage uncertainty estimation techniques, specifically
+Conformal Prediction, to account for both the ambiguity in comment annotations
+and the model's inherent uncertainty in predicting toxicity and disagreement.
+The framework also allows moderators to adjust thresholds for annotation
+disagreement, offering flexibility in determining when ambiguity should trigger
+a review. We demonstrate that our joint approach enhances model performance,
+calibration, and uncertainty estimation, while offering greater parameter
+efficiency and improving the review process in comparison to single-task
+methods.
+
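+ For intuition, here is a minimal split-conformal sketch of the kind of
+uncertainty estimation described above; `cal_probs` and `test_probs` are
+outputs of a hypothetical trained toxicity classifier, and the paper's
+multitask disagreement head is not reproduced.
+
+```python
+import numpy as np
+
+def conformal_threshold(cal_probs, cal_labels, alpha=0.1):
+    # Nonconformity score: 1 - probability assigned to the true label.
+    scores = np.sort(1.0 - cal_probs[np.arange(len(cal_labels)), cal_labels])
+    n = len(scores)
+    k = min(int(np.ceil((n + 1) * (1 - alpha))) - 1, n - 1)
+    return float(scores[k])
+
+def prediction_sets(test_probs, threshold):
+    # All labels whose nonconformity stays within the calibrated threshold.
+    return [np.flatnonzero(1.0 - p <= threshold).tolist() for p in test_probs]
+
+# A set containing both labels (e.g., [0, 1] for non-toxic and toxic) signals
+# ambiguity that a moderator-chosen threshold could route to human review.
+```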
+
+ comment: 35 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Instruct, Not Assist: LLM-based Multi-Turn Planning and Hierarchical + Questioning for Socratic Code Debugging EMNLP'24 + + +
+ Socratic questioning is an effective teaching strategy, encouraging critical +thinking and problem-solving. The conversational capabilities of large language +models (LLMs) show great potential for providing scalable, real-time student +guidance. However, current LLMs often give away solutions directly, making them +ineffective instructors. We tackle this issue in the code debugging domain with +TreeInstruct, an Instructor agent guided by a novel state space-based planning +algorithm. TreeInstruct asks probing questions to help students independently +identify and resolve errors. It estimates a student's conceptual and +syntactical knowledge to dynamically construct a question tree based on their +responses and current knowledge state, effectively addressing both independent +and dependent mistakes concurrently in a multi-turn interaction setting. In +addition to using an existing single-bug debugging benchmark, we construct a +more challenging multi-bug dataset of 150 coding problems, incorrect solutions, +and bug fixes -- all carefully constructed and annotated by experts. Extensive +evaluation shows TreeInstruct's state-of-the-art performance on both datasets, +proving it to be a more effective instructor than baselines. Furthermore, a +real-world case study with five students of varying skill levels further +demonstrates TreeInstruct's ability to guide students to debug their code +efficiently with minimal turns and highly Socratic questioning. + +
+
+ comment: Code available at: https://github.com/agarwalishika/TreeInstruct + Accepted at EMNLP'24 Findings +
+
+
+
+
+ + ♻ ☆ PAD: Personalized Alignment of LLMs at Decoding-Time + + +
+ Aligning with personalized preferences, which vary significantly across +cultural, educational, and political differences, poses a significant challenge +due to the computational costs and data demands of traditional alignment +methods. In response, this paper presents Personalized Alignment at +Decoding-time (PAD), a novel framework designed to align LLM outputs with +diverse personalized preferences during the inference phase, eliminating the +need for additional training. By introducing a unique personalized reward +modeling strategy, this framework decouples the text generation process from +personalized preferences, facilitating the generation of generalizable +token-level personalized rewards. The PAD algorithm leverages these rewards to +guide the decoding process, dynamically tailoring the base model's predictions +to personalized preferences. Extensive experimental results demonstrate that +PAD not only outperforms existing training-based alignment methods in terms of +aligning with diverse preferences but also shows significant generalizability +to preferences unseen during training and scalability across different base +models. This work advances the capability of LLMs to meet user needs in +real-time applications, presenting a substantial step forward in personalized +LLM alignment. + +
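+ A schematic of reward-guided decoding in the spirit of the description above;
+`base_logprobs` and `token_rewards` are hypothetical callables standing in for
+the base LLM and a token-level personalized reward model, and the paper's
+reward modeling strategy is not reproduced.
+
+```python
+import numpy as np
+
+def personalized_decode_step(context, base_logprobs, token_rewards, weight=1.0):
+    # Shift the base model's next-token scores by a personalized reward before
+    # sampling, so no additional training of the base model is needed.
+    scores = base_logprobs(context) + weight * token_rewards(context)  # (vocab,)
+    probs = np.exp(scores - scores.max())
+    probs /= probs.sum()
+    rng = np.random.default_rng(0)
+    return int(rng.choice(len(probs), p=probs))
+```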
+
+ comment: This paper presents Personalized Alignment at Decoding-time (PAD), a + novel framework designed to align LLM outputs with diverse personalized + preferences during the inference phase +
+
+
+
+
+ + ♻ ☆ The Translation of Circumlocution in Arabic Short Stories into English + + +
+ This study investigates the translation of circumlocution from Arabic to
+English in a corpus of short stories by renowned Arabic authors. By analyzing
+the source and target texts, the study aims to identify and categorize
+circumlocution instances in Arabic and their corresponding renditions in
+English. The study employs Nida's (1964) translation theory as a framework to
+assess the appropriateness of the translation strategies employed. It examines
+the extent to which translators successfully rendered Arabic circumlocution
+into English, identifying potential challenges and limitations in the
+translation process. The findings reveal significant similarities between
+Arabic circumlocution categories and English metadiscourse categories,
+particularly in terms of textual and interpersonal functions. However, the
+study also highlights instances where translators encountered difficulties in
+accurately conveying the nuances of circumlocution, often resorting to
+strategies like addition, subtraction, and alteration. https://ntu.edu.iq/
+
+
+
+
+
+ + ♻ ☆ SciDFM: A Large Language Model with Mixture-of-Experts for Science NeurIPS + 2024 + + +
+ Recently, there has been a significant upsurge of interest in leveraging
+large language models (LLMs) to assist scientific discovery. However, most LLMs
+only focus on general science, while they lack domain-specific knowledge, such
+as chemical molecules and amino acid sequences. To bridge these gaps, we
+introduce SciDFM, a mixture-of-experts LLM, which is trained from scratch and
+is able to conduct college-level scientific reasoning and understand molecules
+and amino acid sequences. We collect a large-scale training corpus containing
+numerous scientific papers and books from different disciplines as well as data
+from domain-specific databases. We further fine-tune the pre-trained model on
+a large amount of instruction data to improve performance on downstream
+benchmarks. From experimental results, we show that SciDFM achieves strong
+performance on general scientific benchmarks such as SciEval and SciQ, and it
+reaches SOTA performance on domain-specific benchmarks among models of similar
+size. We further analyze the expert layers and show that the results of expert
+selection vary with data from different disciplines. To benefit the broader
+research community, we open-source SciDFM at
+https://huggingface.co/OpenDFM/SciDFM-MoE-A5.6B-v1.0.
+
+
+ comment: 12 pages, 1 figure, 9 tables. Technical Report, accepted by NeurIPS + 2024 Workshop FM4Science +
+
+
+
+
+ + ♻ ☆ GraphTeam: Facilitating Large Language Model-based Graph Analysis via + Multi-Agent Collaboration + + +
+ Graphs are widely used for modeling relational data in real-world scenarios, +such as social networks and urban computing. Existing LLM-based graph analysis +approaches either integrate graph neural networks (GNNs) for specific machine +learning tasks, limiting their transferability, or rely solely on LLMs' +internal reasoning ability, resulting in suboptimal performance. To address +these limitations, we take advantage of recent advances in LLM-based agents, +which have shown capabilities of utilizing external knowledge or tools for +problem solving. By simulating human problem-solving strategies such as analogy +and collaboration, we propose a multi-agent system based on LLMs named +GraphTeam, for graph analysis. GraphTeam consists of five LLM-based agents from +three modules, and the agents with different specialities can collaborate with +each other to address complex problems. Specifically, (1) input-output +normalization module: the question agent extracts and refines four key +arguments from the original question, facilitating the problem understanding, +and the answer agent organizes the results to meet the output requirement; (2) +external knowledge retrieval module: we first build a knowledge base consisting +of relevant documentation and experience information, and then the search agent +retrieves the most relevant entries for each question. (3) problem-solving +module: given the retrieved information from search agent, the coding agent +uses established algorithms via programming to generate solutions, and in case +the coding agent does not work, the reasoning agent will directly compute the +results without programming. Extensive experiments on six graph analysis +benchmarks demonstrate that GraphTeam achieves state-of-the-art performance +with an average 25.85% improvement over the best baseline in terms of accuracy. +The code and data are available at https://github.com/BUPT-GAMMA/GraphTeam. + +
+
+
+
+
+ + ♻ ☆ What is lost in Normalization? Exploring Pitfalls in Multilingual ASR + Model Evaluations EMNLP 2024 + + +
+ This paper explores the pitfalls in evaluating multilingual automatic speech +recognition (ASR) models, with a particular focus on Indic language scripts. We +investigate the text normalization routine employed by leading ASR models, +including OpenAI Whisper, Meta's MMS, Seamless, and Assembly AI's Conformer, +and their unintended consequences on performance metrics. Our research reveals +that current text normalization practices, while aiming to standardize ASR +outputs for fair comparison, by removing inconsistencies such as variations in +spelling, punctuation, and special characters, are fundamentally flawed when +applied to Indic scripts. Through empirical analysis using text similarity +scores and in-depth linguistic examination, we demonstrate that these flaws +lead to artificially improved performance metrics for Indic languages. We +conclude by proposing a shift towards developing text normalization routines +that leverage native linguistic expertise, ensuring more robust and accurate +evaluations of multilingual ASR models. + +
+
+ comment: Accepted to EMNLP 2024 Main +
+
+
+
+
+ + ♻ ☆ Winner-Take-All Column Row Sampling for Memory Efficient Adaptation of + Language Model + + +
+ With the rapid growth in model size, fine-tuning large pre-trained language
+models has become increasingly difficult due to their extensive memory usage.
+Previous works usually focus on reducing the number of trainable parameters in
+the network. While the model parameters do contribute to memory usage, the
+primary memory bottleneck during training arises from storing feature maps,
+also known as activations, as they are crucial for gradient calculation.
+Notably, neural networks are usually trained using stochastic gradient descent.
+We argue that in stochastic optimization, models can handle noisy gradients as
+long as the gradient estimator is unbiased with reasonable variance. Following
+this motivation, we propose WTA-CRS, a new family of unbiased estimators for
+matrix multiplication with reduced variance, which only requires storing the
+sub-sampled activations for calculating the gradient. Our work provides both
+theoretical and experimental evidence that, in the context of tuning
+transformers, our proposed estimators exhibit lower variance compared to
+existing ones. By replacing the linear operation with our approximated one in
+transformers, we can achieve up to 2.7$\times$ peak memory reduction with
+almost no accuracy drop and enable up to a $6.4\times$ larger batch size. Under
+the same hardware, WTA-CRS enables better downstream task performance by
+applying larger models and/or faster training speed with larger batch sizes.
+
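+ For intuition, below is the classic unbiased column-row sampling estimator
+that this family of methods builds on; the winner-take-all refinement and the
+activation sub-sampling used during backpropagation are not shown, and the
+shapes are illustrative.
+
+```python
+import numpy as np
+
+def crs_matmul(A, B, s, seed=0):
+    """Unbiased estimate of A @ B using s sampled column-row pairs."""
+    rng = np.random.default_rng(seed)
+    k = A.shape[1]
+    # Sample index i with probability proportional to ||A[:, i]|| * ||B[i, :]||.
+    p = np.linalg.norm(A, axis=0) * np.linalg.norm(B, axis=1)
+    p = p / p.sum()
+    idx = rng.choice(k, size=s, replace=True, p=p)
+    # Importance-weight each sampled outer product so the estimator stays unbiased.
+    return sum(np.outer(A[:, i], B[i, :]) / (s * p[i]) for i in idx)
+
+A, B = np.random.randn(64, 256), np.random.randn(256, 32)
+approx = crs_matmul(A, B, s=64)  # noisy but unbiased estimate of A @ B
+```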
+
+
+
+
+ + ♻ ☆ Evaluating Quality of Answers for Retrieval-Augmented Generation: A + Strong LLM Is All You Need + + +
+ We present a comprehensive study of answer quality evaluation in
+Retrieval-Augmented Generation (RAG) applications using vRAG-Eval, a novel
+grading system that is designed to assess correctness, completeness, and
+honesty. We further map the grading of the aforementioned quality aspects into
+a binary score, indicating an accept or reject decision, mirroring the
+intuitive "thumbs-up" or "thumbs-down" gesture commonly used in chat
+applications. This approach suits factual business contexts where a clear
+accept-or-reject decision is essential. Our assessment applies vRAG-Eval to two
+Large Language Models (LLMs), evaluating the quality of answers generated by a
+vanilla RAG application. We compare these evaluations with human expert
+judgments and find a substantial alignment between GPT-4's assessments and
+those of human experts, reaching 83% agreement on accept or reject decisions.
+This study highlights the potential of LLMs as reliable evaluators in
+closed-domain, closed-ended settings, particularly when human evaluations
+require significant resources.
+
+
+ comment: 13 pages, 8 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ How Transformers Solve Propositional Logic Problems: A Mechanistic + Analysis + + +
+ Large language models (LLMs) have shown impressive performance on tasks that
+require planning and reasoning. Motivated by this, we investigate the internal
+mechanisms that underpin a network's ability to perform complex logical
+reasoning. We first construct a synthetic propositional logic problem that
+serves as a concrete test-bed for network training and evaluation. Crucially,
+this problem demands nontrivial planning to solve, but we can train a small
+transformer to achieve perfect accuracy. Building on our set-up, we then pursue
+an understanding of precisely how a three-layer transformer, trained from
+scratch, solves this problem. We are able to identify certain "planning" and
+"reasoning" circuits in the network that necessitate cooperation between the
+attention blocks to implement the desired logic. To expand our findings, we
+then study a larger model, Mistral 7B. Using activation patching, we
+characterize internal components that are critical in solving our logic
+problem. Overall, our work systematically uncovers novel aspects of small and
+large transformers, and continues the study of how they plan and reason.
+
+
+
+
+
+ + ♻ ☆ MAG-SQL: Multi-Agent Generative Approach with Soft Schema Linking and + Iterative Sub-SQL Refinement for Text-to-SQL + + +
+ Recent In-Context Learning based methods have achieved remarkable success in
+the Text-to-SQL task. However, there is still a large gap between the
+performance of these models and human performance on datasets with complex
+database schema and difficult questions, such as BIRD. Besides, existing work
+has neglected to supervise intermediate steps when solving questions
+iteratively with question decomposition methods, and the schema linking methods
+used in these works are very rudimentary. To address these issues, we propose
+MAG-SQL, a multi-agent generative approach with soft schema linking and
+iterative Sub-SQL refinement. In our framework, an entity-based method with
+table summaries is used to select the columns in the database, and a novel
+targets-conditions decomposition method is introduced to decompose those
+complex questions. Additionally, we build an iterative generation module which
+includes a Sub-SQL Generator and Sub-SQL Refiner, introducing external
+oversight for each step of generation. Through a series of ablation studies,
+the effectiveness of each agent in our framework has been demonstrated. When
+evaluated on the BIRD benchmark with GPT-4, MAG-SQL achieves an execution
+accuracy of 61.08%, compared to the baseline accuracy of 46.35% for vanilla
+GPT-4 and the baseline accuracy of 57.56% for MAC-SQL. Besides, our approach
+makes similar progress on Spider. The codes are available at
+https://github.com/LancelotXWX/MAG-SQL.
+
+
+ comment: 22 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ A Survey on Employing Large Language Models for Text-to-SQL Tasks + + +
+ The increasing volume of data in relational databases and the expertise +needed for writing SQL queries pose challenges for users to access and analyze +data. Text-to-SQL (Text2SQL) solves the issues by utilizing natural language +processing (NLP) techniques to convert natural language into SQL queries. With +the development of Large Language Models (LLMs), a range of LLM-based Text2SQL +methods have emerged. This survey provides a comprehensive review of LLMs in +Text2SQL tasks. We review benchmark datasets, prompt engineering methods, +fine-tuning methods, and base models in LLM-based Text2SQL methods. We provide +insights in each part and discuss future directions in this field. + +
+
+
+
+
+ + ♻ ☆ The Oscars of AI Theater: A Survey on Role-Playing with Language Models + + +
+ This survey explores the burgeoning field of role-playing with language +models, focusing on their development from early persona-based models to +advanced character-driven simulations facilitated by Large Language Models +(LLMs). Initially confined to simple persona consistency due to limited model +capabilities, role-playing tasks have now expanded to embrace complex character +portrayals involving character consistency, behavioral alignment, and overall +attractiveness. We provide a comprehensive taxonomy of the critical components +in designing these systems, including data, models and alignment, agent +architecture and evaluation. This survey not only outlines the current +methodologies and challenges, such as managing dynamic personal profiles and +achieving high-level persona consistency but also suggests avenues for future +research in improving the depth and realism of role-playing applications. The +goal is to guide future research by offering a structured overview of current +methodologies and identifying potential areas for improvement. Related +resources and papers are available at +https://github.com/nuochenpku/Awesome-Role-Play-Papers. + +
+
+ comment: 28 pages +
+
+
+
+
+ + ♻ ☆ FactTest: Factuality Testing in Large Language Models with Finite-Sample + and Distribution-Free Guarantees + + +
+ The propensity of Large Language Models (LLMs) to generate hallucinations and
+non-factual content undermines their reliability in high-stakes domains, where
+rigorous control over Type I errors (the conditional probability of incorrectly
+classifying hallucinations as truthful content) is essential. Despite its
+importance, formal verification of LLM factuality with such guarantees remains
+largely unexplored. In this paper, we introduce FactTest, a novel framework
+that statistically assesses whether an LLM can confidently provide correct
+answers to given questions with high-probability correctness guarantees. We
+formulate factuality testing as a hypothesis testing problem to enforce an
+upper bound on Type I errors at user-specified significance levels. Notably, we
+prove that our framework also ensures strong Type II error control under mild
+conditions and can be extended to maintain its effectiveness when covariate
+shifts exist. Our approach is distribution-free and works for any number of
+human-annotated samples. It is model-agnostic and applies to any black-box or
+white-box LM. Extensive experiments on question-answering (QA) and
+multiple-choice benchmarks demonstrate that FactTest effectively detects
+hallucinations and improves the model's ability to abstain from answering
+unknown questions, leading to an over 40% accuracy improvement.
+
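+ A much-simplified, distribution-free calibration in the spirit of the
+framework above, assuming some per-answer certainty score is available; the
+score function and the paper's finite-sample correction are abstracted away.
+
+```python
+import numpy as np
+
+def calibrate(cert_scores_of_wrong_answers, alpha=0.05):
+    # Threshold above which at most an alpha fraction of known-incorrect
+    # calibration answers would have been emitted (confidently-wrong rate).
+    return float(np.quantile(cert_scores_of_wrong_answers, 1.0 - alpha))
+
+def answer_or_abstain(answer, certainty, threshold):
+    # Only answer when the certainty clears the calibrated threshold.
+    return answer if certainty > threshold else "I don't know"
+```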
+
+
+
+
+ + ♻ ☆ Linguistic Collapse: Neural Collapse in (Large) Language Models NeurIPS 2024 + + +
+ Neural collapse ($\mathcal{NC}$) is a phenomenon observed in classification +tasks where top-layer representations collapse into their class means, which +become equinorm, equiangular and aligned with the classifiers. These behaviors +-- associated with generalization and robustness -- would manifest under +specific conditions: models are trained towards zero loss, with noise-free +labels belonging to balanced classes, which do not outnumber the model's hidden +dimension. Recent studies have explored $\mathcal{NC}$ in the absence of one or +more of these conditions to extend and capitalize on the associated benefits of +ideal geometries. Language modeling presents a curious frontier, as +\textit{training by token prediction} constitutes a classification task where +none of the conditions exist: the vocabulary is imbalanced and exceeds the +embedding dimension; different tokens might correspond to similar contextual +embeddings; and large language models (LLMs) in particular are typically only +trained for a few epochs. This paper empirically investigates the impact of +scaling the architectures and training of causal language models (CLMs) on +their progression towards $\mathcal{NC}$. We find that $\mathcal{NC}$ +properties that develop with scale (and regularization) are linked to +generalization. Moreover, there is evidence of some relationship between +$\mathcal{NC}$ and generalization independent of scale. Our work thereby +underscores the generality of $\mathcal{NC}$ as it extends to the novel and +more challenging setting of language modeling. Downstream, we seek to inspire +further research on the phenomenon to deepen our understanding of LLMs -- and +neural networks at large -- and improve existing architectures based on +$\mathcal{NC}$-related properties. Our code is hosted on GitHub at +https://github.com/rhubarbwu/linguistic-collapse . + +
+
+ comment: NeurIPS 2024; 36 pages; 30 figures +
+
+
+
+
+ + ♻ ☆ Long-form factuality in large language models NeurIPS 2024 + + +
+ Large language models (LLMs) often generate content that contains factual +errors when responding to fact-seeking prompts on open-ended topics. To +benchmark a model's long-form factuality in open domains, we first use GPT-4 to +generate LongFact, a prompt set comprising thousands of questions spanning 38 +topics. We then propose that LLM agents can be used as automated evaluators for +long-form factuality through a method which we call Search-Augmented Factuality +Evaluator (SAFE). SAFE utilizes an LLM to break down a long-form response into +a set of individual facts and to evaluate the accuracy of each fact using a +multi-step reasoning process comprising sending search queries to Google Search +and determining whether a fact is supported by the search results. Furthermore, +we propose extending F1 score as an aggregated metric for long-form factuality. +To do so, we balance the percentage of supported facts in a response +(precision) with the percentage of provided facts relative to a hyperparameter +representing a user's preferred response length (recall). + Empirically, we demonstrate that LLM agents can outperform crowdsourced human +annotators - on a set of ~16k individual facts, SAFE agrees with crowdsourced +human annotators 72% of the time, and on a random subset of 100 disagreement +cases, SAFE wins 76% of the time. At the same time, SAFE is more than 20 times +cheaper than human annotators. We also benchmark thirteen language models on +LongFact across four model families (Gemini, GPT, Claude, and PaLM-2), finding +that larger language models generally achieve better long-form factuality. +LongFact, SAFE, and all experimental code are available at +https://github.com/google-deepmind/long-form-factuality. + +
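+ The aggregated metric described above can be written down directly from the
+description; this is a sketch, with K the hyperparameter for a user's preferred
+number of supported facts.
+
+```python
+def f1_at_k(num_supported, num_not_supported, k):
+    # Precision: fraction of provided facts that are supported.
+    # Recall@K: supported facts relative to the preferred response length K.
+    provided = num_supported + num_not_supported
+    if provided == 0 or num_supported == 0:
+        return 0.0
+    precision = num_supported / provided
+    recall = min(num_supported / k, 1.0)
+    return 2 * precision * recall / (precision + recall)
+
+print(f1_at_k(num_supported=45, num_not_supported=5, k=64))  # ~0.79
+```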
+
+ comment: NeurIPS 2024; 72 pages, 18 figures, 30 tables. Code at + https://github.com/google-deepmind/long-form-factuality +
+
+
+
+
+ + ♻ ☆ HealthQ: Unveiling Questioning Capabilities of LLM Chains in Healthcare + Conversations + + +
+ In digital healthcare, large language models (LLMs) have primarily been +utilized to enhance question-answering capabilities and improve patient +interactions. However, effective patient care necessitates LLM chains that can +actively gather information by posing relevant questions. This paper presents +HealthQ, a novel framework designed to evaluate the questioning capabilities of +LLM healthcare chains. We implemented several LLM chains, including +Retrieval-Augmented Generation (RAG), Chain of Thought (CoT), and reflective +chains, and introduced an LLM judge to assess the relevance and informativeness +of the generated questions. To validate HealthQ, we employed traditional +Natural Language Processing (NLP) metrics such as Recall-Oriented Understudy +for Gisting Evaluation (ROUGE) and Named Entity Recognition (NER)-based set +comparison, and constructed two custom datasets from public medical note +datasets, ChatDoctor and MTS-Dialog. Our contributions are threefold: we +provide the first comprehensive study on the questioning capabilities of LLMs +in healthcare conversations, develop a novel dataset generation pipeline, and +propose a detailed evaluation methodology. + +
+
+
+
+
+ + ♻ ☆ Deploying Multi-task Online Server with Large Language Model COLING 2025 + + +
+ In the industry, numerous tasks are deployed online. Traditional approaches +often tackle each task separately by its own network, which leads to excessive +costs for developing and scaling models, especially in the context of large +language models. Although multi-task methods can save costs through parameter +sharing, they often struggle to outperform single-task methods in real-world +applications. To tackle these challenges, we present a three-stage multi-task +learning framework for large language models. It involves task filtering, +followed by fine-tuning on high-resource tasks, and finally fine-tuning on all +tasks. We conducted comprehensive experiments in single-task and multi-task +settings. Our approach, exemplified on different benchmarks, demonstrates that +it is able to achieve performance comparable to the single-task method while +reducing up to 90.9\% of its overhead. + +
+
+ comment: Accepted by COLING 2025 Industry Track +
+
+
+
+
+ + ♻ ☆ $B^4$: A Black-Box Scrubbing Attack on LLM Watermarks + + +
+ Watermarking has emerged as a prominent technique for LLM-generated content
+detection by embedding imperceptible patterns. Despite its strong performance,
+its robustness against adversarial attacks remains underexplored. Previous work
+typically considers a grey-box attack setting, where the specific type of
+watermark is already known. Some even necessitate knowledge of the watermarking
+method's hyperparameters. Such prerequisites are unattainable in real-world
+scenarios. Targeting a more realistic black-box threat model with fewer
+assumptions, we here propose $B^4$, a black-box scrubbing attack on watermarks.
+Specifically, we formulate the watermark scrubbing attack as a constrained
+optimization problem by capturing its objectives with two distributions, a
+Watermark Distribution and a Fidelity Distribution. This optimization problem
+can be approximately solved using two proxy distributions. Experimental results
+across 12 different settings demonstrate the superior performance of $B^4$
+compared with other baselines.
+
+
+
+
+
+ + ♻ ☆ FinCon: A Synthesized LLM Multi-Agent System with Conceptual Verbal + Reinforcement for Enhanced Financial Decision Making + + +
+ Large language models (LLMs) have demonstrated notable potential in +conducting complex tasks and are increasingly utilized in various financial +applications. However, high-quality sequential financial investment +decision-making remains challenging. These tasks require multiple interactions +with a volatile environment for every decision, demanding sufficient +intelligence to maximize returns and manage risks. Although LLMs have been used +to develop agent systems that surpass human teams and yield impressive +investment returns, opportunities to enhance multi-sourced information +synthesis and optimize decision-making outcomes through timely experience +refinement remain unexplored. Here, we introduce the FinCon, an LLM-based +multi-agent framework with CONceptual verbal reinforcement tailored for diverse +FINancial tasks. Inspired by effective real-world investment firm +organizational structures, FinCon utilizes a manager-analyst communication +hierarchy. This structure allows for synchronized cross-functional agent +collaboration towards unified goals through natural language interactions and +equips each agent with greater memory capacity than humans. Additionally, a +risk-control component in FinCon enhances decision quality by episodically +initiating a self-critiquing mechanism to update systematic investment beliefs. +The conceptualized beliefs serve as verbal reinforcement for the future agent's +behavior and can be selectively propagated to the appropriate node that +requires knowledge updates. This feature significantly improves performance +while reducing unnecessary peer-to-peer communication costs. Moreover, FinCon +demonstrates strong generalization capabilities in various financial tasks, +including single stock trading and portfolio management. + +
+
+ comment: LLM Applications, LLM Agents, Financial Technology, Quantitative + Finance, Algorithmic Trading, Cognitive Science +
+
+
+
+
+ + ♻ ☆ Birdie: Advancing State Space Models with Reward-Driven Objectives and + Curricula EMNLP 2024 + + +
+ Efficient state space models (SSMs), such as linear recurrent neural networks
+and linear attention variants, offer computational advantages over Transformers
+but struggle with tasks requiring long-range in-context retrieval, such as text
+copying, associative recall, and question answering over long contexts.
+Previous efforts to address these challenges have focused on architectural
+modifications, often reintroducing computational inefficiencies. In this paper,
+we propose a novel training procedure, Birdie, that significantly enhances the
+in-context retrieval capabilities of SSMs without altering their architecture.
+Our approach combines bidirectional input processing with dynamic mixtures of
+specialized pre-training objectives, optimized via reinforcement learning. We
+introduce a new bidirectional SSM architecture that seamlessly transitions from
+bidirectional context processing to causal generation. Experimental evaluations
+demonstrate that Birdie markedly improves performance on retrieval-intensive
+tasks such as multi-number phone book lookup, long paragraph
+question-answering, and infilling. This narrows the performance gap with
+Transformers, while retaining computational efficiency. Our findings highlight
+the importance of training procedures in leveraging the fixed-state capacity of
+SSMs, offering a new direction to advance their capabilities. All code and
+pre-trained models are available at https://www.github.com/samblouir/birdie,
+with support for JAX and PyTorch.
+
+
+ comment: Accepted to EMNLP 2024 (Main Conference) +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 127 + +
+
+
+ 
 ☆ SVDQuant: Absorbing Outliers by Low-Rank Components for 4-Bit Diffusion
 Models 
 
+ Diffusion models have been proven highly effective at generating high-quality +images. However, as these models grow larger, they require significantly more +memory and suffer from higher latency, posing substantial challenges for +deployment. In this work, we aim to accelerate diffusion models by quantizing +their weights and activations to 4 bits. At such an aggressive level, both +weights and activations are highly sensitive, where conventional post-training +quantization methods for large language models like smoothing become +insufficient. To overcome this limitation, we propose SVDQuant, a new 4-bit +quantization paradigm. Different from smoothing which redistributes outliers +between weights and activations, our approach absorbs these outliers using a +low-rank branch. We first consolidate the outliers by shifting them from +activations to weights, then employ a high-precision low-rank branch to take in +the weight outliers with Singular Value Decomposition (SVD). This process eases +the quantization on both sides. However, na\"{\i}vely running the low-rank +branch independently incurs significant overhead due to extra data movement of +activations, negating the quantization speedup. To address this, we co-design +an inference engine Nunchaku that fuses the kernels of the low-rank branch into +those of the low-bit branch to cut off redundant memory access. It can also +seamlessly support off-the-shelf low-rank adapters (LoRAs) without the need for +re-quantization. Extensive experiments on SDXL, PixArt-$\Sigma$, and FLUX.1 +validate the effectiveness of SVDQuant in preserving image quality. We reduce +the memory usage for the 12B FLUX.1 models by 3.5$\times$, achieving +3.0$\times$ speedup over the 4-bit weight-only quantized baseline on the 16GB +laptop 4090 GPU, paving the way for more interactive applications on PCs. Our +quantization library and inference engine are open-sourced. + +
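+ A minimal numerical sketch of the low-rank-plus-4-bit idea described above;
+smoothing, activation quantization, and the fused Nunchaku kernels are not
+reproduced, and the tensor shape and rank are illustrative.
+
+```python
+import numpy as np
+
+def svd_lowrank_plus_int4(W, rank=32):
+    U, S, Vt = np.linalg.svd(W, full_matrices=False)
+    L = (U[:, :rank] * S[:rank]) @ Vt[:rank]   # high-precision low-rank branch
+    R = W - L                                  # residual, easier to quantize
+    scale = np.abs(R).max() / 7.0              # symmetric 4-bit range [-8, 7]
+    Rq = np.clip(np.round(R / scale), -8, 7)
+    return L, Rq, scale
+
+W = np.random.randn(512, 512)
+L, Rq, scale = svd_lowrank_plus_int4(W)
+W_hat = L + Rq * scale  # dequantized approximation of W
+```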
+
+ comment: Quantization Library: https://github.com/mit-han-lab/deepcompressor + Inference Engine: https://github.com/mit-han-lab/nunchaku Website: + https://hanlab.mit.edu/projects/svdquant Demo: https://svdquant.mit.edu Blog: + https://hanlab.mit.edu/blog/svdquant +
+
+
+
+
+ + ☆ ProEdit: Simple Progression is All You Need for High-Quality 3D Scene + Editing NeurIPS 2024 + + +
+ This paper proposes ProEdit - a simple yet effective framework for +high-quality 3D scene editing guided by diffusion distillation in a novel +progressive manner. Inspired by the crucial observation that multi-view +inconsistency in scene editing is rooted in the diffusion model's large +feasible output space (FOS), our framework controls the size of FOS and reduces +inconsistency by decomposing the overall editing task into several subtasks, +which are then executed progressively on the scene. Within this framework, we +design a difficulty-aware subtask decomposition scheduler and an adaptive 3D +Gaussian splatting (3DGS) training strategy, ensuring high quality and +efficiency in performing each subtask. Extensive evaluation shows that our +ProEdit achieves state-of-the-art results in various scenes and challenging +editing tasks, all through a simple framework without any expensive or +sophisticated add-ons like distillation losses, components, or training +procedures. Notably, ProEdit also provides a new way to control, preview, and +select the "aggressivity" of editing operation during the editing process. + +
+
+ comment: NeurIPS 2024. Project Page: https://immortalco.github.io/ProEdit/ +
+
+
+
+
+ + ☆ Diff-2-in-1: Bridging Generation and Dense Perception with Diffusion + Models + + +
+ Beyond high-fidelity image synthesis, diffusion models have recently +exhibited promising results in dense visual perception tasks. However, most +existing work treats diffusion models as a standalone component for perception +tasks, employing them either solely for off-the-shelf data augmentation or as +mere feature extractors. In contrast to these isolated and thus sub-optimal +efforts, we introduce a unified, versatile, diffusion-based framework, +Diff-2-in-1, that can simultaneously handle both multi-modal data generation +and dense visual perception, through a unique exploitation of the +diffusion-denoising process. Within this framework, we further enhance +discriminative visual perception via multi-modal generation, by utilizing the +denoising network to create multi-modal data that mirror the distribution of +the original training set. Importantly, Diff-2-in-1 optimizes the utilization +of the created diverse and faithful data by leveraging a novel self-improving +learning mechanism. Comprehensive experimental evaluations validate the +effectiveness of our framework, showcasing consistent performance improvements +across various discriminative backbones and high-quality multi-modal data +generation characterized by both realism and usefulness. + +
+
+ comment: 26 pages, 14 figures +
+
+
+
+
+ + ☆ ReCapture: Generative Video Camera Controls for User-Provided Videos + using Masked Video Fine-Tuning + + +
+ Recently, breakthroughs in video modeling have allowed for controllable +camera trajectories in generated videos. However, these methods cannot be +directly applied to user-provided videos that are not generated by a video +model. In this paper, we present ReCapture, a method for generating new videos +with novel camera trajectories from a single user-provided video. Our method +allows us to re-generate the reference video, with all its existing scene +motion, from vastly different angles and with cinematic camera motion. Notably, +using our method we can also plausibly hallucinate parts of the scene that were +not observable in the reference video. Our method works by (1) generating a +noisy anchor video with a new camera trajectory using multiview diffusion +models or depth-based point cloud rendering and then (2) regenerating the +anchor video into a clean and temporally consistent reangled video using our +proposed masked video fine-tuning technique. + +
+
+ comment: project page: https://generative-video-camera-controls.github.io/ +
+
+
+
+
+ + ☆ Analyzing The Language of Visual Tokens + + +
+ With the introduction of transformer-based models for vision and language +tasks, such as LLaVA and Chameleon, there has been renewed interest in the +discrete tokenized representation of images. These models often treat image +patches as discrete tokens, analogous to words in natural language, learning +joint alignments between visual and human languages. However, little is known +about the statistical behavior of these visual languages - whether they follow +similar frequency distributions, grammatical structures, or topologies as +natural languages. In this paper, we take a natural-language-centric approach +to analyzing discrete visual languages and uncover striking similarities and +fundamental differences. We demonstrate that, although visual languages adhere +to Zipfian distributions, higher token innovation drives greater entropy and +lower compression, with tokens predominantly representing object parts, +indicating intermediate granularity. We also show that visual languages lack +cohesive grammatical structures, leading to higher perplexity and weaker +hierarchical organization compared to natural languages. Finally, we +demonstrate that, while vision models align more closely with natural languages +than other models, this alignment remains significantly weaker than the +cohesion found within natural languages. Through these experiments, we +demonstrate how understanding the statistical properties of discrete visual +languages can inform the design of more effective computer vision models. + +
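+ As a quick illustration of the Zipfian analysis mentioned above, one can rank
+visual-token counts and inspect the log-log slope; this is a generic check, not
+the paper's exact methodology.
+
+```python
+import numpy as np
+from collections import Counter
+
+def zipf_slope(token_ids):
+    # Sort token counts by rank and fit log-frequency against log-rank;
+    # a slope near -1 indicates a classic Zipfian distribution.
+    counts = np.array(sorted(Counter(token_ids).values(), reverse=True), dtype=float)
+    ranks = np.arange(1, len(counts) + 1)
+    slope, _ = np.polyfit(np.log(ranks), np.log(counts), 1)
+    return slope
+```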
+
+
+
+
+ + ☆ LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation + + +
+ CLIP is one of the most important multimodal foundational models today. What +powers CLIP's capabilities? The rich supervision signals provided by natural +language, the carrier of human knowledge, shape a powerful cross-modal +representation space. However, with the rapid advancements in large language +models LLMs like GPT-4 and LLaMA, the boundaries of language comprehension and +generation are continually being pushed. This raises an intriguing question: +can the capabilities of LLMs be harnessed to further improve multimodal +representation learning? The potential benefits of incorporating LLMs into CLIP +are clear. LLMs' strong textual understanding can fundamentally improve CLIP's +ability to handle image captions, drastically enhancing its ability to process +long and complex texts, a well-known limitation of vanilla CLIP. Moreover, LLMs +are trained on a vast corpus of text, possessing open-world knowledge. This +allows them to expand on caption information during training, increasing the +efficiency of the learning process. In this paper, we propose LLM2CLIP, a novel +approach that embraces the power of LLMs to unlock CLIP's potential. By +fine-tuning the LLM in the caption space with contrastive learning, we extract +its textual capabilities into the output embeddings, significantly improving +the output layer's textual discriminability. We then design an efficient +training process where the fine-tuned LLM acts as a powerful teacher for CLIP's +visual encoder. Thanks to the LLM's presence, we can now incorporate longer and +more complex captions without being restricted by vanilla CLIP's text encoder's +context window and ability limitations. Our experiments demonstrate that this +approach brings substantial improvements in cross-modal tasks. + +
+
+
+
+
+ + ☆ HourVideo: 1-Hour Video-Language Understanding NeurIPS 2024 + + +
+ We present HourVideo, a benchmark dataset for hour-long video-language +understanding. Our dataset consists of a novel task suite comprising +summarization, perception (recall, tracking), visual reasoning (spatial, +temporal, predictive, causal, counterfactual), and navigation (room-to-room, +object retrieval) tasks. HourVideo includes 500 manually curated egocentric +videos from the Ego4D dataset, spanning durations of 20 to 120 minutes, and +features 12,976 high-quality, five-way multiple-choice questions. Benchmarking +results reveal that multimodal models, including GPT-4 and LLaVA-NeXT, achieve +marginal improvements over random chance. In stark contrast, human experts +significantly outperform the state-of-the-art long-context multimodal model, +Gemini Pro 1.5 (85.0% vs. 37.3%), highlighting a substantial gap in multimodal +capabilities. Our benchmark, evaluation toolkit, prompts, and documentation are +available at https://hourvideo.stanford.edu + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track; 28 pages +
+
+
+
+
+ + ☆ LoFi: Scalable Local Image Reconstruction with Implicit Neural + Representation + + +
+ Neural fields or implicit neural representations (INRs) have attracted +significant attention in machine learning and signal processing due to their +efficient continuous representation of images and 3D volumes. In this work, we +build on INRs and introduce a coordinate-based local processing framework for +solving imaging inverse problems, termed LoFi (Local Field). Unlike +conventional methods for image reconstruction, LoFi processes local information +at each coordinate \textit{separately} by multi-layer perceptrons (MLPs), +recovering the object at that specific coordinate. Similar to INRs, LoFi can +recover images at any continuous coordinate, enabling image reconstruction at +multiple resolutions. With comparable or better performance than standard CNNs +for image reconstruction, LoFi achieves excellent generalization to +out-of-distribution data and memory usage almost independent of image +resolution. Remarkably, training on $1024 \times 1024$ images requires just 3GB +of memory -- over 20 times less than the memory typically needed by standard +CNNs. Additionally, LoFi's local design allows it to train on extremely small +datasets with less than 10 samples, without overfitting or the need for +regularization or early stopping. Finally, we use LoFi as a denoising prior in +a plug-and-play framework for solving general inverse problems to benefit from +its continuous image representation and strong generalization. Although trained +on low-resolution images, LoFi can be used as a low-dimensional prior to solve +inverse problems at any resolution. We validate our framework across a variety +of imaging modalities, from low-dose computed tomography to radio +interferometric imaging. + +
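+ A toy coordinate-wise reconstruction network in the spirit of the description
+above, assuming a `local_features` tensor has already been gathered per
+coordinate; how those local measurements are formed is the method's core and is
+not reproduced here.
+
+```python
+import torch
+import torch.nn as nn
+
+class CoordinateMLP(nn.Module):
+    def __init__(self, feat_dim, hidden=256):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(2 + feat_dim, hidden), nn.ReLU(),
+            nn.Linear(hidden, hidden), nn.ReLU(),
+            nn.Linear(hidden, 1),
+        )
+
+    def forward(self, coords, local_features):
+        # coords: (N, 2) continuous positions in [0, 1]^2, queryable at any
+        # resolution; local_features: (N, feat_dim) per-coordinate measurements.
+        return self.net(torch.cat([coords, local_features], dim=-1))
+```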
+
+
+
+
+ + ☆ SG-I2V: Self-Guided Trajectory Control in Image-to-Video Generation + + +
+ Methods for image-to-video generation have achieved impressive,
+photo-realistic quality. However, adjusting specific elements in generated
+videos, such as object motion or camera movement, is often a tedious process of
+trial and error, e.g., involving re-generating videos with different random
+seeds. Recent techniques address this issue by fine-tuning a pre-trained model
+to follow conditioning signals, such as bounding boxes or point trajectories.
+Yet, this fine-tuning procedure can be computationally expensive, and it
+requires datasets with annotated object motion, which can be difficult to
+procure. In this work, we introduce SG-I2V, a framework for controllable
+image-to-video generation that is self-guided, offering zero-shot control by
+relying solely on the knowledge present in a pre-trained image-to-video
+diffusion model without the need for fine-tuning or external knowledge. Our
+zero-shot method outperforms unsupervised baselines while being competitive
+with supervised models in terms of visual quality and motion fidelity.
+
+
+ comment: Project page: https://kmcode1.github.io/Projects/SG-I2V/ +
+
+
+
+
+ + ☆ Planar Reflection-Aware Neural Radiance Fields + + +
+ Neural Radiance Fields (NeRF) have demonstrated exceptional capabilities in +reconstructing complex scenes with high fidelity. However, NeRF's view +dependency can only handle low-frequency reflections. It falls short when +handling complex planar reflections, often interpreting them as erroneous scene +geometries and leading to duplicated and inaccurate scene representations. To +address this challenge, we introduce a reflection-aware NeRF that jointly +models planar reflectors, such as windows, and explicitly casts reflected rays +to capture the source of the high-frequency reflections. We query a single +radiance field to render the primary color and the source of the reflection. We +propose a sparse edge regularization to help utilize the true sources of +reflections for rendering planar reflections rather than creating a duplicate +along the primary ray at the same depth. As a result, we obtain accurate scene +geometry. Rendering along the primary ray results in a clean, reflection-free +view, while explicitly rendering along the reflected ray allows us to +reconstruct highly detailed reflections. Our extensive quantitative and +qualitative evaluations of real-world datasets demonstrate our method's +enhanced performance in accurately handling reflections. + +
+
+
+
+
+ + ☆ AsCAN: Asymmetric Convolution-Attention Networks for Efficient + Recognition and Generation NeurIPS 2024 + + +
+ Neural network architecture design requires making many crucial decisions.
+The common desideratum is that similar decisions, with small modifications, can
+be reused in a variety of tasks and applications. To satisfy that,
+architectures must provide promising latency and performance trade-offs,
+support a variety of tasks, scale efficiently with respect to the amounts of
+data and compute, leverage available data from other tasks, and efficiently
+support various hardware. To this end, we introduce AsCAN -- a hybrid
+architecture, combining both convolutional and transformer blocks. We revisit
+the key design principles of hybrid architectures and propose a simple and
+effective \emph{asymmetric} architecture, where the distribution of
+convolutional and transformer blocks is \emph{asymmetric}, containing more
+convolutional blocks in the earlier stages, followed by more transformer blocks
+in later stages. AsCAN supports a variety of tasks: recognition, segmentation,
+class-conditional image generation, and features a superior trade-off between
+performance and latency. We then scale the same architecture to solve a
+large-scale text-to-image task and show state-of-the-art performance compared
+to the most recent public and commercial models. Notably, even without any
+computation optimization for transformer blocks, our models still yield faster
+inference speed than existing works featuring efficient attention mechanisms,
+highlighting the advantages and the value of our approach.
+
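+ The asymmetric block distribution described above can be pictured with a
+simple, purely illustrative stage configuration; the actual block counts in the
+paper may differ.
+
+```python
+# Early stages lean on cheap convolutions at high resolution; later stages
+# lean on transformer (attention) blocks over smaller feature maps.
+stages = [
+    {"conv_blocks": 4, "attn_blocks": 0},
+    {"conv_blocks": 3, "attn_blocks": 1},
+    {"conv_blocks": 1, "attn_blocks": 3},
+    {"conv_blocks": 0, "attn_blocks": 4},
+]
+```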
+
+ comment: NeurIPS 2024. Project Page: + https://snap-research.github.io/snap_image/ +
+
+
+
+
+ + ☆ VAIR: Visuo-Acoustic Implicit Representations for Low-Cost, Multi-Modal + Transparent Surface Reconstruction in Indoor Scenes + + +
+ Mobile robots operating indoors must be prepared to navigate challenging +scenes that contain transparent surfaces. This paper proposes a novel method +for the fusion of acoustic and visual sensing modalities through implicit +neural representations to enable dense reconstruction of transparent surfaces +in indoor scenes. We propose a novel model that leverages generative latent +optimization to learn an implicit representation of indoor scenes consisting of +transparent surfaces. We demonstrate that we can query the implicit +representation to enable volumetric rendering in image space or 3D geometry +reconstruction (point clouds or mesh) with transparent surface prediction. We +evaluate our method's effectiveness qualitatively and quantitatively on a new +dataset collected using a custom, low-cost sensing platform featuring RGB-D +cameras and ultrasonic sensors. Our method exhibits significant improvement +over state-of-the-art for transparent surface reconstruction. + +
+
+ comment: https://umfieldrobotics.github.io/VAIR_site/ +
+
+
+
+
+ + ☆ Uncovering Hidden Subspaces in Video Diffusion Models Using + Re-Identification + + +
+ Latent Video Diffusion Models can easily deceive casual observers and domain
+experts alike thanks to the produced image quality and temporal consistency.
+Beyond entertainment, this creates opportunities around safe data sharing of
+fully synthetic datasets, which are crucial in healthcare, as well as other
+domains relying on sensitive personal information. However, privacy concerns
+with this approach have not fully been addressed yet, and models trained on
+synthetic data for specific downstream tasks still perform worse than those
+trained on real data. This discrepancy may be partly due to the sampling space
+being a subspace of the training videos, effectively reducing the training data
+size for downstream models. Additionally, the reduced temporal consistency when
+generating long videos could be a contributing factor.
+ In this paper, we first show that training privacy-preserving models in
+latent space is computationally more efficient and yields models that
+generalize better. Furthermore, to investigate downstream degradation factors,
+we propose to use a re-identification model, previously employed as a privacy
+preservation filter. We demonstrate that it is sufficient to train this model
+on the latent space of the video generator. Subsequently, we use these models
+to evaluate the subspace covered by synthetic video datasets and thus introduce
+a new way to measure the faithfulness of generative machine learning models. We
+focus on a specific application in healthcare echocardiography to illustrate
+the effectiveness of our novel methods. Our findings indicate that only up to
+30.8% of the training videos are learned in latent video diffusion models,
+which could explain the lack of performance when training downstream tasks on
+synthetic data.
+
+
+ comment: 8 pages, 5 tables, 6 figures +
+
+
+
+
+ + ☆ CAD-MLLM: Unifying Multimodality-Conditioned CAD Generation With MLLM + + +
+ This paper aims to design a unified Computer-Aided Design (CAD) generation
+system that can easily generate CAD models based on the user's inputs in the
+form of textual description, images, point clouds, or even a combination of
+them. Towards this goal, we introduce the CAD-MLLM, the first system capable of
+generating parametric CAD models conditioned on the multimodal input.
+Specifically, within the CAD-MLLM framework, we leverage the command sequences
+of CAD models and then employ advanced large language models (LLMs) to align
+the feature space across these diverse multimodal data and the CAD models'
+vectorized representations. To facilitate the model training, we design a
+comprehensive data construction and annotation pipeline that equips each CAD
+model with corresponding multimodal data. Our resulting dataset, named
+Omni-CAD, is the first multimodal CAD dataset that contains textual
+description, multi-view images, points, and command sequence for each CAD
+model. It contains approximately 450K instances and their CAD construction
+sequences. To thoroughly evaluate the quality of our generated CAD models, we
+go beyond current evaluation metrics that focus on reconstruction quality by
+introducing additional metrics that assess topology quality and surface
+enclosure extent. Extensive experimental results demonstrate that CAD-MLLM
+significantly outperforms existing conditional generative methods and remains
+highly robust to noises and missing points. The project page and more
+visualizations can be found at: https://cad-mllm.github.io/
+
+
+ comment: Project page: https://cad-mllm.github.io/ +
+
+
+
+
+ + ☆ M3DocRAG: Multi-modal Retrieval is What You Need for Multi-page + Multi-document Understanding + + +
+ Document visual question answering (DocVQA) pipelines that answer questions
+from documents have broad applications. Existing methods focus on handling
+single-page documents with multi-modal language models (MLMs), or rely on
+text-based retrieval-augmented generation (RAG) that uses text extraction tools
+such as optical character recognition (OCR). However, there are difficulties in
+applying these methods in real-world scenarios: (a) questions often require
+information across different pages or documents, where MLMs cannot handle many
+long documents; (b) documents often have important information in visual
+elements such as figures, but text extraction tools ignore them. We introduce
+M3DocRAG, a novel multi-modal RAG framework that flexibly accommodates various
+document contexts (closed-domain and open-domain), question hops (single-hop
+and multi-hop), and evidence modalities (text, chart, figure, etc.). M3DocRAG
+finds relevant documents and answers questions using a multi-modal retriever
+and an MLM, so that it can efficiently handle single or many documents while
+preserving visual information. Since previous DocVQA datasets ask questions in
+the context of a specific document, we also present M3DocVQA, a new benchmark
+for evaluating open-domain DocVQA over 3,000+ PDF documents with 40,000+ pages.
+In three benchmarks (M3DocVQA/MMLongBench-Doc/MP-DocVQA), empirical results
+show that M3DocRAG with ColPali and Qwen2-VL 7B achieves superior performance
+compared to many strong baselines, including state-of-the-art performance in
+MP-DocVQA. We provide comprehensive analyses of different indexing, MLMs, and
+retrieval models. Lastly, we qualitatively show that M3DocRAG can successfully
+handle various scenarios, such as when relevant information exists across
+multiple pages and when answer evidence only exists in images.
+
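+ A skeleton of the retrieve-then-read flow described above; `embed_query`,
+`page_embs`, and `mlm_answer` are placeholders for a page-level multi-modal
+retriever and an MLM, neither of which is implemented here.
+
+```python
+import numpy as np
+
+def retrieve_pages(query, page_embs, embed_query, k=4):
+    # Rank page embeddings by cosine similarity to the query embedding.
+    q = embed_query(query)
+    sims = page_embs @ q / (np.linalg.norm(page_embs, axis=1) * np.linalg.norm(q))
+    return np.argsort(-sims)[:k]  # indices of the k most relevant pages
+
+def answer(query, pages, page_embs, embed_query, mlm_answer, k=4):
+    top = retrieve_pages(query, page_embs, embed_query, k)
+    # Pages are passed as images so visual evidence (figures, charts) survives.
+    return mlm_answer(query, [pages[i] for i in top])
+```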
+
+ comment: Project webpage: https://m3docrag.github.io +
+
+
+
+
+ + ☆ A Reinforcement Learning-Based Automatic Video Editing Method Using + Pre-trained Vision-Language Model + + +
+ In this era of video, automatic video editing techniques are attracting
+increasing attention from industry and academia since they can reduce workloads
+and lower the requirements for human editors. Existing automatic editing
+systems are mainly scene- or event-specific, e.g., soccer game broadcasting,
+yet automatic systems for general editing, e.g., movie or vlog editing covering
+various scenes and events, have rarely been studied, and converting
+event-driven editing methods to general scenes is nontrivial. In this paper, we
+propose a two-stage scheme for general editing. Firstly, unlike previous works
+that extract scene-specific features, we leverage a pre-trained Vision-Language
+Model (VLM) to extract the editing-relevant representations as editing context.
+Moreover, to close the gap between professional-looking videos and automatic
+productions generated with simple guidelines, we propose a Reinforcement
+Learning (RL)-based editing framework to formulate the editing problem and
+train the virtual editor to make better sequential editing decisions. Finally,
+we evaluate the proposed method on a more general editing task with a real
+movie dataset. Experimental results demonstrate the effectiveness and benefits
+of the proposed context representation and the learning ability of our RL-based
+editing framework.
+
+
+
+
+
+ + ☆ SaSR-Net: Source-Aware Semantic Representation Network for Enhancing + Audio-Visual Question Answering EMNLP 2024 + + +
+ Audio-Visual Question Answering (AVQA) is a challenging task that involves +answering questions based on both auditory and visual information in videos. A +significant challenge is interpreting complex multi-modal scenes, which include +both visual objects and sound sources, and connecting them to the given +question. In this paper, we introduce the Source-aware Semantic Representation +Network (SaSR-Net), a novel model designed for AVQA. SaSR-Net utilizes +source-wise learnable tokens to efficiently capture and align audio-visual +elements with the corresponding question. It streamlines the fusion of audio +and visual information using spatial and temporal attention mechanisms to +identify answers in multi-modal scenes. Extensive experiments on the Music-AVQA +and AVQA-Yang datasets show that SaSR-Net outperforms state-of-the-art AVQA +methods. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ☆ DimensionX: Create Any 3D and 4D Scenes from a Single Image with + Controllable Video Diffusion + + +
+ In this paper, we introduce DimensionX, a framework designed to +generate photorealistic 3D and 4D scenes from just a single image with video +diffusion. Our approach begins with the insight that both the spatial structure +of a 3D scene and the temporal evolution of a 4D scene can be effectively +represented through sequences of video frames. While recent video diffusion +models have shown remarkable success in producing vivid visuals, they face +limitations in directly recovering 3D/4D scenes due to limited spatial and +temporal controllability during generation. To overcome this, we propose +ST-Director, which decouples spatial and temporal factors in video diffusion by +learning dimension-aware LoRAs from dimension-variant data. This controllable +video diffusion approach enables precise manipulation of spatial structure and +temporal dynamics, allowing us to reconstruct both 3D and 4D representations +from sequential frames with the combination of spatial and temporal dimensions. +Additionally, to bridge the gap between generated videos and real-world scenes, +we introduce a trajectory-aware mechanism for 3D generation and an +identity-preserving denoising strategy for 4D generation. Extensive experiments +on various real-world and synthetic datasets demonstrate that DimensionX +achieves superior results in controllable video generation, as well as in 3D +and 4D scene generation, compared with previous methods. +
+
+ comment: Project Page: https://chenshuo20.github.io/DimensionX/ +
+
+
+
+
+ + ☆ StoryAgent: Customized Storytelling Video Generation via Multi-Agent + Collaboration + + +
+ The advent of AI-Generated Content (AIGC) has spurred research into automated +video generation to streamline conventional processes. However, automating +storytelling video production, particularly for customized narratives, remains +challenging due to the complexity of maintaining subject consistency across +shots. While existing approaches like Mora and AesopAgent integrate multiple +agents for Story-to-Video (S2V) generation, they fall short in preserving +protagonist consistency and supporting Customized Storytelling Video Generation +(CSVG). To address these limitations, we propose StoryAgent, a multi-agent +framework designed for CSVG. StoryAgent decomposes CSVG into distinct subtasks +assigned to specialized agents, mirroring the professional production process. +Notably, our framework includes agents for story design, storyboard generation, +video creation, agent coordination, and result evaluation. Leveraging the +strengths of different models, StoryAgent enhances control over the generation +process, significantly improving character consistency. Specifically, we +introduce a customized Image-to-Video (I2V) method, LoRA-BE, to enhance +intra-shot temporal consistency, while a novel storyboard generation pipeline +is proposed to maintain subject consistency across shots. Extensive experiments +demonstrate the effectiveness of our approach in synthesizing highly consistent +storytelling videos, outperforming state-of-the-art methods. Our contributions +include the introduction of StoryAgent, a versatile framework for video +generation tasks, and novel techniques for preserving protagonist consistency. + +
+
+
+
+
+ + ☆ MVSplat360: Feed-Forward 360 Scene Synthesis from Sparse Views NeurIPS 2024 + + +
+ We introduce MVSplat360, a feed-forward approach for 360° novel view +synthesis (NVS) of diverse real-world scenes, using only sparse observations. +This setting is inherently ill-posed due to minimal overlap among input views +and insufficient visual information provided, making it challenging for +conventional methods to achieve high-quality results. Our MVSplat360 addresses +this by effectively combining geometry-aware 3D reconstruction with temporally +consistent video generation. Specifically, it refactors a feed-forward 3D +Gaussian Splatting (3DGS) model to render features directly into the latent +space of a pre-trained Stable Video Diffusion (SVD) model, where these features +then act as pose and visual cues to guide the denoising process and produce +photorealistic 3D-consistent views. Our model is end-to-end trainable and +supports rendering arbitrary views with as few as 5 sparse input views. To +evaluate MVSplat360's performance, we introduce a new benchmark using the +challenging DL3DV-10K dataset, where MVSplat360 achieves superior visual +quality compared to state-of-the-art methods on wide-sweeping or even 360° +NVS tasks. Experiments on the existing benchmark RealEstate10K also confirm the +effectiveness of our model. The video results are available on our project +page: https://donydchen.github.io/mvsplat360. +
+
+ comment: NeurIPS 2024, Project page: https://donydchen.github.io/mvsplat360, + Code: https://github.com/donydchen/mvsplat360 +
+
+
+
+
+ + ☆ VideoGLaMM: A Large Multimodal Model for Pixel-Level Visual Grounding in + Videos + + +
+ Fine-grained alignment between videos and text is challenging due to complex +spatial and temporal dynamics in videos. Existing video-based Large Multimodal +Models (LMMs) handle basic conversations but struggle with precise pixel-level +grounding in videos. To address this, we introduce VideoGLaMM, an LMM designed +for fine-grained pixel-level grounding in videos based on user-provided textual +inputs. Our design seamlessly connects three key components: a Large Language +Model, a dual vision encoder that emphasizes both spatial and temporal details, +and a spatio-temporal decoder for accurate mask generation. This connection is +facilitated via tunable V-L and L-V adapters that enable close Vision-Language +(VL) alignment. The architecture is trained to synchronize both spatial and +temporal elements of video content with textual instructions. To enable +fine-grained grounding, we curate a multimodal dataset featuring detailed +visually-grounded conversations using a semiautomatic annotation pipeline, +resulting in a diverse set of 38k video-QA triplets along with 83k objects and +671k masks. We evaluate VideoGLaMM on three challenging tasks: Grounded +Conversation Generation, Visual Grounding, and Referring Video Segmentation. +Experimental results show that our model consistently outperforms existing +approaches across all three tasks. +
+
+ comment: Technical Report of VideoGLaMM +
+
+
+
+
+ + ☆ Stem-OB: Generalizable Visual Imitation Learning with Stem-Like + Convergent Observation through Diffusion Inversion + + +
+ Visual imitation learning methods demonstrate strong performance, yet they +lack generalization when faced with visual input perturbations, including +variations in lighting and textures, impeding their real-world application. We +propose Stem-OB that utilizes pretrained image diffusion models to suppress +low-level visual differences while maintaining high-level scene structures. +This image inversion process is akin to transforming the observation into a +shared representation, from which other observations stem, with extraneous +details removed. Stem-OB contrasts with data-augmentation approaches as it is +robust to various unspecified appearance changes without the need for +additional training. Our method is a simple yet highly effective plug-and-play +solution. Empirical results confirm the effectiveness of our approach in +simulated tasks and show an exceptionally significant improvement in real-world +applications, with an average increase of 22.2% in success rates compared to +the best baseline. See https://hukz18.github.io/Stem-Ob/ for more info. + +
+
+ comment: arXiv preprint version +
+
+
+
+
+ + ☆ Robust Iris Centre Localisation for Assistive Eye-Gaze Tracking + + +
+ In this research work, we address the problem of robust iris centre +localisation in unconstrained conditions as a core component of our eye-gaze +tracking platform. We investigate the application of U-Net variants for +segmentation-based and regression-based approaches to improve our iris centre +localisation, which was previously based on Bayes' classification. The achieved +results are comparable to or better than the state-of-the-art, offering a +drastic improvement over those achieved by the Bayes' classifier, and without +sacrificing the real-time performance of our eye-gaze tracking platform. + +
+
+
+
+
+ + ☆ In the Era of Prompt Learning with Vision-Language Models + + +
+ Large-scale foundation models like CLIP have shown strong zero-shot +generalization but struggle with domain shifts, limiting their adaptability. In +our work, we introduce StyLIP, a novel domain-agnostic prompt learning +strategy for Domain Generalization (DG). StyLIP disentangles visual style and +content in CLIP's vision encoder by using style projectors to learn +domain-specific prompt tokens and combining them with content features. Trained +contrastively, this approach enables seamless adaptation across domains, +outperforming state-of-the-art methods on multiple DG benchmarks. Additionally, +we propose AD-CLIP for unsupervised domain adaptation (DA), leveraging CLIP's +frozen vision backbone to learn domain-invariant prompts through image style +and content features. By aligning domains in embedding space with entropy +minimization, AD-CLIP effectively handles domain shifts, even when only target +domain samples are available. Lastly, we outline future work on class discovery +using prompt learning for semantic segmentation in remote sensing, focusing on +identifying novel or rare classes in unstructured environments. This paves the +way for more adaptive and generalizable models in complex, real-world +scenarios. +
+
+ comment: ICVGIP 2024, Young Faculty Symposium +
+
+
+
+
+ + ☆ ZAHA: Introducing the Level of Facade Generalization and the Large-Scale + Point Cloud Facade Semantic Segmentation Benchmark Dataset WACV 2025 + + +
+ Facade semantic segmentation is a long-standing challenge in photogrammetry +and computer vision. Although the last decades have witnessed the influx of +facade segmentation methods, there is a lack of comprehensive facade classes +and data covering the architectural variability. In ZAHA, we introduce Level of +Facade Generalization (LoFG), novel hierarchical facade classes designed based +on international urban modeling standards, ensuring compatibility with +challenging real-world classes and a uniform comparison of methods. Realizing +LoFG, we present the largest semantic 3D facade segmentation dataset to date, +providing 601 million annotated points at 5 and 15 classes for LoFG2 and +LoFG3, respectively. Moreover, we analyze the performance of baseline semantic +segmentation methods on our introduced LoFG classes and data, complementing it +with a discussion on the unresolved challenges for facade segmentation. We +firmly believe that ZAHA shall facilitate further development of 3D facade +semantic segmentation methods, enabling robust segmentation indispensable in +creating urban digital twins. +
+
+ comment: Accepted to WACV 2025 (IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV)) +
+
+
+
+
+ + ☆ A multi-purpose automatic editing system based on lecture semantics for + remote education + + +
+ Remote teaching has become popular recently due to its convenience and +safety, especially under extreme circumstances like a pandemic. However, online +students usually have a poor experience since the information acquired from the +views provided by the broadcast platforms is limited. One potential solution is +to show more camera views simultaneously, but it is technically challenging and +distracting for the viewers. Therefore, an automatic multi-camera +directing/editing system, which aims at selecting the view of greatest interest +at each time instance to guide the attention of online students, is in urgent +demand. However, existing systems mostly make simple assumptions and focus on +tracking the position of the speaker instead of the real lecture semantics, and +therefore have limited capacities to deliver optimal information flow. To this +end, this paper proposes an automatic multi-purpose editing system based on the +lecture semantics, which can both direct the multiple video streams for +real-time broadcasting and edit the optimal video offline for review purposes. +Our system directs the views by semantically analyzing the class events while +following the professional directing rules, mimicking a human director to +capture the regions of interest from the viewpoint of the onsite students. We +conduct both qualitative and quantitative analyses to verify the effectiveness +of the proposed system and its components. +
+
+
+
+
+ + ☆ Differentiable Gaussian Representation for Incomplete CT Reconstruction + + +
+ Incomplete Computed Tomography (CT) benefits patients by reducing radiation +exposure. However, reconstructing high-fidelity images from limited views or +angles remains challenging due to the ill-posed nature of the problem. Deep +Learning Reconstruction (DLR) methods have shown promise in enhancing image +quality, but the paradox between training data diversity and high +generalization ability remains unsolved. In this paper, we propose a novel +Gaussian Representation for Incomplete CT Reconstruction (GRCT) without the +usage of any neural networks or full-dose CT data. Specifically, we model the +3D volume as a set of learnable Gaussians, which are optimized directly from +the incomplete sinogram. Our method can be applied to multiple views and angles +without changing the architecture. Additionally, we propose a differentiable +Fast CT Reconstruction method for efficient clinical usage. Extensive +experiments on multiple datasets and settings demonstrate significant +improvements in reconstruction quality metrics and high efficiency. We plan to +release our code as open-source. + +
+
+
+
+
+ + ☆ D$^3$epth: Self-Supervised Depth Estimation with Dynamic Mask in Dynamic + Scenes + + +
+ Depth estimation is a crucial technology in robotics. Recently, +self-supervised depth estimation methods have demonstrated great potential as +they can efficiently leverage large amounts of unlabelled real-world data. +However, most existing methods are designed under the assumption of static +scenes, which hinders their adaptability in dynamic environments. To address +this issue, we present D$^3$epth, a novel method for self-supervised depth +estimation in dynamic scenes. It tackles the challenge of dynamic objects from +two key perspectives. First, within the self-supervised framework, we design a +reprojection constraint to identify regions likely to contain dynamic objects, +allowing the construction of a dynamic mask that mitigates their impact at the +loss level. Second, for multi-frame depth estimation, we introduce a cost +volume auto-masking strategy that leverages adjacent frames to identify regions +associated with dynamic objects and generate corresponding masks. This provides +guidance for subsequent processes. Furthermore, we propose a spectral entropy +uncertainty module that incorporates spectral entropy to guide uncertainty +estimation during depth fusion, effectively addressing issues arising from cost +volume computation in dynamic environments. Extensive experiments on KITTI and +Cityscapes datasets demonstrate that the proposed method consistently +outperforms existing self-supervised monocular depth estimation baselines. Code +is available at https://github.com/Csyunling/D3epth. +
+
+ comment: Open sourced +
+
+
+
+
+ + ☆ End-to-end Inception-Unet based Generative Adversarial Networks for Snow + and Rain Removals + + +
+ The superior performance introduced by deep learning approaches in removing +atmospheric particles such as snow and rain from a single image favors their +usage over classical ones. However, deep learning-based approaches still suffer +from challenges related to the particle appearance characteristics such as +size, type, and transparency. Furthermore, due to the unique characteristics of +rain and snow particles, single network based deep learning approaches struggle +in handling both degradation scenarios simultaneously. In this paper, a global +framework that consists of two Generative Adversarial Networks (GANs) is +proposed, where each handles the removal of one particle type individually. The +architectures of both desnowing and deraining GANs introduce the integration of +a feature extraction phase with the classical U-net generator network, which in +turn enhances the removal performance in the presence of severe variations in +size and appearance. Furthermore, a realistic dataset that contains pairs of +snowy images next to their groundtruth images, estimated using a low-rank +approximation approach, is presented. The experiments show that the proposed +desnowing and deraining approaches achieve significant improvements in +comparison to the state-of-the-art approaches when tested on both synthetic and +realistic datasets. +
+
+
+
+
+ + ☆ GANESH: Generalizable NeRF for Lensless Imaging + + +
+ Lensless imaging offers a significant opportunity to develop ultra-compact +cameras by removing the conventional bulky lens system. However, without a +focusing element, the sensor's output is no longer a direct image but a complex +multiplexed scene representation. Traditional methods have attempted to address +this challenge by employing learnable inversions and refinement models, but +these methods are primarily designed for 2D reconstruction and do not +generalize well to 3D reconstruction. We introduce GANESH, a novel framework +designed to enable simultaneous refinement and novel view synthesis from +multi-view lensless images. Unlike existing methods that require scene-specific +training, our approach supports on-the-fly inference without retraining on each +scene. Moreover, our framework allows us to tune our model to specific scenes, +enhancing the rendering and refinement quality. To facilitate research in this +area, we also present the first multi-view lensless dataset, LenslessScenes. +Extensive experiments demonstrate that our method outperforms current +approaches in reconstruction accuracy and refinement quality. Code and video +results are available at https://rakesh-123-cryp.github.io/Rakesh.github.io/ + +
+
+
+
+
+ + ☆ MPVO: Motion-Prior based Visual Odometry for PointGoal Navigation ECCV + + +
+ Visual odometry (VO) is essential for enabling accurate point-goal navigation +of embodied agents in indoor environments where GPS and compass sensors are +unreliable and inaccurate. However, traditional VO methods face challenges in +wide-baseline scenarios, where fast robot motions and low frames per second +(FPS) during inference hinder their performance, leading to drift and +catastrophic failures in point-goal navigation. Recent deep-learned VO methods +show robust performance but suffer from sample inefficiency during training; +hence, they require huge datasets and compute resources. Therefore, we propose a +robust and sample-efficient VO pipeline based on motion priors available while +an agent is navigating an environment. It consists of a training-free +action-prior based geometric VO module that estimates a coarse relative pose, +which is further consumed as a motion prior by a deep-learned VO model, which +finally produces a fine relative pose to be used by the navigation policy. This +strategy helps our pipeline achieve up to 2x sample efficiency during training +and demonstrates superior accuracy and robustness in point-goal navigation +tasks compared to state-of-the-art VO method(s). Realistic indoor environments +of the Gibson dataset are used in the AI-Habitat simulator to evaluate the +proposed approach using navigation metrics (like success/SPL) and pose metrics +(like RPE/ATE). We hope this method further opens a direction of work where +motion priors from various sources can be utilized to improve VO estimates and +achieve better results in embodied navigation tasks. +
+
+ comment: Accepted in 50SFM Workshop of the 18th European Conference on + Computer Vision (ECCV) 2024 +
+
+
+
+
+ + ☆ An Effective Pipeline for Whole-Slide Image Glomerulus Segmentation + + +
+ Whole-slide images (WSI) glomerulus segmentation is essential for accurately +diagnosing kidney diseases. In this work, we propose a practical pipeline for +glomerulus segmentation that effectively enhances both patch-level and +WSI-level segmentation tasks. Our approach leverages stitching on overlapping +patches, increasing the detection coverage, especially when glomeruli are +located near patch image borders. In addition, we conduct comprehensive +evaluations from different segmentation models across two large and diverse +datasets with over 30K glomerulus annotations. Experimental results demonstrate +that models using our pipeline outperform the previous state-of-the-art method, +achieving superior results across both datasets and setting a new benchmark for +glomerulus segmentation in WSIs. The code and pre-trained models are available +at https://github.com/huuquan1994/wsi_glomerulus_seg. + +
+
+
+
+
+ + ☆ Taming Rectified Flow for Inversion and Editing + + +
+ Rectified-flow-based diffusion transformers, such as FLUX and OpenSora, have +demonstrated exceptional performance in the field of image and video +generation. Despite their robust generative capabilities, these models often +suffer from inaccurate inversion, which could further limit their effectiveness +in downstream tasks such as image and video editing. To address this issue, we +propose RF-Solver, a novel training-free sampler that enhances inversion +precision by reducing errors in the process of solving rectified flow ODEs. +Specifically, we derive the exact formulation of the rectified flow ODE and +perform a high-order Taylor expansion to estimate its nonlinear components, +significantly decreasing the approximation error at each timestep. Building +upon RF-Solver, we further design RF-Edit, which comprises specialized +sub-modules for image and video editing. By sharing self-attention layer +features during the editing process, RF-Edit effectively preserves the +structural information of the source image or video while achieving +high-quality editing results. Our approach is compatible with any pre-trained +rectified-flow-based models for image and video tasks, requiring no additional +training or optimization. Extensive experiments on text-to-image generation, +image & video inversion, and image & video editing demonstrate the robust +performance and adaptability of our methods. Code is available at +https://github.com/wangjiangshan0725/RF-Solver-Edit. + +
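+
+ RF-Solver's exact derivation is not reproduced here; the sketch below, assuming a
+ velocity-field ODE dx/dt = v(x, t) in PyTorch, only illustrates why a second-order
+ (Heun-style) step reduces the per-step approximation error of inversion compared
+ with a plain Euler step.
+
+ ```python
+ import torch
+
+ def heun_invert(x, velocity, num_steps=50):
+     """Integrate dx/dt = v(x, t) from t=0 (data) towards t=1 (noise); the second
+     velocity evaluation corrects the first-order Euler estimate (trapezoidal rule)."""
+     dt = 1.0 / num_steps
+     t = torch.zeros(x.shape[0], device=x.device)
+     for _ in range(num_steps):
+         v1 = velocity(x, t)
+         x_euler = x + dt * v1              # first-order prediction
+         v2 = velocity(x_euler, t + dt)
+         x = x + dt * 0.5 * (v1 + v2)       # second-order correction
+         t = t + dt
+     return x
+
+ # Toy stand-in for a rectified-flow model that predicts v(x, t).
+ velocity = lambda x, t: -x
+ latents = heun_invert(torch.randn(2, 3, 64, 64), velocity)
+ ```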
+
+
+
+
+ + ☆ Convolutional Differentiable Logic Gate Networks NeurIPS 2024 + + +
+ With the increasing inference cost of machine learning models, there is a +growing interest in models with fast and efficient inference. Recently, an +approach for learning logic gate networks directly via a differentiable +relaxation was proposed. Logic gate networks are faster than conventional +neural network approaches because their inference only requires logic gate +operators such as NAND, OR, and XOR, which are the underlying building blocks +of current hardware and can be efficiently executed. We build on this idea, +extending it by deep logic gate tree convolutions, logical OR pooling, and +residual initializations. This allows scaling logic gate networks up by over +one order of magnitude and utilizing the paradigm of convolution. On CIFAR-10, +we achieve an accuracy of 86.29% using only 61 million logic gates, which +improves over the SOTA while being 29x smaller. + +
+
+ comment: Published at NeurIPS 2024 (Oral) +
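+
+ A minimal sketch of the underlying idea of a differentiable logic-gate relaxation,
+ assuming PyTorch; the four-gate candidate set, the fixed random wiring and the
+ DiffLogicLayer class are illustrative assumptions, not the paper's convolutional
+ architecture.
+
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ def soft_gates(a, b):
+     # Probabilistic relaxations of two-input logic gates for inputs in [0, 1].
+     return torch.stack([a * b,               # AND
+                         a + b - a * b,       # OR
+                         a + b - 2 * a * b,   # XOR
+                         1 - a * b], dim=-1)  # NAND
+
+ class DiffLogicLayer(nn.Module):
+     def __init__(self, in_dim, out_dim):
+         super().__init__()
+         # Each output gate reads two fixed, randomly chosen input wires.
+         self.register_buffer("idx_a", torch.randint(0, in_dim, (out_dim,)))
+         self.register_buffer("idx_b", torch.randint(0, in_dim, (out_dim,)))
+         # Learnable logits over the candidate gate types.
+         self.gate_logits = nn.Parameter(torch.zeros(out_dim, 4))
+
+     def forward(self, x):
+         gates = soft_gates(x[:, self.idx_a], x[:, self.idx_b])   # (B, out, 4)
+         weights = F.softmax(self.gate_logits, dim=-1)            # (out, 4)
+         return (gates * weights).sum(-1)                         # soft gate mixture
+
+ # Toy usage: differentiate through the gates on random binary inputs.
+ layer = DiffLogicLayer(in_dim=2, out_dim=8)
+ x = torch.randint(0, 2, (64, 2)).float()
+ y = (x[:, 0] != x[:, 1]).float()                                 # XOR target
+ out = layer(x).mean(dim=1)                                       # crude readout
+ F.binary_cross_entropy(out.clamp(1e-6, 1 - 1e-6), y).backward()
+ ```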
+
+
+
+
+ + ☆ Controlling Human Shape and Pose in Text-to-Image Diffusion Models via + Domain Adaptation + + +
+ We present a methodology for conditional control of human shape and pose in +pretrained text-to-image diffusion models using a 3D human parametric model +(SMPL). Fine-tuning these diffusion models to adhere to new conditions requires +large datasets and high-quality annotations, which can be more cost-effectively +acquired through synthetic data generation rather than real-world data. +However, the domain gap and low scene diversity of synthetic data can +compromise the pretrained model's visual fidelity. We propose a +domain-adaptation technique that maintains image quality by isolating +synthetically trained conditional information in the classifier-free guidance +vector and composing it with another control network to adapt the generated +images to the input domain. To achieve SMPL control, we fine-tune a +ControlNet-based architecture on the synthetic SURREAL dataset of rendered +humans and apply our domain adaptation at generation time. Experiments +demonstrate that our model achieves greater shape and pose diversity than the +2d pose-based ControlNet, while maintaining the visual fidelity and improving +stability, proving its usefulness for downstream tasks such as human animation. + +
+
+
+
+
+ + ☆ Subspace-Constrained Quadratic Matrix Factorization: Algorithm and + Applications + + +
+ Matrix Factorization has emerged as a widely adopted framework for modeling +data exhibiting low-rank structures. To address challenges in manifold +learning, this paper presents a subspace-constrained quadratic matrix +factorization model. The model is designed to jointly learn key low-dimensional +structures, including the tangent space, the normal subspace, and the quadratic +form that links the tangent space to a low-dimensional representation. We solve +the proposed factorization model using an alternating minimization method, +involving an in-depth investigation of nonlinear regression and projection +subproblems. Theoretical properties of the quadratic projection problem and +convergence characteristics of the alternating strategy are also investigated. +To validate our approach, we conduct numerical experiments on synthetic and +real-world datasets. Results demonstrate that our model outperforms existing +methods, highlighting its robustness and efficacy in capturing core +low-dimensional structures. + +
+
+
+
+
+ + ☆ NeuroFly: A framework for whole-brain single neuron reconstruction + + +
+ Neurons, with their elongated, tree-like dendritic and axonal structures, +enable efficient signal integration and long-range communication across brain +regions. By reconstructing individual neurons' morphology, we can gain valuable +insights into brain connectivity, revealing the structural basis of cognition, +movement, and perception. Despite the accumulation of extensive 3D microscopic +imaging data, progress has been considerably hindered by the absence of +automated tools to streamline this process. Here we introduce NeuroFly, a +validated framework for large-scale automatic single neuron reconstruction. +This framework breaks down the process into three distinct stages: +segmentation, connection, and proofreading. In the segmentation stage, we +perform automatic segmentation followed by skeletonization to generate +over-segmented neuronal fragments without branches. During the connection +stage, we use a 3D image-based path following approach to extend each fragment +and connect it with other fragments of the same neuron. Finally, human +annotators are required only to proofread the few unresolved positions. The +first two stages of our process are clearly defined computer vision problems, +and we have trained robust baseline models to solve them. We validated +NeuroFly's efficiency using in-house datasets that include a variety of +challenging scenarios, such as dense arborizations, weak axons, and images with +contamination. We will release the datasets along with a suite of visualization +and annotation tools for better reproducibility. Our goal is to foster +collaboration among researchers to address the neuron reconstruction challenge, +ultimately accelerating advancements in neuroscience research. The dataset and +code are available at https://github.com/beanli161514/neurofly +
+
+
+
+
+ + ☆ Progressive Multi-Level Alignments for Semi-Supervised Domain Adaptation + SAR Target Recognition Using Simulated Data + + +
+ Recently, an intriguing research trend for automatic target recognition (ATR) +from synthetic aperture radar (SAR) imagery has arisen: using simulated data to +train ATR models is a feasible solution to the issue of inadequate measured +data. To close the domain gap that exists between the real and simulated data, +the unsupervised domain adaptation (UDA) techniques are frequently exploited to +construct ATR models. However, for UDA, the target domain lacks labeled data to +direct the model training, posing a great challenge to ATR performance. To +address the above problem, a semi-supervised domain adaptation (SSDA) framework +is proposed, adopting progressive multi-level alignments for simulated +data-aided SAR ATR. First, a progressive wavelet transform data augmentation +(PWTDA) is presented by analyzing the discrepancies of wavelet decomposition +sub-bands of two domain images, obtaining the domain-level alignment. +Specifically, the domain gap is narrowed by mixing the wavelet transform +high-frequency sub-band components. Second, we develop an asymptotic +instance-prototype alignment (AIPA) strategy to push the source domain +instances close to the corresponding target prototypes, aiming to achieve +category-level alignment. Moreover, the consistency alignment is implemented by +excavating the strong-weak augmentation consistency of both individual samples +and the multi-sample relationship, enhancing the generalization capability of +the model. Extensive experiments on the Synthetic and Measured Paired Labeled +Experiment (SAMPLE) dataset indicate that our approach obtains recognition +accuracies of 99.63% and 98.91% in two common experimental settings with only +one labeled sample per class of the target domain, outperforming the most +advanced SSDA techniques. +
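+
+ A minimal sketch of the high-frequency sub-band mixing idea behind PWTDA, assuming
+ the PyWavelets package; the 'haar' wavelet, the single decomposition level and the
+ mixing ratio lam are illustrative assumptions, not the paper's settings.
+
+ ```python
+ import numpy as np
+ import pywt
+
+ def mix_high_freq(sim_img, real_img, lam=0.5):
+     """Blend the (cH, cV, cD) sub-bands of two equally sized single-channel images
+     while keeping the low-frequency content of the simulated image."""
+     cA_s, (cH_s, cV_s, cD_s) = pywt.dwt2(sim_img, "haar")
+     _, (cH_r, cV_r, cD_r) = pywt.dwt2(real_img, "haar")
+     mixed = (cA_s, (lam * cH_s + (1 - lam) * cH_r,
+                     lam * cV_s + (1 - lam) * cV_r,
+                     lam * cD_s + (1 - lam) * cD_r))
+     return pywt.idwt2(mixed, "haar")
+
+ sim, real = np.random.rand(128, 128), np.random.rand(128, 128)
+ augmented = mix_high_freq(sim, real)
+ ```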
+
+
+
+
+ + ☆ From CNN to ConvRNN: Adapting Visualization Techniques for Time-Series + Anomaly Detection + + +
+ Nowadays, neural networks are commonly used to solve various problems. +Unfortunately, despite their effectiveness, they are often perceived as black +boxes capable of providing answers without explaining their decisions, which +raises numerous ethical and legal concerns. Fortunately, the field of +explainability helps users understand these results. This aspect of machine +learning allows users to grasp the decision-making process of a model and +verify the relevance of its outcomes. In this article, we focus on the learning +process carried out by a "time distributed" ConvRNN, which performs anomaly +detection from video data. +
+
+
+
+
+ + ☆ ESC-MISR: Enhancing Spatial Correlations for Multi-Image + Super-Resolution in Remote Sensing + + +
+ Multi-Image Super-Resolution (MISR) is a crucial yet challenging research +task in the remote sensing community. In this paper, we address the challenging +task of Multi-Image Super-Resolution in Remote Sensing (MISR-RS), aiming to +generate a High-Resolution (HR) image from multiple Low-Resolution (LR) images +obtained by satellites. Recently, the weak temporal correlations among LR +images have attracted increasing attention in the MISR-RS task. However, +existing MISR methods treat the LR images as sequences with strong temporal +correlations, overlooking spatial correlations and imposing temporal +dependencies. To address this problem, we propose a novel end-to-end framework +named Enhancing Spatial Correlations in MISR (ESC-MISR), which fully exploits +the spatial-temporal relations of multiple images for HR image reconstruction. +Specifically, we first introduce a novel fusion module named Multi-Image +Spatial Transformer (MIST), which emphasizes parts with clearer global spatial +features and enhances the spatial correlations between LR images. Besides, we +perform a random shuffle strategy for the sequential inputs of LR images to +attenuate temporal dependencies and capture weak temporal correlations in the +training stage. Compared with the state-of-the-art methods, our ESC-MISR +achieves 0.70dB and 0.76dB cPSNR improvements on the two bands of the PROBA-V +dataset respectively, demonstrating the superiority of our method. + +
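+
+ A minimal sketch of the random frame-shuffle strategy mentioned above, assuming
+ PyTorch and a (batch, frames, channels, height, width) input layout; this is an
+ illustrative reading of the strategy, not the ESC-MISR training code.
+
+ ```python
+ import torch
+
+ def shuffle_lr_frames(lr_seq):
+     """Randomly permute the frame axis independently for each sample, so the
+     network cannot rely on a fixed temporal ordering of the LR revisits."""
+     b, t = lr_seq.shape[:2]
+     perm = torch.argsort(torch.rand(b, t), dim=1)   # one permutation per sample
+     return lr_seq[torch.arange(b).unsqueeze(1), perm]
+
+ lr = torch.rand(2, 9, 1, 128, 128)                  # e.g. 9 LR revisits per scene
+ shuffled = shuffle_lr_frames(lr)
+ ```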
+
+
+
+
+ + ☆ Dynamic Brightness Adaptation for Robust Multi-modal Image Fusion IJCAI 2024 + + +
+ Infrared and visible image fusion aims to integrate modality strengths for +visually enhanced, informative images. Visible imaging in real-world scenarios +is susceptible to dynamic environmental brightness fluctuations, leading to +texture degradation. Existing fusion methods lack robustness against such +brightness perturbations, significantly compromising the visual fidelity of the +fused imagery. To address this challenge, we propose the Brightness Adaptive +multimodal dynamic fusion framework (BA-Fusion), which achieves robust image +fusion despite dynamic brightness fluctuations. Specifically, we introduce a +Brightness Adaptive Gate (BAG) module, which is designed to dynamically select +features from brightness-related channels for normalization, while preserving +brightness-independent structural information within the source images. +Furthermore, we propose a brightness consistency loss function to optimize the +BAG module. The entire framework is tuned via alternating training strategies. +Extensive experiments validate that our method surpasses state-of-the-art +methods in preserving multi-modal image information and visual fidelity, while +exhibiting remarkable robustness across varying brightness levels. Our code is +available: https://github.com/SunYM2020/BA-Fusion. +
+
+ comment: Accepted by IJCAI 2024 +
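+
+ A minimal, illustrative reading of a brightness-adaptive gating module, assuming
+ PyTorch; the layer layout of BrightnessAdaptiveGate below is an assumption made for
+ illustration and not the paper's BAG implementation.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class BrightnessAdaptiveGate(nn.Module):
+     """Per-channel sigmoid scores decide which channels get instance-normalised
+     (brightness-related) and which pass through unchanged (structure-related)."""
+     def __init__(self, channels):
+         super().__init__()
+         self.score = nn.Sequential(nn.AdaptiveAvgPool2d(1),
+                                    nn.Conv2d(channels, channels, kernel_size=1),
+                                    nn.Sigmoid())
+         self.norm = nn.InstanceNorm2d(channels, affine=True)
+
+     def forward(self, x):
+         g = self.score(x)                   # (B, C, 1, 1) soft channel selection
+         return g * self.norm(x) + (1 - g) * x
+
+ gate = BrightnessAdaptiveGate(64)
+ out = gate(torch.rand(2, 64, 32, 32))
+ ```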
+
+
+
+
+ + ☆ Reciprocal Point Learning Network with Large Electromagnetic Kernel for + SAR Open-Set Recognition + + +
+ The limitations of existing Synthetic Aperture Radar (SAR) Automatic Target +Recognition (ATR) methods lie in their confinement by the closed-environment +assumption, hindering their effective and robust handling of unknown target +categories in open environments. Open Set Recognition (OSR), a pivotal facet +for algorithmic practicality, intends to categorize known classes while +denoting unknown ones as "unknown." The chief challenge in OSR involves +concurrently mitigating risks associated with generalizing features from a +restricted set of known classes to numerous unknown samples and the open space +exposure to potential unknown data. To enhance open-set SAR classification, a +method called scattering kernel with reciprocal learning network is proposed. +Initially, a feature learning framework is constructed based on reciprocal +point learning (RPL), establishing a bounded space for potential unknown +classes. This approach indirectly introduces unknown information into a learner +confined to known classes, thereby acquiring more concise and discriminative +representations. Subsequently, considering the variability in the imaging of +targets at different angles and the discreteness of components in SAR images, a +proposal is made to design convolutional kernels based on large-sized attribute +scattering center models. This enhances the ability to extract intrinsic +non-linear features and specific scattering characteristics in SAR images, +thereby improving the discriminative features of the model and mitigating the +impact of imaging variations on classification performance. Experiments on the +MSTAR datasets substantiate the superior performance of the proposed approach +called ASC-RPL over mainstream methods. + +
+
+
+
+
+ + ☆ Personalized Federated Learning for Cross-view Geo-localization SP + + +
+ In this paper, we propose a methodology combining Federated Learning (FL) with +Cross-view Image Geo-localization (CVGL) techniques. We address the challenges +of data privacy and heterogeneity in autonomous vehicle environments by +proposing a personalized Federated Learning scenario that allows selective +sharing of model parameters. Our method implements a coarse-to-fine approach, +where clients share only the coarse feature extractors while keeping +fine-grained features specific to local environments. We evaluate our approach +against traditional centralized and single-client training schemes using the +KITTI dataset combined with satellite imagery. Results demonstrate that our +federated CVGL method achieves performance close to centralized training while +maintaining data privacy. The proposed partial model sharing strategy shows +comparable or slightly better performance than classical FL, offering +significantly reduced communication overhead without sacrificing accuracy. Our +work contributes to more robust and privacy-preserving localization systems for +autonomous vehicles operating in diverse environments. +
+
+ comment: 6 pages, 2 figures, Preprint submitted to the IEEE 26th International + Workshop on Multimedia Signal Processing (MMSP) +
+
+
+
+
+ + ☆ DNN-based 3D Cloud Retrieval for Variable Solar Illumination and + Multiview Spaceborne Imaging + + +
+ Climate studies often rely on remotely sensed images to retrieve +two-dimensional maps of cloud properties. To advance volumetric analysis, we +focus on recovering the three-dimensional (3D) heterogeneous extinction +coefficient field of shallow clouds using multiview remote sensing data. +Climate research requires large-scale worldwide statistics. To enable scalable +data processing, previous deep neural networks (DNNs) can infer at spaceborne +remote sensing downlink rates. However, prior methods are limited to a fixed +solar illumination direction. In this work, we introduce the first scalable +DNN-based system for 3D cloud retrieval that accommodates varying camera poses +and solar directions. By integrating multiview cloud intensity images with +camera poses and solar direction data, we achieve greater flexibility in +recovery. Training of the DNN is performed by a novel two-stage scheme to +address the high number of degrees of freedom in this problem. Our approach +shows substantial improvements over previous state-of-the-art, particularly in +handling variations in the sun's zenith angle. + +
+
+ comment: 4 pages, 4 figures +
+
+
+
+
+ + ☆ CaPo: Cooperative Plan Optimization for Efficient Embodied Multi-Agent + Cooperation + + +
+ In this work, we address the cooperation problem among large language model +(LLM) based embodied agents, where agents must cooperate to achieve a common +goal. Previous methods often execute actions extemporaneously and incoherently, +without long-term strategic and cooperative planning, leading to redundant +steps, failures, and even serious repercussions in complex tasks like +search-and-rescue missions where discussion and cooperative plan are crucial. +To solve this issue, we propose Cooperative Plan Optimization (CaPo) to enhance +the cooperation efficiency of LLM-based embodied agents. Inspired by human +cooperation schemes, CaPo improves cooperation efficiency with two phases: 1) +meta-plan generation, and 2) progress-adaptive meta-plan and execution. In the +first phase, all agents analyze the task, discuss, and cooperatively create a +meta-plan that decomposes the task into subtasks with detailed steps, ensuring +a long-term strategic and coherent plan for efficient coordination. In the +second phase, agents execute tasks according to the meta-plan and dynamically +adjust it based on their latest progress (e.g., discovering a target object) +through multi-turn discussions. This progress-based adaptation eliminates +redundant actions, improving the overall cooperation efficiency of agents. +Experimental results on the ThreeDworld Multi-Agent Transport and Communicative +Watch-And-Help tasks demonstrate that CaPo achieves much higher task completion +rate and efficiency compared with state-of-the-arts. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Explainable Search and Discovery of Visual Cultural Heritage Collections + with Multimodal Large Language Models + + +
+ Many cultural institutions have made large digitized visual collections +available online, often under permissible re-use licences. Creating interfaces +for exploring and searching these collections is difficult, particularly in the +absence of granular metadata. In this paper, we introduce a method for using +state-of-the-art multimodal large language models (LLMs) to enable an +open-ended, explainable search and discovery interface for visual collections. +We show how our approach can create novel clustering and recommendation systems +that avoid common pitfalls of methods based directly on visual embeddings. Of +particular interest is the ability to offer concrete textual explanations of +each recommendation without the need to preselect the features of interest. +Together, these features can create a digital interface that is more open-ended +and flexible while also being better suited to addressing privacy and ethical +concerns. Through a case study using a collection of documentary photographs, +we provide several metrics showing the efficacy and possibilities of our +approach. + +
+
+ comment: 16 pages, CHR 2024: Computational Humanities Research Conference, + December 4 - 6, 2024, Aarhus University, Denmark +
+
+
+
+
+ + ☆ Automated Image Color Mapping for a Historic Photographic Collection + + +
+ In the 1970s, the United States Environmental Protection Agency sponsored +Documerica, a large-scale photography initiative to document environmental +subjects nation-wide. While over 15,000 digitized public-domain photographs +from the collection are available online, most of the images were scanned from +damaged copies of the original prints. We present and evaluate a modified +histogram matching technique based on the underlying chemistry of the prints +for correcting the damaged images by using training data collected from a small +set of undamaged prints. The entire set of color-adjusted Documerica images is +made available in an open repository. + +
+
+ comment: 11 pages, CHR 2024: Computational Humanities Research Conference, + December 4 - 6, 2024, Aarhus University, Denmark +
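+
+ As a starting point, a minimal sketch of plain per-channel histogram matching; the
+ paper's chemistry-informed modification is not reproduced. Assumes scikit-image
+ (>= 0.19) and imageio are installed, and the file names are placeholders.
+
+ ```python
+ import imageio.v3 as iio
+ from skimage.exposure import match_histograms
+
+ damaged = iio.imread("damaged_print.jpg")      # faded or discoloured scan
+ reference = iio.imread("undamaged_print.jpg")  # well-preserved exemplar print
+
+ # Match each colour channel of the damaged scan to the reference print.
+ corrected = match_histograms(damaged, reference, channel_axis=-1)
+ iio.imwrite("corrected_print.jpg", corrected.astype(damaged.dtype))
+ ```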
+
+
+
+
+ + ☆ ICH-SCNet: Intracerebral Hemorrhage Segmentation and Prognosis + Classification Network Using CLIP-guided SAM mechanism + + +
+ Intracerebral hemorrhage (ICH) is the most fatal subtype of stroke and is +characterized by a high incidence of disability. Accurate segmentation of the +ICH region and prognosis prediction are critically important for developing and +refining treatment plans for post-ICH patients. However, existing approaches +address these two tasks independently and predominantly focus on imaging data +alone, thereby neglecting the intrinsic correlation between the tasks and +modalities. This paper introduces a multi-task network, ICH-SCNet, designed for +both ICH segmentation and prognosis classification. Specifically, we integrate +a SAM-CLIP cross-modal interaction mechanism that combines medical text and +segmentation auxiliary information with neuroimaging data to enhance +cross-modal feature recognition. Additionally, we develop an effective feature +fusion module and a multi-task loss function to improve performance further. +Extensive experiments on an ICH dataset reveal that our approach surpasses +other state-of-the-art methods. It excels in the overall performance of +classification tasks and outperforms competing models in all segmentation task +metrics. + +
+
+ comment: 6 pages, 2 figures, 3 tables, published to BIBM 2024 +
+
+
+
+
+ + ☆ DanceFusion: A Spatio-Temporal Skeleton Diffusion Transformer for + Audio-Driven Dance Motion Reconstruction + + +
+ This paper introduces DanceFusion, a novel framework for reconstructing and +generating dance movements synchronized to music, utilizing a Spatio-Temporal +Skeleton Diffusion Transformer. The framework adeptly handles incomplete and +noisy skeletal data common in short-form dance videos on social media platforms +like TikTok. DanceFusion incorporates a hierarchical Transformer-based +Variational Autoencoder (VAE) integrated with a diffusion model, significantly +enhancing motion realism and accuracy. Our approach introduces sophisticated +masking techniques and a unique iterative diffusion process that refines the +motion sequences, ensuring high fidelity in both motion generation and +synchronization with accompanying audio cues. Comprehensive evaluations +demonstrate that DanceFusion surpasses existing methods, providing +state-of-the-art performance in generating dynamic, realistic, and +stylistically diverse dance motions. Potential applications of this framework +extend to content creation, virtual reality, and interactive entertainment, +promising substantial advancements in automated dance generation. Visit our +project page at https://th-mlab.github.io/DanceFusion/. + +
+
+
+
+
+ + ☆ TAP-VL: Text Layout-Aware Pre-training for Enriched Vision-Language + Models + + +
+ Vision-Language (VL) models have garnered considerable research interest; +however, they still face challenges in effectively handling text within images. +To address this limitation, researchers have developed two approaches. The +first method involves utilizing external Optical Character Recognition (OCR) +tools to extract textual information from images, which is then prepended to +other textual inputs. The second strategy focuses on employing extremely +high-resolution images to improve text recognition capabilities. In this paper, +we focus on enhancing the first strategy by introducing a novel method, named +TAP-VL, which treats OCR information as a distinct modality and seamlessly +integrates it into any VL model. TAP-VL employs a lightweight transformer-based +OCR module to receive OCR with layout information, compressing it into a short +fixed-length sequence for input into the LLM. Initially, we conduct +model-agnostic pretraining of the OCR module on unlabeled documents, followed +by its integration into any VL architecture through brief fine-tuning. +Extensive experiments demonstrate consistent performance improvements when +applying TAP-VL to top-performing VL models, across scene-text and +document-based VL benchmarks. + +
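+
+ A minimal sketch of compressing a variable-length OCR token sequence into a short
+ fixed-length sequence with learned queries, assuming PyTorch; OCRCompressor and its
+ sizes are illustrative assumptions, not TAP-VL's actual module.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class OCRCompressor(nn.Module):
+     """A fixed set of learned query tokens cross-attends to the OCR-token
+     embeddings and emits a short sequence that can be fed to the LLM."""
+     def __init__(self, dim=768, num_queries=32, num_heads=8):
+         super().__init__()
+         self.queries = nn.Parameter(torch.randn(num_queries, dim) * 0.02)
+         self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
+         self.proj = nn.Linear(dim, dim)
+
+     def forward(self, ocr_tokens):                   # (B, N_ocr, dim), N_ocr varies
+         q = self.queries.unsqueeze(0).expand(ocr_tokens.shape[0], -1, -1)
+         out, _ = self.attn(q, ocr_tokens, ocr_tokens)
+         return self.proj(out)                        # (B, num_queries, dim)
+
+ compressor = OCRCompressor()
+ short_seq = compressor(torch.randn(2, 150, 768))     # 150 OCR tokens -> 32 tokens
+ ```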
+
+
+
+
+ + ☆ Improved Multi-Task Brain Tumour Segmentation with Synthetic Data + Augmentation + + +
+ This paper presents the winning solution of task 1 and the third-placed +solution of task 3 of the BraTS challenge. The use of automated tools in +clinical practice has increased due to the development of more and more +sophisticated and reliable algorithms. However, achieving clinical standards +and developing tools for real-life scenarios is a major challenge. To this end, +BraTS has organised tasks to find the most advanced solutions for specific +purposes. In this paper, we propose the use of synthetic data to train +state-of-the-art frameworks in order to improve the segmentation of adult +gliomas in a post-treatment scenario, and the segmentation of meningioma for +radiotherapy planning. Our results suggest that the use of synthetic data leads +to more robust algorithms, although the synthetic data generation pipeline is +not directly suited to the meningioma task. The code for these tasks is +available at https://github.com/ShadowTwin41/BraTS_2023_2024_solutions. + +
+
+
+
+
+ + ☆ Brain Tumour Removing and Missing Modality Generation using 3D WDM + + +
+ This paper presents the second-placed solution for task 8 and the +participation solution for task 7 of BraTS 2024. The adoption of automated +brain analysis algorithms to support clinical practice is increasing. However, +many of these algorithms struggle with the presence of brain lesions or the +absence of certain MRI modalities. The alterations in the brain's morphology +lead to high variability and thus poor performance of predictive models that +were trained only on healthy brains. The lack of information that is usually +provided by some of the missing MRI modalities also reduces the reliability of +the prediction models trained with all modalities. In order to improve the +performance of these models, we propose the use of conditional 3D wavelet +diffusion models. The wavelet transform enabled full-resolution image training +and prediction on a GPU with 48 GB VRAM, without patching or downsampling, +preserving all information for prediction. For the inpainting task of BraTS +2024, the use of a large and variable number of healthy masks and the stability +and efficiency of the 3D wavelet diffusion model resulted in 0.007, 22.61 and +0.842 in the validation set and 0.07, 22.8 and 0.91 in the testing set (MSE, +PSNR and SSIM, respectively). The code for these tasks is available at +https://github.com/ShadowTwin41/BraTS_2023_2024_solutions. +
+
+
+
+
+ + ☆ Multi-temporal crack segmentation in concrete structure using deep + learning approaches + + +
+ Cracks are among the earliest indicators of deterioration in concrete +structures. Early automatic detection of these cracks can significantly extend +the lifespan of critical infrastructures, such as bridges, buildings, and +tunnels, while simultaneously reducing maintenance costs and facilitating +efficient structural health monitoring. This study investigates whether +leveraging multi-temporal data for crack segmentation can enhance segmentation +quality. Therefore, we compare a Swin UNETR trained on multi-temporal data with +a U-Net trained on mono-temporal data to assess the effect of temporal +information compared with conventional single-epoch approaches. To this end, a +multi-temporal dataset comprising 1356 images, each with 32 sequential crack +propagation images, was created. After training the models, experiments were +conducted to analyze their generalization ability, temporal consistency, and +segmentation quality. The multi-temporal approach consistently outperformed its +mono-temporal counterpart, achieving an IoU of 82.72% and an F1-score of +90.54%, representing a significant improvement over the mono-temporal +model's IoU of 76.69% and F1-score of 86.18%, despite requiring only half +of the trainable parameters. The multi-temporal model also displayed a more +consistent segmentation quality, with reduced noise and fewer errors. These +results suggest that temporal information significantly enhances the +performance of segmentation models, offering a promising solution for improved +crack detection and the long-term monitoring of concrete structures, even with +limited sequential data. +
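+
+ For reference, a minimal sketch of the IoU and F1 metrics reported above, computed
+ from binary masks with NumPy; the random masks are placeholders for predicted and
+ ground-truth crack segmentations.
+
+ ```python
+ import numpy as np
+
+ def iou_and_f1(pred, target, eps=1e-7):
+     """pred, target: boolean crack masks of identical shape."""
+     pred, target = pred.astype(bool), target.astype(bool)
+     tp = np.logical_and(pred, target).sum()
+     fp = np.logical_and(pred, ~target).sum()
+     fn = np.logical_and(~pred, target).sum()
+     iou = tp / (tp + fp + fn + eps)
+     f1 = 2 * tp / (2 * tp + fp + fn + eps)
+     return iou, f1
+
+ pred = np.random.rand(256, 256) > 0.5
+ target = np.random.rand(256, 256) > 0.5
+ print(iou_and_f1(pred, target))
+ ```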
+
+
+
+
+ + ☆ Population estimation using 3D city modelling and Carto2S datasets -- A + case study + + +
+ With the launch of the Carto2S series of satellites, high-resolution images +(0.6-1.0 m) are acquired and available for use. High-resolution Digital +Elevation Models (DEMs) with better accuracy can be generated using Carto2S +multi-view and multi-date datasets. DEMs are further used as an input to derive +Digital Terrain Models (DTMs) and to extract accurate heights of objects +(buildings and trees) over the surface of the Earth. Extracted building heights +are validated with ground control points and can be used for city modelling and +resource estimation, such as population estimation, health planning, and water +and transport resource estimation. In this study, an attempt is made to assess +the population of a township using high-resolution Indian remote sensing +satellite datasets. We used Carto2S multi-view data and generated a precise DEM +and DTM over a city area. Using the DEM and DTM datasets, accurate building +heights are extracted, which are further validated with ground data. Accurate +building heights and high-resolution imagery are used for generating an +accurate virtual 3D city model and for assessing the number of floors and the +carpet area of the houses/flats/apartments. Population estimation of the area +is made using the number of houses/flats/apartments derived from the satellite +datasets. Further, information about the number of hospitals and schools around +the residential area is extracted from OpenStreetMap (OSM). Population +estimation using satellite data and derived information from OSM datasets can +prove to be a very good tool for local administrators and decision makers. +
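+
+ A minimal sketch of the height-to-population chain described above, assuming NumPy;
+ the floor height (3 m), dwelling area (60 m^2) and occupancy (4 persons per dwelling)
+ are illustrative assumptions, not values from the study.
+
+ ```python
+ import numpy as np
+
+ def estimate_population(dem, dtm, footprint_mask, pixel_area_m2=1.0,
+                         floor_height_m=3.0, dwelling_area_m2=60.0,
+                         persons_per_dwelling=4.0):
+     """dem, dtm: elevation rasters in metres; footprint_mask: boolean building mask."""
+     ndsm = np.clip(dem - dtm, 0, None)             # object heights above the terrain
+     heights = np.where(footprint_mask, ndsm, 0.0)
+     floors = np.floor_divide(heights, floor_height_m)
+     floor_area_m2 = floors.sum() * pixel_area_m2   # total built-up floor area
+     dwellings = floor_area_m2 / dwelling_area_m2
+     return dwellings * persons_per_dwelling
+
+ dem = np.full((100, 100), 500.0) + np.random.rand(100, 100) * 30
+ dtm = np.full((100, 100), 500.0)
+ mask = np.zeros((100, 100), dtype=bool)
+ mask[20:40, 20:40] = True
+ print(round(estimate_population(dem, dtm, mask)))
+ ```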
+
+
+
+
+ + ☆ Solar potential analysis over Indian cities using high-resolution + satellite imagery and DEM + + +
+ Most research work in solar potential analysis is performed +utilizing aerial imagery, LiDAR data, and satellite imagery. However, in the +existing studies using satellite data, parameters such as tree/vegetation +shadow, adjacent taller architectural structures, and eccentric roof structures +in urban areas were not considered, and relatively coarser-resolution datasets +were used for analysis. In this work, we have implemented a novel approach to +estimate rooftop solar potential using high-resolution satellite imagery +(0.5 m), a digital elevation model (1 m), and ground station radiation data. +Solar radiation analysis is performed using the diffuse proportion and +transmissivity ratio derived from the ground station data hosted by IMD. It was +observed that, due to seasonal variations, environmental effects and technical +reasons such as solar panel structure, there can be a significant loss of +electricity generation of up to 50%. Based on the results, it is also +understood that using a 1 m DEM and 50 cm satellite imagery produces more +accurate results over urban areas. +
+
+
+
+
+ + ☆ Cross- and Intra-image Prototypical Learning for Multi-label Disease + Diagnosis and Interpretation + + +
+ Recent advances in prototypical learning have shown remarkable potential to +provide useful decision interpretations associating activation maps and +predictions with class-specific training prototypes. Such prototypical learning +has been well-studied for various single-label diseases, but for quite relevant +and more challenging multi-label diagnosis, where multiple diseases are often +concurrent within an image, existing prototypical learning models struggle to +obtain meaningful activation maps and effective class prototypes due to the +entanglement of the multiple diseases. In this paper, we present a novel Cross- +and Intra-image Prototypical Learning (CIPL) framework, for accurate +multi-label disease diagnosis and interpretation from medical images. CIPL +takes advantage of common cross-image semantics to disentangle the multiple +diseases when learning the prototypes, allowing a comprehensive understanding +of complicated pathological lesions. Furthermore, we propose a new two-level +alignment-based regularisation strategy that effectively leverages consistent +intra-image information to enhance interpretation robustness and predictive +performance. Extensive experiments show that our CIPL attains the +state-of-the-art (SOTA) classification accuracy in two public multi-label +benchmarks of disease diagnosis: thoracic radiography and fundus images. +Quantitative interpretability results show that CIPL also has superiority in +weakly-supervised thoracic disease localisation over other leading saliency- +and prototype-based explanation methods. + +
+
+
+
+
+ + ☆ Social EgoMesh Estimation + + +
+ Accurately estimating the 3D pose of the camera wearer in egocentric video +sequences is crucial to modeling human behavior in virtual and augmented +reality applications. The task presents unique challenges due to the limited +visibility of the user's body caused by the front-facing camera mounted on +their head. Recent research has explored the utilization of the scene and +ego-motion, but it has overlooked humans' interactive nature. We propose a +novel framework for Social Egocentric Estimation of body MEshes (SEE-ME). Our +approach is the first to estimate the wearer's mesh using only a latent +probabilistic diffusion model, which we condition on the scene and, for the +first time, on the social wearer-interactee interactions. Our in-depth study +sheds light on when social interaction matters most for ego-mesh estimation; it +quantifies the impact of interpersonal distance and gaze direction. Overall, +SEE-ME surpasses the current best technique, reducing the pose estimation error +(MPJPE) by 53%. The code is available at https://github.com/L-Scofano/SEEME. + +
+
+
+
+
+ + ☆ The Impact of Semi-Supervised Learning on Line Segment Detection + + +
+ In this paper we present a method for line segment detection in images, based +on a semi-supervised framework. Leveraging the use of a consistency loss based +on differently augmented and perturbed unlabeled images with a small amount of +labeled data, we show comparable results to fully supervised methods. This +opens up application scenarios where annotation is difficult or expensive, and +for domain specific adaptation of models. We are specifically interested in +real-time and online applications, and investigate small and efficient learning +backbones. Our method is to our knowledge the first to target line detection +using modern state-of-the-art methodologies for semi-supervised learning. We +test the method on both standard benchmarks and domain specific scenarios for +forestry applications, showing the tractability of the proposed method. + +
+
+ comment: 9 pages, 6 figures, 7 tables +
+
+
+
+
+ + ☆ TexLiverNet: Leveraging Medical Knowledge and Spatial-Frequency + Perception for Enhanced Liver Tumor Segmentation + + +
+ Integrating textual data with imaging in liver tumor segmentation is +essential for enhancing diagnostic accuracy. However, current multi-modal +medical datasets offer only general text annotations, lacking lesion-specific +details critical for extracting nuanced features, especially for fine-grained +segmentation of tumor boundaries and small lesions. To address these +limitations, we developed datasets with lesion-specific text annotations for +liver tumors and introduced the TexLiverNet model. TexLiverNet employs an +agent-based cross-attention module that integrates text features efficiently +with visual features, significantly reducing computational costs. Additionally, +enhanced spatial and adaptive frequency domain perception is proposed to +precisely delineate lesion boundaries, reduce background interference, and +recover fine details in small lesions. Comprehensive evaluations on public and +private datasets demonstrate that TexLiverNet achieves superior performance +compared to current state-of-the-art methods. + +
+
+
+
+
+ + ☆ Verification of Neural Networks against Convolutional Perturbations via + Parameterised Kernels + + +
+ We develop a method for the efficient verification of neural networks against +convolutional perturbations such as blurring or sharpening. To define input +perturbations we use well-known camera shake, box blur and sharpen kernels. We +demonstrate that these kernels can be linearly parameterised in a way that +allows for a variation of the perturbation strength while preserving desired +kernel properties. To facilitate their use in neural network verification, we +develop an efficient way of convolving a given input with these parameterised +kernels. The result of this convolution can be used to encode the perturbation +in a verification setting by prepending a linear layer to a given network. This +leads to tight bounds and a high effectiveness in the resulting verification +step. We add further precision by employing input splitting as a branch and +bound strategy. We demonstrate that we are able to verify robustness on a +number of standard benchmarks where the baseline is unable to provide any +safety certificates. To the best of our knowledge, this is the first solution +for verifying robustness against specific convolutional perturbations such as +camera shake. + +
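+ As a rough illustration of such a linear parameterisation (a sketch of the
+general idea, not the paper's exact kernels), a box-blur kernel can be blended
+with the identity kernel so that it remains non-negative and sums to one for
+any perturbation strength t in [0, 1]:
+
+import numpy as np
+
+def box_blur_kernel(t, size=3):
+    # t = 0 gives the identity (no perturbation), t = 1 the full box blur;
+    # any convex combination remains a valid averaging kernel (sum = 1).
+    identity = np.zeros((size, size))
+    identity[size // 2, size // 2] = 1.0
+    box = np.full((size, size), 1.0 / size ** 2)
+    return (1.0 - t) * identity + t * box
+
+print(box_blur_kernel(0.5).sum())  # 1.0 for any t in [0, 1]
+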
+
+
+
+
+ + ☆ On the Inherent Robustness of One-Stage Object Detection against + Out-of-Distribution Data + + +
+ Robustness is a fundamental aspect of developing safe and trustworthy models,
+particularly when they are deployed in the open world. In this work, we analyze
+the inherent capability of one-stage object detectors to robustly operate in
+the presence of out-of-distribution (OoD) data. Specifically, we propose a
+novel algorithm for detecting unknown objects in image data, which leverages
+the features extracted by the model from each sample. Unlike other recent
+approaches in the literature, our proposal does not require retraining the
+object detector, thereby allowing for the use of pretrained models. Our
+proposed OoD detector exploits the application of supervised dimensionality
+reduction techniques to mitigate the effects of the curse of dimensionality on
+the features extracted by the model. Furthermore, it utilizes high-resolution
+feature maps to identify potential unknown objects in an unsupervised fashion.
+Our experiments analyze the Pareto trade-off between performance in detecting
+known and unknown objects resulting from different algorithmic configurations
+and inference confidence thresholds. We also compare the performance of our
+proposed algorithm to that of logits-based post-hoc OoD methods, as well as
+possible fusion strategies. Finally, we discuss the competitiveness of all
+tested methods against state-of-the-art OoD approaches for object detection
+models on the recently published Unknown Object Detection benchmark. The
+obtained results verify that the performance of avant-garde post-hoc OoD
+detectors can be further improved when combined with our proposed algorithm.
+
+
+ comment: 12 figures, 4 tables, under review +
+
+
+
+
+ + ☆ PASSION for Dermatology: Bridging the Diversity Gap with Pigmented Skin + Images from Sub-Saharan Africa MICCAI 2024 + + +
+ Africa faces a huge shortage of dermatologists, with fewer than one per
+million people. This is in stark contrast to the high demand for dermatologic
+care, with 80% of the paediatric population suffering from largely untreated
+skin conditions. The integration of AI into healthcare sparks significant hope
+for treatment accessibility, especially through the development of AI-supported
+teledermatology. Current AI models are predominantly trained on white-skinned
+patients and do not generalize well enough to pigmented patients. The PASSION
+project aims to address this issue by collecting images of skin diseases in
+Sub-Saharan countries and open-sourcing this data. This dataset is the first of
+its kind, consisting of 1,653 patients for a total of 4,901 images. The images
+are representative of telemedicine settings and encompass the most common
+paediatric conditions: eczema, fungal infections, scabies, and impetigo. We
+also provide a baseline machine learning model trained on the dataset and a
+detailed performance analysis for the subpopulations represented in the
+dataset. The project website can be found at https://passionderm.github.io/.
+
+
+ comment: MICCAI 2024 +
+
+
+
+
+ + ☆ DomainGallery: Few-shot Domain-driven Image Generation by + Attribute-centric Finetuning NeurIPS 2024 + + +
+ The recent progress in text-to-image models pretrained on large-scale +datasets has enabled us to generate various images as long as we provide a text +prompt describing what we want. Nevertheless, the availability of these models +is still limited when we expect to generate images that fall into a specific +domain either hard to describe or just unseen to the models. In this work, we +propose DomainGallery, a few-shot domain-driven image generation method which +aims at finetuning pretrained Stable Diffusion on few-shot target datasets in +an attribute-centric manner. Specifically, DomainGallery features prior +attribute erasure, attribute disentanglement, regularization and enhancement. +These techniques are tailored to few-shot domain-driven generation in order to +solve key issues that previous works have failed to settle. Extensive +experiments are given to validate the superior performance of DomainGallery on +a variety of domain-driven generation scenarios. Codes are available at +https://github.com/Ldhlwh/DomainGallery. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Neural Fingerprints for Adversarial Attack Detection + + +
+ Deep learning models for image classification have become standard tools in +recent years. A well known vulnerability of these models is their +susceptibility to adversarial examples. These are generated by slightly +altering an image of a certain class in a way that is imperceptible to humans +but causes the model to classify it wrongly as another class. Many algorithms +have been proposed to address this problem, falling generally into one of two +categories: (i) building robust classifiers (ii) directly detecting attacked +images. Despite the good performance of these detectors, we argue that in a +white-box setting, where the attacker knows the configuration and weights of +the network and the detector, they can overcome the detector by running many +examples on a local copy, and sending only those that were not detected to the +actual model. This problem is common in security applications where even a very +good model is not sufficient to ensure safety. In this paper we propose to +overcome this inherent limitation of any static defence with randomization. To +do so, one must generate a very large family of detectors with consistent +performance, and select one or more of them randomly for each input. For the +individual detectors, we suggest the method of neural fingerprints. In the +training phase, for each class we repeatedly sample a tiny random subset of +neurons from certain layers of the network, and if their average is +sufficiently different between clean and attacked images of the focal class +they are considered a fingerprint and added to the detector bank. During test +time, we sample fingerprints from the bank associated with the label predicted +by the model, and detect attacks using a likelihood ratio test. We evaluate our +detectors on ImageNet with different attack methods and model architectures, +and show near-perfect detection with low rates of false detection. + +
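+ A toy sketch of the fingerprinting idea described above, under simplifying
+assumptions (activations given as plain arrays, Gaussian fingerprint
+statistics); it is not the authors' implementation:
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def build_fingerprint_bank(clean_acts, adv_acts, n_fp=100, subset=10, min_gap=0.5):
+    # clean_acts / adv_acts: (num_samples, num_neurons) activations for one class.
+    # Keep random neuron subsets whose mean activation separates clean from attacked.
+    bank = []
+    for _ in range(20 * n_fp):
+        idx = rng.choice(clean_acts.shape[1], size=subset, replace=False)
+        gap = clean_acts[:, idx].mean() - adv_acts[:, idx].mean()
+        if abs(gap) > min_gap:
+            bank.append(idx)
+        if len(bank) == n_fp:
+            break
+    return bank
+
+def detection_score(sample_acts, bank, mu_clean, mu_adv, sigma=1.0):
+    # Average log-likelihood ratio over sampled fingerprints (low => likely attacked).
+    scores = []
+    for idx in bank:
+        m = sample_acts[idx].mean()
+        ll_clean = -(m - mu_clean) ** 2 / (2 * sigma ** 2)
+        ll_adv = -(m - mu_adv) ** 2 / (2 * sigma ** 2)
+        scores.append(ll_clean - ll_adv)
+    return float(np.mean(scores))
+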
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ l0-Regularized Sparse Coding-based Interpretable Network for Multi-Modal + Image Fusion + + +
+ Multi-modal image fusion (MMIF) enhances the information content of the fused +image by combining the unique as well as common features obtained from +different modality sensor images, improving visualization, object detection, +and many more tasks. In this work, we introduce an interpretable network for +the MMIF task, named FNet, based on an l0-regularized multi-modal convolutional +sparse coding (MCSC) model. Specifically, for solving the l0-regularized CSC +problem, we develop an algorithm unrolling-based l0-regularized sparse coding +(LZSC) block. Given different modality source images, FNet first separates the +unique and common features from them using the LZSC block and then these +features are combined to generate the final fused image. Additionally, we +propose an l0-regularized MCSC model for the inverse fusion process. Based on +this model, we introduce an interpretable inverse fusion network named IFNet, +which is utilized during FNet's training. Extensive experiments show that FNet +achieves high-quality fusion results across five different MMIF tasks. +Furthermore, we show that FNet enhances downstream object detection in +visible-thermal image pairs. We have also visualized the intermediate results +of FNet, which demonstrates the good interpretability of our network. + +
+
+
+
+
+ + ☆ Continuous Sign Language Recognition System using Deep Learning with + MediaPipe Holistic + + +
+ Sign languages are the languages of hearing-impaired people, who use visual
+cues such as hand, facial, and body movements for communication. There are
+different signs and gestures representing alphabets, words, and phrases.
+Approximately 300 sign languages are practiced worldwide today, such as
+American Sign Language (ASL), Chinese Sign Language (CSL), Indian Sign Language
+(ISL), and many more. Sign languages are dependent on the vocal language of a
+place. Unlike vocal or spoken languages, there are no helping words in sign
+language such as is, am, are, was, were, will, and be. As only a limited
+population is well-versed in sign language, this lack of familiarity with sign
+language hinders hearing-impaired people from communicating freely and easily
+with everyone. This issue can be addressed by a sign language recognition (SLR)
+system, which can translate sign language into vocal language. In this paper, a
+continuous SLR system is proposed using a deep learning model employing Long
+Short-Term Memory (LSTM), trained and tested on an ISL primary dataset. This
+dataset is created using the MediaPipe Holistic pipeline for tracking face,
+hand, and body movements and collecting landmarks. The system recognizes signs
+and gestures in real time with 88.23% accuracy.
+
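+ A minimal sketch of an LSTM classifier over per-frame MediaPipe Holistic
+landmark vectors; the sequence length, the 1662-value flattened feature size,
+and the number of signs are illustrative assumptions, not the paper's exact
+configuration:
+
+import tensorflow as tf
+
+SEQ_LEN, N_FEATURES, N_SIGNS = 30, 1662, 20  # illustrative sizes
+
+model = tf.keras.Sequential([
+    tf.keras.layers.Input(shape=(SEQ_LEN, N_FEATURES)),  # one landmark vector per frame
+    tf.keras.layers.LSTM(64, return_sequences=True),
+    tf.keras.layers.LSTM(128),
+    tf.keras.layers.Dense(64, activation="relu"),
+    tf.keras.layers.Dense(N_SIGNS, activation="softmax"),
+])
+model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
+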
+
+ comment: 14 pages, 4 figures, Wireless Pers Commun +
+
+
+
+
+ + ☆ FedDP: Privacy-preserving method based on federated learning for + histopathology image segmentation + + +
+ Hematoxylin and Eosin (H&E) staining of whole slide images (WSIs) is
+considered the gold standard for pathologists and medical practitioners for
+tumor diagnosis, surgical planning, and post-operative assessment. With the
+rapid advancement of deep learning technologies, numerous models based on
+convolutional neural networks and transformers have been developed and applied
+to the precise segmentation of WSIs. However, due to privacy regulations and
+the need to protect patient confidentiality, centralized storage and processing
+of image data are impractical, and training a centralized model directly is
+challenging to implement in medical settings. This paper addresses the
+dispersed nature and privacy sensitivity of medical image data by employing a
+federated learning framework, allowing medical institutions to collaboratively
+learn while protecting patient privacy. Additionally, to address the issue of
+original data reconstruction through gradient inversion during the federated
+learning training process, differential privacy is employed to introduce noise
+into the model updates, preventing attackers from inferring the contributions
+of individual samples and thereby protecting the privacy of the training data.
+Experimental results show that the proposed method, FedDP, minimally impacts
+model accuracy while effectively safeguarding the privacy of cancer pathology
+image data, with only slight decreases in the Dice, Jaccard, and Acc indices of
+0.55%, 0.63%, and 0.42%, respectively. This approach facilitates
+cross-institutional collaboration and knowledge sharing while protecting
+sensitive data privacy, providing a viable solution for further research and
+application in the medical field.
+
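+ A generic sketch of the differential-privacy step described above: clipping a
+client's model update and adding Gaussian noise before it leaves the
+institution. Hyperparameters are illustrative; this is a standard DP-SGD-style
+mechanism, not the FedDP code:
+
+import torch
+
+def dp_noisy_update(local_update, clip_norm=1.0, noise_multiplier=0.8):
+    # local_update: list of parameter-delta tensors computed by one client.
+    flat = torch.cat([p.flatten() for p in local_update])
+    scale = min(1.0, clip_norm / (float(flat.norm()) + 1e-12))
+    noisy = []
+    for p in local_update:
+        clipped = p * scale  # bound each client's contribution
+        noisy.append(clipped + torch.randn_like(p) * noise_multiplier * clip_norm)
+    return noisy
+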
+
+ comment: Accepted in BIBM2024 +
+
+
+
+
+ + ☆ Pose2Trajectory: Using Transformers on Body Pose to Predict Tennis + Player's Trajectory + + +
+ Tracking the trajectory of tennis players can help camera operators in
+production. Predicting future movement enables cameras to automatically track a
+player's trajectory without human intervention. Predicting future human
+movement in the context of complex physical tasks is also intellectually
+satisfying. Swift advancements in sports analytics and the wide availability of
+videos for tennis have inspired us to propose a novel method called
+Pose2Trajectory, which predicts a tennis player's future trajectory as a
+sequence derived from their body joints' data and ball position. Demonstrating
+impressive accuracy, our approach capitalizes on body joint information to
+provide a comprehensive understanding of the human body's geometry and motion,
+thereby enhancing the prediction of the player's trajectory. We use an
+encoder-decoder Transformer architecture trained on the players' joint and
+trajectory information together with ball positions. The predicted sequence can
+help close-up cameras keep tracking the tennis player by following centroid
+coordinates. We generate a high-quality dataset from multiple videos to assist
+tennis player movement prediction using object detection and human pose
+estimation methods. It contains bounding boxes and joint information for tennis
+players and ball positions in singles tennis games. Our method shows promising
+results in predicting the tennis player's movement trajectory at different
+sequence prediction lengths using joint and trajectory information together
+with the ball position.
+
+
+
+
+
+ + ☆ Synergy-Guided Regional Supervision of Pseudo Labels for Semi-Supervised + Medical Image Segmentation + + +
+ Semi-supervised learning has received considerable attention for its
+potential to leverage abundant unlabeled data to enhance model robustness.
+Pseudo labeling is a widely used strategy in semi-supervised learning. However,
+existing methods often suffer from noise contamination, which can undermine
+model performance. To tackle this challenge, we introduce a novel
+Synergy-Guided Regional Supervision of Pseudo Labels (SGRS-Net) framework.
+Built upon the mean teacher network, we employ a Mix Augmentation module to
+enhance the unlabeled data. By evaluating the synergy before and after
+augmentation, we strategically partition the pseudo labels into distinct
+regions. Additionally, we introduce a Region Loss Evaluation module to assess
+the loss across each delineated area. Extensive experiments conducted on the LA
+dataset have demonstrated superior performance over state-of-the-art
+techniques, underscoring the efficiency and practicality of our framework.
+
+
+
+
+
+ + ☆ CFPNet: Improving Lightweight ToF Depth Completion via Cross-zone + Feature Propagation + + +
+ Depth completion using lightweight time-of-flight (ToF) depth sensors is +attractive due to their low cost. However, lightweight ToF sensors usually have +a limited field of view (FOV) compared with cameras. Thus, only pixels in the +zone area of the image can be associated with depth signals. Previous methods +fail to propagate depth features from the zone area to the outside-zone area +effectively, thus suffering from degraded depth completion performance outside +the zone. To this end, this paper proposes the CFPNet to achieve cross-zone +feature propagation from the zone area to the outside-zone area with two novel +modules. The first is a direct-attention-based propagation module (DAPM), which +enforces direct cross-zone feature acquisition. The second is a +large-kernel-based propagation module (LKPM), which realizes cross-zone feature +propagation by utilizing convolution layers with kernel sizes up to 31. CFPNet +achieves state-of-the-art (SOTA) depth completion performance by combining +these two modules properly, as verified by extensive experimental results on +the ZJU-L5 dataset. The code will be made public. + +
+
+
+
+
+ + ☆ Deep Learning Models for UAV-Assisted Bridge Inspection: A YOLO + Benchmark Analysis + + +
+ Visual inspections of bridges are critical to ensure their safety and +identify potential failures early. This inspection process can be rapidly and +accurately automated by using unmanned aerial vehicles (UAVs) integrated with +deep learning models. However, choosing an appropriate model that is +lightweight enough to integrate into the UAV and fulfills the strict +requirements for inference time and accuracy is challenging. Therefore, our +work contributes to the advancement of this model selection process by +conducting a benchmark of 23 models belonging to the four newest YOLO variants +(YOLOv5, YOLOv6, YOLOv7, YOLOv8) on COCO-Bridge-2021+, a dataset for bridge +details detection. Through comprehensive benchmarking, we identify YOLOv8n, +YOLOv7tiny, YOLOv6m, and YOLOv6m6 as the models offering an optimal balance +between accuracy and processing speed, with mAP@50 scores of 0.803, 0.837, +0.853, and 0.872, and inference times of 5.3ms, 7.5ms, 14.06ms, and 39.33ms, +respectively. Our findings accelerate the model selection process for UAVs, +enabling more efficient and reliable bridge inspections. + +
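+ Using only the numbers quoted above, model selection under a UAV latency
+budget can be phrased as a small lookup (the 15 ms budget below is just an
+example, not a figure from the paper):
+
+# (mAP@50, inference time in ms) as reported in the abstract.
+candidates = {
+    "YOLOv8n":    (0.803, 5.30),
+    "YOLOv7tiny": (0.837, 7.50),
+    "YOLOv6m":    (0.853, 14.06),
+    "YOLOv6m6":   (0.872, 39.33),
+}
+
+def best_under_budget(models, max_latency_ms):
+    # Most accurate model that still meets the latency budget.
+    feasible = {k: v for k, v in models.items() if v[1] <= max_latency_ms}
+    return max(feasible, key=lambda k: feasible[k][0]) if feasible else None
+
+print(best_under_budget(candidates, 15.0))  # -> 'YOLOv6m'
+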
+
+
+
+
+ + ☆ FreeCap: Hybrid Calibration-Free Motion Capture in Open Environments + + +
+ We propose FreeCap, a novel hybrid calibration-free method to accurately
+capture global multi-person motions in open environments. Our system combines a
+single LiDAR with expandable moving cameras, allowing for flexible and precise
+motion estimation in a unified world coordinate system. In particular, we
+introduce a local-to-global pose-aware cross-sensor human-matching module that
+predicts the alignment between sensors, even in the absence of calibration.
+Additionally, our coarse-to-fine sensor-expandable pose optimizer further
+optimizes the 3D human key points and the alignments, and it can also
+incorporate additional cameras to enhance accuracy. Extensive experiments on
+the Human-M3 and FreeMotion datasets demonstrate that our method significantly
+outperforms state-of-the-art single-modal methods, offering an expandable and
+efficient solution for multi-person motion capture across various applications.
+
+
+
+
+
+ + ☆ Efficient single image non-uniformity correction algorithm + + +
+ This paper introduces a new way to correct the non-uniformity (NU) in
+uncooled infrared-type images. The main defect of these uncooled images is the
+lack of a column (resp. line) time-dependent cross-calibration, resulting in a
+strong column (resp. line) and time-dependent noise. This problem can be
+considered as a 1D flicker of the columns inside each frame. Thus, classic
+movie deflickering algorithms can be adapted to equalize the columns (resp. the
+lines). The proposed method therefore applies a movie deflickering algorithm to
+the series formed by the columns of an infrared image. The resulting
+single-image method works on static images, and therefore requires no
+registration, no camera motion compensation, and no closed aperture sensor
+equalization. Thus, the method has only one camera-dependent parameter, and is
+landscape independent. This simple method is compared to a state-of-the-art
+total variation single-image correction on raw real and simulated images. The
+method is real-time, requiring only two operations per pixel. It involves no
+test-pattern calibration and produces no "ghost artifacts".
+
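+ The general idea of treating the columns as frames of a 1D "movie" can be
+illustrated with a naive moment-matching equalizer; this is only a simplified
+stand-in for the deflickering operator used in the paper, with an assumed
+neighbourhood radius:
+
+import numpy as np
+
+def naive_column_equalization(img, radius=8):
+    # Match each column's mean/std to those of a local neighbourhood of columns.
+    img = img.astype(np.float64)
+    out = np.empty_like(img)
+    w = img.shape[1]
+    for c in range(w):
+        lo, hi = max(0, c - radius), min(w, c + radius + 1)
+        ref_mean = img[:, lo:hi].mean()
+        ref_std = img[:, lo:hi].std() + 1e-6
+        col = img[:, c]
+        out[:, c] = (col - col.mean()) / (col.std() + 1e-6) * ref_std + ref_mean
+    return out
+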
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2411.03615 +
+
+
+
+
+ + ☆ Properties of BV-G structures + textures decomposition models. + Application to road detection in satellite images + + +
+ In this paper we present some theoretical results about a structures-textures +image decomposition model which was proposed by the second author. We prove a +theorem which gives the behavior of this model in different cases. Finally, as +a consequence of the theorem we derive an algorithm for the detection of long +and thin objects applied to a road networks detection application in aerial or +satellite images. + +
+
+
+
+
+ + ☆ BendVLM: Test-Time Debiasing of Vision-Language Embeddings + + +
+ Vision-language model (VLM) embeddings have been shown to encode biases +present in their training data, such as societal biases that prescribe negative +characteristics to members of various racial and gender identities. VLMs are +being quickly adopted for a variety of tasks ranging from few-shot +classification to text-guided image generation, making debiasing VLM embeddings +crucial. Debiasing approaches that fine-tune the VLM often suffer from +catastrophic forgetting. On the other hand, fine-tuning-free methods typically +utilize a "one-size-fits-all" approach that assumes that correlation with the +spurious attribute can be explained using a single linear direction across all +possible inputs. In this work, we propose Bend-VLM, a nonlinear, +fine-tuning-free approach for VLM embedding debiasing that tailors the +debiasing operation to each unique input. This allows for a more flexible +debiasing approach. Additionally, we do not require knowledge of the set of +inputs a priori to inference time, making our method more appropriate for +online, open-set tasks such as retrieval and text guided image generation. + +
+
+
+
+
+ + ☆ Image Understanding Makes for A Good Tokenizer for Image Generation NeurIPS 2024 + + +
+ Modern image generation (IG) models have been shown to capture rich
+semantics valuable for image understanding (IU) tasks. However, the potential
+of IU models to improve IG performance remains uncharted. We address this issue
+using a token-based IG framework, which relies on effective tokenizers to
+project images into token sequences. Currently, pixel reconstruction (e.g.,
+VQGAN) dominates the training objective for image tokenizers. In contrast, our
+approach adopts the feature reconstruction objective, where tokenizers are
+trained by distilling knowledge from pretrained IU encoders. Comprehensive
+comparisons indicate that tokenizers with strong IU capabilities achieve
+superior IG performance across a variety of metrics, datasets, tasks, and
+proposal networks. Notably, VQ-KD CLIP achieves $4.10$ FID on ImageNet-1k
+(IN-1k). Visualization suggests that the superiority of VQ-KD can be partly
+attributed to the rich semantics within the VQ-KD codebook. We further
+introduce a straightforward pipeline to directly transform IU encoders into
+tokenizers, demonstrating exceptional effectiveness for IG tasks. These
+discoveries may energize further exploration into image tokenizer research and
+inspire the community to reassess the relationship between IU and IG. The code
+is released at https://github.com/magic-research/vector_quantization.
+
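+ A compact sketch of a feature-reconstruction objective of the kind described
+above, where the tokenizer's decoder output is matched to features from a
+frozen image-understanding encoder such as CLIP; the cosine-distance form is
+one common choice and an assumption here, not necessarily the paper's exact
+loss:
+
+import torch
+import torch.nn.functional as F
+
+def feature_reconstruction_loss(decoder_out, teacher_features):
+    # decoder_out: features reconstructed from the quantized tokens.
+    # teacher_features: targets from a frozen pretrained IU encoder (e.g. CLIP).
+    s = F.normalize(decoder_out, dim=-1)
+    t = F.normalize(teacher_features.detach(), dim=-1)
+    return (1.0 - (s * t).sum(dim=-1)).mean()  # mean cosine distance
+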
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ☆ Enhancing Bronchoscopy Depth Estimation through Synthetic-to-Real Domain + Adaptation + + +
+ Monocular depth estimation has shown promise in general imaging tasks, aiding +in localization and 3D reconstruction. While effective in various domains, its +application to bronchoscopic images is hindered by the lack of labeled data, +challenging the use of supervised learning methods. In this work, we propose a +transfer learning framework that leverages synthetic data with depth labels for +training and adapts domain knowledge for accurate depth estimation in real +bronchoscope data. Our network demonstrates improved depth prediction on real +footage using domain adaptation compared to training solely on synthetic data, +validating our approach. + +
+
+
+
+
+ + ☆ ProGraph: Temporally-alignable Probability Guided Graph Topological + Modeling for 3D Human Reconstruction + + +
+ Current 3D human motion reconstruction methods from monocular videos rely on
+features within the current reconstruction window, leading to distortion and
+deformations in the human structure under local occlusions or blurriness in
+video frames. To estimate realistic 3D human mesh sequences based on incomplete
+features, we propose Temporally-alignable Probability Guided Graph Topological
+Modeling for 3D Human Reconstruction (ProGraph). To recover missing parts, we
+exploit an explicit topology-aware probability distribution across the entire
+motion sequence. To restore the complete human, Graph Topological Modeling
+(GTM) learns the underlying topological structure, focusing on the
+relationships inherent in the individual parts. Next, to generate blurred
+motion parts, Temporal-alignable Probability Distribution (TPDist) utilizes the
+GTM to predict features based on this distribution. This interactive mechanism
+facilitates motion consistency, allowing the restoration of human parts.
+Furthermore, Hierarchical Human Loss (HHLoss) constrains the probability
+distribution errors of inter-frame features during topological structure
+variation. Our method achieves superior results compared to other SOTA methods
+in addressing occlusions and blurriness on 3DPW.
+
+
+
+
+
+ + ☆ MegaPortrait: Revisiting Diffusion Control for High-fidelity Portrait + Generation + + +
+ We propose MegaPortrait, an innovative system for creating personalized
+portrait images in computer vision. It consists of three modules: Identity Net,
+Shading Net, and Harmonization Net. Identity Net generates a learned identity
+using a customized model fine-tuned on source images. Shading Net re-renders
+portraits using the extracted representations. Harmonization Net fuses the
+pasted face and the reference image's body to produce coherent results. With
+off-the-shelf ControlNets, our approach outperforms state-of-the-art AI
+portrait products in identity preservation and image fidelity. MegaPortrait has
+a simple but effective design, and we compare it with other methods and
+products to demonstrate its superiority.
+
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ LidaRefer: Outdoor 3D Visual Grounding for Autonomous Driving with + Transformers + + +
+ 3D visual grounding (VG) aims to locate relevant objects or regions within 3D
+scenes based on natural language descriptions. Although recent methods for
+indoor 3D VG have successfully adopted transformer-based architectures to
+capture global contextual information and enable fine-grained cross-modal
+fusion, they are unsuitable for outdoor environments due to differences in the
+distribution of point clouds between indoor and outdoor settings. First,
+extensive LiDAR point clouds demand unacceptable computational and memory
+resources within transformers due to the high-dimensional visual features.
+Second, dominant background points and empty spaces in sparse LiDAR point
+clouds complicate cross-modal fusion owing to their irrelevant visual
+information. To address these challenges, we propose LidaRefer, a
+transformer-based 3D VG framework designed for large-scale outdoor scenes.
+Moreover, during training, we introduce a simple and effective localization
+method, which supervises the decoder's queries to localize not only a target
+object but also ambiguous objects that might be confused with the target
+because they exhibit similar attributes in the scene or because the language
+description is misinterpreted. This supervision enhances the model's ability to
+distinguish ambiguous objects from a target by learning the differences in
+their spatial relationships and attributes. LidaRefer achieves state-of-the-art
+performance on Talk2Car-3D, a 3D VG dataset for autonomous driving, with
+significant improvements under various evaluation settings.
+
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ UEVAVD: A Dataset for Developing UAV's Eye View Active Object Detection + + +
+ Occlusion is a longstanding difficulty that challenges UAV-based object
+detection. Many works address this problem by adapting the detection model.
+However, few of them exploit the fact that the UAV could fundamentally improve
+detection performance by changing its viewpoint. Active Object Detection (AOD)
+offers an effective way to achieve this purpose. Through Deep Reinforcement
+Learning (DRL), AOD endows the UAV with the ability to plan paths autonomously
+in search of observations that are more conducive to target identification.
+Unfortunately, no dataset is available for developing UAV AOD methods. To fill
+this gap, we release a UAV's eye view active vision dataset named UEVAVD and
+hope it can facilitate research on the UAV AOD problem. Additionally, we
+improve the existing DRL-based AOD method by incorporating inductive biases
+when learning the state representation. First, due to partial observability, we
+use a gated recurrent unit to extract state representations from the
+observation sequence instead of the single-view observation. Second, we
+pre-decompose the scene with the Segment Anything Model (SAM) and filter out
+irrelevant information with the derived masks. With these practices, the agent
+can learn an active viewing policy with better generalization capability. The
+effectiveness of our innovations is validated by experiments on the UEVAVD
+dataset. Our dataset will soon be available at
+https://github.com/Leo000ooo/UEVAVD_dataset.
+
+
+
+
+
+ + ☆ GazeGen: Gaze-Driven User Interaction for Visual Content Generation + + +
+ We present GazeGen, a user interaction system that generates visual content +(images and videos) for locations indicated by the user's eye gaze. GazeGen +allows intuitive manipulation of visual content by targeting regions of +interest with gaze. Using advanced techniques in object detection and +generative AI, GazeGen performs gaze-controlled image adding/deleting, +repositioning, and surface material changes of image objects, and converts +static images into videos. Central to GazeGen is the DFT Gaze (Distilled and +Fine-Tuned Gaze) agent, an ultra-lightweight model with only 281K parameters, +performing accurate real-time gaze predictions tailored to individual users' +eyes on small edge devices. GazeGen is the first system to combine visual +content generation with real-time gaze estimation, made possible exclusively by +DFT Gaze. This real-time gaze estimation enables various visual content +generation tasks, all controlled by the user's gaze. The input for DFT Gaze is +the user's eye images, while the inputs for visual content generation are the +user's view and the predicted gaze point from DFT Gaze. To achieve efficient +gaze predictions, we derive the small model from a large model (10x larger) via +novel knowledge distillation and personal adaptation techniques. We integrate +knowledge distillation with a masked autoencoder, developing a compact yet +powerful gaze estimation model. This model is further fine-tuned with Adapters, +enabling highly accurate and personalized gaze predictions with minimal user +input. DFT Gaze ensures low-latency and precise gaze tracking, supporting a +wide range of gaze-driven tasks. We validate the performance of DFT Gaze on AEA +and OpenEDS2020 benchmarks, demonstrating low angular gaze error and low +latency on the edge device (Raspberry Pi 4). Furthermore, we describe +applications of GazeGen, illustrating its versatility and effectiveness in +various usage scenarios. + +
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ☆ HandCraft: Anatomically Correct Restoration of Malformed Hands in + Diffusion Generated Images WACV 2025 + + +
+ Generative text-to-image models, such as Stable Diffusion, have demonstrated +a remarkable ability to generate diverse, high-quality images. However, they +are surprisingly inept when it comes to rendering human hands, which are often +anatomically incorrect or reside in the "uncanny valley". In this paper, we +propose a method HandCraft for restoring such malformed hands. This is achieved +by automatically constructing masks and depth images for hands as conditioning +signals using a parametric model, allowing a diffusion-based image editor to +fix the hand's anatomy and adjust its pose while seamlessly integrating the +changes into the original image, preserving pose, color, and style. Our +plug-and-play hand restoration solution is compatible with existing pretrained +diffusion models, and the restoration process facilitates adoption by eschewing +any fine-tuning or training requirements for the diffusion models. We also +contribute MalHand datasets that contain generated images with a wide variety +of malformed hands in several styles for hand detector training and hand +restoration benchmarking, and demonstrate through qualitative and quantitative +evaluation that HandCraft not only restores anatomical correctness but also +maintains the integrity of the overall image. + +
+
+ comment: Accepted by WACV 2025 +
+
+
+
+
+ + ♻ ☆ A Comparative Analysis of U-Net-based models for Segmentation of Cardiac + MRI + + +
+ Medical imaging refers to the technologies and methods utilized to view the
+human body and its interior, in order to diagnose, monitor, or even treat
+medical disorders. This paper explores the application of deep learning
+techniques in the semantic segmentation of cardiac short-axis MRI (Magnetic
+Resonance Imaging) images, aiming to enhance the diagnosis, monitoring, and
+treatment of medical disorders related to the heart. The focus centers on
+implementing various architectures that are derivatives of U-Net, to
+effectively isolate specific parts of the heart for comprehensive anatomical
+and functional analysis. Through a combination of images, graphs, and
+quantitative metrics, the efficacy of the models and their predictions is
+showcased. Additionally, this paper addresses the challenges encountered and
+outlines strategies for future improvements. This abstract provides a concise
+overview of the efforts in utilizing deep learning for cardiac image
+segmentation, emphasizing both the accomplishments and areas for further
+refinement.
+
+
+
+
+
+ + ♻ ☆ GD doesn't make the cut: Three ways that non-differentiability affects + neural network training + + +
+ This paper critically examines the fundamental distinctions between gradient
+methods applied to non-differentiable functions (NGDMs) and classical gradient
+descents (GDs) for differentiable functions, revealing significant gaps in
+current deep learning optimization theory. We demonstrate that NGDMs exhibit
+markedly different convergence properties compared to GDs, strongly challenging
+the applicability of the extensive neural network convergence literature based
+on $L$-smoothness to non-smooth neural networks. Our analysis reveals
+paradoxical behavior of NGDM solutions for $L_{1}$-regularized problems, where
+increasing regularization counterintuitively leads to larger $L_{1}$ norms of
+optimal solutions. This finding calls into question widely adopted $L_{1}$
+penalization techniques for network pruning. We further challenge the common
+assumption that optimization algorithms like RMSProp behave similarly in
+differentiable and non-differentiable contexts. Expanding on the Edge of
+Stability phenomenon, we demonstrate its occurrence in a broader class of
+functions, including Lipschitz continuous convex differentiable functions. This
+finding raises important questions about its relevance and interpretation in
+non-convex, non-differentiable neural networks, particularly those using ReLU
+activations. Our work identifies critical misunderstandings of NGDMs in
+influential literature, stemming from an overreliance on strong smoothness
+assumptions. These findings necessitate a reevaluation of optimization dynamics
+in deep learning, emphasizing the crucial need for more nuanced theoretical
+foundations in analyzing these complex systems.
+
+
+
+
+
+ + ♻ ☆ Exploring QUIC Dynamics: A Large-Scale Dataset for Encrypted Traffic + Analysis + + +
+ QUIC, a new and increasingly used transport protocol, addresses and resolves
+the limitations of TCP by offering improved security, performance, and features
+such as stream multiplexing and connection migration. These features, however,
+also present challenges for network operators who need to monitor and analyze
+web traffic. In this paper, we introduce VisQUIC, a labeled dataset comprising
+over 100,000 QUIC traces from more than 44,000 websites (URLs), collected over
+a four-month period. These traces provide the foundation for generating more
+than seven million images, with configurable parameters of window length, pixel
+resolution, normalization, and labels. These images enable an observer looking
+at the interactions between a client and a server to analyze and gain insights
+about encrypted QUIC connections. To illustrate the dataset's potential, we
+offer a use-case example of an observer estimating the number of HTTP/3
+request/response pairs in a given QUIC connection, which can reveal server
+behavior, client--server interactions, and the load imposed by an observed
+connection. We formulate the problem as a discrete regression problem, train a
+machine learning (ML) model for it, and then evaluate it using the proposed
+dataset on an example use case.
+
+
+ comment: The dataset and the supplementary material can be provided upon + request +
+
+
+
+
+ + ♻ ☆ C3T: Cross-modal Transfer Through Time for Human Action Recognition + + +
+ In order to unlock the potential of diverse sensors, we investigate a method +to transfer knowledge between modalities using the structure of a unified +multimodal representation space for Human Action Recognition (HAR). We +formalize and explore an understudied cross-modal transfer setting we term +Unsupervised Modality Adaptation (UMA), where the modality used in testing is +not used in supervised training, i.e. zero labeled instances of the test +modality are available during training. We develop three methods to perform +UMA: Student-Teacher (ST), Contrastive Alignment (CA), and Cross-modal Transfer +Through Time (C3T). Our extensive experiments on various camera+IMU datasets +compare these methods to each other in the UMA setting, and to their empirical +upper bound in the supervised setting. The results indicate C3T is the most +robust and highest performing by at least a margin of 8%, and nears the +supervised setting performance even in the presence of temporal noise. This +method introduces a novel mechanism for aligning signals across time-varying +latent vectors, extracted from the receptive field of temporal convolutions. +Our findings suggest that C3T has significant potential for developing +generalizable models for time-series sensor data, opening new avenues for +multi-modal learning in various applications. + +
+
+
+
+
+ + ♻ ☆ Adaptive Caching for Faster Video Generation with Diffusion Transformers + + +
+ Generating temporally-consistent high-fidelity videos can be computationally +expensive, especially over longer temporal spans. More-recent Diffusion +Transformers (DiTs) -- despite making significant headway in this context -- +have only heightened such challenges as they rely on larger models and heavier +attention mechanisms, resulting in slower inference speeds. In this paper, we +introduce a training-free method to accelerate video DiTs, termed Adaptive +Caching (AdaCache), which is motivated by the fact that "not all videos are +created equal": meaning, some videos require fewer denoising steps to attain a +reasonable quality than others. Building on this, we not only cache +computations through the diffusion process, but also devise a caching schedule +tailored to each video generation, maximizing the quality-latency trade-off. We +further introduce a Motion Regularization (MoReg) scheme to utilize video +information within AdaCache, essentially controlling the compute allocation +based on motion content. Altogether, our plug-and-play contributions grant +significant inference speedups (e.g. up to 4.7x on Open-Sora 720p - 2s video +generation) without sacrificing the generation quality, across multiple video +DiT baselines. + +
+
+ comment: Project-page is available at https://adacache-dit.github.io +
+
+
+
+
+ + ♻ ☆ CardioSpectrum: Comprehensive Myocardium Motion Analysis with 3D Deep + Learning and Geometric Insights MICCAI 2024 + + +
+ The ability to map left ventricle (LV) myocardial motion using computed +tomography angiography (CTA) is essential to diagnosing cardiovascular +conditions and guiding interventional procedures. Due to their inherent +locality, conventional neural networks typically have difficulty predicting +subtle tangential movements, which considerably lessens the level of precision +at which myocardium three-dimensional (3D) mapping can be performed. Using 3D +optical flow techniques and Functional Maps (FMs), we present a comprehensive +approach to address this problem. FMs are known for their capacity to capture +global geometric features, thus providing a fuller understanding of 3D +geometry. As an alternative to traditional segmentation-based priors, we employ +surface-based two-dimensional (2D) constraints derived from spectral +correspondence methods. Our 3D deep learning architecture, based on the ARFlow +model, is optimized to handle complex 3D motion analysis tasks. By +incorporating FMs, we can capture the subtle tangential movements of the +myocardium surface precisely, hence significantly improving the accuracy of 3D +mapping of the myocardium. The experimental results confirm the effectiveness +of this method in enhancing myocardium motion analysis. This approach can +contribute to improving cardiovascular diagnosis and treatment. Our code and +additional resources are available at: +https://shaharzuler.github.io/CardioSpectrumPage + +
+
+ comment: This paper has been early accepted to MICCAI 2024, LNCS 15005, + Springer, 2024 +
+
+
+
+
+ + ♻ ☆ Pediatric Wrist Fracture Detection Using Feature Context Excitation + Modules in X-ray Images + + +
+ Children often suffer wrist trauma in daily life, and they usually need
+radiologists to analyze and interpret X-ray images before surgical treatment by
+surgeons. The development of deep learning has enabled neural networks to serve
+as computer-assisted diagnosis (CAD) tools to help doctors and experts in
+medical image diagnostics. Since the YOLOv8 model has achieved satisfactory
+success in object detection tasks, it has been applied to various fracture
+detection tasks. This work introduces four variants of the Feature Contexts
+Excitation-YOLOv8 (FCE-YOLOv8) model, each incorporating a different FCE module
+(i.e., modules of Squeeze-and-Excitation (SE), Global Context (GC),
+Gather-Excite (GE), and Gaussian Context Transformer (GCT)) to enhance the
+model performance. Experimental results on the GRAZPEDWRI-DX dataset
+demonstrate that our proposed YOLOv8+GC-M3 model improves the mAP@50 value from
+65.78% to 66.32%, outperforming the state-of-the-art (SOTA) model while
+reducing inference time. Furthermore, our proposed YOLOv8+SE-M3 model achieves
+the highest mAP@50 value of 67.07%, exceeding the SOTA performance. The
+implementation of this work is available at
+https://github.com/RuiyangJu/FCE-YOLOv8.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2407.03163 +
+
+
+
+
+ + ♻ ☆ Knowledge Graphs of Driving Scenes to Empower the Emerging Capabilities + of Neurosymbolic AI + + +
+ In the era of Generative AI, Neurosymbolic AI is emerging as a powerful +approach for tasks spanning from perception to cognition. The use of +Neurosymbolic AI has been shown to achieve enhanced capabilities, including +improved grounding, alignment, explainability, and reliability. However, due to +its nascent stage, there is a lack of widely available real-world benchmark +datasets tailored to Neurosymbolic AI tasks. To address this gap and support +the evaluation of current and future methods, we introduce DSceneKG -- a suite +of knowledge graphs of driving scenes built from real-world, high-quality +scenes from multiple open autonomous driving datasets. In this article, we +detail the construction process of DSceneKG and highlight its application in +seven different tasks. DSceneKG is publicly accessible at: +https://github.com/ruwantw/DSceneKG + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Interpreting CLIP: Insights on the Robustness to ImageNet Distribution + Shifts + + +
+ What distinguishes robust models from non-robust ones? While for ImageNet
+distribution shifts it has been shown that such differences in robustness can
+be traced back predominantly to differences in training data, so far it is not
+known what that translates to in terms of what the model has learned. In this
+work, we bridge this gap by probing the representation spaces of 16 robust
+zero-shot CLIP vision encoders with various backbones (ResNets and ViTs) and
+pretraining sets (OpenAI, LAION-400M, LAION-2B, YFCC15M, CC12M and DataComp),
+and comparing them to the representation spaces of less robust models with
+identical backbones, but different (pre)training sets or objectives (CLIP
+pretraining on ImageNet-Captions, and supervised training or finetuning on
+ImageNet). Through this analysis, we generate three novel insights. Firstly, we
+detect the presence of outlier features in robust zero-shot CLIP vision
+encoders, which to the best of our knowledge is the first time these are
+observed in non-language and non-transformer models. Secondly, we find the
+existence of outlier features to be an indication of ImageNet shift robustness
+in models, since we only find them in robust models in our analysis. Lastly, we
+also investigate the number of unique encoded concepts in the representation
+space and find zero-shot CLIP models to encode a higher number of unique
+concepts in their representation space. However, we do not find this to be an
+indicator of ImageNet shift robustness and hypothesize that it is rather
+related to the language supervision. Since the presence of outlier features can
+be detected without access to any data from shifted datasets, we believe that
+they could be a useful tool for practitioners to get a feeling for the
+distribution shift robustness of a pretrained model during deployment.
+
+
+ comment: Published in TMLR +
+
+
+
+
+ + ♻ ☆ Mini-InternVL: A Flexible-Transfer Pocket Multimodal Model with 5% + Parameters and 90% Performance + + +
+ Multimodal large language models (MLLMs) have demonstrated impressive +performance in vision-language tasks across a broad spectrum of domains. +However, the large model scale and associated high computational costs pose +significant challenges for training and deploying MLLMs on consumer-grade GPUs +or edge devices, thereby hindering their widespread application. In this work, +we introduce Mini-InternVL, a series of MLLMs with parameters ranging from 1B +to 4B, which achieves 90% of the performance with only 5% of the parameters. +This significant improvement in efficiency and effectiveness makes our models +more accessible and applicable in various real-world scenarios. To further +promote the adoption of our models, we develop a unified adaptation framework +for Mini-InternVL, which enables our models to transfer and outperform +specialized models in downstream tasks, including autonomous driving, medical +images, and remote sensing. We believe that our study can provide valuable +insights and resources to advance the development of efficient and effective +MLLMs. Code is available at https://github.com/OpenGVLab/InternVL. + +
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ BrainSegFounder: Towards 3D Foundation Models for Neuroimage + Segmentation + + +
+ The burgeoning field of brain health research increasingly leverages
+artificial intelligence (AI) to interpret and analyze neurological data. This
+study introduces a novel approach towards the creation of medical foundation
+models by integrating a large-scale multi-modal magnetic resonance imaging
+(MRI) dataset of 41,400 participants. Our method involves a novel two-stage
+pretraining approach using vision transformers. The first stage is dedicated to
+encoding anatomical structures in generally healthy brains, identifying key
+features such as shapes and sizes of different brain regions. The second stage
+concentrates on spatial information, encompassing aspects like location and the
+relative positioning of brain structures. We rigorously evaluate our model,
+BrainFounder, using the Brain Tumor Segmentation (BraTS) challenge and
+Anatomical Tracings of Lesions After Stroke v2.0 (ATLAS v2.0) datasets.
+BrainFounder demonstrates a significant performance gain, surpassing the
+achievements of the previous winning solutions using fully supervised learning.
+Our findings underscore the impact of scaling up both the complexity of the
+model and the volume of unlabeled training data derived from generally healthy
+brains, which enhances the accuracy and predictive capabilities of the model in
+complex neuroimaging tasks with MRI. The implications of this research provide
+transformative insights and practical applications in healthcare and make
+substantial steps towards the creation of foundation models for Medical AI. Our
+pretrained models and training code can be found at
+https://github.com/lab-smile/GatorBrain.
+
+
+ comment: 19 pages, 5 figures, to be published in Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ DiT4Edit: Diffusion Transformer for Image Editing + + +
+ Despite recent advances in UNet-based image editing, methods for shape-aware +object editing in high-resolution images are still lacking. Compared to UNet, +Diffusion Transformers (DiT) demonstrate superior capabilities to effectively +capture the long-range dependencies among patches, leading to higher-quality +image generation. In this paper, we propose DiT4Edit, the first Diffusion +Transformer-based image editing framework. Specifically, DiT4Edit uses the +DPM-Solver inversion algorithm to obtain the inverted latents, reducing the +number of steps compared to the DDIM inversion algorithm commonly used in +UNet-based frameworks. Additionally, we design unified attention control and +patches merging, tailored for transformer computation streams. This integration +allows our framework to generate higher-quality edited images faster. Our +design leverages the advantages of DiT, enabling it to surpass UNet structures +in image editing, especially in high-resolution and arbitrary-size images. +Extensive experiments demonstrate the strong performance of DiT4Edit across +various editing scenarios, highlighting the potential of Diffusion Transformers +in supporting image editing. + +
+
+
+
+
+ + ♻ ☆ SpikeBottleNet: Spike-Driven Feature Compression Architecture for + Edge-Cloud Co-Inference + + +
+ Edge-cloud co-inference enables efficient deep neural network (DNN)
+deployment by splitting the architecture between an edge device and cloud
+server, which is crucial for resource-constrained edge devices. This approach
+requires balancing on-device computations and communication costs, often
+achieved through compressed intermediate feature transmission. Conventional DNN
+architectures require continuous data processing and floating-point
+activations, leading to considerable energy consumption and increased feature
+sizes, thus raising transmission costs. This challenge motivates exploring
+binary, event-driven activations using spiking neural networks (SNNs), known
+for their extreme energy efficiency. In this research, we propose
+SpikeBottleNet, a novel architecture for edge-cloud co-inference systems that
+integrates a spiking neuron model to significantly reduce energy consumption on
+edge devices. A key innovation of our study is an intermediate feature
+compression technique tailored for SNNs for efficient feature transmission.
+This technique leverages a split computing approach to strategically place
+encoder-decoder bottleneck units within complex deep architectures like ResNet
+and MobileNet. Experimental results demonstrate that SpikeBottleNet achieves up
+to 256x bit compression in the final convolutional layer of ResNet, with
+minimal accuracy loss (0.16%). Additionally, our approach enhances edge device
+energy efficiency by up to 144x compared to the baseline BottleNet, making it
+ideal for resource-limited edge devices.
+
+
+
+
+
+ + ♻ ☆ Local Padding in Patch-Based GANs for Seamless Infinite-Sized Texture + Synthesis + + +
+ Texture models based on Generative Adversarial Networks (GANs) use
+zero-padding to implicitly encode positional information of the image features.
+However, when extending the spatial input to generate images at large sizes,
+zero-padding can often lead to degradation in image quality due to the
+incorrect positional information at the center of the image. Moreover,
+zero-padding can limit the diversity within the generated large images. In this
+paper, we propose a novel approach for generating stochastic texture images at
+large arbitrary sizes using GANs based on patch-by-patch generation. Instead of
+zero-padding, the model uses local padding in the generator that shares border
+features between the generated patches; providing positional context and
+ensuring consistency at the boundaries. The proposed models are trainable on a
+single texture image and have a constant GPU scalability with respect to the
+output image size, and hence can generate images of infinite sizes. We show in
+the experiments that our method has a significant advancement beyond existing
+GANs-based texture models in terms of the quality and diversity of the
+generated textures. Furthermore, the implementation of local padding in the
+state-of-the-art super-resolution models effectively eliminates tiling
+artifacts enabling large-scale super-resolution. Our code is available at
+https://github.com/ai4netzero/Infinite_Texture_GANs.
+
+
+
+
+
+ + ♻ ☆ DN-Splatter: Depth and Normal Priors for Gaussian Splatting and Meshing WACV + + +
+ High-fidelity 3D reconstruction of common indoor scenes is crucial for VR and +AR applications. 3D Gaussian splatting, a novel differentiable rendering +technique, has achieved state-of-the-art novel view synthesis results with high +rendering speeds and relatively low training times. However, its performance on +scenes commonly seen in indoor datasets is poor due to the lack of geometric +constraints during optimization. In this work, we explore the use of readily +accessible geometric cues to enhance Gaussian splatting optimization in +challenging, ill-posed, and textureless scenes. We extend 3D Gaussian splatting +with depth and normal cues to tackle challenging indoor datasets and showcase +techniques for efficient mesh extraction. Specifically, we regularize the +optimization procedure with depth information, enforce local smoothness of +nearby Gaussians, and use off-the-shelf monocular networks to achieve better +alignment with the true scene geometry. We propose an adaptive depth loss based +on the gradient of color images, improving depth estimation and novel view +synthesis results over various baselines. Our simple yet effective +regularization technique enables direct mesh extraction from the Gaussian +representation, yielding more physically accurate reconstructions of indoor +scenes. + +
+
+ comment: To be published in 2025 IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV) +
+
+
+
+
+ + ♻ ☆ Representing Domain-Mixing Optical Degradation for Real-World + Computational Aberration Correction via Vector Quantization + + +
+ Relying on paired synthetic data, existing learning-based Computational +Aberration Correction (CAC) methods are confronted with the intricate and +multifaceted synthetic-to-real domain gap, which leads to suboptimal +performance in real-world applications. In this paper, in contrast to improving +the simulation pipeline, we deliver a novel insight into real-world CAC from +the perspective of Unsupervised Domain Adaptation (UDA). By incorporating +readily accessible unpaired real-world data into training, we formalize the +Domain Adaptive CAC (DACAC) task, and then introduce a comprehensive Real-world +aberrated images (Realab) dataset to benchmark it. The setup task presents a +formidable challenge due to the intricacy of understanding the target optical +degradation domain. To this intent, we propose a novel Quantized Domain-Mixing +Representation (QDMR) framework as a potent solution to the issue. Centering +around representing and quantizing the optical degradation which is consistent +across different images, QDMR adapts the CAC model to the target domain from +three key aspects: (1) reconstructing aberrated images of both domains by a +VQGAN to learn a Domain-Mixing Codebook (DMC) characterizing the optical +degradation; (2) modulating the deep features in CAC model with DMC to transfer +the target domain knowledge; and (3) leveraging the trained VQGAN to generate +pseudo target aberrated images from the source ones for convincing target +domain supervision. Extensive experiments on both synthetic and real-world +benchmarks reveal that the models with QDMR consistently surpass the +competitive methods in mitigating the synthetic-to-real gap, which produces +visually pleasant real-world CAC results with fewer artifacts. Codes and +datasets are made publicly available at https://github.com/zju-jiangqi/QDMR. + +
+
+ comment: Accepted to Optics & Laser Technology. Codes and datasets are made + publicly available at https://github.com/zju-jiangqi/QDMR +
+
+
+
+
+ + ♻ ☆ Evaluating alignment between humans and neural network representations + in image-based learning tasks + + +
+ Humans represent scenes and objects in rich feature spaces, carrying +information that allows us to generalise about category memberships and +abstract functions with few examples. What determines whether a neural network +model generalises like a human? We tested how well the representations of $86$ +pretrained neural network models mapped to human learning trajectories across +two tasks where humans had to learn continuous relationships and categories of +natural images. In these tasks, both human participants and neural networks +successfully identified the relevant stimulus features within a few trials, +demonstrating effective generalisation. We found that while training dataset +size was a core determinant of alignment with human choices, contrastive +training with multi-modal data (text and imagery) was a common feature of +currently publicly available models that predicted human generalisation. +Intrinsic dimensionality of representations had different effects on alignment +for different model types. Lastly, we tested three sets of human-aligned +representations and found no consistent improvements in predictive accuracy +compared to the baselines. In conclusion, pretrained neural networks can serve +to extract representations for cognitive models, as they appear to capture some +fundamental aspects of cognition that are transferable across tasks. Both our +paradigms and modelling approach offer a novel way to quantify alignment +between neural networks and humans and extend cognitive science into more +naturalistic domains. + +
+
+
+
+
+ + ♻ ☆ Differentially Private Integrated Decision Gradients (IDG-DP) for + Radar-based Human Activity Recognition WACV 2025 + + +
+ Human motion analysis offers significant potential for healthcare monitoring +and early detection of diseases. The advent of radar-based sensing systems has +captured the spotlight for they are able to operate without physical contact +and they can integrate with pre-existing Wi-Fi networks. They are also seen as +less privacy-invasive compared to camera-based systems. However, recent +research has shown high accuracy in recognizing subjects or gender from radar +gait patterns, raising privacy concerns. This study addresses these issues by +investigating privacy vulnerabilities in radar-based Human Activity Recognition +(HAR) systems and proposing a novel method for privacy preservation using +Differential Privacy (DP) driven by attributions derived with Integrated +Decision Gradient (IDG) algorithm. We investigate Black-box Membership +Inference Attack (MIA) Models in HAR settings across various levels of +attacker-accessible information. We extensively evaluated the effectiveness of +the proposed IDG-DP method by designing a CNN-based HAR model and rigorously +assessing its resilience against MIAs. Experimental results demonstrate the +potential of IDG-DP in mitigating privacy attacks while maintaining utility +across all settings, particularly excelling against label-only and shadow model +black-box MIA attacks. This work represents a crucial step towards balancing +the need for effective radar-based HAR with robust privacy protection in +healthcare environments. + +
+
+ comment: Accepted at WACV 2025. 12 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ ICAL: Implicit Character-Aided Learning for Enhanced Handwritten + Mathematical Expression Recognition ICDAR 2024 + + +
+ Significant progress has been made in the field of handwritten mathematical +expression recognition, but existing encoder-decoder methods usually +struggle to model global information in $LaTeX$. Therefore, this paper +introduces a novel approach, Implicit Character-Aided Learning (ICAL), to mine +the global expression information and enhance handwritten mathematical +expression recognition. Specifically, we propose the Implicit Character +Construction Module (ICCM) to predict implicit character sequences and use a +Fusion Module to merge the outputs of the ICCM and the decoder, thereby +producing corrected predictions. By modeling and utilizing implicit character +information, ICAL achieves a more accurate and context-aware interpretation of +handwritten mathematical expressions. Experimental results demonstrate that +ICAL notably surpasses the state-of-the-art (SOTA) models, improving the +expression recognition rate (ExpRate) by 2.25\%/1.81\%/1.39\% on the CROHME +2014/2016/2019 datasets respectively, and achieves a remarkable 69.06\% on the +challenging HME100k test set. We make our code available on GitHub: +https://github.com/qingzhenduyu/ICAL + +
+
+ comment: ICDAR 2024 Oral Paper +
+
+
+
+
+ + ♻ ☆ CapS-Adapter: Caption-based MultiModal Adapter in Zero-Shot + Classification + + +
+ Recent advances in vision-language foundational models, such as CLIP, have +demonstrated significant strides in zero-shot classification. However, the +extensive parameterization of models like CLIP necessitates a +resource-intensive fine-tuning process. In response, TIP-Adapter and SuS-X have +introduced training-free methods aimed at bolstering the efficacy of downstream +tasks. While these approaches incorporate support sets to maintain data +distribution consistency between knowledge cache and test sets, they often fall +short in terms of generalization on the test set, particularly when faced with +test data exhibiting substantial distributional variations. In this work, we +present CapS-Adapter, an innovative method that employs a caption-based support +set, effectively harnessing both image and caption features to exceed existing +state-of-the-art techniques in training-free scenarios. CapS-Adapter adeptly +constructs support sets that closely mirror target distributions, utilizing +instance-level distribution features extracted from multimodal large models. By +leveraging CLIP's single and cross-modal strengths, CapS-Adapter enhances +predictive accuracy through the use of multimodal support sets. Our method +achieves outstanding zero-shot classification results across 19 benchmark +datasets, improving accuracy by 2.19\% over the previous leading method. Our +contributions are substantiated through extensive validation on multiple +benchmark datasets, demonstrating superior performance and robust +generalization capabilities. Our code is made publicly available at +https://github.com/WLuLi/CapS-Adapter. + +
+
+ comment: ACM Multimedia 2024 Poster +
+
+
+
+
+ + ♻ ☆ Robust Classification by Coupling Data Mollification with Label + Smoothing + + +
+ Introducing training-time augmentations is a key technique to enhance +generalization and prepare deep neural networks against test-time corruptions. +Inspired by the success of generative diffusion models, we propose a novel +approach of coupling data mollification, in the form of image noising and +blurring, with label smoothing to align predicted label confidences with image +degradation. The method is simple to implement, introduces negligible +overheads, and can be combined with existing augmentations. We demonstrate +improved robustness and uncertainty quantification on the corrupted image +benchmarks of the CIFAR and TinyImageNet datasets. + +
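A minimal sketch of the coupling described above, assuming Gaussian noising as the mollification and a linear label-smoothing schedule; the actual corruption operators and schedule in the paper may differ.

```python
# Sketch of coupling data mollification with label smoothing: the more an
# image is degraded, the less confident its training target becomes.
import torch
import torch.nn.functional as F

def mollify_batch(images, labels, num_classes, max_sigma=0.5):
    """images: (B,3,H,W) in [0,1]; labels: (B,) integer class ids."""
    b = images.size(0)
    t = torch.rand(b, device=images.device)            # degradation level per image
    sigma = (t * max_sigma).view(b, 1, 1, 1)
    noisy = (images + sigma * torch.randn_like(images)).clamp(0, 1)
    alpha = t.view(b, 1)                                 # smoothing grows with degradation
    one_hot = F.one_hot(labels, num_classes).float()
    soft = (1 - alpha) * one_hot + alpha / num_classes
    return noisy, soft

# Training step with soft targets, e.g.
# loss = -(soft * F.log_softmax(model(noisy), dim=1)).sum(dim=1).mean()
```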
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ LOVA3: Learning to Visual Question Answering, Asking and Assessment NeurIPS 2024 + + +
+ Question answering, asking, and assessment are three innate human traits +crucial for understanding the world and acquiring knowledge. By enhancing these +capabilities, humans can more effectively utilize data, leading to better +comprehension and learning outcomes. Current Multimodal Large Language Models +(MLLMs) primarily focus on question answering, often neglecting the full +potential of questioning and assessment skills. Inspired by the human learning +mechanism, we introduce LOVA3, an innovative framework named "Learning tO +Visual question Answering, Asking and Assessment," designed to equip MLLMs with +these additional capabilities. Our approach involves the creation of two +supplementary training tasks GenQA and EvalQA, aiming at fostering the skills +of asking and assessing questions in the context of images. To develop the +questioning ability, we compile a comprehensive set of multimodal foundational +tasks. For assessment, we introduce a new benchmark called EvalQABench, +comprising 64,000 training samples (split evenly between positive and negative +samples) and 5,000 validation and testing samples. We posit that enhancing +MLLMs with the capabilities to answer, ask, and assess questions will enhance +their multimodal comprehension, ultimately improving overall performance. To +validate this hypothesis, we train MLLMs using the LOVA3 framework and evaluate +them on a range of multimodal datasets and benchmarks. Our results demonstrate +consistent performance gains, underscoring the critical role of these +additional tasks in fostering comprehensive intelligence in MLLMs. The code is +available at https://github.com/showlab/LOVA3. + +
+
+ comment: Accepted by NeurIPS 2024. The code is available at + https://github.com/showlab/LOVA3 +
+
+
+
+
+ + ♻ ☆ Learning from Pattern Completion: Self-supervised Controllable + Generation + + +
+ The human brain exhibits a strong ability to spontaneously associate +different visual attributes of the same or similar visual scene, such as +associating sketches and graffiti with real-world visual objects, usually +without supervisory information. In contrast, in the field of artificial +intelligence, controllable generation methods like ControlNet heavily rely on +annotated training datasets such as depth maps, semantic segmentation maps, and +poses, which limits the method's scalability. Inspired by the neural mechanisms +that may contribute to the brain's associative power, specifically the cortical +modularization and hippocampal pattern completion, here we propose a +self-supervised controllable generation (SCG) framework. Firstly, we introduce +an equivariant constraint to promote inter-module independence and intra-module +correlation in a modular autoencoder network, thereby achieving functional +specialization. Subsequently, based on these specialized modules, we employ a +self-supervised pattern completion approach for controllable generation +training. Experimental results demonstrate that the proposed modular +autoencoder effectively achieves functional specialization, including the +modular processing of color, brightness, and edge detection, and exhibits +brain-like features including orientation selectivity, color antagonism, and +center-surround receptive fields. Through self-supervised training, associative +generation capabilities spontaneously emerge in SCG, demonstrating excellent +generalization ability to various tasks such as associative generation on +painting, sketches, and ancient graffiti. Compared to the previous +representative method ControlNet, our proposed approach not only demonstrates +superior robustness in more challenging high-noise scenarios but also possesses +more promising scalability potential due to its self-supervised manner. Codes +are released on GitHub and Gitee. + +
+
+
+
+
+ + ♻ ☆ An efficient dual-branch framework via implicit self-texture enhancement + for arbitrary-scale histopathology image super-resolution + + +
+ High-quality whole-slide scanning is expensive, complex, and time-consuming, +thus limiting the acquisition and utilization of high-resolution histopathology +images in daily clinical work. Deep learning-based single-image +super-resolution (SISR) techniques provide an effective way to solve this +problem. However, the existing SISR models applied in histopathology images can +only work in fixed integer scaling factors, decreasing their applicability. +Though methods based on implicit neural representation (INR) have shown +promising results in arbitrary-scale super-resolution (SR) of natural images, +applying them directly to histopathology images is inadequate because they have +unique fine-grained image textures different from natural images. Thus, we +propose an Implicit Self-Texture Enhancement-based dual-branch framework (ISTE) +for arbitrary-scale SR of histopathology images to address this challenge. The +proposed ISTE contains a feature aggregation branch and a texture learning +branch. We employ the feature aggregation branch to enhance the learning of the +local details for SR images while utilizing the texture learning branch to +enhance the learning of high-frequency texture details. Then, we design a +two-stage texture enhancement strategy to fuse the features from the two +branches to obtain the SR images. Experiments on publicly available datasets, +including TMA, HistoSR, and the TCGA lung cancer datasets, demonstrate that +ISTE outperforms existing fixed-scale and arbitrary-scale SR algorithms across +various scaling factors. Additionally, extensive experiments have shown that +the histopathology images reconstructed by the proposed ISTE are applicable to +downstream pathology image analysis tasks. + +
+
+
+
+
+ + ♻ ☆ Deep Learning for Surgical Instrument Recognition and Segmentation in + Robotic-Assisted Surgeries: A Systematic Review + + +
+ Applying deep learning (DL) for annotating surgical instruments in +robot-assisted minimally invasive surgeries (MIS) represents a significant +advancement in surgical technology. This systematic review examines 48 studies +that apply advanced DL methods and architectures. These sophisticated DL models +have shown notable improvements in the precision and efficiency of detecting +and segmenting surgical tools. The enhanced capabilities of these models +support various clinical applications, including real-time intraoperative +guidance, comprehensive postoperative evaluations, and objective assessments of +surgical skills. By accurately identifying and segmenting surgical instruments +in video data, DL models provide detailed feedback to surgeons, thereby +improving surgical outcomes and reducing complication risks. Furthermore, the +application of DL in surgical education is transformative. The review +underscores the significant impact of DL on improving the accuracy of skill +assessments and the overall quality of surgical training programs. However, +implementing DL in surgical tool detection and segmentation faces challenges, +such as the need for large, accurately annotated datasets to train these models +effectively. The manual annotation process is labor-intensive and +time-consuming, posing a significant bottleneck. Future research should focus +on automating the detection and segmentation process and enhancing the +robustness of DL models against environmental variations. Expanding the +application of DL models across various surgical specialties will be essential +to fully realize this technology's potential. Integrating DL with other +emerging technologies, such as augmented reality (AR), also offers promising +opportunities to further enhance the precision and efficacy of surgical +procedures. + +
+
+ comment: 57 pages, 9 figures, Published in Artificial Intelligence Reviews + journal +
+
+
+
+
+ + ♻ ☆ TaE: Task-aware Expandable Representation for Long Tail Class + Incremental Learning ACCV2024 + + +
+ Class-incremental learning is dedicated to the development of deep learning +models that are capable of acquiring new knowledge while retaining previously +learned information. Most methods focus on balanced data distribution for each +task, overlooking real-world long-tailed distributions. Therefore, Long-Tailed +Class-Incremental Learning has been introduced, which trains on data where head +classes have more samples than tail classes. Existing methods mainly focus on +preserving representative samples from previous classes to combat catastrophic +forgetting. Recently, dynamic network algorithms freeze old network structures +and expand new ones, achieving significant performance. However, with the +introduction of the long-tail problem, merely extending determined blocks can +lead to miscalibrated predictions, while expanding the entire backbone results +in an explosion of memory size. To address these issues, we introduce a novel +Task-aware Expandable (TaE) framework, dynamically allocating and updating +task-specific trainable parameters to learn diverse representations from each +incremental task while resisting forgetting through the majority of frozen +model parameters. To further encourage the class-specific feature +representation, we develop a Centroid-Enhanced (CEd) method to guide the update +of these task-aware parameters. This approach is designed to adaptively +allocate feature space for every class by adjusting the distance between intra- +and inter-class features, which can extend to all "training from scratch" +algorithms. Extensive experiments demonstrate that TaE achieves +state-of-the-art performance. + +
+
+ comment: Accepted to ACCV2024 +
+
+
+
+
+ + ♻ ☆ GS2Pose: Two-stage 6D Object Pose Estimation Guided by Gaussian + Splatting + + +
+ This paper proposes a new method for accurate and robust 6D pose estimation +of novel objects, named GS2Pose. By introducing 3D Gaussian splatting, GS2Pose +can utilize the reconstruction results without requiring a high-quality CAD +model, which means it only requires segmented RGBD images as input. +Specifically, GS2Pose employs a two-stage structure consisting of coarse +estimation followed by refined estimation. In the coarse stage, a lightweight +U-Net network with a polarization attention mechanism, called Pose-Net, is +designed. By using the 3DGS model for supervised training, Pose-Net can +generate NOCS images to compute a coarse pose. In the refinement stage, GS2Pose +formulates a pose regression algorithm following the idea of reprojection or +Bundle Adjustment (BA), referred to as GS-Refiner. By leveraging Lie algebra to +extend 3DGS, GS-Refiner obtains a pose-differentiable rendering pipeline that +refines the coarse pose by comparing the input images with the rendered images. +GS-Refiner also selectively updates parameters in the 3DGS model to achieve +environmental adaptation, thereby enhancing the algorithm's robustness and +flexibility to illuminative variation, occlusion, and other challenging +disruptive factors. GS2Pose was evaluated through experiments conducted on the +LineMod dataset, where it was compared with similar algorithms, yielding highly +competitive results. The code for GS2Pose will soon be released on GitHub. + +
+
+
+
+
+ + ♻ ☆ Training-free Zero-shot Composed Image Retrieval via Weighted Modality + Fusion and Similarity TAAI + + +
+ Composed image retrieval (CIR), which formulates the query as a combination +of a reference image and modified text, has emerged as a new form of image +search due to its enhanced ability to capture user intent. However, training a +CIR model in a supervised manner typically requires labor-intensive collection +of (reference image, text modifier, target image) triplets. While existing +zero-shot CIR (ZS-CIR) methods eliminate the need for training on specific +downstream datasets, they still require additional pretraining on large-scale +image datasets. In this paper, we introduce a training-free approach for +ZS-CIR. Our approach, Weighted Modality fusion and similarity for CIR +(WeiMoCIR), operates under the assumption that image and text modalities can be +effectively combined using a simple weighted average. This allows the query +representation to be constructed directly from the reference image and text +modifier. To further enhance retrieval performance, we employ multimodal large +language models (MLLMs) to generate image captions for the database images and +incorporate these textual captions into the similarity computation by combining +them with image information using a weighted average. Our approach is simple, +easy to implement, and its effectiveness is validated through experiments on +the FashionIQ and CIRR datasets. Code is available at +https://github.com/whats2000/WeiMoCIR. + +
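Because the method is training-free, its core reduces to a few vector operations. The sketch below assumes CLIP-style, L2-normalised embeddings and illustrative fusion weights alpha and beta; it is a reading of the abstract, not the authors' code.

```python
# Sketch of training-free weighted modality fusion for zero-shot CIR.
import torch
import torch.nn.functional as F

def cir_scores(ref_img_emb, mod_txt_emb, db_img_embs, db_cap_embs,
               alpha=0.5, beta=0.5):
    """All embeddings L2-normalised: (D,), (D,), (N,D), (N,D). Weights are illustrative."""
    # Query: weighted average of the reference image and the text modifier.
    query = F.normalize(alpha * ref_img_emb + (1 - alpha) * mod_txt_emb, dim=-1)
    sim_img = db_img_embs @ query                     # image-side similarity
    sim_cap = db_cap_embs @ query                     # caption-side similarity
    return beta * sim_img + (1 - beta) * sim_cap      # (N,) retrieval scores

# ranking = cir_scores(...).argsort(descending=True)
```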
+
+ comment: 14 pages, 6 figures, International Conference on Technologies and + Applications of Artificial Intelligence (TAAI) Camera Ready +
+
+
+
+
+ + ♻ ☆ Towards Calibrated Robust Fine-Tuning of Vision-Language Models NeurIPS 2024 + + +
+ Improving out-of-distribution (OOD) generalization during in-distribution +(ID) adaptation is a primary goal of robust fine-tuning of zero-shot models +beyond naive fine-tuning. However, despite decent OOD generalization +performance from recent robust fine-tuning methods, confidence calibration for +reliable model output has not been fully addressed. This work proposes a robust +fine-tuning method that improves both OOD accuracy and confidence calibration +simultaneously in vision language models. Firstly, we show that both OOD +classification and OOD calibration errors have a shared upper bound consisting +of two terms of ID data: 1) ID calibration error and 2) the smallest singular +value of the ID input covariance matrix. Based on this insight, we design a +novel framework that conducts fine-tuning with a constrained multimodal +contrastive loss enforcing a larger smallest singular value, which is further +guided by the self-distillation of a moving-averaged model to achieve +calibrated prediction as well. Starting from empirical evidence supporting our +theoretical statements, we provide extensive experimental results on ImageNet +distribution shift benchmarks that demonstrate the effectiveness of our theorem +and its practical implementation. + +
+
+ comment: NeurIPS 2024 (a short version was presented at the NeurIPS 2023 + Workshop on Distribution Shifts); Major modification of (v7): Fixing the + x-axis of Figure 3 and Pearson correlation, accordingly +
+
+
+
+
+ + ♻ ☆ LightAvatar: Efficient Head Avatar as Dynamic Neural Light Field ECCV'24 + + +
+ Recent works have shown that neural radiance fields (NeRFs) on top of +parametric models have reached SOTA quality to build photorealistic head +avatars from a monocular video. However, one major limitation of the NeRF-based +avatars is the slow rendering speed due to the dense point sampling of NeRF, +preventing them from broader utility on resource-constrained devices. We +introduce LightAvatar, the first head avatar model based on neural light fields +(NeLFs). LightAvatar renders an image from 3DMM parameters and a camera pose +via a single network forward pass, without using mesh or volume rendering. The +proposed approach, while being conceptually appealing, poses a significant +challenge towards real-time efficiency and training stability. To resolve them, +we introduce dedicated network designs to obtain proper representations for the +NeLF model and maintain a low FLOPs budget. Meanwhile, we tap into a +distillation-based training strategy that uses a pretrained avatar model as +teacher to synthesize abundant pseudo data for training. A warping field +network is introduced to correct the fitting error in the real data so that the +model can learn better. Extensive experiments suggest that our method can +achieve new SOTA image quality quantitatively or qualitatively, while being +significantly faster than the counterparts, reporting 174.1 FPS (512x512 +resolution) on a consumer-grade GPU (RTX3090) with no customized optimization. + +
+
+ comment: ECCV'24 CADL Workshop. Code: + https://github.com/MingSun-Tse/LightAvatar-TensorFlow. V2: Corrected speed + benchmark with GaussianAvatar +
+
+
+
+
+ + ♻ ☆ GeNIe: Generative Hard Negative Images Through Diffusion + + +
+ Data augmentation is crucial in training deep models, preventing them from +overfitting to limited data. Recent advances in generative AI, e.g., diffusion +models, have enabled more sophisticated augmentation techniques that produce +data resembling natural images. We introduce GeNIe, a novel augmentation method +which leverages a latent diffusion model conditioned on a text prompt to +combine two contrasting data points (an image from the source category and a +text prompt from the target category) to generate challenging augmentations. To +achieve this, we adjust the noise level (equivalently, number of diffusion +iterations) to ensure the generated image retains low-level and background +features from the source image while representing the target category, +resulting in a hard negative sample for the source category. We further +automate and enhance GeNIe by adaptively adjusting the noise level selection on +a per-image basis (coined as GeNIe-Ada), leading to further performance +improvements. Our extensive experiments, in both few-shot and long-tail +distribution settings, demonstrate the effectiveness of our novel augmentation +method and its superior performance over the prior art. Our code is available +at: https://github.com/UCDvision/GeNIe + +
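A hedged sketch of the idea using an off-the-shelf img2img diffusion pipeline: the `strength` argument plays the role of the noise level that trades off source-image preservation against target-category semantics. The model id, prompt, and strength value are assumptions for illustration only, not the GeNIe code.

```python
# Sketch: start from a source-category image, condition on a target-category
# prompt, and keep the noise level moderate so low-level/background content of
# the source survives. Assumes a CUDA GPU is available.
import torch
from diffusers import StableDiffusionImg2ImgPipeline
from PIL import Image

pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")

source = Image.open("dog.jpg").convert("RGB").resize((512, 512))  # source class: dog
hard_negative = pipe(
    prompt="a photo of a cat",   # target category
    image=source,                 # source-category image
    strength=0.5,                 # lower strength -> more of the source preserved
    guidance_scale=7.5,
).images[0]
hard_negative.save("cat_like_dog.png")  # used as a hard negative for class "dog"
```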
+
+ comment: Our code is available https://github.com/UCDvision/GeNIe +
+
+
+
+
+ + ♻ ☆ SeafloorAI: A Large-scale Vision-Language Dataset for Seafloor + Geological Survey + + +
+ A major obstacle to the advancements of machine learning models in marine +science, particularly in sonar imagery analysis, is the scarcity of AI-ready +datasets. While there have been efforts to make AI-ready sonar image datasets +publicly available, they suffer from limitations in terms of environment +setting and scale. To bridge this gap, we introduce SeafloorAI, the first +extensive AI-ready dataset for seafloor mapping across 5 geological layers, +curated in collaboration with marine scientists. We further extend the +dataset to SeafloorGenAI by incorporating the language component in order to +facilitate the development of both vision- and language-capable machine +learning models for sonar imagery. The dataset consists of 62 geo-distributed +data surveys spanning 17,300 square kilometers, with 696K sonar images, 827K +annotated segmentation masks, 696K detailed language descriptions and +approximately 7M question-answer pairs. By making our data processing source +code publicly available, we aim to engage the marine science community to +enrich the data pool and inspire the machine learning community to develop more +robust models. This collaborative approach will enhance the capabilities and +applications of our datasets within both fields. + +
+
+
+
+
+ + ♻ ☆ Typicalness-Aware Learning for Failure Detection NeurIPS 2024 + + +
+ Deep neural networks (DNNs) often suffer from the overconfidence issue, where +incorrect predictions are made with high confidence scores, hindering the +applications in critical systems. In this paper, we propose a novel approach +called Typicalness-Aware Learning (TAL) to address this issue and improve +failure detection performance. We observe that, with the cross-entropy loss, +model predictions are optimized to align with the corresponding labels via +increasing logit magnitude or refining logit direction. However, regarding +atypical samples, the image content and their labels may exhibit disparities. +This discrepancy can lead to overfitting on atypical samples, ultimately +resulting in the overconfidence issue that we aim to address. To tackle the +problem, we have devised a metric that quantifies the typicalness of each +sample, enabling the dynamic adjustment of the logit magnitude during the +training process. By allowing atypical samples to be adequately fitted while +preserving reliable logit direction, the problem of overconfidence can be +mitigated. TAL has been extensively evaluated on benchmark datasets, and the +results demonstrate its superiority over existing failure detection methods. +Specifically, TAL achieves a more than 5% improvement on CIFAR100 in terms of +the Area Under the Risk-Coverage Curve (AURC) compared to the state-of-the-art. +Code is available at https://github.com/liuyijungoon/TAL. + +
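One way to read the dynamic logit-magnitude adjustment is sketched below: the logit direction is kept while the magnitude is scaled by a per-sample factor that grows with estimated typicalness. The typicalness metric itself and the scaling range are abstracted away as assumptions.

```python
# Sketch of typicalness-aware logit rescaling: typical samples may use a large
# logit magnitude (confident), atypical ones are constrained to a small one.
import torch
import torch.nn.functional as F

def tal_logits(logits, typicalness, tau_min=1.0, tau_max=10.0):
    """logits: (B, C); typicalness: (B,) in [0, 1], higher = more typical."""
    direction = F.normalize(logits, dim=-1)                   # keep the logit direction
    magnitude = tau_min + typicalness * (tau_max - tau_min)    # per-sample scale
    return direction * magnitude.unsqueeze(1)

# loss = F.cross_entropy(tal_logits(model(x), estimate_typicalness(x)), y)
```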
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Undermining Image and Text Classification Algorithms Using Adversarial + Attacks + + +
+ Machine learning models are prone to adversarial attacks, where inputs can be +manipulated in order to cause misclassifications. While previous research has +focused on techniques like Generative Adversarial Networks (GANs), there's +limited exploration of GANs and Synthetic Minority Oversampling Technique +(SMOTE) in text and image classification models to perform adversarial attacks. +Our study addresses this gap by training various machine learning models and +using GANs and SMOTE to generate additional data points aimed at attacking text +classification models. Furthermore, we extend our investigation to face +recognition models, training a Convolutional Neural Network (CNN) and subjecting +it to adversarial attacks with fast gradient sign perturbations on key features +identified by GradCAM, a technique used to highlight key image characteristics +CNNs use in classification. Our experiments reveal a significant vulnerability +in classification models. Specifically, we observe a 20% decrease in accuracy +for the top-performing text classification models post-attack, along with a 30% +decrease in facial recognition accuracy. This highlights the susceptibility +of these models to manipulation of input data. Adversarial attacks not only +compromise the security but also undermine the reliability of machine learning +systems. By showcasing the impact of adversarial attacks on both text +classification and face recognition models, our study underscores the urgent +need to develop robust defenses against such vulnerabilities. + +
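A minimal sketch of the image attack described above, combining a fast gradient sign perturbation with a Grad-CAM-derived mask so that only decision-relevant pixels are perturbed; `grad_cam_mask` is a placeholder for any CAM implementation and the epsilon budget is illustrative.

```python
# Sketch of a masked FGSM attack: perturb only the pixels a Grad-CAM map marks
# as important for the CNN's decision.
import torch
import torch.nn.functional as F

def masked_fgsm(model, images, labels, grad_cam_mask, eps=8 / 255):
    """images: (B,3,H,W) in [0,1]; grad_cam_mask: (B,1,H,W) saliency in [0,1]."""
    images = images.clone().requires_grad_(True)
    loss = F.cross_entropy(model(images), labels)
    loss.backward()
    perturbation = eps * images.grad.sign() * grad_cam_mask   # attack salient pixels only
    return (images.detach() + perturbation).clamp(0, 1)
```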
+
+ comment: Accepted for presentation at Electronic Imaging Conference 2025 +
+
+
+
+
+ + ♻ ☆ Aligning Text-to-Image Diffusion Models with Reward Backpropagation + + +
+ Text-to-image diffusion models have recently emerged at the forefront of +image generation, powered by very large-scale unsupervised or weakly supervised +text-to-image training datasets. Due to their unsupervised training, +controlling their behavior in downstream tasks, such as maximizing +human-perceived image quality, image-text alignment, or ethical image +generation, is difficult. Recent works finetune diffusion models to downstream +reward functions using vanilla reinforcement learning, notorious for the high +variance of the gradient estimators. In this paper, we propose AlignProp, a +method that aligns diffusion models to downstream reward functions using +end-to-end backpropagation of the reward gradient through the denoising +process. While naive implementation of such backpropagation would require +prohibitive memory resources for storing the partial derivatives of modern +text-to-image models, AlignProp finetunes low-rank adapter weight modules and +uses gradient checkpointing, to render its memory usage viable. We test +AlignProp in finetuning diffusion models to various objectives, such as +image-text semantic alignment, aesthetics, compressibility and controllability +of the number of objects present, as well as their combinations. We show +AlignProp achieves higher rewards in fewer training steps than alternatives, +while being conceptually simpler, making it a straightforward choice for +optimizing diffusion models for differentiable reward functions of interest. +Code and Visualization results are available at https://align-prop.github.io/. + +
+
+ comment: This paper is subsumed by a later paper of ours: arXiv:2407.08737 +
+
+
+
+
+ + ♻ ☆ Beyond Accuracy: Ensuring Correct Predictions With Correct Rationales NeurIPS 2024 + + +
+ Large pretrained foundation models demonstrate exceptional performance and, +in some high-stakes applications, even surpass human experts. However, most of +these models are currently evaluated primarily on prediction accuracy, +overlooking the validity of the rationales behind their accurate predictions. +For the safe deployment of foundation models, there is a pressing need to +ensure double-correct predictions, i.e., correct prediction backed by correct +rationales. To achieve this, we propose a two-phase scheme: First, we curate a +new dataset that offers structured rationales for visual recognition tasks. +Second, we propose a rationale-informed optimization method to guide the model +in disentangling and localizing visual evidence for each rationale, without +requiring manual annotations. Extensive experiments and ablation studies +demonstrate that our model outperforms state-of-the-art models by up to 10.1% +in prediction accuracy across a wide range of tasks. Furthermore, our method +significantly improves the model's rationale correctness, improving +localization by 7.5% and disentanglement by 36.5%. Our dataset, source code, +and pretrained weights: https://github.com/deep-real/DCP + +
+
+ comment: In Proceedings of the 38th Conference on Neural Information + Processing Systems (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ bit2bit: 1-bit quanta video reconstruction via self-supervised photon + prediction NeurIPS 2024 + + +
+ Quanta image sensors, such as SPAD arrays, are an emerging sensor technology, +producing 1-bit arrays representing photon detection events over exposures as +short as a few nanoseconds. In practice, raw data are post-processed using +heavy spatiotemporal binning to create more useful and interpretable images at +the cost of degrading spatiotemporal resolution. In this work, we propose +bit2bit, a new method for reconstructing high-quality image stacks at the +original spatiotemporal resolution from sparse binary quanta image data. +Inspired by recent work on Poisson denoising, we developed an algorithm that +creates a dense image sequence from sparse binary photon data by predicting the +photon arrival location probability distribution. However, due to the binary +nature of the data, we show that the assumption of a Poisson distribution is +inadequate. Instead, we model the process with a Bernoulli lattice process from +the truncated Poisson. This leads to the proposal of a novel self-supervised +solution based on a masked loss function. We evaluate our method using both +simulated and real data. On simulated data from a conventional video, we +achieve 34.35 mean PSNR with extremely photon-sparse binary input (<0.06 +photons per pixel per frame). We also present a novel dataset containing a wide +range of real SPAD high-speed videos under various challenging imaging +conditions. The scenes cover strong/weak ambient light, strong motion, +ultra-fast events, etc., which will be made available to the community, on +which we demonstrate the promise of our approach. Both reconstruction quality +and throughput substantially surpass the state-of-the-art methods (e.g., Quanta +Burst Photography (QBP)). Our approach significantly enhances the visualization +and usability of the data, enabling the application of existing analysis +techniques. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Revisiting Surgical Instrument Segmentation Without Human Intervention: + A Graph Partitioning View + + +
+ Surgical instrument segmentation (SIS) on endoscopic images stands as a +long-standing and essential task in the context of computer-assisted +interventions for boosting minimally invasive surgery. Given the recent surge +of deep learning methodologies and their data-hungry nature, training a neural +predictive model based on massive expert-curated annotations has been +dominating and served as an off-the-shelf approach in the field, which could, +however, impose a prohibitive burden on clinicians for preparing fine-grained +pixel-wise labels corresponding to the collected surgical video frames. In this +work, we propose an unsupervised method by reframing the video frame +segmentation as a graph partitioning problem and regarding image pixels as +graph nodes, which is significantly different from the previous efforts. A +self-supervised pre-trained model is firstly leveraged as a feature extractor +to capture high-level semantic features. Then, Laplacian matrices are computed +from the features and are eigendecomposed for graph partitioning. On the "deep" +eigenvectors, a surgical video frame is meaningfully segmented into different +modules such as tools and tissues, providing distinguishable semantic +information like locations, classes, and relations. The segmentation problem +can then be naturally tackled by applying clustering or thresholding on the +eigenvectors. Extensive experiments are conducted on various datasets (e.g., +EndoVis2017, EndoVis2018, UCL, etc.) for different clinical endpoints. Across +all the challenging scenarios, our method demonstrates outstanding performance +and robustness higher than unsupervised state-of-the-art (SOTA) methods. The +code is released at https://github.com/MingyuShengSMY/GraphClusteringSIS.git. + +
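The graph-partitioning view can be prototyped in a few lines, assuming patch features from some self-supervised backbone are already available; the affinity construction and clustering choices below are illustrative rather than the released pipeline.

```python
# Sketch of spectral graph partitioning on deep features: nodes are patch
# features, edges are cosine affinities, and clustering the Laplacian
# eigenvectors yields a coarse instrument/tissue segmentation.
import numpy as np
from scipy.linalg import eigh
from sklearn.cluster import KMeans

def spectral_segment(patch_feats, h, w, n_segments=3):
    """patch_feats: (h*w, D) L2-normalised features for an h x w patch grid."""
    affinity = np.clip(patch_feats @ patch_feats.T, 0, None)   # cosine affinities
    degree = np.diag(affinity.sum(axis=1))
    laplacian = degree - affinity
    # The smallest non-trivial eigenvectors carry the coarse scene partition.
    _, vecs = eigh(laplacian, subset_by_index=[0, n_segments])
    labels = KMeans(n_clusters=n_segments, n_init=10).fit_predict(vecs[:, 1:])
    return labels.reshape(h, w)
```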
+
+ comment: Accepted by The 32nd ACM International Conference on Multimedia (ACM + MM 2024) Workshop on Multimedia Computing for Health and Medicine (MCHM) +
+
+
+
+
+ + ♻ ☆ AMNCutter: Affinity-Attention-Guided Multi-View Normalized Cutter for + Unsupervised Surgical Instrument Segmentation WACV 2025 + + +
+ Surgical instrument segmentation (SIS) is pivotal for robotic-assisted +minimally invasive surgery, assisting surgeons by identifying surgical +instruments in endoscopic video frames. Recent unsupervised surgical instrument +segmentation (USIS) methods primarily rely on pseudo-labels derived from +low-level features such as color and optical flow, but these methods show +limited effectiveness and generalizability in complex and unseen endoscopic +scenarios. In this work, we propose a label-free unsupervised model featuring a +novel module named Multi-View Normalized Cutter (m-NCutter). Different from +previous USIS works, our model is trained using a graph-cutting loss function +that leverages patch affinities for supervision, eliminating the need for +pseudo-labels. The framework adaptively determines which affinities from which +levels should be prioritized. Therefore, the low- and high-level features and +their affinities are effectively integrated to train a label-free unsupervised +model, showing superior effectiveness and generalization ability. We conduct +comprehensive experiments across multiple SIS datasets to validate our +approach's state-of-the-art (SOTA) performance, robustness, and exceptional +potential as a pre-trained model. Our code is released at +https://github.com/MingyuShengSMY/AMNCutter. + +
+
+ comment: Accepted by the 2025 IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV 2025) +
+
+
+
+
+ + ♻ ☆ Controllable Talking Face Generation by Implicit Facial Keypoints + Editing + + +
+ Audio-driven talking face generation has garnered significant interest within +the domain of digital human research. Existing methods are encumbered by +intricate model architectures whose components are heavily dependent on each other, +complicating the process of re-editing image or video inputs. In this work, we +present ControlTalk, a talking face generation method to control face +expression deformation based on driven audio, which can construct the head pose +and facial expression including lip motion for both single-image and sequential +video inputs in a unified manner. By utilizing a pre-trained video synthesis +renderer and proposing the lightweight adaptation, ControlTalk achieves precise +and naturalistic lip synchronization while enabling quantitative control over +mouth opening shape. Our experiments show that our method surpasses +state-of-the-art performance on widely used benchmarks, including HDTF and +MEAD. The parameterized adaptation demonstrates remarkable generalization +capabilities, effectively handling expression deformation across same-ID and +cross-ID scenarios, and extending its utility to out-of-domain portraits, +regardless of languages. Code is available at +https://github.com/NetEase-Media/ControlTalk. + +
+
+
+
+
+ + ♻ ☆ Principled Probabilistic Imaging using Diffusion Models as Plug-and-Play + Priors NeurIPS 2024 + + +
+ Diffusion models (DMs) have recently shown outstanding capabilities in +modeling complex image distributions, making them expressive image priors for +solving Bayesian inverse problems. However, most existing DM-based methods rely +on approximations in the generative process to be generic to different inverse +problems, leading to inaccurate sample distributions that deviate from the +target posterior defined within the Bayesian framework. To harness the +generative power of DMs while avoiding such approximations, we propose a Markov +chain Monte Carlo algorithm that performs posterior sampling for general +inverse problems by reducing it to sampling the posterior of a Gaussian +denoising problem. Crucially, we leverage a general DM formulation as a unified +interface that allows for rigorously solving the denoising problem with a range +of state-of-the-art DMs. We demonstrate the effectiveness of the proposed +method on six inverse problems (three linear and three nonlinear), including a +real-world black hole imaging problem. Experimental results indicate that our +proposed method offers more accurate reconstructions and posterior estimation +compared to existing DM-based imaging inverse methods. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ TLCM: Training-efficient Latent Consistency Model for Image Generation + with 2-8 Steps + + +
+ Distilling latent diffusion models (LDMs) into ones that are fast to sample +from is attracting growing research interest. However, the majority of existing +methods face two critical challenges: (1) They hinge on long training using a +huge volume of real data. (2) They routinely lead to quality degradation for +generation, especially in text-image alignment. This paper proposes a novel +training-efficient Latent Consistency Model (TLCM) to overcome these +challenges. Our method first accelerates LDMs via data-free multistep latent +consistency distillation (MLCD), and then data-free latent consistency +distillation is proposed to efficiently guarantee the inter-segment consistency +in MLCD. Furthermore, we introduce bags of techniques, e.g., distribution +matching, adversarial learning, and preference learning, to enhance TLCM's +performance at few-step inference without any real data. TLCM demonstrates a +high level of flexibility by enabling adjustment of sampling steps within the +range of 2 to 8 while still producing competitive outputs compared to full-step +approaches. Notably, TLCM enjoys the data-free merit by employing synthetic +data from the teacher for distillation. With just 70 training hours on an A100 +GPU, a 3-step TLCM distilled from SDXL achieves an impressive CLIP Score of +33.68 and an Aesthetic Score of 5.97 on the MSCOCO-2017 5K benchmark, +surpassing various accelerated models and even outperforming the teacher model +in human preference metrics. We also demonstrate the versatility of TLCMs in +applications including image style transfer, controllable generation, and +Chinese-to-image generation. + +
+
+
+
+
+ + ♻ ☆ Variational Zero-shot Multispectral Pansharpening + + +
+ Pansharpening aims to generate a high spatial resolution multispectral image +(HRMS) by fusing a low spatial resolution multispectral image (LRMS) and a +panchromatic image (PAN). The most challenging issue for this task is that only +the to-be-fused LRMS and PAN are available, and the existing deep +learning-based methods are unsuitable since they rely on many training pairs. +Traditional variational optimization (VO) based methods are well-suited for +addressing such a problem. They focus on carefully designing explicit fusion +rules as well as regularizations for an optimization problem, which are based +on the researcher's discovery of the image relationships and image structures. +Unlike previous VO-based methods, in this work, we explore such complex +relationships by a parameterized term rather than a manually designed one. +Specifically, we propose a zero-shot pansharpening method by introducing a +neural network into the optimization objective. This network estimates a +representation component of HRMS, which mainly describes the relationship +between HRMS and PAN. In this way, the network achieves a similar goal to the +so-called deep image prior because it implicitly regulates the relationship +between the HRMS and PAN images through its inherent structure. We directly +minimize this optimization objective via network parameters and the expected +HRMS image through iterative updating. Extensive experiments on various +benchmark datasets demonstrate that our proposed method can achieve better +performance compared with other state-of-the-art methods. The codes are +available at https://github.com/xyrui/PSDip. + +
+
+
+
+
+ + ♻ ☆ SS3DM: Benchmarking Street-View Surface Reconstruction with a Synthetic + 3D Mesh Dataset NeurIPS 2024 + + +
+ Reconstructing accurate 3D surfaces for street-view scenarios is crucial for +applications such as digital entertainment and autonomous driving simulation. +However, existing street-view datasets, including KITTI, Waymo, and nuScenes, +only offer noisy LiDAR points as ground-truth data for geometric evaluation of +reconstructed surfaces. These geometric ground-truths often lack the necessary +precision to evaluate surface positions and do not provide data for assessing +surface normals. To overcome these challenges, we introduce the SS3DM dataset, +comprising precise \textbf{S}ynthetic \textbf{S}treet-view \textbf{3D} +\textbf{M}esh models exported from the CARLA simulator. These mesh models +facilitate accurate position evaluation and include normal vectors for +evaluating surface normal. To simulate the input data in realistic driving +scenarios for 3D reconstruction, we virtually drive a vehicle equipped with six +RGB cameras and five LiDAR sensors in diverse outdoor scenes. Leveraging this +dataset, we establish a benchmark for state-of-the-art surface reconstruction +methods, providing a comprehensive evaluation of the associated challenges. + For more information, visit our homepage at https://ss3dm.top. + +
+
+ comment: NeurIPS 2024, Track on Datasets and Benchmarks +
+
+
+
+
+
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Orbit: A Framework for Designing and Evaluating Multi-objective Rankers + + +
+ Machine learning in production needs to balance multiple objectives: This is +particularly evident in ranking or recommendation models, where conflicting +objectives such as user engagement, satisfaction, diversity, and novelty must +be considered at the same time. However, designing multi-objective rankers is +inherently a dynamic wicked problem -- there is no single optimal solution, and +the needs evolve over time. Effective design requires collaboration between +cross-functional teams and careful analysis of a wide range of information. In +this work, we introduce Orbit, a conceptual framework for Objective-centric +Ranker Building and Iteration. The framework places objectives at the center of +the design process, to serve as boundary objects for communication and guide +practitioners for design and evaluation. We implement Orbit as an interactive +system, which enables stakeholders to interact with objective spaces directly +and supports real-time exploration and evaluation of design trade-offs. We +evaluate Orbit through a user study involving twelve industry practitioners, +showing that it supports efficient design space exploration, leads to more +informed decision-making, and enhances awareness of the inherent trade-offs of +multiple objectives. Orbit (1) opens up new opportunities of an +objective-centric design process for any multi-objective ML models, as well as +(2) sheds light on future designs that push practitioners to go beyond a narrow +metric-centric or example-centric mindset. + +
+
+
+
+
+ + ☆ Lightning IR: Straightforward Fine-tuning and Inference of + Transformer-based Language Models for Information Retrieval WSDM'25 + + +
+ A wide range of transformer-based language models have been proposed for +information retrieval tasks. However, fine-tuning and inference of these models +is often complex and requires substantial engineering effort. This paper +introduces Lightning IR, a PyTorch Lightning-based framework for fine-tuning +and inference of transformer-based language models for information retrieval. +Lightning IR provides a modular and extensible architecture that supports all +stages of an information retrieval pipeline: from fine-tuning and indexing to +searching and re-ranking. It is designed to be straightforward to use, +scalable, and reproducible. Lightning IR is available as open-source: +https://github.com/webis-de/lightning-ir. + +
+
+ comment: Accepted as a demo at WSDM'25 +
+
+
+
+
+ + ☆ Self-Calibrated Listwise Reranking with Large Language Models + + +
+ Large language models (LLMs), with advanced linguistic capabilities, have +been employed in reranking tasks through a sequence-to-sequence approach. In +this paradigm, multiple passages are reranked in a listwise manner and a +textual reranked permutation is generated. However, due to the limited context +window of LLMs, this reranking paradigm requires a sliding window strategy to +iteratively handle larger candidate sets. This not only increases computational +costs but also restricts the LLM from fully capturing all the comparison +information for all candidates. To address these challenges, we propose a novel +self-calibrated listwise reranking method, which aims to leverage LLMs to +produce global relevance scores for ranking. To achieve it, we first propose +the relevance-aware listwise reranking framework, which incorporates explicit +list-view relevance scores to improve reranking efficiency and enable global +comparison across the entire candidate set. Second, to ensure the comparability +of the computed scores, we propose self-calibrated training that uses +point-view relevance assessments generated internally by the LLM itself to +calibrate the list-view relevance assessments. Extensive experiments and +comprehensive analysis on the BEIR benchmark and TREC Deep Learning Tracks +demonstrate the effectiveness and efficiency of our proposed method. + +
+
+
+
+
+ + ☆ Best Practices for Distilling Large Language Models into BERT for Web + Search Ranking + + +
+ Recent studies have highlighted the significant potential of Large Language +Models (LLMs) as zero-shot relevance rankers. These methods predominantly +utilize prompt learning to assess the relevance between queries and documents +by generating a ranked list of potential documents. Despite their promise, the +substantial costs associated with LLMs pose a significant challenge for their +direct implementation in commercial search systems. To overcome this barrier +and fully exploit the capabilities of LLMs for text ranking, we explore +techniques to transfer the ranking expertise of LLMs to a more compact model +similar to BERT, using a ranking loss to enable the deployment of less +resource-intensive models. Specifically, we enhance the training of LLMs +through Continued Pre-Training, taking the query as input and the clicked title +and summary as output. We then proceed with supervised fine-tuning of the LLM +using a rank loss, assigning the final token as a representative of the entire +sentence. Given the inherent characteristics of autoregressive language models, +only the final token can encapsulate all preceding tokens. Additionally, +we introduce a hybrid point-wise and margin MSE loss to transfer the ranking +knowledge from LLMs to smaller models like BERT. This method creates a viable +solution for environments with strict resource constraints. Both offline and +online evaluations have confirmed the efficacy of our approach, and our model +has been successfully integrated into a commercial web search engine as of +February 2024. + +
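A sketch of the hybrid point-wise and margin-MSE distillation objective mentioned above, written over teacher (LLM) and student (BERT-style) relevance scores; the mixing weight is an assumption, not a value from the paper.

```python
# Sketch of point-wise + margin-MSE ranking distillation: the student matches
# both the teacher's absolute scores and its positive-negative margins.
import torch
import torch.nn.functional as F

def distill_loss(student_pos, student_neg, teacher_pos, teacher_neg, lam=0.5):
    """Each tensor: (B,) relevance scores for (query, doc) pairs."""
    point_wise = F.mse_loss(student_pos, teacher_pos) + \
                 F.mse_loss(student_neg, teacher_neg)
    margin_mse = F.mse_loss(student_pos - student_neg,
                            teacher_pos - teacher_neg)
    return lam * point_wise + (1 - lam) * margin_mse
```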
+
+ comment: Arxiv Version +
+
+
+
+
+ + ☆ Towards Competitive Search Relevance For Inference-Free Learned Sparse + Retrievers + + +
+ Learned sparse retrieval, which can efficiently perform retrieval through +mature inverted-index engines, has garnered growing attention in recent years. +Particularly, the inference-free sparse retrievers are attractive as they +eliminate online model inference in the retrieval phase, thereby avoiding huge +computational cost, offering reasonable throughput and latency. However, even +the state-of-the-art (SOTA) inference-free sparse models lag far behind in +terms of search relevance when compared to both sparse and dense siamese +models. Towards competitive search relevance for inference-free sparse +retrievers, we argue that they deserve dedicated training methods other than +using the same ones as siamese encoders. In this paper, we propose two different +approaches for performance improvement. First, we introduce the IDF-aware FLOPS +loss, which introduces Inverted Document Frequency (IDF) to the sparsification +of representations. We find that it mitigates the negative impact of the FLOPS +regularization on search relevance, allowing the model to achieve a better +balance between accuracy and efficiency. Moreover, we propose a heterogeneous +ensemble knowledge distillation framework that combines siamese dense and +sparse retrievers to generate supervisory signals during the pre-training +phase. The ensemble framework of dense and sparse retrievers capitalizes on +their respective strengths, providing a strong upper bound for knowledge +distillation. To reconcile the diverse feedback from heterogeneous supervisors, we +normalize and then aggregate the outputs of the teacher models to eliminate +score scale differences. On the BEIR benchmark, our model outperforms the existing +SOTA inference-free sparse model by \textbf{3.3 NDCG@10 score}. It exhibits +search relevance comparable to siamese sparse retrievers and client-side +latency only \textbf{1.1x that of BM25}. + +
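A possible form of an IDF-aware FLOPS regulariser is sketched below: the usual FLOPS penalty on mean term activations is reweighted so that frequent (low-IDF) terms are penalised more than rare, informative ones. The exact weighting used in the paper may differ.

```python
# Sketch of an IDF-aware FLOPS regulariser for learned sparse representations.
import torch

def idf_aware_flops(doc_reps: torch.Tensor, idf: torch.Tensor) -> torch.Tensor:
    """doc_reps: (B, V) non-negative sparse term weights; idf: (V,) IDF values."""
    mean_activation = doc_reps.mean(dim=0)        # average weight per vocabulary term
    weights = 1.0 / (idf + 1e-6)                  # frequent terms penalised more heavily
    return (weights * mean_activation.pow(2)).sum()
```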
+
+
+
+
+ + ☆ The Concatenator: A Bayesian Approach To Real Time Concatenative + Musaicing + + +
+ We present ``The Concatenator,'' a real time system for audio-guided +concatenative synthesis. Similarly to Driedger et al.'s ``musaicing'' (or +``audio mosaicing'') technique, we concatenate a set number of windows within a +corpus of audio to re-create the harmonic and percussive aspects of a target +audio stream. Unlike Driedger's NMF-based technique, however, we instead use an +explicitly Bayesian point of view, where corpus window indices are hidden +states and the target audio stream is an observation. We use a particle filter +to infer the best hidden corpus states in real-time. Our transition model +includes a tunable parameter to control the time-continuity of corpus grains, +and our observation model allows users to prioritize how quickly windows change +to match the target. Because the computational complexity of the system is +independent of the corpus size, our system scales to corpora that are hours +long, which is an important feature in the age of vast audio data collections. +Within The Concatenator module itself, composers can vary grain length, fit to +target, and pitch shift in real time while reacting to the sounds they hear, +enabling them to rapidly iterate ideas. To conclude our work, we evaluate our +system with extensive quantitative tests of the effects of parameters, as well +as a qualitative evaluation with artistic insights. Based on the quality of the +results, we believe the real-time capability unlocks new avenues for musical +expression and control, suitable for live performance and modular synthesis +integration, which furthermore represents an essential breakthrough in +concatenative synthesis technology. + +
+
+ comment: 12 pages, 6 figures, Accepted for Publication in The International + Society for Music Information Retrieval Proceedings, 2024 +
+
+
+
+
+ + ♻ ☆ On Softmax Direct Preference Optimization for Recommendation NeurIPS 2024 + + +
Recommender systems aim to predict personalized rankings based on user preference data. With the rise of Language Models (LMs), LM-based recommenders have been widely explored due to their extensive world knowledge and powerful reasoning abilities. Most LM-based recommenders convert historical interactions into language prompts, pairing them with a positive item as the target response and fine-tuning the LM with a language modeling loss. However, this objective fails to fully leverage preference data and is not optimized for personalized ranking tasks, which hinders the performance of LM-based recommenders. Inspired by the recent advancement of Direct Preference Optimization (DPO) in human preference alignment and the success of softmax loss in recommendations, we propose Softmax-DPO (S-DPO) to instill ranking information into the LM and help LM-based recommenders distinguish preferred items from negatives, rather than solely focusing on positives. Specifically, we incorporate multiple negatives in user preference data and devise an alternative version of the DPO loss tailored for LM-based recommenders, which is extended from the traditional full-ranking Plackett-Luce (PL) model to partial rankings and connected to softmax sampling strategies. Theoretically, we bridge S-DPO with the softmax loss over negative sampling and find that it has an inherent benefit of mining hard negatives, which assures its exceptional capabilities in recommendation tasks. Empirically, extensive experiments conducted on three real-world datasets demonstrate the superiority of S-DPO in effectively modeling user preference and further boosting recommendation performance, while providing better rewards for preferred items. Our codes are available at https://github.com/chenyuxin1999/S-DPO.
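A schematic PyTorch rendering of a softmax-style DPO loss with multiple negatives, in the spirit of the description above: the preferred item's implicit reward is contrasted against a log-sum-exp pool of negative rewards. Tensor names, shapes, and the value of beta are assumptions; the paper's exact loss may differ.

```python
import torch
import torch.nn.functional as F

def softmax_dpo_loss(pos_logp, pos_logp_ref, neg_logp, neg_logp_ref, beta=0.1):
    """Softmax-style DPO loss with multiple negatives (schematic).

    pos_logp, pos_logp_ref : (B,)   log-probs of the preferred item under the
                                    policy and the frozen reference model
    neg_logp, neg_logp_ref : (B, K) log-probs of K sampled negative items
    """
    pos_reward = beta * (pos_logp - pos_logp_ref)              # (B,)
    neg_reward = beta * (neg_logp - neg_logp_ref)              # (B, K)
    # Aggregate negatives with a log-sum-exp, then prefer the positive over the pool.
    margin = pos_reward - torch.logsumexp(neg_reward, dim=-1)  # (B,)
    return -F.logsigmoid(margin).mean()

# Toy usage with random log-probabilities: batch of 4 users, 8 negatives each.
pos, pos_ref = torch.randn(4), torch.randn(4)
neg, neg_ref = torch.randn(4, 8), torch.randn(4, 8)
loss = softmax_dpo_loss(pos, pos_ref, neg, neg_ref)
```

The log-sum-exp over negatives is what makes the largest (hardest) negative rewards dominate the pool, which is the hard-negative-mining effect the abstract alludes to.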
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Talking the Talk Does Not Entail Walking the Walk: On the Limits of + Large Language Models in Lexical Entailment Recognition EMNLP-2024 + + +
Verbs form the backbone of language, providing the structure and meaning to sentences. Yet, their intricate semantic nuances pose a longstanding challenge. Understanding verb relations through the concept of lexical entailment is crucial for comprehending sentence meanings and grasping verb dynamics. This work investigates the capabilities of eight Large Language Models in recognizing lexical entailment relations among verbs through differently devised prompting strategies and zero-/few-shot settings over verb pairs from two lexical databases, namely WordNet and HyperLex. Our findings unveil that the models can tackle the lexical entailment recognition task with moderately good performance, although at varying degrees of effectiveness and under different conditions. Also, utilizing few-shot prompting can enhance the models' performance. However, perfectly solving the task remains an unmet challenge for all examined LLMs, which calls for further research on this topic.
+
+ comment: Accepted for publication at The 2024 Conference on Empirical Methods + in Natural Language Processing (EMNLP-2024) - Findings +
+
+
+
+
+ + ♻ ☆ ChartifyText: Automated Chart Generation from Data-Involved Texts via + LLM + + +
Text documents with numerical values involved are widely used in various applications such as scientific research, economics, public health and journalism. However, it is difficult for readers to quickly interpret such data-involved texts and gain deep insights. To fill this research gap, this work aims to automatically generate charts to accurately convey the underlying data and ideas to readers, which is an inherently challenging task. The challenges originate from text ambiguities, the intrinsic sparsity and uncertainty of data in text documents, and subjective sentiment differences. Specifically, we propose ChartifyText, a novel fully-automated approach that leverages Large Language Models (LLMs) to convert complex data-involved texts to expressive charts. It consists of two major modules: tabular data inference and expressive chart generation. The tabular data inference module employs systematic prompt engineering to guide the LLM (e.g., GPT-4) to infer table data, where data ranges, uncertainties, missing data values and corresponding subjective sentiments are explicitly considered. The expressive chart generation module augments standard charts with intuitive visual encodings and concise texts to accurately convey the underlying data and insights. We extensively evaluate the effectiveness of ChartifyText on real-world data-involved text documents through case studies, in-depth interviews with three visualization experts, and a carefully-designed user study with 15 participants. The results demonstrate the usefulness and effectiveness of ChartifyText in helping readers efficiently and effectively make sense of data-involved texts.
+
+
+
+
+ + ♻ ☆ LightRAG: Simple and Fast Retrieval-Augmented Generation + + +
Retrieval-Augmented Generation (RAG) systems enhance large language models (LLMs) by integrating external knowledge sources, enabling more accurate and contextually relevant responses tailored to user needs. However, existing RAG systems have significant limitations, including reliance on flat data representations and inadequate contextual awareness, which can lead to fragmented answers that fail to capture complex inter-dependencies. To address these challenges, we propose LightRAG, which incorporates graph structures into text indexing and retrieval processes. This innovative framework employs a dual-level retrieval system that enables comprehensive information retrieval through both low-level and high-level knowledge discovery. Additionally, the integration of graph structures with vector representations facilitates efficient retrieval of related entities and their relationships, significantly improving response times while maintaining contextual relevance. This capability is further enhanced by an incremental update algorithm that ensures the timely integration of new data, allowing the system to remain effective and responsive in rapidly changing data environments. Extensive experimental validation demonstrates considerable improvements in retrieval accuracy and efficiency compared to existing approaches. We have made LightRAG open-source and available at the link: https://github.com/HKUDS/LightRAG.
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ 
 ☆ SVDQuant: Absorbing Outliers by Low-Rank Components for 4-Bit Diffusion
  Models
 
 
+ Diffusion models have been proven highly effective at generating high-quality +images. However, as these models grow larger, they require significantly more +memory and suffer from higher latency, posing substantial challenges for +deployment. In this work, we aim to accelerate diffusion models by quantizing +their weights and activations to 4 bits. At such an aggressive level, both +weights and activations are highly sensitive, where conventional post-training +quantization methods for large language models like smoothing become +insufficient. To overcome this limitation, we propose SVDQuant, a new 4-bit +quantization paradigm. Different from smoothing which redistributes outliers +between weights and activations, our approach absorbs these outliers using a +low-rank branch. We first consolidate the outliers by shifting them from +activations to weights, then employ a high-precision low-rank branch to take in +the weight outliers with Singular Value Decomposition (SVD). This process eases +the quantization on both sides. However, na\"{\i}vely running the low-rank +branch independently incurs significant overhead due to extra data movement of +activations, negating the quantization speedup. To address this, we co-design +an inference engine Nunchaku that fuses the kernels of the low-rank branch into +those of the low-bit branch to cut off redundant memory access. It can also +seamlessly support off-the-shelf low-rank adapters (LoRAs) without the need for +re-quantization. Extensive experiments on SDXL, PixArt-$\Sigma$, and FLUX.1 +validate the effectiveness of SVDQuant in preserving image quality. We reduce +the memory usage for the 12B FLUX.1 models by 3.5$\times$, achieving +3.0$\times$ speedup over the 4-bit weight-only quantized baseline on the 16GB +laptop 4090 GPU, paving the way for more interactive applications on PCs. Our +quantization library and inference engine are open-sourced. + +
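The core decomposition can be mimicked in a few lines of NumPy: keep a truncated-SVD low-rank branch in high precision and quantize only the residual to 4 bits. The smoothing/shift step, the rank, and the per-tensor symmetric quantizer here are simplified assumptions and stand in for the paper's actual kernels.

```python
import numpy as np

def quantize_sym(x, bits=4):
    """Naive symmetric uniform quantizer with a single per-tensor scale."""
    qmax = 2 ** (bits - 1) - 1
    scale = np.abs(x).max() / qmax + 1e-12
    return np.round(x / scale).clip(-qmax, qmax) * scale

def svdquant_like(W, rank=16, bits=4):
    """Absorb outliers in a high-precision low-rank branch, quantize the residual."""
    U, S, Vt = np.linalg.svd(W, full_matrices=False)
    L = (U[:, :rank] * S[:rank]) @ Vt[:rank]      # low-rank branch (kept in full precision)
    R_q = quantize_sym(W - L, bits=bits)          # 4-bit residual branch
    return L, R_q

rng = np.random.default_rng(0)
W = rng.standard_normal((256, 256))
W[:, :4] *= 20.0                                  # inject a few outlier columns
L, R_q = svdquant_like(W)
err_direct = np.abs(quantize_sym(W) - W).mean()   # direct 4-bit quantization error
err_svd = np.abs((L + R_q) - W).mean()            # typically much smaller
```

The comparison at the end illustrates why absorbing the outliers first eases quantization: the residual has a much smaller dynamic range than the original weight matrix.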
+
+ comment: Quantization Library: https://github.com/mit-han-lab/deepcompressor + Inference Engine: https://github.com/mit-han-lab/nunchaku Website: + https://hanlab.mit.edu/projects/svdquant Demo: https://svdquant.mit.edu Blog: + https://hanlab.mit.edu/blog/svdquant +
+
+
+
+
+ + ☆ Diff-2-in-1: Bridging Generation and Dense Perception with Diffusion + Models + + +
+ Beyond high-fidelity image synthesis, diffusion models have recently +exhibited promising results in dense visual perception tasks. However, most +existing work treats diffusion models as a standalone component for perception +tasks, employing them either solely for off-the-shelf data augmentation or as +mere feature extractors. In contrast to these isolated and thus sub-optimal +efforts, we introduce a unified, versatile, diffusion-based framework, +Diff-2-in-1, that can simultaneously handle both multi-modal data generation +and dense visual perception, through a unique exploitation of the +diffusion-denoising process. Within this framework, we further enhance +discriminative visual perception via multi-modal generation, by utilizing the +denoising network to create multi-modal data that mirror the distribution of +the original training set. Importantly, Diff-2-in-1 optimizes the utilization +of the created diverse and faithful data by leveraging a novel self-improving +learning mechanism. Comprehensive experimental evaluations validate the +effectiveness of our framework, showcasing consistent performance improvements +across various discriminative backbones and high-quality multi-modal data +generation characterized by both realism and usefulness. + +
+
+ comment: 26 pages, 14 figures +
+
+
+
+
+ + ☆ ReCapture: Generative Video Camera Controls for User-Provided Videos + using Masked Video Fine-Tuning + + +
+ Recently, breakthroughs in video modeling have allowed for controllable +camera trajectories in generated videos. However, these methods cannot be +directly applied to user-provided videos that are not generated by a video +model. In this paper, we present ReCapture, a method for generating new videos +with novel camera trajectories from a single user-provided video. Our method +allows us to re-generate the reference video, with all its existing scene +motion, from vastly different angles and with cinematic camera motion. Notably, +using our method we can also plausibly hallucinate parts of the scene that were +not observable in the reference video. Our method works by (1) generating a +noisy anchor video with a new camera trajectory using multiview diffusion +models or depth-based point cloud rendering and then (2) regenerating the +anchor video into a clean and temporally consistent reangled video using our +proposed masked video fine-tuning technique. + +
+
+ comment: project page: https://generative-video-camera-controls.github.io/ +
+
+
+
+
+ + ☆ Analyzing The Language of Visual Tokens + + +
+ With the introduction of transformer-based models for vision and language +tasks, such as LLaVA and Chameleon, there has been renewed interest in the +discrete tokenized representation of images. These models often treat image +patches as discrete tokens, analogous to words in natural language, learning +joint alignments between visual and human languages. However, little is known +about the statistical behavior of these visual languages - whether they follow +similar frequency distributions, grammatical structures, or topologies as +natural languages. In this paper, we take a natural-language-centric approach +to analyzing discrete visual languages and uncover striking similarities and +fundamental differences. We demonstrate that, although visual languages adhere +to Zipfian distributions, higher token innovation drives greater entropy and +lower compression, with tokens predominantly representing object parts, +indicating intermediate granularity. We also show that visual languages lack +cohesive grammatical structures, leading to higher perplexity and weaker +hierarchical organization compared to natural languages. Finally, we +demonstrate that, while vision models align more closely with natural languages +than other models, this alignment remains significantly weaker than the +cohesion found within natural languages. Through these experiments, we +demonstrate how understanding the statistical properties of discrete visual +languages can inform the design of more effective computer vision models. + +
+
+
+
+
+ + ☆ DynaMem: Online Dynamic Spatio-Semantic Memory for Open World Mobile + Manipulation + + +
+ Significant progress has been made in open-vocabulary mobile manipulation, +where the goal is for a robot to perform tasks in any environment given a +natural language description. However, most current systems assume a static +environment, which limits the system's applicability in real-world scenarios +where environments frequently change due to human intervention or the robot's +own actions. In this work, we present DynaMem, a new approach to open-world +mobile manipulation that uses a dynamic spatio-semantic memory to represent a +robot's environment. DynaMem constructs a 3D data structure to maintain a +dynamic memory of point clouds, and answers open-vocabulary object localization +queries using multimodal LLMs or open-vocabulary features generated by +state-of-the-art vision-language models. Powered by DynaMem, our robots can +explore novel environments, search for objects not found in memory, and +continuously update the memory as objects move, appear, or disappear in the +scene. We run extensive experiments on the Stretch SE3 robots in three real and +nine offline scenes, and achieve an average pick-and-drop success rate of 70% +on non-stationary objects, which is more than a 2x improvement over +state-of-the-art static systems. Our code as well as our experiment and +deployment videos are open sourced and can be found on our project website: +https://dynamem.github.io/ + +
+
+ comment: Website: https://dynamem.github.io +
+
+
+
+
+ + ☆ HourVideo: 1-Hour Video-Language Understanding NeurIPS 2024 + + +
+ We present HourVideo, a benchmark dataset for hour-long video-language +understanding. Our dataset consists of a novel task suite comprising +summarization, perception (recall, tracking), visual reasoning (spatial, +temporal, predictive, causal, counterfactual), and navigation (room-to-room, +object retrieval) tasks. HourVideo includes 500 manually curated egocentric +videos from the Ego4D dataset, spanning durations of 20 to 120 minutes, and +features 12,976 high-quality, five-way multiple-choice questions. Benchmarking +results reveal that multimodal models, including GPT-4 and LLaVA-NeXT, achieve +marginal improvements over random chance. In stark contrast, human experts +significantly outperform the state-of-the-art long-context multimodal model, +Gemini Pro 1.5 (85.0% vs. 37.3%), highlighting a substantial gap in multimodal +capabilities. Our benchmark, evaluation toolkit, prompts, and documentation are +available at https://hourvideo.stanford.edu + +
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track; 28 pages +
+
+
+
+
+ + ☆ LoFi: Scalable Local Image Reconstruction with Implicit Neural + Representation + + +
+ Neural fields or implicit neural representations (INRs) have attracted +significant attention in machine learning and signal processing due to their +efficient continuous representation of images and 3D volumes. In this work, we +build on INRs and introduce a coordinate-based local processing framework for +solving imaging inverse problems, termed LoFi (Local Field). Unlike +conventional methods for image reconstruction, LoFi processes local information +at each coordinate \textit{separately} by multi-layer perceptrons (MLPs), +recovering the object at that specific coordinate. Similar to INRs, LoFi can +recover images at any continuous coordinate, enabling image reconstruction at +multiple resolutions. With comparable or better performance than standard CNNs +for image reconstruction, LoFi achieves excellent generalization to +out-of-distribution data and memory usage almost independent of image +resolution. Remarkably, training on $1024 \times 1024$ images requires just 3GB +of memory -- over 20 times less than the memory typically needed by standard +CNNs. Additionally, LoFi's local design allows it to train on extremely small +datasets with less than 10 samples, without overfitting or the need for +regularization or early stopping. Finally, we use LoFi as a denoising prior in +a plug-and-play framework for solving general inverse problems to benefit from +its continuous image representation and strong generalization. Although trained +on low-resolution images, LoFi can be used as a low-dimensional prior to solve +inverse problems at any resolution. We validate our framework across a variety +of imaging modalities, from low-dose computed tomography to radio +interferometric imaging. + +
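A minimal sketch of the coordinate-wise processing idea: a small MLP maps a continuous query coordinate plus locally gathered measurement features to the reconstructed value at that coordinate, so memory scales with the number of queried points rather than with the image resolution. The feature extraction and architecture below are placeholders, not the LoFi design.

```python
import torch
import torch.nn as nn

class LocalFieldMLP(nn.Module):
    """Predict the image value at one continuous coordinate from local features."""
    def __init__(self, feat_dim=32, hidden=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2 + feat_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, coords, local_feats):
        # coords: (B, 2) in [-1, 1]; local_feats: (B, feat_dim) gathered from measurements
        return self.net(torch.cat([coords, local_feats], dim=-1))

model = LocalFieldMLP()
coords = torch.rand(4096, 2) * 2 - 1     # query any continuous coordinates
feats = torch.randn(4096, 32)            # placeholder local measurement features
pred = model(coords, feats)              # (4096, 1) reconstructed values
```

Because only the sampled coordinates pass through the network, the same model can be queried at arbitrary resolutions at inference time.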
+
+
+
+
+ + ☆ Which bits went where? Past and future transfer entropy decomposition + with the information bottleneck NeurIPS 2024 + + +
+ Whether the system under study is a shoal of fish, a collection of neurons, +or a set of interacting atmospheric and oceanic processes, transfer entropy +measures the flow of information between time series and can detect possible +causal relationships. Much like mutual information, transfer entropy is +generally reported as a single value summarizing an amount of shared variation, +yet a more fine-grained accounting might illuminate much about the processes +under study. Here we propose to decompose transfer entropy and localize the +bits of variation on both sides of information flow: that of the originating +process's past and that of the receiving process's future. We employ the +information bottleneck (IB) to compress the time series and identify the +transferred entropy. We apply our method to decompose the transfer entropy in +several synthetic recurrent processes and an experimental mouse dataset of +concurrent behavioral and neural activity. Our approach highlights the nuanced +dynamics within information flow, laying a foundation for future explorations +into the intricate interplay of temporal processes in complex systems. + +
+
+ comment: NeurIPS 2024 workshop "Machine learning and the physical sciences" + Camera ready +
+
+
+
+
+ + ☆ Clustering in Causal Attention Masking NeurIPS + 2024 + + +
+ This work presents a modification of the self-attention dynamics proposed by +Geshkovski et al. (arXiv:2312.10794) to better reflect the practically +relevant, causally masked attention used in transformer architectures for +generative AI. This modification translates into an interacting particle system +that cannot be interpreted as a mean-field gradient flow. Despite this loss of +structure, we significantly strengthen the results of Geshkovski et al. +(arXiv:2312.10794) in this context: While previous rigorous results focused on +cases where all three matrices (Key, Query, and Value) were scaled identities, +we prove asymptotic convergence to a single cluster for arbitrary key-query +matrices and a value matrix equal to the identity. Additionally, we establish a +connection to the classical R\'enyi parking problem from combinatorial geometry +to make initial theoretical steps towards demonstrating the existence of +meta-stable states. + +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024), 22 pages, 6 figures +
+
+
+
+
+ + ☆ SG-I2V: Self-Guided Trajectory Control in Image-to-Video Generation + + +
+ Methods for image-to-video generation have achieved impressive, +photo-realistic quality. However, adjusting specific elements in generated +videos, such as object motion or camera movement, is often a tedious process of +trial and error, e.g., involving re-generating videos with different random +seeds. Recent techniques address this issue by fine-tuning a pre-trained model +to follow conditioning signals, such as bounding boxes or point trajectories. +Yet, this fine-tuning procedure can be computationally expensive, and it +requires datasets with annotated object motion, which can be difficult to +procure. In this work, we introduce SG-I2V, a framework for controllable +image-to-video generation that is self-guided$\unicode{x2013}$offering +zero-shot control by relying solely on the knowledge present in a pre-trained +image-to-video diffusion model without the need for fine-tuning or external +knowledge. Our zero-shot method outperforms unsupervised baselines while being +competitive with supervised models in terms of visual quality and motion +fidelity. + +
+
+ comment: Project page: https://kmcode1.github.io/Projects/SG-I2V/ +
+
+
+
+
+ + ☆ Few-Shot Task Learning through Inverse Generative Modeling + + +
+ Learning the intents of an agent, defined by its goals or motion style, is +often extremely challenging from just a few examples. We refer to this problem +as task concept learning and present our approach, Few-Shot Task Learning +through Inverse Generative Modeling (FTL-IGM), which learns new task concepts +by leveraging invertible neural generative models. The core idea is to pretrain +a generative model on a set of basic concepts and their demonstrations. Then, +given a few demonstrations of a new concept (such as a new goal or a new +action), our method learns the underlying concepts through backpropagation +without updating the model weights, thanks to the invertibility of the +generative model. We evaluate our method in five domains -- object +rearrangement, goal-oriented navigation, motion caption of human actions, +autonomous driving, and real-world table-top manipulation. Our experimental +results demonstrate that via the pretrained generative model, we successfully +learn novel concepts and generate agent plans or motion corresponding to these +concepts in (1) unseen environments and (2) in composition with training +concepts. + +
+
+
+
+
+ + ☆ Noisy Zero-Shot Coordination: Breaking The Common Knowledge Assumption + In Zero-Shot Coordination Games + + +
+ Zero-shot coordination (ZSC) is a popular setting for studying the ability of +reinforcement learning (RL) agents to coordinate with novel partners. Prior ZSC +formulations assume the $\textit{problem setting}$ is common knowledge: each +agent knows the underlying Dec-POMDP, knows others have this knowledge, and so +on ad infinitum. However, this assumption rarely holds in complex real-world +settings, which are often difficult to fully and correctly specify. Hence, in +settings where this common knowledge assumption is invalid, agents trained +using ZSC methods may not be able to coordinate well. To address this +limitation, we formulate the $\textit{noisy zero-shot coordination}$ (NZSC) +problem. In NZSC, agents observe different noisy versions of the ground truth +Dec-POMDP, which are assumed to be distributed according to a fixed noise +model. Only the distribution of ground truth Dec-POMDPs and the noise model are +common knowledge. We show that a NZSC problem can be reduced to a ZSC problem +by designing a meta-Dec-POMDP with an augmented state space consisting of all +the ground-truth Dec-POMDPs. For solving NZSC problems, we propose a simple and +flexible meta-learning method called NZSC training, in which the agents are +trained across a distribution of coordination problems - which they only get to +observe noisy versions of. We show that with NZSC training, RL agents can be +trained to coordinate well with novel partners even when the (exact) problem +setting of the coordination is not common knowledge. + +
+
+
+
+
+ + ☆ SuffixDecoding: A Model-Free Approach to Speeding Up Large Language + Model Inference + + +
+ We present SuffixDecoding, a novel model-free approach to accelerating large +language model (LLM) inference through speculative decoding. Unlike existing +methods that rely on draft models or specialized decoding heads, SuffixDecoding +leverages suffix trees built from previously generated outputs to efficiently +predict candidate token sequences. Our approach enables flexible +tree-structured speculation without the overhead of maintaining and +orchestrating additional models. SuffixDecoding builds and dynamically updates +suffix trees to capture patterns in the generated text, using them to construct +speculation trees through a principled scoring mechanism based on empirical +token frequencies. SuffixDecoding requires only CPU memory which is plentiful +and underutilized on typical LLM serving nodes. We demonstrate that +SuffixDecoding achieves competitive speedups compared to model-based approaches +across diverse workloads including open-domain chat, code generation, and +text-to-SQL tasks. For open-ended chat and code generation tasks, +SuffixDecoding achieves up to $1.4\times$ higher output throughput than +SpecInfer and up to $1.1\times$ lower time-per-token (TPOT) latency. For a +proprietary multi-LLM text-to-SQL application, SuffixDecoding achieves up to +$2.9\times$ higher output throughput and $3\times$ lower latency than +speculative decoding. Our evaluation shows that SuffixDecoding maintains high +acceptance rates even with small reference corpora of 256 examples, while +continuing to improve performance as more historical outputs are incorporated. + +
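The following sketch captures the model-free speculation idea with a simple suffix-context lookup table built from previously generated outputs: candidate continuations are proposed from the longest matching suffix and scored by empirical frequency. A dictionary of bounded-length contexts stands in here for the full suffix trees and tree-structured verification used by the system.

```python
from collections import defaultdict, Counter

class SuffixSpeculator:
    """Propose continuations for the current context from past outputs (sketch)."""
    def __init__(self, max_ctx=4):
        self.max_ctx = max_ctx
        self.table = defaultdict(Counter)   # suffix context -> next-token counts

    def add_output(self, tokens):
        for i in range(len(tokens) - 1):
            for k in range(1, self.max_ctx + 1):
                if i - k + 1 >= 0:
                    ctx = tuple(tokens[i - k + 1 : i + 1])
                    self.table[ctx][tokens[i + 1]] += 1

    def speculate(self, context, depth=4):
        draft, ctx = [], list(context)
        for _ in range(depth):
            for k in range(self.max_ctx, 0, -1):        # longest matching suffix first
                cands = self.table.get(tuple(ctx[-k:]))
                if cands:
                    nxt = cands.most_common(1)[0][0]    # score by empirical frequency
                    draft.append(nxt)
                    ctx.append(nxt)
                    break
            else:
                break
        return draft

spec = SuffixSpeculator()
spec.add_output("SELECT name FROM users WHERE id = 1".split())
print(spec.speculate("SELECT name FROM".split()))   # drafts ['users', 'WHERE', ...]
```

The drafted tokens would then be verified in a single forward pass of the target LLM, as in standard speculative decoding; no draft model is needed.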
+
+
+
+
+ + ☆ AsCAN: Asymmetric Convolution-Attention Networks for Efficient + Recognition and Generation NeurIPS 2024 + + +
Neural network architecture design requires making many crucial decisions. A common desideratum is that similar decisions, with few modifications, can be reused in a variety of tasks and applications. To satisfy that, architectures must provide promising latency and performance trade-offs, support a variety of tasks, scale efficiently with respect to the amounts of data and compute, leverage available data from other tasks, and efficiently support various hardware. To this end, we introduce AsCAN -- a hybrid architecture combining both convolutional and transformer blocks. We revisit the key design principles of hybrid architectures and propose a simple and effective \emph{asymmetric} architecture, where the distribution of convolutional and transformer blocks is \emph{asymmetric}, containing more convolutional blocks in the earlier stages, followed by more transformer blocks in later stages. AsCAN supports a variety of tasks: recognition, segmentation, class-conditional image generation, and features a superior trade-off between performance and latency. We then scale the same architecture to solve a large-scale text-to-image task and show state-of-the-art performance compared to the most recent public and commercial models. Notably, even without any computation optimization for transformer blocks, our models still yield faster inference speed than existing works featuring efficient attention mechanisms, highlighting the advantages and the value of our approach.
+
+ comment: NeurIPS 2024. Project Page: + https://snap-research.github.io/snap_image/ +
+
+
+
+
+ + ☆ BitNet a4.8: 4-bit Activations for 1-bit LLMs + + +
Recent research on 1-bit Large Language Models (LLMs), such as BitNet b1.58, presents a promising direction for reducing the inference cost of LLMs while maintaining their performance. In this work, we introduce BitNet a4.8, enabling 4-bit activations for 1-bit LLMs. BitNet a4.8 employs a hybrid quantization and sparsification strategy to mitigate the quantization errors introduced by the outlier channels. Specifically, we utilize 4-bit activations for inputs to the attention and feed-forward network layers, while sparsifying intermediate states followed by 8-bit quantization. Extensive experiments demonstrate that BitNet a4.8 achieves performance comparable to BitNet b1.58 with equivalent training costs, while being faster at inference by enabling 4-bit (INT4/FP4) kernels. Additionally, BitNet a4.8 activates only 55% of parameters and supports a 3-bit KV cache, further enhancing the efficiency of large-scale LLM deployment and inference.
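A toy PyTorch sketch of the hybrid idea: a small fraction of outlier activation entries is split off and kept in full precision, while the dense remainder is quantized to 4 bits. The outlier fraction, per-tensor scaling, and the dequantized return value are simplifying assumptions; the actual INT4/FP4 kernels, group sizes, and channel selection are not modeled.

```python
import torch

def quantize_activations_hybrid(x, bits=4, outlier_frac=0.01):
    """Split activations into a dense 4-bit part and a sparse high-precision part."""
    qmax = 2 ** (bits - 1) - 1
    # Pick the largest-magnitude entries as outliers (kept in full precision).
    k = max(1, int(outlier_frac * x.numel()))
    thresh = x.abs().flatten().topk(k).values.min()
    outlier_mask = x.abs() >= thresh
    sparse_part = torch.where(outlier_mask, x, torch.zeros_like(x))
    dense_part = torch.where(outlier_mask, torch.zeros_like(x), x)
    # 4-bit symmetric quantization of the dense part.
    scale = dense_part.abs().max() / qmax + 1e-12
    dense_q = (dense_part / scale).round().clamp(-qmax, qmax) * scale
    return dense_q + sparse_part   # dequantized view for inspection

x = torch.randn(2, 4096)
x[0, :8] *= 50.0                   # a few outlier channels
x_hat = quantize_activations_hybrid(x)
err = (x_hat - x).abs().mean()     # small, since outliers bypass the 4-bit path
```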
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ SPGD: Steepest Perturbed Gradient Descent Optimization + + +
+ Optimization algorithms are pivotal in advancing various scientific and +industrial fields but often encounter obstacles such as trapping in local +minima, saddle points, and plateaus (flat regions), which makes the convergence +to reasonable or near-optimal solutions particularly challenging. This paper +presents the Steepest Perturbed Gradient Descent (SPGD), a novel algorithm that +innovatively combines the principles of the gradient descent method with +periodic uniform perturbation sampling to effectively circumvent these +impediments and lead to better solutions whenever possible. SPGD is +distinctively designed to generate a set of candidate solutions and select the +one exhibiting the steepest loss difference relative to the current solution. +It enhances the traditional gradient descent approach by integrating a +strategic exploration mechanism that significantly increases the likelihood of +escaping sub-optimal local minima and navigating complex optimization +landscapes effectively. Our approach not only retains the directed efficiency +of gradient descent but also leverages the exploratory benefits of stochastic +perturbations, thus enabling a more comprehensive search for global optima +across diverse problem spaces. We demonstrate the efficacy of SPGD in solving +the 3D component packing problem, an NP-hard challenge. Preliminary results +show a substantial improvement over four established methods, particularly on +response surfaces with complex topographies and in multidimensional non-convex +continuous optimization problems. Comparative analyses with established 2D +benchmark functions highlight SPGD's superior performance, showcasing its +ability to navigate complex optimization landscapes. These results emphasize +SPGD's potential as a versatile tool for a wide range of optimization problems. + +
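A compact sketch of the SPGD recipe as described: take ordinary gradient steps, periodically sample uniform perturbations around the current point, and jump to the candidate with the lowest loss (i.e., the steepest improvement relative to the current solution). The schedule, radius, and toy objective are illustrative assumptions.

```python
import numpy as np

def spgd_like(f, grad, x0, lr=0.05, n_iters=500, n_candidates=8,
              perturb_every=20, radius=0.5, rng=None):
    """Gradient descent with periodic perturbed candidates (illustrative sketch)."""
    rng = rng or np.random.default_rng(0)
    x = np.asarray(x0, dtype=float)
    for t in range(n_iters):
        x = x - lr * grad(x)                      # plain gradient step
        if t % perturb_every == 0:
            # Sample uniform perturbations and keep the candidate with the lowest loss.
            candidates = x + rng.uniform(-radius, radius, size=(n_candidates, x.size))
            candidates = np.vstack([x, candidates])   # include the current point
            x = candidates[np.argmin([f(c) for c in candidates])]
    return x

# Toy multimodal objective with many local minima.
f = lambda v: np.sum(v ** 2) + 2.0 * np.sum(np.sin(3 * v) ** 2)
grad = lambda v: 2 * v + 12.0 * np.sin(3 * v) * np.cos(3 * v)
x_star = spgd_like(f, grad, x0=np.array([2.5, -2.0]))
```

Including the current point among the candidates means a perturbation is only accepted when it strictly improves the loss, so the scheme never moves uphill at a perturbation step.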
+
+ comment: 28 pages, 26 figures, submitted to Journal of Mechanical Design +
+
+
+
+
+ + ☆ Pareto Set Identification With Posterior Sampling + + +
The problem of identifying the best answer among a collection of items having real-valued distributions is well understood. Despite its practical relevance for many applications, fewer works have studied its extension when multiple and potentially conflicting metrics are available to assess an item's quality. Pareto set identification (PSI) aims to identify the set of answers whose means are not uniformly worse than those of another. This paper studies PSI in the transductive linear setting with potentially correlated objectives. Building on posterior sampling in both the stopping and the sampling rules, we propose the PSIPS algorithm, which deals simultaneously with structure and correlation without paying the computational cost of existing oracle-based algorithms. From both a frequentist and a Bayesian perspective, PSIPS is asymptotically optimal. We demonstrate its good empirical performance on real-world and synthetic instances.
+
+
+
+
+ + ☆ Fed-LDR: Federated Local Data-infused Graph Creation with Node-centric + Model Refinement + + +
+ The rapid acceleration of global urbanization has introduced novel challenges +in enhancing urban infrastructure and services. Spatio-temporal data, +integrating spatial and temporal dimensions, has emerged as a critical tool for +understanding urban phenomena and promoting sustainability. In this context, +Federated Learning (FL) has gained prominence as a distributed learning +paradigm aligned with the privacy requirements of urban IoT environments. +However, integrating traditional and deep learning models into the FL framework +poses significant challenges, particularly in capturing complex spatio-temporal +dependencies and adapting to diverse urban conditions. To address these +challenges, we propose the Federated Local Data-Infused Graph Creation with +Node-centric Model Refinement (Fed-LDR) algorithm. Fed-LDR leverages FL and +Graph Convolutional Networks (GCN) to enhance spatio-temporal data analysis in +urban environments. The algorithm comprises two key modules: (1) the Local +Data-Infused Graph Creation (LDIGC) module, which dynamically reconfigures +adjacency matrices to reflect evolving spatial relationships within urban +environments, and (2) the Node-centric Model Refinement (NoMoR) module, which +customizes model parameters for individual urban nodes to accommodate +heterogeneity. Evaluations on the PeMSD4 and PeMSD8 datasets demonstrate +Fed-LDR's superior performance over six baseline methods. Fed-LDR achieved the +lowest Mean Absolute Error (MAE) values of 20.15 and 17.30, and the lowest Root +Mean Square Error (RMSE) values of 32.30 and 27.15, respectively, while +maintaining a high correlation coefficient of 0.96 across both datasets. +Notably, on the PeMSD4 dataset, Fed-LDR reduced MAE and RMSE by up to 81\% and +78\%, respectively, compared to the best-performing baseline FedMedian. + +
+
+
+
+
+ + ☆ Evaluating Robustness of Reinforcement Learning Algorithms for + Autonomous Shipping + + +
+ Recently, there has been growing interest in autonomous shipping due to its +potential to improve maritime efficiency and safety. The use of advanced +technologies, such as artificial intelligence, can address the current +navigational and operational challenges in autonomous shipping. In particular, +inland waterway transport (IWT) presents a unique set of challenges, such as +crowded waterways and variable environmental conditions. In such dynamic +settings, the reliability and robustness of autonomous shipping solutions are +critical factors for ensuring safe operations. This paper examines the +robustness of benchmark deep reinforcement learning (RL) algorithms, +implemented for IWT within an autonomous shipping simulator, and their ability +to generate effective motion planning policies. We demonstrate that a +model-free approach can achieve an adequate policy in the simulator, +successfully navigating port environments never encountered during training. We +focus particularly on Soft-Actor Critic (SAC), which we show to be inherently +more robust to environmental disturbances compared to MuZero, a +state-of-the-art model-based RL algorithm. In this paper, we take a significant +step towards developing robust, applied RL frameworks that can be generalized +to various vessel types and navigate complex port- and inland environments and +scenarios. + +
+
+ comment: 5 pages, 4 figures. Will be presented at IEEE RAAI 2024 +
+
+
+
+
+ + ☆ Structure Matters: Dynamic Policy Gradient + + +
+ In this work, we study $\gamma$-discounted infinite-horizon tabular Markov +decision processes (MDPs) and introduce a framework called dynamic policy +gradient (DynPG). The framework directly integrates dynamic programming with +(any) policy gradient method, explicitly leveraging the Markovian property of +the environment. DynPG dynamically adjusts the problem horizon during training, +decomposing the original infinite-horizon MDP into a sequence of contextual +bandit problems. By iteratively solving these contextual bandits, DynPG +converges to the stationary optimal policy of the infinite-horizon MDP. To +demonstrate the power of DynPG, we establish its non-asymptotic global +convergence rate under the tabular softmax parametrization, focusing on the +dependencies on salient but essential parameters of the MDP. By combining +classical arguments from dynamic programming with more recent convergence +arguments of policy gradient schemes, we prove that softmax DynPG scales +polynomially in the effective horizon $(1-\gamma)^{-1}$. Our findings contrast +recent exponential lower bound examples for vanilla policy gradient. + +
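A tabular toy sketch of the idea, under simplifying assumptions: each outer stage bootstraps a one-step (contextual bandit) reward from the previous stage's value function and solves it with an exact softmax policy gradient. The stage schedule and parametrization here are illustrative, not the paper's precise DynPG algorithm.

```python
import numpy as np

def dynpg_tabular(P, R, gamma=0.9, horizons=50, pg_steps=200, lr=0.5):
    """Dynamic policy gradient sketch: solve a sequence of contextual bandits whose
    rewards are bootstrapped with the value function of the previous horizon.

    P : (S, A, S) transition probabilities, R : (S, A) rewards.
    """
    S, A = R.shape
    V = np.zeros(S)                                # value of the "0-horizon" problem
    for _ in range(horizons):
        logits = np.zeros((S, A))
        Q = R + gamma * P @ V                      # bandit reward for each (s, a)
        for _ in range(pg_steps):                  # softmax policy gradient on the bandit
            pi = np.exp(logits - logits.max(axis=1, keepdims=True))
            pi /= pi.sum(axis=1, keepdims=True)
            adv = Q - (pi * Q).sum(axis=1, keepdims=True)
            logits += lr * pi * adv                # exact softmax PG update
        V = (pi * Q).sum(axis=1)                   # value under the new stage policy
    return pi, V

rng = np.random.default_rng(0)
P = rng.dirichlet(np.ones(3), size=(3, 2))         # 3 states, 2 actions
R = rng.random((3, 2))
pi, V = dynpg_tabular(P, R)
```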
+
+ comment: 46 pages, 4 figures +
+
+
+
+
+ + ☆ Enhancing Missing Data Imputation through Combined Bipartite Graph and + Complete Directed Graph + + +
+ In this paper, we aim to address a significant challenge in the field of +missing data imputation: identifying and leveraging the interdependencies among +features to enhance missing data imputation for tabular data. We introduce a +novel framework named the Bipartite and Complete Directed Graph Neural Network +(BCGNN). Within BCGNN, observations and features are differentiated as two +distinct node types, and the values of observed features are converted into +attributed edges linking them. The bipartite segment of our framework +inductively learns embedding representations for nodes, efficiently utilizing +the comprehensive information encapsulated in the attributed edges. In +parallel, the complete directed graph segment adeptly outlines and communicates +the complex interdependencies among features. When compared to contemporary +leading imputation methodologies, BCGNN consistently outperforms them, +achieving a noteworthy average reduction of 15% in mean absolute error for +feature imputation tasks under different missing mechanisms. Our extensive +experimental investigation confirms that an in-depth grasp of the +interdependence structure substantially enhances the model's feature embedding +ability. We also highlight the model's superior performance in label prediction +tasks involving missing data, and its formidable ability to generalize to +unseen data points. + +
+
+
+
+
+ + ☆ Sampling-guided Heterogeneous Graph Neural Network with Temporal + Smoothing for Scalable Longitudinal Data Imputation + + +
+ In this paper, we propose a novel framework, the Sampling-guided +Heterogeneous Graph Neural Network (SHT-GNN), to effectively tackle the +challenge of missing data imputation in longitudinal studies. Unlike +traditional methods, which often require extensive preprocessing to handle +irregular or inconsistent missing data, our approach accommodates arbitrary +missing data patterns while maintaining computational efficiency. SHT-GNN +models both observations and covariates as distinct node types, connecting +observation nodes at successive time points through subject-specific +longitudinal subnetworks, while covariate-observation interactions are +represented by attributed edges within bipartite graphs. By leveraging +subject-wise mini-batch sampling and a multi-layer temporal smoothing +mechanism, SHT-GNN efficiently scales to large datasets, while effectively +learning node representations and imputing missing data. Extensive experiments +on both synthetic and real-world datasets, including the Alzheimer's Disease +Neuroimaging Initiative (ADNI) dataset, demonstrate that SHT-GNN significantly +outperforms existing imputation methods, even with high missing data rates. The +empirical results highlight SHT-GNN's robust imputation capabilities and +superior performance, particularly in the context of complex, large-scale +longitudinal data. + +
+
+
+
+
+ + ☆ Non-Euclidean Mixture Model for Social Network Embedding + + +
+ It is largely agreed that social network links are formed due to either +homophily or social influence. Inspired by this, we aim at understanding the +generation of links via providing a novel embedding-based graph formation +model. Different from existing graph representation learning, where link +generation probabilities are defined as a simple function of the corresponding +node embeddings, we model the link generation as a mixture model of the two +factors. In addition, we model the homophily factor in spherical space and the +influence factor in hyperbolic space to accommodate the fact that (1) homophily +results in cycles and (2) influence results in hierarchies in networks. We also +design a special projection to align these two spaces. We call this model +Non-Euclidean Mixture Model, i.e., NMM. We further integrate NMM with our +non-Euclidean graph variational autoencoder (VAE) framework, NMM-GNN. NMM-GNN +learns embeddings through a unified framework which uses non-Euclidean GNN +encoders, non-Euclidean Gaussian priors, a non-Euclidean decoder, and a novel +space unification loss component to unify distinct non-Euclidean geometric +spaces. Experiments on public datasets show NMM-GNN significantly outperforms +state-of-the-art baselines on social network generation and classification +tasks, demonstrating its ability to better explain how the social network is +formed. + +
+
+
+
+
+ + ☆ Think Smart, Act SMARL! Analyzing Probabilistic Logic Driven Safety in + Multi-Agent Reinforcement Learning + + +
+ An important challenge for enabling the deployment of reinforcement learning +(RL) algorithms in the real world is safety. This has resulted in the recent +research field of Safe RL, which aims to learn optimal policies that are safe. +One successful approach in that direction is probabilistic logic shields (PLS), +a model-based Safe RL technique that uses formal specifications based on +probabilistic logic programming, constraining an agent's policy to comply with +those specifications in a probabilistic sense. However, safety is inherently a +multi-agent concept, since real-world environments often involve multiple +agents interacting simultaneously, leading to a complex system which is hard to +control. Moreover, safe multi-agent RL (Safe MARL) is still underexplored. In +order to address this gap, in this paper we ($i$) introduce Shielded MARL +(SMARL) by extending PLS to MARL -- in particular, we introduce Probabilistic +Logic Temporal Difference Learning (PLTD) to enable shielded independent +Q-learning (SIQL), and introduce shielded independent PPO (SIPPO) using +probabilistic logic policy gradients; ($ii$) show its positive effect and use +as an equilibrium selection mechanism in various game-theoretic environments +including two-player simultaneous games, extensive-form games, stochastic +games, and some grid-world extensions in terms of safety, cooperation, and +alignment with normative behaviors; and ($iii$) look into the asymmetric case +where only one agent is shielded, and show that the shielded agent has a +significant influence on the unshielded one, providing further evidence of +SMARL's ability to enhance safety and cooperation in diverse multi-agent +environments. + +
+
+ comment: 19 pages, 14 figures +
+
+
+
+
+ + ☆ OneProt: Towards Multi-Modal Protein Foundation Models + + +
+ Recent AI advances have enabled multi-modal systems to model and translate +diverse information spaces. Extending beyond text and vision, we introduce +OneProt, a multi-modal AI for proteins that integrates structural, sequence, +alignment, and binding site data. Using the ImageBind framework, OneProt aligns +the latent spaces of modality encoders along protein sequences. It demonstrates +strong performance in retrieval tasks and surpasses state-of-the-art methods in +various downstream tasks, including metal ion binding classification, +gene-ontology annotation, and enzyme function prediction. This work expands +multi-modal capabilities in protein models, paving the way for applications in +drug discovery, biocatalytic reaction planning, and protein engineering. + +
+
+ comment: 28 pages, 15 figures, 7 tables +
+
+
+
+
+ + ☆ Clinicians' Voice: Fundamental Considerations for XAI in Healthcare + + +
Explainable AI (XAI) holds the promise of advancing the implementation and adoption of AI-based tools in practice, especially in high-stakes environments like healthcare. However, most of the current research is disconnected from its practical applications and lacks input from end users. To address this, we conducted semi-structured interviews with clinicians to discuss their thoughts, hopes, and concerns. We find that clinicians generally think positively about developing AI-based tools for clinical practice, but they have concerns about how these will fit into their workflow and how they will impact clinician-patient relations. We further identify education of clinicians on AI as a crucial factor for the success of AI in healthcare and highlight aspects clinicians are looking for in (X)AI-based tools. In contrast to other studies, we take a holistic and exploratory perspective to identify general requirements, which is necessary before moving on to testing specific (X)AI products for healthcare.
+
+
+
+
+ + ☆ Conformalized Credal Regions for Classification with Ambiguous Ground + Truth + + +
+ An open question in \emph{Imprecise Probabilistic Machine Learning} is how to +empirically derive a credal region (i.e., a closed and convex family of +probabilities on the output space) from the available data, without any prior +knowledge or assumption. In classification problems, credal regions are a tool +that is able to provide provable guarantees under realistic assumptions by +characterizing the uncertainty about the distribution of the labels. Building +on previous work, we show that credal regions can be directly constructed using +conformal methods. This allows us to provide a novel extension of classical +conformal prediction to problems with ambiguous ground truth, that is, when the +exact labels for given inputs are not exactly known. The resulting construction +enjoys desirable practical and theoretical properties: (i) conformal coverage +guarantees, (ii) smaller prediction sets (compared to classical conformal +prediction regions) and (iii) disentanglement of uncertainty sources +(epistemic, aleatoric). We empirically verify our findings on both synthetic +and real datasets. + +
+
+
+
+
+ + ☆ Asymptotic regularity of a generalised stochastic Halpern scheme with + applications + + +
+ We provide abstract, general and highly uniform rates of asymptotic +regularity for a generalized stochastic Halpern-style iteration, which +incorporates a second mapping in the style of a Krasnoselskii-Mann iteration. +This iteration is general in two ways: First, it incorporates stochasticity in +a completely abstract way rather than fixing a sampling method; secondly, it +includes as special cases stochastic versions of various schemes from the +optimization literature, including Halpern's iteration as well as a +Krasnoselskii-Mann iteration with Tikhonov regularization terms in the sense of +Bo\c{t}, Csetnek and Meier. For these particular cases, we in particular obtain +linear rates of asymptotic regularity, matching (or improving) the currently +best known rates for these iterations in stochastic optimization, and quadratic +rates of asymptotic regularity are obtained in the context of inner product +spaces for the general iteration. We utilize these rates to give bounds on the +oracle complexity of such iterations under suitable variance assumptions and +batching strategies, again presented in an abstract style. Finally, we sketch +how the schemes presented here can be instantiated in the context of +reinforcement learning to yield novel methods for Q-learning. + +
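To convey the flavor of the scheme, the sketch below runs a Halpern-style anchored iteration whose inner step is a Krasnoselskii-Mann average computed with a noisy evaluation of the operator. The step-size choices, the noise model, and the toy nonexpansive map are assumptions; the paper's abstract scheme and its conditions are considerably more general.

```python
import numpy as np

def stochastic_halpern_km(T, x0, anchor, n_iters=2000, noise=0.01, rng=None):
    """Halpern-style anchored iteration with a Krasnoselskii-Mann inner step and
    noisy operator evaluations (schematic; not the paper's exact scheme)."""
    rng = rng or np.random.default_rng(0)
    x = np.asarray(x0, dtype=float)
    for n in range(1, n_iters + 1):
        alpha = 1.0 / (n + 1)                                 # Halpern anchoring weight
        beta = 0.5                                            # KM averaging weight
        Tx = T(x) + noise * rng.standard_normal(x.shape)      # stochastic oracle call
        km = (1 - beta) * x + beta * Tx                       # Krasnoselskii-Mann step
        x = alpha * anchor + (1 - alpha) * km                 # anchored (Halpern) step
    return x

# Toy nonexpansive map: a plane rotation, whose only fixed point is the origin.
theta = np.pi / 7
Rmat = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
T = lambda v: Rmat @ v
x = stochastic_halpern_km(T, x0=np.array([5.0, -3.0]), anchor=np.zeros(2))
```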
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ Learning in Budgeted Auctions with Spacing Objectives + + +
+ In many repeated auction settings, participants care not only about how +frequently they win but also how their winnings are distributed over time. This +problem arises in various practical domains where avoiding congested demand is +crucial, such as online retail sales and compute services, as well as in +advertising campaigns that require sustained visibility over time. We introduce +a simple model of this phenomenon, modeling it as a budgeted auction where the +value of a win is a concave function of the time since the last win. This +implies that for a given number of wins, even spacing over time is optimal. We +also extend our model and results to the case when not all wins result in +"conversions" (realization of actual gains), and the probability of conversion +depends on a context. The goal is to maximize and evenly space conversions +rather than just wins. + We study the optimal policies for this setting in second-price auctions and +offer learning algorithms for the bidders that achieve low regret against the +optimal bidding policy in a Bayesian online setting. Our main result is a +computationally efficient online learning algorithm that achieves $\tilde +O(\sqrt T)$ regret. We achieve this by showing that an infinite-horizon Markov +decision process (MDP) with the budget constraint in expectation is essentially +equivalent to our problem, even when limiting that MDP to a very small number +of states. The algorithm achieves low regret by learning a bidding policy that +chooses bids as a function of the context and the system's state, which will be +the time elapsed since the last win (or conversion). We show that +state-independent strategies incur linear regret even without uncertainty of +conversions. We complement this by showing that there are state-independent +strategies that, while still having linear regret, achieve a $(1-\frac 1 e)$ +approximation to the optimal reward. + +
+
+
+
+
+ + ☆ Machine learning and optimization-based approaches to duality in + statistical physics + + +
+ The notion of duality -- that a given physical system can have two different +mathematical descriptions -- is a key idea in modern theoretical physics. +Establishing a duality in lattice statistical mechanics models requires the +construction of a dual Hamiltonian and a map from the original to the dual +observables. By using simple neural networks to parameterize these maps and +introducing a loss function that penalises the difference between correlation +functions in original and dual models, we formulate the process of duality +discovery as an optimization problem. We numerically solve this problem and +show that our framework can rediscover the celebrated Kramers-Wannier duality +for the 2d Ising model, reconstructing the known mapping of temperatures. We +also discuss an alternative approach which uses known features of the mapping +of topological lines to reduce the problem to optimizing the couplings in a +dual Hamiltonian, and explore next-to-nearest neighbour deformations of the 2d +Ising duality. We discuss future directions and prospects for discovering new +dualities within this framework. + +
+
+ comment: 27 pages + appendices, lots of plots +
+
+
+
+
+ + ☆ Plasticity Loss in Deep Reinforcement Learning: A Survey + + +
+ Akin to neuroplasticity in human brains, the plasticity of deep neural +networks enables their quick adaption to new data. This makes plasticity +particularly crucial for deep Reinforcement Learning (RL) agents: Once +plasticity is lost, an agent's performance will inevitably plateau because it +cannot improve its policy to account for changes in the data distribution, +which are a necessary consequence of its learning process. Thus, developing +well-performing and sample-efficient agents hinges on their ability to remain +plastic during training. Furthermore, the loss of plasticity can be connected +to many other issues plaguing deep RL, such as training instabilities, scaling +failures, overestimation bias, and insufficient exploration. With this survey, +we aim to provide an overview of the emerging research on plasticity loss for +academics and practitioners of deep reinforcement learning. First, we propose a +unified definition of plasticity loss based on recent works, relate it to +definitions from the literature, and discuss metrics for measuring plasticity +loss. Then, we categorize and discuss numerous possible causes of plasticity +loss before reviewing currently employed mitigation strategies. Our taxonomy is +the first systematic overview of the current state of the field. Lastly, we +discuss prevalent issues within the literature, such as a necessity for broader +evaluation, and provide recommendations for future research, like gaining a +better understanding of an agent's neural activity and behavior. + +
+
+
+
+
+ + ☆ D$^3$epth: Self-Supervised Depth Estimation with Dynamic Mask in Dynamic + Scenes + + +
+ Depth estimation is a crucial technology in robotics. Recently, +self-supervised depth estimation methods have demonstrated great potential as +they can efficiently leverage large amounts of unlabelled real-world data. +However, most existing methods are designed under the assumption of static +scenes, which hinders their adaptability in dynamic environments. To address +this issue, we present D$^3$epth, a novel method for self-supervised depth +estimation in dynamic scenes. It tackles the challenge of dynamic objects from +two key perspectives. First, within the self-supervised framework, we design a +reprojection constraint to identify regions likely to contain dynamic objects, +allowing the construction of a dynamic mask that mitigates their impact at the +loss level. Second, for multi-frame depth estimation, we introduce a cost +volume auto-masking strategy that leverages adjacent frames to identify regions +associated with dynamic objects and generate corresponding masks. This provides +guidance for subsequent processes. Furthermore, we propose a spectral entropy +uncertainty module that incorporates spectral entropy to guide uncertainty +estimation during depth fusion, effectively addressing issues arising from cost +volume computation in dynamic environments. Extensive experiments on KITTI and +Cityscapes datasets demonstrate that the proposed method consistently +outperforms existing self-supervised monocular depth estimation baselines. Code +is available at \url{https://github.com/Csyunling/D3epth}. + +
+
+ comment: Open sourced +
+
+
+
+
+ + ☆ VTechAGP: An Academic-to-General-Audience Text Paraphrase Dataset and + Benchmark Models + + +
Existing text simplification or paraphrase datasets mainly focus on sentence-level text generation in a general domain. These datasets are typically developed without using domain knowledge. In this paper, we release a novel dataset, VTechAGP, the first academic-to-general-audience text paraphrase dataset, consisting of 4,938 document-level thesis and dissertation academic and general-audience abstract pairs from 8 colleges authored over 25 years. We also propose a novel dynamic soft prompt generative language model, DSPT5. For training, we leverage a contrastive-generative loss function to learn the keyword vectors in the dynamic prompt. For inference, we adopt a crowd-sampling decoding strategy at both semantic and structural levels to further select the best output candidate. We evaluate DSPT5 and various state-of-the-art large language models (LLMs) from multiple perspectives. Results demonstrate that the SOTA LLMs do not provide satisfactory outcomes, while the lightweight DSPT5 can achieve competitive results. To the best of our knowledge, we are the first to build a benchmark dataset and solutions for academic-to-general-audience text paraphrasing.
+
+ comment: 21 pages, 3 figures +
+
+
+
+
+ + ☆ A Simple Packing Algorithm for Optimized Mapping of Artificial Neural + Networks onto Non-Volatile Memory Cross-Bar Arrays + + +
Neuromorphic computing with crossbar arrays has emerged as a promising alternative to improve computing efficiency for machine learning. Previous work has focused on implementing crossbar arrays to perform basic mathematical operations. However, in this paper, we explore the impact of mapping the layers of an artificial neural network onto physical cross-bar arrays arranged in tiles across a chip. We have developed a simplified mapping algorithm to determine the number of physical tiles, with fixed optimal array dimensions, and to estimate the minimum area occupied by these tiles for a given design objective. This simplified algorithm is compared with conventional binary linear optimization, which solves the equivalent bin-packing problem. We have found that the optimum solution is not necessarily related to the minimum number of tiles; rather, it is shown to be an interaction between tile array capacity and the scaling properties of its peripheral circuits. Additionally, we have discovered that square arrays are not always the best choice for optimal mapping, and that performance optimization comes at the cost of total tile area.
+
+ comment: 24 pages, 10 figures +
+
+
+
+
+ + ☆ Soft Hoeffding Tree: A Transparent and Differentiable Model on Data + Streams + + +
+ We propose soft Hoeffding trees (SoHoT) as a new differentiable and +transparent model for possibly infinite and changing data streams. Stream +mining algorithms such as Hoeffding trees grow based on the incoming data +stream, but they currently lack the adaptability of end-to-end deep learning +systems. End-to-end learning can be desirable if a feature representation is +learned by a neural network and used in a tree, or if the outputs of trees are +further processed in a deep learning model or workflow. Different from +Hoeffding trees, soft trees can be integrated into such systems due to their +differentiability, but are neither transparent nor explainable. Our novel model +combines the extensibility and transparency of Hoeffding trees with the +differentiability of soft trees. We introduce a new gating function to regulate +the balance between univariate and multivariate splits in the tree. Experiments +are performed on 20 data streams, comparing SoHoT to standard Hoeffding trees, +Hoeffding trees with limited complexity, and soft trees applying a sparse +activation function for sample routing. The results show that soft Hoeffding +trees outperform Hoeffding trees in estimating class probabilities and, at the +same time, maintain transparency compared to soft trees, with relatively small +losses in terms of AUROC and cross-entropy. We also demonstrate how to trade +off transparency against performance using a hyperparameter, obtaining +univariate splits at one end of the spectrum and multivariate splits at the +other. + +
+
+
+
+
+ + ☆ Defending Deep Regression Models against Backdoor Attacks + + +
+ Deep regression models are used in a wide variety of safety-critical
+applications, but are vulnerable to backdoor attacks. Although many defenses
+have been proposed for classification models, they are ineffective as they do
+not consider the uniqueness of regression models. First, the outputs of
+regression models are continuous values instead of discretized labels. Thus,
+the potential infected target of a backdoored regression model has infinite
+possibilities, which makes it impossible for existing defenses to determine.
+Second, the backdoor behavior of backdoored deep regression models is triggered
+by the activation values of all the neurons in the feature space, which makes
+it difficult to detect and mitigate using existing defenses. To resolve these
+problems, we propose DRMGuard, the first defense to identify whether a deep
+regression model in the image domain is backdoored or not. DRMGuard formulates
+the optimization problem for reverse engineering based on the unique
+output-space and feature-space characteristics of backdoored deep regression
+models. We conduct extensive evaluations on two regression tasks and four
+datasets. The results show that DRMGuard can consistently defend against
+various backdoor attacks. We also generalize four state-of-the-art defenses
+designed for classifiers to regression models, and compare DRMGuard with them.
+The results show that DRMGuard significantly outperforms all those defenses.
+ +
+
+
+
+
+ + ☆ AlignXIE: Improving Multilingual Information Extraction by Cross-Lingual + Alignment + + +
+ Empirical evidence suggests that LLMs exhibit spontaneous cross-lingual
+alignment. Our findings suggest that although LLMs also demonstrate promising
+cross-lingual alignment in Information Extraction, there remains significant
+imbalance across languages, revealing an underlying deficiency in the IE
+alignment. To address this issue, we propose AlignXIE, a powerful code-based
+LLM that significantly enhances cross-lingual IE alignment through two
+strategies. Firstly, AlignXIE formulates IE across different languages,
+especially non-English ones, as code generation tasks, standardizing the
+representation of various schemas using Python classes to ensure consistency of
+the same ontology in different languages and align the schema. Secondly, it
+incorporates an IE cross-lingual alignment phase through a translated instance
+prediction task proposed in this paper to align the extraction process. This
+phase utilizes ParallelNER, an IE bilingual parallel dataset with 257,190
+samples generated by our proposed LLM-based automatic pipeline for IE parallel
+data construction, with manual annotation to ensure quality. Ultimately, we
+obtain AlignXIE through multilingual IE instruction tuning. Although never
+trained on the 9 unseen languages, AlignXIE surpasses ChatGPT by $30.17\%$ and
+SoTA by $20.03\%$, thereby demonstrating superior cross-lingual IE
+capabilities. Comprehensive evaluations on 63 IE benchmarks in Chinese and
+English under various settings demonstrate that AlignXIE significantly enhances
+cross-lingual and multilingual IE by boosting the IE alignment.
+ +
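To make the "schemas as Python classes" idea concrete, here is a hedged sketch of how an extraction schema can be expressed as code so that IE in any language is framed as code generation against one shared ontology. The class names, fields, and example sentence are assumptions for illustration, not AlignXIE's actual schema.

```python
# Illustrative only: a shared, code-level ontology for cross-lingual IE.
from dataclasses import dataclass
from typing import List

@dataclass
class Entity:
    mention: str   # surface form in the source language
    type: str      # ontology label shared across languages, e.g. "Location"

@dataclass
class ExtractionResult:
    text: str
    entities: List[Entity]

# A model prompted in "code mode" would be asked to emit a constructor call like this,
# so Chinese and English inputs are aligned to the same Python-level schema.
example = ExtractionResult(
    text="巴黎是法国的首都。",
    entities=[Entity(mention="巴黎", type="Location"),
              Entity(mention="法国", type="Location")],
)
print(example)
```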
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Enhancing Investment Analysis: Optimizing AI-Agent Collaboration in + Financial Research + + +
+ In recent years, the application of generative artificial intelligence +(GenAI) in financial analysis and investment decision-making has gained +significant attention. However, most existing approaches rely on single-agent +systems, which fail to fully utilize the collaborative potential of multiple AI +agents. In this paper, we propose a novel multi-agent collaboration system +designed to enhance decision-making in financial investment research. The +system incorporates agent groups with both configurable group sizes and +collaboration structures to leverage the strengths of each agent group type. By +utilizing a sub-optimal combination strategy, the system dynamically adapts to +varying market conditions and investment scenarios, optimizing performance +across different tasks. We focus on three sub-tasks: fundamentals, market +sentiment, and risk analysis, by analyzing the 2023 SEC 10-K forms of 30 +companies listed on the Dow Jones Index. Our findings reveal significant +performance variations based on the configurations of AI agents for different +tasks. The results demonstrate that our multi-agent collaboration system +outperforms traditional single-agent models, offering improved accuracy, +efficiency, and adaptability in complex financial environments. This study +highlights the potential of multi-agent systems in transforming financial +analysis and investment decision-making by integrating diverse analytical +perspectives. + +
+
+
+
+
+ + ☆ Navigating Trade-offs: Policy Summarization for Multi-Objective + Reinforcement Learning + + +
+ Multi-objective reinforcement learning (MORL) is used to solve problems +involving multiple objectives. An MORL agent must make decisions based on the +diverse signals provided by distinct reward functions. Training an MORL agent +yields a set of solutions (policies), each presenting distinct trade-offs among +the objectives (expected returns). MORL enhances explainability by enabling +fine-grained comparisons of policies in the solution set based on their +trade-offs as opposed to having a single policy. However, the solution set is +typically large and multi-dimensional, where each policy (e.g., a neural +network) is represented by its objective values. + We propose an approach for clustering the solution set generated by MORL. By +considering both policy behavior and objective values, our clustering method +can reveal the relationship between policy behaviors and regions in the +objective space. This approach can enable decision makers (DMs) to identify +overarching trends and insights in the solution set rather than examining each +policy individually. We tested our method in four multi-objective environments +and found it outperformed traditional k-medoids clustering. Additionally, we +include a case study that demonstrates its real-world application. + +
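A rough sketch of the general idea follows: each policy in the solution set is represented by its objective values together with a behaviour descriptor, and the combined feature vectors are clustered. The behaviour features, the use of k-means, and the sizes below are illustrative assumptions; the paper's actual behaviour representation and clustering algorithm are not reproduced here.

```python
# Illustrative clustering of an MORL solution set by objectives plus behaviour.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
n_policies = 40
objectives = rng.random((n_policies, 3))   # expected returns on 3 objectives (toy data)
behaviour = rng.random((n_policies, 5))    # e.g. empirical action frequencies (toy data)

features = StandardScaler().fit_transform(np.hstack([objectives, behaviour]))
labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(features)

for c in range(4):
    print(f"cluster {c}: mean returns {objectives[labels == c].mean(axis=0).round(2)}")
```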
+
+
+
+
+ + ☆ Learn to Solve Vehicle Routing Problems ASAP: A Neural Optimization + Approach for Time-Constrained Vehicle Routing Problems with Finite Vehicle + Fleet + + +
+ Finding a feasible and prompt solution to the Vehicle Routing Problem (VRP) +is a prerequisite for efficient freight transportation, seamless logistics, and +sustainable mobility. Traditional optimization methods reach their limits when +confronted with the real-world complexity of VRPs, which involve numerous +constraints and objectives. Recently, the ability of generative Artificial +Intelligence (AI) to solve combinatorial tasks, known as Neural Combinatorial +Optimization (NCO), demonstrated promising results, offering new perspectives. +In this study, we propose an NCO approach to solve a time-constrained +capacitated VRP with a finite vehicle fleet size. The approach is based on an +encoder-decoder architecture, formulated in line with the Policy Optimization +with Multiple Optima (POMO) protocol and trained via a Proximal Policy +Optimization (PPO) algorithm. We successfully trained the policy with multiple +objectives (minimizing the total distance while maximizing vehicle utilization) +and evaluated it on medium and large instances, benchmarking it against +state-of-the-art heuristics. The method is able to find adequate and +cost-efficient solutions, showing both flexibility and robust generalization. +Finally, we provide a critical analysis of the solution generated by NCO and +discuss the challenges and opportunities of this new branch of intelligent +learning algorithms emerging in optimization science, focusing on freight +transportation. + +
+
+ comment: Affiliation: German Aerospace Center (DLR), Institute of Transport + Research, Rudower Chaussee 7, 12489 Berlin Correspondence: + Elija.deineko@dlr.de +
+
+
+
+
+ + ☆ Learning dynamical systems from data: Gradient-based dictionary + optimization + + +
+ The Koopman operator plays a crucial role in analyzing the global behavior of +dynamical systems. Existing data-driven methods for approximating the Koopman +operator or discovering the governing equations of the underlying system +typically require a fixed set of basis functions, also called dictionary. The +optimal choice of basis functions is highly problem-dependent and often +requires domain knowledge. We present a novel gradient descent-based +optimization framework for learning suitable and interpretable basis functions +from data and show how it can be used in combination with EDMD, SINDy, and +PDE-FIND. We illustrate the efficacy of the proposed approach with the aid of +various benchmark problems such as the Ornstein-Uhlenbeck process, Chua's +circuit, a nonlinear heat equation, as well as protein-folding data. + +
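A minimal sketch of the gradient-based idea, assuming PyTorch and toy linear dynamics: Gaussian RBF centres and widths are treated as learnable dictionary parameters and trained so that a linear operator best advances the dictionary by one step (an EDMD-style residual). This is not the paper's implementation; in practice extra regularisation or normalisation is needed to avoid trivial dictionaries.

```python
import torch

torch.manual_seed(0)
A = torch.tensor([[0.9, 0.1], [0.0, 0.8]])       # toy dynamics x' = A x
X = torch.randn(500, 2)
Y = X @ A.T

centres = torch.randn(10, 2, requires_grad=True)  # learnable RBF centres
log_width = torch.zeros(10, requires_grad=True)   # learnable RBF widths

def dictionary(x):
    d2 = ((x[:, None, :] - centres[None]) ** 2).sum(-1)   # squared distances to centres
    return torch.exp(-d2 / torch.exp(log_width))

opt = torch.optim.Adam([centres, log_width], lr=1e-2)
for step in range(200):
    Px, Py = dictionary(X), dictionary(Y)
    G = Px.T @ Px + 1e-6 * torch.eye(10)
    K = torch.linalg.solve(G, Px.T @ Py)           # least-squares Koopman approximation
    loss = ((Px @ K - Py) ** 2).mean()             # one-step EDMD residual
    opt.zero_grad(); loss.backward(); opt.step()
print(float(loss))
```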
+
+
+
+
+ + ☆ Mining the Minoria: Unknown, Under-represented, and Under-performing + Minority Groups VLDB 2025 + + +
+ Due to a variety of reasons, such as privacy, data in the wild often misses
+the grouping information required for identifying minorities. On the other
+hand, it is known that machine learning models are only as good as the data
+they are trained on and, hence, may underperform for the under-represented
+minority groups. The missing grouping information presents a dilemma for
+responsible data scientists who find themselves in an unknown-unknown
+situation, where not only do they not have access to the grouping attributes
+but they also do not know what groups to consider.
+ This paper is an attempt to address this dilemma. Specifically, we propose a
+minority mining problem, where we find vectors in the attribute space that
+reveal potential groups that are under-represented and under-performing.
+Technically speaking, we propose a geometric transformation of data into a dual
+space and use notions such as the arrangement of hyperplanes to design an
+efficient algorithm for the problem in lower dimensions. Generalizing our
+solution to higher dimensions suffers from the curse of dimensionality.
+Therefore, we propose a solution based on smart exploration of the search space
+for such cases. We conduct comprehensive experiments using real-world and
+synthetic datasets alongside the theoretical analysis. Our experimental results
+demonstrate the effectiveness of our proposed solutions in mining the unknown,
+under-represented, and under-performing minorities.
+ +
+
+ comment: This paper is currently under review at VLDB 2025 +
+
+
+
+
+ + ☆ Zero-Shot Temporal Resolution Domain Adaptation for Spiking Neural + Networks + + +
+ Spiking Neural Networks (SNNs) are biologically-inspired deep neural networks
+that efficiently extract temporal information while offering promising gains in
+terms of energy efficiency and latency when deployed on neuromorphic devices.
+However, SNN model parameters are sensitive to temporal resolution, leading to
+significant performance drops when the temporal resolution of target data at
+the edge is not the same as that of the pre-deployment source data used for
+training, especially when fine-tuning is not possible at the edge. To address
+this challenge, we propose three novel domain adaptation methods for adapting
+neuron parameters to account for the change in time resolution without
+re-training at the target time resolution. The proposed methods are based on a
+mapping between neuron dynamics in SNNs and State Space Models (SSMs); and are
+applicable to general neuron models. We evaluate the proposed methods on
+spatio-temporal data tasks, namely the audio keyword spotting datasets SHD and
+MSWC as well as the image classification NMNIST dataset. Our methods provide an
+alternative to - and in the majority of cases significantly outperform - the
+existing reference method that simply scales the time constant. Moreover, our
+results show that high accuracy on high temporal resolution data can be
+obtained by time-efficient training on lower temporal resolution data and model
+adaptation.
+ +
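For context, the simple reference baseline mentioned above can be sketched as follows: when a leaky integrate-and-fire (LIF) neuron trained at time step dt_train is deployed at a coarser dt_target, its discrete leak factor is recomputed from the membrane time constant. The numbers and the binning scheme are illustrative; the paper's SSM-based mappings are more involved and are not reproduced here.

```python
import numpy as np

def lif_run(inputs, tau, dt, v_th=1.0):
    decay = np.exp(-dt / tau)        # discrete-time leak for step size dt
    v, spikes = 0.0, []
    for x in inputs:
        v = decay * v + x
        spikes.append(int(v >= v_th))
        if v >= v_th:
            v = 0.0                  # reset after a spike
    return np.array(spikes)

rng = np.random.default_rng(0)
x_fine = rng.random(1000) * 0.1
x_coarse = x_fine.reshape(-1, 4).sum(axis=1)   # same signal binned 4x coarser

tau = 20.0
print(lif_run(x_fine, tau, dt=1.0).sum())      # source (training) resolution
print(lif_run(x_coarse, tau, dt=4.0).sum())    # deployment: leak recomputed for dt=4
```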
+
+
+
+
+ + ☆ Equivariant Graph Attention Networks with Structural Motifs for + Predicting Cell Line-Specific Synergistic Drug Combinations + + +
+ Cancer is the second leading cause of death, with chemotherapy as one of the +primary forms of treatment. As a result, researchers are turning to drug +combination therapy to decrease drug resistance and increase efficacy. Current +methods of drug combination screening, such as in vivo and in vitro, are +inefficient due to stark time and monetary costs. In silico methods have become +increasingly important for screening drugs, but current methods are inaccurate +and generalize poorly to unseen anticancer drugs. In this paper, I employ a +geometric deep-learning model utilizing a graph attention network that is +equivariant to 3D rotations, translations, and reflections with structural +motifs. Additionally, the gene expression of cancer cell lines is utilized to +classify synergistic drug combinations specific to each cell line. I compared +the proposed geometric deep learning framework to current state-of-the-art +(SOTA) methods, and the proposed model architecture achieved greater +performance on all 12 benchmark tasks performed on the DrugComb dataset. +Specifically, the proposed framework outperformed other SOTA methods by an +accuracy difference greater than 28%. Based on these results, I believe that +the equivariant graph attention network's capability of learning geometric data +accounts for the large performance improvements. The model's ability to +generalize to foreign drugs is thought to be due to the structural motifs +providing a better representation of the molecule. Overall, I believe that the +proposed equivariant geometric deep learning framework serves as an effective +tool for virtually screening anticancer drug combinations for further +validation in a wet lab environment. The code for this work is made available +online at: https://github.com/WeToTheMoon/EGAT_DrugSynergy. + +
+
+ comment: 8 pages, 1 figure, Presented at IEEE CIBCB +
+
+
+
+
+ + ☆ Respecting the limit: Bayesian optimization with a bound on the optimal + value + + +
+ In many real-world optimization problems, we have prior information about
+what objective function values are achievable. In this paper, we study the
+scenario in which we have either exact knowledge of the minimum value or a
+possibly inexact lower bound on its value. We propose bound-aware Bayesian
+optimization (BABO), a Bayesian optimization method that uses a new surrogate
+model and acquisition function to utilize such prior information. We present
+SlogGP, a new surrogate model that incorporates the bound information, and
+adapt the Expected Improvement (EI) acquisition function accordingly. Empirical
+results on a variety of benchmarks demonstrate the benefit of taking prior
+information about the optimal value into account, and show that the proposed
+approach significantly outperforms existing techniques. Furthermore, we notice
+that even in the absence of prior information on the bound, the proposed SlogGP
+surrogate model still performs better than the standard GP model in most cases,
+which we attribute to its greater expressiveness.
+ +
+
+
+
+
+ + ☆ Convolutional Differentiable Logic Gate Networks NeurIPS 2024 + + +
+ With the increasing inference cost of machine learning models, there is a +growing interest in models with fast and efficient inference. Recently, an +approach for learning logic gate networks directly via a differentiable +relaxation was proposed. Logic gate networks are faster than conventional +neural network approaches because their inference only requires logic gate +operators such as NAND, OR, and XOR, which are the underlying building blocks +of current hardware and can be efficiently executed. We build on this idea, +extending it by deep logic gate tree convolutions, logical OR pooling, and +residual initializations. This allows scaling logic gate networks up by over +one order of magnitude and utilizing the paradigm of convolution. On CIFAR-10, +we achieve an accuracy of 86.29% using only 61 million logic gates, which +improves over the SOTA while being 29x smaller. + +
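For readers unfamiliar with the underlying idea, here is a hedged sketch of a differentiable logic-gate relaxation: binary gates are replaced by real-valued surrogates on [0, 1], and each neuron keeps a learnable softmax over candidate gates so the gate choice can be trained by gradient descent. The gate set and sizes are illustrative, not the paper's configuration.

```python
import torch

def soft_gates(a, b):
    # Probabilistic relaxations of a few two-input gates.
    return torch.stack([
        a * b,                # AND
        a + b - a * b,        # OR
        a + b - 2 * a * b,    # XOR
        1 - a * b,            # NAND
    ], dim=-1)

class SoftLogicNeuron(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.logits = torch.nn.Parameter(torch.zeros(4))   # learnable gate choice

    def forward(self, a, b):
        w = torch.softmax(self.logits, dim=0)
        return (soft_gates(a, b) * w).sum(-1)              # convex mix of gate outputs

neuron = SoftLogicNeuron()
a, b = torch.rand(8), torch.rand(8)
print(neuron(a, b))
```

At inference, each neuron can be discretised to its highest-weighted gate, which is what makes the trained network executable with plain hardware logic operations.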
+
+ comment: Published at NeurIPS 2024 (Oral) +
+
+
+
+
+ + ☆ Neuromorphic Wireless Split Computing with Multi-Level Spikes + + +
+ Inspired by biological processes, neuromorphic computing utilizes spiking +neural networks (SNNs) to perform inference tasks, offering significant +efficiency gains for workloads involving sequential data. Recent advances in +hardware and software have demonstrated that embedding a few bits of payload in +each spike exchanged between the spiking neurons can further enhance inference +accuracy. In a split computing architecture, where the SNN is divided across +two separate devices, the device storing the first layers must share +information about the spikes generated by the local output neurons with the +other device. Consequently, the advantages of multi-level spikes must be +balanced against the challenges of transmitting additional bits between the two +devices. + This paper addresses these challenges by investigating a wireless +neuromorphic split computing architecture employing multi-level SNNs. For this +system, we present the design of digital and analog modulation schemes +optimized for an orthogonal frequency division multiplexing (OFDM) radio +interface. Simulation and experimental results using software-defined radios +provide insights into the performance gains of multi-level SNN models and the +optimal payload size as a function of the quality of the connection between a +transmitter and receiver. + +
+
+
+
+
+ + ☆ Subspace-Constrained Quadratic Matrix Factorization: Algorithm and + Applications + + +
+ Matrix Factorization has emerged as a widely adopted framework for modeling +data exhibiting low-rank structures. To address challenges in manifold +learning, this paper presents a subspace-constrained quadratic matrix +factorization model. The model is designed to jointly learn key low-dimensional +structures, including the tangent space, the normal subspace, and the quadratic +form that links the tangent space to a low-dimensional representation. We solve +the proposed factorization model using an alternating minimization method, +involving an in-depth investigation of nonlinear regression and projection +subproblems. Theoretical properties of the quadratic projection problem and +convergence characteristics of the alternating strategy are also investigated. +To validate our approach, we conduct numerical experiments on synthetic and +real-world datasets. Results demonstrate that our model outperforms existing +methods, highlighting its robustness and efficacy in capturing core +low-dimensional structures. + +
+
+
+
+
+ + ☆ Exploring Hierarchical Molecular Graph Representation in Multimodal LLMs + + +
+ Following the milestones in large language models (LLMs) and multimodal
+models, we have seen a surge in applying LLMs to biochemical tasks. Leveraging
+graph features and molecular text representations, LLMs can tackle various
+tasks, such as predicting chemical reaction outcomes and describing molecular
+properties. However, most current work overlooks the multi-level nature of
+graph features. The impact of different feature levels on LLMs and the
+importance of each level remain unexplored, and it is possible that different
+chemistry tasks require different feature levels. In this work, we first
+investigate the effect of feature granularity by fusing GNN-generated feature
+tokens, discovering that even reducing all tokens to a single token does not
+significantly impact performance. We then explore the effect of various feature
+levels on performance, finding that both the quality of LLM-generated molecules
+and performance on different tasks benefit from different feature levels. We
+conclude with two key insights: (1) current molecular Multimodal LLMs (MLLMs)
+lack a comprehensive understanding of graph features, and (2) static processing
+is not sufficient for hierarchical graph features. Our code will be publicly
+available soon.
+ +
+
+
+
+
+ + ☆ Field Assessment of Force Torque Sensors for Planetary Rover Navigation + + +
+ Proprioceptive sensors on planetary rovers serve for state estimation and for +understanding terrain and locomotion performance. While inertial measurement +units (IMUs) are widely used to this effect, force-torque sensors are less +explored for planetary navigation despite their potential to directly measure +interaction forces and provide insights into traction performance. This paper +presents an evaluation of the performance and use cases of force-torque sensors +based on data collected from a six-wheeled rover during tests over varying +terrains, speeds, and slopes. We discuss challenges, such as sensor signal +reliability and terrain response accuracy, and identify opportunities regarding +the use of these sensors. The data is openly accessible and includes +force-torque measurements from each of the six-wheel assemblies as well as IMU +data from within the rover chassis. This paper aims to inform the design of +future studies and rover upgrades, particularly in sensor integration and +control algorithms, to improve navigation capabilities. + +
+
+
+
+
+ + ☆ The Multiple Dimensions of Spuriousness in Machine Learning + + +
+ Learning correlations from data forms the foundation of today's machine
+learning (ML) and artificial intelligence (AI) research. While such an approach
+enables the automatic discovery of patterned relationships within big data
+corpora, it is susceptible to failure modes when unintended correlations are
+captured. This vulnerability has expanded interest in interrogating
+spuriousness, often critiqued as an impediment to model performance, fairness,
+and robustness. In this article, we trace deviations from the conventional
+definition of statistical spuriousness (which denotes a non-causal observation
+arising from either coincidence or confounding variables) to articulate how ML
+researchers make sense of spuriousness in practice. Drawing on a broad survey
+of ML literature, we conceptualize the "multiple dimensions of spuriousness,"
+encompassing: relevance ("Models should only use correlations that are relevant
+to the task."), generalizability ("Models should only use correlations that
+generalize to unseen data."), human-likeness ("Models should only use
+correlations that a human would use to perform the same task."), and
+harmfulness ("Models should only use correlations that are not harmful.").
+These dimensions demonstrate that ML spuriousness goes beyond the
+causal/non-causal dichotomy and that the disparate interpretative paths
+researchers choose could meaningfully influence the trajectory of ML
+development. By underscoring how a fundamental problem in ML is contingently
+negotiated in research contexts, we contribute to ongoing debates about
+responsible practices in AI development.
+ +
+
+
+
+
+ + ☆ Is network fragmentation a useful complexity measure? + + +
+ It has been observed that the input space of deep neural network classifiers +can exhibit `fragmentation', where the model function rapidly changes class as +the input space is traversed. The severity of this fragmentation tends to +follow the double descent curve, achieving a maximum at the interpolation +regime. We study this phenomenon in the context of image classification and ask +whether fragmentation could be predictive of generalization performance. Using +a fragmentation-based complexity measure, we show this to be possible by +achieving good performance on the PGDL (Predicting Generalization in Deep +Learning) benchmark. In addition, we report on new observations related to +fragmentation, namely (i) fragmentation is not limited to the input space but +occurs in the hidden representations as well, (ii) fragmentation follows the +trends in the validation error throughout training, and (iii) fragmentation is +not a direct result of increased weight norms. Together, this indicates that +fragmentation is a phenomenon worth investigating further when studying the +generalization ability of deep neural networks. + +
+
+
+
+
+ + ☆ Differentially Private Continual Learning using Pre-Trained Models NeurIPS 2024 + + +
+ This work explores the intersection of continual learning (CL) and
+differential privacy (DP). Crucially, continual learning models must retain
+knowledge across tasks, but this conflicts with the differential privacy
+requirement of preventing individual samples from being memorised by the model.
+We propose using pre-trained models to address the trade-offs between privacy
+and performance in a continual learning setting. More specifically, we present
+necessary assumptions to enable privacy-preservation and propose combining
+pre-trained models with parameter-free classifiers and parameter-efficient
+adapters that are learned under differential privacy. Our experiments
+demonstrate their effectiveness and provide insights into balancing the
+competing demands of continual learning and privacy.
+ +
+
+ comment: 15 pages, 3 figures, Accepted at Scalable Continual Learning for + Lifelong Foundation Models Workshop at 38th Conference on Neural Information + Processing Systems (NeurIPS 2024) +
+
+
+
+
+ + ☆ Semantic-Aware Resource Management for C-V2X Platooning via Multi-Agent + Reinforcement Learning + + +
+ This paper presents a semantic-aware multi-modal resource allocation (SAMRA)
+approach for multiple tasks using multi-agent reinforcement learning (MARL),
+termed SAMRAMARL, for platoon systems where cellular vehicle-to-everything
+(C-V2X) communication is employed. The proposed approach leverages semantic
+information to optimize the allocation of communication resources. By
+integrating a distributed MARL algorithm, SAMRAMARL enables autonomous
+decision-making for each vehicle, channel assignment optimization, power
+allocation, and semantic symbol length selection based on the contextual
+importance of the transmitted information. This semantic-awareness ensures that
+both vehicle-to-vehicle (V2V) and vehicle-to-infrastructure (V2I)
+communications prioritize data that is critical for maintaining safe and
+efficient platoon operations. The framework also introduces a tailored quality
+of experience (QoE) metric for semantic communication, aiming to maximize QoE
+in V2V links while improving the success rate of semantic information
+transmission (SRS). Extensive simulations have demonstrated that SAMRAMARL
+outperforms existing methods, achieving significant gains in QoE and
+communication efficiency in C-V2X platooning scenarios.
+ +
+
+ comment: This paper has been submitted to IEEE Journal. The source code has + been released + at:https://github.com/qiongwu86/Semantic-Aware-Resource-Management-for-C-V2X-Platooning-via-Multi-Agent-Reinforcement-Learning +
+
+
+
+
+ + ☆ EffiCANet: Efficient Time Series Forecasting with Convolutional + Attention + + +
+ The exponential growth of multivariate time series data from sensor networks
+in domains like industrial monitoring and smart cities requires efficient and
+accurate forecasting models. Current deep learning methods often fail to
+adequately capture long-range dependencies and complex inter-variable
+relationships, especially under real-time processing constraints. These
+limitations arise as many models are optimized for either short-term
+forecasting with limited receptive fields or long-term accuracy at the cost of
+efficiency. Additionally, dynamic and intricate interactions between variables
+in real-world data further complicate modeling efforts. To address these
+limitations, we propose EffiCANet, an Efficient Convolutional Attention Network
+designed to enhance forecasting accuracy while maintaining computational
+efficiency. EffiCANet integrates three key components: (1) a Temporal
+Large-kernel Decomposed Convolution (TLDC) module that captures long-term
+temporal dependencies while reducing computational overhead; (2) an
+Inter-Variable Group Convolution (IVGC) module that captures complex and
+evolving relationships among variables; and (3) a Global Temporal-Variable
+Attention (GTVA) mechanism that prioritizes critical temporal and
+inter-variable features. Extensive evaluations across nine benchmark datasets
+show that EffiCANet achieves up to a 10.02% reduction in MAE over
+state-of-the-art models, while cutting computational costs by 26.2% relative to
+conventional large-kernel convolution methods, thanks to its efficient
+decomposition strategy.
+ +
+
+
+
+
+ + ☆ Enhancing Trust in Clinically Significant Prostate Cancer Prediction + with Multiple Magnetic Resonance Imaging Modalities ML4H + + +
+ In the United States, prostate cancer is the second leading cause of cancer
+death in males, with a predicted 35,250 deaths in 2024. However, most diagnoses
+are non-lethal and deemed clinically insignificant, which means that the
+patient will likely not be impacted by the cancer over their lifetime. As a
+result, numerous research studies have explored the accuracy of predicting
+clinical significance of prostate cancer based on magnetic resonance imaging
+(MRI) modalities and deep neural networks. Despite their high performance,
+these models are not trusted by most clinical scientists as they are trained
+solely on a single modality, whereas clinical scientists often use multiple
+magnetic resonance imaging modalities during their diagnosis. In this paper, we
+investigate combining multiple MRI modalities to train a deep learning model to
+enhance trust in the models for clinically significant prostate cancer
+prediction. The promising performance and proposed training pipeline showcase
+the benefits of incorporating multiple MRI modalities for enhanced trust and
+accuracy.
+ +
+
+ comment: Findings paper presented at Machine Learning for Health (ML4H) + symposium 2024, December 15-16, 2024, Vancouver, Canada, 6 pages +
+
+
+
+
+ + ☆ Centrality Graph Shift Operators for Graph Neural Networks + + +
+ Graph Shift Operators (GSOs), such as the adjacency and graph Laplacian +matrices, play a fundamental role in graph theory and graph representation +learning. Traditional GSOs are typically constructed by normalizing the +adjacency matrix by the degree matrix, a local centrality metric. In this work, +we instead propose and study Centrality GSOs (CGSOs), which normalize adjacency +matrices by global centrality metrics such as the PageRank, $k$-core or count +of fixed length walks. We study spectral properties of the CGSOs, allowing us +to get an understanding of their action on graph signals. We confirm this +understanding by defining and running the spectral clustering algorithm based +on different CGSOs on several synthetic and real-world datasets. We furthermore +outline how our CGSO can act as the message passing operator in any Graph +Neural Network and in particular demonstrate strong performance of a variant of +the Graph Convolutional Network and Graph Attention Network using our CGSOs on +several real-world benchmark datasets. + +
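As a concrete illustration of the construction, the sketch below normalises an adjacency matrix by a diagonal matrix of PageRank scores instead of node degrees. The exact normalisation and the choice of centrality in the paper may differ; this is only a minimal example of the idea.

```python
import networkx as nx
import numpy as np

G = nx.karate_club_graph()
A = nx.to_numpy_array(G)

pr_dict = nx.pagerank(G)                              # global centrality per node
pr = np.array([pr_dict[v] for v in G.nodes()])
C_inv_sqrt = np.diag(pr ** -0.5)

cgso = C_inv_sqrt @ A @ C_inv_sqrt                    # symmetric, PageRank-normalised shift
eigvals = np.linalg.eigvalsh(cgso)
print(eigvals.min(), eigvals.max())                   # spectrum of the candidate operator
```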
+
+
+
+
+ + ☆ IGDrivSim: A Benchmark for the Imitation Gap in Autonomous Driving + + +
+ Developing autonomous vehicles that can navigate complex environments with +human-level safety and efficiency is a central goal in self-driving research. A +common approach to achieving this is imitation learning, where agents are +trained to mimic human expert demonstrations collected from real-world driving +scenarios. However, discrepancies between human perception and the self-driving +car's sensors can introduce an \textit{imitation gap}, leading to imitation +learning failures. In this work, we introduce \textbf{IGDrivSim}, a benchmark +built on top of the Waymax simulator, designed to investigate the effects of +the imitation gap in learning autonomous driving policy from human expert +demonstrations. Our experiments show that this perception gap between human +experts and self-driving agents can hinder the learning of safe and effective +driving behaviors. We further show that combining imitation with reinforcement +learning, using a simple penalty reward for prohibited behaviors, effectively +mitigates these failures. Our code is open-sourced at: +https://github.com/clemgris/IGDrivSim.git. + +
+
+ comment: 8 pages, 4 figures, 1 table +
+
+
+
+
+ + ☆ DISCO: DISCovering Overfittings as Causal Rules for Text Classification + Models + + +
+ With the rapid advancement of neural language models, the deployment of +over-parameterized models has surged, increasing the need for interpretable +explanations comprehensible to human inspectors. Existing post-hoc +interpretability methods, which often focus on unigram features of single input +textual instances, fail to capture the models' decision-making process fully. +Additionally, many methods do not differentiate between decisions based on +spurious correlations and those based on a holistic understanding of the input. +Our paper introduces DISCO, a novel method for discovering global, rule-based +explanations by identifying causal n-gram associations with model predictions. +This method employs a scalable sequence mining technique to extract relevant +text spans from training data, associate them with model predictions, and +conduct causality checks to distill robust rules that elucidate model behavior. +These rules expose potential overfitting and provide insights into misleading +feature combinations. We validate DISCO through extensive testing, +demonstrating its superiority over existing methods in offering comprehensive +insights into complex model behaviors. Our approach successfully identifies all +shortcuts manually introduced into the training data (100% detection rate on +the MultiRC dataset), resulting in an 18.8% regression in model performance -- +a capability unmatched by any other method. Furthermore, DISCO supports +interactive explanations, enabling human inspectors to distinguish spurious +causes in the rule-based output. This alleviates the burden of abundant +instance-wise explanations and helps assess the model's risk when encountering +out-of-distribution (OOD) data. + +
+
+
+
+
+ + ☆ wav2sleep: A Unified Multi-Modal Approach to Sleep Stage Classification + from Physiological Signals ML4H + + +
+ Accurate classification of sleep stages from less obtrusive sensor +measurements such as the electrocardiogram (ECG) or photoplethysmogram (PPG) +could enable important applications in sleep medicine. Existing approaches to +this problem have typically used deep learning models designed and trained to +operate on one or more specific input signals. However, the datasets used to +develop these models often do not contain the same sets of input signals. Some +signals, particularly PPG, are much less prevalent than others, and this has +previously been addressed with techniques such as transfer learning. +Additionally, only training on one or more fixed modalities precludes +cross-modal information transfer from other sources, which has proved valuable +in other problem domains. To address this, we introduce wav2sleep, a unified +model designed to operate on variable sets of input signals during training and +inference. After jointly training on over 10,000 overnight recordings from six +publicly available polysomnography datasets, including SHHS and MESA, wav2sleep +outperforms existing sleep stage classification models across test-time input +combinations including ECG, PPG, and respiratory signals. + +
+
+ comment: Accepted to Machine Learning for Health (ML4H) 2024 +
+
+
+
+
+ + ☆ Cybercrime Prediction via Geographically Weighted Learning + + +
+ Inspired by the success of Geographically Weighted Regression and its
+accounting for spatial variations, we propose GeogGNN -- a graph neural network
+model that accounts for geographical latitude and longitude points. Using a
+synthetically generated dataset, we apply the algorithm for a 4-class
+classification problem in cybersecurity with seemingly realistic geographic
+coordinates centered in the Gulf Cooperation Council region. We demonstrate
+that it has higher accuracy than standard neural networks and convolutional
+neural networks that treat the coordinates as features. Encouraged by the
+improvement in accuracy achieved by the GeogGNN model, we provide a general
+mathematical result that demonstrates that a geometrically weighted neural
+network will, in principle, always display higher accuracy in the
+classification of spatially dependent data by making use of spatial continuity
+and local averaging features.
+ +
+
+ comment: 17 pages, 8 figures, Submitted to the International Jordanian + Cybersecurity Conference 2024 (IJCC24) +
+
+
+
+
+ + ☆ Improved Multi-Task Brain Tumour Segmentation with Synthetic Data + Augmentation + + +
+ This paper presents the winning solution of task 1 and the third-placed +solution of task 3 of the BraTS challenge. The use of automated tools in +clinical practice has increased due to the development of more and more +sophisticated and reliable algorithms. However, achieving clinical standards +and developing tools for real-life scenarios is a major challenge. To this end, +BraTS has organised tasks to find the most advanced solutions for specific +purposes. In this paper, we propose the use of synthetic data to train +state-of-the-art frameworks in order to improve the segmentation of adult +gliomas in a post-treatment scenario, and the segmentation of meningioma for +radiotherapy planning. Our results suggest that the use of synthetic data leads +to more robust algorithms, although the synthetic data generation pipeline is +not directly suited to the meningioma task. The code for these tasks is +available at https://github.com/ShadowTwin41/BraTS_2023_2024_solutions. + +
+
+
+
+
+ + ☆ Brain Tumour Removing and Missing Modality Generation using 3D WDM + + +
+ This paper presents the second-placed solution for task 8 and the
+participation solution for task 7 of BraTS 2024. The adoption of automated
+brain analysis algorithms to support clinical practice is increasing. However,
+many of these algorithms struggle with the presence of brain lesions or the
+absence of certain MRI modalities. The alterations in the brain's morphology
+lead to high variability and thus poor performance of predictive models that
+were trained only on healthy brains. The lack of information that is usually
+provided by some of the missing MRI modalities also reduces the reliability of
+the prediction models trained with all modalities. In order to improve the
+performance of these models, we propose the use of conditional 3D wavelet
+diffusion models. The wavelet transform enabled full-resolution image training
+and prediction on a GPU with 48 GB VRAM, without patching or downsampling,
+preserving all information for prediction. For the inpainting task of BraTS
+2024, the use of a large and variable number of healthy masks and the stability
+and efficiency of the 3D wavelet diffusion model resulted in 0.007, 22.61 and
+0.842 in the validation set and 0.07, 22.8 and 0.91 in the testing set (MSE,
+PSNR and SSIM respectively). The code for these tasks is available at
+https://github.com/ShadowTwin41/BraTS_2023_2024_solutions.
+ +
+
+
+
+
+ + ☆ Sharp Analysis for KL-Regularized Contextual Bandits and RLHF + + +
+ Reverse-Kullback-Leibler (KL) regularization has emerged to be a predominant +technique used to enhance policy optimization in reinforcement learning (RL) +and reinforcement learning from human feedback (RLHF), which forces the learned +policy to stay close to a reference policy. While the effectiveness and +necessity of KL-regularization have been empirically demonstrated in various +practical scenarios, current theoretical analysis of KL-regularized RLHF still +obtains the same $\mathcal{O}(1 / \epsilon^2)$ sample complexity as problems +without KL-regularization. To understand the fundamental distinction between +policy learning objectives with KL-regularization and ones without +KL-regularization, we are the first to theoretically demonstrate the power of +KL-regularization by providing a sharp analysis for KL-regularized contextual +bandits and RLHF, revealing an $\mathcal{O}(1 / \epsilon)$ sample complexity +when $\epsilon$ is sufficiently small. + We further explore the role of data coverage in contextual bandits and RLHF. +While the coverage assumption is commonly employed in offline RLHF to link the +samples from the reference policy to the optimal policy, often at the cost of a +multiplicative dependence on the coverage coefficient, its impact on the sample +complexity of online RLHF remains unclear. Previous theoretical analyses of +online RLHF typically require explicit exploration and additional structural +assumptions on the reward function class. In contrast, we show that with +sufficient coverage from the reference policy, a simple two-stage mixed +sampling strategy can achieve a sample complexity with only an additive +dependence on the coverage coefficient. Our results provide a comprehensive +understanding of the roles of KL-regularization and data coverage in RLHF, +shedding light on the design of more efficient RLHF algorithms. + +
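As background, the reverse-KL-regularised objective and its well-known closed-form maximiser can be written as follows; the notation is standard and may differ from the paper's.

```latex
\max_{\pi} \; \mathbb{E}_{x \sim \rho,\, a \sim \pi(\cdot \mid x)}\!\left[ r(x, a) \right]
  \;-\; \beta \, \mathbb{E}_{x \sim \rho}\!\left[ \mathrm{KL}\!\left( \pi(\cdot \mid x) \,\|\, \pi_{\mathrm{ref}}(\cdot \mid x) \right) \right],
\qquad
\pi^{*}(a \mid x) \;\propto\; \pi_{\mathrm{ref}}(a \mid x)\, \exp\!\big( r(x, a) / \beta \big).
```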
+
+
+
+
+ + ☆ The Impact of Semi-Supervised Learning on Line Segment Detection + + +
+ In this paper we present a method for line segment detection in images, based +on a semi-supervised framework. Leveraging the use of a consistency loss based +on differently augmented and perturbed unlabeled images with a small amount of +labeled data, we show comparable results to fully supervised methods. This +opens up application scenarios where annotation is difficult or expensive, and +for domain specific adaptation of models. We are specifically interested in +real-time and online applications, and investigate small and efficient learning +backbones. Our method is to our knowledge the first to target line detection +using modern state-of-the-art methodologies for semi-supervised learning. We +test the method on both standard benchmarks and domain specific scenarios for +forestry applications, showing the tractability of the proposed method. + +
+
+ comment: 9 pages, 6 figures, 7 tables +
+
+
+
+
+ + ☆ Verification of Neural Networks against Convolutional Perturbations via + Parameterised Kernels + + +
+ We develop a method for the efficient verification of neural networks against +convolutional perturbations such as blurring or sharpening. To define input +perturbations we use well-known camera shake, box blur and sharpen kernels. We +demonstrate that these kernels can be linearly parameterised in a way that +allows for a variation of the perturbation strength while preserving desired +kernel properties. To facilitate their use in neural network verification, we +develop an efficient way of convolving a given input with these parameterised +kernels. The result of this convolution can be used to encode the perturbation +in a verification setting by prepending a linear layer to a given network. This +leads to tight bounds and a high effectiveness in the resulting verification +step. We add further precision by employing input splitting as a branch and +bound strategy. We demonstrate that we are able to verify robustness on a +number of standard benchmarks where the baseline is unable to provide any +safety certificates. To the best of our knowledge, this is the first solution +for verifying robustness against specific convolutional perturbations such as +camera shake. + +
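To illustrate the linear parameterisation, the sketch below forms a blur kernel as a convex combination of the identity kernel and a 3x3 box blur, so a strength parameter t in [0, 1] varies the perturbation while the kernel always sums to one. The kernel choice and size are illustrative assumptions, not the paper's exact kernels.

```python
import numpy as np
from scipy.ndimage import convolve

def box_blur_kernel(t):
    identity = np.zeros((3, 3)); identity[1, 1] = 1.0
    box = np.full((3, 3), 1.0 / 9.0)
    return (1.0 - t) * identity + t * box     # linear in t, kernel always sums to one

image = np.random.default_rng(0).random((28, 28))
for t in (0.0, 0.5, 1.0):
    blurred = convolve(image, box_blur_kernel(t), mode="nearest")
    print(t, float(np.abs(blurred - image).mean()))   # perturbation grows with t
```

Because the kernel is linear in t, the resulting perturbed input is also linear in t, which is what allows the perturbation to be encoded by prepending a linear layer in the verification setting described above.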
+
+
+
+
+ + ☆ On the Inherent Robustness of One-Stage Object Detection against + Out-of-Distribution Data + + +
+ Robustness is a fundamental aspect for developing safe and trustworthy
+models, particularly when they are deployed in the open world. In this work we
+analyze the inherent capability of one-stage object detectors to robustly
+operate in the presence of out-of-distribution (OoD) data. Specifically, we
+propose a novel detection algorithm for detecting unknown objects in image
+data, which leverages the features extracted by the model from each sample.
+Differently from other recent approaches in the literature, our proposal does
+not require retraining the object detector, thereby allowing for the use of
+pretrained models. Our proposed OoD detector exploits the application of
+supervised dimensionality reduction techniques to mitigate the effects of the
+curse of dimensionality on the features extracted by the model. Furthermore, it
+utilizes high-resolution feature maps to identify potential unknown objects in
+an unsupervised fashion. Our experiments analyze the Pareto trade-off between
+the performance detecting known and unknown objects resulting from different
+algorithmic configurations and inference confidence thresholds. We also compare
+the performance of our proposed algorithm to that of logits-based post-hoc OoD
+methods, as well as possible fusion strategies. Finally, we discuss the
+competitiveness of all tested methods against state-of-the-art OoD approaches
+for object detection models over the recently published Unknown Object
+Detection benchmark. The obtained results verify that the performance of
+avant-garde post-hoc OoD detectors can be further improved when combined with
+our proposed algorithm.
+ +
+
+ comment: 12 figures, 4 tables, under review +
+
+
+
+
+ + ☆ Interpreting the Learned Model in MuZero Planning TAAI 2024 + + +
+ MuZero has achieved superhuman performance in various games by using a +dynamics network to predict environment dynamics for planning, without relying +on simulators. However, the latent states learned by the dynamics network make +its planning process opaque. This paper aims to demystify MuZero's model by +interpreting the learned latent states. We incorporate observation +reconstruction and state consistency into MuZero training and conduct an +in-depth analysis to evaluate latent states across two board games: 9x9 Go and +Outer-Open Gomoku, and three Atari games: Breakout, Ms. Pacman, and Pong. Our +findings reveal that while the dynamics network becomes less accurate over +longer simulations, MuZero still performs effectively by using planning to +correct errors. Our experiments also show that the dynamics network learns +better latent states in board games than in Atari games. These insights +contribute to a better understanding of MuZero and offer directions for future +research to improve the playing performance, robustness, and interpretability +of the MuZero algorithm. + +
+
+ comment: Accepted by the 29th International Conference on Technologies and + Applications of Artificial Intelligence (TAAI 2024) +
+
+
+
+
+ + ☆ Towards Robust Federated Analytics via Differentially Private + Measurements of Statistical Heterogeneity + + +
+ Statistical heterogeneity is a measure of how skewed the samples of a dataset +are. It is a common problem in the study of differential privacy that the usage +of a statistically heterogeneous dataset results in a significant loss of +accuracy. In federated scenarios, statistical heterogeneity is more likely to +happen, and so the above problem is even more pressing. We explore the three +most promising ways to measure statistical heterogeneity and give formulae for +their accuracy, while simultaneously incorporating differential privacy. We +find the optimum privacy parameters via an analytic mechanism, which +incorporates root finding methods. We validate the main theorems and related +hypotheses experimentally, and test the robustness of the analytic mechanism to +different heterogeneity levels. The analytic mechanism in a distributed setting +delivers superior accuracy to all combinations involving the classic mechanism +and/or the centralized setting. All measures of statistical heterogeneity do +not lose significant accuracy when a heterogeneous sample is used. + +
+
+ comment: 26 pages, 6 tables, 1 figure +
+
+
+
+
+ + ☆ Higher-Order GNNs Meet Efficiency: Sparse Sobolev Graph Neural Networks + + +
+ Graph Neural Networks (GNNs) have shown great promise in modeling
+relationships between nodes in a graph, but capturing higher-order
+relationships remains a challenge for large-scale networks. Previous studies
+have primarily attempted to utilize the information from higher-order neighbors
+in the graph, involving the incorporation of powers of the shift operator, such
+as the graph Laplacian or adjacency matrix. This approach comes with a
+trade-off in terms of increased computational and memory demands. Relying on
+graph spectral theory, we make a fundamental observation: the regular and the
+Hadamard power of the Laplacian matrix behave similarly in the spectrum. This
+observation has significant implications for capturing higher-order information
+in GNNs for various tasks such as node classification and semi-supervised
+learning. Consequently, we propose a novel graph convolutional operator based
+on the sparse Sobolev norm of graph signals. Our approach, known as Sparse
+Sobolev GNN (S2-GNN), employs Hadamard products between matrices to maintain
+the sparsity level in graph representations. S2-GNN utilizes a cascade of
+filters with increasing Hadamard powers to generate a diverse set of functions.
+We theoretically analyze the stability of S2-GNN to show the robustness of the
+model against possible graph perturbations. We also conduct a comprehensive
+evaluation of S2-GNN across various graph mining, semi-supervised node
+classification, and computer vision tasks. In particular use cases, our
+algorithm demonstrates results competitive with state-of-the-art GNNs in terms
+of both predictive performance and running time.
+ +
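The sparsity argument can be checked numerically in a few lines: the ordinary matrix power of a graph Laplacian densifies quickly, whereas the element-wise (Hadamard) power keeps the original sparsity pattern. The graph and power used below are arbitrary illustrations; the paper's filter cascade is not reproduced here.

```python
import networkx as nx
import numpy as np

G = nx.random_geometric_graph(200, 0.08, seed=1)
L = nx.laplacian_matrix(G).toarray().astype(float)

matrix_power = np.linalg.matrix_power(L, 3)    # L @ L @ L: new non-zeros appear
hadamard_power = L ** 3                        # element-wise cube: same support as L

print("nnz(L)                =", np.count_nonzero(L))
print("nnz(matrix power L^3) =", np.count_nonzero(matrix_power))
print("nnz(Hadamard power)   =", np.count_nonzero(hadamard_power))
```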
+
+
+
+
+ + ☆ Impact of Label Noise on Learning Complex Features NeurIPS 2024 + + +
+ Neural networks trained with stochastic gradient descent exhibit an inductive
+bias towards simpler decision boundaries, typically converging to a narrow
+family of functions, and often fail to capture more complex features. This
+phenomenon raises concerns about the capacity of deep models to adequately
+learn and represent real-world datasets. Traditional approaches such as
+explicit regularization, data augmentation, architectural modifications, etc.,
+have largely proven ineffective in encouraging the models to learn diverse
+features. In this work, we investigate the impact of pre-training models with
+noisy labels on the dynamics of SGD across various architectures and datasets.
+We show that pretraining promotes learning complex functions and diverse
+features in the presence of noise. Our experiments demonstrate that
+pre-training with noisy labels encourages gradient descent to find alternate
+minima that do not solely depend upon simple features, but rather learn a more
+complex and broader set of features, without hurting performance.
+ +
+
+ comment: Accepted at Workshop on Scientific Methods for Understanding Deep + Learning, NeurIPS 2024 +
+
+
+
+
+ + ☆ Constrained Latent Action Policies for Model-Based Offline Reinforcement + Learning NeurIPS + 2024 + + +
+ In offline reinforcement learning, a policy is learned using a static dataset
+in the absence of costly feedback from the environment. In contrast to the
+online setting, only using static datasets poses additional challenges, such as
+policies generating out-of-distribution samples. Model-based offline
+reinforcement learning methods try to overcome these by learning a model of the
+underlying dynamics of the environment and using it to guide policy search.
+This is beneficial, but with limited datasets, errors in the model and the
+issue of value overestimation among out-of-distribution states can worsen
+performance. Current model-based methods apply some notion of conservatism to
+the Bellman update, often implemented using uncertainty estimation derived from
+model ensembles. In this paper, we propose Constrained Latent Action Policies
+(C-LAP) which learns a generative model of the joint distribution of
+observations and actions. We cast policy learning as a constrained objective to
+always stay within the support of the latent action distribution, and use the
+generative capabilities of the model to impose an implicit constraint on the
+generated actions. This eliminates the need to use additional uncertainty
+penalties on the Bellman update and significantly decreases the number of
+gradient steps required to learn a policy. We empirically evaluate C-LAP on the
+D4RL and V-D4RL benchmark, and show that C-LAP is competitive with
+state-of-the-art methods, especially outperforming on datasets with visual
+observations.
+ +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) +
+
+
+
+
+ + ☆ Pruning Literals for Highly Efficient Explainability at Word Level + + +
+ Designing an explainable model has become crucial for Natural Language
+Processing (NLP), since most state-of-the-art machine learning models provide
+only a limited explanation for their predictions. In the spectrum of
+explainable models, the Tsetlin Machine (TM) is promising because of its
+capability of providing word-level explanations using propositional logic.
+However, concerns arise over the elaborate combination of literals
+(propositional logic) in the clauses, which makes the model difficult for
+humans to comprehend, despite having a transparent learning process. In this
+paper, we design a post-hoc pruning of clauses that eliminates the randomly
+placed literals in the clauses, thereby making the model more efficiently
+interpretable than the vanilla TM. Experiments on the publicly available
+YELP-HAT dataset demonstrate that the proposed pruned TM's attention map aligns
+more closely with the human attention map than the vanilla TM's attention map.
+In addition, the pairwise similarity measure also surpasses the attention
+map-based neural network models. In terms of accuracy, the proposed pruning
+method does not degrade accuracy significantly, but rather enhances performance
+by 4% to 9% on some test data.
+ +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Uncertainty Prediction Neural Network (UpNet): Embedding Artificial + Neural Network in Bayesian Inversion Framework to Quantify the Uncertainty of + Remote Sensing Retrieval + + +
+ For the retrieval of large-scale vegetation biophysical parameters, the
+inversion of radiative transfer models (RTMs) is the most commonly used
+approach. In recent years, Artificial Neural Network (ANN)-based methods have
+become the mainstream for inverting RTMs due to their high accuracy and
+computational efficiency, and they have been widely used in the retrieval of
+biophysical variables (BVs). However, due to the lack of a Bayesian
+inversion-theory interpretation, these methods face challenges in quantifying
+the retrieval uncertainty, a crucial metric for product quality validation and
+downstream applications such as data assimilation or ecosystem carbon cycling
+modeling. This study proved that an ANN trained with squared loss outputs the
+posterior mean, providing a rigorous foundation for its uncertainty
+quantification, regularization, and incorporation of prior information. A
+Bayesian theoretical framework was subsequently proposed for ANN-based methods.
+Using this framework, we derived a new algorithm called Uncertainty Prediction
+Neural Network (UpNet), which enables the simultaneous training of two ANNs to
+retrieve BVs and provide retrieval uncertainty. To validate our method, we
+compared UpNet with the standard Bayesian inference method, i.e., Markov Chain
+Monte Carlo (MCMC), in the inversion of a widely used RTM called ProSAIL for
+retrieving BVs and estimating uncertainty. The results demonstrated that the
+BVs retrieved and the uncertainties estimated by UpNet were highly consistent
+with those from MCMC, achieving over a million-fold acceleration. These results
+indicate that UpNet has significant potential for fast retrieval and
+uncertainty quantification of BVs or other parameters with medium and
+high-resolution remote sensing data. Our Python implementation is available at:
+https://github.com/Dash-RSer/UpNet.
+ +
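The squared-loss claim referenced above rests on the standard decomposition of the expected squared error; a sketch in generic notation (not necessarily the paper's) is:

```latex
\mathbb{E}\big[(Y - f(X))^2\big]
  \;=\; \mathbb{E}\big[(Y - \mathbb{E}[Y \mid X])^2\big]
  \;+\; \mathbb{E}\big[(\mathbb{E}[Y \mid X] - f(X))^2\big],
\qquad\Longrightarrow\qquad
f^{*}(x) \;=\; \mathbb{E}[Y \mid X = x],
```

since the first term does not depend on $f$ and the second is minimised by the conditional (posterior) mean.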
+
+ comment: 24 pages, f figures +
+
+
+
+
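+ The core claim quoted above, that an ANN trained with a squared loss approximates the posterior
+ mean, suggests a simple two-network reading of the approach: regress the variable and its square,
+ then recover the variance from the difference. The sketch below is an illustrative reconstruction
+ of that idea only, with assumed network sizes, data, and variable names; it is not the authors'
+ released UpNet code (see their repository for that).
+ ```python
+ import torch
+ import torch.nn as nn
+
+ def mlp(in_dim, out_dim):
+     # Small fully connected regressor; the architecture is an assumption.
+     return nn.Sequential(nn.Linear(in_dim, 64), nn.ReLU(),
+                          nn.Linear(64, 64), nn.ReLU(),
+                          nn.Linear(64, out_dim))
+
+ def train(net, x, target, epochs=200):
+     opt = torch.optim.Adam(net.parameters(), lr=1e-3)
+     for _ in range(epochs):
+         opt.zero_grad()
+         loss = nn.functional.mse_loss(net(x), target)
+         loss.backward()
+         opt.step()
+
+ # x: stand-in for RTM-simulated reflectances, y: the biophysical variable to retrieve
+ x = torch.randn(2048, 8)
+ y = x[:, :1] ** 2 + 0.1 * torch.randn(2048, 1)
+
+ mean_net, second_moment_net = mlp(8, 1), mlp(8, 1)
+ train(mean_net, x, y)                 # squared loss -> approximates E[y | x]
+ train(second_moment_net, x, y ** 2)   # squared loss -> approximates E[y^2 | x]
+
+ with torch.no_grad():
+     mu = mean_net(x)
+     var = (second_moment_net(x) - mu ** 2).clamp(min=0.0)  # posterior variance estimate
+ print(mu[:3].squeeze(), var[:3].sqrt().squeeze())           # retrieval and its 1-sigma uncertainty
+ ```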
+ + ☆ Peri-midFormer: Periodic Pyramid Transformer for Time Series Analysis NeurIPS + 2024 + + +
+ Time series analysis finds wide applications in fields such as weather +forecasting, anomaly detection, and behavior recognition. Previous methods +attempted to model temporal variations directly using 1D time series. However, +this has been quite challenging due to the discrete nature of data points in +time series and the complexity of periodic variation. In terms of periodicity, +taking weather and traffic data as an example, there are multi-periodic +variations such as yearly, monthly, weekly, and daily, etc. In order to break +through the limitations of the previous methods, we decouple the implied +complex periodic variations into inclusion and overlap relationships among +different level periodic components based on the observation of the +multi-periodicity therein and its inclusion relationships. This explicitly +represents the naturally occurring pyramid-like properties in time series, +where the top level is the original time series and lower levels consist of +periodic components with gradually shorter periods, which we call the periodic +pyramid. To further extract complex temporal variations, we introduce +self-attention mechanism into the periodic pyramid, capturing complex periodic +relationships by computing attention between periodic components based on their +inclusion, overlap, and adjacency relationships. Our proposed Peri-midFormer +demonstrates outstanding performance in five mainstream time series analysis +tasks, including short- and long-term forecasting, imputation, classification, +and anomaly detection. + +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) +
+
+
+
+
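+ The "periodic pyramid" described above can be approximated outside the full model: detect dominant
+ periods from the FFT amplitude spectrum and fold the series into components of decreasing period.
+ The snippet below is a minimal sketch of that decomposition step only; the period-selection rule
+ and the folding are assumptions, and the attention over pyramid levels is not reproduced here.
+ ```python
+ import numpy as np
+
+ def dominant_periods(x, k=3):
+     # Pick the k strongest periodicities from the amplitude spectrum (DC term skipped).
+     amp = np.abs(np.fft.rfft(x - x.mean()))
+     bins = np.argsort(amp[1:])[::-1][:k] + 1
+     return sorted({len(x) // b for b in bins if b > 0}, reverse=True)
+
+ def periodic_pyramid(x, k=3):
+     # Top level is the raw series; lower levels fold it into (segments, period) views.
+     levels = [("original", x)]
+     for p in dominant_periods(x, k):
+         if p < 2:
+             continue
+         n = (len(x) // p) * p
+         levels.append((f"period={p}", x[:n].reshape(-1, p)))
+     return levels
+
+ t = np.arange(1024)
+ series = np.sin(2 * np.pi * t / 128) + 0.5 * np.sin(2 * np.pi * t / 32) + 0.1 * np.random.randn(1024)
+ for name, comp in periodic_pyramid(series):
+     print(name, np.shape(comp))
+ ```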
+ + ☆ Measure-to-measure interpolation using Transformers + + +
+ Transformers are deep neural network architectures that underpin the recent +successes of large language models. Unlike more classical architectures that +can be viewed as point-to-point maps, a Transformer acts as a +measure-to-measure map implemented as specific interacting particle system on +the unit sphere: the input is the empirical measure of tokens in a prompt and +its evolution is governed by the continuity equation. In fact, Transformers are +not limited to empirical measures and can in principle process any input +measure. As the nature of data processed by Transformers is expanding rapidly, +it is important to investigate their expressive power as maps from an arbitrary +measure to another arbitrary measure. To that end, we provide an explicit +choice of parameters that allows a single Transformer to match $N$ arbitrary +input measures to $N$ arbitrary target measures, under the minimal assumption +that every pair of input-target measures can be matched by some transport map. + +
+
+
+
+
+ + ☆ Vision Language Models are In-Context Value Learners + + +
+ Predicting temporal progress from visual trajectories is important for
+intelligent robots that can learn, adapt, and improve. However, learning such
+a progress estimator, or temporal value function, across different tasks and
+domains requires both a large amount of diverse data and methods which can
+scale and generalize. To address these challenges, we present Generative Value
+Learning (GVL), a universal value function estimator that leverages the world
+knowledge embedded in vision-language models (VLMs) to predict task progress.
+Naively asking a VLM to predict values for a video sequence performs poorly due
+to the strong temporal correlation between successive frames. Instead, GVL
+poses value estimation as a temporal ordering problem over shuffled video
+frames; this seemingly more challenging task encourages VLMs to more fully
+exploit their underlying semantic and temporal grounding capabilities to
+differentiate frames based on their perceived task progress, consequently
+producing significantly better value predictions. Without any robot or task
+specific training, GVL can in-context zero-shot and few-shot predict effective
+values for more than 300 distinct real-world tasks across diverse robot
+platforms, including challenging bimanual manipulation tasks. Furthermore, we
+demonstrate that GVL permits flexible multi-modal in-context learning via
+examples from heterogeneous tasks and embodiments, such as human videos. The
+generality of GVL enables various downstream applications pertinent to
+visuomotor policy learning, including dataset filtering, success detection, and
+advantage-weighted regression -- all without any model training or finetuning.
+
+
+
+ comment: Project website and demo: + https://generative-value-learning.github.io/ +
+
+
+
+
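+ The key trick in the abstract is to turn value prediction into a temporal-ordering query over
+ shuffled frames. A minimal sketch of that wrapper logic follows; `query_vlm` is a hypothetical
+ stand-in for whatever multimodal model is used, and the prompt wording is an assumption, since the
+ paper's prompts and models are not reproduced here.
+ ```python
+ import random
+
+ def query_vlm(prompt, frames):
+     # Hypothetical stand-in for a real multimodal model call; it returns one
+     # uninformative placeholder progress estimate per frame so the sketch runs.
+     return [0.5 for _ in frames]
+
+ def generative_value_estimates(frames, task_description):
+     # Shuffle frames to break the trivial "later frame = higher value" correlation,
+     # ask the VLM to judge per-frame task progress, then restore temporal order.
+     order = list(range(len(frames)))
+     random.shuffle(order)
+     shuffled = [frames[i] for i in order]
+     prompt = (f"Task: {task_description}. For each image, estimate how complete "
+               f"the task is as a fraction between 0 and 1.")
+     progress = query_vlm(prompt, shuffled)
+     values = [0.0] * len(frames)
+     for shuffled_pos, original_idx in enumerate(order):
+         values[original_idx] = progress[shuffled_pos]
+     return values
+
+ frames = [f"frame_{i}.jpg" for i in range(8)]   # placeholder frame references
+ print(generative_value_estimates(frames, "fold the towel"))
+ ```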
+ + ☆ Hypercube Policy Regularization Framework for Offline Reinforcement + Learning + + +
+ Offline reinforcement learning has received extensive attention from scholars
+because it avoids interaction between the agent and the environment by
+learning a policy from a static dataset. However, general reinforcement
+learning methods cannot obtain satisfactory results in offline reinforcement
+learning due to out-of-distribution state-action pairs that the dataset cannot
+cover during training. To solve this problem, policy regularization methods,
+which try to directly clone the policies used in static datasets, have received
+considerable study due to their simplicity and effectiveness. However, policy
+constraint methods force the agent to choose the corresponding actions in the
+static dataset. This type of constraint is usually over-conservative, which
+results in suboptimal policies, especially with low-quality static datasets. In
+this paper, a hypercube policy regularization framework is proposed. This
+method alleviates the constraints of policy constraint methods by allowing the
+agent to explore the actions corresponding to similar states in the static
+dataset, which increases the effectiveness of algorithms on low-quality
+datasets. It is also theoretically demonstrated that the hypercube policy
+regularization framework can effectively improve the performance of original
+algorithms. In addition, the hypercube policy regularization framework is
+combined with TD3-BC and Diffusion-QL, yielding TD3-BC-C and Diffusion-QL-C,
+which are evaluated on D4RL datasets. The experimental results
+demonstrate that TD3-BC-C and Diffusion-QL-C perform better than
+state-of-the-art algorithms like IQL, CQL, TD3-BC and Diffusion-QL in most D4RL
+environments within comparable time.
+
+
+
+
+
+
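+ The hypercube idea, as described, relaxes "clone the dataset action for this exact state" to "any
+ action whose state lies within a small per-dimension box around the current state is admissible".
+ A numpy sketch of that neighbourhood lookup and the resulting behaviour-cloning penalty follows;
+ the box width and the choice of the closest admissible action are assumptions, not the paper's
+ exact formulation.
+ ```python
+ import numpy as np
+
+ def hypercube_actions(state, dataset_states, dataset_actions, half_width=0.1):
+     # Actions taken from any dataset state inside the axis-aligned hypercube
+     # |s_i - state_i| <= half_width for every state dimension i.
+     inside = np.all(np.abs(dataset_states - state) <= half_width, axis=1)
+     return dataset_actions[inside]
+
+ def bc_regularizer(policy_action, state, dataset_states, dataset_actions, half_width=0.1):
+     # Penalize distance to the closest admissible action instead of the single
+     # action stored for this exact state (the usual, more conservative choice).
+     candidates = hypercube_actions(state, dataset_states, dataset_actions, half_width)
+     if len(candidates) == 0:
+         return 0.0
+     dists = np.linalg.norm(candidates - policy_action, axis=1)
+     return float(dists.min() ** 2)
+
+ rng = np.random.default_rng(0)
+ S = rng.uniform(-1, 1, size=(5000, 3))        # dataset states
+ A = rng.uniform(-1, 1, size=(5000, 2))        # dataset actions
+ print(bc_regularizer(np.array([0.2, -0.3]), np.array([0.1, 0.1, 0.1]), S, A))
+ ```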
+ + ☆ Neural Fingerprints for Adversarial Attack Detection + + +
+ Deep learning models for image classification have become standard tools in +recent years. A well known vulnerability of these models is their +susceptibility to adversarial examples. These are generated by slightly +altering an image of a certain class in a way that is imperceptible to humans +but causes the model to classify it wrongly as another class. Many algorithms +have been proposed to address this problem, falling generally into one of two +categories: (i) building robust classifiers (ii) directly detecting attacked +images. Despite the good performance of these detectors, we argue that in a +white-box setting, where the attacker knows the configuration and weights of +the network and the detector, they can overcome the detector by running many +examples on a local copy, and sending only those that were not detected to the +actual model. This problem is common in security applications where even a very +good model is not sufficient to ensure safety. In this paper we propose to +overcome this inherent limitation of any static defence with randomization. To +do so, one must generate a very large family of detectors with consistent +performance, and select one or more of them randomly for each input. For the +individual detectors, we suggest the method of neural fingerprints. In the +training phase, for each class we repeatedly sample a tiny random subset of +neurons from certain layers of the network, and if their average is +sufficiently different between clean and attacked images of the focal class +they are considered a fingerprint and added to the detector bank. During test +time, we sample fingerprints from the bank associated with the label predicted +by the model, and detect attacks using a likelihood ratio test. We evaluate our +detectors on ImageNet with different attack methods and model architectures, +and show near-perfect detection with low rates of false detection. + +
+
+ comment: 14 pages +
+
+
+
+
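+ A compact way to read the fingerprinting recipe above: repeatedly sample tiny neuron subsets, keep
+ those whose mean activation separates clean from attacked examples of the focal class, and at test
+ time combine randomly drawn fingerprints in a likelihood-ratio style score. The sketch below works
+ on precomputed activation matrices and uses a simple Gaussian log-likelihood ratio; the subset
+ size, separation test, and thresholds are assumptions rather than the paper's exact choices.
+ ```python
+ import numpy as np
+
+ def build_fingerprints(acts_clean, acts_attacked, n_fingerprints=200, subset=16, min_gap=0.5):
+     # acts_*: (num_examples, num_neurons) activations for one focal class.
+     rng = np.random.default_rng(0)
+     bank = []
+     for _ in range(n_fingerprints):
+         idx = rng.choice(acts_clean.shape[1], size=subset, replace=False)
+         c = acts_clean[:, idx].mean(axis=1)
+         a = acts_attacked[:, idx].mean(axis=1)
+         pooled_std = 0.5 * (c.std() + a.std()) + 1e-8
+         if abs(c.mean() - a.mean()) / pooled_std > min_gap:   # keep only separating subsets
+             bank.append((idx, c.mean(), c.std() + 1e-8, a.mean(), a.std() + 1e-8))
+     return bank
+
+ def log_likelihood_ratio(act, bank, n_sampled=32):
+     # Positive score -> activations look attacked rather than clean.
+     rng = np.random.default_rng()
+     chosen = [bank[i] for i in rng.choice(len(bank), size=min(n_sampled, len(bank)), replace=False)]
+     score = 0.0
+     for idx, mu_c, sd_c, mu_a, sd_a in chosen:
+         v = act[idx].mean()
+         score += (-0.5 * ((v - mu_a) / sd_a) ** 2 - np.log(sd_a)) \
+                - (-0.5 * ((v - mu_c) / sd_c) ** 2 - np.log(sd_c))
+     return score
+
+ rng = np.random.default_rng(1)
+ clean = rng.normal(0.0, 1.0, size=(500, 512))
+ attacked = rng.normal(0.8, 1.0, size=(500, 512))   # synthetic activation shift for illustration
+ bank = build_fingerprints(clean, attacked)
+ print(len(bank), log_likelihood_ratio(attacked[0], bank))
+ ```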
+ + ☆ Real-time stress detection on social network posts using big data + technology + + +
+ In the context of modern life, particularly in Industry 4.0 within the online
+space, emotions and moods are frequently conveyed through social media posts.
+The trend of sharing stories, thoughts, and feelings on these platforms
+generates a vast and promising data source for Big Data. This creates both a
+challenge and an opportunity for research in applying technology to develop
+more automated and accurate methods for detecting stress in social media users.
+In this study, we developed a real-time system for stress detection in online
+posts, using the "Dreaddit: A Reddit Dataset for Stress Analysis in Social
+Media," which comprises 187,444 posts across five different Reddit domains.
+Each domain contains texts with both stressful and non-stressful content,
+showcasing various expressions of stress. A labeled dataset of 3,553 lines was
+created for training. Apache Kafka, PySpark, and AirFlow were utilized to build
+and deploy the model. Logistic Regression yielded the best results on new
+streaming data, achieving 69.39% accuracy and an F1-score of 68.97.
+
+
+
+ comment: 6 pages, 4 figures +
+
+
+
+
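+ The streaming stack (Kafka, PySpark, Airflow) is the deployment side of the system; the classifier
+ itself is a standard text model. As a rough offline counterpart, the sketch below trains a TF-IDF
+ plus Logistic Regression baseline on labeled post texts; the toy data, label encoding, and
+ hyperparameters are assumptions, and this is not the authors' pipeline.
+ ```python
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import accuracy_score, f1_score
+ from sklearn.model_selection import train_test_split
+ from sklearn.pipeline import make_pipeline
+
+ # posts: list of post texts; labels: 1 = stressful, 0 = not stressful (assumed encoding)
+ posts = ["I cannot sleep, everything is too much", "Had a great hike today"] * 100
+ labels = [1, 0] * 100
+
+ X_train, X_test, y_train, y_test = train_test_split(posts, labels, test_size=0.2, random_state=0)
+ model = make_pipeline(TfidfVectorizer(min_df=2, ngram_range=(1, 2)),
+                       LogisticRegression(max_iter=1000))
+ model.fit(X_train, y_train)
+ pred = model.predict(X_test)
+ print("accuracy", accuracy_score(y_test, pred), "f1", f1_score(y_test, pred))
+ ```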
+ + ☆ Continuous Sign Language Recognition System using Deep Learning with + MediaPipe Holistic + + +
+ Sign languages are the language of hearing-impaired people who use visuals +like the hand, facial, and body movements for communication. There are +different signs and gestures representing alphabets, words, and phrases. +Nowadays approximately 300 sign languages are being practiced worldwide such as +American Sign Language (ASL), Chinese Sign Language (CSL), Indian Sign Language +(ISL), and many more. Sign languages are dependent on the vocal language of a +place. Unlike vocal or spoken languages, there are no helping words in sign +language like is, am, are, was, were, will, be, etc. As only a limited +population is well-versed in sign language, this lack of familiarity of sign +language hinders hearing-impaired people from communicating freely and easily +with everyone. This issue can be addressed by a sign language recognition (SLR) +system which has the capability to translate the sign language into vocal +language. In this paper, a continuous SLR system is proposed using a deep +learning model employing Long Short-Term Memory (LSTM), trained and tested on +an ISL primary dataset. This dataset is created using MediaPipe Holistic +pipeline for tracking face, hand, and body movements and collecting landmarks. +The system recognizes the signs and gestures in real-time with 88.23% accuracy. + +
+
+ comment: 14 pages, 4 figures, Wireless Pers Commun +
+
+
+
+
+ + ☆ Normalized Space Alignment: A Versatile Metric for Representation + Analysis + + +
+ We introduce a manifold analysis technique for neural network +representations. Normalized Space Alignment (NSA) compares pairwise distances +between two point clouds derived from the same source and having the same size, +while potentially possessing differing dimensionalities. NSA can act as both an +analytical tool and a differentiable loss function, providing a robust means of +comparing and aligning representations across different layers and models. It +satisfies the criteria necessary for both a similarity metric and a neural +network loss function. We showcase NSA's versatility by illustrating its +utility as a representation space analysis metric, a structure-preserving loss +function, and a robustness analysis tool. NSA is not only computationally +efficient but it can also approximate the global structural discrepancy during +mini-batching, facilitating its use in a wide variety of neural network +training paradigms. + +
+
+ comment: Under Review +
+
+
+
+
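+ NSA, as summarized above, compares pairwise distance structure between two equally sized point
+ clouds of possibly different dimensionality. A minimal differentiable sketch of that comparison is
+ shown below, using mean-normalized pairwise distance matrices and a squared discrepancy; the exact
+ normalization used by the paper is an assumption here.
+ ```python
+ import torch
+
+ def normalized_space_alignment(x, y, eps=1e-8):
+     # x: (n, d1), y: (n, d2) -- the same n points represented in two spaces.
+     dx = torch.cdist(x, x)
+     dy = torch.cdist(y, y)
+     dx = dx / (dx.mean() + eps)          # scale-invariant comparison (assumed normalization)
+     dy = dy / (dy.mean() + eps)
+     return ((dx - dy) ** 2).mean()
+
+ a = torch.randn(128, 256)
+ b = torch.randn(128, 32, requires_grad=True)
+ loss = normalized_space_alignment(a, b)   # usable both as an analysis metric and as a loss
+ loss.backward()
+ print(float(loss), b.grad.shape)
+ ```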
+ + ☆ Improve the Fitting Accuracy of Deep Learning for the Nonlinear + Schrödinger Equation Using Linear Feature Decoupling Method + + +
+ We utilize the Feature Decoupling Distributed (FDD) method to enhance the
+capability of deep learning to fit the Nonlinear Schrödinger Equation (NLSE),
+significantly reducing the NLSE loss compared to models without decoupling.
+
+
+
+
+
+
+ + ☆ Synergy-Guided Regional Supervision of Pseudo Labels for Semi-Supervised + Medical Image Segmentation + + +
+ Semi-supervised learning has received considerable attention for its +potential to leverage abundant unlabeled data to enhance model robustness. +Pseudo labeling is a widely used strategy in semi supervised learning. However, +existing methods often suffer from noise contamination, which can undermine +model performance. To tackle this challenge, we introduce a novel +Synergy-Guided Regional Supervision of Pseudo Labels (SGRS-Net) framework. +Built upon the mean teacher network, we employ a Mix Augmentation module to +enhance the unlabeled data. By evaluating the synergy before and after +augmentation, we strategically partition the pseudo labels into distinct +regions. Additionally, we introduce a Region Loss Evaluation module to assess +the loss across each delineated area. Extensive experiments conducted on the LA +dataset have demonstrated superior performance over state-of-the-art +techniques, underscoring the efficiency and practicality of our framework. + +
+
+
+
+
+ + ☆ Series-to-Series Diffusion Bridge Model + + +
+ Diffusion models have risen to prominence in time series forecasting, +showcasing their robust capability to model complex data distributions. +However, their effectiveness in deterministic predictions is often constrained +by instability arising from their inherent stochasticity. In this paper, we +revisit time series diffusion models and present a comprehensive framework that +encompasses most existing diffusion-based methods. Building on this theoretical +foundation, we propose a novel diffusion-based time series forecasting model, +the Series-to-Series Diffusion Bridge Model ($\mathrm{S^2DBM}$), which +leverages the Brownian Bridge process to reduce randomness in reverse +estimations and improves accuracy by incorporating informative priors and +conditions derived from historical time series data. Experimental results +demonstrate that $\mathrm{S^2DBM}$ delivers superior performance in +point-to-point forecasting and competes effectively with other diffusion-based +models in probabilistic forecasting. + +
+
+
+
+
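+ The Brownian bridge view pins the process at both ends: the forward process interpolates between a
+ history-derived prior and the target window, with noise that vanishes at both endpoints. The small
+ numpy sketch below illustrates only that forward interpolation as a reading aid; it is not the
+ S^2DBM training or sampling code, and the noise scale and the choice of prior are assumptions.
+ ```python
+ import numpy as np
+
+ def brownian_bridge_sample(x0, xT, t, T, sigma=0.1):
+     # Interpolate between the target x0 (t=0) and the prior/condition xT (t=T);
+     # the variance t*(T-t)/T vanishes at both endpoints, unlike a standard diffusion.
+     w = t / T
+     mean = (1.0 - w) * x0 + w * xT
+     std = sigma * np.sqrt(t * (T - t) / T)
+     return mean + std * np.random.randn(*np.shape(x0))
+
+ x0 = np.sin(np.linspace(0, 4 * np.pi, 96))        # future window to be predicted
+ xT = np.full_like(x0, x0.mean())                  # simple history-derived prior (assumed)
+ for t in (0.0, 0.5, 1.0):
+     xt = brownian_bridge_sample(x0, xT, t, T=1.0)
+     print(f"t={t:.1f}  deviation from the bridge mean: {np.abs(xt - ((1 - t) * x0 + t * xT)).max():.3f}")
+ ```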
+ + ☆ LLM-R: A Framework for Domain-Adaptive Maintenance Scheme Generation + Combining Hierarchical Agents and RAG + + +
+ The increasing use of smart devices has emphasized the critical role of
+maintenance in production activities. Interactive Electronic Technical Manuals
+(IETMs) are vital tools that support the maintenance of smart equipment.
+However, traditional IETMs face challenges such as transitioning from Graphical
+User Interfaces (GUIs) to natural Language User Interfaces (LUIs) and managing
+complex logical relationships. Additionally, they must meet the current demands
+for higher intelligence. This paper proposes a Maintenance Scheme Generation
+Method based on Large Language Models (LLM-R). The proposed method includes
+several key innovations: we propose the Low Rank Adaptation-Knowledge Retention
+(LORA-KR) loss technology to proportionally adjust mixed maintenance data for
+fine-tuning the LLM. This method prevents knowledge conflicts caused by mixed
+data, improving the model's adaptability and reasoning ability in specific
+maintenance domains. In addition, Hierarchical Task-Based Agent and
+Instruction-level Retrieval-Augmented Generation (RAG) technologies are adopted
+to optimize the generation steps and mitigate hallucination
+caused by the model's inability to access contextual information. This
+enhancement improves the model's flexibility and accuracy in handling known or
+unknown maintenance objects and maintenance scheme scenarios. To validate the
+proposed method's effectiveness in maintenance tasks, a maintenance scheme
+dataset was constructed using objects from different fields. The experimental
+results show that the accuracy of the maintenance schemes generated by the
+proposed method reached 91.59%, indicating that the proposed method enhances
+the intelligence of maintenance schemes and introduces novel technical approaches
+for equipment maintenance.
+
+
+
+ comment: 30 pages, 7 figures +
+
+
+
+
+ + ☆ Enabling Adaptive Agent Training in Open-Ended Simulators by Targeting + Diversity NeurIPS 2024 + + +
+ The wider application of end-to-end learning methods to embodied +decision-making domains remains bottlenecked by their reliance on a +superabundance of training data representative of the target domain. +Meta-reinforcement learning (meta-RL) approaches abandon the aim of zero-shot +generalization--the goal of standard reinforcement learning (RL)--in favor of +few-shot adaptation, and thus hold promise for bridging larger generalization +gaps. While learning this meta-level adaptive behavior still requires +substantial data, efficient environment simulators approaching real-world +complexity are growing in prevalence. Even so, hand-designing sufficiently +diverse and numerous simulated training tasks for these complex domains is +prohibitively labor-intensive. Domain randomization (DR) and procedural +generation (PG), offered as solutions to this problem, require simulators to +possess carefully-defined parameters which directly translate to meaningful +task diversity--a similarly prohibitive assumption. In this work, we present +DIVA, an evolutionary approach for generating diverse training tasks in such +complex, open-ended simulators. Like unsupervised environment design (UED) +methods, DIVA can be applied to arbitrary parameterizations, but can +additionally incorporate realistically-available domain knowledge--thus +inheriting the flexibility and generality of UED, and the supervised structure +embedded in well-designed simulators exploited by DR and PG. Our empirical +results showcase DIVA's unique ability to overcome complex parameterizations +and successfully train adaptive agent behavior, far outperforming competitive +baselines from prior literature. These findings highlight the potential of such +semi-supervised environment design (SSED) approaches, of which DIVA is the +first humble constituent, to enable training in realistic simulated domains, +and produce more robust and capable adaptive agents. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ GPT-Guided Monte Carlo Tree Search for Symbolic Regression in Financial + Fraud Detection + + +
+ With the increasing number of financial services available online, the rate +of financial fraud has also been increasing. The traffic and transaction rates +on the internet have increased considerably, leading to a need for fast +decision-making. Financial institutions also have stringent regulations that +often require transparency and explainability of the decision-making process. +However, most state-of-the-art algorithms currently used in the industry are +highly parameterized black-box models that rely on complex computations to +generate a score. These algorithms are inherently slow and lack the +explainability and speed of traditional rule-based learners. This work +introduces SR-MCTS (Symbolic Regression MCTS), which utilizes a foundational +GPT model to guide the MCTS, significantly enhancing its convergence speed and +the quality of the generated expressions which are further extracted to rules. +Our experiments show that SR-MCTS can detect fraud more efficiently than widely +used methods in the industry while providing substantial insights into the +decision-making process. + +
+
+ comment: ACM International Conference on Information and Knowledge Management + 2024 RAG - Enterprise +
+
+
+
+
+ + ☆ Comparing Fairness of Generative Mobility Models + + +
+ This work examines the fairness of generative mobility models, addressing the +often overlooked dimension of equity in model performance across geographic +regions. Predictive models built on crowd flow data are instrumental in +understanding urban structures and movement patterns; however, they risk +embedding biases, particularly in spatiotemporal contexts where model +performance may reflect and reinforce existing inequities tied to geographic +distribution. We propose a novel framework for assessing fairness by measuring +the utility and equity of generated traces. Utility is assessed via the Common +Part of Commuters (CPC), a similarity metric comparing generated and real +mobility flows, while fairness is evaluated using demographic parity. By +reformulating demographic parity to reflect the difference in CPC distribution +between two groups, our analysis reveals disparities in how various models +encode biases present in the underlying data. We utilized four models (Gravity, +Radiation, Deep Gravity, and Non-linear Gravity) and our results indicate that +traditional gravity and radiation models produce fairer outcomes, although Deep +Gravity achieves higher CPC. This disparity underscores a trade-off between +model accuracy and equity, with the feature-rich Deep Gravity model amplifying +pre-existing biases in community representations. Our findings emphasize the +importance of integrating fairness metrics in mobility modeling to avoid +perpetuating inequities. + +
+
+ comment: 2 pages, Accepted at the Network Mobility (NetMob) 2024 conference +
+
+
+
+
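+ Both ingredients of the proposed fairness check are simple to state: CPC is the overlap between
+ generated and observed flow matrices, and the parity term compares how CPC distributes across two
+ geographic groups. A numpy sketch under that reading follows; the grouping scheme and the use of a
+ mean gap are assumptions, not necessarily the paper's exact reformulation.
+ ```python
+ import numpy as np
+
+ def common_part_of_commuters(generated, real):
+     # CPC = 2 * sum(min(gen, real)) / (sum(gen) + sum(real)), a similarity in [0, 1].
+     generated, real = np.asarray(generated, float), np.asarray(real, float)
+     return 2.0 * np.minimum(generated, real).sum() / (generated.sum() + real.sum())
+
+ def demographic_parity_gap(cpc_scores, group_labels):
+     # Difference in mean CPC between two groups of regions; 0 means equal utility.
+     cpc_scores, group_labels = np.asarray(cpc_scores), np.asarray(group_labels)
+     return abs(cpc_scores[group_labels == 0].mean() - cpc_scores[group_labels == 1].mean())
+
+ rng = np.random.default_rng(1)
+ cpcs = [common_part_of_commuters(rng.poisson(5, (10, 10)), rng.poisson(5, (10, 10)))
+         for _ in range(40)]
+ groups = np.array([0] * 20 + [1] * 20)   # e.g. urban vs rural regions (assumed grouping)
+ print("mean CPC:", np.mean(cpcs), "parity gap:", demographic_parity_gap(cpcs, groups))
+ ```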
+ + ☆ Scaling Laws for Pre-training Agents and World Models + + +
+ The performance of embodied agents has been shown to improve by increasing
+model parameters, dataset size, and compute. This has been demonstrated in
+domains from robotics to video games, when generative learning objectives on
+offline datasets (pre-training) are used to model an agent's behavior
+(imitation learning) or their environment (world modeling). This paper
+characterizes the role of scale in these tasks more precisely. Going beyond the
+simple intuition that 'bigger is better', we show that the same types of power
+laws found in language modeling (e.g. between loss and optimal model size),
+also arise in world modeling and imitation learning. However, the coefficients
+of these laws are heavily influenced by the tokenizer, task, and architecture --
+this has important implications on the optimal sizing of models and data.
+
+
+
+
+
+
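+ The claim is that losses follow the familiar power-law-plus-offset form, with coefficients that
+ shift with tokenizer, task, and architecture. A minimal curve-fitting sketch for one such law
+ (loss versus model size) follows; the synthetic data and the functional form L(N) = a*N^(-alpha) + c
+ are standard assumptions for illustration, not the paper's fitted values.
+ ```python
+ import numpy as np
+ from scipy.optimize import curve_fit
+
+ def power_law(n, a, alpha, c):
+     # L(N) = a * N^(-alpha) + c : irreducible loss c plus a power-law term.
+     return a * np.power(n, -alpha) + c
+
+ model_sizes = np.array([1e6, 3e6, 1e7, 3e7, 1e8, 3e8, 1e9])
+ losses = power_law(model_sizes, 12.0, 0.28, 1.8) * (1 + 0.01 * np.random.randn(7))  # synthetic
+
+ (a, alpha, c), _ = curve_fit(power_law, model_sizes, losses, p0=(10.0, 0.3, 1.0), maxfev=10000)
+ print(f"fitted exponent alpha={alpha:.3f}, irreducible loss c={c:.3f}")
+ # Comparing the fitted (a, alpha, c) across tokenizers, tasks, or architectures is what
+ # reveals the coefficient shifts the abstract refers to.
+ ```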
+ + ☆ Towards Unifying Interpretability and Control: Evaluation via + Intervention + + +
+ With the growing complexity and capability of large language models, a need +to understand model reasoning has emerged, often motivated by an underlying +goal of controlling and aligning models. While numerous interpretability and +steering methods have been proposed as solutions, they are typically designed +either for understanding or for control, seldom addressing both, with the +connection between interpretation and control more broadly remaining tenuous. +Additionally, the lack of standardized applications, motivations, and +evaluation metrics makes it difficult to assess these methods' practical +utility and efficacy. To address this, we propose intervention as a fundamental +goal of interpretability and introduce success criteria to evaluate how well +methods are able to control model behavior through interventions. We unify and +extend four popular interpretability methods--sparse autoencoders, logit lens, +tuned lens, and probing--into an abstract encoder-decoder framework. This +framework maps intermediate latent representations to human-interpretable +feature spaces, enabling interventions on these interpretable features, which +can then be mapped back to latent representations to control model outputs. We +introduce two new evaluation metrics: intervention success rate and the +coherence-intervention tradeoff, designed to measure the accuracy of +explanations and their utility in controlling model behavior. Our findings +reveal that (1) although current methods allow for intervention, they are +inconsistent across models and features, (2) lens-based methods outperform +others in achieving simple, concrete interventions, and (3) interventions often +compromise model performance and coherence, underperforming simpler +alternatives, such as prompting, for steering model behavior and highlighting a +critical shortcoming of current interpretability approaches in real-world +applications requiring control. + +
+
+
+
+
+ + ☆ Unsupervised Abnormal Stop Detection for Long Distance Coaches with + Low-Frequency GPS + + +
+ In our urban life, long-distance coaches provide a convenient yet economical
+means of public transportation. One notable problem is to
+discover abnormal stops of coaches, typically caused by
+illegal pick-ups along the way, which can endanger the safety of passengers.
+Detecting abnormal coach stops from low-quality GPS data has therefore become a
+pressing issue. In this paper, we propose an unsupervised method that helps
+transportation managers efficiently perform Abnormal Stop Detection
+(ASD) for long-distance coaches. Concretely, our method converts the ASD
+problem into an unsupervised clustering framework in which both the normal stops
+and the abnormal ones are decomposed. Firstly, we propose a stop duration model
+for low-frequency GPS based on the assumption that a coach changes speed
+approximately linearly. Secondly, we strip the abnormal stops from
+the normal stop points using a low-rank assumption. The proposed method is
+conceptually simple yet efficient: by leveraging the low-rank assumption to handle
+normal stop points, our approach enables domain experts to discover abnormal stops
+for coaches, as illustrated in a case study motivated by traffic managers. Dataset
+and code are publicly available at: https://github.com/pangjunbiao/IPPs.
+
+
+
+
+
+
+ + ☆ Variational Low-Rank Adaptation Using IVON NeurIPS 2024 + + +
+ We show that variational learning can significantly improve the accuracy and +calibration of Low-Rank Adaptation (LoRA) without a substantial increase in the +cost. We replace AdamW by the Improved Variational Online Newton (IVON) +algorithm to finetune large language models. For Llama-2 with 7 billion +parameters, IVON improves the accuracy over AdamW by 2.8% and expected +calibration error by 4.6%. The accuracy is also better than the other Bayesian +alternatives, yet the cost is lower and the implementation is easier. Our work +provides additional evidence for the effectiveness of IVON for large language +models. The code is available at +https://github.com/team-approx-bayes/ivon-lora. + +
+
+ comment: Published at 38th Workshop on Fine-Tuning in Machine Learning + (NeurIPS 2024). Code available at + https://github.com/team-approx-bayes/ivon-lora +
+
+
+
+
+ + ☆ BendVLM: Test-Time Debiasing of Vision-Language Embeddings + + +
+ Vision-language model (VLM) embeddings have been shown to encode biases +present in their training data, such as societal biases that prescribe negative +characteristics to members of various racial and gender identities. VLMs are +being quickly adopted for a variety of tasks ranging from few-shot +classification to text-guided image generation, making debiasing VLM embeddings +crucial. Debiasing approaches that fine-tune the VLM often suffer from +catastrophic forgetting. On the other hand, fine-tuning-free methods typically +utilize a "one-size-fits-all" approach that assumes that correlation with the +spurious attribute can be explained using a single linear direction across all +possible inputs. In this work, we propose Bend-VLM, a nonlinear, +fine-tuning-free approach for VLM embedding debiasing that tailors the +debiasing operation to each unique input. This allows for a more flexible +debiasing approach. Additionally, we do not require knowledge of the set of +inputs a priori to inference time, making our method more appropriate for +online, open-set tasks such as retrieval and text guided image generation. + +
+
+
+
+
+ + ♻ ☆ A Comparative Analysis of U-Net-based models for Segmentation of Cardiac + MRI + + +
+ Medical imaging refers to the technologies and methods utilized to view the
+human body and its inside, in order to diagnose, monitor, or even treat medical
+disorders. This paper aims to explore the application of deep learning
+techniques in the semantic segmentation of cardiac short-axis MRI (Magnetic
+Resonance Imaging) images, aiming to enhance the diagnosis, monitoring, and
+treatment of medical disorders related to the heart. The focus centers on
+implementing various architectures that are derivatives of U-Net, to
+effectively isolate specific parts of the heart for comprehensive anatomical
+and functional analysis. Through a combination of images, graphs, and
+quantitative metrics, the efficacy of the models and their predictions are
+showcased. Additionally, this paper addresses the challenges encountered and
+outlines strategies for future improvements. This abstract provides a concise
+overview of the efforts in utilizing deep learning for cardiac image
+segmentation, emphasizing both the accomplishments and areas for further
+refinement.
+
+
+
+
+
+
+ + ♻ ☆ Meta-Models: An Architecture for Decoding LLM Behaviors Through + Interpreted Embeddings and Natural Language + + +
+ As Large Language Models (LLMs) become increasingly integrated into our daily
+lives, the potential harms from deceptive behavior underlie the need for
+faithfully interpreting their decision-making. While traditional probing
+methods have shown some effectiveness, they remain best for narrowly scoped
+tasks while more comprehensive explanations are still necessary. To this end,
+we investigate meta-models: an architecture using a "meta-model" that takes
+activations from an "input-model" and answers natural language questions about
+the input-model's behaviors. We evaluate meta-models' ability to generalize
+by training them on selected task types and assessing their out-of-distribution
+performance in deceptive scenarios. Our findings show that meta-models
+generalize well to out-of-distribution tasks and point towards opportunities
+for future research in this area. Our code is available at
+https://github.com/acostarelli/meta-models-public.
+
+
+
+ comment: 11 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ GD doesn't make the cut: Three ways that non-differentiability affects + neural network training + + +
+ This paper critically examines the fundamental distinctions between gradient
+methods applied to non-differentiable functions (NGDMs) and classical gradient
+descents (GDs) for differentiable functions, revealing significant gaps in
+current deep learning optimization theory. We demonstrate that NGDMs exhibit
+markedly different convergence properties compared to GDs, strongly challenging
+the applicability of extensive neural network convergence literature based on
+$L$-smoothness to non-smooth neural networks. Our analysis reveals paradoxical
+behavior of NGDM solutions for $L_{1}$-regularized problems, where increasing
+regularization counterintuitively leads to larger $L_{1}$ norms of optimal
+solutions. This finding calls into question widely adopted $L_{1}$ penalization
+techniques for network pruning. We further challenge the common assumption that
+optimization algorithms like RMSProp behave similarly in differentiable and
+non-differentiable contexts. Expanding on the Edge of Stability phenomenon, we
+demonstrate its occurrence in a broader class of functions, including Lipschitz
+continuous convex differentiable functions. This finding raises important
+questions about its relevance and interpretation in non-convex,
+non-differentiable neural networks, particularly those using ReLU activations.
+Our work identifies critical misunderstandings of NGDMs in influential
+literature, stemming from an overreliance on strong smoothness assumptions.
+These findings necessitate a reevaluation of optimization dynamics in deep
+learning, emphasizing the crucial need for more nuanced theoretical foundations
+in analyzing these complex systems.
+
+
+
+
+
+
+ + ♻ ☆ Scaling Law Hypothesis for Multimodal Model + + +
+ We propose a scaling law hypothesis for multimodal models processing text, +audio, images, and video within a shared token and embedding space. Our +framework predicts model performance based on modality-specific compression and +tokenization efficiency, extending established scaling laws from text-based +decoder models to mixed-modality systems. We explore whether leveraging more +training data in multiple modalities can reduce the size of the multimodal +model, enabling efficient deployment on resource-constrained devices. + +
+
+
+
+
+ + ♻ ☆ Perceptions of Linguistic Uncertainty by Language Models and Humans EMNLP 2024 + + +
+ _Uncertainty expressions_ such as "probably" or "highly unlikely" are +pervasive in human language. While prior work has established that there is +population-level agreement in terms of how humans quantitatively interpret +these expressions, there has been little inquiry into the abilities of language +models in the same context. In this paper, we investigate how language models +map linguistic expressions of uncertainty to numerical responses. Our approach +assesses whether language models can employ theory of mind in this setting: +understanding the uncertainty of another agent about a particular statement, +independently of the model's own certainty about that statement. We find that 7 +out of 10 models are able to map uncertainty expressions to probabilistic +responses in a human-like manner. However, we observe systematically different +behavior depending on whether a statement is actually true or false. This +sensitivity indicates that language models are substantially more susceptible +to bias based on their prior knowledge (as compared to humans). These findings +raise important questions and have broad implications for human-AI and AI-AI +communication. + +
+
+ comment: Accepted at EMNLP 2024 (Main) +
+
+
+
+
+ + ♻ ☆ Exploring QUIC Dynamics: A Large-Scale Dataset for Encrypted Traffic + Analysis + + +
+ QUIC, a new and increasingly used transport protocol, addresses and resolves
+the limitations of TCP by offering improved security, performance, and features
+such as stream multiplexing and connection migration. These features, however,
+also present challenges for network operators who need to monitor and analyze
+web traffic. In this paper, we introduce VisQUIC, a labeled dataset comprising
+over 100,000 QUIC traces from more than 44,000 websites (URLs), collected over
+a four-month period. These traces provide the foundation for generating more
+than seven million images, with configurable parameters of window length, pixel
+resolution, normalization, and labels. These images enable an observer looking
+at the interactions between a client and a server to analyze and gain insights
+about encrypted QUIC connections. To illustrate the dataset's potential, we
+offer a use-case example of an observer estimating the number of HTTP/3
+request/response pairs in a given QUIC connection, which can reveal server behavior,
+client--server interactions, and the load imposed by an observed connection. We
+formulate the problem as a discrete regression problem, train a machine
+learning (ML) model for it, and then evaluate it using the proposed dataset on
+an example use case.
+
+
+
+ comment: The dataset and the supplementary material can be provided upon + request +
+
+
+
+
+ + ♻ ☆ GENOT: Entropic (Gromov) Wasserstein Flow Matching with Applications to + Single-Cell Genomics + + +
+ Single-cell genomics has significantly advanced our understanding of cellular +behavior, catalyzing innovations in treatments and precision medicine. However, +single-cell sequencing technologies are inherently destructive and can only +measure a limited array of data modalities simultaneously. This limitation +underscores the need for new methods capable of realigning cells. Optimal +transport (OT) has emerged as a potent solution, but traditional discrete +solvers are hampered by scalability, privacy, and out-of-sample estimation +issues. These challenges have spurred the development of neural network-based +solvers, known as neural OT solvers, that parameterize OT maps. Yet, these +models often lack the flexibility needed for broader life science applications. +To address these deficiencies, our approach learns stochastic maps (i.e. +transport plans), allows for any cost function, relaxes mass conservation +constraints and integrates quadratic solvers to tackle the complex challenges +posed by the (Fused) Gromov-Wasserstein problem. Utilizing flow matching as a +backbone, our method offers a flexible and effective framework. We demonstrate +its versatility and robustness through applications in cell development +studies, cellular drug response modeling, and cross-modality cell translation, +illustrating significant potential for enhancing therapeutic strategies. + +
+
+
+
+
+ + ♻ ☆ C3T: Cross-modal Transfer Through Time for Human Action Recognition + + +
+ In order to unlock the potential of diverse sensors, we investigate a method +to transfer knowledge between modalities using the structure of a unified +multimodal representation space for Human Action Recognition (HAR). We +formalize and explore an understudied cross-modal transfer setting we term +Unsupervised Modality Adaptation (UMA), where the modality used in testing is +not used in supervised training, i.e. zero labeled instances of the test +modality are available during training. We develop three methods to perform +UMA: Student-Teacher (ST), Contrastive Alignment (CA), and Cross-modal Transfer +Through Time (C3T). Our extensive experiments on various camera+IMU datasets +compare these methods to each other in the UMA setting, and to their empirical +upper bound in the supervised setting. The results indicate C3T is the most +robust and highest performing by at least a margin of 8%, and nears the +supervised setting performance even in the presence of temporal noise. This +method introduces a novel mechanism for aligning signals across time-varying +latent vectors, extracted from the receptive field of temporal convolutions. +Our findings suggest that C3T has significant potential for developing +generalizable models for time-series sensor data, opening new avenues for +multi-modal learning in various applications. + +
+
+
+
+
+ + ♻ ☆ Active-Dormant Attention Heads: Mechanistically Demystifying + Extreme-Token Phenomena in LLMs + + +
+ Practitioners have consistently observed three puzzling phenomena in +transformer-based large language models (LLMs): attention sinks, value-state +drains, and residual-state peaks, collectively referred to as extreme-token +phenomena. These phenomena are characterized by certain so-called "sink tokens" +receiving disproportionately high attention weights, exhibiting significantly +smaller value states, and having much larger residual-state norms than those of +other tokens. These extreme tokens give rise to various challenges in LLM +inference, quantization, and interpretability. + We elucidate the mechanisms behind extreme-token phenomena. First, we show +that these phenomena arise in very simple architectures -- transformers with +one to three layers -- trained on a toy model, the Bigram-Backcopy (BB) task. +In this setting, we identify an active-dormant mechanism, where attention heads +become sinks for specific input domains while remaining non-sinks for others. +Our theoretical analysis of the training dynamics reveals that these phenomena +are driven by a mutual reinforcement mechanism. Building on these insights, we +propose strategies to mitigate extreme-token phenomena during pretraining, +including replacing softmax with ReLU and Adam with SGD. Next, we extend our +analysis to pretrained LLMs, including Llama and OLMo, showing that many +attention heads exhibit a similar active-dormant mechanism as in the BB task, +and that the mutual reinforcement mechanism also governs the emergence of +extreme-token phenomena during LLM pretraining. Our results reveal that many of +the static and dynamic properties of extreme-token phenomena predicted by the +BB task align with observations in pretrained LLMs. + +
+
+
+
+
+ + ♻ ☆ An efficient likelihood-free Bayesian inference method based on + sequential neural posterior estimation + + +
+ Sequential neural posterior estimation (SNPE) techniques have been recently
+proposed for dealing with simulation-based models with intractable likelihoods.
+Unlike approximate Bayesian computation, SNPE techniques learn the posterior
+from sequential simulation using neural network-based conditional density
+estimators by minimizing a specific loss function. The SNPE method proposed by
+Lueckmann et al. (2017) used a calibration kernel to boost the sample weights
+around the observed data, resulting in a concentrated loss function. However,
+the use of calibration kernels may increase the variances of both the empirical
+loss and its gradient, making the training inefficient. To improve the
+stability of SNPE, this paper proposes to use an adaptive calibration kernel
+and several variance reduction techniques. The proposed method greatly speeds
+up the process of training and provides a better approximation of the posterior
+than the original SNPE method and some existing competitors, as confirmed by
+numerical experiments. We also demonstrate the superiority of the
+proposed method on a high-dimensional model with a real-world dataset.
+
+
+
+ comment: 28 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Topological obstruction to the training of shallow ReLU neural networks NeurIPS 2024 + + +
+ Studying the interplay between the geometry of the loss landscape and the +optimization trajectories of simple neural networks is a fundamental step for +understanding their behavior in more complex settings. This paper reveals the +presence of topological obstruction in the loss landscape of shallow ReLU +neural networks trained using gradient flow. We discuss how the homogeneous +nature of the ReLU activation function constrains the training trajectories to +lie on a product of quadric hypersurfaces whose shape depends on the particular +initialization of the network's parameters. When the neural network's output is +a single scalar, we prove that these quadrics can have multiple connected +components, limiting the set of reachable parameters during training. We +analytically compute the number of these components and discuss the possibility +of mapping one to the other through neuron rescaling and permutation. In this +simple setting, we find that the non-connectedness results in a topological +obstruction, which, depending on the initialization, can make the global +optimum unreachable. We validate this result with numerical experiments. + +
+
+ comment: 23 pages, 5 figures, Conference on Neural Information Processing + Systems (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ Learning Latent Space Dynamics with Model-Form Uncertainties: A + Stochastic Reduced-Order Modeling Approach + + +
+ This paper presents a probabilistic approach to represent and quantify +model-form uncertainties in the reduced-order modeling of complex systems using +operator inference techniques. Such uncertainties can arise in the selection of +an appropriate state-space representation, in the projection step that +underlies many reduced-order modeling methods, or as a byproduct of +considerations made during training, to name a few. Following previous works in +the literature, the proposed method captures these uncertainties by expanding +the approximation space through the randomization of the projection matrix. +This is achieved by combining Riemannian projection and retraction operators - +acting on a subset of the Stiefel manifold - with an information-theoretic +formulation. The efficacy of the approach is assessed on canonical problems in +fluid mechanics by identifying and quantifying the impact of model-form +uncertainties on the inferred operators. + +
+
+
+
+
+ + ♻ ☆ The ODE Method for Asymptotic Statistics in Stochastic Approximation and + Reinforcement Learning + + +
+ The paper concerns the $d$-dimensional stochastic approximation recursion, $$
+\theta_{n+1}= \theta_n + \alpha_{n + 1} f(\theta_n, \Phi_{n+1}) $$ where $ \{
+\Phi_n \}$ is a stochastic process on a general state space, satisfying a
+conditional Markov property that allows for parameter-dependent noise. The main
+results are established under additional conditions on the mean flow and a
+version of the Donsker-Varadhan Lyapunov drift condition known as (DV3):
+ (i) An appropriate Lyapunov function is constructed that implies
+convergence of the estimates in $L_4$.
+ (ii) A functional central limit theorem (CLT) is established, as well as
+the usual one-dimensional CLT for the normalized error. Moment bounds combined
+with the CLT imply convergence of the normalized covariance $\textsf{E} [ z_n
+z_n^T ]$ to the asymptotic covariance in the CLT, where $z_n =:
+(\theta_n-\theta^*)/\sqrt{\alpha_n}$.
+ (iii) The CLT holds for the normalized version $z^{\text{PR}}_n =:
+\sqrt{n} [\theta^{\text{PR}}_n -\theta^*]$, of the averaged parameters
+$\theta^{\text{PR}}_n =: n^{-1} \sum_{k=1}^n\theta_k$, subject to standard
+assumptions on the step-size. Moreover, the covariance in the CLT coincides
+with the minimal covariance of Polyak and Ruppert.
+ (iv) An example is given where $f$ and $\bar{f}$ are linear in $\theta$,
+and $\Phi$ is a geometrically ergodic Markov chain but does not satisfy (DV3).
+While the algorithm is convergent, the second moment of $\theta_n$ is unbounded
+and in fact diverges.
+ This arXiv version 3 represents a major extension of the results in
+prior versions. The main results now allow for parameter-dependent noise, as
+is often the case in applications to reinforcement learning.
+
+
+
+ comment: 2 figures +
+
+
+
+
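+ For readers less familiar with the objects in this abstract, the recursion and the Polyak-Ruppert
+ average are easy to simulate. The toy below runs the stochastic approximation update with a simple
+ correlated noise process and step size alpha_n = n^(-0.85), and compares the last iterate with the
+ averaged estimate; the specific f, the noise chain, and the step-size exponent are illustrative
+ assumptions, not the paper's setting.
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ theta_star = 2.0
+ theta, thetas = 0.0, []
+ phi = 1.0                                    # AR(1) stand-in for the Markovian noise Phi_n
+ for n in range(1, 200001):
+     phi = 0.9 * phi + rng.normal(scale=0.5)
+     alpha = n ** -0.85                       # step size with exponent in the usual (1/2, 1] range
+     f = -(theta - theta_star) + phi          # mean flow drives theta toward theta_star
+     theta = theta + alpha * f
+     thetas.append(theta)
+
+ theta_pr = np.mean(thetas)                   # Polyak-Ruppert average of the iterates
+ print(f"last iterate error {abs(theta - theta_star):.4f}, "
+       f"averaged error {abs(theta_pr - theta_star):.4f}")
+ ```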
+ + ♻ ☆ Latent Diffusion Model for Conditional Reservoir Facies Generation + + +
+ Creating accurate and geologically realistic reservoir facies based on
+limited measurements is crucial for field development and reservoir management,
+especially in the oil and gas sector. Traditional two-point geostatistics,
+while foundational, often struggle to capture complex geological patterns.
+Multi-point statistics offers more flexibility, but comes with its own
+challenges related to pattern configurations and storage limits. With the rise
+of Generative Adversarial Networks (GANs) and their success in various fields,
+there has been a shift towards using them for facies generation. However,
+recent advances in the computer vision domain have shown the superiority of
+diffusion models over GANs. Motivated by this, a novel Latent Diffusion Model
+is proposed, which is specifically designed for conditional generation of
+reservoir facies. The proposed model produces high-fidelity facies realizations
+that rigorously preserve conditioning data. It significantly outperforms a
+GAN-based alternative. Our implementation on GitHub:
+https://github.com/ML4ITS/Latent-Diffusion-Model-for-Conditional-Reservoir-Facies-Generation.
+
+
+
+ comment: accepted in Computers & Geosciences +
+
+
+
+
+ + ♻ ☆ Pre-Finetuning for Few-Shot Emotional Speech Recognition INTERSPEECH 2023 + + +
+ Speech models have long been known to overfit individual speakers for many +classification tasks. This leads to poor generalization in settings where the +speakers are out-of-domain or out-of-distribution, as is common in production +environments. We view speaker adaptation as a few-shot learning problem and +propose investigating transfer learning approaches inspired by recent success +with pre-trained models in natural language tasks. We propose pre-finetuning +speech models on difficult tasks to distill knowledge into few-shot downstream +classification objectives. We pre-finetune Wav2Vec2.0 on every permutation of +four multiclass emotional speech recognition corpora and evaluate our +pre-finetuned models through 33,600 few-shot fine-tuning trials on the +Emotional Speech Dataset. + +
+
+ comment: Published at INTERSPEECH 2023. 5 pages, 4 figures. Code available at + https://github.com/maxlchen/Speech-PreFinetuning +
+
+
+
+
+ + ♻ ☆ Gradient Cuff: Detecting Jailbreak Attacks on Large Language Models by + Exploring Refusal Loss Landscapes NeurIPS 2024 + + +
+ Large Language Models (LLMs) are becoming a prominent generative AI tool, +where the user enters a query and the LLM generates an answer. To reduce harm +and misuse, efforts have been made to align these LLMs to human values using +advanced training techniques such as Reinforcement Learning from Human Feedback +(RLHF). However, recent studies have highlighted the vulnerability of LLMs to +adversarial jailbreak attempts aiming at subverting the embedded safety +guardrails. To address this challenge, this paper defines and investigates the +Refusal Loss of LLMs and then proposes a method called Gradient Cuff to detect +jailbreak attempts. Gradient Cuff exploits the unique properties observed in +the refusal loss landscape, including functional values and its smoothness, to +design an effective two-step detection strategy. Experimental results on two +aligned LLMs (LLaMA-2-7B-Chat and Vicuna-7B-V1.5) and six types of jailbreak +attacks (GCG, AutoDAN, PAIR, TAP, Base64, and LRL) show that Gradient Cuff can +significantly improve the LLM's rejection capability for malicious jailbreak +queries, while maintaining the model's performance for benign user queries by +adjusting the detection threshold. + +
+
+ comment: Accepted by NeurIPS 2024. Project page: + https://huggingface.co/spaces/TrustSafeAI/GradientCuff-Jailbreak-Defense +
+
+
+
+
+ + ♻ ☆ Deep learning empowered sensor fusion boosts infant movement + classification + + +
+ To assess the integrity of the developing nervous system, the Prechtl general +movement assessment (GMA) is recognized for its clinical value in diagnosing +neurological impairments in early infancy. GMA has been increasingly augmented +through machine learning approaches intending to scale-up its application, +circumvent costs in the training of human assessors and further standardize +classification of spontaneous motor patterns. Available deep learning tools, +all of which are based on single sensor modalities, are however still +considerably inferior to that of well-trained human assessors. These approaches +are hardly comparable as all models are designed, trained and evaluated on +proprietary/silo-data sets. With this study we propose a sensor fusion approach +for assessing fidgety movements (FMs). FMs were recorded from 51 typically +developing participants. We compared three different sensor modalities +(pressure, inertial, and visual sensors). Various combinations and two sensor +fusion approaches (late and early fusion) for infant movement classification +were tested to evaluate whether a multi-sensor system outperforms single +modality assessments. Convolutional neural network (CNN) architectures were +used to classify movement patterns. The performance of the three-sensor fusion +(classification accuracy of 94.5%) was significantly higher than that of any +single modality evaluated. We show that the sensor fusion approach is a +promising avenue for automated classification of infant motor patterns. The +development of a robust sensor fusion system may significantly enhance AI-based +early recognition of neurofunctions, ultimately facilitating automated early +detection of neurodevelopmental conditions. + +
+
+
+
+
+ + ♻ ☆ Axioms for AI Alignment from Human Feedback + + +
+ In the context of reinforcement learning from human feedback (RLHF), the +reward function is generally derived from maximum likelihood estimation of a +random utility model based on pairwise comparisons made by humans. The problem +of learning a reward function is one of preference aggregation that, we argue, +largely falls within the scope of social choice theory. From this perspective, +we can evaluate different aggregation methods via established axioms, examining +whether these methods meet or fail well-known standards. We demonstrate that +both the Bradley-Terry-Luce Model and its broad generalizations fail to meet +basic axioms. In response, we develop novel rules for learning reward functions +with strong axiomatic guarantees. A key innovation from the standpoint of +social choice is that our problem has a linear structure, which greatly +restricts the space of feasible rules and leads to a new paradigm that we call +linear social choice. + +
+
+
+
+
+ + ♻ ☆ Interpreting CLIP: Insights on the Robustness to ImageNet Distribution + Shifts + + +
+ What distinguishes robust models from non-robust ones? While for ImageNet
+distribution shifts it has been shown that such differences in robustness can
+be traced back predominantly to differences in training data, so far it is not
+known what that translates to in terms of what the model has learned. In this
+work, we bridge this gap by probing the representation spaces of 16 robust
+zero-shot CLIP vision encoders with various backbones (ResNets and ViTs) and
+pretraining sets (OpenAI, LAION-400M, LAION-2B, YFCC15M, CC12M and DataComp),
+and comparing them to the representation spaces of less robust models with
+identical backbones, but different (pre)training sets or objectives (CLIP
+pretraining on ImageNet-Captions, and supervised training or finetuning on
+ImageNet). Through this analysis, we generate three novel insights. Firstly, we
+detect the presence of outlier features in robust zero-shot CLIP vision
+encoders, which to the best of our knowledge is the first time these are
+observed in non-language and non-transformer models. Secondly, we find the
+existence of outlier features to be an indication of ImageNet shift robustness
+in models, since we only find them in robust models in our analysis. Lastly, we
+also investigate the number of unique encoded concepts in the representation
+space and find zero-shot CLIP models to encode a higher number of unique
+concepts in their representation space. However, we do not find this to be an
+indicator of ImageNet shift robustness and hypothesize that it is rather
+related to the language supervision. Since the presence of outlier features can
+be detected without access to any data from shifted datasets, we believe that
+they could be a useful tool for practitioners to get a feeling for the
+distribution shift robustness of a pretrained model during deployment.
+
+
+
+ comment: Published in TMLR +
+
+
+
+
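+ The practical suggestion in the abstract is that outlier features can be spotted from activations
+ alone, without any shifted data. One common operationalization, which is an assumption here and
+ not necessarily the paper's criterion, is to flag embedding dimensions whose typical magnitude is
+ far above the median dimension; the sketch below does exactly that on a batch of encoder outputs.
+ ```python
+ import numpy as np
+
+ def outlier_feature_dims(embeddings, factor=5.0):
+     # embeddings: (num_images, dim) pooled outputs of a vision encoder.
+     # Flag dimensions whose mean absolute activation exceeds `factor` times the
+     # median over dimensions, a simple magnitude-based outlier criterion (assumed).
+     mean_abs = np.abs(embeddings).mean(axis=0)
+     threshold = factor * np.median(mean_abs)
+     return np.nonzero(mean_abs > threshold)[0], mean_abs
+
+ rng = np.random.default_rng(0)
+ emb = rng.normal(size=(512, 768))
+ emb[:, 42] *= 30.0                      # plant one artificial outlier dimension
+ dims, mags = outlier_feature_dims(emb)
+ print("outlier dimensions:", dims, "their mean |activation|:", mags[dims].round(2))
+ ```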
+ + ♻ ☆ Towards Open Respiratory Acoustic Foundation Models: Pretraining and + Benchmarking NeurIPS 2024 + + +
+ Respiratory audio, such as coughing and breathing sounds, has predictive +power for a wide range of healthcare applications, yet is currently +under-explored. The main problem for those applications arises from the +difficulty in collecting large labeled task-specific data for model +development. Generalizable respiratory acoustic foundation models pretrained +with unlabeled data would offer appealing advantages and possibly unlock this +impasse. However, given the safety-critical nature of healthcare applications, +it is pivotal to also ensure openness and replicability for any proposed +foundation model solution. To this end, we introduce OPERA, an OPEn Respiratory +Acoustic foundation model pretraining and benchmarking system, as the first +approach answering this need. We curate large-scale respiratory audio datasets +(~136K samples, over 400 hours), pretrain three pioneering foundation models, +and build a benchmark consisting of 19 downstream respiratory health tasks for +evaluation. Our pretrained models demonstrate superior performance (against +existing acoustic models pretrained with general audio on 16 out of 19 tasks) +and generalizability (to unseen datasets and new respiratory audio modalities). +This highlights the great promise of respiratory acoustic foundation models and +encourages more studies using OPERA as an open resource to accelerate research +on respiratory audio for health. The system is accessible from +https://github.com/evelyn0414/OPERA. + +
+
+ comment: accepted by NeurIPS 2024 Track Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ Tiny Time Mixers (TTMs): Fast Pre-trained Models for Enhanced + Zero/Few-Shot Forecasting of Multivariate Time Series NeurIPS 2024 + + +
+ Large pre-trained models excel in zero/few-shot learning for language and +vision tasks but face challenges in multivariate time series (TS) forecasting +due to diverse data characteristics. Consequently, recent research efforts have +focused on developing pre-trained TS forecasting models. These models, whether +built from scratch or adapted from large language models (LLMs), excel in +zero/few-shot forecasting tasks. However, they are limited by slow performance, +high computational demands, and neglect of cross-channel and exogenous +correlations. To address this, we introduce Tiny Time Mixers (TTM), a compact +model (starting from 1M parameters) with effective transfer learning +capabilities, trained exclusively on public TS datasets. TTM, based on the +light-weight TSMixer architecture, incorporates innovations like adaptive +patching, diverse resolution sampling, and resolution prefix tuning to handle +pre-training on varied dataset resolutions with minimal model capacity. +Additionally, it employs multi-level modeling to capture channel correlations +and infuse exogenous signals during fine-tuning. TTM outperforms existing +popular benchmarks in zero/few-shot forecasting by 4-40%, while reducing +computational requirements significantly. Moreover, TTMs are lightweight and +can be executed even on CPU-only machines, enhancing usability and fostering +wider adoption in resource-constrained environments. The model weights for +reproducibility and research use are available at +https://huggingface.co/ibm/ttm-research-r2/, while enterprise-use weights under +the Apache license can be accessed as follows: the initial TTM-Q variant at +https://huggingface.co/ibm-granite/granite-timeseries-ttm-r1, and the weights of the latest +variants (TTM-B, TTM-E, TTM-A) are available at +https://huggingface.co/ibm-granite/granite-timeseries-ttm-r2.
+
+ comment: Accepted at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) +
+
+
+
+
+ + ♻ ☆ MEG: Medical Knowledge-Augmented Large Language Models for Question + Answering + + +
+ Question answering is a natural language understanding task that involves +reasoning over both explicit context and unstated, relevant domain knowledge. +Large language models (LLMs), which underpin most contemporary question +answering systems, struggle to induce how concepts relate in specialized +domains such as medicine. Existing medical LLMs are also costly to train. In +this work, we present MEG, a parameter-efficient approach for medical +knowledge-augmented LLMs. MEG uses a lightweight mapping network to integrate +graph embeddings into the LLM, enabling it to leverage external knowledge in a +cost-effective way. We evaluate our method on four popular medical +multiple-choice datasets and show that LLMs greatly benefit from the factual +grounding provided by knowledge graph embeddings. MEG attains an average of ++10.2% accuracy over the Mistral-Instruct baseline, and +6.7% over specialized +models like BioMistral. We also show results based on Llama-3. Finally, we show +that MEG's performance remains robust to the choice of graph encoder. + +
+
+
+
+
+ + ♻ ☆ Spatial Transformers for Radio Map Estimation + + +
+ Radio map estimation (RME) involves spatial interpolation of radio +measurements to predict metrics such as the received signal strength at +locations where no measurements were collected. The most popular estimators +nowadays project the measurement locations to a regular grid and complete the +resulting measurement tensor with a convolutional deep neural network. +Unfortunately, these approaches suffer from poor spatial resolution and require +a great number of parameters. The first contribution of this paper addresses +these limitations by means of an attention-based estimator named Spatial +TransfOrmer for Radio Map estimation (STORM). This scheme not only outperforms +the existing estimators, but also exhibits lower computational complexity, +translation equivariance, rotation equivariance, and full spatial resolution. +The second contribution is an extended transformer architecture that allows +STORM to perform active sensing, by which the next measurement location is +selected based on the previous measurements. This is particularly useful for +minimization of drive tests (MDT) in cellular networks, where operators request +user equipment to collect measurements. Finally, STORM is extensively validated +by experiments with one ray-tracing and two real-measurement datasets. + +
+
+
+
+
+ + ♻ ☆ On the Robustness of Machine Learning Models in Predicting Thermodynamic + Properties: a Case of Searching for New Quasicrystal Approximants + + +
+ Although artificial intelligence-assisted modeling of disordered crystals +is a widely used and well-tried method of new materials design, the issues of +its robustness, reliability, and stability remain unresolved and insufficiently +discussed. To highlight this, in this work we composed a series of nested +datasets of intermetallic approximants of quasicrystals and trained various +machine learning models on them. Our qualitative and, more importantly, +quantitative assessment of the difference in the predictions +clearly shows that different reasonable changes in the training sample can lead +to completely different sets of predicted potentially new materials. We +also showed the advantage of pre-training and proposed a simple yet effective +trick of sequential training to increase stability.
+
+
+
+
+ + ♻ ☆ NeuralClothSim: Neural Deformation Fields Meet the Thin Shell Theory + + +
+ Despite existing 3D cloth simulators producing realistic results, they +predominantly operate on discrete surface representations (e.g. points and +meshes) with a fixed spatial resolution, which often leads to large memory +consumption and resolution-dependent simulations. Moreover, back-propagating +gradients through the existing solvers is difficult, and they cannot be easily +integrated into modern neural architectures. In response, this paper re-thinks +physically plausible cloth simulation: We propose NeuralClothSim, i.e., a new +quasistatic cloth simulator using thin shells, in which surface deformation is +encoded in neural network weights in the form of a neural field. Our +memory-efficient solver operates on a new continuous coordinate-based surface +representation called neural deformation fields (NDFs); it supervises NDF +equilibria with the laws of the non-linear Kirchhoff-Love shell theory with a +non-linear anisotropic material model. NDFs are adaptive: They 1) allocate +their capacity to the deformation details and 2) allow surface state queries at +arbitrary spatial resolutions without re-training. We show how to train +NeuralClothSim while imposing hard boundary conditions and demonstrate multiple +applications, such as material interpolation and simulation editing. The +experimental results highlight the effectiveness of our continuous neural +formulation. See our project page: https://4dqv.mpi-inf.mpg.de/NeuralClothSim/. + +
+
+ comment: 33 pages, 23 figures and 3 tables; project page: + https://4dqv.mpi-inf.mpg.de/NeuralClothSim/ +
+
+
+
+
+ + ♻ ☆ Unsupervised Cognition + + +
+ Unsupervised learning methods are loosely inspired by models of cognition. To +this day, the most successful unsupervised learning methods revolve around +clustering samples in a mathematical space. In this paper we propose a +state-of-the-art, primitive-based, unsupervised learning approach for +decision-making, inspired by a novel cognition framework. This +representation-centric approach models the input space constructively as a +distributed hierarchical structure in an input-agnostic way. We compare our +approach against the current state of the art in both unsupervised learning +classification and cancer type classification, and show that our proposal +outperforms it. We also evaluate some cognition-like properties of our proposal, +where it not only outperforms the compared algorithms (even supervised learning +ones) but also exhibits a different, more cognition-like behaviour.
+
+
+
+
+ + ♻ ☆ Occam Gradient Descent + + +
+ Deep learning neural network models must be large enough to adapt to their +problem domain, while small enough to avoid overfitting training data during +gradient descent. To balance these competing demands, overprovisioned deep +learning models such as transformers are trained for a single epoch on large +data sets, and are hence inefficient with both computing resources and training +data. In response to these inefficiencies, we exploit learning theory to derive +Occam Gradient Descent, an algorithm that interleaves adaptive reduction of +model size to minimize generalization error, with gradient descent on model +weights to minimize fitting error. In contrast, traditional gradient descent +greedily minimizes fitting error without regard to generalization error. Our +algorithm simultaneously descends the space of weights and topological size of +any neural network without modification. With respect to loss, compute and +model size, our experiments show (a) on image classification benchmarks, linear +and convolutional neural networks trained with Occam Gradient Descent +outperform traditional gradient descent with or without post-train pruning; (b) +on a range of tabular data classification tasks, neural networks trained with +Occam Gradient Descent outperform traditional gradient descent, as well as +Random Forests; (c) on natural language transformers, Occam Gradient Descent +outperforms traditional gradient descent.
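+ To make the interleaving concrete, here is a rough sketch in the same spirit: ordinary gradient descent on a tiny two-layer network, with periodic removal of the least salient hidden units so the model shrinks while it trains. This is a toy illustration under assumed pruning criteria, not the paper's Occam Gradient Descent algorithm.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(256, 10))                      # toy binary classification data
+y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(float)
+
+hidden = 64
+W1 = rng.normal(scale=0.1, size=(10, hidden))
+w2 = rng.normal(scale=0.1, size=hidden)
+
+def forward(X, W1, w2):
+    h = np.tanh(X @ W1)
+    return h, 1.0 / (1.0 + np.exp(-(h @ w2)))
+
+lr, prune_every, keep_frac = 0.1, 50, 0.9
+for step in range(1, 501):
+    h, p = forward(X, W1, w2)
+    grad_logits = (p - y) / len(y)                  # gradient of the logistic loss
+    grad_w2 = h.T @ grad_logits
+    grad_h = np.outer(grad_logits, w2) * (1 - h**2)
+    W1 -= lr * (X.T @ grad_h)                       # fitting-error step ...
+    w2 -= lr * grad_w2
+    if step % prune_every == 0 and W1.shape[1] > 4: # ... interleaved with shrinking the model
+        salience = np.abs(w2) * np.linalg.norm(W1, axis=0)
+        keep = np.argsort(salience)[-int(keep_frac * len(w2)):]
+        W1, w2 = W1[:, keep], w2[keep]
+
+_, p = forward(X, W1, w2)
+print(f"hidden units left: {W1.shape[1]}, accuracy: {((p > 0.5) == y).mean():.2f}")
+```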
+
+
+
+
+ + ♻ ☆ A unified weighting framework for evaluating nearest neighbour + classification + + +
+ We present the first comprehensive and large-scale evaluation of classical +(NN), fuzzy (FNN) and fuzzy rough (FRNN) nearest neighbour classification. We +standardise existing proposals for nearest neighbour weighting with kernel +functions, applied to the distance values and/or ranks of the nearest +neighbours of a test instance. In particular, we show that the theoretically +optimal Samworth weights converge to a kernel. Kernel functions are closely +related to fuzzy negation operators, and we propose a new kernel based on Yager +negation. We also consider various distance and scaling measures, which we show +can be related to each other. Through a systematic series of experiments on 85 +real-life classification datasets, we find that NN, FNN and FRNN all perform +best with Boscovich distance, and that NN and FRNN perform best with a +combination of Samworth rank- and distance-weights and scaling by the mean +absolute deviation around the median ($r_1$), the standard deviation ($r_2$) or +the semi-interquartile range ($r_{\infty}^*$), while FNN performs best with +only Samworth distance-weights and $r_1$- or $r_2$-scaling. However, NN +achieves comparable performance with Yager-$\frac{1}{2}$ distance-weights, +which are simpler to implement than a combination of Samworth distance- and +rank-weights. Finally, FRNN generally outperforms NN, which in turn performs +systematically better than FNN. + +
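+ For readers unfamiliar with the weighting schemes being compared, the sketch below implements a kernel-weighted nearest neighbour classifier with Boscovich (Manhattan) distance and scaling by the mean absolute deviation around the median. The triangular distance kernel is an illustrative stand-in, not the Samworth weights studied in the paper.
+
+```python
+import numpy as np
+
+def knn_predict(X_train, y_train, X_test, k=7):
+    """Kernel-weighted k-NN with Boscovich (Manhattan) distance and
+    r1-scaling (mean absolute deviation around the median)."""
+    med = np.median(X_train, axis=0)
+    r1 = np.mean(np.abs(X_train - med), axis=0) + 1e-12
+    Xtr, Xte = X_train / r1, X_test / r1
+    preds = []
+    for x in Xte:
+        d = np.sum(np.abs(Xtr - x), axis=1)               # Boscovich distance
+        nn = np.argsort(d)[:k]
+        w = np.maximum(1.0 - d[nn] / (d[nn].max() + 1e-12), 1e-6)  # triangular kernel (assumed)
+        votes = {}
+        for label, weight in zip(y_train[nn], w):
+            votes[label] = votes.get(label, 0.0) + weight
+        preds.append(max(votes, key=votes.get))
+    return np.array(preds)
+
+# Tiny usage example on synthetic data.
+rng = np.random.default_rng(1)
+X = rng.normal(size=(200, 4))
+y = (X[:, 0] > 0).astype(int)
+print((knn_predict(X[:150], y[:150], X[150:]) == y[150:]).mean())
+```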
+
+
+
+
+ + ♻ ☆ LDAdam: Adaptive Optimization from Low-Dimensional Gradient Statistics + + +
+ We introduce LDAdam, a memory-efficient optimizer for training large models +that performs adaptive optimization steps within lower-dimensional subspaces +while consistently exploring the full parameter space during training. This +strategy keeps the optimizer's memory footprint to a fraction of the model +size. LDAdam relies on a new projection-aware update rule for the optimizer +states that allows for transitioning between subspaces, i.e., estimation of the +statistics of the projected gradients. To mitigate the errors due to low-rank +projection, LDAdam integrates a new generalized error feedback mechanism, which +explicitly accounts for both gradient and optimizer state compression. We prove +the convergence of LDAdam under standard assumptions, and show that LDAdam +allows for accurate and efficient fine-tuning and pre-training of language +models.
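+ A rough sketch of the general idea (Adam statistics kept in a low-dimensional subspace of the gradient, plus error feedback for what the projection discards) is shown below. It recomputes the subspace naively at every step and does not reproduce the paper's projection-aware transition rule; all hyperparameters are assumptions.
+
+```python
+import numpy as np
+
+def lowdim_adam_step(W, grad, state, rank=4, lr=0.05,
+                     beta1=0.9, beta2=0.999, eps=1e-8):
+    """One step keeping Adam moments in a rank-`rank` subspace of the gradient,
+    with error feedback for what the projection discards. The subspace is
+    recomputed naively each step, unlike the paper's projection-aware rule."""
+    g = grad + state["err"]                       # error feedback
+    U, _, _ = np.linalg.svd(g, full_matrices=False)
+    P = U[:, :rank]                               # current low-dimensional basis
+    g_low = P.T @ g
+    state["err"] = g - P @ g_low                  # part the subspace cannot represent
+    state["m"] = beta1 * state.get("m", np.zeros_like(g_low)) + (1 - beta1) * g_low
+    state["v"] = beta2 * state.get("v", np.zeros_like(g_low)) + (1 - beta2) * g_low**2
+    return W - lr * (P @ (state["m"] / (np.sqrt(state["v"]) + eps))), state
+
+# Usage on a toy least-squares problem.
+rng = np.random.default_rng(0)
+A, B = rng.normal(size=(32, 16)), rng.normal(size=(32, 8))
+W, state = np.zeros((16, 8)), {"err": np.zeros((16, 8))}
+for _ in range(300):
+    grad = A.T @ (A @ W - B) / len(A)
+    W, state = lowdim_adam_step(W, grad, state)
+print("residual:", np.linalg.norm(A @ W - B))
+```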
+
+ comment: 36 pages +
+
+
+
+
+ + ♻ ☆ Characterizing stable regions in the residual stream of LLMs NeurIPS 2024 + + +
+ We identify stable regions in the residual stream of Transformers, where the +model's output remains insensitive to small activation changes, but exhibits +high sensitivity at region boundaries. These regions emerge during training and +become more defined as training progresses or model size increases. The regions +appear to be much larger than previously studied polytopes. Our analysis +suggests that these stable regions align with semantic distinctions, where +similar prompts cluster within regions, and activations from the same region +lead to similar next token predictions. This work provides a promising research +direction for understanding the complexity of neural networks, shedding light +on training dynamics, and advancing interpretability. + +
+
+ comment: Published at Scientific Methods for Understanding Deep Learning + (SciForDL) workshop at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Simplicity Bias of Two-Layer Networks beyond Linearly Separable Data ICML 2024 + + +
+ Simplicity bias, the propensity of deep models to over-rely on simple +features, has been identified as a potential reason for limited +out-of-distribution generalization of neural networks (Shah et al., 2020). +Despite the important implications, this phenomenon has been theoretically +confirmed and characterized only under strong dataset assumptions, such as +linear separability (Lyu et al., 2021). In this work, we characterize +simplicity bias for general datasets in the context of two-layer neural +networks initialized with small weights and trained with gradient flow. +Specifically, we prove that in the early training phases, network features +cluster around a few directions that do not depend on the size of the hidden +layer. Furthermore, for datasets with an XOR-like pattern, we precisely +identify the learned features and demonstrate that simplicity bias intensifies +during later training stages. These results indicate that features learned in +the middle stages of training may be more useful for OOD transfer. We support +this hypothesis with experiments on image data. + +
+
+ comment: ICML 2024, camera-ready version (expanded related work) +
+
+
+
+
+ + ♻ ☆ Provable Mutual Benefits from Federated Learning in Privacy-Sensitive + Domains AISTATS 2024 + + +
+ Cross-silo federated learning (FL) allows data owners to train accurate +machine learning models by benefiting from each other's private datasets. +Unfortunately, the model accuracy benefits of collaboration are often +undermined by privacy defenses. Therefore, to incentivize client participation +in privacy-sensitive domains, an FL protocol should strike a delicate balance +between privacy guarantees and end-model accuracy. In this paper, we study the +question of when and how a server could design an FL protocol provably +beneficial for all participants. First, we provide necessary and sufficient +conditions for the existence of mutually beneficial protocols in the context of +mean estimation and convex stochastic optimization. We also derive protocols +that maximize the total clients' utility, given symmetric privacy preferences. +Finally, we design protocols maximizing end-model accuracy and demonstrate +their benefits in synthetic experiments.
+
+ comment: AISTATS 2024; Camera-ready version (updated references) +
+
+
+
+
+ + ♻ ☆ MCDFN: Supply Chain Demand Forecasting via an Explainable Multi-Channel + Data Fusion Network Model + + +
+ Accurate demand forecasting is crucial for optimizing supply chain +management. Traditional methods often fail to capture complex patterns from +seasonal variability and special events. Despite advancements in deep learning, +interpretable forecasting models remain a challenge. To address this, we +introduce the Multi-Channel Data Fusion Network (MCDFN), a hybrid architecture +that integrates Convolutional Neural Networks (CNN), Long Short-Term Memory +networks (LSTM), and Gated Recurrent Units (GRU) to enhance predictive +performance by extracting spatial and temporal features from time series data. +Our comparative benchmarking demonstrates that MCDFN outperforms seven other +deep-learning models, achieving superior metrics: MSE (23.5738), RMSE (4.8553), +MAE (3.9991), and MAPE (20.1575%). Additionally, MCDFN's predictions were +statistically indistinguishable from actual values, confirmed by a paired +t-test at the 5% significance level and a 10-fold cross-validated paired +t-test. We apply explainable AI techniques like ShapTime and Permutation +Feature Importance to enhance interpretability. This research advances demand +forecasting methodologies and offers practical guidelines for integrating MCDFN +into supply chain systems, highlighting future research directions for +scalability and user-friendly deployment.
+
+
+
+
+ + ♻ ☆ Regularized Projection Matrix Approximation with Applications to + Community Detection + + +
+ This paper introduces a regularized projection matrix approximation framework +designed to recover cluster information from the affinity matrix. The model is +formulated as a projection approximation problem, incorporating an entry-wise +penalty function. We investigate three distinct penalty functions, each +specifically tailored to address bounded, positive, and sparse scenarios. To +solve this problem, we propose direct optimization on the Stiefel manifold, +utilizing the Cayley transformation along with the Alternating Direction Method +of Multipliers (ADMM) algorithm. Additionally, we provide a theoretical +analysis that establishes the convergence properties of ADMM, demonstrating +that the convergence point satisfies the KKT conditions of the original +problem. Numerical experiments conducted on both synthetic and real-world +datasets reveal that our regularized projection matrix approximation approach +significantly outperforms state-of-the-art methods in clustering performance. + +
+
+
+
+
+ + ♻ ☆ Deep-Graph-Sprints: Accelerated Representation Learning in + Continuous-Time Dynamic Graphs + + +
+ Continuous-time dynamic graphs (CTDGs) are essential for modeling +interconnected, evolving systems. Traditional methods for extracting knowledge +from these graphs often depend on feature engineering or deep learning. Feature +engineering is limited by the manual and time-intensive nature of crafting +features, while deep learning approaches suffer from high inference latency, +making them impractical for real-time applications. This paper introduces +Deep-Graph-Sprints (DGS), a novel deep learning architecture designed for +efficient representation learning on CTDGs with low-latency inference +requirements. We benchmark DGS against state-of-the-art (SOTA) feature +engineering and graph neural network methods using five diverse datasets. The +results indicate that DGS achieves competitive performance while inference +speed improves between 4x and 12x compared to other deep learning approaches on +our benchmark datasets. Our method effectively bridges the gap between deep +representation learning and low-latency application requirements for CTDGs. + +
+
+
+
+
+ + ♻ ☆ Evaluating alignment between humans and neural network representations + in image-based learning tasks + + +
+ Humans represent scenes and objects in rich feature spaces, carrying +information that allows us to generalise about category memberships and +abstract functions with few examples. What determines whether a neural network +model generalises like a human? We tested how well the representations of $86$ +pretrained neural network models mapped to human learning trajectories across +two tasks where humans had to learn continuous relationships and categories of +natural images. In these tasks, both human participants and neural networks +successfully identified the relevant stimulus features within a few trials, +demonstrating effective generalisation. We found that while training dataset +size was a core determinant of alignment with human choices, contrastive +training with multi-modal data (text and imagery) was a common feature of +currently publicly available models that predicted human generalisation. +Intrinsic dimensionality of representations had different effects on alignment +for different model types. Lastly, we tested three sets of human-aligned +representations and found no consistent improvements in predictive accuracy +compared to the baselines. In conclusion, pretrained neural networks can serve +to extract representations for cognitive models, as they appear to capture some +fundamental aspects of cognition that are transferable across tasks. Both our +paradigms and modelling approach offer a novel way to quantify alignment +between neural networks and humans and extend cognitive science into more +naturalistic domains. + +
+
+
+
+
+ + ♻ ☆ Differentially Private Integrated Decision Gradients (IDG-DP) for + Radar-based Human Activity Recognition WACV 2025 + + +
+ Human motion analysis offers significant potential for healthcare monitoring +and early detection of diseases. The advent of radar-based sensing systems has +captured the spotlight for they are able to operate without physical contact +and they can integrate with pre-existing Wi-Fi networks. They are also seen as +less privacy-invasive compared to camera-based systems. However, recent +research has shown high accuracy in recognizing subjects or gender from radar +gait patterns, raising privacy concerns. This study addresses these issues by +investigating privacy vulnerabilities in radar-based Human Activity Recognition +(HAR) systems and proposing a novel method for privacy preservation using +Differential Privacy (DP) driven by attributions derived with Integrated +Decision Gradient (IDG) algorithm. We investigate Black-box Membership +Inference Attack (MIA) Models in HAR settings across various levels of +attacker-accessible information. We extensively evaluated the effectiveness of +the proposed IDG-DP method by designing a CNN-based HAR model and rigorously +assessing its resilience against MIAs. Experimental results demonstrate the +potential of IDG-DP in mitigating privacy attacks while maintaining utility +across all settings, particularly excelling against label-only and shadow model +black-box MIA attacks. This work represents a crucial step towards balancing +the need for effective radar-based HAR with robust privacy protection in +healthcare environments. + +
+
+ comment: Accepted at WACV 2025. 12 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Robust Low-Cost Drone Detection and Classification in Low SNR + Environments + + +
+ The proliferation of drones, or unmanned aerial vehicles (UAVs), has raised +significant safety concerns due to their potential misuse in activities such as +espionage, smuggling, and infrastructure disruption. This paper addresses the +critical need for effective drone detection and classification systems that +operate independently of UAV cooperation. We evaluate various convolutional +neural networks (CNNs) for their ability to detect and classify drones using +spectrogram data derived from consecutive Fourier transforms of signal +components. The focus is on model robustness in low signal-to-noise ratio (SNR) +environments, which is critical for real-world applications. A comprehensive +dataset is provided to support future model development. In addition, we +demonstrate a low-cost drone detection system using a standard computer, +software-defined radio (SDR) and antenna, validated through real-world field +testing. On our development dataset, all models consistently achieved an +average balanced classification accuracy of >= 85% at SNR > -12dB. In the field +test, these models achieved an average balanced accuracy of > 80%, depending on +transmitter distance and antenna direction. Our contributions include: a +publicly available dataset for model development, a comparative analysis of CNNs +for drone detection under low SNR conditions, and the deployment and field +evaluation of a practical, low-cost detection system.
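+ The spectrogram front end described above can be sketched as follows: a complex baseband (IQ) recording is converted into a normalised log-power spectrogram that a CNN can consume. Window length, overlap, and the toy signal are illustrative choices, not the paper's settings.
+
+```python
+import numpy as np
+from scipy.signal import spectrogram
+
+def iq_to_spectrogram(iq, fs, nperseg=256):
+    """Turn a complex baseband (IQ) recording into a normalised log-power
+    spectrogram for a CNN; window length and overlap are illustrative."""
+    f, t, Sxx = spectrogram(iq, fs=fs, nperseg=nperseg,
+                            noverlap=nperseg // 2, return_onesided=False)
+    S = 10 * np.log10(np.abs(Sxx) + 1e-12)        # log power in dB
+    S = (S - S.mean()) / (S.std() + 1e-12)        # per-recording normalisation
+    return np.fft.fftshift(S, axes=0), np.fft.fftshift(f)
+
+# Toy usage: a noisy complex tone standing in for a drone control signal.
+fs, dur = 1e6, 0.05
+t = np.arange(int(dur * fs)) / fs
+iq = np.exp(2j * np.pi * 1e5 * t) + 0.5 * (np.random.randn(t.size) + 1j * np.random.randn(t.size))
+S, f = iq_to_spectrogram(iq, fs)
+print(S.shape)
+```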
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ CataractBot: An LLM-Powered Expert-in-the-Loop Chatbot for Cataract + Patients + + +
+ The healthcare landscape is evolving, with patients seeking reliable +information about their health conditions and available treatment options. +Despite the abundance of information sources, the digital age overwhelms +individuals with excess, often inaccurate information. Patients primarily trust +medical professionals, highlighting the need for expert-endorsed health +information. However, increased patient loads on experts have led to reduced +communication time, impacting information sharing. To address this gap, we +developed CataractBot, an expert-in-the-loop chatbot powered by LLMs, in +collaboration with an eye hospital in India. CataractBot answers cataract +surgery-related questions instantly by querying a curated knowledge base and +provides expert-verified responses asynchronously. It has multimodal and +multilingual capabilities. In an in-the-wild deployment study with 55 +participants, CataractBot proved valuable, providing anytime accessibility, +saving time, accommodating diverse literacy levels, alleviating power +differences, and adding a privacy layer between patients and doctors. Users +reported that their trust in the system was established through expert +verification. Broadly, our results could inform future work on designing +expert-mediated LLM bots.
+
+
+
+
+ + ♻ ☆ LongEmbed: Extending Embedding Models for Long Context Retrieval EMNLP 2024 + + +
+ Embedding models play a pivotal role in modern NLP applications such as IR and +RAG. While the context limit of LLMs has been pushed beyond 1 million tokens, +embedding models are still confined to a narrow context window not exceeding 8k +tokens, which keeps them from application scenarios requiring long inputs such as +legal contracts. This paper explores context window extension of existing +embedding models, pushing the limit to 32k without requiring additional +training. First, we examine the performance of current embedding models for +long context retrieval on our newly constructed LongEmbed benchmark. LongEmbed +comprises two synthetic tasks and four carefully chosen real-world tasks, +featuring documents of varying length and dispersed target information. +Benchmarking results underscore substantial room for improvement in these models. +Based on this, comprehensive experiments show that training-free context window +extension strategies like position interpolation can effectively extend the +context window of existing embedding models severalfold, regardless of +whether their original context is 512 or beyond 4k. Furthermore, for models +employing absolute position encoding (APE), we show the possibility of further +fine-tuning to harvest notable performance gains while strictly preserving +original behavior for short inputs. For models using rotary position embedding +(RoPE), significant enhancements are observed when employing RoPE-specific +methods, such as NTK and SelfExtend, indicating RoPE's superiority over APE for +context window extension. To facilitate future research, we release E5-Base-4k +and E5-RoPE-Base, along with the LongEmbed benchmark.
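+ As a concrete illustration of one of the training-free strategies mentioned above, the sketch below applies position interpolation to rotary position embeddings: position indices of a long input are rescaled so that they stay within the range seen during training. Function names and the 512-token training window are illustrative assumptions.
+
+```python
+import numpy as np
+
+def rope_angles(positions, dim, base=10000.0):
+    """Rotary position embedding angles for the given (possibly fractional) positions."""
+    inv_freq = 1.0 / (base ** (np.arange(0, dim, 2) / dim))
+    return np.outer(positions, inv_freq)                 # (seq_len, dim/2)
+
+def interpolated_positions(seq_len, trained_ctx):
+    """Position interpolation: squeeze a longer sequence back into the
+    position range seen during training instead of extrapolating."""
+    scale = min(1.0, trained_ctx / seq_len)
+    return np.arange(seq_len) * scale
+
+# A model trained with a 512-token window applied to a 4096-token input:
+angles = rope_angles(interpolated_positions(4096, trained_ctx=512), dim=64)
+print(angles.shape, angles[:, 0].max())   # rotations stay within the trained range
+```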
+
+ comment: EMNLP 2024 Camera Ready +
+
+
+
+
+ + ♻ ☆ Learning to Assist Humans without Inferring Rewards NeurIPS + + +
+ Assistive agents should make humans' lives easier. Classically, such +assistance is studied through the lens of inverse reinforcement learning, where +an assistive agent (e.g., a chatbot, a robot) infers a human's intention and +then selects actions to help the human reach that goal. This approach requires +inferring intentions, which can be difficult in high-dimensional settings. We +build upon prior work that studies assistance through the lens of empowerment: +an assistive agent aims to maximize the influence of the human's actions such +that they exert a greater control over the environmental outcomes and can solve +tasks in fewer steps. We lift the major limitation of prior work in this +area--scalability to high-dimensional settings--with contrastive successor +representations. We formally prove that these representations estimate a +similar notion of empowerment to that studied by prior work and provide a +ready-made mechanism for optimizing it. Empirically, our proposed method +outperforms prior methods on synthetic benchmarks, and scales to Overcooked, a +cooperative game setting. Theoretically, our work connects ideas from +information theory, neuroscience, and reinforcement learning, and charts a path +for representations to play a critical role in solving assistive problems. + +
+
+ comment: Conference on Neural Information Processing Systems (NeurIPS), 2024 +
+
+
+
+
+ + ♻ ☆ Robust Classification by Coupling Data Mollification with Label + Smoothing + + +
+ Introducing training-time augmentations is a key technique to enhance +generalization and prepare deep neural networks against test-time corruptions. +Inspired by the success of generative diffusion models, we propose a novel +approach of coupling data mollification, in the form of image noising and +blurring, with label smoothing to align predicted label confidences with image +degradation. The method is simple to implement, introduces negligible +overheads, and can be combined with existing augmentations. We demonstrate +improved robustness and uncertainty quantification on the corrupted image +benchmarks of the CIFAR and TinyImageNet datasets. + +
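+ A minimal sketch of the coupling (noising only, omitting blurring) is given below: the image is perturbed with Gaussian noise and the one-hot label is smoothed in proportion to the noise level. The linear coupling between noise scale and smoothing amount is an assumed choice, not necessarily the paper's.
+
+```python
+import numpy as np
+
+def mollify(image, label, n_classes, max_sigma=0.5, rng=None):
+    """Noise the image and soften the one-hot label in proportion to the
+    noise level, so the target confidence tracks image degradation."""
+    rng = rng or np.random.default_rng()
+    sigma = rng.uniform(0.0, max_sigma)
+    noisy = image + sigma * rng.normal(size=image.shape)
+    smooth = sigma / max_sigma                  # more noise -> softer target (assumed linear coupling)
+    target = np.full(n_classes, smooth / n_classes)
+    target[label] += 1.0 - smooth
+    return noisy, target
+
+# Usage on a dummy 32x32 RGB image with 10 classes.
+rng = np.random.default_rng(0)
+img = rng.uniform(size=(32, 32, 3))
+noisy, target = mollify(img, label=3, n_classes=10, rng=rng)
+print(target.round(3))
+```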
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Grid-Mapping Pseudo-Count Constraint for Offline Reinforcement Learning + + +
+ Offline reinforcement learning learns from a static dataset without +interacting with the environment, which ensures safety and therefore has good +application prospects. However, directly applying a naive reinforcement learning +algorithm usually fails in an offline setting due to inaccurate Q-value +approximation caused by out-of-distribution (OOD) state-actions. Penalizing the +Q-values of OOD state-actions is an effective way to address this problem. Among +such methods, count-based methods have achieved good results in discrete domains +with a simple formulation. Inspired by this, a novel pseudo-count method for +continuous domains, the Grid-Mapping Pseudo-Count method (GPC), is proposed by +extending the count-based approach from discrete to continuous domains. Firstly, +the continuous state and action spaces are mapped to a discrete space using +Grid-Mapping, and the Q-values of OOD state-actions are then constrained through +the pseudo-count. Secondly, a theoretical proof shows that GPC obtains +appropriate uncertainty constraints under fewer assumptions than other +pseudo-count methods. Thirdly, GPC is combined with the Soft Actor-Critic +algorithm (SAC) to obtain a new algorithm called GPC-SAC. Lastly, experiments on +D4RL datasets show that GPC-SAC achieves better performance and lower +computational cost than other algorithms that constrain the Q-value.
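+ The grid-mapping step can be sketched as follows: continuous state-action pairs are hashed to cells of a regular grid, visit counts are accumulated per cell, and rarely visited cells receive a larger uncertainty penalty that can be subtracted from the Q-target. Bin sizes and the bounded 1/sqrt(n+1) penalty form are illustrative assumptions, not the paper's exact formulation.
+
+```python
+import numpy as np
+from collections import defaultdict
+
+class GridPseudoCount:
+    """Visit counts on a regular grid over the continuous state-action space,
+    turned into an uncertainty penalty for OOD state-actions."""
+
+    def __init__(self, low, high, bins=20):
+        self.low, self.high = np.asarray(low, float), np.asarray(high, float)
+        self.bins, self.counts = bins, defaultdict(int)
+
+    def _cell(self, sa):
+        frac = (np.asarray(sa) - self.low) / (self.high - self.low)
+        return tuple(np.clip((frac * self.bins).astype(int), 0, self.bins - 1))
+
+    def update(self, state, action):
+        self.counts[self._cell(np.concatenate([state, action]))] += 1
+
+    def penalty(self, state, action):
+        n = self.counts[self._cell(np.concatenate([state, action]))]
+        return 1.0 / np.sqrt(n + 1.0)        # bounded penalty form (assumption)
+
+# Usage: subtract the penalty from the Q-target of rarely visited state-actions.
+pc = GridPseudoCount(low=[-1, -1, -1], high=[1, 1, 1])
+pc.update(np.array([0.1, 0.2]), np.array([0.0]))
+print(pc.penalty(np.array([0.1, 0.2]), np.array([0.0])),   # visited -> smaller penalty
+      pc.penalty(np.array([0.9, -0.9]), np.array([0.8])))  # unseen  -> penalty of 1.0
+```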
+
+
+
+
+ + ♻ ☆ Identifying Performance Issues in Cloud Service Systems Based on + Relational-Temporal Features + + +
+ Cloud systems are susceptible to performance issues, which may cause +service-level agreement violations and financial losses. In current practice, +crucial metrics are monitored periodically to provide insight into the +operational status of components. Identifying performance issues is often +formulated as an anomaly detection problem, which is tackled by analyzing each +metric independently. However, this approach overlooks the complex dependencies +existing among cloud components. Some graph neural network-based methods take +both temporal and relational information into account; however, they struggle to +identify the correlation violations in the metrics that indicate underlying +performance issues. Furthermore, a large volume of +components in a cloud system results in a vast array of noisy metrics. This +complexity renders it impractical for engineers to fully comprehend the +correlations, making it challenging to identify performance issues accurately. +To address these limitations, we propose Identifying Performance Issues based +on Relational-Temporal Features (ISOLATE), a learning-based approach that +leverages both the relational and temporal features of metrics to identify +performance issues. In particular, it adopts a graph neural network with +attention to characterize the relations among metrics and extracts long-term +and multi-scale temporal patterns using a GRU and a convolution network, +respectively. The learned graph attention weights can be further used to +localize the correlation-violated metrics. Moreover, to relieve the impact of +noisy data, ISOLATE utilizes a positive unlabeled learning strategy that tags +pseudo-labels based on a small portion of confirmed negative examples. +Extensive evaluation on both public and industrial datasets shows that ISOLATE +outperforms all baseline models with a 0.945 F1-score and 0.920 Hit rate@3.
+
+ comment: Accepted in ACM Transactions on Software Engineering and Methodology + (TOSEM) +
+
+
+
+
+ + ♻ ☆ Fuzzy K-Means Clustering without Cluster Centroids + + +
+ Fuzzy K-Means clustering is a critical technique in unsupervised data +analysis. Unlike traditional hard clustering algorithms such as K-Means, it +allows data points to belong to multiple clusters with varying degrees of +membership, determined through iterative optimization to establish optimal +cluster centers and memberships, thereby achieving fuzzy partitioning of data. +However, the performance of popular Fuzzy K-Means algorithms is sensitive to +the selection of initial cluster centroids and is also affected by noise when +updating mean cluster centroids. To address these challenges, this paper +proposes a novel Fuzzy \textit{K}-Means clustering algorithm that entirely +eliminates the reliance on cluster centroids, obtaining membership metrics +solely through distance matrix computation. This innovation enhances +flexibility in distance measurement between sample points, thus improving the +algorithm's performance and robustness. The paper also establishes theoretical +connections between the proposed model and popular Fuzzy K-Means clustering +techniques. Experimental results on several real datasets demonstrate the +effectiveness of the algorithm. + +
+
+
+
+
+ + ♻ ☆ Predictive Analytics of Varieties of Potatoes + + +
+ We explore the application of machine learning algorithms specifically to +enhance the selection process of Russet potato clones in breeding trials by +predicting their suitability for advancement. This study addresses the +challenge of efficiently identifying high-yield, disease-resistant, and +climate-resilient potato varieties that meet processing industry standards. +Leveraging manually collected data from trials in the state of Oregon, we +investigate the potential of a wide variety of state-of-the-art binary +classification models. The dataset includes 1086 clones, with data on 38 +attributes recorded for each clone, focusing on yield, size, appearance, and +frying characteristics, with several control varieties planted consistently +across four Oregon regions from 2013-2021. We conduct a comprehensive analysis +of the dataset that includes preprocessing, feature engineering, and imputation +to address missing values. We focus on several key metrics such as accuracy, +F1-score, and Matthews correlation coefficient (MCC) for model evaluation. The +top-performing models, namely a neural network classifier (Neural Net), +histogram-based gradient boosting classifier (HGBC), and a support vector +machine classifier (SVM), demonstrate consistent and significant results. To +further validate our findings, we conduct a simulation study. By simulating +different data-generating scenarios, we assess model robustness and performance +through true positive, true negative, false positive, and false negative +distributions, area under the receiver operating characteristic curve (AUC-ROC) +and MCC. The simulation results highlight that non-linear models like SVM and +HGBC consistently show higher AUC-ROC and MCC than logistic regression (LR), +thus outperforming the traditional linear model across various distributions, +and emphasizing the importance of model selection and tuning in agricultural +trials. + +
+
+ comment: Minor revision; to appear in Crop Sciences +
+
+
+
+
+ + ♻ ☆ ReMoDetect: Reward Models Recognize Aligned LLM's Generations NeurIPS 2024 + + +
+ The remarkable capabilities and easy accessibility of large language models +(LLMs) have significantly increased societal risks (e.g., fake news +generation), necessitating the development of LLM-generated text (LGT) +detection methods for safe usage. However, detecting LGTs is challenging due to +the vast number of LLMs, making it impractical to account for each LLM +individually; hence, it is crucial to identify the common characteristics +shared by these models. In this paper, we draw attention to a common feature of +recent powerful LLMs, namely the alignment training, i.e., training LLMs to +generate human-preferable texts. Our key finding is that as these aligned LLMs +are trained to maximize human preferences, they generate texts with even higher +estimated preferences than human-written texts; thus, such texts are +easily detected using a reward model (i.e., an LLM trained to model the human +preference distribution). Based on this finding, we propose two training +schemes to further improve the detection ability of the reward model, namely +(i) continual preference fine-tuning to make the reward model prefer aligned +LGTs even further and (ii) reward modeling of Human/LLM mixed texts (texts +rephrased from human-written texts using aligned LLMs), which serves as a +median preference text corpus between LGTs and human-written texts to learn the +decision boundary better. We provide an extensive evaluation by considering six +text domains across twelve aligned LLMs, where our method demonstrates +state-of-the-art results. Code is available at +https://github.com/hyunseoklee-ai/ReMoDetect.
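+ The core detection idea can be sketched with any publicly available preference/reward model: score a text and treat unusually high estimated preference as evidence of machine generation. The checkpoint name below is just an example of such a model, and the decision threshold is an assumption; the paper's additional fine-tuning schemes are not reproduced here.
+
+```python
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+# Example of a publicly available reward model; any preference model with a
+# scalar output could stand in here. The threshold below is an assumption.
+MODEL_ID = "OpenAssistant/reward-model-deberta-v3-large-v2"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+reward_model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID).eval()
+
+@torch.no_grad()
+def preference_score(text: str) -> float:
+    """Scalar estimated human-preference score for a piece of text."""
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+    return reward_model(**inputs).logits.squeeze().item()
+
+def looks_llm_generated(text: str, threshold: float = 0.0) -> bool:
+    # Aligned LLMs are trained to maximise such scores, so unusually high
+    # scores are treated as evidence of machine generation.
+    return preference_score(text) > threshold
+
+print(looks_llm_generated("The mitochondria is the powerhouse of the cell."))
+```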
+
+ comment: Published as a conference proceeding for NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Multi-Lattice Sampling of Quantum Field Theories via Neural + Operator-based Flows + + +
+ We consider the problem of sampling field configurations on a lattice +from the Boltzmann distribution corresponding to some action. Since such +densities arise as approximations of an underlying functional density, we frame +the task as an instance of operator learning. We propose to approximate a +time-dependent neural operator whose time integral provides a mapping between +the functional distributions of the free and target theories. Once a particular +lattice is chosen, the neural operator can be discretized to a +finite-dimensional, time-dependent vector field which in turn induces a +continuous normalizing flow between finite dimensional distributions over the +chosen lattice. This flow can then be trained to be a diffeomorphism between +the discretized free and target theories on the chosen lattice, and, by +construction, can be evaluated on different discretizations of spacetime. We +experimentally validate the proposal on the 2-dimensional $\phi^4$-theory to +explore to what extent such operator-based flow architectures generalize to +lattice sizes they were not trained on, and show that pretraining on smaller +lattices can lead to a speedup over training directly on the target lattice +size.
+
+
+
+
+ + ♻ ☆ Fourier Analysis of Variational Quantum Circuits for Supervised Learning + + +
+ Variational quantum circuits (VQCs) can be understood through the lens of Fourier analysis. It is already +well known that the function space represented by any circuit architecture can +be described through a truncated Fourier sum. We show that the spectrum +available to that truncated Fourier sum is not entirely determined by the +encoding gates of the circuit, since the variational part of the circuit can +constrain certain coefficients to zero, effectively removing that frequency +from the spectrum. To the best of our knowledge, we give the first description +of the functional dependence of the Fourier coefficients on the variational +parameters as trigonometric polynomials. This allows us to provide an algorithm +which computes the exact spectrum of any given circuit and the corresponding +Fourier coefficients. Finally, we demonstrate that by comparing the Fourier +transform of the dataset to the available spectra, it is possible to predict +which VQC out of a given list of choices will be able to best fit the data.
+
+
+
+
+ + ♻ ☆ Commute Graph Neural Networks + + +
+ Graph Neural Networks (GNNs) have shown remarkable success in learning from +graph-structured data. However, their application to directed graphs (digraphs) +presents unique challenges, primarily due to the inherent asymmetry in node +relationships. Traditional GNNs are adept at capturing unidirectional relations +but fall short in encoding the mutual path dependencies between nodes, such as +asymmetrical shortest paths typically found in digraphs. Recognizing this gap, +we introduce Commute Graph Neural Networks (CGNN), an approach that seamlessly +integrates node-wise commute time into the message passing scheme. The +cornerstone of CGNN is an efficient method for computing commute time using a +newly formulated digraph Laplacian. Commute time is then integrated into the +neighborhood aggregation process, with neighbor contributions weighted +according to their respective commute time to the central node in each layer. +It enables CGNN to directly capture the mutual, asymmetric relationships in +digraphs. Extensive experiments confirm the superior performance of CGNN. + +
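+ For intuition, the sketch below computes commute times from the pseudo-inverse of the standard (undirected) graph Laplacian and uses them to down-weight neighbour contributions in one message-passing step. The paper's contribution is a newly formulated digraph Laplacian, which this illustration does not reproduce.
+
+```python
+import numpy as np
+
+def commute_times(A):
+    """Pairwise commute times for an undirected graph with adjacency A,
+    via the pseudo-inverse of the standard graph Laplacian."""
+    d = A.sum(axis=1)
+    Lp = np.linalg.pinv(np.diag(d) - A)
+    diag = np.diag(Lp)
+    return d.sum() * (diag[:, None] + diag[None, :] - 2 * Lp)
+
+def commute_weighted_aggregate(A, H):
+    """One message-passing step where each neighbour's contribution is
+    down-weighted by its commute time to the central node."""
+    C = commute_times(A)
+    W = np.where(A > 0, 1.0 / (C + 1e-8), 0.0)
+    W = W / (W.sum(axis=1, keepdims=True) + 1e-12)
+    return W @ H
+
+# Toy 4-node path graph with 2-dimensional node features.
+A = np.array([[0, 1, 0, 0], [1, 0, 1, 0], [0, 1, 0, 1], [0, 0, 1, 0]], float)
+H = np.arange(8, dtype=float).reshape(4, 2)
+print(commute_weighted_aggregate(A, H))
+```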
+
+
+
+
+ + ♻ ☆ Do Large Language Models Truly Grasp Mathematics? An Empirical + Exploration From A Psychological Perspective + + +
+ Despite their proficiency in math tasks, the mechanisms underlying LLMs' +mathematical reasoning abilities remain a subject of debate. Recent studies +suggest that chain-of-thought (CoT) prompts can bolster mathematical reasoning +by encouraging LLMs to employ human-like logical reasoning (System 2), enabling +them to excel on the Cognitive Reflection Test (CRT). To assess whether LLMs +genuinely possess System 2-like logical reasoning, we introduced targeted +modifications to CRT problems. Our findings reveal that, despite the use of CoT +prompts, mainstream LLMs, including the latest o1-preview model, continue to +exhibit a significant error rate. Further analysis indicates that they +predominantly rely on System 1-like intuitive reasoning and pattern matching +derived from training data, rather than demonstrating mastery of mathematical +thinking. This discovery challenges the prevailing notion that LLMs possess +genuine logical reasoning abilities and that CoT can enhance them. +Consequently, this work may temper overly optimistic projections regarding +LLMs' advancement toward artificial general intelligence. + +
+
+
+
+
+ + ♻ ☆ Prompt-Based Spatio-Temporal Graph Transfer Learning + + +
+ Spatio-temporal graph neural networks have proven efficacy in capturing +complex dependencies for urban computing tasks such as forecasting and kriging. +Yet, their performance is constrained by the reliance on extensive data for +training on a specific task, thereby limiting their adaptability to new urban +domains with varied task demands. Although transfer learning has been proposed +to remedy this problem by leveraging knowledge across domains, the cross-task +generalization still remains under-explored in spatio-temporal graph transfer +learning due to the lack of a unified framework. To bridge the gap, we propose +Spatio-Temporal Graph Prompting (STGP), a prompt-based framework capable of +adapting to multi-diverse tasks in a data-scarce domain. Specifically, we first +unify different tasks into a single template and introduce a task-agnostic +network architecture that aligns with this template. This approach enables +capturing dependencies shared across tasks. Furthermore, we employ learnable +prompts to achieve domain and task transfer in a two-stage prompting pipeline, +facilitating the prompts to effectively capture domain knowledge and +task-specific properties. Our extensive experiments demonstrate that STGP +outperforms state-of-the-art baselines in three tasks (forecasting, kriging, and +extrapolation), achieving an improvement of up to 10.7%.
+
+
+
+
+ + ♻ ☆ Diffusion Policy Policy Optimization + + +
+ We introduce Diffusion Policy Policy Optimization, DPPO, an algorithmic +framework including best practices for fine-tuning diffusion-based policies +(e.g. Diffusion Policy) in continuous control and robot learning tasks using +the policy gradient (PG) method from reinforcement learning (RL). PG methods +are ubiquitous in training RL policies with other policy parameterizations; +nevertheless, they had been conjectured to be less efficient for +diffusion-based policies. Surprisingly, we show that DPPO achieves the +strongest overall performance and efficiency for fine-tuning in common +benchmarks compared to other RL methods for diffusion-based policies and also +compared to PG fine-tuning of other policy parameterizations. Through +experimental investigation, we find that DPPO takes advantage of unique +synergies between RL fine-tuning and the diffusion parameterization, leading to +structured and on-manifold exploration, stable training, and strong policy +robustness. We further demonstrate the strengths of DPPO in a range of +realistic settings, including simulated robotic tasks with pixel observations, +and via zero-shot deployment of simulation-trained policies on robot hardware +in a long-horizon, multi-stage manipulation task. Website with code: +diffusion-ppo.github.io + +
+
+ comment: Website: diffusion-ppo.github.io +
+
+
+
+
+ + ♻ ☆ Online Relational Inference for Evolving Multi-agent Interacting Systems NeurIPS 2024 + + +
+ We introduce a novel framework, Online Relational Inference (ORI), designed +to efficiently identify hidden interaction graphs in evolving multi-agent +interacting systems using streaming data. Unlike traditional offline methods +that rely on a fixed training set, ORI employs online backpropagation, updating +the model with each new data point, thereby allowing it to adapt to changing +environments in real-time. A key innovation is the use of an adjacency matrix +as a trainable parameter, optimized through a new adaptive learning rate +technique called AdaRelation, which adjusts based on the historical sensitivity +of the decoder to changes in the interaction graph. Additionally, a data +augmentation method named Trajectory Mirror (TM) is introduced to improve +generalization by exposing the model to varied trajectory patterns. +Experimental results on both synthetic datasets and real-world data (CMU MoCap +for human motion) demonstrate that ORI significantly improves the accuracy and +adaptability of relational inference in dynamic settings compared to existing +methods. This approach is model-agnostic, enabling seamless integration with +various neural relational inference (NRI) architectures, and offers a robust +solution for real-time applications in complex, evolving systems. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ SeafloorAI: A Large-scale Vision-Language Dataset for Seafloor + Geological Survey + + +
+ A major obstacle to the advancement of machine learning models in marine +science, particularly in sonar imagery analysis, is the scarcity of AI-ready +datasets. While there have been efforts to make AI-ready sonar image datasets +publicly available, they suffer from limitations in terms of environment +setting and scale. To bridge this gap, we introduce SeafloorAI, the first +extensive AI-ready dataset for seafloor mapping across 5 geological layers, +curated in collaboration with marine scientists. We further extend the +dataset to SeafloorGenAI by incorporating the language component in order to +facilitate the development of both vision- and language-capable machine +learning models for sonar imagery. The dataset consists of 62 geo-distributed +data surveys spanning 17,300 square kilometers, with 696K sonar images, 827K +annotated segmentation masks, 696K detailed language descriptions and +approximately 7M question-answer pairs. By making our data processing source +code publicly available, we aim to engage the marine science community to +enrich the data pool and inspire the machine learning community to develop more +robust models. This collaborative approach will enhance the capabilities and +applications of our datasets within both fields.
+
+
+
+
+ + ♻ ☆ Winner-Take-All Column Row Sampling for Memory Efficient Adaptation of + Language Model + + +
+ With the rapid growth in model size, fine-tuning large pre-trained +language models has become increasingly difficult due to their extensive memory +usage. Previous works usually focus on reducing the number of trainable +parameters in the network. While the model parameters do contribute to memory +usage, the primary memory bottleneck during training arises from storing +feature maps, also known as activations, as they are crucial for gradient +calculation. Notably, neural networks are usually trained using stochastic +gradient descent. We argue that in stochastic optimization, models can handle +noisy gradients as long as the gradient estimator is unbiased with reasonable +variance. Following this motivation, we propose a new family of unbiased +estimators called WTA-CRS for matrix multiplication with reduced variance, which +only requires storing the sub-sampled activations for calculating the gradient. +Our work provides both theoretical and experimental evidence that, in the +context of tuning transformers, our proposed estimators exhibit lower variance +compared to existing ones. By replacing the linear operation with our +approximated one in transformers, we achieve up to a 2.7$\times$ peak memory +reduction with almost no accuracy drop and enable up to a $6.4\times$ larger +batch size. Under the same hardware, WTA-CRS enables better downstream task +performance by applying larger models and/or faster training speed with larger +batch sizes.
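+ The family of estimators referred to above builds on column-row sampling (CRS) for matrix products. The sketch below shows plain CRS with norm-proportional sampling and unbiased rescaling; the paper's winner-take-all (WTA) variant changes the sampling scheme and is not reproduced here.
+
+```python
+import numpy as np
+
+def crs_matmul(A, B, k, rng=None):
+    """Unbiased column-row sampling estimate of A @ B that touches only k
+    column/row pairs, with norm-proportional sampling probabilities."""
+    rng = rng or np.random.default_rng()
+    p = np.linalg.norm(A, axis=0) * np.linalg.norm(B, axis=1)
+    p = p / p.sum()
+    idx = rng.choice(len(p), size=k, replace=True, p=p)
+    scale = 1.0 / (k * p[idx])                  # rescaling keeps the estimator unbiased
+    return (A[:, idx] * scale) @ B[idx, :]
+
+# Quick check of the relative approximation error on random matrices.
+rng = np.random.default_rng(0)
+A, B = rng.normal(size=(64, 512)), rng.normal(size=(512, 32))
+exact, approx = A @ B, crs_matmul(A, B, k=128, rng=rng)
+print(np.linalg.norm(exact - approx) / np.linalg.norm(exact))
+```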
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ A multi-purpose automatic editing system based on lecture semantics for + remote education + + +
+ Remote teaching has become popular recently due to its convenience and +safety, especially under extreme circumstances like a pandemic. However, online +students usually have a poor experience since the information acquired from the +views provided by the broadcast platforms is limited. One potential solution is +to show more camera views simultaneously, but it is technically challenging and +distracting for the viewers. Therefore, an automatic multi-camera +directing/editing system, which aims to select the view of greatest interest at +each time instant to guide the attention of online students, is in urgent +demand. However, existing systems mostly make simple assumptions and focus on +tracking the position of the speaker instead of the real lecture semantics, and +therefore have limited capacities to deliver optimal information flow. To this +end, this paper proposes an automatic multi-purpose editing system based on the +lecture semantics, which can both direct the multiple video streams for +real-time broadcasting and edit the optimal video offline for review purposes. +Our system directs the views by semantically analyzing the class events while +following professional directing rules, mimicking a human director to +capture the regions of interest from the viewpoint of the onsite students. We +conduct both qualitative and quantitative analyses to verify the effectiveness +of the proposed system and its components.
+
+
+
+
+ + ☆ Continuous Sign Language Recognition System using Deep Learning with + MediaPipe Holistic + + +
+ Sign languages are the languages of hearing-impaired people, who use visual cues +such as hand, facial, and body movements for communication. There are +different signs and gestures representing alphabets, words, and phrases. +Nowadays, approximately 300 sign languages are practiced worldwide, such as +American Sign Language (ASL), Chinese Sign Language (CSL), Indian Sign Language +(ISL), and many more. Sign languages are dependent on the vocal language of a +place. Unlike vocal or spoken languages, there are no helping words in sign +language like is, am, are, was, were, will, be, etc. As only a limited +population is well-versed in sign language, this lack of familiarity with sign +language hinders hearing-impaired people from communicating freely and easily +with everyone. This issue can be addressed by a sign language recognition (SLR) +system that can translate sign language into vocal +language. In this paper, a continuous SLR system is proposed using a deep +learning model employing Long Short-Term Memory (LSTM), trained and tested on +an ISL primary dataset. This dataset is created using the MediaPipe Holistic +pipeline for tracking face, hand, and body movements and collecting landmarks. +The system recognizes signs and gestures in real time with 88.23% accuracy.
+
+ comment: 14 pages, 4 figures, Wireless Pers Commun +
+
+
+
+
+ + ☆ The Concatenator: A Bayesian Approach To Real Time Concatenative + Musaicing + + +
+ We present ``The Concatenator,'' a real time system for audio-guided +concatenative synthesis. Similarly to Driedger et al.'s ``musaicing'' (or +``audio mosaicing'') technique, we concatenate a set number of windows within a +corpus of audio to re-create the harmonic and percussive aspects of a target +audio stream. Unlike Driedger's NMF-based technique, however, we instead use an +explicitly Bayesian point of view, where corpus window indices are hidden +states and the target audio stream is an observation. We use a particle filter +to infer the best hidden corpus states in real-time. Our transition model +includes a tunable parameter to control the time-continuity of corpus grains, +and our observation model allows users to prioritize how quickly windows change +to match the target. Because the computational complexity of the system is +independent of the corpus size, our system scales to corpora that are hours +long, which is an important feature in the age of vast audio data collections. +Within The Concatenator module itself, composers can vary grain length, fit to +target, and pitch shift in real time while reacting to the sounds they hear, +enabling them to rapidly iterate ideas. To conclude our work, we evaluate our +system with extensive quantitative tests of the effects of parameters, as well +as a qualitative evaluation with artistic insights. Based on the quality of the +results, we believe the real-time capability unlocks new avenues for musical +expression and control, suitable for live performance and modular synthesis +integration, which furthermore represents an essential breakthrough in +concatenative synthesis technology. + +
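+ The Bayesian framing can be sketched with a small bootstrap particle filter over corpus window indices: the transition favours continuing to the adjacent window, and the observation weight rewards similarity between corpus and target window features. This is a rough illustration of the framing only; the transition and observation models, features, and parameter values are assumptions rather than the system's actual implementation.
+
+```python
+import numpy as np
+
+def particle_filter_concat(corpus_feats, target_feats, n_particles=500,
+                           p_continue=0.9, temperature=0.5, rng=None):
+    """Choose one corpus window per target window with a bootstrap particle
+    filter over corpus window indices; parameter values are illustrative."""
+    rng = rng or np.random.default_rng()
+    n_corpus = len(corpus_feats)
+    particles = rng.integers(0, n_corpus, size=n_particles)
+    chosen = []
+    for target in target_feats:
+        # Transition: mostly continue to the adjacent window, occasionally jump.
+        jump = rng.random(n_particles) > p_continue
+        particles = np.where(jump, rng.integers(0, n_corpus, size=n_particles),
+                             np.minimum(particles + 1, n_corpus - 1))
+        # Observation weights from distance to the current target window features.
+        d = np.linalg.norm(corpus_feats[particles] - target, axis=1)
+        w = np.exp(-(d - d.min()) / temperature)
+        w /= w.sum()
+        chosen.append(int(particles[np.argmax(w)]))
+        # Resample particles in proportion to their weights.
+        particles = particles[rng.choice(n_particles, size=n_particles, p=w)]
+    return chosen
+
+# Toy usage with random spectral-frame features.
+rng = np.random.default_rng(0)
+corpus, target = rng.normal(size=(1000, 12)), rng.normal(size=(20, 12))
+print(particle_filter_concat(corpus, target, rng=rng))
+```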
+
+ comment: 12 pages, 6 figures, Accepted for Publication in The International + Society for Music Information Retrieval Proceedings, 2024 +
+
+
+
+
+ + ♻ ☆ PIAST: A Multimodal Piano Dataset with Audio, Symbolic and Text + + +
+ While piano music has become a significant area of study in Music Information +Retrieval (MIR), there is a notable lack of datasets for piano solo music with +text labels. To address this gap, we present PIAST (PIano dataset with Audio, +Symbolic, and Text), a piano music dataset. Utilizing a piano-specific taxonomy +of semantic tags, we collected 9,673 tracks from YouTube and added human +annotations for 2,023 tracks by music experts, resulting in two subsets: +PIAST-YT and PIAST-AT. Both include audio, text, tag annotations, and +transcribed MIDI utilizing state-of-the-art piano transcription and beat +tracking models. Among many possible tasks with the multi-modal dataset, we +conduct music tagging and retrieval using both audio and MIDI data and report +baseline performances to demonstrate its potential as a valuable resource for +MIR research. + +
+
+ comment: Accepted for publication at the 3rd Workshop on NLP for Music and + Audio (NLP4MusA 2024) +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 61 + +
+
+
+ + ☆ A Multilingual Sentiment Lexicon for Low-Resource Language Translation + using Large Languages Models and Explainable AI + + +
+ South Africa and the Democratic Republic of Congo (DRC) present a complex +linguistic landscape with languages such as Zulu, Sepedi, Afrikaans, French, +English, and Tshiluba (Ciluba), which creates unique challenges for AI-driven +translation and sentiment analysis systems due to a lack of accurately labeled +data. This study seeks to address these challenges by developing a multilingual +lexicon designed for French and Tshiluba, now expanded to include translations +in English, Afrikaans, Sepedi, and Zulu. The lexicon enhances cultural +relevance in sentiment classification by integrating language-specific +sentiment scores. A comprehensive testing corpus is created to support +translation and sentiment analysis tasks, with machine learning models such as +Random Forest, Support Vector Machine (SVM), Decision Trees, and Gaussian Naive +Bayes (GNB) trained to predict sentiment across low resource languages (LRLs). +Among them, the Random Forest model performed particularly well, capturing +sentiment polarity and handling language-specific nuances effectively. +Furthermore, Bidirectional Encoder Representations from Transformers (BERT), a +Large Language Model (LLM), is applied to predict context-based sentiment with +high accuracy, achieving 99% accuracy and 98% precision, outperforming other +models. The BERT predictions were clarified using Explainable AI (XAI), +improving transparency and fostering confidence in sentiment classification. +Overall, findings demonstrate that the proposed lexicon and machine learning +models significantly enhance translation and sentiment analysis for LRLs in +South Africa and the DRC, laying a foundation for future AI models that support +underrepresented languages, with applications across education, governance, and +business in multilingual contexts. + +
+
+ comment: This work is part of a PhD proposal in Information Technology at the + University of Pretoria, supervised by Dr. Mike Wa Nkongolo and co-supervised + by Dr. Phil van Deventer, under the Low-Resource Language Processing Lab in + the Department of Informatics +
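For the classical-ML side of this setup, a minimal scikit-learn sketch is shown below: TF-IDF features are augmented with a lexicon-derived sentiment score and fed to a Random Forest. The toy lexicon entries and corpus are placeholders, not the study's lexicon or data.

```python
# Hypothetical mini-example: lexicon score + TF-IDF features -> Random Forest.
import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

lexicon = {"bien": 1.0, "bonne": 0.8, "mal": -1.0, "mauvais": -0.8}   # placeholder entries

def lexicon_score(text):
    hits = [lexicon[t] for t in text.lower().split() if t in lexicon]
    return float(np.mean(hits)) if hits else 0.0

texts = ["c'est bien", "c'est mal", "une bonne journee", "un mauvais jour"]  # toy corpus
labels = [1, 0, 1, 0]

vec = TfidfVectorizer()
X = hstack([vec.fit_transform(texts),
            csr_matrix([[lexicon_score(t)] for t in texts])])
clf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X, labels)

new = ["une bonne semaine"]
X_new = hstack([vec.transform(new), csr_matrix([[lexicon_score(new[0])]])])
print(clf.predict(X_new))
```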
+
+
+
+
+ + ☆ Improving Bilingual Capabilities of Language Models to Support Diverse + Linguistic Practices in Education + + +
+ Large language models (LLMs) offer promise in generating educational content, +providing instructor feedback, and reducing teacher workload on assessments. +While prior studies have focused on studying LLM-powered learning analytics, +limited research has examined how effective LLMs are in a bilingual context. In +this paper, we study the effectiveness of multilingual large language models +(MLLMs) across monolingual (English-only, Spanish-only) and bilingual +(Spanglish) student writing. We present a learning analytics use case that +details LLM performance in assessing acceptable and unacceptable explanations +of Science and Social Science concepts. Our findings reveal a significant bias +in the grading performance of pre-trained models for bilingual writing compared +to English-only and Spanish-only writing. Following this, we fine-tune +open-source MLLMs including Llama 3.1 and Mistral NeMo using synthetic datasets +generated in English, Spanish, and Spanglish. Our experiments indicate that the +models perform significantly better for all three languages after fine-tuning +with bilingual data. This study highlights the potential of enhancing MLLM +effectiveness to support authentic language practices amongst bilingual +learners. It also aims to illustrate the value of incorporating non-English +languages into the design and implementation of language models in education. + +
+
+
+
+
+ + ☆ A Capabilities Approach to Studying Bias and Harm in Language + Technologies + + +
+ Mainstream Natural Language Processing (NLP) research has ignored the +majority of the world's languages. In moving from excluding the majority of the +world's languages to blindly adopting what we make for English, we first risk +importing the same harms we have at best mitigated and at least measured for +English. However, in evaluating and mitigating harms arising from adopting new +technologies into such contexts, we often disregard (1) the actual community +needs of Language Technologies, and (2) biases and fairness issues within the +context of the communities. In this extended abstract, we consider fairness, +bias, and inclusion in Language Technologies through the lens of the +Capabilities Approach. The Capabilities Approach centers on what people are +capable of achieving, given their intersectional social, political, and +economic contexts instead of what resources are (theoretically) available to +them. We detail the Capabilities Approach, its relationship to multilingual and +multicultural evaluation, and how the framework affords meaningful +collaboration with community members in defining and measuring the harms of +Language Technologies. + +
+
+ comment: Accepted to the New Perspectives on Bias and Discrimination in + Language Technology workshop +
+
+
+
+
+ + ☆ Unfair Alignment: Examining Safety Alignment Across Vision Encoder + Layers in Vision-Language Models + + +
+ Vision-language models (VLMs) have improved significantly in multi-modal +tasks, but their more complex architecture makes their safety alignment more +challenging than the alignment of large language models (LLMs). In this paper, +we reveal an unfair distribution of safety across the layers of VLM's vision +encoder, with earlier and middle layers being disproportionately vulnerable to +malicious inputs compared to the more robust final layers. This 'cross-layer' +vulnerability stems from the model's inability to generalize its safety +training from the default architectural settings used during training to unseen +or out-of-distribution scenarios, leaving certain layers exposed. We conduct a +comprehensive analysis by projecting activations from various intermediate +layers and demonstrate that these layers are more likely to generate harmful +outputs when exposed to malicious inputs. Our experiments with LLaVA-1.5 and +Llama 3.2 show discrepancies in attack success rates and toxicity scores across +layers, indicating that current safety alignment strategies focused on a single +default layer are insufficient. + +
+
+ comment: Preprint, Under Review +
+
+
+
+
+ + ☆ Language Models are Hidden Reasoners: Unlocking Latent Reasoning + Capabilities via Self-Rewarding + + +
+ Large language models (LLMs) have shown impressive capabilities, but still +struggle with complex reasoning tasks requiring multiple steps. While +prompt-based methods like Chain-of-Thought (CoT) can improve LLM reasoning at +inference time, optimizing reasoning capabilities during training remains +challenging. We introduce LaTent Reasoning Optimization (LaTRO), a principled +framework that formulates reasoning as sampling from a latent distribution and +optimizes it via variational approaches. LaTRO enables LLMs to concurrently +improve both their reasoning process and ability to evaluate reasoning quality, +without requiring external feedback or reward models. We validate LaTRO through +experiments on GSM8K and ARC-Challenge datasets using multiple model +architectures. On GSM8K, LaTRO improves zero-shot accuracy by an average of +12.5% over base models and 9.6% over supervised fine-tuning across +Phi-3.5-mini, Mistral-7B, and Llama-3.1-8B. Our findings suggest that +pre-trained LLMs possess latent reasoning capabilities that can be unlocked and +enhanced through our proposed optimization approach in a self-improvement +manner. The code of LaTRO is available at +\url{https://github.com/SalesforceAIResearch/LaTRO}. + +
+
+
+
+
+ + ☆ Diversity Helps Jailbreak Large Language Models + + +
+ We have uncovered a powerful jailbreak technique that leverages large +language models' ability to diverge from prior context, enabling them to bypass +safety constraints and generate harmful outputs. By simply instructing the LLM +to deviate and obfuscate previous attacks, our method dramatically outperforms +existing approaches, achieving up to a 62% higher success rate in compromising +nine leading chatbots, including GPT-4, Gemini, and Llama, while using only 13% +of the queries. This revelation exposes a critical flaw in current LLM safety +training, suggesting that existing methods may merely mask vulnerabilities +rather than eliminate them. Our findings sound an urgent alarm for the need to +revolutionize testing methodologies to ensure robust and reliable LLM security. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2312.02119 +
+
+
+
+
+ + ☆ Medical Adaptation of Large Language and Vision-Language Models: Are We + Making Progress? EMNLP 2024 + + +
+ Several recent works seek to develop foundation models specifically for +medical applications, adapting general-purpose large language models (LLMs) and +vision-language models (VLMs) via continued pretraining on publicly available +biomedical corpora. These works typically claim that such domain-adaptive +pretraining (DAPT) improves performance on downstream medical tasks, such as +answering medical licensing exam questions. In this paper, we compare seven +public "medical" LLMs and two VLMs against their corresponding base models, +arriving at a different conclusion: all medical VLMs and nearly all medical +LLMs fail to consistently improve over their base models in the zero-/few-shot +prompting regime for medical question-answering (QA) tasks. For instance, +across the tasks and model pairs we consider in the 3-shot setting, medical +LLMs only outperform their base models in 12.1% of cases, reach a (statistical) +tie in 49.8% of cases, and are significantly worse than their base models in +the remaining 38.2% of cases. Our conclusions are based on (i) comparing each +medical model head-to-head, directly against the corresponding base model; (ii) +optimizing the prompts for each model separately; and (iii) accounting for +statistical uncertainty in comparisons. While these basic practices are not +consistently adopted in the literature, our ablations show that they +substantially impact conclusions. Our findings suggest that state-of-the-art +general-domain models may already exhibit strong medical knowledge and +reasoning capabilities, and offer recommendations to strengthen the conclusions +of future studies. + +
+
+ comment: Accepted to EMNLP 2024 Main Conference as Long Paper (Oral) +
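The methodological point here (head-to-head, per-question comparison with statistical uncertainty) can be illustrated with a generic paired bootstrap over correctness vectors. The toy data and the 95% interval convention below are my own choices, not the paper's exact procedure.

```python
# Paired bootstrap over per-question correctness for two models on the same items.
import numpy as np

def paired_bootstrap(correct_a, correct_b, n_boot=10_000, seed=0):
    """correct_a / correct_b: aligned 0/1 arrays (one entry per question)."""
    rng = np.random.default_rng(seed)
    a, b = np.asarray(correct_a), np.asarray(correct_b)
    diffs = np.empty(n_boot)
    for i in range(n_boot):
        idx = rng.integers(0, len(a), size=len(a))   # resample questions jointly
        diffs[i] = a[idx].mean() - b[idx].mean()
    lo, hi = np.percentile(diffs, [2.5, 97.5])
    return a.mean() - b.mean(), (lo, hi)

medical = np.random.binomial(1, 0.62, size=500)   # toy correctness vectors
base = np.random.binomial(1, 0.60, size=500)
delta, (lo, hi) = paired_bootstrap(medical, base)
print(f"accuracy delta = {delta:.3f}, 95% CI = ({lo:.3f}, {hi:.3f})")
# Treat the pair as a statistical tie whenever the interval contains zero.
```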
+
+
+
+
+ + ☆ Self-Consistency Preference Optimization + + +
+ Self-alignment, whereby models learn to improve themselves without human +annotation, is a rapidly growing research area. However, existing techniques +often fail to improve complex reasoning tasks due to the difficulty of +assigning correct rewards. An orthogonal approach that is known to improve +correctness is self-consistency, a method applied at inference time based on +multiple sampling in order to find the most consistent answer. In this work, we +extend the self-consistency concept to help train models. We thus introduce +self-consistency preference optimization (ScPO), which iteratively trains +consistent answers to be preferred over inconsistent ones on unsupervised new +problems. We show ScPO leads to large improvements over conventional reward +model training on reasoning tasks such as GSM8K and MATH, closing the gap with +supervised training with gold answers or preferences, and that combining ScPO +with standard supervised learning improves results even further. On ZebraLogic, +ScPO finetunes Llama-3 8B to be superior to Llama-3 70B, Gemma-2 27B, and +Claude-3 Haiku. + +
+
+ comment: 16 pages, 3 figures +
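A minimal sketch of how self-consistency votes can be turned into preference pairs in the spirit of ScPO is shown below; `generate` and `extract_answer` are hypothetical stand-ins for the sampling pipeline, and the margin weighting is an assumption rather than the paper's exact recipe.

```python
# Build (chosen, rejected) preference pairs from self-consistency voting.
from collections import Counter

def build_preference_pair(problem, generate, extract_answer, n_samples=16):
    """generate(problem) -> one sampled solution string (hypothetical LLM call);
    extract_answer(solution) -> final answer string."""
    solutions = [generate(problem) for _ in range(n_samples)]
    votes = Counter(extract_answer(s) for s in solutions)
    (top_ans, top_n), *rest = votes.most_common()
    if not rest:
        return None                      # unanimous vote: no preference signal
    bottom_ans, bottom_n = rest[-1]
    chosen = next(s for s in solutions if extract_answer(s) == top_ans)
    rejected = next(s for s in solutions if extract_answer(s) == bottom_ans)
    margin = (top_n - bottom_n) / n_samples   # vote margin can weight the loss
    return {"prompt": problem, "chosen": chosen,
            "rejected": rejected, "margin": margin}

# The resulting pairs can then be fed to a DPO-style preference objective.
```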
+
+
+
+
+ + ☆ Summarization of Opinionated Political Documents with Varied + Perspectives + + +
+ Global partisan hostility and polarization has increased, and this +polarization is heightened around presidential elections. Models capable of +generating accurate summaries of diverse perspectives can help reduce such +polarization by exposing users to alternative perspectives. In this work, we +introduce a novel dataset and task for independently summarizing each political +perspective in a set of passages from opinionated news articles. For this task, +we propose a framework for evaluating different dimensions of perspective +summary performance. We benchmark 10 models of varying sizes and architectures +through both automatic and human evaluation. While recent models like GPT-4o +perform well on this task, we find that all models struggle to generate +summaries faithful to the intended perspective. Our analysis of summaries +focuses on how extraction behavior depends on the features of the input +documents. + +
+
+
+
+
+ + ☆ M3SciQA: A Multi-Modal Multi-Document Scientific QA Benchmark for + Evaluating Foundation Models + + +
+ Existing benchmarks for evaluating foundation models mainly focus on +single-document, text-only tasks. However, they often fail to fully capture the +complexity of research workflows, which typically involve interpreting +non-textual data and gathering information across multiple documents. To +address this gap, we introduce M3SciQA, a multi-modal, multi-document +scientific question answering benchmark designed for a more comprehensive +evaluation of foundation models. M3SciQA consists of 1,452 expert-annotated +questions spanning 70 natural language processing paper clusters, where each +cluster represents a primary paper along with all its cited documents, +mirroring the workflow of comprehending a single paper by requiring multi-modal +and multi-document data. With M3SciQA, we conduct a comprehensive evaluation of +18 foundation models. Our results indicate that current foundation models still +significantly underperform compared to human experts in multi-modal information +retrieval and in reasoning across multiple scientific documents. Additionally, +we explore the implications of these findings for the future advancement of +applying foundation models in multi-modal scientific literature analysis. + +
+
+
+
+
+ + ☆ Beemo: Benchmark of Expert-edited Machine-generated Outputs + + +
+ The rapid proliferation of large language models (LLMs) has increased the +volume of machine-generated texts (MGTs) and blurred text authorship in various +domains. However, most existing MGT benchmarks include single-author texts +(human-written and machine-generated). This conventional design fails to +capture more practical multi-author scenarios, where the user refines the LLM +response for natural flow, coherence, and factual correctness. Our paper +introduces the Benchmark of Expert-edited Machine-generated Outputs (Beemo), +which includes 6.5k texts written by humans, generated by ten +instruction-finetuned LLMs, and edited by experts for various use cases, +ranging from creative writing to summarization. Beemo additionally comprises +13.1k machine-generated and LLM-edited texts, allowing for diverse MGT +detection evaluation across various edit types. We document Beemo's creation +protocol and present the results of benchmarking 33 configurations of MGT +detectors in different experimental setups. We find that expert-based editing +evades MGT detection, while LLM-edited texts are unlikely to be recognized as +human-written. Beemo and all materials are publicly available. + +
+
+
+
+
+ + ☆ Prompt Engineering Using GPT for Word-Level Code-Mixed Language + Identification in Low-Resource Dravidian Languages + + +
+ Language Identification (LI) is crucial for various natural language
+processing tasks, serving as a foundational step in applications such as
+sentiment analysis, machine translation, and information retrieval. In
+multilingual societies like India, particularly among the youth engaging on
+social media, text often exhibits code-mixing, blending local languages with
+English at different linguistic levels. This phenomenon presents formidable
+challenges for LI systems, especially when languages intermingle within
+single words. Dravidian languages, prevalent in southern India, possess rich
+morphological structures yet suffer from under-representation in digital
+platforms, leading to the adoption of Roman or hybrid scripts for
+communication. This paper introduces a prompt-based method for a shared task
+aimed at addressing word-level LI challenges in Dravidian languages. In this
+work, we leveraged GPT-3.5 Turbo to examine whether a large language model
+can correctly classify words into the appropriate language categories. Our
+findings show that the Kannada model consistently outperformed the Tamil
+model across most metrics, indicating higher accuracy and reliability in
+identifying and categorizing Kannada language instances. In contrast, the
+Tamil model showed moderate performance, particularly needing improvement in
+precision and recall.
+
+
+ comment: Accepted at FIRE 2024 (Track: Word-level Language Identification in + Dravidian Languages) +
+
+
+
+
+ + ☆ WorryWords: Norms of Anxiety Association for over 44k English Words + + +
+ Anxiety, the anticipatory unease about a potential negative outcome, is a +common and beneficial human emotion. However, there is still much that is not +known, such as how anxiety relates to our body and how it manifests in +language. This is especially pertinent given the increasing impact of +anxiety-related disorders. In this work, we introduce WorryWords, the first +large-scale repository of manually derived word--anxiety associations for over +44,450 English words. We show that the anxiety associations are highly +reliable. We use WorryWords to study the relationship between anxiety and other +emotion constructs, as well as the rate at which children acquire anxiety words +with age. Finally, we show that using WorryWords alone, one can accurately +track the change of anxiety in streams of text. The lexicon enables a wide +variety of anxiety-related research in psychology, NLP, public health, and +social sciences. WorryWords (and its translations to over 100 languages) is +freely available. http://saifmohammad.com/worrywords.html + +
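In its simplest form, tracking anxiety in a stream of text with a word-association lexicon reduces to averaging word scores over a sliding window, as in the sketch below. The four lexicon entries are invented placeholders; the real WorryWords resource is at the URL above.

```python
# Rolling anxiety score over a token stream using a word -> score lexicon.
import re

anxiety_lexicon = {"worry": 0.9, "deadline": 0.6, "calm": -0.7, "safe": -0.5}  # placeholders

def rolling_anxiety(text, window=50):
    tokens = re.findall(r"[a-z']+", text.lower())
    scores = []
    for i in range(max(len(tokens) - window + 1, 1)):
        chunk = tokens[i:i + window]
        hits = [anxiety_lexicon[t] for t in chunk if t in anxiety_lexicon]
        scores.append(sum(hits) / len(hits) if hits else 0.0)
    return scores   # one smoothed score per window position

print(rolling_anxiety("I worry about the deadline but I feel safe and calm",
                      window=5))
```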
+
+
+
+
+ + ☆ What Really is Commonsense Knowledge? + + +
+ Commonsense datasets have been well developed in Natural Language
+Processing, mainly through crowdsourced human annotation. However, there are
+debates on the genuineness of commonsense reasoning benchmarks. Specifically,
+a significant portion of instances in some commonsense benchmarks do not
+concern commonsense knowledge. That problem would undermine the measurement
+of the true commonsense reasoning ability of evaluated models. It is also
+suggested that the problem originated from a blurry concept of commonsense
+knowledge, as distinguished from other types of knowledge. To demystify all
+of the above claims, in this study, we survey existing definitions of
+commonsense knowledge, ground them in the three frameworks for defining
+concepts, and consolidate them into a multi-framework unified definition of
+commonsense knowledge (which we call the consolidated definition). We then
+use the consolidated definition for annotations and experiments on the
+CommonsenseQA and CommonsenseQA 2.0 datasets to examine the above claims.
+Our study shows that there exists a large portion of
+non-commonsense-knowledge instances in the two datasets, and a large
+performance gap between these two subsets, with Large Language Models (LLMs)
+performing worse on commonsense-knowledge instances.
+
+
+ comment: Code and data will be released together with the next version of the + paper +
+
+
+
+
+ + ☆ How Does A Text Preprocessing Pipeline Affect Ontology Syntactic + Matching? + + +
+ The generic text preprocessing pipeline, comprising Tokenisation, +Normalisation, Stop Words Removal, and Stemming/Lemmatisation, has been +implemented in many ontology matching (OM) systems. However, the lack of +standardisation in text preprocessing creates diversity in mapping results. In +this paper, we investigate the effect of the text preprocessing pipeline on OM +tasks at syntactic levels. Our experiments on 8 Ontology Alignment Evaluation +Initiative (OAEI) track repositories with 49 distinct alignments indicate: (1) +Tokenisation and Normalisation are currently more effective than Stop Words +Removal and Stemming/Lemmatisation; and (2) The selection of Lemmatisation and +Stemming is task-specific. We recommend standalone Lemmatisation or Stemming +with post-hoc corrections. We find that (3) Porter Stemmer and Snowball Stemmer +perform better than Lancaster Stemmer; and that (4) Part-of-Speech (POS) +Tagging does not help Lemmatisation. To repair less effective Stop Words +Removal and Stemming/Lemmatisation used in OM tasks, we propose a novel +context-based pipeline repair approach that significantly improves matching +correctness and overall matching performance. We also discuss the use of text +preprocessing pipeline in the new era of large language models (LLMs). + +
+
+ comment: 13 pages, 26 figures, 4 tables +
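For readers who want to see the four pipeline stages side by side, here is a small NLTK sketch comparing the Porter, Snowball, and Lancaster stemmers against WordNet lemmatisation on a made-up ontology label; it illustrates the generic pipeline, not the OM systems evaluated in the paper.

```python
# Tokenisation -> Normalisation -> Stop-word removal -> Stemming/Lemmatisation.
# One-time setup: nltk.download("punkt"); nltk.download("stopwords"); nltk.download("wordnet")
import nltk
from nltk.corpus import stopwords
from nltk.stem import (PorterStemmer, SnowballStemmer,
                       LancasterStemmer, WordNetLemmatizer)

def preprocess(label, stemmer=None, lemmatise=False):
    tokens = nltk.word_tokenize(label.replace("_", " "))        # tokenisation
    tokens = [t.lower() for t in tokens if t.isalpha()]         # normalisation
    stops = set(stopwords.words("english"))
    tokens = [t for t in tokens if t not in stops]              # stop-word removal
    if lemmatise:
        wnl = WordNetLemmatizer()
        return [wnl.lemmatize(t) for t in tokens]
    if stemmer is not None:
        return [stemmer.stem(t) for t in tokens]
    return tokens

label = "has_Conference_Proceedings"                            # invented class label
for name, st in [("porter", PorterStemmer()),
                 ("snowball", SnowballStemmer("english")),
                 ("lancaster", LancasterStemmer())]:
    print(name, preprocess(label, stemmer=st))
print("lemma", preprocess(label, lemmatise=True))
```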
+
+
+
+
+ + ☆ Interactions Across Blocks in Post-Training Quantization of Large + Language Models + + +
+ Post-training quantization is widely employed to reduce the computational +demands of neural networks. Typically, individual substructures, such as layers +or blocks of layers, are quantized with the objective of minimizing +quantization errors in their pre-activations by fine-tuning the corresponding +weights. Deriving this local objective from the global objective of minimizing +task loss involves two key simplifications: assuming substructures are mutually +independent and ignoring the knowledge of subsequent substructures as well as +the task loss. In this work, we assess the effects of these simplifications on +weight-only quantization of large language models. We introduce two multi-block +fine-tuning strategies and compare them against the baseline of fine-tuning +single transformer blocks. The first captures correlations of weights across +blocks by jointly optimizing multiple quantized blocks. The second incorporates +knowledge of subsequent blocks by minimizing the error in downstream +pre-activations rather than focusing solely on the quantized block. Our +findings indicate that the effectiveness of these methods depends on the +specific network model, with no impact on some models but demonstrating +significant benefits for others. + +
+
+
+
+
+ + ☆ Evaluation data contamination in LLMs: how do we measure it and (when) + does it matter? + + +
+ Hampering the interpretation of benchmark scores, evaluation data +contamination has become a growing concern in the evaluation of LLMs, and an +active area of research studies its effects. While evaluation data +contamination is easily understood intuitively, it is surprisingly difficult to +define precisely which samples should be considered contaminated and, +consequently, how it impacts benchmark scores. We propose that these questions +should be addressed together and that contamination metrics can be assessed +based on whether models benefit from the examples they mark contaminated. We +propose a novel analysis method called ConTAM, and show with a large scale +survey of existing and novel n-gram based contamination metrics across 13 +benchmarks and 7 models from 2 different families that ConTAM can be used to +better understand evaluation data contamination and its effects. We find that +contamination may have a much larger effect than reported in recent LLM +releases and benefits models differently at different scales. We also find that +considering only the longest contaminated substring provides a better signal +than considering a union of all contaminated substrings, and that doing model +and benchmark specific threshold analysis greatly increases the specificity of +the results. Lastly, we investigate the impact of hyperparameter choices, +finding that, among other things, both using larger values of n and +disregarding matches that are infrequent in the pre-training data lead to many +false negatives. With ConTAM, we provide a method to empirically ground +evaluation data contamination metrics in downstream effects. With our +exploration, we shed light on how evaluation data contamination can impact LLMs +and provide insight into the considerations important when doing contamination +analysis. We end our paper by discussing these in more detail and providing +concrete suggestions for future work. + +
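Two of the design choices discussed above (scoring the union of contaminated substrings versus only the longest one) can be sketched with a few lines of n-gram bookkeeping. The definitions below are deliberately simplified stand-ins, not ConTAM's actual metrics.

```python
# Two simple n-gram contamination signals for one benchmark sample.
def ngrams(tokens, n):
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def contamination_stats(sample_tokens, pretrain_ngrams, n=8):
    """pretrain_ngrams: set of n-grams observed in the pre-training corpus."""
    marks = [tuple(sample_tokens[i:i + n]) in pretrain_ngrams
             for i in range(len(sample_tokens) - n + 1)]
    union_frac = sum(marks) / max(len(marks), 1)        # share of matched n-gram positions
    longest = run = 0
    for m in marks:                                      # longest contaminated run
        run = run + 1 if m else 0
        longest = max(longest, run)
    longest_frac = (longest + n - 1) / len(sample_tokens) if longest else 0.0
    return union_frac, longest_frac

corpus = "the quick brown fox jumps over the lazy dog and runs away fast".split()
sample = "a quick brown fox jumps over the lazy dog today".split()
print(contamination_stats(sample, ngrams(corpus, 4), n=4))
```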
+
+
+
+
+ + ☆ RAGulator: Lightweight Out-of-Context Detectors for Grounded Text + Generation + + +
+ Real-time detection of out-of-context LLM outputs is crucial for enterprises +looking to safely adopt RAG applications. In this work, we train lightweight +models to discriminate LLM-generated text that is semantically out-of-context +from retrieved text documents. We preprocess a combination of summarisation and +semantic textual similarity datasets to construct training data using minimal +resources. We find that DeBERTa is not only the best-performing model under +this pipeline, but it is also fast and does not require additional text +preprocessing or feature engineering. While emerging work demonstrates that +generative LLMs can also be fine-tuned and used in complex data pipelines to +achieve state-of-the-art performance, we note that speed and resource limits +are important considerations for on-premise deployment. + +
+
+
+
+
+ + ☆ Analyzing Multimodal Features of Spontaneous Voice Assistant Commands + for Mild Cognitive Impairment Detection + + +
+ Mild cognitive impairment (MCI) is a major public health concern due to its +high risk of progressing to dementia. This study investigates the potential of +detecting MCI with spontaneous voice assistant (VA) commands from 35 older +adults in a controlled setting. Specifically, a command-generation task is +designed with pre-defined intents for participants to freely generate commands +that are more associated with cognitive ability than read commands. We develop +MCI classification and regression models with audio, textual, intent, and +multimodal fusion features. We find the command-generation task outperforms the +command-reading task with an average classification accuracy of 82%, achieved +by leveraging multimodal fusion features. In addition, generated commands +correlate more strongly with memory and attention subdomains than read +commands. Our results confirm the effectiveness of the command-generation task +and imply the promise of using longitudinal in-home commands for MCI detection. + +
+
+
+
+
+ + ☆ Lexicalization Is All You Need: Examining the Impact of Lexical + Knowledge in a Compositional QALD System + + +
+ In this paper, we examine the impact of lexicalization on Question Answering +over Linked Data (QALD). It is well known that one of the key challenges in +interpreting natural language questions with respect to SPARQL lies in bridging +the lexical gap, that is mapping the words in the query to the correct +vocabulary elements. We argue in this paper that lexicalization, that is +explicit knowledge about the potential interpretations of a word with respect +to the given vocabulary, significantly eases the task and increases the +performance of QA systems. Towards this goal, we present a compositional QA +system that can leverage explicit lexical knowledge in a compositional manner +to infer the meaning of a question in terms of a SPARQL query. We show that +such a system, given lexical knowledge, has a performance well beyond current +QA systems, achieving up to a $35.8\%$ increase in the micro $F_1$ score +compared to the best QA system on QALD-9. This shows the importance and +potential of including explicit lexical knowledge. In contrast, we show that +LLMs have limited abilities to exploit lexical knowledge, with only marginal +improvements compared to a version without lexical knowledge. This shows that +LLMs have no ability to compositionally interpret a question on the basis of +the meaning of its parts, a key feature of compositional approaches. Taken +together, our work shows new avenues for QALD research, emphasizing the +importance of lexicalization and compositionality. + +
+
+ comment: 24th International Conference on Knowledge Engineering and Knowledge + Management (EKAW 2024), November 26-28, 2024, Amsterdam, The Netherlands +
+
+
+
+
+ + ☆ Computational Analysis of Gender Depiction in the Comedias of Calderón + de la Barca + + +
+ In theatre, playwrights use the portrayal of characters to explore culturally +based gender norms. In this paper, we develop quantitative methods to study +gender depiction in the non-religious works (comedias) of Pedro Calder\'on de +la Barca, a prolific Spanish 17th century author. We gather insights from a +corpus of more than 100 plays by using a gender classifier and applying model +explainability (attribution) methods to determine which text features are most +influential in the model's decision to classify speech as 'male' or 'female', +indicating the most gendered elements of dialogue in Calder\'on's comedias in a +human accessible manner. We find that female and male characters are portrayed +differently and can be identified by the gender prediction model at practically +useful accuracies (up to f=0.83). Analysis reveals semantic aspects of gender +portrayal, and demonstrates that the model is even useful in providing a +relatively accurate scene-by-scene prediction of cross-dressing characters. + +
+
+
+
+
+ + ☆ Multi3Hate: Multimodal, Multilingual, and Multicultural Hate Speech + Detection with Vision-Language Models + + +
+ Warning: this paper contains content that may be offensive or upsetting
+ Hate speech moderation on global platforms poses unique challenges due to
+the multimodal and multilingual nature of content, along with the varying
+cultural perceptions. How well do current vision-language models (VLMs)
+navigate these nuances? To investigate this, we create the first multimodal
+and multilingual parallel hate speech dataset, annotated by a multicultural
+set of annotators, called Multi3Hate. It contains 300 parallel meme samples
+across 5 languages: English, German, Spanish, Hindi, and Mandarin. We
+demonstrate that cultural background significantly affects multimodal hate
+speech annotation in our dataset. The average pairwise agreement among
+countries is just 74%, significantly lower than that of randomly selected
+annotator groups. Our qualitative analysis indicates that the lowest
+pairwise label agreement (only 67%, between the USA and India) can be
+attributed to cultural factors. We then conduct experiments with 5 large
+VLMs in a zero-shot setting, finding that these models align more closely
+with annotations from the US than with those from other cultures, even when
+the memes and prompts are presented in the dominant language of the other
+culture. Code and dataset are available at
+https://github.com/MinhDucBui/Multi3Hate.
+
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Polynomial Composition Activations: Unleashing the Dynamics of Large + Language Models + + +
+ Transformers have found extensive applications across various domains due
+to their powerful fitting capabilities. This success can be partially
+attributed to their inherent nonlinearity. Thus, in addition to the ReLU
+function employed in the original transformer architecture, researchers have
+explored alternative modules such as GeLU and SwishGLU to enhance
+nonlinearity and thereby augment representational capacity. In this paper,
+we propose a novel category of polynomial composition activations (PolyCom),
+designed to optimize the dynamics of transformers. Theoretically, we provide
+a comprehensive mathematical analysis of PolyCom, highlighting its enhanced
+expressivity and efficacy relative to other activation functions. Notably,
+we demonstrate that networks incorporating PolyCom achieve the
+$\textbf{optimal approximation rate}$, indicating that PolyCom networks
+require minimal parameters to approximate general smooth functions in
+Sobolev spaces. We conduct empirical experiments on the pre-training
+configurations of large language models (LLMs), including both dense and
+sparse architectures. By substituting conventional activation functions with
+PolyCom, we enable LLMs to capture higher-order interactions within the
+data, thus improving performance metrics in terms of accuracy and
+convergence rates. Extensive experimental results demonstrate the
+effectiveness of our method, showing substantial improvements over other
+activation functions. Code is available at
+https://github.com/BryceZhuo/PolyCom.
+
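The abstract does not spell out PolyCom's exact functional form, so the module below is only a generic polynomial-composition activation in the same spirit: a base nonlinearity whose integer powers are mixed with learnable coefficients. The degree, base activation, and initialisation are assumptions of mine.

```python
# Generic polynomial-composition activation (illustrative; not the exact PolyCom).
import torch
import torch.nn as nn

class PolyCompositionActivation(nn.Module):
    def __init__(self, degree=3):
        super().__init__()
        self.degree = degree
        self.base_act = nn.SiLU()
        # One learnable coefficient per power; start out as the plain base activation.
        self.coeffs = nn.Parameter(torch.zeros(degree + 1))
        with torch.no_grad():
            self.coeffs[1] = 1.0

    def forward(self, x):
        h = self.base_act(x)
        powers = torch.stack([h ** k for k in range(self.degree + 1)], dim=-1)
        return (powers * self.coeffs).sum(dim=-1)

ffn = nn.Sequential(nn.Linear(512, 2048), PolyCompositionActivation(),
                    nn.Linear(2048, 512))
print(ffn(torch.randn(2, 16, 512)).shape)   # torch.Size([2, 16, 512])
```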
+
+
+
+
+ + ☆ Performance evaluation of SLAM-ASR: The Good, the Bad, the Ugly, and the + Way Forward ICASSP 2025 + + +
+ Recent research has demonstrated that training a linear connector between +speech foundation encoders and large language models (LLMs) enables this +architecture to achieve strong ASR capabilities. Despite the impressive +results, it remains unclear whether these simple approaches are robust enough +across different scenarios and speech conditions, such as domain shifts and +different speech perturbations. In this paper, we address these questions by +conducting various ablation experiments using a recent and widely adopted +approach called SLAM-ASR. We present novel empirical findings that offer +insights on how to effectively utilize the SLAM-ASR architecture across a wide +range of settings. Our main findings indicate that the SLAM-ASR exhibits poor +performance in cross-domain evaluation settings. Additionally, speech +perturbations within in-domain data, such as changes in speed or the presence +of additive noise, can significantly impact performance. Our findings offer +critical insights for fine-tuning and configuring robust LLM-based ASR models, +tailored to different data characteristics and computational resources. + +
+
+ comment: Submitted to ICASSP 2025 SALMA Workshop +
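The core SLAM-ASR recipe described above (a trainable linear connector mapping downsampled speech-encoder frames into the LLM embedding space while both backbones stay frozen) can be sketched as a small PyTorch module; the dimensions and frame-stacking factor below are placeholders rather than the evaluated configuration.

```python
# Schematic linear connector between a frozen speech encoder and a frozen LLM.
import torch
import torch.nn as nn

class LinearConnector(nn.Module):
    def __init__(self, speech_dim=1024, llm_dim=4096, stack=5):
        super().__init__()
        self.stack = stack                     # concatenate k frames to shorten the sequence
        self.proj = nn.Linear(speech_dim * stack, llm_dim)

    def forward(self, speech_feats):           # (B, T, speech_dim) from the frozen encoder
        B, T, D = speech_feats.shape
        T = T - (T % self.stack)
        stacked = speech_feats[:, :T].reshape(B, T // self.stack, D * self.stack)
        return self.proj(stacked)              # (B, T/stack, llm_dim), prepended to text embeddings

feats = torch.randn(2, 100, 1024)              # toy stand-in for encoder outputs
print(LinearConnector()(feats).shape)          # torch.Size([2, 20, 4096])
```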
+
+
+
+
+ + ☆ MambaPEFT: Exploring Parameter-Efficient Fine-Tuning for Mamba + + +
+ An ecosystem of Transformer-based models has been established by building +large models with extensive data. Parameter-efficient fine-tuning (PEFT) is a +crucial technology for deploying these models to downstream tasks with minimal +cost while achieving effective performance. Recently, Mamba, a State Space +Model (SSM)-based model, has attracted attention as a potential alternative to +Transformers. While many large-scale Mamba-based models have been proposed, +efficiently adapting pre-trained Mamba-based models to downstream tasks remains +unexplored. In this paper, we conduct an exploratory analysis of PEFT methods +for Mamba. We investigate the effectiveness of existing PEFT methods for +Transformers when applied to Mamba. We also modify these methods to better +align with the Mamba architecture. Additionally, we propose new Mamba-specific +PEFT methods that leverage the distinctive structure of Mamba. Our experiments +indicate that PEFT performs more effectively for Mamba than Transformers. +Lastly, we demonstrate how to effectively combine multiple PEFT methods and +provide a framework that outperforms previous works. To ensure reproducibility, +we will release the code after publication. + +
+
+
+
+
+ + ☆ Both Text and Images Leaked! A Systematic Analysis of Multimodal LLM + Data Contamination + + +
+ Rapid progress in multimodal large language models (MLLMs) has yielded
+superior performance on various multimodal benchmarks. However, the issue of
+data contamination during training creates challenges in performance
+evaluation and comparison. While numerous methods exist for detecting
+dataset contamination in large language models (LLMs), they are less
+effective for MLLMs due to their various modalities and multiple training
+phases. In this study, we introduce a multimodal data contamination
+detection framework, MM-Detect, designed for MLLMs. Our experimental results
+indicate that MM-Detect is sensitive to varying degrees of contamination and
+can highlight significant performance improvements due to leakage of the
+training sets of multimodal benchmarks. Furthermore, we explore the
+possibility of contamination originating from the pre-training phase of the
+LLMs used by MLLMs and from the fine-tuning phase of MLLMs, offering new
+insights into the stages at which contamination may be introduced.
+
+
+
+
+
+ + ☆ From Novice to Expert: LLM Agent Policy Optimization via Step-wise + Reinforcement Learning + + +
+ The outstanding capabilities of large language models (LLMs) render them a +crucial component in various autonomous agent systems. While traditional +methods depend on the inherent knowledge of LLMs without fine-tuning, more +recent approaches have shifted toward the reinforcement learning strategy to +further enhance agents' ability to solve complex interactive tasks with +environments and tools. However, previous approaches are constrained by the +sparse reward issue, where existing datasets solely provide a final scalar +reward for each multi-step reasoning chain, potentially leading to +ineffectiveness and inefficiency in policy learning. In this paper, we +introduce StepAgent, which utilizes step-wise reward to optimize the agent's +reinforcement learning process. Inheriting the spirit of novice-to-expert +theory, we first compare the actions of the expert and the agent to +automatically generate intermediate rewards for fine-grained optimization. +Additionally, we propose implicit-reward and inverse reinforcement learning +techniques to facilitate agent reflection and policy adjustment. Further +theoretical analysis demonstrates that the action distribution of the agent can +converge toward the expert action distribution over multiple training cycles. +Experimental results across various datasets indicate that StepAgent +outperforms existing baseline methods. + +
+
+
+
+
+ + ☆ MRJ-Agent: An Effective Jailbreak Agent for Multi-Round Dialogue + + +
+ Large Language Models (LLMs) demonstrate outstanding performance in their +reservoir of knowledge and understanding capabilities, but they have also been +shown to be prone to illegal or unethical reactions when subjected to jailbreak +attacks. To ensure their responsible deployment in critical applications, it is +crucial to understand the safety capabilities and vulnerabilities of LLMs. +Previous works mainly focus on jailbreak in single-round dialogue, overlooking +the potential jailbreak risks in multi-round dialogues, which are a vital way +humans interact with and extract information from LLMs. Some studies have +increasingly concentrated on the risks associated with jailbreak in multi-round +dialogues. These efforts typically involve the use of manually crafted +templates or prompt engineering techniques. However, due to the inherent +complexity of multi-round dialogues, their jailbreak performance is limited. To +solve this problem, we propose a novel multi-round dialogue jailbreaking agent, +emphasizing the importance of stealthiness in identifying and mitigating +potential threats to human values posed by LLMs. We propose a risk +decomposition strategy that distributes risks across multiple rounds of queries +and utilizes psychological strategies to enhance attack strength. Extensive +experiments show that our proposed method surpasses other attack methods and +achieves state-of-the-art attack success rate. We will make the corresponding +code and dataset available for future research. The code will be released soon. + +
+
+
+
+
+ + ☆ Crystal: Illuminating LLM Abilities on Language and Code + + +
+ Large Language Models (LLMs) specializing in code generation (which are also +often referred to as code LLMs), e.g., StarCoder and Code Llama, play +increasingly critical roles in various software development scenarios. It is +also crucial for code LLMs to possess both code generation and natural language +abilities for many specific applications, such as code snippet retrieval using +natural language or code explanations. The intricate interaction between +acquiring language and coding skills complicates the development of strong code +LLMs. Furthermore, there is a lack of thorough prior studies on the LLM +pretraining strategy that mixes code and natural language. In this work, we +propose a pretraining strategy to enhance the integration of natural language +and coding capabilities within a single LLM. Specifically, it includes two +phases of training with appropriately adjusted code/language ratios. The +resulting model, Crystal, demonstrates remarkable capabilities in both domains. +Specifically, it has natural language and coding performance comparable to that +of Llama 2 and Code Llama, respectively. Crystal exhibits better data +efficiency, using 1.4 trillion tokens compared to the more than 2 trillion +tokens used by Llama 2 and Code Llama. We verify our pretraining strategy by +analyzing the training process and observe consistent improvements in most +benchmarks. We also adopted a typical application adaptation phase with a +code-centric data mixture, only to find that it did not lead to enhanced +performance or training efficiency, underlining the importance of a carefully +designed data recipe. To foster research within the community, we commit to +open-sourcing every detail of the pretraining, including our training datasets, +code, loggings and 136 checkpoints throughout the training. + +
+
+ comment: Published as a conference paper at COLM 2024 +
+
+
+
+
+ + ☆ The natural stability of autonomous morphology + + +
+ Autonomous morphology, such as inflection class systems and paradigmatic +distribution patterns, is widespread and diachronically resilient in natural +language. Why this should be so has remained unclear given that autonomous +morphology imposes learning costs, offers no clear benefit relative to its +absence and could easily be removed by the analogical forces which are +constantly reshaping it. Here we propose an explanation for the resilience of +autonomous morphology, in terms of a diachronic dynamic of attraction and +repulsion between morphomic categories, which emerges spontaneously from a +simple paradigm cell filling process. Employing computational evolutionary +models, our key innovation is to bring to light the role of `dissociative +evidence', i.e., evidence for inflectional distinctiveness which a rational +reasoner will have access to during analogical inference. Dissociative evidence +creates a repulsion dynamic which prevents morphomic classes from collapsing +together entirely, i.e., undergoing complete levelling. As we probe alternative +models, we reveal the limits of conditional entropy as a measure for +predictability in systems that are undergoing change. Finally, we demonstrate +that autonomous morphology, far from being `unnatural' (e.g. +\citealt{Aronoff1994}), is rather the natural (emergent) consequence of a +natural (rational) process of inference applied to inflectional systems. + +
+
+ comment: Accepted for publication by the journal Morphology +
+
+
+
+
+ + ☆ Understanding the Effects of Human-written Paraphrases in LLM-generated + Text Detection + + +
+ Natural Language Generation has been rapidly developing with the advent of
+large language models (LLMs). While their usage has sparked significant
+attention from the general public, it is important for readers to be aware
+when a piece of text is LLM-generated. This has brought about the need for
+building models that enable automated LLM-generated text detection, with the
+aim of mitigating potential negative outcomes of such content. Existing
+LLM-generated text detectors show competitive performance in telling apart
+LLM-generated and human-written text, but this performance is likely to
+deteriorate when paraphrased texts are considered. In this study, we devise
+a new data collection strategy to collect Human & LLM Paraphrase Collection
+(HLPC), a first-of-its-kind dataset that incorporates human-written texts
+and paraphrases, as well as LLM-generated texts and paraphrases. With the
+aim of understanding the effects of human-written paraphrases on the
+performance of state-of-the-art LLM-generated text detectors OpenAI RoBERTa
+and watermark detectors, we perform classification experiments that
+incorporate human-written paraphrases, watermarked and non-watermarked
+LLM-generated documents from GPT and OPT, and LLM-generated paraphrases from
+DIPPER and BART. The results show that the inclusion of human-written
+paraphrases has a significant impact on LLM-generated text detector
+performance, improving TPR@1%FPR with a possible trade-off in AUROC and
+accuracy.
+
+
+
+
+
+ + ♻ ☆ Evaluating Creative Short Story Generation in Humans and Large Language + Models + + +
+ Storytelling is a fundamental aspect of human communication, relying heavily +on creativity to produce narratives that are novel, appropriate, and +surprising. While large language models (LLMs) have recently demonstrated the +ability to generate high-quality stories, their creative capabilities remain +underexplored. Previous research has either focused on creativity tests +requiring short responses or primarily compared model performance in story +generation to that of professional writers. However, the question of whether +LLMs exhibit creativity in writing short stories on par with the average human +remains unanswered. In this work, we conduct a systematic analysis of +creativity in short story generation across LLMs and everyday people. Using a +five-sentence creative story task, commonly employed in psychology to assess +human creativity, we automatically evaluate model- and human-generated stories +across several dimensions of creativity, including novelty, surprise, and +diversity. Our findings reveal that while LLMs can generate stylistically +complex stories, they tend to fall short in terms of creativity when compared +to average human writers. + +
+
+ comment: 14 pages +
+
+
+
+
+
 + ♻ ☆ GRS-QA -- Graph Reasoning-Structured Question Answering Dataset
 +
 +
+ Large Language Models (LLMs) have excelled in multi-hop question-answering +(M-QA) due to their advanced reasoning abilities. However, the impact of the +inherent reasoning structures on LLM M-QA performance remains unclear, largely +due to the absence of QA datasets that provide fine-grained reasoning +structures. To address this gap, we introduce the Graph Reasoning-Structured +Question Answering Dataset (GRS-QA), which includes both semantic contexts and +reasoning structures for QA pairs. Unlike existing M-QA datasets, where +different reasoning structures are entangled together, GRS-QA explicitly +captures intricate reasoning pathways by constructing reasoning graphs, where +nodes represent textual contexts and edges denote logical flows. These +reasoning graphs of different structures enable a fine-grained evaluation of +LLM reasoning capabilities across various reasoning structures. Our empirical +analysis reveals that LLMs perform differently when handling questions with +varying reasoning structures. This finding facilitates the exploration of +textual structures as compared with semantics. + +
+
+ comment: 15 pages, 24 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ Transcoders Find Interpretable LLM Feature Circuits NeurIPS 2024 + + +
+ A key goal in mechanistic interpretability is circuit analysis: finding +sparse subgraphs of models corresponding to specific behaviors or capabilities. +However, MLP sublayers make fine-grained circuit analysis on transformer-based +language models difficult. In particular, interpretable features -- such as +those found by sparse autoencoders (SAEs) -- are typically linear combinations +of extremely many neurons, each with its own nonlinearity to account for. +Circuit analysis in this setting thus either yields intractably large circuits +or fails to disentangle local and global behavior. To address this we explore +transcoders, which seek to faithfully approximate a densely activating MLP +layer with a wider, sparsely-activating MLP layer. We introduce a novel method +for using transcoders to perform weights-based circuit analysis through MLP +sublayers. The resulting circuits neatly factorize into input-dependent and +input-invariant terms. We then successfully train transcoders on language +models with 120M, 410M, and 1.4B parameters, and find them to perform at least +on par with SAEs in terms of sparsity, faithfulness, and +human-interpretability. Finally, we apply transcoders to reverse-engineer +unknown circuits in the model, and we obtain novel insights regarding the +"greater-than circuit" in GPT2-small. Our results suggest that transcoders can +prove effective in decomposing model computations involving MLPs into +interpretable circuits. Code is available at +https://github.com/jacobdunefsky/transcoder_circuits/. + +
+
+ comment: 29 pages, 6 figures, 4 tables, 2 algorithms. NeurIPS 2024 +
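A transcoder, as described above, is a wide, sparsely activating MLP trained to imitate an MLP sublayer's input-to-output map. The compact sketch below shows one way to set that up; the widths, ReLU feature nonlinearity, and L1 coefficient are arbitrary illustrative choices, not the paper's training setup.

```python
# Sketch of a transcoder approximating one MLP sublayer of a transformer.
import torch
import torch.nn as nn
import torch.nn.functional as F

class Transcoder(nn.Module):
    def __init__(self, d_model=768, d_features=24576):
        super().__init__()
        self.enc = nn.Linear(d_model, d_features)   # sparse feature activations
        self.dec = nn.Linear(d_features, d_model)   # reconstruction of the MLP's output

    def forward(self, x):
        feats = F.relu(self.enc(x))
        return self.dec(feats), feats

def transcoder_loss(tc, mlp_in, mlp_out, l1_coeff=1e-3):
    """mlp_in / mlp_out: activations captured before and after the original MLP sublayer."""
    pred, feats = tc(mlp_in)
    return F.mse_loss(pred, mlp_out) + l1_coeff * feats.abs().mean()

tc = Transcoder()
x, y = torch.randn(32, 768), torch.randn(32, 768)   # toy activation pairs
transcoder_loss(tc, x, y).backward()
```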
+
+
+
+
+ + ♻ ☆ Teach Better or Show Smarter? On Instructions and Exemplars in Automatic + Prompt Optimization NeurIPS 2024 + + +
+ Large language models have demonstrated remarkable capabilities, but their
+performance is heavily reliant on effective prompt engineering. Automatic
+prompt optimization (APO) methods are designed to automate this and can be
+broadly categorized into those targeting instructions (instruction
+optimization, IO) vs. those targeting exemplars (exemplar optimization, EO).
+Despite their shared objective, these two directions have evolved rather
+independently, with IO receiving more research attention recently. This
+paper seeks to bridge this gap by comprehensively comparing the performance
+of representative IO and EO techniques in both isolation and combination on
+a diverse set of challenging tasks. Our findings reveal that intelligently
+reusing model-generated input-output pairs obtained from evaluating prompts
+on the validation set as exemplars consistently improves performance on top
+of IO methods, yet this practice is currently under-investigated. We also
+find that despite the recent focus on IO, how we select exemplars can
+outweigh how we optimize instructions, with EO strategies as simple as
+random search outperforming state-of-the-art IO methods with seed
+instructions without any optimization. Moreover, we observe a synergy
+between EO and IO, with optimal combinations surpassing the individual
+contributions. We conclude that studying exemplar optimization, both as a
+standalone method and in its optimal combination with instruction
+optimization, remains a crucial aspect of APO and deserves greater
+consideration in future research, even in the era of highly capable
+instruction-following models.
+
+
+ comment: Expanded version of the NeurIPS 2024 paper +
+
+
+
+
+ + ♻ ☆ Perceptions to Beliefs: Exploring Precursory Inferences for Theory of + Mind in Large Language Models + + +
+ While humans naturally develop theory of mind (ToM), the capability to +understand other people's mental states and beliefs, state-of-the-art large +language models (LLMs) underperform on simple ToM benchmarks. We posit that we +can extend our understanding of LLMs' ToM abilities by evaluating key human ToM +precursors$-$perception inference and perception-to-belief inference$-$in LLMs. +We introduce two datasets, Percept-ToMi and Percept-FANToM, to evaluate these +precursory inferences for ToM in LLMs by annotating characters' perceptions on +ToMi and FANToM, respectively. Our evaluation of eight state-of-the-art LLMs +reveals that the models generally perform well in perception inference while +exhibiting limited capability in perception-to-belief inference (e.g., lack of +inhibitory control). Based on these results, we present PercepToM, a novel ToM +method leveraging LLMs' strong perception inference capability while +supplementing their limited perception-to-belief inference. Experimental +results demonstrate that PercepToM significantly enhances LLM's performance, +especially in false belief scenarios. + +
+
+
+
+
+ + ♻ ☆ Interpretable Differential Diagnosis with Dual-Inference Large Language + Models + + +
+ Automatic differential diagnosis (DDx) is an essential medical task that +generates a list of potential diseases as differentials based on patient +symptom descriptions. In practice, interpreting these differential diagnoses +yields significant value but remains under-explored. Given the powerful +capabilities of large language models (LLMs), we investigated using LLMs for +interpretable DDx. Specifically, we curated the first DDx dataset with +expert-derived interpretation on 570 clinical notes. Besides, we proposed +Dual-Inf, a novel framework that enabled LLMs to conduct bidirectional +inference (i.e., from symptoms to diagnoses and vice versa) for DDx +interpretation. Both human and automated evaluation validated its efficacy in +predicting and elucidating differentials across four base LLMs. In addition, +Dual-Inf could reduce interpretation errors and hold promise for rare disease +explanations. To the best of our knowledge, it is the first work that +customizes LLMs for DDx explanation and comprehensively evaluates their +interpretation performance. Overall, our study bridges a critical gap in DDx +interpretation and enhances clinical decision-making. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ Textless Speech-to-Speech Translation With Limited Parallel Data EMNLP 2024 + + +
+ Existing speech-to-speech translation (S2ST) models fall into two camps: they +either leverage text as an intermediate step or require hundreds of hours of +parallel speech data. Both approaches are incompatible with textless languages +or language pairs with limited parallel data. We present PFB, a framework for +training textless S2ST models that require just dozens of hours of parallel +speech data. We first pretrain a model on large-scale monolingual speech data, +finetune it with a small amount of parallel speech data (20-60 hours), and +lastly train with an unsupervised backtranslation objective. We train and +evaluate our models for English-to-German, German-to-English and +Marathi-to-English translation on three different domains (European Parliament, +Common Voice, and All India Radio) with single-speaker synthesized speech. +Evaluated using the ASR-BLEU metric, our models achieve reasonable performance +on all three domains, with some being within 1-2 points of our higher-resourced +topline. + +
+
+ comment: Accepted to EMNLP 2024 Findings +
+
+
+
+
+ + ♻ ☆ GPT-4V Cannot Generate Radiology Reports Yet + + +
+ GPT-4V's purported strong multimodal abilities raise interest in using it
+to automate radiology report writing, but thorough evaluations are lacking.
+In this work, we perform a systematic evaluation of GPT-4V in generating
+radiology reports on two chest X-ray report datasets: MIMIC-CXR and IU
+X-Ray. We attempt to directly generate reports using GPT-4V through
+different prompting strategies and find that it fails terribly in both
+lexical metrics and clinical efficacy metrics. To understand the low
+performance, we decompose the task into two steps: 1) the medical image
+reasoning step of predicting medical condition labels from images; and 2)
+the report synthesis step of generating reports from (groundtruth)
+conditions. We show that GPT-4V's performance in image reasoning is
+consistently low across different prompts. In fact, the distributions of
+model-predicted labels remain constant regardless of which groundtruth
+conditions are present on the image, suggesting that the model is not
+interpreting chest X-rays meaningfully. Even when given groundtruth
+conditions in report synthesis, its generated reports are less correct and
+less natural-sounding than those of a finetuned LLaMA-2. Altogether, our
+findings cast doubt on the viability of using GPT-4V in a radiology
+workflow.
+
+
+ comment: 24 pages, 3 figures, code: + https://github.com/ChicagoHAI/cxr-eval-gpt-4v +
+
+
+
+
+ + ♻ ☆ INQUIRE: A Natural World Text-to-Image Retrieval Benchmark NeurIPS 2024 + + +
+ We introduce INQUIRE, a text-to-image retrieval benchmark designed to +challenge multimodal vision-language models on expert-level queries. INQUIRE +includes iNaturalist 2024 (iNat24), a new dataset of five million natural world +images, along with 250 expert-level retrieval queries. These queries are paired +with all relevant images comprehensively labeled within iNat24, comprising +33,000 total matches. Queries span categories such as species identification, +context, behavior, and appearance, emphasizing tasks that require nuanced image +understanding and domain expertise. Our benchmark evaluates two core retrieval +tasks: (1) INQUIRE-Fullrank, a full dataset ranking task, and (2) +INQUIRE-Rerank, a reranking task for refining top-100 retrievals. Detailed +evaluation of a range of recent multimodal models demonstrates that INQUIRE +poses a significant challenge, with the best models failing to achieve an +mAP@50 above 50%. In addition, we show that reranking with more powerful +multimodal models can enhance retrieval performance, yet there remains a +significant margin for improvement. By focusing on scientifically-motivated +ecological challenges, INQUIRE aims to bridge the gap between AI capabilities +and the needs of real-world scientific inquiry, encouraging the development of +retrieval systems that can assist with accelerating ecological and biodiversity +research. Our dataset and code are available at +https://inquire-benchmark.github.io + +
+
+ comment: Published in NeurIPS 2024, Datasets and Benchmarks Track +
+
+
+
+
+ + ♻ ☆ No Train, all Gain: Self-Supervised Gradients Improve Deep Frozen + Representations NeurIPS 2024 + + +
+ This paper introduces FUNGI, Features from UNsupervised GradIents, a method +to enhance the features of transformer encoders by leveraging self-supervised +gradients. Our method is simple: given any pretrained model, we first compute +gradients from various self-supervised objectives for each input. These +gradients are projected to a lower dimension and then concatenated with the +model's output embedding. The resulting features are evaluated on k-nearest +neighbor classification over 11 datasets from vision, 5 from natural language +processing, and 2 from audio. Across backbones spanning various sizes and +pretraining strategies, FUNGI features provide consistent performance +improvements over the embeddings. We also show that using FUNGI features can +benefit linear classification, clustering and image retrieval, and that they +significantly improve the retrieval-based in-context scene understanding +abilities of pretrained models, for example improving upon DINO by +17% for +semantic segmentation - without any training. + +
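+ A minimal sketch of the FUNGI recipe described above, assuming a toy encoder
+and a SimSiam-style self-supervised objective (the paper's actual objectives,
+backbones, and projection sizes may differ): compute a per-input self-supervised
+gradient, randomly project it to a low dimension, and concatenate it with the
+usual embedding.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+torch.manual_seed(0)
+# stand-in for a pretrained encoder
+encoder = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 16))
+grad_dim = encoder[2].weight.numel()
+proj = torch.randn(grad_dim, 32) / grad_dim ** 0.5   # fixed random down-projection
+
+def fungi_features(x):
+    # 1) a self-supervised loss on two noisy "views" of the input (SimSiam-style)
+    v1, v2 = x + 0.1 * torch.randn_like(x), x + 0.1 * torch.randn_like(x)
+    loss = -F.cosine_similarity(encoder(v1), encoder(v2).detach(), dim=-1).mean()
+    # 2) per-input gradient with respect to the last layer's weights
+    g = torch.autograd.grad(loss, encoder[2].weight)[0].flatten()
+    # 3) project the gradient and concatenate it with the output embedding
+    with torch.no_grad():
+        return torch.cat([encoder(x).flatten(), g @ proj])
+
+feat = fungi_features(torch.randn(1, 32))
+print(feat.shape)   # embedding (16) + projected gradient (32) = 48 dims
+
+ The resulting features could then be fed to a k-nearest-neighbor classifier,
+as in the evaluation described above.
+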
+
+ comment: NeurIPS 2024. Code available at + https://github.com/WalterSimoncini/fungivision +
+
+
+
+
+ + ♻ ☆ OpenFactCheck: A Unified Framework for Factuality Evaluation of LLMs EMNLP 2024 + + +
+ The increased use of large language models (LLMs) across a variety of +real-world applications calls for automatic tools to check the factual accuracy +of their outputs, as LLMs often hallucinate. This is difficult as it requires +assessing the factuality of free-form open-domain responses. While there has +been a lot of research on this topic, different papers use different evaluation +benchmarks and measures, which makes them hard to compare and hampers future +progress. To mitigate these issues, we developed OpenFactCheck, a unified +framework, with three modules: (i) RESPONSEEVAL, which allows users to easily +customize an automatic fact-checking system and to assess the factuality of all +claims in an input document using that system, (ii) LLMEVAL, which assesses the +overall factuality of an LLM, and (iii) CHECKEREVAL, a module to evaluate +automatic fact-checking systems. OpenFactCheck is open-sourced +(https://github.com/mbzuai-nlp/openfactcheck) and publicly released as a Python +library (https://pypi.org/project/openfactcheck/) and also as a web service +(http://app.openfactcheck.com). A video describing the system is available at +https://youtu.be/-i9VKL0HleI. + +
+
+ comment: 11 pages, 4 Figures, 3 Tables, Accepted at EMNLP 2024 System + Demonstration. arXiv admin note: substantial text overlap with + arXiv:2405.05583 +
+
+
+
+
+ + ♻ ☆ Counterfactual Token Generation in Large Language Models + + +
+ "Sure, I am happy to generate a story for you: Captain Lyra stood at the helm +of her trusty ship, the Maelstrom's Fury, gazing out at the endless sea. [...] +Lyra's eyes welled up with tears as she realized the bitter truth - she had +sacrificed everything for fleeting riches, and lost the love of her crew, her +family, and herself." Although this story, generated by a large language model, +is captivating, one may wonder -- how would the story have unfolded if the +model had chosen "Captain Maeve" as the protagonist instead? We cannot know. +State-of-the-art large language models are stateless -- they maintain no +internal memory or state. Given a prompt, they generate a sequence of tokens as +an output using an autoregressive process. As a consequence, they cannot reason +about counterfactual alternatives to tokens they have generated in the past. In +this work, our goal is to enhance them with this functionality. To this end, we +develop a causal model of token generation that builds upon the Gumbel-Max +structural causal model. Our model allows any large language model to perform +counterfactual token generation at almost no cost in comparison with vanilla +token generation, it is embarrassingly simple to implement, and it does not +require any fine-tuning nor prompt engineering. We implement our model on Llama +3 8B-Instruct and Ministral-8B-Instruct and conduct a qualitative and a +quantitative analysis of counterfactually generated text. We conclude with a +demonstrative application of counterfactual token generation for bias +detection, unveiling interesting insights about the model of the world +constructed by large language models. + +
+
+
+
+
+ + ♻ ☆ Teaching Models to Improve on Tape + + +
+ Large Language Models (LLMs) often struggle when prompted to generate content +under specific constraints. However, in such cases it is often easy to check +whether these constraints are satisfied or violated. Recent works have shown +that LLMs can benefit from such "corrective feedback". Here we claim that this +skill of LLMs can be significantly enhanced via training. We introduce an RL +framework for teaching models to use such rewards, by simulating interaction +sessions, and rewarding the model according to its ability to satisfy the +constraints. We refer to our method as CORGI (Controlled Generation with RL for +Guided Interaction), and evaluate it on a variety of controlled generation +tasks using unlabeled training data. We find that CORGI consistently +outperforms the baseline reinforcement learning method that does not +incorporate conversational feedback. Furthermore, CORGI's interactive framework +enables meta-learning, allowing the LLM to generalize better to guided +interaction in new tasks. Our results clearly show that conversational +optimization, when combined with reinforcement learning, significantly improves +the effectiveness of LLMs in controlled generation contexts. + +
+
+
+
+
+ + ♻ ☆ Diverging Preferences: When do Annotators Disagree and do Models Know? + + +
+ We examine diverging preferences in human-labeled preference datasets. We
+develop a taxonomy of disagreement sources spanning 10 categories across four
+high-level classes -- task underspecification, response style, refusals, and
+annotation errors. We find that the majority of disagreements are in opposition
+to standard reward modeling approaches, which are designed with the assumption
+that annotator disagreement is noise. We then explore how these findings impact
+two areas of LLM development: reward modeling and evaluation. In our
+experiments, we demonstrate how standard reward modeling methods, like the
+Bradley-Terry model, fail to differentiate whether a given preference judgment
+is the result of unanimous agreement among annotators or the majority opinion
+among diverging user preferences. We find that these tendencies are also echoed
+by popular LLM-as-Judge evaluation methods, which consistently identify a
+winning response in cases of diverging preferences. These findings highlight
+remaining challenges in LLM evaluations, which are greatly influenced by
+divisive features like response style, and in developing pluralistically
+aligned LLMs. To address these issues, we develop methods for identifying
+diverging preferences to mitigate their influence on evaluation and training.
+
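+ The Bradley-Terry point can be made with a toy illustration (not the paper's
+experiments): when each comparison is collapsed to a single majority label, as
+is standard in preference datasets, the fitted reward margin is identical
+whether annotators were unanimous or split.
+
+import numpy as np
+
+def bt_margin(prefer_a, lr=0.1, steps=4000):
+    """Fit the Bradley-Terry margin d = r_A - r_B to binary 'A preferred' labels."""
+    d = 0.0
+    for _ in range(steps):
+        p = 1.0 / (1.0 + np.exp(-d))
+        d += lr * np.mean(prefer_a - p)   # gradient ascent on the BT log-likelihood
+    return d
+
+# Annotator votes for two comparisons: unanimous (10-0) vs. diverging (6-4).
+unanimous = np.array([1.0] * 10)
+diverging = np.array([1.0] * 6 + [0.0] * 4)
+print(bt_margin(unanimous), bt_margin(diverging))     # different margins when all votes are kept
+
+# A typical dataset stores only the majority label per comparison,
+# so both cases reduce to the same training signal and the same margin.
+print(bt_margin(np.array([1.0])), bt_margin(np.array([1.0])))
+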
+
+
+
+
+ + ♻ ☆ Pretraining and Updates of Domain-Specific LLM: A Case Study in the + Japanese Business Domain ACL + + +
+ The development of Large Language Models (LLMs) in various languages has been +advancing, but the combination of non-English languages with domain-specific +contexts remains underexplored. This paper presents our findings from training +and evaluating a Japanese business domain-specific LLM designed to better +understand business-related documents, such as the news on current affairs, +technical reports, and patents. Additionally, LLMs in this domain require +regular updates to incorporate the most recent knowledge. Therefore, we also +report our findings from the first experiments and evaluations involving +updates to this LLM using the latest article data, which is an important +problem setting that has not been addressed in previous research. From our +experiments on a newly created benchmark dataset for question answering in the +target domain, we found that (1) our pretrained model improves QA accuracy +without losing general knowledge, and (2) a proper mixture of the latest and +older texts in the training data for the update is necessary. Our pretrained +model and business domain benchmark are publicly available to support further +studies. + +
+
+ comment: Accepted at PACLIC 38 +
+
+
+
+
+ + ♻ ☆ News Reporter: A Multi-lingual LLM Framework for Broadcast T.V News ICASSP 2025 + + +
+ Large Language Models (LLMs) have quickly become essential tools for many
+conversational chatbots due to their ability to provide coherent answers to
+varied queries. Datasets used to train these LLMs are often a mix of generic
+and synthetic samples, thus lacking the verification needed to provide correct
+and verifiable answers for T.V. News.
+ We collect and share a large collection of QA pairs extracted from
+transcripts of news recordings from various news channels across the United
+States. The resulting QA pairs are then used to fine-tune an off-the-shelf LLM.
+Our model surpasses base models of similar size on several open LLM
+benchmarks. We further propose and integrate a RAG method to improve the
+contextualization of our answers and to point them to a verifiable news
+recording.
+
+
+ comment: 5 pages, under review at ICASSP 2025 +
+
+
+
+
+ + ♻ ☆ Improving Context-Aware Preference Modeling for Language Models NeurIPS 2024 + + +
+ While finetuning language models from pairwise preferences has proven +remarkably effective, the underspecified nature of natural language presents +critical challenges. Direct preference feedback is uninterpretable, difficult +to provide where multidimensional criteria may apply, and often inconsistent, +either because it is based on incomplete instructions or provided by diverse +principals. To address these challenges, we consider the two-step preference +modeling procedure that first resolves the under-specification by selecting a +context, and then evaluates preference with respect to the chosen context. We +decompose reward modeling error according to these two steps, which suggests +that supervising context in addition to context-specific preference may be a +viable approach to aligning models with diverse human preferences. For this to +work, the ability of models to evaluate context-specific preference is +critical. To this end, we contribute context-conditioned preference datasets +and accompanying experiments that investigate the ability of language models to +evaluate context-specific preference. We use our datasets to (1) show that +existing preference models benefit from, but fail to fully consider, added +context, (2) finetune a context-aware reward model with context-specific +performance exceeding that of GPT-4 and Llama 3 70B on tested datasets, and (3) +investigate the value of context-aware preference modeling. + +
+
+ comment: NeurIPS 2024. 10 pages (29 with references and appendix) +
+
+
+
+
+ + ♻ ☆ CLIBD: Bridging Vision and Genomics for Biodiversity Monitoring at Scale + + +
+ Measuring biodiversity is crucial for understanding ecosystem health. While +prior works have developed machine learning models for taxonomic classification +of photographic images and DNA separately, in this work, we introduce a +multimodal approach combining both, using CLIP-style contrastive learning to +align images, barcode DNA, and text-based representations of taxonomic labels +in a unified embedding space. This allows for accurate classification of both +known and unknown insect species without task-specific fine-tuning, leveraging +contrastive learning for the first time to fuse DNA and image data. Our method +surpasses previous single-modality approaches in accuracy by over 8% on +zero-shot learning tasks, showcasing its effectiveness in biodiversity studies. + +
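+ A generic sketch of the CLIP-style contrastive alignment mentioned above, here
+between image and DNA-barcode embeddings only (the paper's actual encoders,
+batch construction, and third text modality are not reproduced):
+
+import torch
+import torch.nn.functional as F
+
+torch.manual_seed(0)
+B, D = 8, 64
+img = F.normalize(torch.randn(B, D), dim=-1)   # stand-ins for image-encoder outputs
+dna = F.normalize(torch.randn(B, D), dim=-1)   # stand-ins for DNA-barcode-encoder outputs
+tau = 0.07                                     # temperature
+
+logits = img @ dna.t() / tau                   # pairwise cosine similarities
+labels = torch.arange(B)                       # the i-th image matches the i-th barcode
+loss = 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))
+print(loss.item())
+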
+
+ comment: 25 pages with 11 figures +
+
+
+
+
+ + ♻ ☆ Improving Causal Reasoning in Large Language Models: A Survey + + +
+ Causal reasoning (CR) is a crucial aspect of intelligence, essential for +problem-solving, decision-making, and understanding the world. While large +language models (LLMs) can generate rationales for their outputs, their ability +to reliably perform causal reasoning remains uncertain, often falling short in +tasks requiring a deep understanding of causality. In this survey, we provide a +comprehensive review of research aimed at enhancing LLMs for causal reasoning. +We categorize existing methods based on the role of LLMs: either as reasoning +engines or as helpers providing knowledge or data to traditional CR methods, +followed by a detailed discussion of the methodologies in each category. We +then evaluate the performance of LLMs on various causal reasoning tasks, +providing key findings and in-depth analysis. Finally, we provide insights from +current studies and highlight promising directions for future research. We aim +for this work to serve as a comprehensive resource, fostering further +advancements in causal reasoning with LLMs. Resources are available at +https://github.com/chendl02/Awesome-LLM-causal-reasoning. + +
+
+
+
+
+ + ♻ ☆ BABILong: Testing the Limits of LLMs with Long Context + Reasoning-in-a-Haystack NeurIPS 2024 + + +
+ In recent years, the input context sizes of large language models (LLMs) have
+increased dramatically. However, existing evaluation methods have not kept
+pace, failing to comprehensively assess the efficiency of models in handling
+long contexts. To bridge this gap, we introduce the BABILong benchmark,
+designed to test language models' ability to reason across facts distributed in
+extremely long documents. BABILong includes a diverse set of 20 reasoning
+tasks, including fact chaining, simple induction, deduction, counting, and
+handling lists/sets. These tasks are challenging on their own, and even more
+demanding when the required facts are scattered across long natural text. Our
+evaluations show that popular LLMs effectively utilize only 10-20% of the
+context and their performance declines sharply with increased reasoning
+complexity. Among alternatives to in-context reasoning, Retrieval-Augmented
+Generation methods achieve a modest 60% accuracy on single-fact question
+answering, independent of context length. Among context extension methods, the
+highest performance is demonstrated by recurrent memory transformers after
+fine-tuning, enabling the processing of lengths up to 50 million tokens. The
+BABILong benchmark is extendable to any length to support the evaluation of new
+upcoming models with increased capabilities, and we provide splits up to 10
+million token lengths.
+
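+ The construction can be mimicked in a few lines (the filler text and facts
+here are placeholders, not the bAbI facts and background corpus used by the
+benchmark): scatter the supporting facts through arbitrarily long distractor
+text and append a question that requires chaining them.
+
+import random
+
+random.seed(0)
+facts = ["Mary moved to the kitchen.", "Mary picked up the apple.", "Mary went to the garden."]
+question = "Where is the apple? Answer: garden"
+filler = "The weather report mentioned light rain over the hills. "
+
+def build_sample(context_len_chars):
+    haystack = []
+    while sum(len(s) for s in haystack) < context_len_chars:
+        haystack.append(filler)
+    # insert the facts at random, order-preserving positions
+    positions = sorted(random.sample(range(len(haystack)), len(facts)))
+    for pos, fact in zip(positions, facts):
+        haystack.insert(pos, fact + " ")
+    return "".join(haystack) + "\n" + question
+
+sample = build_sample(context_len_chars=2000)   # scale this up to millions of tokens
+print(len(sample), sample[:120])
+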
+
+ comment: NeurIPS 2024 Datasets and Benchmarks Track +
+
+
+
+
+ + ♻ ☆ Evaluating Morphological Compositional Generalization in Large Language + Models + + +
+ Large language models (LLMs) have demonstrated significant progress in +various natural language generation and understanding tasks. However, their +linguistic generalization capabilities remain questionable, raising doubts +about whether these models learn language similarly to humans. While humans +exhibit compositional generalization and linguistic creativity in language use, +the extent to which LLMs replicate these abilities, particularly in morphology, +is under-explored. In this work, we systematically investigate the +morphological generalization abilities of LLMs through the lens of +compositionality. We define morphemes as compositional primitives and design a +novel suite of generative and discriminative tasks to assess morphological +productivity and systematicity. Focusing on agglutinative languages such as +Turkish and Finnish, we evaluate several state-of-the-art instruction-finetuned +multilingual models, including GPT-4 and Gemini. Our analysis shows that LLMs +struggle with morphological compositional generalization particularly when +applied to novel word roots, with performance declining sharply as +morphological complexity increases. While models can identify individual +morphological combinations better than chance, their performance lacks +systematicity, leading to significant accuracy gaps compared to humans. + +
+
+ comment: 33 pages +
+
+
+
+
+ + ♻ ☆ ChartInsights: Evaluating Multimodal Large Language Models for Low-Level + Chart Question Answering EMNLP 2024 + + +
+ Chart question answering (ChartQA) tasks play a critical role in interpreting
+and extracting insights from visualization charts. While recent advancements in
+multimodal large language models (MLLMs) like GPT-4o have shown promise in
+high-level ChartQA tasks, such as chart captioning, their effectiveness in
+low-level ChartQA tasks (e.g., identifying correlations) remains underexplored.
+In this paper, we address this gap by evaluating MLLMs on low-level ChartQA
+using a newly curated dataset, ChartInsights, which consists of 22,347 (chart,
+task, query, answer) tuples covering 10 data analysis tasks across 7 chart
+types. We systematically evaluate 19 advanced MLLMs, including 12 open-source
+and 7 closed-source models. The average accuracy rate across these models is
+39.8%, with GPT-4o achieving the highest accuracy at 69.17%. To further explore
+the limitations of MLLMs in low-level ChartQA, we conduct experiments that
+alter visual elements of charts (e.g., changing color schemes, adding image
+noise) to assess their impact on task effectiveness. Furthermore, we propose a
+new textual prompt strategy, Chain-of-Charts, tailored for low-level ChartQA
+tasks, which boosts performance by 14.41%, achieving an accuracy of 83.58%.
+Finally, incorporating a visual prompt strategy that directs attention to
+relevant visual elements further improves accuracy to 84.32%.
+
+
+ comment: EMNLP 2024 Conference Paper +
+
+
+
+
+ + ♻ ☆ Benchmarking Multimodal Retrieval Augmented Generation with Dynamic VQA + Dataset and Self-adaptive Planning Agent + + +
+ Multimodal Retrieval Augmented Generation (mRAG) plays an important role in
+mitigating the "hallucination" issue inherent in multimodal large language
+models (MLLMs). Although promising, existing heuristic mRAGs typically rely on
+predefined, fixed retrieval processes, which causes two issues: (1) Non-adaptive
+Retrieval Queries. (2) Overloaded Retrieval Queries. However, these flaws
+cannot be adequately reflected by current knowledge-seeking visual question
+answering (VQA) datasets, since the required knowledge can be readily obtained
+with a standard two-step retrieval. To bridge the dataset gap, we first
+construct the Dyn-VQA dataset, consisting of three types of "dynamic"
+questions, which require complex knowledge retrieval strategies variable in
+query, tool, and time: (1) Questions with rapidly changing answers. (2)
+Questions requiring multi-modal knowledge. (3) Multi-hop questions. Experiments
+on Dyn-VQA reveal that existing heuristic mRAGs struggle to provide sufficient
+and precisely relevant knowledge for dynamic questions due to their rigid
+retrieval processes. Hence, we further propose the first self-adaptive planning
+agent for multimodal retrieval, OmniSearch. The underlying idea is to emulate
+human behavior in question solving, dynamically decomposing complex multimodal
+questions into sub-question chains with retrieval actions. Extensive
+experiments prove the effectiveness of our OmniSearch and also provide
+direction for advancing mRAG. The code and dataset will be open-sourced at
+https://github.com/Alibaba-NLP/OmniSearch.
+
+
+
+
+
+ + ♻ ☆ ElectionSim: Massive Population Election Simulation Powered by Large + Language Model Driven Agents + + +
+ The massive population election simulation aims to model the preferences of
+specific groups in particular election scenarios. It has garnered significant
+attention for its potential to forecast real-world social trends. Traditional
+agent-based modeling (ABM) methods are constrained in their ability to
+incorporate complex individual background information and to provide
+interactive prediction results. In this paper, we introduce ElectionSim, an
+innovative election simulation framework based on large language models,
+designed to support accurate voter simulations and customized distributions,
+together with an interactive platform to dialogue with simulated voters. We
+present a million-level voter pool sampled from social media platforms to
+support accurate individual simulation. We also introduce PPE, a poll-based
+presidential election benchmark to assess the performance of our framework
+under the U.S. presidential election scenario. Through extensive experiments
+and analyses, we demonstrate the effectiveness and robustness of our framework
+in U.S. presidential election simulations.
+
+
+ comment: 42 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ CIBench: Evaluating Your LLMs with a Code Interpreter Plugin + + +
+ While LLM-based agents, which use external tools to solve complex problems,
+have made significant progress, benchmarking their ability is challenging,
+thereby hindering a clear understanding of their limitations. In this paper, we
+propose an interactive evaluation framework, named CIBench, to comprehensively
+assess LLMs' ability to utilize code interpreters for data science tasks. Our
+evaluation framework includes an evaluation dataset and two evaluation modes.
+The evaluation dataset is constructed using an LLM-human cooperative approach
+and simulates an authentic workflow by leveraging consecutive and interactive
+IPython sessions. The two evaluation modes assess LLMs' ability with and
+without human assistance. We conduct extensive experiments to analyze the
+ability of 24 LLMs on CIBench and provide valuable insights for future LLMs in
+code interpreter utilization.
+
+
+ comment: Under review. The first three authors contribute equally, and + Songyang Zhang is the project leader +
+
+
+
+
+ + ♻ ☆ The Fine Line: Navigating Large Language Model Pretraining with + Down-streaming Capability Analysis + + +
+ Uncovering early-stage metrics that reflect final model performance is one
+core principle for large-scale pretraining. The existing scaling law
+demonstrates the power-law correlation between pretraining loss and training
+FLOPs, which serves as an important indicator of the current training state for
+large language models. However, this principle only focuses on the model's
+compression properties on the training data, resulting in an inconsistency with
+the ability improvements on downstream tasks. Some follow-up works attempted to
+extend the scaling law to more complex metrics (such as hyperparameters), but
+still lacked a comprehensive analysis of the dynamic differences among various
+capabilities during pretraining. To address the aforementioned limitations,
+this paper undertakes a comprehensive comparison of model capabilities at
+various pretraining intermediate checkpoints. Through this analysis, we confirm
+that specific downstream metrics exhibit similar training dynamics across
+models of different sizes, up to 67 billion parameters. In addition to our core
+findings, we have reproduced Amber and OpenLLaMA, releasing their intermediate
+checkpoints. This initiative offers valuable resources to the research
+community and facilitates the verification and exploration of LLM pretraining
+by open-source researchers. We also provide empirical summaries, including
+performance comparisons of different models and capabilities, and guidance on
+key metrics for different training phases. Based on these findings, we provide
+a more user-friendly strategy for evaluating the optimization state, offering
+guidance for establishing a stable pretraining process.
+
+
+
+
+
+ + ♻ ☆ ShifCon: Enhancing Non-Dominant Language Capabilities with a Shift-based + Contrastive Framework + + +
+ Although fine-tuning Large Language Models (LLMs) with multilingual data can
+rapidly enhance the multilingual capabilities of LLMs, they still exhibit a
+performance gap between the dominant language (e.g., English) and non-dominant
+ones due to the imbalance of training data across languages. To further enhance
+the performance of non-dominant languages, we propose ShifCon, a Shift-based
+Contrastive framework that aligns the internal forward process of other
+languages toward that of the dominant one. Specifically, it shifts the
+representations of non-dominant languages into the dominant language subspace,
+allowing them to access relatively rich information encoded in the model
+parameters. The enriched representations are then shifted back into their
+original language subspace before generation. Moreover, we introduce a subspace
+distance metric to pinpoint the optimal layer area for shifting representations
+and employ multilingual contrastive learning to further enhance the alignment
+of representations within this area. Experiments demonstrate that our ShifCon
+framework significantly enhances the performance of non-dominant languages,
+particularly for low-resource ones. Further analysis offers extra insights to
+verify the effectiveness of ShifCon and propel future research.
+
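+ The shift operation can be pictured with a toy example (random vectors stand
+in for hidden states; the subspace-distance metric and contrastive term of the
+framework are omitted): estimate the offset between the non-dominant and
+dominant language representation clouds, shift into the dominant subspace, and
+shift back before generation.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+d = 16
+h_en = rng.normal(loc=1.0, size=(100, d))    # hidden states for the dominant language (toy)
+h_xx = rng.normal(loc=-1.0, size=(100, d))   # hidden states for a non-dominant language (toy)
+
+shift = h_en.mean(axis=0) - h_xx.mean(axis=0)   # offset between the two language subspaces
+
+def forward_with_shift(h):
+    h_shifted = h + shift                       # move into the dominant-language subspace
+    h_processed = h_shifted * 1.0               # placeholder for the model's intermediate layers
+    return h_processed - shift                  # shift back before generation
+
+out = forward_with_shift(h_xx[0])
+print(np.allclose(out, h_xx[0]))                # identity here, since the "layers" are a no-op
+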
+
+ comment: 23 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ PersianRAG: A Retrieval-Augmented Generation System for Persian Language + + +
+ Retrieval augmented generation (RAG) models, which integrate large-scale
+pre-trained generative models with external retrieval mechanisms, have shown
+significant success in various natural language processing (NLP) tasks.
+However, applying RAG models to Persian, a low-resource language, poses
+distinct challenges. These challenges primarily involve the preprocessing,
+embedding, retrieval, prompt construction, language modeling, and response
+evaluation of the system. In this paper, we address the challenges of
+implementing a real-world RAG system for the Persian language, called
+PersianRAG. We propose novel solutions to overcome these obstacles and evaluate
+our approach using several Persian benchmark datasets. Our experimental results
+demonstrate the capability of the PersianRAG framework to enhance the question
+answering task in Persian.
+
+
+
+
+
+ + ♻ ☆ Swan and ArabicMTEB: Dialect-Aware, Arabic-Centric, Cross-Lingual, and + Cross-Cultural Embedding Models and Benchmarks + + +
+ We introduce {\bf Swan}, a family of embedding models centred around the +Arabic language, addressing both small-scale and large-scale use cases. Swan +includes two variants: Swan-Small, based on ARBERTv2, and Swan-Large, built on +ArMistral, a pretrained Arabic large language model. To evaluate these models, +we propose ArabicMTEB, a comprehensive benchmark suite that assesses +cross-lingual, multi-dialectal, multi-domain, and multi-cultural Arabic text +embedding performance, covering eight diverse tasks and spanning 94 datasets. +Swan-Large achieves state-of-the-art results, outperforming +Multilingual-E5-large in most Arabic tasks, while the Swan-Small consistently +surpasses Multilingual-E5-base. Our extensive evaluations demonstrate that Swan +models are both dialectally and culturally aware, excelling across various +Arabic domains while offering significant monetary efficiency. This work +significantly advances the field of Arabic language modelling and provides +valuable resources for future research and applications in Arabic natural +language processing. Our models and benchmark will be made publicly accessible +for research. + +
+
+
+
+
+ + ♻ ☆ Beyond Single-Audio: Advancing Multi-Audio Processing in Audio Large + Language Models EMNLP24 + + +
+ Various audio-LLMs (ALLMs) have been explored recently for tackling different
+audio tasks simultaneously using a single, unified model. While existing
+evaluations of ALLMs primarily focus on single-audio tasks, real-world
+applications often involve processing multiple audio streams simultaneously. To
+bridge this gap, we propose the first multi-audio evaluation (MAE) benchmark
+that consists of 20 datasets from 11 multi-audio tasks encompassing both speech
+and sound scenarios. Comprehensive experiments on MAE demonstrate that the
+existing ALLMs, while being powerful in comprehending primary audio elements in
+individual audio inputs, struggle to handle multi-audio scenarios. To this
+end, we propose a novel multi-audio-LLM (MALLM) to capture audio context among
+multiple similar audios using discriminative learning on our proposed synthetic
+data. The results demonstrate that the proposed MALLM outperforms all baselines
+and achieves high data efficiency using synthetic data without requiring human
+annotations. The proposed MALLM opens the door for ALLMs to the multi-audio
+processing era and brings us closer to replicating human auditory capabilities
+in machines.
+
+
+ comment: EMNLP24 Findings. Data available at + https://github.com/MatthewCYM/MALLM +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 23 + +
+
+
+ + ☆ Unfair Alignment: Examining Safety Alignment Across Vision Encoder + Layers in Vision-Language Models + + +
+ Vision-language models (VLMs) have improved significantly in multi-modal +tasks, but their more complex architecture makes their safety alignment more +challenging than the alignment of large language models (LLMs). In this paper, +we reveal an unfair distribution of safety across the layers of VLM's vision +encoder, with earlier and middle layers being disproportionately vulnerable to +malicious inputs compared to the more robust final layers. This 'cross-layer' +vulnerability stems from the model's inability to generalize its safety +training from the default architectural settings used during training to unseen +or out-of-distribution scenarios, leaving certain layers exposed. We conduct a +comprehensive analysis by projecting activations from various intermediate +layers and demonstrate that these layers are more likely to generate harmful +outputs when exposed to malicious inputs. Our experiments with LLaVA-1.5 and +Llama 3.2 show discrepancies in attack success rates and toxicity scores across +layers, indicating that current safety alignment strategies focused on a single +default layer are insufficient. + +
+
+ comment: Preprint, Under Review +
+
+
+
+
+ + ☆ Increasing the scalability of graph convolution for FPGA-implemented + event-based vision + + +
+ Event cameras are becoming increasingly popular as an alternative to
+traditional frame-based vision sensors, especially in mobile robotics. Taking
+full advantage of their high temporal resolution, high dynamic range, low power
+consumption and sparsity of event data, which only reflects changes in the
+observed scene, requires both an efficient algorithm and a specialised hardware
+platform. A recent trend involves using Graph Convolutional Neural Networks
+(GCNNs) implemented on a heterogeneous SoC FPGA. In this paper we focus on
+optimising hardware modules for graph convolution to allow flexible selection
+of the FPGA resource (BlockRAM, DSP and LUT) for their implementation. We
+propose a "two-step convolution" approach that utilises additional BRAM
+buffers in order to reduce up to 94% of LUT usage for multiplications. This
+method significantly improves the scalability of GCNNs, enabling the deployment
+of models with more layers, larger graph sizes and their application in more
+dynamic scenarios.
+
+
+ comment: Accepted for the PhD forum during FPT 2024 (International Conference + on Field Programmable Technology), 10-12 December 2024, Sydney, Australia +
+
+
+
+
+ + ☆ Object Recognition in Human Computer Interaction:- A Comparative + Analysis + + +
+ Human-computer interaction (HCI) has been a widely researched area for many
+years, with continuous advancements in technology leading to the development of
+new techniques that change the way we interact with computers. With the recent
+advent of powerful computers, systems can recognize human actions and respond
+accordingly, revolutionizing the way we interact with them. The purpose of this
+paper is to provide a comparative analysis of various algorithms used for
+recognizing user faces and gestures in the context of computer vision and HCI.
+We explore and evaluate the performance of different algorithms in terms of
+accuracy, robustness, and efficiency, and provide a comprehensive analysis of
+algorithms for face and gesture recognition in the context of computer vision
+and HCI, with the goal of improving the design and development of interactive
+systems that are more intuitive, efficient, and user-friendly.
+
+
+
+
+
+ + ☆ Pose-Transformation and Radial Distance Clustering for Unsupervised + Person Re-identification + + +
+ Person re-identification (re-ID) aims to tackle the problem of matching
+identities across non-overlapping cameras. Supervised approaches require
+identity information that may be difficult to obtain and are inherently biased
+towards the dataset they are trained on, making them unscalable across domains.
+To overcome these challenges, we propose an unsupervised approach to the person
+re-ID setup. Having zero knowledge of true labels, our proposed method enhances
+the discriminating ability of the learned features via a novel two-stage
+training strategy. The first stage involves training a deep network on an
+expertly designed pose-transformed dataset obtained by generating multiple
+perturbations for each original image in the pose space. Next, the network
+learns to map similar features closer in the feature space using the proposed
+discriminative clustering algorithm. We introduce a novel radial distance loss
+that attends to the fundamental aspects of feature learning: compact clusters
+with low intra-cluster and high inter-cluster variation. Extensive experiments
+on several large-scale re-ID datasets demonstrate the superiority of our method
+compared to state-of-the-art approaches.
+
+
+
+
+
+ + ☆ PocoLoco: A Point Cloud Diffusion Model of Human Shape in Loose Clothing WACV 2025 + + +
+ Modeling a human avatar that can plausibly deform to articulations is an +active area of research. We present PocoLoco -- the first template-free, +point-based, pose-conditioned generative model for 3D humans in loose clothing. +We motivate our work by noting that most methods require a parametric model of +the human body to ground pose-dependent deformations. Consequently, they are +restricted to modeling clothing that is topologically similar to the naked body +and do not extend well to loose clothing. The few methods that attempt to model +loose clothing typically require either canonicalization or a +UV-parameterization and need to address the challenging problem of explicitly +estimating correspondences for the deforming clothes. In this work, we +formulate avatar clothing deformation as a conditional point-cloud generation +task within the denoising diffusion framework. Crucially, our framework +operates directly on unordered point clouds, eliminating the need for a +parametric model or a clothing template. This also enables a variety of +practical applications, such as point-cloud completion and pose-based editing +-- important features for virtual human animation. As current datasets for +human avatars in loose clothing are far too small for training diffusion +models, we release a dataset of two subjects performing various poses in loose +clothing with a total of 75K point clouds. By contributing towards tackling the +challenging task of effectively modeling loose clothing and expanding the +available data for training these models, we aim to set the stage for further +innovation in digital humans. The source code is available at +https://github.com/sidsunny/pocoloco . + +
+
+ comment: WACV 2025 +
+
+
+
+
+ + ☆ WiFlexFormer: Efficient WiFi-Based Person-Centric Sensing + + +
+ We propose WiFlexFormer, a highly efficient Transformer-based architecture +designed for WiFi Channel State Information (CSI)-based person-centric sensing. +We benchmark WiFlexFormer against state-of-the-art vision and specialized +architectures for processing radio frequency data and demonstrate that it +achieves comparable Human Activity Recognition (HAR) performance while offering +a significantly lower parameter count and faster inference times. With an +inference time of just 10 ms on an Nvidia Jetson Orin Nano, WiFlexFormer is +optimized for real-time inference. Additionally, its low parameter count +contributes to improved cross-domain generalization, where it often outperforms +larger models. Our comprehensive evaluation shows that WiFlexFormer is a +potential solution for efficient, scalable WiFi-based sensing applications. The +PyTorch implementation of WiFlexFormer is publicly available at: +https://github.com/StrohmayerJ/WiFlexFormer. + +
+
+
+
+
+ + ☆ Community Forensics: Using Thousands of Generators to Train Fake Image + Detectors + + +
+ One of the key challenges of detecting AI-generated images is spotting images +that have been created by previously unseen generative models. We argue that +the limited diversity of the training data is a major obstacle to addressing +this problem, and we propose a new dataset that is significantly larger and +more diverse than prior work. As part of creating this dataset, we +systematically download thousands of text-to-image latent diffusion models and +sample images from them. We also collect images from dozens of popular open +source and commercial models. The resulting dataset contains 2.7M images that +have been sampled from 4803 different models. These images collectively capture +a wide range of scene content, generator architectures, and image processing +settings. Using this dataset, we study the generalization abilities of fake +image detectors. Our experiments suggest that detection performance improves as +the number of models in the training set increases, even when these models have +similar architectures. We also find that detection performance improves as the +diversity of the models increases, and that our trained detectors generalize +better than those trained on other datasets. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ DiMSUM: Diffusion Mamba -- A Scalable and Unified Spatial-Frequency + Method for Image Generation NeurIPS 2024 + + +
+ We introduce a novel state-space architecture for diffusion models,
+effectively harnessing spatial and frequency information to enhance the
+inductive bias towards local features in input images for image generation
+tasks. State-space networks, including Mamba, a revolutionary advancement in
+recurrent neural networks, typically scan input sequences from left to right,
+and designing effective scanning strategies for them is difficult, especially
+when processing image data. Our method demonstrates that integrating wavelet
+transformation into Mamba enhances the local structure awareness of visual
+inputs and better captures long-range relations of frequencies by disentangling
+them into wavelet subbands, representing both low- and high-frequency
+components. These wavelet-based outputs are then processed and seamlessly fused
+with the original Mamba outputs through a cross-attention fusion layer,
+combining both spatial and frequency information to optimize the order
+awareness of state-space models, which is essential for the details and overall
+quality of image generation. In addition, we introduce a globally-shared
+transformer to supercharge the performance of Mamba, harnessing its exceptional
+power to capture global relationships. Through extensive experiments on
+standard benchmarks, our method demonstrates superior results compared to DiT
+and DIFFUSSM, achieving faster training convergence and delivering high-quality
+outputs. The code and pretrained models are released at
+https://github.com/VinAIResearch/DiMSUM.git.
+
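+ The wavelet side of this design can be illustrated with PyWavelets (a sketch
+of the subband idea only, not the paper's Mamba or cross-attention components):
+a 2D discrete wavelet transform splits an image into low- and high-frequency
+subbands that can be flattened into separate token sequences for a sequence
+model to scan.
+
+import numpy as np
+import pywt   # pip install PyWavelets
+
+img = np.random.rand(32, 32).astype(np.float32)       # stand-in for an image (or latent) channel
+LL, (LH, HL, HH) = pywt.dwt2(img, "haar")              # one-level 2D discrete wavelet transform
+subbands = {"LL": LL, "LH": LH, "HL": HL, "HH": HH}    # low-frequency structure vs. high-frequency detail
+
+# flatten each 16x16 subband into a token sequence a state-space model could scan
+tokens = {name: band.reshape(-1) for name, band in subbands.items()}
+print({name: t.shape for name, t in tokens.items()})   # each: (256,)
+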
+
+ comment: Accepted to NeurIPS 2024. Project page: + https://hao-pt.github.io/dimsum/ +
+
+
+
+
+ + ☆ Fed-EC: Bandwidth-Efficient Clustering-Based Federated Learning For + Autonomous Visual Robot Navigation + + +
+ Centralized learning requires data to be aggregated at a central server, which
+poses significant challenges in terms of data privacy and bandwidth
+consumption. Federated learning presents a compelling alternative; however,
+vanilla federated learning methods deployed in robotics aim to learn a single
+global model across robots that works ideally for all. In practice, however,
+one model may not be well suited for robots deployed in various environments.
+This paper proposes Federated-EmbedCluster (Fed-EC), a clustering-based
+federated learning framework that is deployed with vision-based autonomous
+robot navigation in diverse outdoor environments. The framework addresses the
+key federated learning challenge of deteriorating model performance of a single
+global model due to the presence of non-IID data across real-world robots.
+Extensive real-world experiments validate that Fed-EC reduces the communication
+size by 23x for each robot while matching the performance of centralized
+learning for goal-oriented navigation and outperforms local learning. Fed-EC
+can transfer previously learnt models to new robots that join the cluster.
+
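+ A minimal sketch of the clustering-based aggregation idea (k-means over client
+model vectors and per-cluster averaging; the framework's actual similarity
+measure, schedule, and navigation models are not shown):
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+rng = np.random.default_rng(0)
+# toy "model weights" from 6 robots operating in two distinct environments
+clients = np.vstack([rng.normal(0.0, 0.1, size=(3, 8)),
+                     rng.normal(2.0, 0.1, size=(3, 8))])
+
+k = 2
+assignments = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(clients)
+
+# one aggregated model per cluster instead of a single global model
+cluster_models = {c: clients[assignments == c].mean(axis=0) for c in range(k)}
+for c, w in cluster_models.items():
+    print(f"cluster {c}: clients {np.where(assignments == c)[0].tolist()}, mean weight {w[:3].round(2)}")
+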
+
+
+
+
+ + ☆ RaVL: Discovering and Mitigating Spurious Correlations in Fine-Tuned + Vision-Language Models NeurIPS 2024 + + +
+ Fine-tuned vision-language models (VLMs) often capture spurious correlations +between image features and textual attributes, resulting in degraded zero-shot +performance at test time. Existing approaches for addressing spurious +correlations (i) primarily operate at the global image-level rather than +intervening directly on fine-grained image features and (ii) are predominantly +designed for unimodal settings. In this work, we present RaVL, which takes a +fine-grained perspective on VLM robustness by discovering and mitigating +spurious correlations using local image features rather than operating at the +global image level. Given a fine-tuned VLM, RaVL first discovers spurious +correlations by leveraging a region-level clustering approach to identify +precise image features contributing to zero-shot classification errors. Then, +RaVL mitigates the identified spurious correlation with a novel region-aware +loss function that enables the VLM to focus on relevant regions and ignore +spurious relationships during fine-tuning. We evaluate RaVL on 654 VLMs with +various model architectures, data domains, and learned spurious correlations. +Our results show that RaVL accurately discovers (191% improvement over the +closest baseline) and mitigates (8.2% improvement on worst-group image +classification accuracy) spurious correlations. Qualitative evaluations on +general-domain and medical-domain VLMs confirm our findings. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Textual Decomposition Then Sub-motion-space Scattering for + Open-Vocabulary Motion Generation + + +
+ Text-to-motion generation is a crucial task in computer vision, which
+generates a target 3D motion from a given text. The existing annotated datasets
+are limited in scale, resulting in most existing methods overfitting to the
+small datasets and being unable to generalize to the motions of the open
+domain. Some methods attempt to solve the open-vocabulary motion generation
+problem by aligning to the CLIP space or using the Pretrain-then-Finetuning
+paradigm. However, the current annotated dataset's limited scale only allows
+them to achieve mapping from sub-text-space to sub-motion-space, instead of
+mapping between full-text-space and full-motion-space (full mapping), which is
+the key to attaining open-vocabulary motion generation. To this end, this paper
+proposes to leverage the atomic motion (simple body part motions over a short
+time period) as an intermediate representation, and leverage two orderly
+coupled steps, i.e., Textual Decomposition and Sub-motion-space Scattering, to
+address the full mapping problem. For Textual Decomposition, we design a
+fine-grained description conversion algorithm, and combine it with the
+generalization ability of a large language model to convert any given motion
+text into atomic texts. Sub-motion-space Scattering learns the compositional
+process from atomic motions to the target motions, to make the learned
+sub-motion-space scattered to form the full-motion-space. For a given motion in
+the open domain, it transforms extrapolation into interpolation and thereby
+significantly improves generalization. Our network, DSO-Net, combines Textual
+Decomposition and Sub-motion-space Scattering to solve open-vocabulary motion
+generation. Extensive experiments demonstrate that our DSO-Net achieves
+significant improvements over the state-of-the-art methods on open-vocabulary
+motion generation. Code is available at https://vankouf.github.io/DSONet/.
+
+
+ comment: project page: https://vankouf.github.io/DSONet/ +
+
+
+
+
+ + ☆ H-POPE: Hierarchical Polling-based Probing Evaluation of Hallucinations + in Large Vision-Language Models + + +
+ By leveraging both texts and images, large vision language models (LVLMs) +have shown significant progress in various multi-modal tasks. Nevertheless, +these models often suffer from hallucinations, e.g., they exhibit +inconsistencies between the visual input and the textual output. To address +this, we propose H-POPE, a coarse-to-fine-grained benchmark that systematically +assesses hallucination in object existence and attributes. Our evaluation shows +that models are prone to hallucinations on object existence, and even more so +on fine-grained attributes. We further investigate whether these models rely on +visual input to formulate the output texts. + +
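+ The polling idea behind this benchmark can be sketched as follows
+(hypothetical annotation format and question templates, not the benchmark's
+actual data): build balanced yes/no questions about object existence first,
+then about attributes of objects that do exist, and score a model's answers.
+
+annotations = {"image_001": {"dog": {"color": "brown"}, "ball": {"color": "red"}}}
+absent_objects = ["cat"]   # sampled negatives (hypothetical)
+
+def build_questions(image_id):
+    objs = annotations[image_id]
+    qs = []
+    for o in objs:
+        qs.append((f"Is there a {o} in the image?", "yes"))                  # coarse: existence
+        qs.append((f"Is the {o} {objs[o]['color']} in the image?", "yes"))   # fine: attribute
+    for o in absent_objects:
+        qs.append((f"Is there a {o} in the image?", "no"))
+    return qs
+
+def accuracy(model_answers, questions):
+    return sum(a == gold for a, (_, gold) in zip(model_answers, questions)) / len(questions)
+
+qs = build_questions("image_001")
+print(accuracy(["yes", "yes", "yes", "no", "no"], qs))   # 0.8 for this toy answer set
+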
+
+ comment: Poster at https://sites.google.com/berkeley.edu/bb-stat/home +
+
+
+
+
+ + ♻ ☆ Learning Task-Specific Strategies for Accelerated MRI + + +
+ Compressed sensing magnetic resonance imaging (CS-MRI) seeks to recover
+visual information from subsampled measurements for diagnostic tasks.
+Traditional CS-MRI methods often separately address measurement subsampling,
+image reconstruction, and task prediction, resulting in a suboptimal end-to-end
+performance. In this work, we propose TACKLE as a unified co-design framework
+for jointly optimizing subsampling, reconstruction, and prediction strategies
+for the performance on downstream tasks. The naïve approach of simply
+appending a task prediction module and training with a task-specific loss leads
+to suboptimal downstream performance. Instead, we develop a training procedure
+where a backbone architecture is first trained for a generic pre-training task
+(image reconstruction in our case), and then fine-tuned for different
+downstream tasks with a prediction head. Experimental results on multiple
+public MRI datasets show that TACKLE achieves an improved performance on
+various tasks over traditional CS-MRI methods. We also demonstrate that TACKLE
+is robust to distribution shifts by showing that it generalizes to a new
+dataset we experimentally collected using different acquisition setups from the
+training data. Without additional fine-tuning, TACKLE leads to both numerical
+and visual improvements compared to existing baselines. We have further
+implemented a learned 4x-accelerated sequence on a Siemens 3T MRI Skyra
+scanner. Compared to the fully-sampled scan that takes 335 seconds, our
+optimized sequence only takes 84 seconds, achieving a four-fold time reduction
+as desired, while maintaining high performance.
+
+
+ comment: Our code is available at https://github.com/zihuiwu/TACKLE. More + information can be found at http://imaging.cms.caltech.edu/tackle/ +
+
+
+
+
+ + ♻ ☆ Efficiently Collecting Training Dataset for 2D Object Detection by + Online Visual Feedback + + +
+ Training deep-learning-based vision systems requires the manual annotation of
+a significant number of images. Such manual annotation is highly time-consuming
+and labor-intensive. Although previous studies have attempted to eliminate the
+effort required for annotation, the effort required for image collection
+remained. To address this, we propose a human-in-the-loop dataset collection
+method that uses a web application. To balance workload and performance, and to
+encourage the collection of multi-view object image datasets in an enjoyable,
+motivating manner, we propose three types of online visual feedback features
+that track the progress of the collection. Our experiments thoroughly
+investigated the impact of each feature on collection performance and quality
+of operation. The results suggest the feasibility of the approach for both
+annotation and object detection.
+
+
+ comment: 13 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Estimating Epistemic and Aleatoric Uncertainty with a Single Model NeurIPS + + +
+ Estimating and disentangling epistemic uncertainty, uncertainty that is +reducible with more training data, and aleatoric uncertainty, uncertainty that +is inherent to the task at hand, is critically important when applying machine +learning to high-stakes applications such as medical imaging and weather +forecasting. Conditional diffusion models' breakthrough ability to accurately +and efficiently sample from the posterior distribution of a dataset now makes +uncertainty estimation conceptually straightforward: One need only train and +sample from a large ensemble of diffusion models. Unfortunately, training such +an ensemble becomes computationally intractable as the complexity of the model +architecture grows. In this work we introduce a new approach to ensembling, +hyper-diffusion models (HyperDM), which allows one to accurately estimate both +epistemic and aleatoric uncertainty with a single model. Unlike existing +single-model uncertainty methods like Monte-Carlo dropout and Bayesian neural +networks, HyperDM offers prediction accuracy on par with, and in some cases +superior to, multi-model ensembles. Furthermore, our proposed approach scales +to modern network architectures such as Attention U-Net and yields more +accurate uncertainty estimates compared to existing methods. We validate our +method on two distinct real-world tasks: x-ray computed tomography +reconstruction and weather temperature forecasting. + +
+
+ comment: 19 pages, 11 figures. To be published in Conference on Neural + Information Processing Systems (NeurIPS) 2024 +
+
+
+
+
+ + ♻ ☆ FocalPose++: Focal Length and Object Pose Estimation via Render and + Compare + + +
+ We introduce FocalPose++, a neural render-and-compare method for jointly +estimating the camera-object 6D pose and camera focal length given a single RGB +input image depicting a known object. The contributions of this work are +threefold. First, we derive a focal length update rule that extends an existing +state-of-the-art render-and-compare 6D pose estimator to address the joint +estimation task. Second, we investigate several different loss functions for +jointly estimating the object pose and focal length. We find that a combination +of direct focal length regression with a reprojection loss disentangling the +contribution of translation, rotation, and focal length leads to improved +results. Third, we explore the effect of different synthetic training data on +the performance of our method. Specifically, we investigate different +distributions used for sampling object's 6D pose and camera's focal length when +rendering the synthetic images, and show that parametric distribution fitted on +real training data works the best. We show results on three challenging +benchmark datasets that depict known 3D models in uncontrolled settings. We +demonstrate that our focal length and 6D pose estimates have lower error than +the existing state-of-the-art methods. + +
+
+ comment: 25 pages, 22 figures. IEEE TPAMI, 2024. Extended version of the + conference paper arXiv:2204.05145 +
+
+
+
+
+ + ♻ ☆ Transferable Learned Image Compression-Resistant Adversarial + Perturbations BMVC 2024 + + +
+ Adversarial attacks can readily disrupt the image classification system, +revealing the vulnerability of DNN-based recognition tasks. While existing +adversarial perturbations are primarily applied to uncompressed images or +compressed images by the traditional image compression method, i.e., JPEG, +limited studies have investigated the robustness of models for image +classification in the context of DNN-based image compression. With the rapid +evolution of advanced image compression, DNN-based learned image compression +has emerged as the promising approach for transmitting images in many +security-critical applications, such as cloud-based face recognition and +autonomous driving, due to its superior performance over traditional +compression. Therefore, there is a pressing need to fully investigate the +robustness of a classification system post-processed by learned image +compression. To bridge this research gap, we explore the adversarial attack on +a new pipeline that targets image classification models that utilize learned +image compressors as pre-processing modules. Furthermore, to enhance the +transferability of perturbations across various quality levels and +architectures of learned image compression models, we introduce a saliency +score-based sampling method to enable the fast generation of transferable +perturbation. Extensive experiments with popular attack methods demonstrate the +enhanced transferability of our proposed method when attacking images that have +been post-processed with different learned image compression models. + +
+
+ comment: Accepted by BMVC 2024 +
+
+
+
+
+ + ♻ ☆ Machine learning approach to brain tumor detection and classification + + +
+ Brain tumor detection and classification are critical tasks in medical image
+analysis, particularly in early-stage diagnosis, where accurate and timely
+detection can significantly improve treatment outcomes. In this study, we apply
+various statistical and machine learning models to detect and classify brain
+tumors using brain MRI images. We explore a variety of statistical models
+including linear, logistic, and Bayesian regressions, and machine learning
+models including decision tree, random forest, single-layer perceptron,
+multi-layer perceptron, convolutional neural network (CNN), recurrent neural
+network, and long short-term memory. Our findings show that CNN outperforms
+other models, achieving the best performance. Additionally, we confirm that the
+CNN model can also work for multi-class classification, distinguishing between
+four categories of brain MRI images: normal, glioma, meningioma, and pituitary
+tumor. This study demonstrates that machine learning approaches are suitable
+for brain tumor detection and classification, facilitating real-world medical
+applications in assisting radiologists with early and accurate diagnosis.
+
+
+ comment: 7 pages, 2 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ ContextIQ: A Multimodal Expert-Based Video Retrieval System for + Contextual Advertising WACV 2025 + + +
+ Contextual advertising serves ads that are aligned to the content that the
+user is viewing. The rapid growth of video content on social platforms and
+streaming services, along with privacy concerns, has increased the need for
+contextual advertising. Placing the right ad in the right context creates a
+seamless and pleasant ad viewing experience, resulting in higher audience
+engagement and, ultimately, better ad monetization. From a technology
+standpoint, effective contextual advertising requires a video retrieval system
+capable of understanding complex video content at a very granular level.
+Current text-to-video retrieval models based on joint multimodal training
+demand large datasets and computational resources, limiting their practicality
+and lacking the key functionalities required for ad ecosystem integration. We
+introduce ContextIQ, a multimodal expert-based video retrieval system designed
+specifically for contextual advertising. ContextIQ utilizes modality-specific
+experts -- video, audio, transcript (captions), and metadata such as objects,
+actions, emotion, etc. -- to create semantically rich video representations. We
+show that our system, without joint training, achieves better or comparable
+results to state-of-the-art models and commercial solutions on multiple
+text-to-video retrieval benchmarks. Our ablation studies highlight the benefits
+of leveraging multiple modalities for enhanced video retrieval accuracy instead
+of using a vision-language model alone. Furthermore, we show how video
+retrieval systems such as ContextIQ can be used for contextual advertising in
+an ad ecosystem while also addressing concerns related to brand safety and
+filtering inappropriate content.
+
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ INQUIRE: A Natural World Text-to-Image Retrieval Benchmark NeurIPS 2024 + + +
+ We introduce INQUIRE, a text-to-image retrieval benchmark designed to +challenge multimodal vision-language models on expert-level queries. INQUIRE +includes iNaturalist 2024 (iNat24), a new dataset of five million natural world +images, along with 250 expert-level retrieval queries. These queries are paired +with all relevant images comprehensively labeled within iNat24, comprising +33,000 total matches. Queries span categories such as species identification, +context, behavior, and appearance, emphasizing tasks that require nuanced image +understanding and domain expertise. Our benchmark evaluates two core retrieval +tasks: (1) INQUIRE-Fullrank, a full dataset ranking task, and (2) +INQUIRE-Rerank, a reranking task for refining top-100 retrievals. Detailed +evaluation of a range of recent multimodal models demonstrates that INQUIRE +poses a significant challenge, with the best models failing to achieve an +mAP@50 above 50%. In addition, we show that reranking with more powerful +multimodal models can enhance retrieval performance, yet there remains a +significant margin for improvement. By focusing on scientifically-motivated +ecological challenges, INQUIRE aims to bridge the gap between AI capabilities +and the needs of real-world scientific inquiry, encouraging the development of +retrieval systems that can assist with accelerating ecological and biodiversity +research. Our dataset and code are available at +https://inquire-benchmark.github.io + +
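A small helper makes the headline metric concrete. The sketch below is an editor's illustration of average precision at a cutoff of 50, averaged over queries, under the standard definition; the run and qrel dictionaries are made-up toy data, and the benchmark's official evaluation code may differ in details such as tie handling.

```python
# Toy mAP@50 computation for a handful of queries; illustrative only.
def average_precision_at_k(ranked_ids, relevant_ids, k=50):
    hits, precision_sum = 0, 0.0
    for i, doc_id in enumerate(ranked_ids[:k], start=1):
        if doc_id in relevant_ids:
            hits += 1
            precision_sum += hits / i
    denom = min(len(relevant_ids), k)
    return precision_sum / denom if denom else 0.0

def map_at_k(runs, qrels, k=50):
    return sum(average_precision_at_k(runs[q], qrels[q], k)
               for q in qrels) / len(qrels)

# hypothetical retrieval runs and relevance judgments
runs = {"q1": ["img3", "img9", "img1"], "q2": ["img5", "img2"]}
qrels = {"q1": {"img1", "img3"}, "q2": {"img8"}}
print(round(map_at_k(runs, qrels), 3))
```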
+
+ comment: Published in NeurIPS 2024, Datasets and Benchmarks Track +
+
+
+
+
+ + ♻ ☆ No Train, all Gain: Self-Supervised Gradients Improve Deep Frozen + Representations NeurIPS 2024 + + +
+ This paper introduces FUNGI, Features from UNsupervised GradIents, a method +to enhance the features of transformer encoders by leveraging self-supervised +gradients. Our method is simple: given any pretrained model, we first compute +gradients from various self-supervised objectives for each input. These +gradients are projected to a lower dimension and then concatenated with the +model's output embedding. The resulting features are evaluated on k-nearest +neighbor classification over 11 datasets from vision, 5 from natural language +processing, and 2 from audio. Across backbones spanning various sizes and +pretraining strategies, FUNGI features provide consistent performance +improvements over the embeddings. We also show that using FUNGI features can +benefit linear classification, clustering and image retrieval, and that they +significantly improve the retrieval-based in-context scene understanding +abilities of pretrained models, for example improving upon DINO by +17% for +semantic segmentation - without any training. + +
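The core recipe above (per-sample gradients used as extra features) can be illustrated in a few lines. The sketch below is a toy reconstruction, assuming a made-up two-layer encoder and a simple agreement-between-views objective; FUNGI's actual self-supervised losses, backbones, and projection sizes differ.

```python
# Minimal sketch of gradients-as-features: compute a self-supervised loss,
# take its gradient w.r.t. one layer, random-project it, and concatenate it
# with the embedding. Everything here is a toy stand-in for the real method.
import torch

torch.manual_seed(0)
encoder = torch.nn.Sequential(torch.nn.Linear(32, 64), torch.nn.ReLU(),
                              torch.nn.Linear(64, 16))
head = torch.nn.Linear(16, 16)          # layer whose gradients we harvest
proj = torch.randn(head.weight.numel() + head.bias.numel(), 8) / 8 ** 0.5

def fungi_features(x):
    """Return [embedding ; projected gradient] for a single input x."""
    emb = encoder(x)
    z = head(emb)
    # toy self-supervised objective: agreement between two noisy "views"
    z2 = head(encoder(x + 0.1 * torch.randn_like(x)))
    loss = -torch.nn.functional.cosine_similarity(z, z2, dim=-1).mean()
    g = torch.autograd.grad(loss, [head.weight, head.bias])
    flat = torch.cat([t.reshape(-1) for t in g])
    return torch.cat([emb.detach().squeeze(0), flat @ proj])

feats = torch.stack([fungi_features(torch.randn(1, 32)) for _ in range(4)])
print(feats.shape)   # (4, 16 + 8): embedding plus low-dim gradient features
```

The concatenated features can then be fed to any downstream k-nearest-neighbor or linear classifier, which is the evaluation setting described above.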
+
+ comment: NeurIPS 2024. Code available at + https://github.com/WalterSimoncini/fungivision +
+
+
+
+
+ + ♻ ☆ DeNetDM: Debiasing by Network Depth Modulation NeurIPS 2024 + + +
+ Neural networks trained on biased datasets tend to inadvertently learn +spurious correlations, hindering generalization. We formally prove that (1) +samples that exhibit spurious correlations lie on a lower rank manifold +relative to the ones that do not; and (2) the depth of a network acts as an +implicit regularizer on the rank of the attribute subspace that is encoded in +its representations. Leveraging these insights, we present DeNetDM, a novel +debiasing method that uses network depth modulation as a way of developing +robustness to spurious correlations. Using a training paradigm derived from +Product of Experts, we create both biased and debiased branches with deep and +shallow architectures and then distill knowledge to produce the target debiased +model. Our method requires no bias annotations or explicit data augmentation +while performing on par with approaches that require either or both. We +demonstrate that DeNetDM outperforms existing debiasing techniques on both +synthetic and real-world datasets by 5\%. The project page is available at +https://vssilpa.github.io/denetdm/. + +
+
+ comment: Camera-ready version : NeurIPS 2024, * indicates these authors + contributed equally +
+
+
+
+
+ + ♻ ☆ Gaussian Deja-vu: Creating Controllable 3D Gaussian Head-Avatars with + Enhanced Generalization and Personalization Abilities WACV 2025 + + +
+ Recent advancements in 3D Gaussian Splatting (3DGS) have unlocked significant
+potential for modeling 3D head avatars, providing greater flexibility than
+mesh-based methods and more efficient rendering compared to NeRF-based
+approaches. Despite these advancements, the creation of controllable
+3DGS-based head avatars remains time-intensive, often requiring tens of
+minutes to hours. To expedite this process, we introduce the "Gaussian
+Deja-vu" framework, which first obtains a generalized model of the head avatar
+and then personalizes the result. The generalized model is trained on large 2D
+(synthetic and real) image datasets. This model provides a well-initialized 3D
+Gaussian head that is further refined using a monocular video to achieve the
+personalized head avatar. For personalization, we propose learnable
+expression-aware rectification blendmaps to correct the initial 3D Gaussians,
+ensuring rapid convergence without relying on neural networks. Experiments
+demonstrate that the proposed method meets its objectives. It outperforms
+state-of-the-art 3D Gaussian head avatars in photorealistic quality while
+reducing training time to at least a quarter of that of existing methods,
+producing the avatar in minutes.
+
+ comment: 11 pages, Accepted by WACV 2025 in Round 1 +
+
+
+
+
+
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ dsld: A Socially Relevant Tool for Teaching Statistics + + +
+ The growing power of data science can play a crucial role in addressing +social discrimination, necessitating nuanced understanding and effective +mitigation strategies of potential biases. Data Science Looks At Discrimination +(dsld) is an R and Python package designed to provide users with a +comprehensive toolkit of statistical and graphical methods for assessing +possible discrimination related to protected groups, such as race, gender, and +age. Our software offers techniques for discrimination analysis by identifying +and mitigating confounding variables, along with methods for reducing bias in +predictive models. + In educational settings, dsld offers instructors powerful tools to teach +important statistical principles through motivating real world examples of +discrimination analysis. The inclusion of an 80-page Quarto book further +supports users, from statistics educators to legal professionals, in +effectively applying these analytical tools to real world scenarios. + +
+
+ comment: To be submitted to the Journal of Statistics and Data Science + Education +
+
+
+
+
+ + ☆ Reproducible Hybrid Time-Travel Retrieval in Evolving Corpora + + +
+ There are settings in which reproducibility of ranked lists is desirable,
+such as when extracting a subset of an evolving document corpus for downstream
+research tasks, or in domains with high reproducibility expectations, such as
+patent retrieval and medical systematic reviews. However, as global term
+statistics change when documents change or are added to a corpus, queries
+using typical ranked retrieval models are not even reproducible for the parts
+of the document corpus that have not changed. Thus, Boolean retrieval
+frequently remains the mechanism of choice in such settings.
+ We present a hybrid retrieval system combining Lucene for fast retrieval with
+a column-store-based retrieval system maintaining a versioned and time-stamped
+index. The latter component allows previously posed queries to be re-executed
+so that the same ranked list is returned, and further allows time-travel
+queries over evolving collections, such as web archives, while maintaining the
+original ranking. Thus, retrieval results in evolving document collections are
+fully reproducible even when document collections, and thus term statistics,
+change.
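The reproducibility argument hinges on computing term statistics from a fixed snapshot of the collection. The toy index below illustrates that idea with an in-memory list and BM25; it is an editor's sketch, not the paper's Lucene-plus-column-store system, and the snapshot and scoring details are simplified assumptions.

```python
# Reproducible "as-of" ranking over an evolving corpus: term statistics are
# computed only from document versions visible at the requested timestamp,
# so re-running an old query reproduces the old ranking.
import math
from collections import Counter

index = []   # list of (timestamp, doc_id, token list); latest version wins

def add(ts, doc_id, text):
    index.append((ts, doc_id, text.lower().split()))

def snapshot(as_of):
    """Latest version of every document that existed at time `as_of`."""
    docs = {}
    for ts, doc_id, toks in sorted(index, key=lambda e: e[0]):
        if ts <= as_of:
            docs[doc_id] = toks
    return docs

def bm25(query, as_of, k1=1.2, b=0.75):
    docs = snapshot(as_of)
    n = len(docs)
    avgdl = sum(len(t) for t in docs.values()) / max(n, 1)
    df = Counter(term for toks in docs.values() for term in set(toks))
    scores = {}
    for doc_id, toks in docs.items():
        tf = Counter(toks)
        s = 0.0
        for term in query.lower().split():
            if term not in tf:
                continue
            idf = math.log((n - df[term] + 0.5) / (df[term] + 0.5) + 1)
            s += idf * tf[term] * (k1 + 1) / (
                tf[term] + k1 * (1 - b + b * len(toks) / avgdl))
        scores[doc_id] = s
    return sorted(scores.items(), key=lambda kv: -kv[1])

add(1, "d1", "boolean retrieval in patent search")
add(2, "d2", "ranked retrieval with changing term statistics")
print(bm25("ranked retrieval", as_of=1))  # d2 is invisible at this timestamp
print(bm25("ranked retrieval", as_of=2))
```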
+
+
+
+
+ + ☆ Fine-Grained Guidance for Retrievers: Leveraging LLMs' Feedback in + Retrieval-Augmented Generation + + +
+ Retrieval-Augmented Generation (RAG) has proven to be an effective method for +mitigating hallucination issues inherent in large language models (LLMs). +Previous approaches typically train retrievers based on semantic similarity, +lacking optimization for RAG. More recent works have proposed aligning +retrievers with the preference signals of LLMs. However, these preference +signals are often difficult for dense retrievers, which typically have weaker +language capabilities, to understand and learn effectively. Drawing inspiration +from pedagogical theories like Guided Discovery Learning, we propose a novel +framework, FiGRet (Fine-grained Guidance for Retrievers), which leverages the +language capabilities of LLMs to construct examples from a more granular, +information-centric perspective to guide the learning of retrievers. +Specifically, our method utilizes LLMs to construct easy-to-understand examples +from samples where the retriever performs poorly, focusing on three learning +objectives highly relevant to the RAG scenario: relevance, comprehensiveness, +and purity. These examples serve as scaffolding to ultimately align the +retriever with the LLM's preferences. Furthermore, we employ a dual curriculum +learning strategy and leverage the reciprocal feedback between LLM and +retriever to further enhance the performance of the RAG system. A series of +experiments demonstrate that our proposed framework enhances the performance of +RAG systems equipped with different retrievers and is applicable to various +LLMs. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Lexicalization Is All You Need: Examining the Impact of Lexical + Knowledge in a Compositional QALD System + + +
+ In this paper, we examine the impact of lexicalization on Question Answering +over Linked Data (QALD). It is well known that one of the key challenges in +interpreting natural language questions with respect to SPARQL lies in bridging +the lexical gap, that is mapping the words in the query to the correct +vocabulary elements. We argue in this paper that lexicalization, that is +explicit knowledge about the potential interpretations of a word with respect +to the given vocabulary, significantly eases the task and increases the +performance of QA systems. Towards this goal, we present a compositional QA +system that can leverage explicit lexical knowledge in a compositional manner +to infer the meaning of a question in terms of a SPARQL query. We show that +such a system, given lexical knowledge, has a performance well beyond current +QA systems, achieving up to a $35.8\%$ increase in the micro $F_1$ score +compared to the best QA system on QALD-9. This shows the importance and +potential of including explicit lexical knowledge. In contrast, we show that +LLMs have limited abilities to exploit lexical knowledge, with only marginal +improvements compared to a version without lexical knowledge. This shows that +LLMs have no ability to compositionally interpret a question on the basis of +the meaning of its parts, a key feature of compositional approaches. Taken +together, our work shows new avenues for QALD research, emphasizing the +importance of lexicalization and compositionality. + +
+
+ comment: 24th International Conference on Knowledge Engineering and Knowledge + Management (EKAW 2024), November 26-28, 2024, Amsterdam, The Netherlands +
+
+
+
+
+ + ☆ Data Fusion of Synthetic Query Variants With Generative Large Language + Models SIGIR + + +
+ Considering query variance in information retrieval (IR) experiments is +beneficial for retrieval effectiveness. Especially ranking ensembles based on +different topically related queries retrieve better results than rankings based +on a single query alone. Recently, generative instruction-tuned Large Language +Models (LLMs) improved on a variety of different tasks in capturing human +language. To this end, this work explores the feasibility of using synthetic +query variants generated by instruction-tuned LLMs in data fusion experiments. +More specifically, we introduce a lightweight, unsupervised, and cost-efficient +approach that exploits principled prompting and data fusion techniques. In our +experiments, LLMs produce more effective queries when provided with additional +context information on the topic. Furthermore, our analysis based on four TREC +newswire benchmarks shows that data fusion based on synthetic query variants is +significantly better than baselines with single queries and also outperforms +pseudo-relevance feedback methods. We publicly share the code and query +datasets with the community as resources for follow-up studies. + +
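One way to picture the fusion step is below: ranked lists obtained for the original query and for LLM-generated variants are merged into a single ranking. Reciprocal rank fusion is used here purely for illustration, and the run lists are invented; the paper's principled data fusion techniques and prompting setup are not reproduced.

```python
# Fuse rankings produced by synthetic query variants of the same topic.
from collections import defaultdict

def reciprocal_rank_fusion(rankings, k=60):
    """rankings: list of ranked doc-id lists, one per query variant."""
    scores = defaultdict(float)
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] += 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

# Hypothetical ranked lists for the original query and two LLM-generated
# variants of the same topic.
runs = [
    ["d3", "d1", "d7", "d2"],     # original query
    ["d1", "d3", "d5"],           # variant 1
    ["d1", "d2", "d3", "d9"],     # variant 2
]
print(reciprocal_rank_fusion(runs))   # documents ranked by fused score
```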
+
+ comment: The definitive version of record was published in SIGIR-AP '24 +
+
+
+
+
+
+ ☆ The Essence of the Essence from the Web: The Metasearch Engine
+
+
+
+ The exponential growth of information sources on the web, and the continuing
+technological progress of search tools such as search engines, make it hard
+for users to know which tool is best suited to their query and which is not.
+A metasearch engine reduces this burden by dispatching a query to multiple
+search engines in parallel and refining their results to return the best of
+the best. These engines do not maintain their own database of web pages;
+instead, they send the search terms to the databases maintained by the search
+engine companies, collect the results from all the engines queried, and then
+compile the results for presentation to the user. In this paper, we describe
+the working of a typical metasearch engine and then present a comparative
+study of traditional search engines and metasearch engines on the basis of
+different parameters, showing how metasearch engines improve on individual
+search engines.
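The dispatch-and-compile workflow described above can be sketched as follows. The engines are stub functions and the merging is a simple de-duplicating round-robin; a production metasearch engine would call real search APIs and apply a proper result-merging model.

```python
# Dispatch one query to several engines in parallel, then merge the results.
from concurrent.futures import ThreadPoolExecutor

def engine_a(query):          # stand-in for one underlying search engine
    return ["url1", "url2", "url3"]

def engine_b(query):          # stand-in for another engine
    return ["url2", "url4"]

def metasearch(query, engines):
    with ThreadPoolExecutor() as pool:
        result_lists = list(pool.map(lambda e: e(query), engines))
    merged, seen = [], set()
    # simple round-robin interleaving; real systems re-rank instead
    for i in range(max(map(len, result_lists))):
        for results in result_lists:
            if i < len(results) and results[i] not in seen:
                seen.add(results[i])
                merged.append(results[i])
    return merged

print(metasearch("metasearch engines", [engine_a, engine_b]))
```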
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ SEGMN: A Structure-Enhanced Graph Matching Network for Graph Similarity + Learning + + +
+ Graph similarity computation (GSC) aims to quantify the similarity score
+between two graphs. Although recent GSC methods based on graph neural networks
+(GNNs) take advantage of intra-graph structures in message passing, few of
+them fully utilize the structure carried by edges to boost the representation
+of their connected nodes. Moreover, previous cross-graph node embedding
+matching lacks awareness of the overall structure of the graph pair, because
+the node representations from GNNs are confined to intra-graph structure,
+which leads to unreasonable similarity scores. Intuitively, the cross-graph
+structure represented in the assignment graph is helpful for rectifying
+inappropriate matchings. Therefore, we propose a structure-enhanced graph
+matching network (SEGMN). Equipped with a dual embedding learning module and a
+structure perception matching module, SEGMN achieves structure enhancement in
+both embedding learning and cross-graph matching. The dual embedding learning
+module incorporates adjacent edge representations into each node to achieve a
+structure-enhanced representation. The structure perception matching module
+achieves cross-graph structure enhancement through assignment graph
+convolution. The similarity score of each cross-graph node pair can be
+rectified by aggregating messages from structurally relevant node pairs.
+Experimental results on benchmark datasets demonstrate that SEGMN outperforms
+state-of-the-art GSC methods on the GED regression task, and that the
+structure perception matching module is plug-and-play, further improving the
+performance of the baselines by up to 25%.
+
+
+
+
+ + ☆ Advanced RAG Models with Graph Structures: Optimizing Complex Knowledge + Reasoning and Text Generation + + +
+ This study aims to optimize the existing retrieval-augmented generation (RAG)
+model by introducing a graph structure to improve its performance on complex
+knowledge reasoning tasks. The traditional RAG model handles complex
+graph-structured information (such as knowledge graphs and hierarchical
+relationships) inefficiently, which affects the quality and consistency of the
+generated results. This study proposes a scheme that processes
+graph-structured data by incorporating a graph neural network (GNN), so that
+the model can capture the complex relationships between entities, thereby
+improving the knowledge consistency and reasoning ability of the generated
+text. The experiments use the Natural Questions (NQ) dataset and compare the
+proposed model with multiple existing generation models. The results show that
+the graph-based RAG model proposed in this paper is superior to traditional
+generation models in terms of quality, knowledge consistency, and reasoning
+ability, especially on tasks that require multi-dimensional reasoning. By
+combining an enhanced retrieval module with the graph neural network, the
+model in this study can better handle complex background knowledge and has
+broad potential value in many practical application scenarios.
+
+
+
+
+ + ♻ ☆ ContextIQ: A Multimodal Expert-Based Video Retrieval System for + Contextual Advertising WACV 2025 + + +
+ Contextual advertising serves ads that are aligned to the content that the +user is viewing. The rapid growth of video content on social platforms and +streaming services, along with privacy concerns, has increased the need for +contextual advertising. Placing the right ad in the right context creates a +seamless and pleasant ad viewing experience, resulting in higher audience +engagement and, ultimately, better ad monetization. From a technology +standpoint, effective contextual advertising requires a video retrieval system +capable of understanding complex video content at a very granular level. +Current text-to-video retrieval models based on joint multimodal training +demand large datasets and computational resources, limiting their practicality +and lacking the key functionalities required for ad ecosystem integration. We +introduce ContextIQ, a multimodal expert-based video retrieval system designed +specifically for contextual advertising. ContextIQ utilizes modality-specific +experts-video, audio, transcript (captions), and metadata such as objects, +actions, emotion, etc.-to create semantically rich video representations. We +show that our system, without joint training, achieves better or comparable +results to state-of-the-art models and commercial solutions on multiple +text-to-video retrieval benchmarks. Our ablation studies highlight the benefits +of leveraging multiple modalities for enhanced video retrieval accuracy instead +of using a vision-language model alone. Furthermore, we show how video +retrieval systems such as ContextIQ can be used for contextual advertising in +an ad ecosystem while also addressing concerns related to brand safety and +filtering inappropriate content. + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ INQUIRE: A Natural World Text-to-Image Retrieval Benchmark NeurIPS 2024 + + +
+ We introduce INQUIRE, a text-to-image retrieval benchmark designed to +challenge multimodal vision-language models on expert-level queries. INQUIRE +includes iNaturalist 2024 (iNat24), a new dataset of five million natural world +images, along with 250 expert-level retrieval queries. These queries are paired +with all relevant images comprehensively labeled within iNat24, comprising +33,000 total matches. Queries span categories such as species identification, +context, behavior, and appearance, emphasizing tasks that require nuanced image +understanding and domain expertise. Our benchmark evaluates two core retrieval +tasks: (1) INQUIRE-Fullrank, a full dataset ranking task, and (2) +INQUIRE-Rerank, a reranking task for refining top-100 retrievals. Detailed +evaluation of a range of recent multimodal models demonstrates that INQUIRE +poses a significant challenge, with the best models failing to achieve an +mAP@50 above 50%. In addition, we show that reranking with more powerful +multimodal models can enhance retrieval performance, yet there remains a +significant margin for improvement. By focusing on scientifically-motivated +ecological challenges, INQUIRE aims to bridge the gap between AI capabilities +and the needs of real-world scientific inquiry, encouraging the development of +retrieval systems that can assist with accelerating ecological and biodiversity +research. Our dataset and code are available at +https://inquire-benchmark.github.io + +
+
+ comment: Published in NeurIPS 2024, Datasets and Benchmarks Track +
+
+
+
+
+ + ♻ ☆ PersianRAG: A Retrieval-Augmented Generation System for Persian Language + + +
+ Retrieval augmented generation (RAG) models, which integrate large-scale
+pre-trained generative models with external retrieval mechanisms, have shown
+significant success in various natural language processing (NLP) tasks.
+However, applying RAG models to Persian, a low-resource language, poses
+distinct challenges. These challenges primarily involve the preprocessing,
+embedding, retrieval, prompt construction, language modeling, and response
+evaluation components of the system. In this paper, we address the challenges
+of implementing a real-world RAG system for Persian, called PersianRAG. We
+propose novel solutions to overcome these obstacles and evaluate our approach
+using several Persian benchmark datasets. Our experimental results demonstrate
+the capability of the PersianRAG framework to enhance question answering in
+Persian.
+
+
+
+
+ + ♻ ☆ Self-Compositional Data Augmentation for Scientific Keyphrase Generation + + +
+ State-of-the-art models for keyphrase generation require large amounts of
+training data to achieve good performance. However, obtaining keyphrase-labeled
+documents can be challenging and costly. To address this issue, we present a
+self-compositional data augmentation method. More specifically, we measure the
+relatedness of training documents based on their shared keyphrases, and combine
+similar documents to generate synthetic samples. The advantage of our method
+lies in its ability to create additional training samples that preserve domain
+coherence, without relying on external data or resources. Our results on
+multiple datasets spanning three different domains demonstrate that our method
+consistently improves keyphrase generation. A qualitative analysis of the
+generated keyphrases for the Computer Science domain confirms that the
+generated keyphrases become more representative.
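A minimal sketch of the self-compositional idea follows: documents that share keyphrases are paired and concatenated, and the synthetic sample's target keyphrases are the union of the originals. The Jaccard threshold, the toy documents, and the concatenation strategy are illustrative assumptions rather than the paper's exact procedure.

```python
# Pair training documents with overlapping keyphrases into synthetic samples.
train = [
    {"text": "doc about graph neural networks",
     "kps": {"graph neural network", "message passing"}},
    {"text": "doc about GNN pooling",
     "kps": {"graph neural network", "pooling"}},
    {"text": "doc about speech synthesis", "kps": {"text to speech"}},
]

def jaccard(a, b):
    return len(a & b) / len(a | b)

def self_compose(docs, threshold=0.2):
    synthetic = []
    for i, d1 in enumerate(docs):
        for d2 in docs[i + 1:]:
            if jaccard(d1["kps"], d2["kps"]) >= threshold:
                synthetic.append({
                    "text": d1["text"] + " " + d2["text"],
                    "kps": d1["kps"] | d2["kps"],
                })
    return synthetic

for sample in self_compose(train):
    print(sample["kps"])
```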
+
+ comment: Accepted to JCDL 2024. This is the author's version of the work. It + is posted here for your personal use. Not for redistribution. The definitive + version was published in the proceedings of the 2024 ACM/IEEE Joint + Conference on Digital Libraries (JCDL 24) + https://doi.org/10.1145/3677389.3702504 +
+
+
+
+
+ + ♻ ☆ CheX-GPT: Harnessing Large Language Models for Enhanced Chest X-ray + Report Labeling + + +
+ Free-text radiology reports present a rich data source for various medical +tasks, but effectively labeling these texts remains challenging. Traditional +rule-based labeling methods fall short of capturing the nuances of diverse +free-text patterns. Moreover, models using expert-annotated data are limited by +data scarcity and pre-defined classes, impacting their performance, flexibility +and scalability. To address these issues, our study offers three main +contributions: 1) We demonstrate the potential of GPT as an adept labeler using +carefully designed prompts. 2) Utilizing only the data labeled by GPT, we +trained a BERT-based labeler, CheX-GPT, which operates faster and more +efficiently than its GPT counterpart. 3) To benchmark labeler performance, we +introduced a publicly available expert-annotated test set, MIMIC-500, +comprising 500 cases from the MIMIC validation set. Our findings demonstrate +that CheX-GPT not only excels in labeling accuracy over existing models, but +also showcases superior efficiency, flexibility, and scalability, supported by +our introduction of the MIMIC-500 dataset for robust benchmarking. Code and +models are available at https://github.com/Soombit-ai/CheXGPT. + +
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ A Real-Time Adaptive Multi-Stream GPU System for Online Approximate + Nearest Neighborhood Search CIKM'24 + + +
+ In recent years, Approximate Nearest Neighbor Search (ANNS) has played a +pivotal role in modern search and recommendation systems, especially in +emerging LLM applications like Retrieval-Augmented Generation. There is a +growing exploration into harnessing the parallel computing capabilities of GPUs +to meet the substantial demands of ANNS. However, existing systems primarily +focus on offline scenarios, overlooking the distinct requirements of online +applications that necessitate real-time insertion of new vectors. This +limitation renders such systems inefficient for real-world scenarios. Moreover, +previous architectures struggled to effectively support real-time insertion due +to their reliance on serial execution streams. In this paper, we introduce a +novel Real-Time Adaptive Multi-Stream GPU ANNS System (RTAMS-GANNS). Our +architecture achieves its objectives through three key advancements: 1) We +initially examined the real-time insertion mechanisms in existing GPU ANNS +systems and discovered their reliance on repetitive copying and memory +allocation, which significantly hinders real-time effectiveness on GPUs. As a +solution, we introduce a dynamic vector insertion algorithm based on memory +blocks, which includes in-place rearrangement. 2) To enable real-time vector +insertion in parallel, we introduce a multi-stream parallel execution mode, +which differs from existing systems that operate serially within a single +stream. Our system utilizes a dynamic resource pool, allowing multiple streams +to execute concurrently without additional execution blocking. 3) Through +extensive experiments and comparisons, our approach effectively handles varying +QPS levels across different datasets, reducing latency by up to 40%-80%. The +proposed system has also been deployed in real-world industrial search and +recommendation systems, serving hundreds of millions of users daily, and has +achieved good results. + +
+
+ comment: Accepted by CIKM'24, V2 fixes some typos +
+
+
+
+
+ + ♻ ☆ ELASTIC: Efficient Linear Attention for Sequential Interest Compression + + +
+ State-of-the-art sequential recommendation models rely heavily on the
+transformer's attention mechanism. However, the quadratic computational and
+memory complexities of self-attention have limited its scalability for
+modeling users' long-range behaviour sequences. To address this problem, we
+propose ELASTIC, an Efficient Linear Attention for SequenTial Interest
+Compression, requiring only linear time complexity and decoupling model
+capacity from computational cost. Specifically, ELASTIC introduces
+fixed-length interest experts with a linear dispatcher attention mechanism,
+which compresses long-term behaviour sequences into a significantly more
+compact representation, reducing GPU memory usage by up to 90% with a 2.7x
+inference speed-up. The proposed linear dispatcher attention mechanism
+significantly reduces the quadratic complexity and makes the model feasible
+for adequately modeling extremely long sequences. Moreover, in order to retain
+the capacity for modeling various user interests, ELASTIC initializes a vast
+learnable interest memory bank and sparsely retrieves compressed user
+interests from the memory with negligible computational overhead. The proposed
+interest memory retrieval technique significantly expands the cardinality of
+the available interest space at the same computational cost, thereby striking
+a trade-off between recommendation accuracy and efficiency. To validate the
+effectiveness of ELASTIC, we conduct extensive experiments on various public
+datasets and compare it with several strong sequential recommenders.
+Experimental results demonstrate that ELASTIC consistently outperforms
+baselines by a significant margin and also highlight its computational
+efficiency when modeling long sequences. We will make our implementation code
+publicly available.
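The compression idea can be pictured as cross-attention from a small, fixed set of learnable slots to the long behaviour sequence, which keeps the cost linear in sequence length because the query side has constant size. The PyTorch sketch below is an editor's toy version; ELASTIC's dispatcher attention, interest memory bank, and training procedure are not reproduced here.

```python
# Compress a long behaviour sequence into a fixed number of "interest" slots.
import torch
import torch.nn as nn

class InterestCompressor(nn.Module):
    def __init__(self, dim=64, num_slots=8, heads=4):
        super().__init__()
        self.slots = nn.Parameter(torch.randn(num_slots, dim) * 0.02)
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, seq):                      # seq: (batch, length, dim)
        batch = seq.size(0)
        queries = self.slots.unsqueeze(0).expand(batch, -1, -1)
        compressed, _ = self.attn(queries, seq, seq)  # (batch, num_slots, dim)
        return compressed

seq = torch.randn(2, 10_000, 64)                 # long user behaviour sequence
print(InterestCompressor()(seq).shape)           # torch.Size([2, 8, 64])
```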
+
+ comment: We hereby withdraw this paper from arXiv due to incomplete + experiments. Upon further review, we have determined that additional + experimental work is necessary to fully validate our findings and conclusions +
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ Long-Form Text-to-Music Generation with Adaptive Prompts: A Case of + Study in Tabletop Role-Playing Games Soundtracks + + +
+ This paper investigates the capabilities of text-to-audio music generation +models in producing long-form music with prompts that change over time, +focusing on soundtrack generation for Tabletop Role-Playing Games (TRPGs). We +introduce Babel Bardo, a system that uses Large Language Models (LLMs) to +transform speech transcriptions into music descriptions for controlling a +text-to-music model. Four versions of Babel Bardo were compared in two TRPG +campaigns: a baseline using direct speech transcriptions, and three LLM-based +versions with varying approaches to music description generation. Evaluations +considered audio quality, story alignment, and transition smoothness. Results +indicate that detailed music descriptions improve audio quality while +maintaining consistency across consecutive descriptions enhances story +alignment and transition smoothness. + +
+
+ comment: Paper accepted at the LAMIR 2024 workshop +
+
+
+
+
+ + ☆ Inter-Frame Coding for Dynamic Meshes via Coarse-to-Fine Anchor Mesh + Generation + + +
+ In the current Video-based Dynamic Mesh Coding (V-DMC) standard, inter-frame +coding is restricted to mesh frames with constant topology. Consequently, +temporal redundancy is not fully leveraged, resulting in suboptimal compression +efficacy. To address this limitation, this paper introduces a novel +coarse-to-fine scheme to generate anchor meshes for frames with time-varying +topology. Initially, we generate a coarse anchor mesh using an octree-based +nearest neighbor search. Motion estimation compensates for regions with +significant motion changes during this process. However, the quality of the +coarse mesh is low due to its suboptimal vertices. To enhance details, the fine +anchor mesh is further optimized using the Quadric Error Metrics (QEM) +algorithm to calculate more precise anchor points. The inter-frame anchor mesh +generated herein retains the connectivity of the reference base mesh, while +concurrently preserving superior quality. Experimental results show that our +method achieves 7.2% ~ 10.3% BD-rate gain compared to the existing V-DMC test +model version 7. + +
+
+
+
+
+ + ☆ Both Text and Images Leaked! A Systematic Analysis of Multimodal LLM + Data Contamination + + +
+ Rapid progress in multimodal large language models (MLLMs) has led to
+superior performance on various multimodal benchmarks. However, the issue of
+data contamination during training creates challenges in performance
+evaluation and comparison. While numerous methods exist for detecting dataset
+contamination in large language models (LLMs), they are less effective for
+MLLMs due to their various modalities and multiple training phases. In this
+study, we introduce a multimodal data contamination detection framework,
+MM-Detect, designed for MLLMs. Our experimental results indicate that
+MM-Detect is sensitive to varying degrees of contamination and can highlight
+significant performance improvements due to leakage of the training sets of
+multimodal benchmarks. Furthermore, we explore the possibility of
+contamination originating from the pre-training phase of the LLMs used by
+MLLMs and from the fine-tuning phase of MLLMs, offering new insights into the
+stages at which contamination may be introduced.
+
+
+
+
+ + ☆ Investigating Conceptual Blending of a Diffusion Model for Improving + Nonword-to-Image Generation ACM MM 2024 + + +
+ Text-to-image diffusion models sometimes depict blended concepts in the +generated images. One promising use case of this effect would be the +nonword-to-image generation task which attempts to generate images intuitively +imaginable from a non-existing word (nonword). To realize nonword-to-image +generation, an existing study focused on associating nonwords with +similar-sounding words. Since each nonword can have multiple similar-sounding +words, generating images containing their blended concepts would increase +intuitiveness, facilitating creative activities and promoting computational +psycholinguistics. Nevertheless, no existing study has quantitatively evaluated +this effect in either diffusion models or the nonword-to-image generation +paradigm. Therefore, this paper first analyzes the conceptual blending in a +pretrained diffusion model, Stable Diffusion. The analysis reveals that a high +percentage of generated images depict blended concepts when inputting an +embedding interpolating between the text embeddings of two text prompts +referring to different concepts. Next, this paper explores the best text +embedding space conversion method of an existing nonword-to-image generation +framework to ensure both the occurrence of conceptual blending and image +generation quality. We compare the conventional direct prediction approach with +the proposed method that combines $k$-nearest neighbor search and linear +regression. Evaluation reveals that the enhanced accuracy of the embedding +space conversion by the proposed method improves the image generation quality, +while the emergence of conceptual blending could be attributed mainly to the +specific dimensions of the high-dimensional text embedding space. + +
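The interpolation probed in the analysis can be sketched as follows: an embedding lying between the embeddings of two prompts is handed to the generator as conditioning. The `encode` function is a stand-in for a real text encoder and no image model is called; in the paper this role is played by Stable Diffusion's text encoder and denoising pipeline.

```python
# Interpolate between the embeddings of two prompts; illustrative only.
import numpy as np

rng = np.random.default_rng(0)

def encode(prompt):                    # placeholder for a real text encoder
    return rng.standard_normal(768)

def interpolate(e1, e2, alpha):
    """Linear interpolation between two prompt embeddings."""
    return (1 - alpha) * e1 + alpha * e2

e_lion, e_tiger = encode("a lion"), encode("a tiger")
for alpha in (0.0, 0.25, 0.5, 0.75, 1.0):
    blended = interpolate(e_lion, e_tiger, alpha)
    # a real pipeline would pass `blended` as the conditioning embedding
    print(alpha, np.round(blended[:3], 3))
```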
+
+ comment: Paper accepted at ACM MM 2024 (doi: 10.1145/3664647.3681202) with + supplementary materials concatenated +
+
+
+
+
+ + ♻ ☆ Efficiently Collecting Training Dataset for 2D Object Detection by + Online Visual Feedback + + +
+ Training deep-learning-based vision systems requires the manual annotation of
+a significant number of images. Such manual annotation is highly
+time-consuming and labor-intensive. Although previous studies have attempted
+to eliminate the effort required for annotation, the effort required for image
+collection has remained. To address this, we propose a human-in-the-loop
+dataset collection method that uses a web application. To balance workload and
+performance by encouraging the collection of multi-view object image datasets
+in an enjoyable manner, thereby amplifying motivation, we propose three types
+of online visual feedback features that track the progress of the collection.
+Our experiments thoroughly investigated the impact of each feature on
+collection performance and quality of operation. The results suggest the
+feasibility of the collected data for annotation and object detection.
+
+ comment: 13 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Transferable Learned Image Compression-Resistant Adversarial + Perturbations BMVC 2024 + + +
+ Adversarial attacks can readily disrupt image classification systems,
+revealing the vulnerability of DNN-based recognition tasks. While existing
+adversarial perturbations are primarily applied to uncompressed images or to
+images compressed with the traditional method, i.e., JPEG, limited studies
+have investigated the robustness of image classification models in the
+context of DNN-based image compression. With the rapid evolution of advanced
+image compression, DNN-based learned image compression has emerged as a
+promising approach for transmitting images in many security-critical
+applications, such as cloud-based face recognition and autonomous driving,
+due to its superior performance over traditional compression. Therefore,
+there is a pressing need to fully investigate the robustness of a
+classification system whose inputs have been processed by learned image
+compression. To bridge this research gap, we explore adversarial attacks on a
+new pipeline that targets image classification models using learned image
+compressors as pre-processing modules. Furthermore, to enhance the
+transferability of perturbations across various quality levels and
+architectures of learned image compression models, we introduce a saliency
+score-based sampling method to enable the fast generation of transferable
+perturbations. Extensive experiments with popular attack methods demonstrate
+the enhanced transferability of our proposed method when attacking images
+that have been post-processed with different learned image compression
+models.
+
+ comment: Accepted by BMVC 2024 +
+
+
+
+
+ + ♻ ☆ Routing Experts: Learning to Route Dynamic Experts in Multi-modal Large + Language Models + + +
+ Recently, mixture of experts (MoE) has become a popular paradigm for
+achieving the trade-off between model capacity and efficiency in multi-modal
+large language models (MLLMs). Different from previous efforts, we are
+dedicated to exploring the dynamic expert path in an already existing MLLM and
+show that a standard MLLM can also be a mixture of experts. To approach this
+target, we propose a novel dynamic expert scheme for MLLMs, termed Routing
+Experts (RoE), which can achieve example-dependent optimal path routing
+without obvious structural tweaks. Meanwhile, a new structure-sparsity
+regularization is introduced to encourage MLLMs to learn more short-cut
+inference, ensuring efficiency. In addition, we make the first attempt to
+align the training and inference schemes of MLLMs in terms of network routing.
+To validate RoE, we apply it to a set of recent MLLMs, including LLaVA-1.5,
+LLaVA-HR and VILA, and conduct extensive experiments on a range of VL
+benchmarks. The experimental results not only show the clear advantages of RoE
+in improving MLLMs' efficiency, but also show clear gains over MoE-LLaVA in
+both performance and speed, e.g., an average performance gain of 3.3% on 5
+benchmarks while being faster.
+
+
+
+
+ + ♻ ☆ Beyond Single-Audio: Advancing Multi-Audio Processing in Audio Large + Language Models EMNLP24 + + +
+ Various audio-LLMs (ALLMs) have been explored recently for tackling different
+audio tasks simultaneously using a single, unified model. While existing
+evaluations of ALLMs primarily focus on single-audio tasks, real-world
+applications often involve processing multiple audio streams simultaneously.
+To bridge this gap, we propose the first multi-audio evaluation (MAE)
+benchmark that consists of 20 datasets from 11 multi-audio tasks encompassing
+both speech and sound scenarios. Comprehensive experiments on MAE demonstrate
+that the existing ALLMs, while powerful in comprehending primary audio
+elements in individual audio inputs, struggle to handle multi-audio scenarios.
+To this end, we propose a novel multi-audio-LLM (MALLM) to capture audio
+context among multiple similar audios using discriminative learning on our
+proposed synthetic data. The results demonstrate that the proposed MALLM
+outperforms all baselines and achieves high data efficiency using synthetic
+data without requiring human annotations. The proposed MALLM opens the door
+for ALLMs to enter the multi-audio processing era and brings us closer to
+replicating human auditory capabilities in machines.
+
+ comment: EMNLP24 Findings. Data available at + https://github.com/MatthewCYM/MALLM +
+
+
+
+
+ + ♻ ☆ Document Parsing Unveiled: Techniques, Challenges, and Prospects for + Structured Information Extraction + + +
+ Document parsing is essential for converting unstructured and semi-structured
+documents, such as contracts, academic papers, and invoices, into structured,
+machine-readable data. It extracts reliable structured data from unstructured
+inputs, providing great convenience for numerous applications. Especially with
+recent achievements in Large Language Models, document parsing plays an
+indispensable role in both knowledge base construction and training data
+generation. This survey presents a comprehensive review of the current state
+of document parsing, covering key methodologies, from modular pipeline systems
+to end-to-end models driven by large vision-language models. Core components
+such as layout detection, content extraction (including text, tables, and
+mathematical expressions), and multi-modal data integration are examined in
+detail. Additionally, this paper discusses the challenges faced by modular
+document parsing systems and vision-language models in handling complex
+layouts, integrating multiple modules, and recognizing high-density text. It
+emphasizes the importance of developing larger and more diverse datasets and
+outlines future research directions.
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 24 + +
+
+
+ + ☆ Automated, LLM enabled extraction of synthesis details for reticular + materials from scientific literature + + +
+ Automated knowledge extraction from scientific literature can potentially
+accelerate materials discovery. We have investigated an approach for
+extracting synthesis protocols for reticular materials from scientific
+literature using large language models (LLMs). To that end, we introduce a
+Knowledge Extraction Pipeline (KEP) that automates LLM-assisted paragraph
+classification and information extraction. By applying prompt engineering with
+in-context learning (ICL) to a set of open-source LLMs, we demonstrate that
+LLMs can retrieve chemical information from PDF documents, without the need
+for fine-tuning or training and at a reduced risk of hallucination. By
+comparing the performance of five open-source families of LLMs on both
+paragraph classification and information extraction tasks, we observe
+excellent model performance even when only a few example paragraphs are
+included in the ICL prompts. The results show the potential of the KEP
+approach for reducing human annotation and data curation efforts in automated
+scientific knowledge extraction.
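A rough sketch of the in-context learning setup follows: a few labelled example paragraphs are placed in the prompt and the model is asked to label a new paragraph. The example texts, labels, and the `call_llm` placeholder are invented for illustration and are not the paper's prompts or models.

```python
# Build a few-shot classification prompt for synthesis-paragraph detection.
EXAMPLES = [
    ("The MOF was synthesized by mixing the linker and zinc nitrate in DMF "
     "at 120 C.", "synthesis"),
    ("Figure 3 shows the N2 adsorption isotherm of the activated sample.",
     "not synthesis"),
]

def build_prompt(paragraph):
    lines = ["Classify each paragraph as 'synthesis' or 'not synthesis'.", ""]
    for text, label in EXAMPLES:
        lines += [f"Paragraph: {text}", f"Label: {label}", ""]
    lines += [f"Paragraph: {paragraph}", "Label:"]
    return "\n".join(lines)

def call_llm(prompt):
    # placeholder for an open-source LLM inference call
    return "synthesis"

paragraph = "The reticular framework was obtained solvothermally over 24 h."
print(call_llm(build_prompt(paragraph)))
```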
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Self-supervised Hierarchical Representation for Medication + Recommendation + + +
+ A medication recommender suggests appropriate medication combinations based
+on a patient's health history, e.g., diagnoses and procedures. Existing works
+represent different diagnoses/procedures as well-separated one-hot encodings.
+However, they ignore the latent hierarchical structure of these medical terms,
+undermining the generalization performance of the model. For example,
+"Respiratory Diseases", "Chronic Respiratory Diseases" and "Chronic
+Bronchitis" have a hierarchical relationship, progressing from general to
+specific. To address this issue, we propose a novel hierarchical encoder named
+HIER to hierarchically represent diagnoses and procedures, which is based on
+standard medical codes and compatible with any existing method. Specifically,
+the proposed method learns relation embeddings with a self-supervised
+objective that incorporates the neighboring hierarchical structure.
+Additionally, we develop a position encoding to explicitly introduce the
+global hierarchical position. Extensive experiments demonstrate significant
+and consistent improvements in recommendation accuracy across four baselines
+and two real-world clinical datasets.
+
+
+
+
+ + ☆ Efficient and Effective Adaptation of Multimodal Foundation Models in + Sequential Recommendation SIGIR2024 + + +
+ Multimodal foundation models (MFMs) have revolutionized sequential +recommender systems through advanced representation learning. While +Parameter-efficient Fine-tuning (PEFT) is commonly used to adapt these models, +studies often prioritize parameter efficiency, neglecting GPU memory and +training speed. To address this, we introduced the IISAN framework, +significantly enhancing efficiency. However, IISAN was limited to symmetrical +MFMs and identical text and image encoders, preventing the use of +state-of-the-art Large Language Models. To overcome this, we developed +IISAN-Versa, a versatile plug-and-play architecture compatible with both +symmetrical and asymmetrical MFMs. IISAN-Versa employs a Decoupled PEFT +structure and utilizes both intra- and inter-modal adaptation. It effectively +handles asymmetry through a simple yet effective combination of group +layer-dropping and dimension transformation alignment. Our research +demonstrates that IISAN-Versa effectively adapts large text encoders, and we +further identify a scaling effect where larger encoders generally perform +better. IISAN-Versa also demonstrates strong versatility in our defined +multimodal scenarios, which include raw titles and captions generated from +images and videos. Additionally, IISAN-Versa achieved state-of-the-art +performance on the Microlens public benchmark. We will release our code and +datasets to support future research. + +
+
+ comment: The extension of IISAN in SIGIR2024 +
+
+
+
+
+ + ☆ HtmlRAG: HTML is Better Than Plain Text for Modeling Retrieved Knowledge + in RAG Systems + + +
+ Retrieval-Augmented Generation (RAG) has been shown to improve knowledge +capabilities and alleviate the hallucination problem of LLMs. The Web is a +major source of external knowledge used in RAG systems, and many commercial +systems such as ChatGPT and Perplexity have used Web search engines as their +major retrieval systems. Typically, such RAG systems retrieve search results, +download HTML sources of the results, and then extract plain texts from the +HTML sources. Plain text documents or chunks are fed into the LLMs to augment +the generation. However, much of the structural and semantic information +inherent in HTML, such as headings and table structures, is lost during this +plain-text-based RAG process. To alleviate this problem, we propose HtmlRAG, +which uses HTML instead of plain text as the format of retrieved knowledge in +RAG. We believe HTML is better than plain text in modeling knowledge in +external documents, and most LLMs possess robust capacities to understand HTML. +However, utilizing HTML presents new challenges. HTML contains additional +content such as tags, JavaScript, and CSS specifications, which bring extra +input tokens and noise to the RAG system. To address this issue, we propose +HTML cleaning, compression, and pruning strategies, to shorten the HTML while +minimizing the loss of information. Specifically, we design a two-step +block-tree-based pruning method that prunes useless HTML blocks and keeps only +the relevant part of the HTML. Experiments on six QA datasets confirm the +superiority of using HTML in RAG systems. + +
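The clean-then-prune idea can be sketched with off-the-shelf tools. The snippet below uses BeautifulSoup to drop non-content tags and plain term overlap to keep the most query-relevant blocks; HtmlRAG's actual two-step block-tree pruning is considerably more involved, so treat this only as an editor's illustration.

```python
# Clean an HTML page, then keep the blocks most lexically similar to the query.
from bs4 import BeautifulSoup

def clean_html(raw_html):
    soup = BeautifulSoup(raw_html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()                       # drop non-content tags
    return soup

def prune_blocks(soup, query, keep=2):
    q_terms = set(query.lower().split())
    blocks = [str(tag) for tag in soup.find_all(["p", "table", "h1", "h2"])]
    scored = sorted(
        blocks,
        key=lambda b: len(q_terms & set(BeautifulSoup(b, "html.parser")
                                        .get_text().lower().split())),
        reverse=True)
    return "\n".join(scored[:keep])           # shortened HTML fed to the LLM

raw = """<html><script>track()</script>
<h1>Llama pricing</h1><p>Llama 3 API pricing is listed per token.</p>
<p>Unrelated footer text.</p></html>"""
print(prune_blocks(clean_html(raw), "llama pricing per token"))
```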
+
+
+
+
+ + ☆ Graph-DPEP: Decomposed Plug and Ensemble Play for Few-Shot Document + Relation Extraction with Graph-of-Thoughts Reasoning + + +
+ Large language models (LLMs) pre-trained on massive corpora have demonstrated +impressive few-shot learning capability on many NLP tasks. Recasting an NLP +task into a text-to-text generation task is a common practice so that +generative LLMs can be prompted to resolve it. However, performing +document-level relation extraction (DocRE) tasks with generative LLM models is +still challenging due to the structured output format of DocRE, which +complicates the conversion to plain text. Limited information available in +few-shot samples and prompt instructions induce further difficulties and +challenges in relation extraction for mentioned entities in a document. In this +paper, we represent the structured output as a graph-style triplet rather than +natural language expressions and leverage generative LLMs for the DocRE task. +Our approach, the Graph-DPEP framework is grounded in the reasoning behind +triplet explanation thoughts presented in natural language. In this framework, +we first introduce a ``decomposed-plug" method for performing the generation +from LLMs over prompts with type-space decomposition to alleviate the burden of +distinguishing all relation types. Second, we employ a verifier for calibrating +the generation and identifying overlooked query entity pairs. Third, we develop +"ensemble-play", reapplying generation on the entire type list by leveraging +the reasoning thoughts embedded in a sub-graph associated with the missing +query pair to address the missingness issue. Through extensive comparisons with +existing prompt techniques and alternative Language Models (LLMs), our +framework demonstrates superior performance on publicly available benchmarks in +experiments. + +
+
+
+
+
+ + ☆ DM4Steal: Diffusion Model For Link Stealing Attack On Graph Neural + Networks + + +
+ Graphs have become increasingly integral to the advancement of recommendation
+systems, particularly with the fast development of graph neural networks
+(GNNs). By exploiting rich node features and link information, GNNs are
+designed to provide personalized and accurate suggestions. Meanwhile, the
+privacy leakage of GNNs in such contexts has also captured special attention.
+Prior work has revealed that a malicious user can utilize auxiliary knowledge
+to extract sensitive link data of the target graph, integral to recommendation
+systems, via the decisions made by the target GNN model. This poses a
+significant risk to the integrity and confidentiality of data used in
+recommendation systems. Though important, previous works on GNN privacy
+leakage are still challenged in three aspects, i.e., limited stealing attack
+scenarios, sub-optimal attack performance, and adaptation against defenses. To
+address these issues, we propose a diffusion model based link stealing attack,
+named DM4Steal. It differs from previous work in three critical aspects. (i)
+Generality: aiming at six attack scenarios with limited auxiliary knowledge,
+we propose a novel training strategy for diffusion models so that DM4Steal is
+transferable to diverse attack scenarios. (ii) Effectiveness: benefiting from
+the retention of semantic structure in the diffusion model during the training
+process, DM4Steal is capable of learning the precise topology of the target
+graph through the GNN decision process. (iii) Adaptation: when the GNN is
+defended (e.g., with DP or Dropout), DM4Steal relies on the stability that
+comes from sampling the score model multiple times to keep performance
+degradation to a minimum, thus implementing a successful adaptive attack on
+defended GNNs.
+
+
+
+
+ + ☆ Learning to Unify Audio, Visual and Text for Audio-Enhanced Multilingual + Visual Answer Localization + + +
+ The goal of Multilingual Visual Answer Localization (MVAL) is to locate a +video segment that answers a given multilingual question. Existing methods +either focus solely on visual modality or integrate visual and subtitle +modalities. However, these methods neglect the audio modality in videos, +consequently leading to incomplete input information and poor performance in +the MVAL task. In this paper, we propose a unified Audio-Visual-Textual Span +Localization (AVTSL) method that incorporates audio modality to augment both +visual and textual representations for the MVAL task. Specifically, we +integrate features from three modalities and develop three predictors, each +tailored to the unique contributions of the fused modalities: an audio-visual +predictor, a visual predictor, and a textual predictor. Each predictor +generates predictions based on its respective modality. To maintain consistency +across the predicted results, we introduce an Audio-Visual-Textual Consistency +module. This module utilizes a Dynamic Triangular Loss (DTL) function, allowing +each modality's predictor to dynamically learn from the others. This +collaborative learning ensures that the model generates consistent and +comprehensive answers. Extensive experiments show that our proposed method +outperforms several state-of-the-art (SOTA) methods, which demonstrates the +effectiveness of the audio modality. + +
+
+
+
+
+ + ☆ WASHtsApp -- A RAG-powered WhatsApp Chatbot for supporting rural African + clean water access, sanitation and hygiene + + +
+ This paper introduces WASHtsApp, a WhatsApp-based chatbot designed to educate +rural African communities on clean water access, sanitation, and hygiene (WASH) +principles. WASHtsApp leverages a Retrieval-Augmented Generation (RAG) approach +to address the limitations of previous approaches with limited reach or missing +contextualization. The paper details the development process, employing Design +Science Research Methodology. The evaluation consisted of two phases: content +validation by four WASH experts and community validation by potential users. +Content validation confirmed WASHtsApp's ability to provide accurate and +relevant WASH-related information. Community validation indicated high user +acceptance and perceived usefulness of the chatbot. The paper concludes by +discussing the potential for further development, including incorporating local +languages and user data analysis for targeted interventions. It also proposes +future research cycles focused on wider deployment and leveraging user data for +educational purposes. + +
+
+ comment: Working Paper +
+
+
+
+
+ + ☆ Enhancing EmoBot: An In-Depth Analysis of User Satisfaction and Faults + in an Emotion-Aware Chatbot + + +
+ The research community has traditionally shown a keen interest in emotion
+modeling, with a notable emphasis on the detection aspect. In contrast, the
+exploration of emotion generation has received less attention. This study
+delves into an existing state-of-the-art emotional chatbot, EmoBot, designed
+for generating emotions in general-purpose conversations. This research
+involves a comprehensive examination, including a survey to evaluate EmoBot's
+proficiency in key dimensions like usability, accuracy, and overall user
+satisfaction, with a specific focus on fault tolerance. By closely examining
+the chatbot's operations, we identified some noteworthy shortcomings in the
+existing model. We propose solutions designed to address and overcome the
+identified issues.
+
+ comment: 3 pages, extended abstract +
+
+
+
+
+ + ☆ Leveraging Vision-Language Models for Manufacturing Feature Recognition + in CAD Designs + + +
+ Automatic feature recognition (AFR) is essential for transforming design +knowledge into actionable manufacturing information. Traditional AFR methods, +which rely on predefined geometric rules and large datasets, are often +time-consuming and lack generalizability across various manufacturing features. +To address these challenges, this study investigates vision-language models +(VLMs) for automating the recognition of a wide range of manufacturing features +in CAD designs without the need for extensive training datasets or predefined +rules. Instead, prompt engineering techniques, such as multi-view query images, +few-shot learning, sequential reasoning, and chain-of-thought, are applied to +enable recognition. The approach is evaluated on a newly developed CAD dataset +containing designs of varying complexity relevant to machining, additive +manufacturing, sheet metal forming, molding, and casting. Five VLMs, including +three closed-source models (GPT-4o, Claude-3.5-Sonnet, and Claude-3.0-Opus) and +two open-source models (LLava and MiniCPM), are evaluated on this dataset with +ground truth features labelled by experts. Key metrics include feature quantity +accuracy, feature name matching accuracy, hallucination rate, and mean absolute +error (MAE). Results show that Claude-3.5-Sonnet achieves the highest feature +quantity accuracy (74%) and name-matching accuracy (75%) with the lowest MAE +(3.2), while GPT-4o records the lowest hallucination rate (8%). In contrast, +open-source models have higher hallucination rates (>30%) and lower accuracies +(<40%). This study demonstrates the potential of VLMs to automate feature +recognition in CAD designs within diverse manufacturing scenarios. + +
+
+ comment: Paper has been submitted to The ASME Journal of Computing and + Information Science in Engineering (JCISE) +
+
+
+
+
+ + ☆ Language Models and Cycle Consistency for Self-Reflective Machine + Translation + + +
+ This paper introduces a novel framework that leverages large language models
+(LLMs) for machine translation (MT). We start with one conjecture: an ideal
+translation should contain complete and accurate information for a strong
+enough LLM to recover the original sentence. We generate multiple translation
+candidates from a source language A to a target language B, and subsequently
+translate these candidates back to the original language A. By evaluating the
+cycle consistency between the original and back-translated sentences using
+metrics such as token-level precision and accuracy, we implicitly estimate the
+translation quality in language B without knowing its ground truth. This also
+makes it possible to evaluate an LLM's translation capability using only
+monolingual corpora. For each source sentence, we select as the final answer
+the translation candidate with optimal cycle consistency with the original
+sentence. Our experiments demonstrate that larger LLMs, or the same LLM with
+more forward passes during inference, exhibit increased cycle consistency,
+aligning with the LLM model-size scaling law and the test-time computation
+scaling law. This work provides methods to 1) implicitly evaluate the
+translation quality of a sentence in the target language, 2) evaluate an LLM's
+capability for any-to-any-language translation, and 3) generate a better
+translation with a specific LLM.
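Candidate selection by cycle consistency can be sketched in a few lines: each candidate is back-translated and the candidate whose round trip best recovers the source is kept. The translation functions below are placeholder lookups, and only token-level precision is shown; the paper also uses other metrics.

```python
# Pick the translation whose back-translation best matches the source.
def token_precision(reference, hypothesis):
    ref, hyp = reference.lower().split(), hypothesis.lower().split()
    if not hyp:
        return 0.0
    return sum(1 for tok in hyp if tok in ref) / len(hyp)

def pick_by_cycle_consistency(source, candidates, back_translate):
    scored = [(token_precision(source, back_translate(cand)), cand)
              for cand in candidates]
    return max(scored)[1]

# toy stand-ins for forward/backward LLM translation calls
candidates = ["die katze sitzt auf der matte", "eine katze liegt irgendwo"]
back = {
    "die katze sitzt auf der matte": "the cat sits on the mat",
    "eine katze liegt irgendwo": "a cat lies somewhere",
}
best = pick_by_cycle_consistency("the cat sits on the mat",
                                 candidates, lambda c: back[c])
print(best)   # the candidate whose round trip recovers the source
```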
+
+
+
+
+ + ☆ Memory Augmented Cross-encoders for Controllable Personalized Search + + +
+ Personalized search represents a problem where retrieval models condition on +historical user interaction data in order to improve retrieval results. +However, personalization is commonly perceived as opaque and not amenable to +control by users. Further, personalization necessarily limits the space of +items that users are exposed to. Therefore, prior work notes a tension between +personalization and users' ability to discover novel items. While discovery +of novel items in personalization setups may be resolved through search result +diversification, these approaches do little to allow user control over +personalization. Therefore, in this paper, we introduce an approach for +controllable personalized search. Our model, CtrlCE, is a novel +cross-encoder augmented with an editable memory constructed from a user's +historical items. Our proposed memory augmentation allows cross-encoder models +to condition on large amounts of historical user data and supports user +interaction, permitting control over personalization. Further, controllable +personalization for search must account for queries that do not require +personalization, and therefore do not require user control. For this, we introduce a calibrated +mixing model which determines when personalization is necessary. This allows +system designers using CtrlCE to only obtain user input for control when +necessary. In multiple datasets of personalized search, we show that CtrlCE +delivers effective personalization and fulfills various key goals for +controllable personalized search. + +
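+ One way to picture the calibrated mixing idea is a learned gate that interpolates between a non-personalized score and a memory-augmented score; the module below is an illustrative sketch with assumed shapes and a simple sigmoid gate, not CtrlCE's actual architecture.
```python
import torch
import torch.nn as nn

class CalibratedMixer(nn.Module):
    """Gate between a non-personalized query-document score and a
    memory-augmented personalized score, so personalization is applied only
    when the query is predicted to benefit from it."""

    def __init__(self, hidden_dim: int):
        super().__init__()
        self.gate = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid())

    def forward(self, query_vec, plain_score, personalized_score):
        g = self.gate(query_vec).squeeze(-1)        # mixing weight in (0, 1)
        mixed = (1.0 - g) * plain_score + g * personalized_score
        return mixed, g                             # g can be surfaced for user control

# toy usage with random tensors
mixer = CalibratedMixer(hidden_dim=16)
q = torch.randn(4, 16)                              # batch of query encodings
s_plain, s_pers = torch.randn(4), torch.randn(4)
scores, gates = mixer(q, s_plain, s_pers)
```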
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ JEL: Applying End-to-End Neural Entity Linking in JPMorgan Chase + + +
+ Knowledge Graphs have emerged as a compelling abstraction for capturing key +relationships among the entities of interest to enterprises and for integrating +data from heterogeneous sources. JPMorgan Chase (JPMC) is leading this trend by +leveraging knowledge graphs across the organization for multiple mission-critical +applications such as risk assessment, fraud detection, investment +advice, etc. A core problem in leveraging a knowledge graph is to link mentions +(e.g., company names) that are encountered in textual sources to entities in +the knowledge graph. Although several techniques exist for entity linking, they +are tuned for entities that exist in Wikipedia and fail to generalize to the +entities that are of interest to an enterprise. In this paper, we propose a +novel end-to-end neural entity linking model (JEL) that uses minimal context +information and a margin loss to generate entity embeddings, and a Wide & Deep +Learning model to match character and semantic information respectively. We +show that JEL achieves state-of-the-art performance in linking mentions of +company names in financial news with entities in our knowledge graph. We report +on our efforts to deploy this model in the company-wide system to generate +alerts in response to financial news. The methodology used for JEL is directly +applicable and usable by other enterprises that need entity linking solutions +for data that are unique to their respective situations. + +
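+ The margin loss mentioned above can be illustrated with a simple hinge objective over mention and entity embeddings; this sketch uses cosine similarity, random tensors, and an assumed margin value, and is not JEL's exact formulation.
```python
import torch
import torch.nn.functional as F

def mention_entity_margin_loss(mention, pos_entity, neg_entity, margin=0.5):
    """Hinge-style margin loss: a mention embedding should be closer to its
    true entity embedding than to a sampled negative entity by at least `margin`."""
    pos_sim = F.cosine_similarity(mention, pos_entity, dim=-1)
    neg_sim = F.cosine_similarity(mention, neg_entity, dim=-1)
    return torch.clamp(margin - pos_sim + neg_sim, min=0.0).mean()

# toy usage with random embeddings
m, p, n = torch.randn(8, 64), torch.randn(8, 64), torch.randn(8, 64)
loss = mention_entity_margin_loss(m, p, n)
```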
+
+ comment: 8 pages, 4 figures, IAAI-21 +
+
+
+
+
+ + ☆ JPEC: A Novel Graph Neural Network for Competitor Retrieval in Financial + Knowledge Graphs SIGIR'24 + + +
+ Knowledge graphs have gained popularity for their ability to organize and +analyze complex data effectively. When combined with graph embedding +techniques, such as graph neural networks (GNNs), knowledge graphs become a +potent tool in providing valuable insights. This study explores the application +of graph embedding in identifying competitors from a financial knowledge graph. +Existing state-of-the-art (SOTA) models face challenges due to the unique +attributes of our knowledge graph, including directed and undirected +relationships, attributed nodes, and minimal annotated competitor connections. +To address these challenges, we propose a novel graph embedding model, +JPEC (JPMorgan Proximity Embedding for Competitor Detection), which utilizes a +graph neural network to learn from both first-order and second-order node +proximity together with vital features for competitor retrieval. JPEC +outperforms most existing models in extensive experiments, showcasing its +effectiveness in competitor retrieval. + +
+
+ comment: 5 pages, 4 figures, accepted by SIGIR'24 +
+
+
+
+
+ + ♻ ☆ PaCE: Parsimonious Concept Engineering for Large Language Models NeurIPS 2024 + + +
+ Large Language Models (LLMs) are being used for a wide variety of tasks. +While they are capable of generating human-like responses, they can also +produce undesirable output including potentially harmful information, racist or +sexist language, and hallucinations. Alignment methods are designed to reduce +such undesirable outputs via techniques such as fine-tuning, prompt +engineering, and representation engineering. However, existing methods face +several challenges: some require costly fine-tuning for every alignment task; +some do not adequately remove undesirable concepts, failing alignment; some +remove benign concepts, lowering the linguistic capabilities of LLMs. To +address these issues, we propose Parsimonious Concept Engineering (PaCE), a +novel activation engineering framework for alignment. First, to sufficiently +model the concepts, we construct a large-scale concept dictionary in the +activation space, in which each atom corresponds to a semantic concept. Given +any alignment task, we instruct a concept partitioner to efficiently annotate +the concepts as benign or undesirable. Then, at inference time, we decompose +the LLM activations along the concept dictionary via sparse coding, to +accurately represent the activations as linear combinations of benign and +undesirable components. By removing the latter ones from the activations, we +reorient the behavior of the LLM towards the alignment goal. We conduct +experiments on tasks such as response detoxification, faithfulness enhancement, +and sentiment revising, and show that PaCE achieves state-of-the-art alignment +performance while maintaining linguistic capabilities. + +
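+ A toy sketch of the inference-time editing step described above: sparse-code an activation vector over a concept dictionary and reconstruct it without the undesirable atoms. The Lasso coder, dictionary shapes, and `alpha` value are illustrative assumptions rather than PaCE's exact solver.
```python
import numpy as np
from sklearn.linear_model import Lasso

def remove_undesirable_concepts(activation, concept_dict, undesirable_idx, alpha=0.01):
    """Sparse-code `activation` over `concept_dict` (hidden_dim x num_atoms),
    zero out the coefficients of undesirable atoms, and reconstruct."""
    coder = Lasso(alpha=alpha, fit_intercept=False, max_iter=5000)
    coder.fit(concept_dict, activation)          # activation ≈ concept_dict @ coef
    coef = coder.coef_.copy()
    coef[list(undesirable_idx)] = 0.0            # drop undesirable components
    return concept_dict @ coef                   # edited activation

# toy usage with random data
rng = np.random.default_rng(0)
D = rng.normal(size=(64, 200))                   # 200 concept atoms in a 64-d space
x = rng.normal(size=64)
x_edited = remove_undesirable_concepts(x, D, undesirable_idx=[3, 17, 42])
```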
+
+ comment: Accepted in NeurIPS 2024. GitHub repository at + https://github.com/peterljq/Parsimonious-Concept-Engineering +
+
+
+
+
+ + ♻ ☆ R^3AG: First Workshop on Refined and Reliable Retrieval Augmented + Generation SIGIR + + +
+ Retrieval-augmented generation (RAG) has gained wide attention as the key +component to improve generative models with external knowledge augmentation +from information retrieval. It has shown great prominence in enhancing the +functionality and performance of large language model (LLM)-based applications. +However, with the comprehensive application of RAG, more and more problems and +limitations have been identified, thus urgently requiring further fundamental +exploration to improve current RAG frameworks. This workshop aims to explore in +depth how to conduct refined and reliable RAG for downstream AI tasks. + To this end, we propose to organize the first R3AG workshop at SIGIR-AP 2024 +to call for participants to re-examine and formulate the basic principles and +practical implementation of refined and reliable RAG. The workshop serves as a +platform for both academia and industry researchers to conduct discussions, +share insights, and foster research to build the next generation of RAG +systems. Participants will engage in discussions and presentations focusing on +fundamental challenges, cutting-edge research, and potential pathways to +improve RAG. At the end of the workshop, we aim to have a clearer understanding +of how to improve the reliability and applicability of RAG with more robust +information retrieval and language generation. + +
+
+ comment: R^3AG workshop overview at SIGIR-AP 2024 +
+
+
+
+
+ + ♻ ☆ Facilitating Interdisciplinary Knowledge Transfer with Research Paper + Recommender Systems + + +
+ In the extensive recommender systems literature, novelty and diversity have +been identified as key properties of useful recommendations. However, these +properties have received limited attention in the specific sub-field of +research paper recommender systems. In this work, we argue for the importance +of offering novel and diverse research paper recommendations to scientists. +This approach aims to reduce siloed reading, break down filter bubbles, and +promote interdisciplinary research. We propose a novel framework for evaluating +the novelty and diversity of research paper recommendations that leverages +methods from network analysis and natural language processing. Using this +framework, we show that the choice of representational method within a larger +research paper recommendation system can have a measurable impact on the nature +of downstream recommendations, specifically on their novelty and diversity. We +highlight a novel paper embedding method, which we demonstrate offers more +innovative and diverse recommendations without sacrificing precision, compared +to other state-of-the-art baselines. + +
+
+ comment: Under Review at QSS +
+
+
+
+
+ + ♻ ☆ DSFNet: Learning Disentangled Scenario Factorization for Multi-Scenario + Route Ranking + + +
+ Multi-scenario route ranking (MSRR) is crucial in many industrial mapping +systems. However, the industrial community mainly adopts interactive interfaces +to encourage users to select pre-defined scenarios, which may hinder the +downstream ranking performance. In addition, in the academic community, existing +multi-scenario ranking works come only from other fields, and no work focuses +specifically on route data due to the lack of a publicly available +MSRR dataset. Moreover, all the existing multi-scenario works still fail to +address the three specific challenges of MSRR simultaneously, i.e., the explosion of +the scenario number, high entanglement, and high-capacity demand. Unlike prior +work, our key idea for MSRR is to factorize the complicated +scenarios in route ranking into several disentangled factor scenario patterns. +Accordingly, we propose a novel method, Disentangled Scenario Factorization +Network (DSFNet), which flexibly composes scenario-dependent parameters based +on a high-capacity multi-factor-scenario-branch structure. Then, a novel +regularization is proposed to induce the disentanglement of factor scenarios. +Furthermore, two extra novel techniques, i.e., scenario-aware batch +normalization and scenario-aware feature filtering, are developed to improve +the network awareness of scenario representation. Additionally, to facilitate +MSRR research in the academic community, we propose MSDR, the first large-scale +publicly available annotated industrial Multi-Scenario Driving Route dataset. +Comprehensive experimental results demonstrate the superiority of our DSFNet, +which has been successfully deployed in AMap to serve major online traffic. + +
+
+
+
+
+ + ♻ ☆ Music Foundation Model as Generic Booster for Music Downstream Tasks + + +
+ We demonstrate the efficacy of using intermediate representations from a +single foundation model to enhance various music downstream tasks. We introduce +SoniDo, a music foundation model (MFM) designed to extract hierarchical +features from target music samples. By leveraging hierarchical intermediate +features, SoniDo constrains the information granularity, leading to improved +performance across various downstream tasks including both understanding and +generative tasks. We specifically evaluated this approach on representative +tasks such as music tagging, music transcription, music source separation, and +music mixing. Our results reveal that the features extracted from foundation +models provide valuable enhancements in training downstream task models. This +highlights the capability of using features extracted from music foundation +models as a booster for downstream tasks. Our approach not only benefits +existing task-specific models but also supports music downstream tasks +constrained by data scarcity. This paves the way for more effective and +accessible music processing solutions. + +
+
+ comment: 41 pages with 14 figures +
+
+
+
+
+ + ♻ ☆ Decoding Matters: Addressing Amplification Bias and Homogeneity Issue + for LLM-based Recommendation EMNLP 2024 + + +
+ Adapting Large Language Models (LLMs) for recommendation requires careful +consideration of the decoding process, given the inherent differences between +generating items and natural language. Existing approaches often directly apply +LLMs' original decoding methods. However, we find these methods encounter +significant challenges: 1) amplification bias -- where standard length +normalization inflates scores for items containing tokens with generation +probabilities close to 1 (termed ghost tokens), and 2) homogeneity issue -- +generating multiple similar or repetitive items for a user. To tackle these +challenges, we introduce a new decoding approach named Debiasing-Diversifying +Decoding (D3). D3 disables length normalization for ghost tokens to alleviate +amplification bias, and it incorporates a text-free assistant model to +encourage tokens less frequently generated by LLMs for counteracting +recommendation homogeneity. Extensive experiments on real-world datasets +demonstrate the method's effectiveness in enhancing accuracy and diversity. The +code is available at https://github.com/SAI990323/DecodingMatters. + +
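+ The ghost-token treatment can be sketched as a scoring rule that excludes near-certain tokens from the length used for normalization; the probability threshold below is an assumed hyperparameter, and the real D3 decoder differs in detail.
```python
import math

def ghost_aware_score(token_logprobs, ghost_threshold=0.99):
    """Length-normalized sequence score where tokens whose generation
    probability is close to 1 ("ghost tokens") do not count toward the
    normalizing length, so they no longer inflate the per-token average."""
    effective = [lp for lp in token_logprobs if math.exp(lp) < ghost_threshold]
    if not effective:                      # sequence made entirely of ghost tokens
        return sum(token_logprobs)
    return sum(token_logprobs) / len(effective)

# toy usage: two candidate items with different ghost-token counts
candidate_a = [-0.001, -0.002, -1.2, -0.9]     # many near-certain tokens
candidate_b = [-0.4, -0.5, -0.6]
scores = [ghost_aware_score(c) for c in (candidate_a, candidate_b)]
```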
+
+ comment: Accepted at EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Green Recommender Systems: Optimizing Dataset Size for Energy-Efficient + Algorithm Performance + + +
+ As recommender systems become increasingly prevalent, the environmental +impact and energy efficiency of training large-scale models have come under +scrutiny. This paper investigates the potential for energy-efficient algorithm +performance by optimizing dataset sizes through downsampling techniques in the +context of Green Recommender Systems. We conducted experiments on the MovieLens +100K, 1M, 10M, and Amazon Toys and Games datasets, analyzing the performance of +various recommender algorithms under different portions of dataset size. Our +results indicate that while more training data generally leads to higher +algorithm performance, certain algorithms, such as FunkSVD and BiasedMF, +particularly with unbalanced and sparse datasets like Amazon Toys and Games, +maintain high-quality recommendations with up to a 50% reduction in training +data, achieving nDCG@10 scores within approximately 13% of full dataset +performance. These findings suggest that strategic dataset reduction can +decrease computational and environmental costs without substantially +compromising recommendation quality. This study advances sustainable and green +recommender systems by providing insights for reducing energy consumption while +maintaining effectiveness. + +
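+ The downsampling protocol above amounts to subsampling training interactions and re-measuring ranking quality; the snippet below sketches the two utility pieces (random subsampling and binary-relevance nDCG@10), with the recommender training itself omitted as an assumption.
```python
import numpy as np

def downsample_interactions(interactions, fraction, seed=0):
    """Keep a random `fraction` of the training interactions (rows)."""
    rng = np.random.default_rng(seed)
    n_keep = int(len(interactions) * fraction)
    idx = rng.choice(len(interactions), size=n_keep, replace=False)
    return [interactions[i] for i in idx]

def ndcg_at_10(ranked_item_ids, relevant_item_ids):
    """Binary-relevance nDCG@10 for a single user."""
    gains = [1.0 if item in relevant_item_ids else 0.0 for item in ranked_item_ids[:10]]
    dcg = sum(g / np.log2(i + 2) for i, g in enumerate(gains))
    ideal = sum(1.0 / np.log2(i + 2) for i in range(min(10, len(relevant_item_ids))))
    return dcg / ideal if ideal > 0 else 0.0
```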
+
+
+
+
+ + ♻ ☆ Pearl: Personalizing Large Language Model Writing Assistants with + Generation-Calibrated Retrievers EMNLP 2024 + + +
+ Powerful large language models have facilitated the development of writing +assistants that promise to significantly improve the quality and efficiency of +composition and communication. However, a barrier to effective assistance is +the lack of personalization in LLM outputs to the author's communication style, +specialized knowledge, and values. In this paper, we address this challenge by +proposing Pearl, an LLM writing assistant personalized with a retriever that is +trained to be generation-calibrated for personalization. Generation calibration +ensures that our retriever selects historical user-authored documents to augment +an LLM prompt such that they are likely to help an LLM generation better adhere +to a user's preferences. We propose two key novelties for training such a +retriever: (1) A training data selection method that identifies user requests +likely to benefit from personalization and documents that provide that benefit; +and (2) A scale-calibrating KL-divergence objective that ensures that our +retriever scores remain proportional to the downstream generation quality from +using the document for personalized generation. In a series of holistic +evaluations, we demonstrate the effectiveness of Pearl in generating long-form +texts on multiple social media datasets. Finally, we demonstrate how a +generation-calibrated retriever can double as a performance predictor -- +detecting low-quality retrieval, and improving potentially under-performing +outputs via revision with LLMs. + +
+
+ comment: Accepted to Workshop on Customizable NLP at EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Harnessing Multimodal Large Language Models for Multimodal Sequential + Recommendation + + +
+ Recent advances in Large Language Models (LLMs) have demonstrated significant +potential in the field of Recommendation Systems (RSs). Most existing studies +have focused on converting user behavior logs into textual prompts and +leveraging techniques such as prompt tuning to enable LLMs for recommendation +tasks. Meanwhile, research interest has recently grown in multimodal +recommendation systems that integrate data from images, text, and other sources +using modality fusion techniques. This introduces new challenges to the +existing LLM-based recommendation paradigm, which relies solely on text-modality +information. Moreover, although Multimodal Large Language Models (MLLMs) +capable of processing multi-modal inputs have emerged, how to equip MLLMs with +multi-modal recommendation capabilities remains largely unexplored. To this +end, in this paper, we propose the Multimodal Large Language Model-enhanced +Multimodal Sequential Recommendation (MLLM-MSR) model. To capture dynamic +user preferences, we design a two-stage user preference summarization method. +Specifically, we first utilize an MLLM-based item-summarizer to extract image +features for an item and convert the image into text. Then, we employ a +recurrent user preference summarization generation paradigm to capture the +dynamic changes in user preferences based on an LLM-based user-summarizer. +Finally, to enable the MLLM to perform the multi-modal recommendation task, we propose to +fine-tune an MLLM-based recommender using Supervised Fine-Tuning (SFT) +techniques. Extensive evaluations across various datasets validate the +effectiveness of MLLM-MSR, showcasing its superior ability to capture and adapt +to the evolving dynamics of user preferences. + +
+
+
+
+
+ + ♻ ☆ Differentially Private Graph Diffusion with Applications in Personalized + PageRanks NeurIPS 2024 + + +
+ Graph diffusion, which iteratively propagates real-valued substances across +the graph, is used in numerous graph/network-involved applications. However, +releasing diffusion vectors may reveal sensitive linking information in the +data such as transaction information in financial network data. Moreover, +protecting the privacy of graph data is challenging due to its interconnected +nature. This work proposes a novel graph diffusion framework with edge-level +differential privacy guarantees by using noisy diffusion iterates. The +algorithm injects Laplace noise per diffusion iteration and adopts a +degree-based thresholding function to mitigate the high sensitivity induced by +low-degree nodes. Our privacy loss analysis is based on Privacy Amplification +by Iteration (PABI), which, to the best of our knowledge, is the first effort that +analyzes PABI with Laplace noise and provides relevant applications. We also +introduce a novel Infinity-Wasserstein distance tracking method, which tightens +the analysis of privacy leakage and makes PABI more applicable in practice. We +evaluate this framework by applying it to Personalized PageRank computation for +ranking tasks. Experiments on real-world network data demonstrate the +superiority of our method under stringent privacy conditions. + +
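+ The core recipe, noisy iterates plus degree-based thresholding, can be sketched as below for a personalized-PageRank-style diffusion; the capping rule, noise scale, and iteration count are illustrative choices, and the sketch does not reproduce the paper's privacy accounting.
```python
import numpy as np

def private_diffusion(adj, seed_node, alpha=0.15, iters=20,
                      noise_scale=0.01, degree_threshold=5):
    """Noisy personalized-PageRank-style diffusion: each iteration adds Laplace
    noise to the iterate, and transition probabilities are capped (a simple
    degree-based thresholding) so that low-degree nodes cannot make a single
    edge dominate the update."""
    n = adj.shape[0]
    deg = np.maximum(adj.sum(axis=1), 1.0)
    P = adj / deg[:, None]                          # row-stochastic transition matrix
    P = np.minimum(P, 1.0 / degree_threshold)       # degree-based thresholding
    e = np.zeros(n)
    e[seed_node] = 1.0
    x = e.copy()
    rng = np.random.default_rng(0)
    for _ in range(iters):
        x = (1.0 - alpha) * (P.T @ x) + alpha * e           # diffusion step
        x = x + rng.laplace(scale=noise_scale, size=n)      # per-iteration Laplace noise
        x = np.clip(x, 0.0, None)
    return x / max(x.sum(), 1e-12)

# toy usage on a small random undirected graph
rng = np.random.default_rng(1)
A = (rng.random((30, 30)) < 0.1).astype(float)
A = np.triu(A, 1); A = A + A.T
scores = private_diffusion(A, seed_node=0)
```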
+
+ comment: Appears in NeurIPS 2024. In this version, we provide a more rigorous + analysis of graph distortion by establishing a tight bound and update the + corresponding experimental results, which improve on the previous + version +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ pTSE-T: Presentation Target Speaker Extraction using Unaligned Text Cues + + +
+ Target speaker extraction (TSE) aims to extract the clean speech of the target speaker in an audio +mixture, thus eliminating irrelevant background noise and speech. While prior +work has explored various auxiliary cues including pre-recorded speech, visual +information (e.g., lip motions and gestures), and spatial information, the +acquisition and selection of such strong cues are infeasible in many practical +scenarios. Unlike all existing work, in this paper, we condition the TSE +algorithm on semantic cues extracted from limited and unaligned text content, +such as condensed points from a presentation slide. This method is particularly +useful in scenarios like meetings, poster sessions, or lecture presentations, +where acquiring other cues in real-time is challenging. To this end, we design +two different networks. Specifically, our proposed TPE fuses audio features +with content-based semantic cues to facilitate time-frequency mask generation +to filter out extraneous noise, while another proposal, namely TSR, employs the +contrastive learning technique to associate blindly separated speech signals +with semantic cues. The experimental results demonstrate the efficacy of our methods in accurately +identifying the target speaker using semantic cues derived from limited +and unaligned text, achieving an SI-SDRi of 12.16 dB, SDRi of 12.66 dB, PESQi +of 0.830, and STOIi of 0.150. Dataset and source code will be +publicly available. Project demo page: https://slideTSE.github.io/. + +
+
+
+
+
+ + ☆ Speech Separation with Pretrained Frontend to Minimize Domain Mismatch + + +
+ Speech separation seeks to separate individual speech signals from a speech +mixture. Typically, most separation models are trained on synthetic data due to +the unavailability of target reference in real-world cocktail party scenarios. +As a result, there exists a domain gap between real and synthetic data when +deploying speech separation models in real-world applications. In this paper, +we propose a self-supervised domain-invariant pretrained (DIP) frontend that is +exposed to mixture data without the need for target reference speech. The DIP +frontend utilizes a Siamese network with two innovative pretext tasks, mixture +predictive coding (MPC) and mixture invariant coding (MIC), to capture shared +contextual cues between real and synthetic unlabeled mixtures. Subsequently, we +freeze the DIP frontend as a feature extractor when training the downstream +speech separation models on synthetic data. By pretraining the DIP frontend +with the contextual cues, we expect that the speech separation skills learned +from synthetic data can be effectively transferred to real data. To benefit +from the DIP frontend, we introduce a novel separation pipeline to align the +feature resolution of the separation models. We evaluate the speech separation +quality on standard benchmarks and real-world datasets. The results confirm the +superiority of our DIP frontend over existing speech separation models. This +study underscores the potential of large-scale pretraining to enhance the +quality and intelligibility of speech separation in real-world applications. + +
+
+ comment: IEEE/ACM Transactions on Audio, Speech, and Language Processing +
+
+
+
+
+ + ☆ HumanVLM: Foundation for Human-Scene Vision-Language Model + + +
+ Human-scene vision-language tasks are increasingly prevalent in diverse +social applications, yet recent advancements predominantly rely on models +specifically tailored to individual tasks. Emerging research indicates that +large vision-language models (VLMs) can enhance performance across various +downstream vision-language understanding tasks. However, general-domain models +often underperform in specialized fields. This study introduces a +domain-specific Large Vision-Language Model, Human-Scene Vision-Language Model +(HumanVLM), designed to provide a foundation for human-scene Vision-Language +tasks. Specifically, (1) we create a large-scale human-scene multimodal +image-text dataset (HumanCaption-10M) sourced from the Internet to facilitate +domain-specific alignment; (2) develop a captioning approach for human-centered +images, capturing human faces, bodies, and backgrounds, and construct a +high-quality Human-Scene image-text dataset (HumanCaptionHQ, about 311k pairs) +that contains as much detailed information as possible about humans; (3) using +HumanCaption-10M and HumanCaptionHQ, we train HumanVLM. In the experiments, +we then evaluate our HumanVLM across various downstream tasks, where it +demonstrates superior overall performance among multimodal models of comparable +scale, particularly excelling in human-related tasks and significantly +outperforming similar models, including Qwen2VL and ChatGPT-4o. HumanVLM, +alongside the data introduced, will stimulate research in human-centered +fields. + +
+
+ comment: 34 pages,11 figures +
+
+
+
+
+ + ☆ Learning-based Lossless Event Data Compression + + +
+ Emerging event cameras acquire visual information by detecting time domain +brightness changes asynchronously at the pixel level and, unlike conventional +cameras, are able to provide high temporal resolution, very high dynamic range, +low latency, and low power consumption. Considering the huge amount of data +involved, efficient compression solutions are very much needed. In this +context, this paper presents a novel deep-learning-based lossless event data +compression scheme based on octree partitioning and a learned hyperprior model. +The proposed method arranges the event stream as a 3D volume and employs an +octree structure for adaptive partitioning. A deep neural network-based entropy +model, using a hyperprior, is then applied. Experimental results demonstrate +that the proposed method outperforms traditional lossless data compression +techniques in terms of compression ratio and bits per event. + +
+
+
+
+
+ + ☆ Continual Audio-Visual Sound Separation NeurIPS 2024 + + +
+ In this paper, we introduce a novel continual audio-visual sound separation +task, aiming to continuously separate sound sources for new classes while +preserving performance on previously learned classes, with the aid of visual +guidance. This problem is crucial for practical visually guided auditory +perception as it can significantly enhance the adaptability and robustness of +audio-visual sound separation models, making them more applicable for +real-world scenarios where encountering new sound sources is commonplace. The +task is inherently challenging as our models must not only effectively utilize +information from both modalities in current tasks but also preserve their +cross-modal association in old tasks to mitigate catastrophic forgetting during +audio-visual continual learning. To address these challenges, we propose a +novel approach named ContAV-Sep (\textbf{Cont}inual +\textbf{A}udio-\textbf{V}isual Sound \textbf{Sep}aration). ContAV-Sep presents +a novel Cross-modal Similarity Distillation Constraint (CrossSDC) to uphold the +cross-modal semantic similarity through incremental tasks and retain previously +acquired knowledge of semantic similarity in old models, mitigating the risk of +catastrophic forgetting. The CrossSDC can seamlessly integrate into the +training process of different audio-visual sound separation frameworks. +Experiments demonstrate that ContAV-Sep can effectively mitigate catastrophic +forgetting and achieve significantly better performance compared to other +continual learning baselines for audio-visual sound separation. Code is +available at: \url{https://github.com/weiguoPian/ContAV-Sep_NeurIPS2024}. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Learning to Unify Audio, Visual and Text for Audio-Enhanced Multilingual + Visual Answer Localization + + +
+ The goal of Multilingual Visual Answer Localization (MVAL) is to locate a +video segment that answers a given multilingual question. Existing methods +either focus solely on visual modality or integrate visual and subtitle +modalities. However, these methods neglect the audio modality in videos, +consequently leading to incomplete input information and poor performance in +the MVAL task. In this paper, we propose a unified Audio-Visual-Textual Span +Localization (AVTSL) method that incorporates audio modality to augment both +visual and textual representations for the MVAL task. Specifically, we +integrate features from three modalities and develop three predictors, each +tailored to the unique contributions of the fused modalities: an audio-visual +predictor, a visual predictor, and a textual predictor. Each predictor +generates predictions based on its respective modality. To maintain consistency +across the predicted results, we introduce an Audio-Visual-Textual Consistency +module. This module utilizes a Dynamic Triangular Loss (DTL) function, allowing +each modality's predictor to dynamically learn from the others. This +collaborative learning ensures that the model generates consistent and +comprehensive answers. Extensive experiments show that our proposed method +outperforms several state-of-the-art (SOTA) methods, which demonstrates the +effectiveness of the audio modality. + +
+
+
+
+
+ + ♻ ☆ FASTER: A Font-Agnostic Scene Text Editing and Rendering Framework WACV 2025 + + +
+ Scene Text Editing (STE) is a challenging research problem, that primarily +aims towards modifying existing texts in an image while preserving the +background and the font style of the original text. Despite its utility in +numerous real-world applications, existing style-transfer-based approaches have +shown sub-par editing performance due to (1) complex image backgrounds, (2) +diverse font attributes, and (3) varying word lengths within the text. To +address such limitations, in this paper, we propose a novel font-agnostic scene +text editing and rendering framework, named FASTER, for simultaneously +generating text in arbitrary styles and locations while preserving a natural +and realistic appearance and structure. A combined fusion of target mask +generation and style transfer units, with a cascaded self-attention mechanism +has been proposed to focus on multi-level text region edits to handle varying +word lengths. Extensive evaluation on a real-world database with further +subjective human evaluation study indicates the superiority of FASTER in both +scene text editing and rendering tasks, in terms of model performance and +efficiency. Our code will be released upon acceptance. + +
+
+ comment: Accepted in WACV 2025 +
+
+
+
+
+ + ♻ ☆ POINTS: Improving Your Vision-language Model with Affordable Strategies + + +
+ In recent years, vision-language models have made significant strides, +excelling in tasks like optical character recognition and geometric +problem-solving. However, several critical issues remain: 1) Proprietary models +often lack transparency about their architectures, while open-source models +need more detailed ablations of their training strategies. 2) Pre-training data +in open-source works is under-explored, with datasets added empirically, making +the process cumbersome. 3) Fine-tuning often focuses on adding datasets, +leading to diminishing returns. To address these issues, we propose the +following contributions: 1) We trained a robust baseline model using the latest +advancements in vision-language models, introducing effective improvements and +conducting comprehensive ablation and validation for each technique. 2) +Inspired by recent work on large language models, we filtered pre-training data +using perplexity, selecting the lowest perplexity data for training. This +approach allowed us to train on a curated 1M dataset, achieving competitive +performance. 3) During visual instruction tuning, we used model soup on +different datasets when adding more datasets yielded marginal improvements. +These innovations resulted in a 9B parameter model that performs competitively +with state-of-the-art models. Our strategies are efficient and lightweight, +making them easily adoptable by the community. + +
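+ The perplexity filtering step can be approximated by scoring each text sample with a small reference language model and keeping the lowest-perplexity fraction; the use of GPT-2 and the 25% keep ratio below are assumptions for illustration, not the paper's actual scoring model.
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Small reference LM used only to score candidate pre-training texts.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

@torch.no_grad()
def perplexity(text: str) -> float:
    ids = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).input_ids
    loss = model(ids, labels=ids).loss       # mean token cross-entropy
    return float(torch.exp(loss))

def filter_by_perplexity(samples, keep_fraction=0.25):
    """Keep the lowest-perplexity fraction of the candidate samples."""
    scored = sorted(samples, key=perplexity)
    return scored[: max(1, int(len(scored) * keep_fraction))]
```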
+
+ comment: v2 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 7 + +
+
+
+ + ☆ Towards Context-Aware Adaptation in Extended Reality: A Design Space for + XR Interfaces and an Adaptive Placement Strategy + + +
+ By converting the entire 3D space around the user into a screen, Extended +Reality (XR) can ameliorate traditional displays' space limitations and +facilitate the consumption of multiple pieces of information at a time. +However, if designed inappropriately, these XR interfaces can overwhelm the +user and complicate information access. In this work, we explored the design +dimensions that can be adapted to enable suitable presentation and interaction +within an XR interface. To investigate a specific use case of context-aware +adaptations within our proposed design space, we concentrated on the spatial +layout of the XR content and investigated non-adaptive and adaptive placement +strategies. In this paper, we (1) present a comprehensive design space for XR +interfaces, (2) propose Environment-referenced, an adaptive placement strategy +that uses a relevant intermediary from the environment within a Hybrid Frame of +Reference (FoR) for each XR object, and (3) evaluate the effectiveness of this +adaptive placement strategy and a non-adaptive Body-Fixed placement strategy in +four contextual scenarios varying in terms of social setting and user mobility +in the environment. The performance of these placement strategies from our +within-subjects user study emphasized the importance of intermediaries' +relevance to the user's focus. These findings underscore the importance of +context-aware interfaces, indicating that the appropriate use of an adaptive +content placement strategy in a context can significantly improve task +efficiency, accuracy, and usability. + +
+
+
+
+
+ + ☆ MM-Embed: Universal Multimodal Retrieval with Multimodal LLMs + + +
+ State-of-the-art retrieval models typically address a straightforward search +scenario, where retrieval tasks are fixed (e.g., finding a passage to answer a +specific question) and only a single modality is supported for both queries and +retrieved results. This paper introduces techniques for advancing information +retrieval with multimodal large language models (MLLMs), enabling a broader +search scenario, termed universal multimodal retrieval, where multiple +modalities and diverse retrieval tasks are accommodated. To this end, we first +study fine-tuning an MLLM as a bi-encoder retriever on 10 datasets with 16 +retrieval tasks. Our empirical results show that the fine-tuned MLLM retriever +is capable of understanding challenging queries, composed of both text and +image, but underperforms a smaller CLIP retriever in cross-modal retrieval +tasks due to modality bias from MLLMs. To address the issue, we propose +modality-aware hard negative mining to mitigate the modality bias exhibited by +MLLM retrievers. Second, we propose to continually fine-tune the universal +multimodal retriever to enhance its text retrieval capability while maintaining +multimodal retrieval capability. As a result, our model, MM-Embed, achieves +state-of-the-art performance on the multimodal retrieval benchmark M-BEIR, +which spans multiple domains and tasks, while also surpassing the +state-of-the-art text retrieval model, NV-Embed-v1, on the MTEB retrieval +benchmark. Finally, we explore prompting off-the-shelf MLLMs as +zero-shot rerankers to refine the ranking of the candidates from the multimodal +retriever. We find that through prompt-and-reranking, MLLMs can further improve +multimodal retrieval when the user queries (e.g., text-image composed queries) +are more complex and challenging to understand. These findings also pave the +way to advance universal multimodal retrieval in the future. + +
+
+ comment: We release the model weights at: + https://huggingface.co/nvidia/MM-Embed +
+
+
+
+
+ + ☆ Training on the Test Model: Contamination in Ranking Distillation + + +
+ Neural approaches to ranking based on pre-trained language models are highly +effective in ad-hoc search. However, the computational expense of these models +can limit their application. As such, a process known as knowledge distillation +is frequently applied to allow a smaller, efficient model to learn from an +effective but expensive model. A key example of this is the distillation of +expensive API-based commercial Large Language Models into smaller +production-ready models. However, due to the opacity of training data and +processes of most commercial models, one cannot ensure that a chosen test +collection has not been observed previously, creating the potential for +inadvertent data contamination. We, therefore, investigate the effect of a +contaminated teacher model in a distillation setting. We evaluate several +distillation techniques to assess the degree to which contamination occurs +during distillation. By simulating a ``worst-case'' setting where the degree of +contamination is known, we find that contamination occurs even when the test +data represents a small fraction of the teacher's training samples. We, +therefore, encourage caution when training using black-box teacher models where +data provenance is ambiguous. + +
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ Enhancing ID-based Recommendation with Large Language Models + + +
+ Large Language Models (LLMs) have recently garnered significant attention in +various domains, including recommendation systems. Recent research leverages +the capabilities of LLMs to improve the performance and user modeling aspects +of recommender systems. These studies primarily focus on utilizing LLMs to +interpret textual data in recommendation tasks. However, it's worth noting that +in ID-based recommendations, textual data is absent, and only ID data is +available. The untapped potential of LLMs for ID data within the ID-based +recommendation paradigm remains relatively unexplored. To this end, we +introduce a pioneering approach called "LLM for ID-based Recommendation" +(LLM4IDRec). This innovative approach integrates the capabilities of LLMs while +exclusively relying on ID data, thus diverging from the previous reliance on +textual data. The basic idea of LLM4IDRec is to employ an LLM to augment +ID data: if the augmented ID data improve recommendation performance, this +demonstrates that the LLM can interpret ID data effectively, opening an +innovative path for integrating LLMs into ID-based recommendation. We +evaluate the effectiveness of our LLM4IDRec approach using three widely-used +datasets. Our results demonstrate a notable improvement in recommendation +performance, with our approach consistently outperforming existing methods in +ID-based recommendation by solely augmenting input data. + +
+
+
+
+
+ + ☆ Dissertation: On the Theoretical Foundation of Model Comparison and + Evaluation for Recommender System + + +
+ Recommender systems have become increasingly important with the rise of the +web as a medium for electronic and business transactions. One of the key +drivers of this technology is the ease with which users can provide feedback +about their likes and dislikes through simple clicks of a mouse. This feedback +is commonly collected in the form of ratings, but can also be inferred from a +user's browsing and purchasing history. Recommender systems utilize users' +historical data to infer customer interests and provide personalized +recommendations. The basic principle of recommendations is that significant +dependencies exist between user- and item-centric activity, which can be +learned in a data-driven manner to make accurate predictions. Collaborative +filtering is one family of recommendation algorithms that uses ratings from +multiple users to predict missing ratings or uses binary click information to +predict potential clicks. However, recommender systems can be more complex and +incorporate auxiliary data such as content-based attributes, user interactions, +and contextual information. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2312.08517 +
+
+
+
+
+ + ☆ Transferable Sequential Recommendation via Vector Quantized Meta + Learning + + +
+ While sequential recommendation achieves significant progress on capturing +user-item transition patterns, transferring such large-scale recommender +systems remains challenging due to the disjoint user and item groups across +domains. In this paper, we propose a vector quantized meta learning for +transferable sequential recommenders (MetaRec). Without requiring additional +modalities or shared information across domains, our approach leverages +user-item interactions from multiple source domains to improve the target +domain performance. To solve the input heterogeneity issue, we adopt vector +quantization that maps item embeddings from heterogeneous input spaces to a +shared feature space. Moreover, our meta transfer paradigm exploits limited +target data to guide the transfer of source domain knowledge to the target +domain (i.e., learn to transfer). In addition, MetaRec adaptively transfers +from multiple source tasks by rescaling meta gradients based on the +source-target domain similarity, enabling selective learning to improve +recommendation performance. To validate the effectiveness of our approach, we +perform extensive experiments on benchmark datasets, where MetaRec consistently +outperforms baseline methods by a considerable margin. + +
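+ The vector quantization step, mapping heterogeneous item embeddings onto a shared codebook, can be sketched as a nearest-neighbor lookup with a straight-through estimator; the codebook size and its update rule are omitted here and are not MetaRec's exact design.
```python
import torch

def quantize_to_codebook(item_embs: torch.Tensor, codebook: torch.Tensor):
    """Map item embeddings from a domain-specific space to the nearest entries
    of a shared codebook, so heterogeneous domains feed a common input space.
    item_embs: (num_items, dim), codebook: (num_codes, dim)."""
    dists = torch.cdist(item_embs, codebook)      # (num_items, num_codes)
    codes = dists.argmin(dim=1)                   # nearest codebook index per item
    quantized = codebook[codes]                   # shared-space representation
    # straight-through estimator so gradients still reach the original embeddings
    quantized = item_embs + (quantized - item_embs).detach()
    return quantized, codes

# toy usage
items = torch.randn(100, 32, requires_grad=True)
codebook = torch.randn(256, 32)
shared_repr, assignments = quantize_to_codebook(items, codebook)
```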
+
+ comment: Accepted to BigData 2024 +
+
+
+
+
+ + ♻ ☆ Self-Retrieval: End-to-End Information Retrieval with One Large Language + Model NeurIPS 2024 + + +
+ The rise of large language models (LLMs) has significantly transformed both +the construction and application of information retrieval (IR) systems. +However, current interactions between IR systems and LLMs remain limited, with +LLMs merely serving as part of components within IR systems, and IR systems +being constructed independently of LLMs. This separated architecture restricts +knowledge sharing and deep collaboration between them. In this paper, we +introduce Self-Retrieval, a novel end-to-end LLM-driven information retrieval +architecture. Self-Retrieval unifies all essential IR functions within a single +LLM, leveraging the inherent capabilities of LLMs throughout the IR process. +Specifically, Self-Retrieval internalizes the retrieval corpus through +self-supervised learning, transforms the retrieval process into sequential +passage generation, and performs relevance assessment for reranking. +Experimental results demonstrate that Self-Retrieval not only outperforms +existing retrieval approaches by a significant margin, but also substantially +enhances the performance of LLM-driven downstream applications like +retrieval-augmented generation. + +
+
+ comment: NeurIPS 2024 Camera-ready Version. Code: + https://github.com/icip-cas/SelfRetrieval +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Towards Context-Aware Adaptation in Extended Reality: A Design Space for + XR Interfaces and an Adaptive Placement Strategy + + +
+ By converting the entire 3D space around the user into a screen, Extended +Reality (XR) can ameliorate traditional displays' space limitations and +facilitate the consumption of multiple pieces of information at a time. +However, if designed inappropriately, these XR interfaces can overwhelm the +user and complicate information access. In this work, we explored the design +dimensions that can be adapted to enable suitable presentation and interaction +within an XR interface. To investigate a specific use case of context-aware +adaptations within our proposed design space, we concentrated on the spatial +layout of the XR content and investigated non-adaptive and adaptive placement +strategies. In this paper, we (1) present a comprehensive design space for XR +interfaces, (2) propose Environment-referenced, an adaptive placement strategy +that uses a relevant intermediary from the environment within a Hybrid Frame of +Reference (FoR) for each XR object, and (3) evaluate the effectiveness of this +adaptive placement strategy and a non-adaptive Body-Fixed placement strategy in +four contextual scenarios varying in terms of social setting and user mobility +in the environment. The performance of these placement strategies from our +within-subjects user study emphasized the importance of intermediaries' +relevance to the user's focus. These findings underscore the importance of +context-aware interfaces, indicating that the appropriate use of an adaptive +content placement strategy in a context can significantly improve task +efficiency, accuracy, and usability. + +
+
+
+
+
+ + ☆ Diffusion-based Generative Multicasting with Intent-aware Semantic + Decomposition + + +
+ Generative diffusion models (GDMs) have recently shown great success in +synthesizing multimedia signals with high perceptual quality enabling highly +efficient semantic communications in future wireless networks. In this paper, +we develop an intent-aware generative semantic multicasting framework utilizing +pre-trained diffusion models. In the proposed framework, the transmitter +decomposes the source signal to multiple semantic classes based on the +multi-user intent, i.e. each user is assumed to be interested in details of +only a subset of the semantic classes. The transmitter then sends to each user +only its intended classes, and multicasts a highly compressed semantic map to +all users over shared wireless resources that allows them to locally synthesize +the other classes, i.e. non-intended classes, utilizing pre-trained diffusion +models. The signal retrieved at each user is thereby partially reconstructed +and partially synthesized utilizing the received semantic map. This improves +utilization of the wireless resources, with better preserving privacy of the +non-intended classes. We design a communication/computation-aware scheme for +per-class adaptation of the communication parameters, such as the transmission +power and compression rate to minimize the total latency of retrieving signals +at multiple receivers, tailored to the prevailing channel conditions as well as +the users reconstruction/synthesis distortion/perception requirements. The +simulation results demonstrate significantly reduced per-user latency compared +with non-generative and intent-unaware multicasting benchmarks while +maintaining high perceptual quality of the signals retrieved at the users. + +
+
+
+
+
+ + ☆ 3D Audio-Visual Segmentation NeurIPS 2024 + + +
+ Recognizing the sounding objects in scenes is a longstanding objective in +embodied AI, with diverse applications in robotics and AR/VR/MR. To that end, +Audio-Visual Segmentation (AVS), taking as condition an audio signal to +identify the masks of the target sounding objects in an input image with +synchronous camera and microphone sensors, has been recently advanced. However, +this paradigm is still insufficient for real-world operation, as the mapping +from 2D images to 3D scenes is missing. To address this fundamental limitation, +we introduce a novel research problem, 3D Audio-Visual Segmentation, extending +the existing AVS to the 3D output space. This problem poses more challenges due +to variations in camera extrinsics, audio scattering, occlusions, and diverse +acoustics across sounding object categories. To facilitate this research, we +create the very first simulation based benchmark, 3DAVS-S34-O7, providing +photorealistic 3D scene environments with grounded spatial audio under +single-instance and multi-instance settings, across 34 scenes and 7 object +categories. This is made possible by re-purposing the Habitat simulator to +generate comprehensive annotations of sounding object locations and +corresponding 3D masks. Subsequently, we propose a new approach, EchoSegnet, +characterized by integrating the ready-to-use knowledge from pretrained 2D +audio-visual foundation models synergistically with 3D visual scene +representation through spatial audio-aware mask alignment and refinement. +Extensive experiments demonstrate that EchoSegnet can effectively segment +sounding objects in 3D space on our new benchmark, representing a significant +advancement in the field of embodied AI. Project page: +https://surrey-uplab.github.io/research/3d-audio-visual-segmentation/ + +
+
+ comment: Accepted at the NeurIPS 2024 Workshop on Audio Imagination +
+
+
+
+
+ + ☆ MoMu-Diffusion: On Learning Long-Term Motion-Music Synchronization and + Correspondence NeurIPS 2024 + + +
+ Motion-to-music and music-to-motion have been studied separately, each +attracting substantial research interest within their respective domains. The +interaction between human motion and music is a reflection of advanced human +intelligence, and establishing a unified relationship between them is +particularly important. However, to date, there has been no work that considers +them jointly to explore the modality alignment within. To bridge this gap, we +propose a novel framework, termed MoMu-Diffusion, for long-term and synchronous +motion-music generation. Firstly, to mitigate the huge computational costs +raised by long sequences, we propose a novel Bidirectional Contrastive Rhythmic +Variational Auto-Encoder (BiCoR-VAE) that extracts the modality-aligned latent +representations for both motion and music inputs. Subsequently, leveraging the +aligned latent spaces, we introduce a multi-modal Transformer-based diffusion +model and a cross-guidance sampling strategy to enable various generation +tasks, including cross-modal, multi-modal, and variable-length generation. +Extensive experiments demonstrate that MoMu-Diffusion surpasses recent +state-of-the-art methods both qualitatively and quantitatively, and can +synthesize realistic, diverse, long-term, and beat-matched music or motion +sequences. The generated samples and codes are available at +https://momu-diffusion.github.io/ + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ OneDiff: A Generalist Model for Image Difference Captioning + + +
+ In computer vision, Image Difference Captioning (IDC) is crucial for +accurately describing variations between closely related images. Traditional +IDC methods often rely on specialist models, which restrict their applicability +across varied contexts. This paper introduces the OneDiff model, a novel +generalist approach that utilizes a robust vision-language model architecture, +integrating a siamese image encoder with a Visual Delta Module. This innovative +configuration allows for the precise detection and articulation of fine-grained +differences between image pairs. OneDiff is trained through a dual-phase +strategy, encompassing Coupled Sample Training and multi-task learning across a +diverse array of data types, supported by our newly developed DiffCap Dataset. +This dataset merges real-world and synthetic data, enhancing the training +process and bolstering the model's robustness. Extensive testing on diverse IDC +benchmarks, such as Spot-the-Diff, Image-Editing-Request, and Birds-to-Words, +shows that OneDiff consistently outperforms existing state-of-the-art models in +accuracy and adaptability, achieving improvements of up to 97% CIDEr points on +average. By setting a new benchmark in IDC, OneDiff paves the way for more +versatile and effective applications in detecting and describing visual +differences. The code, models, and data will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ ReactFace: Online Multiple Appropriate Facial Reaction Generation in + Dyadic Interactions + + +
+ In dyadic interaction, predicting the listener's facial reactions is +challenging as different reactions could be appropriate in response to the same +speaker's behaviour. Previous approaches predominantly treated this task as an +interpolation or fitting problem, emphasizing deterministic outcomes but +ignoring the diversity and uncertainty of human facial reactions. Furthermore, +these methods often failed to model short-range and long-range dependencies +within the interaction context, leading to issues in the synchrony and +appropriateness of the generated facial reactions. To address these +limitations, this paper reformulates the task as an extrapolation or prediction +problem, and proposes a novel framework (called ReactFace) to generate +multiple different but appropriate facial reactions from a speaker's behaviour +rather than merely replicating the corresponding listener facial behaviours. +Our ReactFace generates multiple different but appropriate photo-realistic +human facial reactions by: (i) learning an appropriate facial reaction +distribution representing multiple different but appropriate facial reactions; +and (ii) synchronizing the generated facial reactions with the speaker's verbal +and non-verbal behaviours at each time stamp, resulting in realistic 2D facial +reaction sequences. Experimental results demonstrate the effectiveness of our +approach in generating multiple diverse, synchronized, and appropriate facial +reactions from each speaker's behaviour. The quality of the generated facial +reactions is intimately tied to the speaker's speech and facial expressions, +achieved through our novel speaker-listener interaction modules. Our code is +made publicly available at \url{https://github.com/lingjivoo/ReactFace}. + +
+
+ comment: Accepted to IEEE Transactions on Visualization and Computer Graphics + (TVCG), 18 pages, 10 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Co-clustering for Federated Recommender System WWW '24 + + +
+ As data privacy and security attract increasing attention, Federated +Recommender System (FRS) offers a solution that strikes a balance between +providing high-quality recommendations and preserving user privacy. However, +the presence of statistical heterogeneity in FRS, commonly observed due to +personalized decision-making patterns, can pose challenges. To address this +issue and maximize the benefit of collaborative filtering (CF) in FRS, it is +intuitive to consider clustering clients (users) as well as items into +different groups and learning group-specific models. Existing methods either +resort to client clustering via user representations, risking privacy leakage, +or employ classical clustering strategies on item embeddings or gradients, +which we found are plagued by the curse of dimensionality. In this paper, we +delve into the inefficiencies of the K-Means method in client grouping, +attributing its failures to the high dimensionality and data sparsity +that occur in FRS, and propose CoFedRec, a novel Co-clustering Federated +Recommendation mechanism, to address client heterogeneity and enhance +collaborative filtering within the federated framework. Specifically, the +server initially formulates an item membership from the client-provided item +networks. Subsequently, clients are grouped regarding a specific item category +picked from the item membership during each communication round, resulting in +an intelligently aggregated group model. Meanwhile, to comprehensively capture +the global inter-relationships among items, we incorporate an additional +supervised contrastive learning term based on the server-side generated item +membership into the local training phase for each client. Extensive experiments +on four datasets are provided, which verify the effectiveness of the proposed +CoFedRec. + +
+
+ comment: WWW '24: Proceedings of the ACM Web Conference 2024 +
+
+
+
+
+ + ☆ Graph-based Confidence Calibration for Large Language Models + + +
+ One important approach to improving the reliability of large language models +(LLMs) is to provide accurate confidence estimations regarding the correctness +of their answers. However, developing a well-calibrated confidence estimation +model is challenging, as mistakes made by LLMs can be difficult to detect. We +propose a novel method combining the LLM's self-consistency with labeled data +and training an auxiliary model to estimate the correctness of its responses to +questions. This auxiliary model predicts the correctness of responses based +solely on their consistent information. To set up the learning problem, we use +a weighted graph to represent the consistency among the LLM's multiple +responses to a question. Correctness labels are assigned to these responses +based on their similarity to the correct answer. We then train a graph neural +network to estimate the probability of correct responses. Experiments +demonstrate that the proposed approach substantially outperforms several of the +most recent methods in confidence calibration across multiple widely adopted +benchmark datasets. Furthermore, the proposed approach significantly improves +the generalization capability of confidence calibration on out-of-domain (OOD) +data. + +
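+ The consistency graph at the heart of this method can be sketched with a simple token-overlap similarity between sampled responses; the actual paper trains a graph neural network on such a graph, which is omitted here, and Jaccard similarity is only one possible edge-weighting choice.
```python
import numpy as np

def consistency_graph(responses):
    """Build a weighted consistency graph among sampled responses using
    token-overlap (Jaccard) similarity as edge weights, and return a simple
    per-response consistency score (mean agreement with the other responses)."""
    token_sets = [set(r.lower().split()) for r in responses]
    n = len(responses)
    W = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            union = token_sets[i] | token_sets[j]
            sim = len(token_sets[i] & token_sets[j]) / len(union) if union else 0.0
            W[i, j] = W[j, i] = sim
    scores = W.sum(axis=1) / max(n - 1, 1)
    return W, scores

# toy usage with three sampled answers to the same question
W, scores = consistency_graph([
    "the capital of france is paris",
    "paris is the capital of france",
    "the capital of france is lyon",
])
```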
+
+
+
+
+ + ☆ Stochastic Communication Avoidance for Recommendation Systems + + +
+ One of the major bottlenecks for efficient deployment of neural network based +recommendation systems is the memory footprint of their embedding tables. +Although many neural network based recommendation systems could benefit from +the faster on-chip memory access and increased computational power of hardware +accelerators, the large embedding tables in these models often cannot fit on +the constrained memory of accelerators. Despite the pervasiveness of these +models, prior methods in memory optimization and parallelism fail to address +the memory and communication costs of large embedding tables on accelerators. +As a result, the majority of models are trained on CPUs, while current +implementations of accelerators are hindered by issues such as bottlenecks in +inter-device communication and main memory lookups. In this paper, we propose a +theoretical framework that analyses the communication costs of arbitrary +distributed systems that use lookup tables. We use this framework to propose +algorithms that maximize throughput subject to memory, computation, and +communication constraints. Furthermore, we demonstrate that our method achieves +strong theoretical performance across dataset distributions and memory +constraints, applicable to a wide range of use cases from mobile federated +learning to warehouse-scale computation. We implement our framework and +algorithms in PyTorch and achieve up to 6x increases in training throughput on +GPU systems over baselines, on the Criteo Terabytes dataset. + +
+
+
+
+
+ + ☆ Multimodal Graph Neural Network for Recommendation with Dynamic + De-redundancy and Modality-Guided Feature De-noisy + + +
+ Graph neural networks (GNNs) have become crucial in multimodal recommendation
+tasks because of their powerful ability to capture complex relationships
+between neighboring nodes. However, increasing the number of propagation layers
+in GNNs can lead to feature redundancy, which may negatively impact the overall
+recommendation performance. In addition, existing recommendation methods
+directly map the preprocessed multimodal features to a low-dimensional space,
+which introduces noise unrelated to user preferences and thus weakens the
+representation ability of the model. To tackle the aforementioned challenges,
+we propose Multimodal Graph Neural Network for Recommendation (MGNM) with
+Dynamic De-redundancy and Modality-Guided Feature De-noisy, which is divided
+into local and global interaction processes. In the local interaction process,
+we integrate a dynamic de-redundancy (DDR) loss function, which uses the
+product of the feature coefficient matrix and the feature matrix as a
+penalization factor. It reduces the feature redundancy effects of multimodal
+and behavioral features caused by the stacking of multiple GNN layers.
+Subsequently, in the global interaction process, we develop modality-guided
+global feature purifiers for each modality to alleviate the impact of modality
+noise. This two-fold guiding mechanism eliminates modality features that are
+irrelevant to user preferences and captures complex relationships within each
+modality. Experimental results demonstrate that MGNM achieves superior
+performance on multimodal information denoising and removal of redundant
+information compared to state-of-the-art methods.
+
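+ The abstract only states that the DDR loss penalizes the product of a feature
+coefficient matrix and the feature matrix; the snippet below is one assumed
+reading of that idea, not the paper's code, with purely illustrative names and
+shapes.
+
+# Rough sketch of a dynamic de-redundancy style penalty (assumed form).
+import torch
+
+def ddr_penalty(coeff: torch.Tensor, features: torch.Tensor) -> torch.Tensor:
+    """coeff: (d, d) learnable coefficient matrix; features: (n, d) node features."""
+    return torch.norm(features @ coeff, p="fro") / features.shape[0]
+
+features = torch.randn(128, 64)            # e.g. fused multimodal/behavioural embeddings
+coeff = torch.nn.Parameter(torch.eye(64))  # learnable coefficient matrix
+loss = ddr_penalty(coeff, features)        # added to the recommendation loss with a weight
+print(loss.item())
+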
+
+
+
+
+ + ☆ Efficient and Robust Regularized Federated Recommendation CIKM 2024 + + +
+ Recommender systems play a pivotal role across practical scenarios, +showcasing remarkable capabilities in user preference modeling. However, the +centralized learning paradigm predominantly used raises serious privacy +concerns. The federated recommender system (FedRS) addresses this by updating +models on clients, while a central server orchestrates training without +accessing private data. Existing FedRS approaches, however, face unresolved +challenges, including non-convex optimization, vulnerability, potential privacy +leakage risk, and communication inefficiency. This paper addresses these +challenges by reformulating the federated recommendation problem as a convex +optimization issue, ensuring convergence to the global optimum. Based on this, +we devise a novel method, RFRec, to tackle this optimization problem +efficiently. In addition, we propose RFRecF, a highly efficient version that +incorporates non-uniform stochastic gradient descent to improve communication +efficiency. In user preference modeling, both methods learn local and global +models, collaboratively learning users' common and personalized interests under +the federated learning setting. Moreover, both methods significantly enhance +communication efficiency, robustness, and privacy protection, with theoretical +support. Comprehensive evaluations on four benchmark datasets demonstrate RFRec +and RFRecF's superior performance compared to diverse baselines. + +
+
+ comment: CIKM 2024 +
+
+
+
+
+ + ☆ LinRec: Linear Attention Mechanism for Long-term Sequential Recommender + Systems SIGIR 2023 + + +
+ Transformer models have achieved remarkable success in sequential recommender
+systems (SRSs). However, computing the attention matrix in traditional
+dot-product attention mechanisms results in quadratic complexity with respect
+to sequence length, leading to high computational costs for long-term
+sequential recommendation. Motivated by the above observation, we propose a
+novel L2-Normalized Linear Attention for Transformer-based Sequential
+Recommender Systems (LinRec), which theoretically improves efficiency while
+preserving the learning capabilities of the traditional dot-product attention.
+Specifically, by thoroughly examining the equivalence conditions of efficient
+attention mechanisms, we show that LinRec possesses linear complexity while
+preserving the properties of attention mechanisms. In addition, we reveal its
+latent efficiency properties by interpreting the proposed LinRec mechanism
+through a statistical lens. Extensive experiments are conducted on two public
+benchmark datasets, demonstrating that the combination of LinRec and
+Transformer models achieves performance comparable or even superior to
+state-of-the-art Transformer-based SRS models while significantly improving
+time and memory efficiency.
+
+
+ comment: SIGIR 2023 +
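+ For context, the snippet below sketches the generic reordering trick behind
+linear attention: associating the products so the sequence-length term never
+appears squared. The feature map (ELU plus one with L2 normalisation) is an
+assumption for the sketch; LinRec's exact formulation is defined in the paper
+and may differ.
+
+# Generic L2-normalised linear attention sketch (illustrative, not LinRec's code).
+import torch
+import torch.nn.functional as F
+
+def linear_attention(q, k, v):
+    """q, k, v: (batch, seq_len, dim). Cost is O(N * d^2) instead of O(N^2 * d)."""
+    q = F.normalize(F.elu(q) + 1.0, dim=-1)   # non-negative, L2-normalised queries
+    k = F.normalize(F.elu(k) + 1.0, dim=-1)   # ... and keys
+    kv = torch.einsum("bnd,bne->bde", k, v)   # per-batch (dim, dim) key-value summary
+    z = 1.0 / (torch.einsum("bnd,bd->bn", q, k.sum(dim=1)) + 1e-6)
+    return torch.einsum("bnd,bde,bn->bne", q, kv, z)
+
+q = k = v = torch.randn(2, 512, 64)
+print(linear_attention(q, k, v).shape)  # torch.Size([2, 512, 64])
+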
+
+
+
+
+ + ☆ High-performance automated abstract screening with large language model + ensembles + + +
+ Large language models (LLMs) excel in tasks requiring processing and +interpretation of input text. Abstract screening is a labour-intensive +component of systematic review involving repetitive application of inclusion +and exclusion criteria on a large volume of studies identified by a literature +search. Here, LLMs (GPT-3.5 Turbo, GPT-4 Turbo, GPT-4o, Llama 3 70B, Gemini 1.5 +Pro, and Claude Sonnet 3.5) were trialled on systematic reviews in a full issue +of the Cochrane Library to evaluate their accuracy in zero-shot binary +classification for abstract screening. Trials over a subset of 800 records +identified optimal prompting strategies and demonstrated superior performance +of LLMs to human researchers in terms of sensitivity (LLMmax = 1.000, humanmax += 0.775), precision (LLMmax = 0.927, humanmax = 0.911), and balanced accuracy +(LLMmax = 0.904, humanmax = 0.865). The best performing LLM-prompt combinations +were trialled across every replicated search result (n = 119,691), and +exhibited consistent sensitivity (range 0.756-1.000) but diminished precision +(range 0.004-0.096). 66 LLM-human and LLM-LLM ensembles exhibited perfect +sensitivity with a maximal precision of 0.458, with less observed performance +drop in larger trials. Significant variation in performance was observed +between reviews, highlighting the importance of domain-specific validation +before deployment. LLMs may reduce the human labour cost of systematic review +with maintained or improved accuracy and sensitivity. Systematic review is the +foundation of evidence-based medicine, and LLMs can contribute to increasing +the efficiency and quality of this mode of research. + +
+
+ comment: RS and AJT are joint-first authors +
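+ The perfect-sensitivity ensembles reported above are consistent with an
+inclusive OR over screeners; that rule is assumed here for illustration, since
+the abstract does not spell out the ensembling procedure, and the screener
+names are made up.
+
+# Illustrative OR-ensemble for abstract screening (assumed rule; see the paper).
+def ensemble_include(votes: dict[str, bool]) -> bool:
+    """votes maps a screener (LLM-prompt combination or human) to include/exclude.
+    Including a record whenever any screener votes to include maximises
+    sensitivity at the cost of precision, matching the trade-off reported above."""
+    return any(votes.values())
+
+record_votes = {"gpt-4o/prompt-A": True, "claude-3.5/prompt-B": False, "human": False}
+print(ensemble_include(record_votes))  # True -> keep for full-text review
+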
+
+
+
+
+ + ☆ Facet-Aware Multi-Head Mixture-of-Experts Model for Sequential + Recommendation WSDM'25 + + +
+ Sequential recommendation (SR) systems excel at capturing users' dynamic +preferences by leveraging their interaction histories. Most existing SR systems +assign a single embedding vector to each item to represent its features, and +various types of models are adopted to combine these item embeddings into a +sequence representation vector to capture the user intent. However, we argue +that this representation alone is insufficient to capture an item's +multi-faceted nature (e.g., movie genres, starring actors). Besides, users +often exhibit complex and varied preferences within these facets (e.g., liking +both action and musical films in the facet of genre), which are challenging to +fully represent. To address the issues above, we propose a novel structure +called Facet-Aware Multi-Head Mixture-of-Experts Model for Sequential +Recommendation (FAME). We leverage sub-embeddings from each head in the last +multi-head attention layer to predict the next item separately. This approach +captures the potential multi-faceted nature of items without increasing model +complexity. A gating mechanism integrates recommendations from each head and +dynamically determines their importance. Furthermore, we introduce a +Mixture-of-Experts (MoE) network in each attention head to disentangle various +user preferences within each facet. Each expert within the MoE focuses on a +specific preference. A learnable router network is adopted to compute the +importance weight for each expert and aggregate them. We conduct extensive +experiments on four public sequential recommendation datasets and the results +demonstrate the effectiveness of our method over existing baseline models. + +
+
+ comment: This paper has been accepted by WSDM'25. The final camera-ready + version will be available soon +
+
+
+
+
+ + ♻ ☆ Understanding and Scaling Collaborative Filtering Optimization from the + Perspective of Matrix Rank + + +
+ Collaborative Filtering (CF) methods dominate real-world recommender systems +given their ability to learn high-quality, sparse ID-embedding tables that +effectively capture user preferences. These tables scale linearly with the +number of users and items, and are trained to ensure high similarity between +embeddings of interacted user-item pairs, while maintaining low similarity for +non-interacted pairs. Despite their high performance, encouraging dispersion +for non-interacted pairs necessitates expensive regularization (e.g., negative +sampling), hurting runtime and scalability. Existing research tends to address +these challenges by simplifying the learning process, either by reducing model +complexity or sampling data, trading performance for runtime. In this work, we +move beyond model-level modifications and study the properties of the embedding +tables under different learning strategies. Through theoretical analysis, we +find that the singular values of the embedding tables are intrinsically linked +to different CF loss functions. These findings are empirically validated on +real-world datasets, demonstrating the practical benefits of higher stable +rank, a continuous version of matrix rank which encodes the distribution of +singular values. Based on these insights, we propose an efficient warm-start +strategy that regularizes the stable rank of the user and item embeddings. We +show that stable rank regularization during early training phases can promote +higher-quality embeddings, resulting in training speed improvements of up to +66%. Additionally, stable rank regularization can act as a proxy for negative +sampling, allowing for performance gains of up to 21% over loss functions with +small negative sampling ratios. Overall, our analysis unifies current CF +methods under a new perspective, their optimization of stable rank, motivating +a flexible regularization method. + +
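+ Stable rank has a standard definition, the squared Frobenius norm over the
+squared spectral norm; the sketch below computes it for an embedding table and
+wires it into a toy warm-start regulariser. The negative-weight form of the
+regulariser is an assumption for illustration, not the authors' exact recipe.
+
+# Stable rank of an embedding table and a simple regulariser around it (illustrative).
+import torch
+
+def stable_rank(emb: torch.Tensor) -> torch.Tensor:
+    """||E||_F^2 / sigma_max(E)^2 -- a continuous surrogate for matrix rank."""
+    fro_sq = (emb ** 2).sum()
+    sigma_max = torch.linalg.matrix_norm(emb, ord=2)  # largest singular value
+    return fro_sq / (sigma_max ** 2 + 1e-12)
+
+def stable_rank_reg(user_emb, item_emb, weight=0.1):
+    """Warm-start term that rewards higher stable rank of both embedding tables."""
+    return -weight * (stable_rank(user_emb) + stable_rank(item_emb))
+
+users, items = torch.randn(1000, 64), torch.randn(5000, 64)
+print(stable_rank(users).item(), stable_rank_reg(users, items).item())
+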
+
+
+
+
+ + ♻ ☆ Little Giants: Synthesizing High-Quality Embedding Data at Scale + + +
+ Synthetic data generation has become an increasingly popular way of training +models without the need for large, manually labeled datasets. For tasks like +text embedding, synthetic data offers diverse and scalable training examples, +significantly reducing the cost of human annotation. However, most current +approaches rely heavily on proprietary models like GPT-4, which are expensive +and inefficient for generating large-scale embedding data. In this paper, we +introduce SPEED, a framework that aligns open-source small models (8B) to +efficiently generate large-scale synthetic embedding data. Through supervised +fine-tuning, preference optimization, and self-improvement, SPEED enables small +open-source models to produce high-quality data. Remarkably, SPEED uses only +less than 1/10 of the GPT API calls, outperforming the state-of-the-art +embedding model E5_mistral when both are trained solely on their synthetic +data. Using this efficient generator, we conduct a comprehensive study on how +various factors within the alignment pipeline impact data quality and reveal +the scaling law for synthetic embedding data. + +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Multimodal Graph Neural Network for Recommendation with Dynamic + De-redundancy and Modality-Guided Feature De-noisy + + +
+ Graph neural networks (GNNs) have become crucial in multimodal recommendation +tasks because of their powerful ability to capture complex relationships +between neighboring nodes. However, increasing the number of propagation layers +in GNNs can lead to feature redundancy, which may negatively impact the overall +recommendation performance. In addition, the existing recommendation task +method directly maps the preprocessed multimodal features to the +low-dimensional space, which will bring the noise unrelated to user preference, +thus affecting the representation ability of the model. To tackle the +aforementioned challenges, we propose Multimodal Graph Neural Network for +Recommendation (MGNM) with Dynamic De-redundancy and Modality-Guided Feature +De-noisy, which is divided into local and global interaction. Initially, in the +local interaction process,we integrate a dynamic de-redundancy (DDR) loss +function which is achieved by utilizing the product of the feature coefficient +matrix and the feature matrix as a penalization factor. It reduces the feature +redundancy effects of multimodal and behavioral features caused by the stacking +of multiple GNN layers. Subsequently, in the global interaction process, we +developed modality-guided global feature purifiers for each modality to +alleviate the impact of modality noise. It is a two-fold guiding mechanism +eliminating modality features that are irrelevant to user preferences and +captures complex relationships within the modality. Experimental results +demonstrate that MGNM achieves superior performance on multimodal information +denoising and removal of redundant information compared to the state-of-the-art +methods. + +
+
+
+
+
+ + ♻ ☆ AnyV2V: A Tuning-Free Framework For Any Video-to-Video Editing Tasks + + +
+ In the dynamic field of digital content creation using generative models, +state-of-the-art video editing models still do not offer the level of quality +and control that users desire. Previous works on video editing either extended +from image-based generative models in a zero-shot manner or necessitated +extensive fine-tuning, which can hinder the production of fluid video edits. +Furthermore, these methods frequently rely on textual input as the editing +guidance, leading to ambiguities and limiting the types of edits they can +perform. Recognizing these challenges, we introduce AnyV2V, a novel tuning-free +paradigm designed to simplify video editing into two primary steps: (1) +employing an off-the-shelf image editing model to modify the first frame, (2) +utilizing an existing image-to-video generation model to generate the edited +video through temporal feature injection. AnyV2V can leverage any existing +image editing tools to support an extensive array of video editing tasks, +including prompt-based editing, reference-based style transfer, subject-driven +editing, and identity manipulation, which were unattainable by previous +methods. AnyV2V can also support any video length. Our evaluation shows that +AnyV2V achieved CLIP-scores comparable to other baseline methods. Furthermore, +AnyV2V significantly outperformed these baselines in human evaluations, +demonstrating notable improvements in visual consistency with the source video +while producing high-quality edits across all editing tasks. + +
+
+ comment: Published in Transactions on Machine Learning Research (TMLR 2024) + (11/2024) +
+
+
+
+
+ + ♻ ☆ GenAI-Bench: Evaluating and Improving Compositional Text-to-Visual + Generation + + +
+ While text-to-visual models now produce photo-realistic images and videos, +they struggle with compositional text prompts involving attributes, +relationships, and higher-order reasoning such as logic and comparison. In this +work, we conduct an extensive human study on GenAI-Bench to evaluate the +performance of leading image and video generation models in various aspects of +compositional text-to-visual generation. We also compare automated evaluation +metrics against our collected human ratings and find that VQAScore -- a metric +measuring the likelihood that a VQA model views an image as accurately +depicting the prompt -- significantly outperforms previous metrics such as +CLIPScore. In addition, VQAScore can improve generation in a black-box manner +(without finetuning) via simply ranking a few (3 to 9) candidate images. +Ranking by VQAScore is 2x to 3x more effective than other scoring methods like +PickScore, HPSv2, and ImageReward at improving human alignment ratings for +DALL-E 3 and Stable Diffusion, especially on compositional prompts that require +advanced visio-linguistic reasoning. We release a new GenAI-Rank benchmark with +over 40,000 human ratings to evaluate scoring metrics on ranking images +generated from the same prompt. Lastly, we discuss promising areas for +improvement in VQAScore, such as addressing fine-grained visual details. We +will release all human ratings (over 80,000) to facilitate scientific +benchmarking of both generative models and automated metrics. + +
+
+ comment: We open-source our dataset, model, and code at: + https://linzhiqiu.github.io/papers/genai_bench ; Project page: + https://linzhiqiu.github.io/papers/genai_bench ; GenAI-Bench was first + introduced in arxiv:2404.01291. This article extends it with an additional + GenAI-Rank benchmark +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Multi-Channel Hypergraph Contrastive Learning for Matrix Completion + + +
+ Rating is a typical form of explicit user feedback that visually reflects how
+much a user likes a related item. The (rating) matrix completion is essentially
+a rating prediction process, which is also a significant problem in recommender
+systems. Recently, graph neural networks (GNNs) have been widely used in matrix
+completion, which captures users' preferences over items by formulating a
+rating matrix as a bipartite graph. However, existing methods are susceptible
+to data sparsity and long-tail distributions in real-world scenarios. Moreover,
+the message-passing mechanism of GNNs makes it difficult to capture high-order
+correlations and constraints between nodes, which are essentially useful in
+recommendation tasks. To tackle these challenges, we propose a Multi-Channel
+Hypergraph Contrastive Learning framework for matrix completion, named MHCL.
+Specifically, MHCL adaptively learns hypergraph structures to capture
+high-order correlations between nodes and jointly captures local and global
+collaborative relationships through attention-based cross-view aggregation.
+Additionally, to consider the magnitude and order information of ratings, we
+treat different rating subgraphs as different channels, encourage alignment
+between adjacent ratings, and further achieve mutual enhancement between
+different ratings through multi-channel cross-rating contrastive learning.
+Extensive experiments on five public datasets demonstrate that the proposed
+method significantly outperforms the current state-of-the-art approaches.
+
+
+
+
+
+ + ☆ Combining Financial Data and News Articles for Stock Price Movement + Prediction Using Large Language Models + + +
+ Predicting financial markets and stock price movements requires analyzing a
+company's performance, historical price movements, and industry-specific
+events, alongside the influence of human factors such as social media and press
+coverage. We assume that financial reports (such as income statements, balance
+sheets, and cash flow statements), historical price data, and recent news
+articles can collectively represent the aforementioned factors. We combine
+financial data in tabular format with textual news articles and employ
+pre-trained Large Language Models (LLMs) to predict market movements. Recent
+research in LLMs has demonstrated that they are able to perform both tabular
+and text classification tasks, making them our primary model to classify the
+multi-modal data. We utilize retrieval augmentation techniques to retrieve and
+attach relevant chunks of news articles to financial metrics related to a
+company and prompt the LLMs in zero, two, and four-shot settings. Our dataset
+contains news articles collected from different sources, historical stock
+prices, and financial report data for 20 companies with the highest trading
+volume across different industries in the stock market. We utilized recently
+released language models for our LLM-based classifier, including GPT-3 and
+GPT-4, as well as LLaMA-2 and LLaMA-3 models. We introduce an LLM-based
+classifier capable of performing classification tasks using a combination of
+tabular (structured) and textual (unstructured) data. By using this model, we
+predicted the movement of a given stock's price in our dataset with weighted
+F1-scores of 58.5% and 59.1% and a Matthews Correlation Coefficient of 0.175
+for the 3-month and 6-month periods, respectively.
+
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ Online and Offline Evaluations of Collaborative Filtering and Content + Based Recommender Systems + + +
+ Recommender systems are widely used AI applications designed to help users +efficiently discover relevant items. The effectiveness of such systems is tied +to the satisfaction of both users and providers. However, user satisfaction is +complex and cannot be easily framed mathematically using information retrieval +and accuracy metrics. While many studies evaluate accuracy through offline +tests, a growing number of researchers argue that online evaluation methods +such as A/B testing are better suited for this purpose. We have employed a +variety of algorithms on different types of datasets divergent in size and +subject, producing recommendations in various platforms, including media +streaming services, digital publishing websites, e-commerce systems, and news +broadcasting networks. Notably, our target websites and datasets are in Persian +(Farsi) language. + This study provides a comparative analysis of a large-scale recommender +system that has been operating for the past year across about 70 websites in +Iran, processing roughly 300 requests per second collectively. The system +employs user-based and item-based recommendations using content-based, +collaborative filtering, trend-based methods, and hybrid approaches. Through +both offline and online evaluations, we aim to identify where these algorithms +perform most efficiently and determine the best method for our specific needs, +considering the dataset and system scale. Our methods of evaluation include +manual evaluation, offline tests including accuracy and ranking metrics like +hit-rate@k and nDCG, and online tests consisting of click-through rate (CTR). +Additionally we analyzed and proposed methods to address cold-start and +popularity bias. + +
+
+ comment: 9 pages, 9 figures +
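+ The offline metrics named above are standard; the snippet below is a small
+reference implementation under the common single-held-out-item convention,
+which is an assumption here rather than a description of this system's exact
+evaluation protocol.
+
+# Standard hit-rate@k and nDCG@k for one held-out relevant item per user (illustrative).
+import math
+
+def hit_rate_at_k(ranked_items, relevant_item, k=10):
+    return 1.0 if relevant_item in ranked_items[:k] else 0.0
+
+def ndcg_at_k(ranked_items, relevant_item, k=10):
+    if relevant_item in ranked_items[:k]:
+        rank = ranked_items.index(relevant_item)  # 0-based position in the ranking
+        return 1.0 / math.log2(rank + 2)          # ideal DCG is 1 for a single relevant item
+    return 0.0
+
+ranking = ["item42", "item7", "item99"]
+print(hit_rate_at_k(ranking, "item7", k=2), ndcg_at_k(ranking, "item7", k=2))
+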
+
+
+
+
+ + ☆ Towards a Knowledge Graph for Teaching Knowledge Graphs + + +
+ This poster paper describes the ongoing research project for the creation of +a use-case-driven Knowledge Graph resource tailored to the needs of teaching +education in Knowledge Graphs (KGs). We gather resources related to KG courses +from lectures offered by the Semantic Web community, with the help of the COST +Action Distributed Knowledge Graphs and the interest group on KGs at The Alan +Turing Institute. Our goal is to create a resource-focused KG with multiple +interconnected semantic layers that interlink topics, courses, and materials +with each lecturer. Our approach formulates a domain KG in teaching and relates +it with multiple Personal KGs created for the lecturers. + +
+
+
+
+
+ + ☆ TODO: Enhancing LLM Alignment with Ternary Preferences + + +
+ Aligning large language models (LLMs) with human intent is critical for +enhancing their performance across a variety of tasks. Standard alignment +techniques, such as Direct Preference Optimization (DPO), often rely on the +binary Bradley-Terry (BT) model, which can struggle to capture the complexities +of human preferences -- particularly in the presence of noisy or inconsistent +labels and frequent ties. To address these limitations, we introduce the +Tie-rank Oriented Bradley-Terry model (TOBT), an extension of the BT model that +explicitly incorporates ties, enabling more nuanced preference representation. +Building on this, we propose Tie-rank Oriented Direct Preference Optimization +(TODO), a novel alignment algorithm that leverages TOBT's ternary ranking +system to improve preference alignment. In evaluations on Mistral-7B and Llama +3-8B models, TODO consistently outperforms DPO in modeling preferences across +both in-distribution and out-of-distribution datasets. Additional assessments +using MT Bench and benchmarks such as Piqa, ARC-c, and MMLU further demonstrate +TODO's superior alignment performance. Notably, TODO also shows strong results +in binary preference alignment, highlighting its versatility and potential for +broader integration into LLM alignment. The implementation details can be found +in https://github.com/XXares/TODO. + +
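+ For context on what a tie-aware Bradley-Terry model looks like, the snippet
+below implements the classical Rao-Kupper extension with a tie parameter theta
+>= 1; this is shown only as background, and TOBT's exact likelihood is defined
+in the paper and may differ.
+
+# Rao-Kupper style Bradley-Terry with ties (background sketch, not TOBT itself).
+import math
+
+def bt_with_ties(reward_a: float, reward_b: float, theta: float = 1.5):
+    """Return P(a wins), P(tie), P(b wins) from scalar rewards and a tie parameter."""
+    pa, pb = math.exp(reward_a), math.exp(reward_b)
+    p_a_wins = pa / (pa + theta * pb)
+    p_b_wins = pb / (pb + theta * pa)
+    p_tie = 1.0 - p_a_wins - p_b_wins  # remaining mass is assigned to a tie
+    return p_a_wins, p_tie, p_b_wins
+
+print(bt_with_ties(1.2, 1.0))
+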
+
+
+
+
+ + ☆ Graph Cross-Correlated Network for Recommendation + + +
+ Collaborative filtering (CF) models have demonstrated remarkable performance +in recommender systems, which represent users and items as embedding vectors. +Recently, due to the powerful modeling capability of graph neural networks for +user-item interaction graphs, graph-based CF models have gained increasing +attention. They encode each user/item and its subgraph into a single super +vector by combining graph embeddings after each graph convolution. However, +each hop of the neighbor in the user-item subgraphs carries a specific semantic +meaning. Encoding all subgraph information into single vectors and inferring +user-item relations with dot products can weaken the semantic information +between user and item subgraphs, thus leaving untapped potential. Exploiting +this untapped potential provides insight into improving performance for +existing recommendation models. To this end, we propose the Graph +Cross-correlated Network for Recommendation (GCR), which serves as a general +recommendation paradigm that explicitly considers correlations between +user/item subgraphs. GCR first introduces the Plain Graph Representation (PGR) +to extract information directly from each hop of neighbors into corresponding +PGR vectors. Then, GCR develops Cross-Correlated Aggregation (CCA) to construct +possible cross-correlated terms between PGR vectors of user/item subgraphs. +Finally, GCR comprehensively incorporates the cross-correlated terms for +recommendations. Experimental results show that GCR outperforms +state-of-the-art models on both interaction prediction and click-through rate +prediction tasks. + +
+
+ comment: 14 pages, accepted by TKDE +
+
+
+
+
+ + ☆ LLM4PR: Improving Post-Ranking in Search Engine with Large Language + Models + + +
+ Alongside the rapid development of Large Language Models (LLMs), there has
+been a notable increase in efforts to integrate LLM techniques in information
+retrieval (IR) and search engines (SE). Recently, an additional post-ranking
+stage has been suggested in SE to enhance user satisfaction in practical
+applications. Nevertheless, enhancing the post-ranking stage with LLMs remains
+largely unexplored. In this study, we introduce a novel paradigm named Large
+Language Models for Post-Ranking in search engine (LLM4PR), which leverages the
+capabilities of LLMs to accomplish the post-ranking task in SE. Concretely, a
+Query-Instructed Adapter (QIA) module is designed to derive the user/item
+representation vectors by incorporating their heterogeneous features. A feature
+adaptation step is further introduced to align the semantics of user/item
+representations with the LLM. Finally, LLM4PR integrates a learning-to-post-rank
+step, leveraging both a main task and an auxiliary task to fine-tune the model
+for the post-ranking task. Experimental studies demonstrate that the proposed
+framework leads to significant improvements and exhibits state-of-the-art
+performance compared with other alternatives.
+
+
+
+
+
+ + ♻ ☆ Predicting the Geolocation of Tweets Using transformer models on + Customized Data + + +
+ This research aims to solve the tweet/user geolocation prediction task and to
+provide a flexible methodology for geotagging textual big data. The suggested
+approach implements neural networks for natural language processing (NLP) to
+estimate the location as coordinate pairs (longitude, latitude) and
+two-dimensional Gaussian Mixture Models (GMMs). The proposed models were
+fine-tuned on a Twitter dataset using pretrained Bidirectional Encoder
+Representations from Transformers (BERT) as base models. Performance metrics
+show a median error of less than 30 km on the worldwide-level dataset and less
+than 15 km on the US-level dataset for models trained and evaluated on text
+features of tweets' content and metadata context. Our source code and data are
+available at https://github.com/K4TEL/geo-twitter.git
+
+
+ comment: 31 pages, 5 tables, 9 figures +
+
+
+
+
+ + ♻ ☆ Understanding and Improving Adversarial Collaborative Filtering for + Robust Recommendation NeurIPS 2024 + + +
+ Adversarial Collaborative Filtering (ACF), which typically applies +adversarial perturbations at user and item embeddings through adversarial +training, is widely recognized as an effective strategy for enhancing the +robustness of Collaborative Filtering (CF) recommender systems against +poisoning attacks. Besides, numerous studies have empirically shown that ACF +can also improve recommendation performance compared to traditional CF. Despite +these empirical successes, the theoretical understanding of ACF's effectiveness +in terms of both performance and robustness remains unclear. To bridge this +gap, in this paper, we first theoretically show that ACF can achieve a lower +recommendation error compared to traditional CF with the same training epochs +in both clean and poisoned data contexts. Furthermore, by establishing bounds +for reductions in recommendation error during ACF's optimization process, we +find that applying personalized magnitudes of perturbation for different users +based on their embedding scales can further improve ACF's effectiveness. +Building on these theoretical understandings, we propose Personalized Magnitude +Adversarial Collaborative Filtering (PamaCF). Extensive experiments demonstrate +that PamaCF effectively defends against various types of poisoning attacks +while significantly enhancing recommendation performance. + +
+
+ comment: To appear in NeurIPS 2024 +
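+ A toy sketch of personalised-magnitude adversarial perturbations on user
+embeddings, where the magnitude is assumed to scale with each user's embedding
+norm; PamaCF's precise rule and loss are given in the paper, and the CF loss
+below is a placeholder.
+
+# Sketch of adversarial CF with per-user perturbation magnitudes (assumed scaling).
+import torch
+
+def personalized_perturbation(user_emb: torch.Tensor, grad: torch.Tensor, eps: float = 0.1):
+    """FGSM-style perturbation whose magnitude scales with each user's embedding norm."""
+    per_user_eps = eps * user_emb.norm(dim=1, keepdim=True)      # (num_users, 1)
+    direction = grad / (grad.norm(dim=1, keepdim=True) + 1e-12)  # unit direction per user
+    return per_user_eps * direction
+
+user_emb = torch.randn(8, 32, requires_grad=True)
+loss = (user_emb ** 2).sum()          # stand-in for the BPR / CF training loss
+loss.backward()
+delta = personalized_perturbation(user_emb, user_emb.grad)
+adv_emb = user_emb.detach() + delta   # embeddings used in the adversarial training term
+print(adv_emb.shape)
+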
+
+
+
+
+ + ♻ ☆ Two-stage Conformal Risk Control with Application to Ranked Retrieval + + +
+ Many practical machine learning systems, such as ranking and recommendation +systems, consist of two concatenated stages: retrieval and ranking. These +systems present significant challenges in accurately assessing and managing the +uncertainty inherent in their predictions. To address these challenges, we +extend the recently developed framework of conformal risk control, originally +designed for single-stage problems, to accommodate the more complex two-stage +setup. We first demonstrate that a straightforward application of conformal +risk control, treating each stage independently, may fail to maintain risk at +their pre-specified levels. Therefore, we propose an integrated approach that +considers both stages simultaneously, devising algorithms to control the risk +of each stage by jointly identifying thresholds for both stages. Our algorithm +further optimizes for a weighted combination of prediction set sizes across all +feasible thresholds, resulting in more effective prediction sets. Finally, we +apply the proposed method to the critical task of two-stage ranked retrieval. +We validate the efficacy of our method through extensive experiments on two +large-scale public datasets, MSLR-WEB and MS MARCO, commonly used for ranked +retrieval tasks. + +
+
+ comment: 13 pages, 3 figures; 5 supplementary pages, 3 supplementary figures +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ♻ ☆ Comparative Analysis of Modality Fusion Approaches for Audio-Visual + Person Identification and Verification SP2024 + + +
+ Multimodal learning involves integrating information from various modalities
+to enhance learning and comprehension. We compare three modality fusion
+strategies in person identification and verification by processing two
+modalities: voice and face. In this paper, a one-dimensional convolutional
+neural network is employed for x-vector extraction from voice, while the
+pre-trained VGGFace2 network and transfer learning are utilized for the face
+modality. In addition, the gammatonegram is used as a speech representation in
+combination with the pre-trained Darknet19 network. The proposed systems are
+evaluated using the K-fold cross-validation technique on the 118 speakers of
+the test set of the VoxCeleb2 dataset. The comparative evaluations cover the
+single-modality systems and the three proposed multimodal strategies under
+identical conditions. Results demonstrate that the feature fusion strategy
+combining gammatonegram and facial features achieves the highest performance,
+with an accuracy of 98.37% in the person identification task, while
+concatenating facial features with the x-vector achieves an EER of 0.62% in the
+verification task.
+
+
+ comment: This paper was accepted at the ICNLSP2024 conference +
+
+
+
+
+ + ♻ ☆ Multi-modal Speech Emotion Recognition via Feature Distribution + Adaptation Network + + +
+ In this paper, we propose a novel deep inductive transfer learning framework, +named feature distribution adaptation network, to tackle the challenging +multi-modal speech emotion recognition problem. Our method aims to use deep +transfer learning strategies to align visual and audio feature distributions to +obtain consistent representation of emotion, thereby improving the performance +of speech emotion recognition. In our model, the pre-trained ResNet-34 is +utilized for feature extraction for facial expression images and acoustic Mel +spectrograms, respectively. Then, the cross-attention mechanism is introduced +to model the intrinsic similarity relationships of multi-modal features. +Finally, the multi-modal feature distribution adaptation is performed +efficiently with feed-forward network, which is extended using the local +maximum mean discrepancy loss. Experiments are carried out on two benchmark +datasets, and the results demonstrate that our model can achieve excellent +performance compared with existing ones. + +
+
+
+
+
+ + ♻ ☆ Audio-Visual Instance Segmentation + + +
+ In this paper, we propose a new multi-modal task, termed audio-visual
+instance segmentation (AVIS), which aims to simultaneously identify, segment
+and track individual sounding object instances in audible videos. To facilitate
+this research, we introduce a high-quality benchmark named AVISeg, containing
+over 90K instance masks from 26 semantic categories in 926 long videos.
+Additionally, we propose a strong baseline model for this task. Our model first
+localizes the sound source within each frame, and condenses object-specific
+contexts into concise tokens. Then it builds long-range audio-visual
+dependencies between these tokens using window-based attention, and tracks
+sounding objects across entire video sequences. Extensive experiments reveal
+that our method performs best on AVISeg, surpassing the existing methods from
+related tasks. We further evaluate several multi-modal large models; however,
+they exhibit subpar performance on instance-level sound source localization and
+temporal perception. We expect that AVIS will inspire the community towards a
+more comprehensive multi-modal understanding. The dataset and code will soon be
+released on https://github.com/ruohaoguo/avis.
+
+
+ comment: Project page: https://github.com/ruohaoguo/avis +
+
+
+
+
+ + ♻ ☆ Emotion-LLaMA: Multimodal Emotion Recognition and Reasoning with + Instruction Tuning NeurIPS 2024 + + +
+ Accurate emotion perception is crucial for various applications, including +human-computer interaction, education, and counseling. However, traditional +single-modality approaches often fail to capture the complexity of real-world +emotional expressions, which are inherently multimodal. Moreover, existing +Multimodal Large Language Models (MLLMs) face challenges in integrating audio +and recognizing subtle facial micro-expressions. To address this, we introduce +the MERR dataset, containing 28,618 coarse-grained and 4,487 fine-grained +annotated samples across diverse emotional categories. This dataset enables +models to learn from varied scenarios and generalize to real-world +applications. Furthermore, we propose Emotion-LLaMA, a model that seamlessly +integrates audio, visual, and textual inputs through emotion-specific encoders. +By aligning features into a shared space and employing a modified LLaMA model +with instruction tuning, Emotion-LLaMA significantly enhances both emotional +recognition and reasoning capabilities. Extensive evaluations show +Emotion-LLaMA outperforms other MLLMs, achieving top scores in Clue Overlap +(7.83) and Label Overlap (6.25) on EMER, an F1 score of 0.9036 on MER2023-SEMI +challenge, and the highest UAR (45.59) and WAR (59.37) in zero-shot evaluations +on DFEW dataset. + +
+
+ comment: Accepted at NeurIPS 2024. 49 pages, 13 figures, Project: + https://github.com/ZebangCheng/Emotion-LLaMA, Demo: + https://huggingface.co/spaces/ZebangCheng/Emotion-LLaMA +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 18 + +
+
+
+ + ☆ Enhancing Question Answering Precision with Optimized Vector Retrieval + and Instructions + + +
+ Question-answering (QA) is an important application of Information Retrieval
+(IR) and language models, and the latest trend is toward pre-trained large
+neural networks with embedding parameters. Augmenting QA performance with these
+LLMs requires intensive computational resources for fine-tuning. We propose an
+innovative approach to improve QA task performance by integrating optimized
+vector retrieval and instruction methodologies. Based on retrieval
+augmentation, the process involves document embedding, vector retrieval, and
+context construction for optimal QA results. We experiment with different
+combinations of text segmentation techniques and similarity functions, and
+analyze their impact on QA performance. Results show that the model with a
+small chunk size of 100 and no overlap between chunks achieves the best results
+and outperforms models based on sentence-level semantic segmentation. We
+discuss related QA examples and offer insight into how model performance is
+improved within the two-stage framework.
+
+
+ comment: 6 pages, 4 tables +
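+ A small sketch of the non-overlapping fixed-size chunking plus similarity
+retrieval step described above; the embed() function is a placeholder for
+whatever encoder is actually used, and the chunk size of 100 words mirrors the
+best-performing setting reported in the abstract.
+
+# Fixed-size, non-overlapping chunking plus cosine-similarity retrieval (illustrative).
+import numpy as np
+
+def chunk(text: str, size: int = 100) -> list[str]:
+    words = text.split()
+    return [" ".join(words[i:i + size]) for i in range(0, len(words), size)]
+
+def embed(texts: list[str]) -> np.ndarray:
+    rng = np.random.default_rng(0)      # placeholder embeddings for the sketch
+    return rng.normal(size=(len(texts), 384))
+
+def top_k(query: str, chunks: list[str], k: int = 3) -> list[str]:
+    q, c = embed([query])[0], embed(chunks)
+    sims = c @ q / (np.linalg.norm(c, axis=1) * np.linalg.norm(q) + 1e-9)
+    return [chunks[i] for i in np.argsort(-sims)[:k]]
+
+doc = "word " * 500
+context = "\n".join(top_k("example question", chunk(doc, size=100)))
+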
+
+
+
+
+ + ☆ CORAG: A Cost-Constrained Retrieval Optimization System for + Retrieval-Augmented Generation + + +
+ Large Language Models (LLMs) have demonstrated remarkable generation +capabilities but often struggle to access up-to-date information, which can +lead to hallucinations. Retrieval-Augmented Generation (RAG) addresses this +issue by incorporating knowledge from external databases, enabling more +accurate and relevant responses. Due to the context window constraints of LLMs, +it is impractical to input the entire external database context directly into +the model. Instead, only the most relevant information, referred to as chunks, +is selectively retrieved. However, current RAG research faces three key +challenges. First, existing solutions often select each chunk independently, +overlooking potential correlations among them. Second, in practice the utility +of chunks is non-monotonic, meaning that adding more chunks can decrease +overall utility. Traditional methods emphasize maximizing the number of +included chunks, which can inadvertently compromise performance. Third, each +type of user query possesses unique characteristics that require tailored +handling, an aspect that current approaches do not fully consider. To overcome +these challenges, we propose a cost constrained retrieval optimization system +CORAG for retrieval-augmented generation. We employ a Monte Carlo Tree Search +(MCTS) based policy framework to find optimal chunk combinations sequentially, +allowing for a comprehensive consideration of correlations among chunks. +Additionally, rather than viewing budget exhaustion as a termination condition, +we integrate budget constraints into the optimization of chunk combinations, +effectively addressing the non-monotonicity of chunk utility. + +
+
+
+
+
+ + ☆ A graph-based approach to extracting narrative signals from public + discourse + + +
+ Narratives are key interpretative devices by which humans make sense of +political reality. As the significance of narratives for understanding current +societal issues such as polarization and misinformation becomes increasingly +evident, there is a growing demand for methods that support their empirical +analysis. To this end, we propose a graph-based formalism and machine-guided +method for extracting, representing, and analyzing selected narrative signals +from digital textual corpora, based on Abstract Meaning Representation (AMR). +The formalism and method introduced here specifically cater to the study of +political narratives that figure in texts from digital media such as archived +political speeches, social media posts, political manifestos and transcripts of +parliamentary debates. We conceptualize these political narratives as a type of +ontological narratives: stories by which actors position themselves as +political beings, and which are akin to political worldviews in which actors +present their normative vision of the world, or aspects thereof. We approach +the study of such political narratives as a problem of information retrieval: +starting from a textual corpus, we first extract a graph-like representation of +the meaning of each sentence in the corpus using AMR. Drawing on transferable +concepts from narratology, we then apply a set of heuristics to filter these +graphs for representations of 1) actors, 2) the events in which these actors +figure, and 3) traces of the perspectivization of these events. We approach +these references to actors, events, and instances of perspectivization as core +narrative signals that initiate a further analysis by alluding to larger +political narratives. By means of a case study of State of the European Union +addresses, we demonstrate how the formalism can be used to inductively surface +signals of political narratives from public discourse. + +
+
+ comment: 23 pages, 4 figures +
+
+
+
+
+ + ☆ Making Sense of Metadata Mess: Alignment & Risk Assessment for Diatom + Data Use Case + + +
+ Biologists study diatoms, a fundamental group of algae, to assess the health
+of aquatic systems. Diatom specimens have traditionally been preserved on
+analog slides, where a single slide can contain thousands of these microscopic
+organisms. Digitization of these collections presents both metadata challenges
+and opportunities. This paper reports on metadata research aimed at providing
+access to a digital portion of the Academy of Natural Sciences' Diatom
+Herbarium, Drexel University. We report results of a 3-part study covering 1) a
+review of relevant metadata standards and a microscopy metadata framework
+shared by Hammer et al., 2) a baseline metadata alignment mapping current
+diatom metadata properties to standard metadata types, and 3) a metadata risk
+analysis associated with the course of standard data curation practices. This
+research is part of an effort involving the transfer of these digital slides to
+a new system, DataFed, to support global accessibility. The final section of
+this paper includes a conclusion and discusses next steps.
+
+
+ comment: 13 pages, 2 figures, 1 table, to be published in MTSR 2024 conference + proceedings +
+
+
+
+
+ + ☆ Enhancing Semantic Interoperability Across Materials Science With + HIVE4MAT + + +
+ HIVE4MAT is a linked data interactive application for navigating ontologies +of value to materials science. HIVE enables automatic indexing of textual +resources with standardized terminology. This article presents the motivation +underlying HIVE4MAT, explains the system architecture, reports on two +evaluations, and discusses future plans. + +
+
+ comment: 11 pages, 1 figure, 3 tables, to be published in SeMatS 2024
+ workshop proceedings
+
+
+
+
+
+ + ☆ LLM-KT: A Versatile Framework for Knowledge Transfer from Large Language + Models to Collaborative Filtering ICDM 2024 + + +
+ We present LLM-KT, a flexible framework designed to enhance collaborative +filtering (CF) models by seamlessly integrating LLM (Large Language +Model)-generated features. Unlike existing methods that rely on passing +LLM-generated features as direct inputs, our framework injects these features +into an intermediate layer of any CF model, allowing the model to reconstruct +and leverage the embeddings internally. This model-agnostic approach works with +a wide range of CF models without requiring architectural changes, making it +adaptable to various recommendation scenarios. Our framework is built for easy +integration and modification, providing researchers and developers with a +powerful tool for extending CF model capabilities through efficient knowledge +transfer. We demonstrate its effectiveness through experiments on the MovieLens +and Amazon datasets, where it consistently improves baseline CF models. +Experimental studies showed that LLM-KT is competitive with the +state-of-the-art methods in context-aware settings but can be applied to a +broader range of CF models than current approaches. + +
+
+ comment: accepted at ICDM 2024 (demo track) +
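+ A sketch of the injection idea described above, in which an intermediate
+layer of a CF model is trained to reconstruct precomputed LLM-derived profile
+features through an auxiliary loss; the model, layer choice and loss are
+illustrative assumptions, not LLM-KT's exact hook points.
+
+# Sketch: auxiliary reconstruction of LLM features inside a toy CF model (illustrative).
+import torch
+import torch.nn as nn
+
+class TinyCF(nn.Module):
+    def __init__(self, n_users, n_items, dim=64, llm_dim=768):
+        super().__init__()
+        self.user = nn.Embedding(n_users, dim)
+        self.item = nn.Embedding(n_items, dim)
+        self.reconstruct = nn.Linear(dim, llm_dim)   # maps the intermediate layer to LLM space
+
+    def forward(self, u, i):
+        hidden = self.user(u)                        # "intermediate layer" of this toy model
+        score = (hidden * self.item(i)).sum(-1)
+        return score, self.reconstruct(hidden)
+
+model = TinyCF(100, 500)
+u, i = torch.randint(0, 100, (32,)), torch.randint(0, 500, (32,))
+llm_profile = torch.randn(32, 768)                   # precomputed LLM features per user
+score, recon = model(u, i)
+aux_loss = nn.functional.mse_loss(recon, llm_profile)  # knowledge-transfer term added to the CF loss
+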
+
+
+
+
+ + ☆ MIRFLEX: Music Information Retrieval Feature Library for Extraction + + +
+ This paper introduces an extendable modular system that compiles a range of +music feature extraction models to aid music information retrieval research. +The features include musical elements like key, downbeats, and genre, as well +as audio characteristics like instrument recognition, vocals/instrumental +classification, and vocals gender detection. The integrated models are +state-of-the-art or latest open-source. The features can be extracted as latent +or post-processed labels, enabling integration into music applications such as +generative music, recommendation, and playlist generation. The modular design +allows easy integration of newly developed systems, making it a good +benchmarking and comparison tool. This versatile toolkit supports the research +community in developing innovative solutions by providing concrete musical +features. + +
+
+ comment: 2 pages, 4 tables, submitted to Extended Abstracts for the + Late-Breaking Demo Session of the 25th Int. Society for Music Information + Retrieval Conf., San Francisco, United States, 2024 +
+
+
+
+
+ + ☆ Improving Few-Shot Cross-Domain Named Entity Recognition by Instruction + Tuning a Word-Embedding based Retrieval Augmented Large Language Model + + +
+ Few-Shot Cross-Domain NER is the process of leveraging knowledge from
+data-rich source domains to perform entity recognition on data-scarce target
+domains. Most previous state-of-the-art (SOTA) approaches use pre-trained
+language models (PLMs) for cross-domain NER. However, these models are often
+domain-specific. To successfully use these models for new target domains, we
+need to modify either the model architecture or perform model finetuning using
+data from the new domains. Both of these result in the creation of entirely new
+NER models for each target domain, which is infeasible in practical scenarios.
+Recently, several works have attempted to use LLMs to solve Few-Shot
+Cross-Domain NER. However, most of these are either too expensive for practical
+purposes or struggle to follow LLM prompt instructions. In this paper, we
+propose IF-WRANER (Instruction Finetuned Word-embedding based Retrieval
+Augmented large language model for Named Entity Recognition), a retrieval
+augmented LLM, finetuned for the NER task. By virtue of the regularization
+techniques used during LLM finetuning and the adoption of word-level embedding
+over sentence-level embedding during the retrieval of in-prompt examples,
+IF-WRANER is able to outperform previous SOTA Few-Shot Cross-Domain NER
+approaches. We have demonstrated the effectiveness of our model by benchmarking
+its performance on the open-source CrossNER dataset, on which it shows more
+than a 2% F1-score improvement over the previous SOTA model. We have deployed
+the model for multiple customer care domains of an enterprise. Accurate entity
+prediction through IF-WRANER helps direct customers to automated workflows for
+the domains, thereby reducing escalations to human agents by almost 15% and
+leading to millions of dollars in yearly savings for the company.
+
+
+
+
+
+ + ☆ DivNet: Diversity-Aware Self-Correcting Sequential Recommendation + Networks CIKM + + +
+ As the last stage of a typical recommendation system, collective
+recommendation aims to give the final touches to the recommended items and
+their layout so as to optimize overall objectives such as diversity and
+whole-page relevance. In practice, however, the interaction dynamics among the
+recommended items, their visual appearances and meta-data such as
+specifications are often too complex to be captured by experts' heuristics or
+simple models. To address this issue, we propose a diversity-aware
+self-correcting sequential recommendation network (DivNet) that is able to
+estimate utility by capturing the complex interactions among sequential items
+and diversify recommendations simultaneously. Experiments on both offline and
+online settings demonstrate that DivNet can achieve better results compared to
+baselines with or without collective recommendations.
+
+
+ comment: Published at CIKM +
+
+
+
+
+ + ☆ A Survey on Bundle Recommendation: Methods, Applications, and Challenges + + +
+ In recent years, bundle recommendation systems have gained significant
+attention in both academia and industry due to their ability to enhance user
+experience and increase sales by recommending a set of items as a bundle rather
+than individual items. This survey provides a comprehensive review of bundle
+recommendation, beginning with a taxonomy for exploring product bundling. We
+classify it into two categories based on bundling strategy from various
+application domains, i.e., discriminative and generative bundle recommendation.
+Then we formulate the corresponding tasks of the two categories and
+systematically review their methods: 1) representation learning from bundle and
+item levels and interaction modeling for discriminative bundle recommendation;
+2) representation learning from item level and bundle generation for generative
+bundle recommendation. Subsequently, we survey the resources of bundle
+recommendation including datasets and evaluation metrics, and conduct
+reproducibility experiments on mainstream models. Lastly, we discuss the main
+challenges and highlight the promising future directions in the field of bundle
+recommendation, aiming to serve as a useful resource for researchers and
+practitioners. Our code and datasets are publicly available at
+https://github.com/WUT-IDEA/bundle-recommendation-survey.
+
+
+
+
+
+ + ☆ Beyond Utility: Evaluating LLM as Recommender + + +
+ With the rapid development of Large Language Models (LLMs), recent studies +employed LLMs as recommenders to provide personalized information services for +distinct users. Despite efforts to improve the accuracy of LLM-based +recommendation models, relatively little attention is paid to beyond-utility +dimensions. Moreover, there are unique evaluation aspects of LLM-based +recommendation models, which have been largely ignored. To bridge this gap, we +explore four new evaluation dimensions and propose a multidimensional +evaluation framework. The new evaluation dimensions include: 1) history length +sensitivity, 2) candidate position bias, 3) generation-involved performance, +and 4) hallucinations. All four dimensions have the potential to impact +performance, but are largely unnecessary for consideration in traditional +systems. Using this multidimensional evaluation framework, along with +traditional aspects, we evaluate the performance of seven LLM-based +recommenders, with three prompting strategies, comparing them with six +traditional models on both ranking and re-ranking tasks on four datasets. We +find that LLMs excel at handling tasks with prior knowledge and shorter input +histories in the ranking setting, and perform better in the re-ranking setting, +beating traditional models across multiple dimensions. However, LLMs exhibit +substantial candidate position bias issues, and some models hallucinate +non-existent items much more often than others. We intend our evaluation +framework and observations to benefit future research on the use of LLMs as +recommenders. The code and data are available at +https://github.com/JiangDeccc/EvaLLMasRecommender. + +
+
+
+
+
+ + ☆ Improving Musical Instrument Classification with Advanced Machine + Learning Techniques + + +
+ Musical instrument classification, a key area in Music Information Retrieval, +has gained considerable interest due to its applications in education, digital +music production, and consumer media. Recent advances in machine learning, +specifically deep learning, have enhanced the capability to identify and +classify musical instruments from audio signals. This study applies various +machine learning methods, including Naive Bayes, Support Vector Machines, +Random Forests, Boosting techniques like AdaBoost and XGBoost, as well as deep +learning models such as Convolutional Neural Networks and Artificial Neural +Networks. The effectiveness of these methods is evaluated on the NSynth +dataset, a large repository of annotated musical sounds. By comparing these +approaches, the analysis aims to showcase the advantages and limitations of +each method, providing guidance for developing more accurate and efficient +classification systems. Additionally, hybrid model testing and discussion are +included. This research aims to support further studies in instrument +classification by proposing new approaches and future research directions. + +
+
+ comment: 43 pages, 35 figures, 14 tables +
+
+
+
+
+ + ♻ ☆ LLM Confidence Evaluation Measures in Zero-Shot CSS Classification + + +
+ Assessing classification confidence is critical for leveraging large language
+models (LLMs) in automated labeling tasks, especially in the sensitive domains
+presented by Computational Social Science (CSS) tasks. In this paper, we make
+three key contributions: (1) we propose an uncertainty quantification (UQ)
+performance measure tailored for data annotation tasks, (2) we compare, for the
+first time, five different UQ strategies across three distinct LLMs and CSS
+data annotation tasks, and (3) we introduce a novel UQ aggregation strategy
+that effectively identifies low-confidence LLM annotations and
+disproportionately uncovers data incorrectly labeled by the LLMs. Our results
+demonstrate that our proposed UQ aggregation strategy improves upon existing
+methods and can be used to significantly improve human-in-the-loop data
+annotation processes.
+
+
+
+
+
+ + ♻ ☆ $\texttt{MixGR}$: Enhancing Retriever Generalization for Scientific + Domain through Complementary Granularity EMNLP 2024 + + +
+ Recent studies show the growing significance of document retrieval in the +generation of LLMs, i.e., RAG, within the scientific domain by bridging their +knowledge gap. However, dense retrievers often struggle with domain-specific +retrieval and complex query-document relationships, particularly when query +segments correspond to various parts of a document. To alleviate such prevalent +challenges, this paper introduces $\texttt{MixGR}$, which improves dense +retrievers' awareness of query-document matching across various levels of +granularity in queries and documents using a zero-shot approach. +$\texttt{MixGR}$ fuses various metrics based on these granularities into a unified +score that reflects a comprehensive query-document similarity. Our experiments +demonstrate that $\texttt{MixGR}$ outperforms previous document retrieval by +24.7%, 9.8%, and 6.9% on nDCG@5 with unsupervised, supervised, and LLM-based +retrievers, respectively, averaged on queries containing multiple subqueries +from five scientific retrieval datasets. Moreover, the efficacy of two +downstream scientific question-answering tasks highlights the advantage of +$\texttt{MixGR}$ to boost the application of LLMs in the scientific domain. The +code and experimental datasets are available. +
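The fusion step can be pictured as scoring a query-document pair at several granularities and collapsing those scores into one number. The sketch below uses cosine similarity and a plain mean purely for illustration; $\texttt{MixGR}$'s actual granularity metrics and fusion rule may differ.

```python
# Hedged sketch of multi-granularity scoring followed by a simple fusion
# into one score; the embeddings and the mean fusion are illustrative choices.
import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def fused_score(query_vecs, doc_vecs):
    """query_vecs / doc_vecs: dict granularity -> embedding,
    e.g. {'full': ..., 'subquery_avg': ...} vs {'full': ..., 'proposition_avg': ...}."""
    scores = [cosine(query_vecs[gq], doc_vecs[gd])
              for gq in query_vecs for gd in doc_vecs]
    return float(np.mean(scores))   # one unified score over all granularity pairs

rng = np.random.default_rng(0)
q = {"full": rng.normal(size=8), "subquery_avg": rng.normal(size=8)}
d = {"full": rng.normal(size=8), "proposition_avg": rng.normal(size=8)}
print(fused_score(q, d))
```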
+
+ comment: EMNLP 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ Customizing Language Models with Instance-wise LoRA for Sequential + Recommendation + + +
+ Sequential recommendation systems predict the next interaction item based on +users' past interactions, aligning recommendations with individual preferences. +Leveraging the strengths of Large Language Models (LLMs) in knowledge +comprehension and reasoning, recent approaches seek to apply LLMs to +sequential recommendation. A common paradigm is converting user behavior +sequences into instruction data, and fine-tuning the LLM with +parameter-efficient fine-tuning (PEFT) methods like Low-Rank Adaptation (LoRA). +However, the uniform application of LoRA across diverse user behaviors is +insufficient to capture individual variability, resulting in negative transfer +between disparate sequences. To address these challenges, we propose +Instance-wise LoRA (iLoRA). We innovatively treat the sequential recommendation +task as a form of multi-task learning, integrating LoRA with the Mixture of +Experts (MoE) framework. This approach encourages different experts to capture +various aspects of user behavior. Additionally, we introduce a sequence +representation guided gate function that generates customized expert +participation weights for each user sequence, which allows dynamic parameter +adjustment for instance-wise recommendations. In sequential recommendation, +iLoRA achieves an average relative improvement of 11.4\% over basic LoRA in the +hit ratio metric, with less than a 1\% relative increase in trainable +parameters. Extensive experiments on three benchmark datasets demonstrate the +effectiveness of iLoRA, highlighting its superior performance compared to +existing methods in mitigating negative transfer and improving recommendation +accuracy. Our data and code are available at +https://github.com/AkaliKong/iLoRA. +
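The gating idea can be sketched as follows: a summary of the user's behavior sequence produces softmax weights that mix several LoRA experts into one instance-specific low-rank update. The dimensions, initialization, and gate design below are assumptions for demonstration, not the released iLoRA implementation.

```python
# PyTorch sketch of sequence-conditioned gating over LoRA experts (illustrative).
import torch
import torch.nn as nn

class GatedLoRA(nn.Module):
    def __init__(self, d_model=64, rank=4, n_experts=3):
        super().__init__()
        self.A = nn.Parameter(torch.randn(n_experts, d_model, rank) * 0.02)
        self.B = nn.Parameter(torch.zeros(n_experts, rank, d_model))
        self.gate = nn.Linear(d_model, n_experts)

    def forward(self, x, seq_repr):
        # seq_repr: (batch, d_model) summary of the user's behavior sequence
        w = torch.softmax(self.gate(seq_repr), dim=-1)         # (batch, n_experts)
        delta = torch.einsum("edr,erk->edk", self.A, self.B)   # per-expert low-rank update
        mixed = torch.einsum("be,edk->bdk", w, delta)          # instance-wise update
        return x + torch.einsum("bsd,bdk->bsk", x, mixed)      # x: (batch, seq, d_model)

layer = GatedLoRA()
x = torch.randn(2, 5, 64)
print(layer(x, x.mean(dim=1)).shape)   # torch.Size([2, 5, 64])
```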
+
+
+
+
+ + ♻ ☆ LLM-ESR: Large Language Models Enhancement for Long-tailed Sequential + Recommendation + + +
+ Sequential recommender systems (SRS) aim to predict users' subsequent choices +based on their historical interactions and have found applications in diverse +fields such as e-commerce and social media. However, in real-world systems, +most users interact with only a handful of items, while the majority of items +are seldom consumed. These two issues, known as the long-tail user and +long-tail item challenges, often pose difficulties for existing SRS. These +challenges can adversely affect user experience and seller benefits, making +them crucial to address. Though a few works have addressed the challenges, they +still struggle with the seesaw or noisy issues due to the intrinsic scarcity of +interactions. The advancements in large language models (LLMs) present a +promising solution to these problems from a semantic perspective. As one of the +pioneers in this field, we propose the Large Language Models Enhancement +framework for Sequential Recommendation (LLM-ESR). This framework utilizes +semantic embeddings derived from LLMs to enhance SRS without adding extra +inference load from LLMs. To address the long-tail item challenge, we design a +dual-view modeling framework that combines semantics from LLMs and +collaborative signals from conventional SRS. For the long-tail user challenge, +we propose a retrieval augmented self-distillation method to enhance user +preference representation using more informative interactions from similar +users. To verify the effectiveness and versatility of our proposed enhancement +framework, we conduct extensive experiments on three real-world datasets using +three popular SRS models. The results show that our method surpasses existing +baselines consistently, and benefits long-tail users and items especially. The +implementation code is available at +https://github.com/Applied-Machine-Learning-Lab/LLM-ESR. + +
+
+ comment: accepted by NeurIPS'24 (Spotlight) +
+
+
+
+
+ + ♻ ☆ MACRec: a Multi-Agent Collaboration Framework for Recommendation SIGIR2024 + + +
+ LLM-based agents have gained considerable attention for their decision-making +skills and ability to handle complex tasks. Recognizing the current gap in +leveraging agent capabilities for multi-agent collaboration in recommendation +systems, we introduce MACRec, a novel framework designed to enhance +recommendation systems through multi-agent collaboration. Unlike existing work +on using agents for user/item simulation, we aim to deploy multi-agents to +tackle recommendation tasks directly. In our framework, recommendation tasks +are addressed through the collaborative efforts of various specialized agents, +including Manager, User/Item Analyst, Reflector, Searcher, and Task +Interpreter, with different working flows. Furthermore, we provide application +examples of how developers can easily use MACRec on various recommendation +tasks, including rating prediction, sequential recommendation, conversational +recommendation, and explanation generation of recommendation results. The +framework and demonstration video are publicly available at +https://github.com/wzf2000/MACRec. + +
+
+ comment: Accepted by SIGIR2024 +
+
+
+
+
+ + ♻ ☆ Unveiling User Satisfaction and Creator Productivity Trade-Offs in + Recommendation Platforms + + +
+ On User-Generated Content (UGC) platforms, recommendation algorithms +significantly impact creators' motivation to produce content as they compete +for algorithmically allocated user traffic. This phenomenon subtly shapes the +volume and diversity of the content pool, which is crucial for the platform's +sustainability. In this work, we demonstrate, both theoretically and +empirically, that a purely relevance-driven policy with low exploration +strength boosts short-term user satisfaction but undermines the long-term +richness of the content pool. In contrast, a more aggressive exploration policy +may slightly compromise user satisfaction but promote higher content creation +volume. Our findings reveal a fundamental trade-off between immediate user +satisfaction and overall content production on UGC platforms. Building on this +finding, we propose an efficient optimization method to identify the optimal +exploration strength, balancing user and creator engagement. Our model can +serve as a pre-deployment audit tool for recommendation algorithms on UGC +platforms, helping to align their immediate objectives with sustainable, +long-term goals. + +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ Unified Generative and Discriminative Training for Multi-modal Large + Language Models + + +
+ In recent times, Vision-Language Models (VLMs) have been trained under two +predominant paradigms. Generative training has enabled Multimodal Large +Language Models (MLLMs) to tackle various complex tasks, yet issues such as +hallucinations and weak object discrimination persist. Discriminative training, +exemplified by models like CLIP, excels in zero-shot image-text classification +and retrieval, yet struggles with complex scenarios requiring fine-grained +semantic differentiation. This paper addresses these challenges by proposing a +unified approach that integrates the strengths of both paradigms. Considering +interleaved image-text sequences as the general format of input samples, we +introduce a structure-induced training strategy that imposes semantic +relationships between input samples and the MLLM's hidden state. This approach +enhances the MLLM's ability to capture global semantics and distinguish +fine-grained semantics. By leveraging dynamic sequence alignment within the +Dynamic Time Warping framework and integrating a novel kernel for fine-grained +semantic differentiation, our method effectively balances generative and +discriminative tasks. Extensive experiments demonstrate the effectiveness of +our approach, achieving state-of-the-art results in multiple generative tasks, +especially those requiring cognitive and discrimination abilities. +Additionally, our method surpasses discriminative benchmarks in interleaved and +fine-grained retrieval tasks. By employing a retrieval-augmented generation +strategy, our approach further enhances performance in some generative tasks +within one model, offering a promising direction for future research in +vision-language modeling. + +
+
+
+
+
+ + ♻ ☆ Towards Robust Multimodal Sentiment Analysis with Incomplete Data NeurIPS 2024 + + +
+ The field of Multimodal Sentiment Analysis (MSA) has recently witnessed an +emerging direction seeking to tackle the issue of data incompleteness. +Recognizing that the language modality typically contains dense sentiment +information, we consider it as the dominant modality and present an innovative +Language-dominated Noise-resistant Learning Network (LNLN) to achieve robust +MSA. The proposed LNLN features a dominant modality correction (DMC) module and +dominant modality based multimodal learning (DMML) module, which enhances the +model's robustness across various noise scenarios by ensuring the quality of +dominant modality representations. Aside from the methodical design, we perform +comprehensive experiments under random data missing scenarios, utilizing +diverse and meaningful settings on several popular datasets (\textit{e.g.,} +MOSI, MOSEI, and SIMS), providing additional uniformity, transparency, and +fairness compared to existing evaluations in the literature. Empirically, LNLN +consistently outperforms existing baselines, demonstrating superior performance +across these challenging and extensive evaluation metrics. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 25 + +
+
+
+ + ☆ Content Aware Analysis of Scholarly Networks: A Case Study on CORD19 + Dataset + + +
+ This paper investigates the relationships among the key elements of a scientific +research network, namely articles, researchers, and journals. We introduce a +novel approach that uses semantic information through HITS-algorithm-based +propagation of topic information in the network. The topic information is +derived using Named Entity Recognition and Entity Linkage. In our case, +MedCAT is used to extract the topics from the CORD19 Dataset, a corpus +of academic articles about COVID-19 and the coronavirus scientific network. Our +approach focuses on the COVID-19 domain, utilizing the CORD-19 dataset to +demonstrate the efficacy of integrating topic-related information within the +citation framework. Through the application of a hybrid HITS algorithm, we show +that incorporating topic data significantly influences article rankings, +revealing deeper insights into the structure of the academic community. +
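One way to picture a hybrid HITS variant is to run the standard hub/authority iteration on a citation graph whose edges are weighted by topic overlap. The toy adjacency and similarity matrices below are invented for illustration and do not reflect the paper's exact weighting scheme.

```python
# Topic-weighted HITS sketch: hub/authority iteration on citation edges that
# are reweighted by a topic-overlap score between citing and cited articles.
import numpy as np

def topic_weighted_hits(adj, topic_sim, iters=50):
    W = adj * topic_sim                      # weight each citation edge by topic overlap
    hubs = np.ones(W.shape[0])
    auths = np.ones(W.shape[0])
    for _ in range(iters):
        auths = W.T @ hubs
        hubs = W @ auths
        auths /= np.linalg.norm(auths)
        hubs /= np.linalg.norm(hubs)
    return hubs, auths

adj = np.array([[0, 1, 1],
                [0, 0, 1],
                [0, 0, 0]], dtype=float)     # article i cites article j
topic_sim = np.array([[1.0, 0.8, 0.2],
                      [0.8, 1.0, 0.5],
                      [0.2, 0.5, 1.0]])
print(topic_weighted_hits(adj, topic_sim))
```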
+
+
+
+
+ + ☆ Building Multi-Agent Copilot towards Autonomous Agricultural Data + Management and Analysis + + +
+ Current agricultural data management and analysis paradigms are to a large +extent traditional, in which data collection, curation, integration, loading, +storage, sharing and analysis still involve too much human effort and +know-how. Experts, researchers and farm operators need to understand +the data and the whole data management pipeline to make full use of +the data. The essential problem of the traditional paradigm is the lack of a +layer of orchestrational intelligence which can understand, organize and +coordinate the data processing utilities to maximize data management and +analysis outcomes. The emerging reasoning and tool-mastering abilities of large +language models (LLMs) make them a potentially good fit for this role, which +helps shift the traditional user-driven paradigm to an AI-driven one. +In this paper, we propose and explore the idea of an LLM-based copilot for +autonomous agricultural data management and analysis. Based on our previously +developed platform for Agricultural Data Management and Analytics (ADMA), we +build a proof-of-concept multi-agent system called ADMA Copilot, which can +understand the user's intent, make plans for the data processing pipeline and +accomplish tasks automatically, in which three agents (an LLM-based +controller, an input formatter and an output formatter) collaborate. +Different from existing LLM-based solutions, by defining a meta-program graph, +our work decouples control flow and data flow to enhance the predictability of +the behaviour of the agents. Experiments demonstrate the intelligence, +autonomy, efficacy, efficiency, extensibility, flexibility and privacy of our +system. A comparison is also made between ours and existing systems to show the +superiority and potential of our approach. +
+
+
+
+
+ + ☆ PSL: Rethinking and Improving Softmax Loss from Pairwise Perspective for + Recommendation + + +
+ Softmax Loss (SL) is widely applied in recommender systems (RS) and has +demonstrated effectiveness. This work analyzes SL from a pairwise perspective, +revealing two significant limitations: 1) the relationship between SL and +conventional ranking metrics like DCG is not sufficiently tight; 2) SL is +highly sensitive to false negative instances. Our analysis indicates that these +limitations are primarily due to the use of the exponential function. To +address these issues, this work extends SL to a new family of loss functions, +termed Pairwise Softmax Loss (PSL), which replaces the exponential function in +SL with other appropriate activation functions. While the revision is minimal, +we highlight three merits of PSL: 1) it serves as a tighter surrogate for DCG +with suitable activation functions; 2) it better balances data contributions; +and 3) it acts as a specific BPR loss enhanced by Distributionally Robust +Optimization (DRO). We further validate the effectiveness and robustness of PSL +through empirical experiments. The code is available at +https://github.com/Tiny-Snow/IR-Benchmark. + +
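In the pairwise view, the sampled softmax loss can be written as log(1 + sum_j f(s_neg_j - s_pos)) with f = exp, and PSL swaps f for another activation. The sketch below shows only this substitution; the choice of ReLU is one illustrative option, not necessarily the activation the paper recommends.

```python
# Pairwise view of softmax loss, with the exponential replaceable by another
# activation (illustrative sketch of the PSL idea, not the paper's code).
import torch

def pairwise_softmax_loss(pos_scores, neg_scores, activation=torch.exp):
    """pos_scores: (batch,), neg_scores: (batch, n_neg)."""
    diffs = neg_scores - pos_scores.unsqueeze(1)        # pairwise score gaps
    return torch.log1p(activation(diffs).sum(dim=1)).mean()

pos = torch.tensor([2.0, 1.5])
neg = torch.tensor([[1.0, 0.5, 1.8], [2.0, 0.1, 0.3]])
print(pairwise_softmax_loss(pos, neg))                  # standard sampled softmax loss
print(pairwise_softmax_loss(pos, neg, torch.relu))      # one PSL-style variant
```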
+
+
+
+
+ + ☆ Cost-Aware Query Policies in Active Learning for Efficient Autonomous + Robotic Exploration + + +
+ In missions constrained by finite resources, efficient data collection is +critical. Informative path planning, driven by automated decision-making, +optimizes exploration by reducing the costs associated with accurate +characterization of a target in an environment. Previous implementations of +active learning did not consider the action cost for regression problems or +only considered the action cost for classification problems. This paper +analyzes an AL algorithm for Gaussian Process regression while incorporating +action cost. The algorithm's performance is compared on various regression +problems, including terrain mapping on diverse simulated surfaces, along the metrics +of root mean square error, samples and distance until convergence, and model +variance upon convergence. The cost-dependent acquisition policy does not +organically optimize information gain over distance. Instead, the traditional +uncertainty metric with a distance constraint best minimizes root-mean-square +error over trajectory distance. This study's impact is to provide insight into +incorporating action cost with AL methods to optimize exploration under +realistic mission constraints. +
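The policy the abstract singles out, predictive uncertainty subject to a travel-distance constraint, can be sketched in a few lines. The Gaussian process setup, the candidate grid, and the budget below are illustrative assumptions rather than the paper's experimental configuration.

```python
# Sketch of "most uncertain reachable point" selection for GP-based exploration.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor

def next_query(gp, candidates, position, max_travel):
    dists = np.linalg.norm(candidates - position, axis=1)
    feasible = candidates[dists <= max_travel]          # distance-constrained candidates
    if len(feasible) == 0:
        return None
    _, std = gp.predict(feasible, return_std=True)
    return feasible[np.argmax(std)]                     # most uncertain reachable point

rng = np.random.default_rng(1)
X = rng.uniform(0, 10, size=(15, 2))
y = np.sin(X[:, 0]) + np.cos(X[:, 1])
gp = GaussianProcessRegressor().fit(X, y)
cands = rng.uniform(0, 10, size=(200, 2))
print(next_query(gp, cands, position=np.array([5.0, 5.0]), max_travel=3.0))
```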
+
+
+
+
+ + ☆ Length-Induced Embedding Collapse in Transformer-based Models + + +
+ Text embeddings enable various applications, but their performance +deteriorates on longer texts. In this paper, we find that the performance +degradation is due to a phenomenon called Length Collapse, where longer text +embeddings collapse into a narrow space. This collapse results in a +distributional inconsistency between embeddings of different text lengths, +ultimately hurting the performance of downstream tasks. Theoretically, by +considering that the self-attention mechanism inherently functions as a low-pass +filter, we prove that long sequences increase the attenuation rate of the +low-pass filter effect of the self-attention mechanism. With layers going +deeper, excessive low-pass filtering causes the token signals to retain only +their Direct-Current (DC) component, which means the input token feature maps +will collapse into a narrow space, especially in long texts. Based on the above +analysis, we propose to mitigate the undesirable length collapse limitation by +introducing a temperature in softmax(), which achieves a higher low-pass filter +attenuation rate. The tuning-free method, called TempScale, can be plugged into +multiple transformer-based embedding models. Empirically, we demonstrate that +TempScale can improve existing embedding models, especially on long text +inputs, bringing up to 0.53% performance gains on 40 datasets from Massive Text +Embedding Benchmark (MTEB) and 0.82% performance gains on 4 datasets from +LongEmbed, which specifically focuses on long context retrieval. +
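Mechanically, the proposed fix amounts to inserting a temperature into the attention softmax. The sketch below shows only that mechanism; the appropriate value (and direction) of the temperature follows from the paper's analysis, and tau here is just a placeholder knob.

```python
# Temperature-scaled self-attention sketch (mechanism only, not TempScale's tuning).
import math
import torch

def attention_with_temperature(q, k, v, tau=1.0):
    d = q.size(-1)
    logits = q @ k.transpose(-2, -1) / (math.sqrt(d) * tau)   # temperature-scaled logits
    return torch.softmax(logits, dim=-1) @ v

q = k = v = torch.randn(1, 6, 16)
out_default = attention_with_temperature(q, k, v)             # standard attention
out_scaled = attention_with_temperature(q, k, v, tau=0.5)     # sharper attention weights
print(out_default.shape, out_scaled.shape)
```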
+
+
+
+
+ + ☆ Investigating Bias in Political Search Query Suggestions by Relative + Comparison with LLMs + + +
+ Search query suggestions affect users' interactions with search engines, +which then influences the information they encounter. Thus, bias in search +query suggestions can lead to exposure to biased search results and can impact +opinion formation. This is especially critical in the political domain. +Detecting and quantifying bias in web search engines is difficult due to its +topic dependency, complexity, and subjectivity. The lack of context and +phrasality of query suggestions emphasizes this problem. In a multi-step +approach, we combine the benefits of large language models, pairwise +comparison, and Elo-based scoring to identify and quantify bias in English +search query suggestions. We apply our approach to the U.S. political news +domain and compare bias in Google and Bing. + +
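The Elo component of such a pipeline can be illustrated directly: each LLM-judged pairwise comparison updates the ratings of the two query suggestions involved, and the final ratings place suggestions on a bias scale. The K-factor and the example outcomes below are arbitrary illustrations, not values from the study.

```python
# Elo-style scoring from pairwise "which suggestion is more biased" judgments.
def elo_update(r_a, r_b, outcome, k=32):
    """outcome = 1 if A is judged more biased than B, 0 otherwise, 0.5 for a tie."""
    expected_a = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400))
    return r_a + k * (outcome - expected_a), r_b + k * ((1 - outcome) - (1 - expected_a))

ratings = {"suggestion_1": 1000.0, "suggestion_2": 1000.0, "suggestion_3": 1000.0}
comparisons = [("suggestion_1", "suggestion_2", 1), ("suggestion_3", "suggestion_1", 0.5)]
for a, b, outcome in comparisons:
    ratings[a], ratings[b] = elo_update(ratings[a], ratings[b], outcome)
print(ratings)
```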
+
+
+
+
+ + ☆ Leveraging Large Language Models for Medical Information Extraction and + Query Generation + + +
+ This paper introduces a system that integrates large language models (LLMs) +into the clinical trial retrieval process, enhancing the effectiveness of +matching patients with eligible trials while maintaining information privacy +and allowing expert oversight. We evaluate six LLMs for query generation, +focusing on open-source and relatively small models that require minimal +computational resources. Our evaluation includes two closed-source and four +open-source models, with one specifically trained in the medical field and five +general-purpose models. We compare the retrieval effectiveness achieved by +LLM-generated queries against those created by medical experts and +state-of-the-art methods from the literature. Our findings indicate that the +evaluated models reach retrieval effectiveness on par with or greater than +expert-created queries. The LLMs consistently outperform standard baselines and +other approaches in the literature. The best performing LLMs exhibit fast +response times, ranging from 1.7 to 8 seconds, and generate a manageable number +of query terms (15-63 on average), making them suitable for practical +implementation. Our overall findings suggest that leveraging small, open-source +LLMs for clinical trials retrieval can balance performance, computational +efficiency, and real-world applicability in medical settings. + +
+
+ comment: Accepted in WI-IAT '24 +
+
+
+
+
+ + ☆ Auditing Google's Search Algorithm: Measuring News Diversity Across + Brazil, the UK, and the US + + +
+ This study examines the influence of Google's search algorithm on news +diversity by analyzing search results in Brazil, the UK, and the US. It +explores how Google's system preferentially favors a limited number of news +outlets. Utilizing algorithm auditing techniques, the research measures source +concentration with the Herfindahl-Hirschman Index (HHI) and Gini coefficient, +revealing significant concentration trends. The study underscores the +importance of conducting horizontal analyses across multiple search queries, as +focusing solely on individual results pages may obscure these patterns. Factors +such as popularity, political bias, and recency were evaluated for their impact +on news rankings. Findings indicate a slight leftward bias in search outcomes +and a preference for popular, often national outlets. This bias, combined with +a tendency to prioritize recent content, suggests that Google's algorithm may +reinforce existing media inequalities. By analyzing the largest dataset to date +-- 221,863 search results -- this research provides comprehensive, longitudinal +insights into how algorithms shape public access to diverse news sources. + +
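Both concentration measures are standard and straightforward to reproduce; the sketch below computes them over invented per-outlet result counts (the numbers are not from the study).

```python
# Reference computation of HHI and the Gini coefficient over outlet counts.
import numpy as np

def hhi(shares):
    """Herfindahl-Hirschman Index on shares summing to 1 (0 = dispersed, 1 = monopoly)."""
    s = np.asarray(shares, dtype=float)
    return float(np.sum(s ** 2))

def gini(values):
    """Gini coefficient of result counts per outlet (0 = equal, near 1 = concentrated)."""
    v = np.sort(np.asarray(values, dtype=float))
    n = len(v)
    cum = np.cumsum(v)
    return float((n + 1 - 2 * np.sum(cum) / cum[-1]) / n)

result_counts = [120, 80, 40, 10, 5]            # hypothetical search results per outlet
shares = np.array(result_counts) / sum(result_counts)
print(hhi(shares), gini(result_counts))
```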
+
+ comment: 21 pages, 3 figures, 7 tables +
+
+
+
+
+ + ☆ Beyond Content Relevance: Evaluating Instruction Following in Retrieval + Models + + +
+ Instruction-following capabilities in large language models (LLMs) have +significantly progressed, enabling more complex user interactions through +detailed prompts. However, retrieval systems have not matched these advances; +most of them still rely on traditional lexical and semantic matching +techniques that fail to fully capture user intent. Recent efforts have +introduced instruction-aware retrieval models, but these primarily focus on +intrinsic content relevance, which neglects the importance of customized +preferences for broader document-level attributes. This study evaluates the +instruction-following capabilities of various retrieval models beyond content +relevance, including LLM-based dense retrieval and reranking models. We develop +InfoSearch, a novel retrieval evaluation benchmark spanning six document-level +attributes: Audience, Keyword, Format, Language, Length, and Source, and +introduce novel metrics -- Strict Instruction Compliance Ratio (SICR) and +Weighted Instruction Sensitivity Evaluation (WISE) -- to accurately assess the +models' responsiveness to instructions. Our findings reveal that while +reranking models generally surpass retrieval models in instruction following, +they still face challenges in handling certain attributes. Moreover, although +instruction fine-tuning and increased model size lead to better performance, +most models fall short of achieving comprehensive instruction compliance as +assessed by our benchmark. +
+
+
+
+
+ + ☆ Identify Then Recommend: Towards Unsupervised Group Recommendation + + +
+ Group Recommendation (GR), which aims to recommend items to groups of users, +has become a promising and practical direction for recommendation systems. This +paper points out two issues of the state-of-the-art GR models. (1) The +pre-defined and fixed number of user groups is inadequate for real-time +industrial recommendation systems, where the group distribution can shift +dynamically. (2) The training schema of existing GR methods is supervised, +necessitating expensive user-group and group-item labels, leading to +significant annotation costs. To this end, we present a novel unsupervised +group recommendation framework named \underline{I}dentify \underline{T}hen +\underline{R}ecommend (\underline{ITR}), where it first identifies the user +groups in an unsupervised manner even without the pre-defined number of groups, +and then two pre-text tasks are designed to conduct self-supervised group +recommendation. Concretely, at the group identification stage, we first +estimate the adaptive density of each user point, where areas with higher +densities are more likely to be recognized as group centers. Then, a heuristic +merge-and-split strategy is designed to discover the user groups and decision +boundaries. Subsequently, at the self-supervised learning stage, the +pull-and-repulsion pre-text task is proposed to optimize the user-group +distribution. Besides, the pseudo group recommendation pre-text task is +designed to assist the recommendations. Extensive experiments demonstrate the +superiority and effectiveness of ITR on both user recommendation (e.g., 22.22\% +NDCG@5 $\uparrow$) and group recommendation (e.g., 22.95\% NDCG@5 $\uparrow$). +Furthermore, we deploy ITR on the industrial recommender and achieve promising +results. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ MoTaDual: Modality-Task Dual Alignment for Enhanced Zero-shot Composed + Image Retrieval + + +
+ Composed Image Retrieval (CIR) is a challenging vision-language task, +utilizing bi-modal (image+text) queries to retrieve target images. Despite the +impressive performance of supervised CIR, the dependence on costly, +manually-labeled triplets limits its scalability and zero-shot capability. To +address this issue, zero-shot composed image retrieval (ZS-CIR) is presented +along with projection-based approaches. However, such methods face two major +problems, i.e., task discrepancy between pre-training (image $\leftrightarrow$ +text) and inference (image+text $\rightarrow$ image), and modality discrepancy. +The latter pertains to approaches based on text-only projection training due to +the necessity of feature extraction from the reference image during inference. +In this paper, we propose a two-stage framework to tackle both discrepancies. +First, to ensure efficiency and scalability, a textual inversion network is +pre-trained on large-scale caption datasets. Subsequently, we put forward +Modality-Task Dual Alignment (MoTaDual) as the second stage, where +large-language models (LLMs) generate triplet data for fine-tuning, and +additionally, prompt learning is introduced in a multi-modal context to +effectively alleviate both modality and task discrepancies. The experimental +results show that our MoTaDual achieves the state-of-the-art performance across +four widely used ZS-CIR benchmarks, while maintaining low training time and +computational cost. The code will be released soon. + +
+
+
+
+
+ + ☆ Towards Cross-Modal Text-Molecule Retrieval with Better Modality + Alignment + + +
+ A cross-modal text-molecule retrieval model aims to learn a shared feature +space of the text and molecule modalities for accurate similarity calculation, +which facilitates the rapid screening of molecules with specific properties and +activities in drug design. However, previous works have two main defects. +First, they are inadequate in capturing modality-shared features considering +the significant gap between text sequences and molecule graphs. Second, they +mainly rely on contrastive learning and adversarial training for cross-modality +alignment, both of which mainly focus on the first-order similarity, ignoring +the second-order similarity that can capture more structural information in the +embedding space. To address these issues, we propose a novel cross-modal +text-molecule retrieval model with two-fold improvements. Specifically, on +top of two modality-specific encoders, we stack a memory bank based feature +projector that contains learnable memory vectors to extract modality-shared +features better. More importantly, during the model training, we calculate four +kinds of similarity distributions (text-to-text, text-to-molecule, +molecule-to-molecule, and molecule-to-text similarity distributions) for each +instance, and then minimize the distance between these similarity distributions +(namely second-order similarity losses) to enhance cross-modal alignment. +Experimental results and analysis strongly demonstrate the effectiveness of our +model. Particularly, our model achieves SOTA performance, outperforming the +previously-reported best result by 6.4%. +
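A simplified stand-in for one of the second-order terms: compare, for each text anchor, its softmax similarity distribution over other texts with its distribution over the paired molecules, and penalize their divergence. The temperature, the KL direction, and the use of only one of the four distribution pairs are assumptions made here for brevity.

```python
# Sketch of a "second-order" alignment term between two similarity distributions.
import torch
import torch.nn.functional as F

def second_order_loss(text_emb, mol_emb, tau=0.1):
    text_emb = F.normalize(text_emb, dim=-1)
    mol_emb = F.normalize(mol_emb, dim=-1)
    p_tt = F.softmax(text_emb @ text_emb.T / tau, dim=-1)        # text-to-text distribution
    log_p_tm = F.log_softmax(text_emb @ mol_emb.T / tau, dim=-1) # text-to-molecule distribution
    return F.kl_div(log_p_tm, p_tt, reduction="batchmean")

text = torch.randn(8, 32)
mol = torch.randn(8, 32)
print(second_order_loss(text, mol))
```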
+
+ comment: BIBM 2024 regular paper +
+
+
+
+
+ + ♻ ☆ Reasoning and Tools for Human-Level Forecasting + + +
+ Language models (LMs) trained on web-scale datasets are largely successful +due to their ability to memorize large amounts of training data, even if only +present in a few examples. These capabilities are often desirable in evaluation +on tasks such as question answering but raise questions about whether these +models can exhibit genuine reasoning or succeed only at mimicking patterns from +the training data. This distinction is particularly salient in forecasting +tasks, where the answer is not present in the training data, and the model must +reason to make logical deductions. We present Reasoning and Tools for +Forecasting (RTF), a framework of reasoning-and-acting (ReAct) agents that can +dynamically retrieve updated information and run numerical simulation with +equipped tools. We evaluate our model with questions from competitive +forecasting platforms and demonstrate that our method is competitive with and +can outperform human predictions. This suggests that LMs, with the right tools, +can indeed think and adapt like humans, offering valuable insights for +real-world decision-making. + +
+
+
+
+
+ + ♻ ☆ User-Creator Feature Polarization in Recommender Systems with Dual + Influence NeurIPS 2024 + + +
+ Recommender systems serve the dual purpose of presenting relevant content to +users and helping content creators reach their target audience. The dual nature +of these systems naturally influences both users and creators: users' +preferences are affected by the items they are recommended, while creators may +be incentivized to alter their content to attract more users. We define a +model, called user-creator feature dynamics, to capture the dual influence of +recommender systems. We prove that a recommender system with dual influence is +guaranteed to polarize, causing diversity loss in the system. We then +investigate, both theoretically and empirically, approaches for mitigating +polarization and promoting diversity in recommender systems. Unexpectedly, we +find that common diversity-promoting approaches do not work in the presence of +dual influence, while relevancy-optimizing methods like top-$k$ truncation can +prevent polarization and improve diversity of the system. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Multi-Group Proportional Representation in Retrieval NeurIPS 2024 + + +
+ Image search and retrieval tasks can perpetuate harmful stereotypes, erase +cultural identities, and amplify social disparities. Current approaches to +mitigate these representational harms balance the number of retrieved items +across population groups defined by a small number of (often binary) +attributes. However, most existing methods overlook intersectional groups +determined by combinations of group attributes, such as gender, race, and +ethnicity. We introduce Multi-Group Proportional Representation (MPR), a novel +metric that measures representation across intersectional groups. We develop +practical methods for estimating MPR, provide theoretical guarantees, and +propose optimization algorithms to ensure MPR in retrieval. We demonstrate that +existing methods optimizing for equal and proportional representation metrics +may fail to promote MPR. Crucially, our work shows that optimizing MPR yields +more proportional representation across multiple intersectional groups +specified by a rich function class, often with minimal compromise in retrieval +accuracy. + +
+
+ comment: 48 pages, 33 figures. Accepted as poster at NeurIPS 2024. Code can be + found at + https://github.com/alex-oesterling/multigroup-proportional-representation +
+
+
+
+
+ + ♻ ☆ Generating Multi-Aspect Queries for Conversational Search + + +
+ Conversational information seeking (CIS) systems aim to model the user's +information need within the conversational context and retrieve the relevant +information. One major approach to modeling the conversational context aims to +rewrite the user utterance in the conversation to represent the information +need independently. Recent work has shown the benefit of expanding the +rewritten utterance with relevant terms. In this work, we hypothesize that +breaking down the information of an utterance into multi-aspect rewritten +queries can lead to more effective retrieval performance. This is more evident +in more complex utterances that require gathering evidence from various +information sources, where a single query rewrite or query representation +cannot capture the complexity of the utterance. To test this hypothesis, we +conduct extensive experiments on five widely used CIS datasets where we +leverage LLMs to generate multi-aspect queries to represent the information +need for each utterance in multiple query rewrites. We show that, for most of +the utterances, the same retrieval model would perform better with more than +one rewritten query by 85% in terms of nDCG@3. We further propose a +multi-aspect query generation and retrieval framework, called MQ4CS. Our +extensive experiments show that MQ4CS outperforms the state-of-the-art query +rewriting methods. We make our code and our new dataset of generated +multi-aspect queries publicly available. + +
+
+
+
+
+ + ♻ ☆ FinQAPT: Empowering Financial Decisions with End-to-End LLM-driven + Question Answering Pipeline + + +
+ Financial decision-making hinges on the analysis of relevant information +embedded in the enormous volume of documents in the financial domain. To +address this challenge, we developed FinQAPT, an end-to-end pipeline that +streamlines the identification of relevant financial reports based on a query, +extracts pertinent context, and leverages Large Language Models (LLMs) to +perform downstream tasks. To evaluate the pipeline, we experimented with +various techniques to optimize the performance of each module using the FinQA +dataset. We introduced a novel clustering-based negative sampling technique to +enhance context extraction and a novel prompting method called Dynamic N-shot +Prompting to boost the numerical question-answering capabilities of LLMs. At +the module level, we achieved state-of-the-art accuracy on FinQA, attaining an +accuracy of 80.6%. However, at the pipeline level, we observed decreased +performance due to challenges in extracting relevant context from financial +reports. We conducted a detailed error analysis of each module and the +end-to-end pipeline, pinpointing specific challenges that must be addressed to +develop a robust solution for handling complex financial tasks. + +
+
+ comment: Accepted in ICAIF 2024, 8 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ An engine not a camera: Measuring performative power of online search NeurIPS 2024 + + +
+ The power of digital platforms is at the center of major ongoing policy and +regulatory efforts. To advance existing debates, we designed and executed an +experiment to measure the performative power of online search providers. +Instantiated in our setting, performative power quantifies the ability of a +search engine to steer web traffic by rearranging results. To operationalize +this definition we developed a browser extension that performs unassuming +randomized experiments in the background. These randomized experiments emulate +updates to the search algorithm and identify the causal effect of different +content arrangements on clicks. Analyzing tens of thousands of clicks, we +discuss what our robust quantitative findings say about the power of online +search engines, using the Google Shopping antitrust investigation as a case +study. More broadly, we envision our work to serve as a blueprint for how the +recent definition of performative power can help integrate quantitative +insights from online experiments with future investigations into the economic +power of digital platforms. + +
+
+ comment: to appear at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Breaking the Hourglass Phenomenon of Residual Quantization: Enhancing + the Upper Bound of Generative Retrieval + + +
+ Generative retrieval (GR) has emerged as a transformative paradigm in search +and recommender systems, leveraging numeric-based identifier representations to +enhance efficiency and generalization. Notably, methods like TIGER, employing +Residual Quantization-based Semantic Identifiers (RQ-SID), have shown +significant promise in e-commerce scenarios by effectively managing item IDs. +However, a critical issue, termed the "\textbf{Hourglass}" phenomenon, occurs in +RQ-SID, where intermediate codebook tokens become overly concentrated, +hindering the full utilization of generative retrieval methods. This paper +analyses and addresses this problem by identifying data sparsity and +long-tailed distribution as the primary causes. Through comprehensive +experiments and detailed ablation studies, we analyze the impact of these +factors on codebook utilization and data distribution. Our findings reveal that +the "Hourglass" phenomenon substantially impacts the performance of RQ-SID in +generative retrieval. We propose effective solutions to mitigate this issue, +thereby significantly enhancing the effectiveness of generative retrieval in +real-world E-commerce applications. +
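Residual quantization itself, plus a simple per-level codebook-usage check that would expose an "Hourglass"-style collapse, can be sketched as follows. The random codebooks and embeddings below are toys, so they will not exhibit the phenomenon; the snippet only shows the diagnostic.

```python
# Toy residual quantization and a per-level codebook usage count.
import numpy as np

def rq_encode(x, codebooks):
    residual, codes = x.copy(), []
    for cb in codebooks:                                   # cb: (K, d)
        idx = np.argmin(((residual[:, None, :] - cb[None]) ** 2).sum(-1), axis=1)
        codes.append(idx)
        residual = residual - cb[idx]
    return np.stack(codes, axis=1)                         # (n_items, n_levels)

rng = np.random.default_rng(0)
items = rng.normal(size=(500, 16))
codebooks = [rng.normal(size=(32, 16)) for _ in range(3)]
codes = rq_encode(items, codebooks)
for level in range(codes.shape[1]):
    used = len(np.unique(codes[:, level]))
    print(f"level {level}: {used}/32 codes used")   # a collapsed middle level = hourglass
```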
+
+
+
+
+ + ♻ ☆ UDA: A Benchmark Suite for Retrieval Augmented Generation in Real-world + Document Analysis + + +
+ The use of Retrieval-Augmented Generation (RAG) has improved Large Language +Models (LLMs) in collaborating with external data, yet significant challenges +exist in real-world scenarios. In areas such as academic literature and finance +question answering, data are often found in raw text and tables in HTML or PDF +formats, which can be lengthy and highly unstructured. In this paper, we +introduce a benchmark suite, namely Unstructured Document Analysis (UDA), that +involves 2,965 real-world documents and 29,590 expert-annotated Q&A pairs. We +revisit popular LLM- and RAG-based solutions for document analysis and evaluate +the design choices and answer qualities across multiple document domains and +diverse query types. Our evaluation yields interesting findings and highlights +the importance of data parsing and retrieval. We hope our benchmark can shed +light and better serve real-world document analysis applications. The benchmark +suite and code can be found at https://github.com/qinchuanhui/UDA-Benchmark. + +
+
+
+
+
+ + ♻ ☆ End-to-end Learnable Clustering for Intent Learning in Recommendation + + +
+ Intent learning, which aims to learn users' intents for user understanding +and item recommendation, has become a hot research spot in recent years. +However, existing methods suffer from complex and cumbersome alternating +optimization, limiting performance and scalability. To this end, we propose a +novel intent learning method termed \underline{ELCRec}, by unifying behavior +representation learning into an \underline{E}nd-to-end \underline{L}earnable +\underline{C}lustering framework, for effective and efficient +\underline{Rec}ommendation. Concretely, we encode user behavior sequences and +initialize the cluster centers (latent intents) as learnable neurons. Then, we +design a novel learnable clustering module to separate different cluster +centers, thus decoupling users' complex intents. Meanwhile, it guides the +network to learn intents from behaviors by forcing behavior embeddings close to +cluster centers. This allows simultaneous optimization of recommendation and +clustering via mini-batch data. Moreover, we propose intent-assisted +contrastive learning by using cluster centers as self-supervision signals, +further enhancing mutual promotion. Both experimental results and theoretical +analyses demonstrate the superiority of ELCRec from six perspectives. Compared +to the runner-up, ELCRec improves NDCG@5 by 8.9\% and reduces computational +costs by 22.5\% on the Beauty dataset. Furthermore, due to the scalability and +universal applicability, we deploy this method on the industrial recommendation +system with 130 million page views and achieve promising results. The codes are +available on GitHub (https://github.com/yueliu1999/ELCRec). A collection +(papers, codes, datasets) of deep group recommendation/intent learning methods +is available on GitHub +(https://github.com/yueliu1999/Awesome-Deep-Group-Recommendation). + +
+
+ comment: 37 pages +
+
+
+
+
+ + ♻ ☆ Pistis-RAG: Enhancing Retrieval-Augmented Generation with Human Feedback + + +
+ RAG systems face limitations when semantic relevance alone does not guarantee +improved generation quality. This issue becomes particularly evident due to the +sensitivity of large language models (LLMs) to the ordering of few-shot +prompts, which can affect model performance. To address this challenge, +aligning LLM outputs with human preferences using structured feedback, such as +options to copy, regenerate, or dislike, offers a promising method for +improvement. This feedback is applied to the entire list of inputs rather than +giving specific ratings for individual documents, making it a Listwide Labels +Learning-to-Rank task. + To address this task, we propose Pistis-RAG, a new RAG framework designed +with a content-centric approach to better align LLMs with human preferences. +Pistis-RAG effectively utilizes human feedback, enhancing content ranking and +generation quality. To validate our framework, we use public datasets to +simulate human feedback, allowing us to evaluate and refine our method +effectively. Experimental results indicate that Pistis-RAG improves alignment +with human preferences relative to the baseline RAG system, showing a 6.06% +increase in MMLU (English) and a 7.08% increase in C-EVAL (Chinese) accuracy +metrics. These results highlight Pistis-RAG's effectiveness in overcoming the +limitations associated with traditional RAG approaches. + +
+
+
+
+
+ + ♻ ☆ DTN: Deep Multiple Task-specific Feature Interactions Network for + Multi-Task Recommendation + + +
+ Neural-based multi-task learning (MTL) has been successfully applied to many +recommendation applications. However, these MTL models (e.g., MMoE, PLE) did +not consider feature interaction during the optimization, which is crucial for +capturing complex high-order features and has been widely used in ranking +models for real-world recommender systems. Moreover, through feature importance +analysis across various tasks in MTL, we have observed an interesting +divergence phenomenon that the same feature can have significantly different +importance across different tasks in MTL. To address these issues, we propose +Deep Multiple Task-specific Feature Interactions Network (DTN) with a novel +model structure design. DTN introduces multiple diversified task-specific +feature interaction methods and a task-sensitive network in MTL networks, +enabling the model to learn task-specific diversified feature interaction +representations, which improves the efficiency of joint representation learning +in a general setup. We applied DTN to our company's real-world E-commerce +recommendation dataset, which consisted of over 6.3 billion samples; the +results demonstrated that DTN significantly outperformed state-of-the-art MTL +models. Moreover, during online evaluation of DTN in a large-scale E-commerce +recommender system, we observed a 3.28% increase in clicks, a 3.10% increase in orders +and a 2.70% increase in GMV (Gross Merchandise Value) compared to the +state-of-the-art MTL models. Finally, extensive offline experiments conducted +on public benchmark datasets demonstrate that DTN can be applied to various +scenarios beyond recommendations, enhancing the performance of ranking models. +
+
+
+
+
+ + ♻ ☆ Microstructures and Accuracy of Graph Recall by Large Language Models NeurIPS 2024 + + +
+ Graph data is crucial for many applications, and much of it exists in the +relations described in textual format. As a result, being able to accurately +recall and encode a graph described in earlier text is a basic yet pivotal +ability that LLMs need to demonstrate if they are to perform reasoning tasks +that involve graph-structured information. Human performance at graph recall +has been studied by cognitive scientists for decades, and has been found to +often exhibit certain structural patterns of bias that align with human +handling of social relationships. To date, however, we know little about how +LLMs behave in analogous graph recall tasks: do their recalled graphs also +exhibit certain biased patterns, and if so, how do they compare with humans and +affect other graph reasoning tasks? In this work, we perform the first +systematic study of graph recall by LLMs, investigating the accuracy and +biased microstructures (local structural patterns) in their recall. We find +that LLMs not only often underperform in graph recall, but also tend to favor +more triangles and alternating 2-paths. Moreover, we find that more advanced +LLMs have a striking dependence on the domain that a real-world graph comes +from -- by yielding the best recall accuracy when the graph is narrated in a +language style consistent with its original domain. +
+
+ comment: Accepted at NeurIPS 2024; Code available at: + https://github.com/Abel0828/llm-graph-recall +
+
+
+
+
+ + ♻ ☆ Recommendation Unlearning via Influence Function + + +
+ Recommendation unlearning is an emerging task that serves users by erasing +unusable data (e.g., some historical behaviors) from a well-trained recommender +model. Existing methods process unlearning requests by fully or partially +retraining the model after removing the unusable data. However, these methods +are impractical due to the high computation cost of full retraining and the +likely performance damage of partial retraining. In this light, a +desired recommendation unlearning method should obtain a model similar to full +retraining in a more efficient manner, i.e., achieve complete, efficient and +harmless unlearning. + In this work, we propose a new Influence Function-based Recommendation +Unlearning (IFRU) framework, which efficiently updates the model without +retraining by estimating the influence of the unusable data on the model via +the influence function. Given that recent recommender models use +historical data for both the construction of the optimization loss and the +computational graph (e.g., neighborhood aggregation), IFRU jointly estimates +the direct influence of unusable data on the optimization loss and the spillover +influence on the computational graph to pursue complete unlearning. +Furthermore, we propose an importance-based pruning algorithm to reduce the +cost of the influence function. IFRU is harmless and applicable to mainstream +differentiable models. Extensive experiments demonstrate that IFRU achieves +more than 250 times acceleration compared to retraining-based methods with +recommendation performance comparable to full retraining. Codes are available at +https://github.com/baiyimeng/IFRU. +
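The influence-function core of such an update can be illustrated on a tiny logistic-regression surrogate: approximate the parameters that retraining without one interaction would give by a single Hessian-weighted gradient step. This ignores IFRU's spillover term and pruning, and the synthetic data and weak regularization below are assumptions made purely for demonstration.

```python
# Influence-function unlearning sketch on a logistic-regression surrogate.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = (X @ rng.normal(size=5) + 0.5 * rng.normal(size=200) > 0).astype(int)

model = LogisticRegression(C=1e4, fit_intercept=False, max_iter=1000).fit(X, y)
w = model.coef_.ravel()
p = 1 / (1 + np.exp(-X @ w))

remove = 7                                             # index of the sample to "unlearn"
grad_z = (p[remove] - y[remove]) * X[remove]           # gradient of that sample's loss
H = (X.T * (p * (1 - p))) @ X / len(X)                 # average Hessian of the loss
w_unlearned = w + np.linalg.solve(H, grad_z) / len(X)  # one influence-function step

w_retrain = LogisticRegression(C=1e4, fit_intercept=False, max_iter=1000).fit(
    np.delete(X, remove, axis=0), np.delete(y, remove)).coef_.ravel()
# The influence update should land closer to the retrained weights than the original.
print(np.linalg.norm(w_unlearned - w_retrain), np.linalg.norm(w - w_retrain))
```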
+
+ comment: Accepted by ACM TORS +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ 'No' Matters: Out-of-Distribution Detection in Multimodality Long + Dialogue + + +
+ Out-of-distribution (OOD) detection in multimodal contexts is essential for +identifying deviations in combined inputs from different modalities, +particularly in applications like open-domain dialogue systems or real-life +dialogue interactions. This paper aims to improve the user experience that +involves multi-round long dialogues by efficiently detecting OOD dialogues and +images. We introduce a novel scoring framework named Dialogue Image Aligning +and Enhancing Framework (DIAEF) that integrates the visual language models with +the novel proposed scores that detect OOD in two key scenarios (1) mismatches +between the dialogue and image input pair and (2) input pairs with previously +unseen labels. Our experimental results, derived from various benchmarks, +demonstrate that integrating image and multi-round dialogue OOD detection is +more effective with previously unseen labels than using either modality +independently. In the presence of mismatched pairs, our proposed score +effectively identifies these mismatches and demonstrates strong robustness in +long dialogues. This approach enhances domain-aware, adaptive conversational +agents and establishes baselines for future studies. + +
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ Audio Is the Achilles' Heel: Red Teaming Audio Large Multimodal Models + + +
+ Large Multimodal Models (LMMs) have demonstrated the ability to interact with +humans under real-world conditions by combining Large Language Models (LLMs) +and modality encoders to align multimodal information (visual and auditory) +with text. However, such models raise new safety challenges of whether models +that are safety-aligned on text also exhibit consistent safeguards for +multimodal inputs. Despite recent safety-alignment research on vision LMMs, the +safety of audio LMMs remains under-explored. In this work, we comprehensively +red team the safety of five advanced audio LMMs under three settings: (i) +harmful questions in both audio and text formats, (ii) harmful questions in +text format accompanied by distracting non-speech audio, and (iii) +speech-specific jailbreaks. Our results under these settings demonstrate that +open-source audio LMMs suffer an average attack success rate of 69.14% on +harmful audio questions, and exhibit safety vulnerabilities when distracted +with non-speech audio noise. Our speech-specific jailbreaks on Gemini-1.5-Pro +achieve an attack success rate of 70.67% on the harmful query benchmark. We +provide insights on what could cause these reported safety-misalignments. +Warning: this paper contains offensive examples. + +
+
+
+
+
+ + ☆ DIP: Diffusion Learning of Inconsistency Pattern for General DeepFake + Detection + + +
+ With the advancement of deepfake generation techniques, the importance of +deepfake detection in protecting multimedia content integrity has become +increasingly obvious. Recently, temporal inconsistency clues have been explored +to improve the generalizability of deepfake video detection. According to our +observation, the temporal artifacts of forged videos in terms of motion +information usually exhibit quite distinct inconsistency patterns along +horizontal and vertical directions, which could be leveraged to improve the +generalizability of detectors. In this paper, a transformer-based framework for +Diffusion Learning of Inconsistency Pattern (DIP) is proposed, which exploits +directional inconsistencies for deepfake video detection. Specifically, DIP +begins with a spatiotemporal encoder to represent spatiotemporal information. A +directional inconsistency decoder is adopted accordingly, where direction-aware +attention and inconsistency diffusion are incorporated to explore potential +inconsistency patterns and jointly learn the inherent relationships. In +addition, the SpatioTemporal Invariant Loss (STI Loss) is introduced to +contrast spatiotemporally augmented sample pairs and prevent the model from +overfitting nonessential forgery artifacts. Extensive experiments on several +public datasets demonstrate that our method could effectively identify +directional forgery clues and achieve state-of-the-art performance. +
+
+ comment: 13 pages, accepted with IEEE Trans. on Multimedia +
+
+
+
+
+ + ♻ ☆ Aligning Audio-Visual Joint Representations with an Agentic Workflow + + +
+ Visual content and its accompanying audio signals naturally form a joint +representation that improves audio-visual (AV) related applications. While studies +develop various AV representation learning frameworks, the importance of AV +data alignment is usually overlooked in achieving high-quality representations. +We observe that an audio signal may contain background noise interference. +Also, non-synchronization may appear between audio and video streams. Such +non-strict data alignment limits representation quality and downgrades +application performance. In this paper, we propose to improve AV joint +representations from a data-centric perspective by aligning audio signals to +visual data. Our alignment is conducted in an agentic workflow controlled by an +LLM-based assistant named AVAgent. For each input AV data pair, our AVAgent +uses a multi-modal LLM to convert audio and visual data into language +descriptions separately (i.e., tool use). Then, AVAgent reasons about whether this +paired data is aligned well and plans to edit the audio signal if needed (i.e., +planning). The audio editing is executed by predefined actions that filter +noise or augment data. Moreover, we use a VLM to evaluate how modified audio +signals match the visual content and provide feedback to AVAgent (i.e., +reflection). The tool use, planning, and reflection steps operate cyclically to +become an agentic workflow where audio signals are gradually aligned to visual +content. In this way, existing methods can directly leverage the aligned AV +data via our agentic workflow to improve AV joint representations. The +experimental results comprehensively demonstrate the state-of-the-art +performance of the proposed approach against previous baselines in diverse +downstream tasks. +
+
+
+
+
+
+
+
+
+
+
+
+

diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse with TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); //add this
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); //add this
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`